diff --git a/.gitignore b/.gitignore index c2ed4ecb0acd28126d8703e17e86775ccbed1bb6..0c39aa20b6ba8d2afbc69670636b9223b3bf23fa 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ *.lzo *.patch *.gcno +*.ll modules.builtin Module.symvers *.dwo diff --git a/.mailmap b/.mailmap index 1d6f4e7280dc67f630b79456b3f1e5a4c0a41343..5273cfd70ad62996ba8374698fb5e22e91d33f45 100644 --- a/.mailmap +++ b/.mailmap @@ -111,6 +111,7 @@ Mauro Carvalho Chehab Mauro Carvalho Chehab Matt Ranostay Matthew Ranostay Matt Ranostay +Matt Ranostay Mayuresh Janorkar Michael Buesch Michel Dänzer @@ -145,6 +146,8 @@ Santosh Shilimkar Santosh Shilimkar Sascha Hauer S.Çağlar Onur +Sebastian Reichel +Sebastian Reichel Shiraz Hashim Shuah Khan Shuah Khan diff --git a/CREDITS b/CREDITS index c5626bf06264e41e136fbe0c0420881e72de0ce2..5d09c26d69cdc0f4709cf48f5f3ff6195d53e85d 100644 --- a/CREDITS +++ b/CREDITS @@ -1034,6 +1034,10 @@ S: 2037 Walnut #6 S: Boulder, Colorado 80302 S: USA +N: Hans-Christian Noren Egtvedt +E: egtvedt@samfundet.no +D: AVR32 architecture maintainer. + N: Heiko Eißfeldt E: heiko@colossus.escape.de heiko@unifix.de D: verify_area stuff, generic SCSI fixes @@ -3398,6 +3402,10 @@ S: Suite 101 S: Markham, Ontario L3R 2Z6 S: Canada +N: Haavard Skinnemoen +M: Haavard Skinnemoen +D: AVR32 architecture port to Linux and maintainer. + N: Rick Sladkey E: jrs@world.std.com D: utility hacker: Emacs, NFS server, mount, kmem-ps, UPS debugger, strace, GDB diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index 793acf999e9eac87057af3214ca1f98ad65b922f..ed3e5e949fce303efec625259b527d503cf82f8d 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -412,6 +412,8 @@ sysctl/ - directory with info on the /proc/sys/* files. target/ - directory with info on generating TCM v4 fabric .ko modules +tee.txt + - info on the TEE subsystem and drivers this_cpu_ops.txt - List rationale behind and the way to use this_cpu operations. thermal/ diff --git a/Documentation/ABI/obsolete/sysfs-firmware-acpi b/Documentation/ABI/obsolete/sysfs-firmware-acpi new file mode 100644 index 0000000000000000000000000000000000000000..6715a71bec3d71a27d3f4f97bce1b19dde154764 --- /dev/null +++ b/Documentation/ABI/obsolete/sysfs-firmware-acpi @@ -0,0 +1,8 @@ +What: /sys/firmware/acpi/hotplug/force_remove +Date: Mar 2017 +Contact: Rafael J. Wysocki +Description: + Since the force_remove is inherently broken and dangerous to + use for some hotplugable resources like memory (because ignoring + the offline failure might lead to memory corruption and crashes) + enabling this knob is not safe and thus unsupported. diff --git a/Documentation/ABI/stable/sysfs-bus-usb b/Documentation/ABI/stable/sysfs-bus-usb index 831f15d9672f29e90cca5650356d2f69599e14b8..b832eeff999914b0afa4e4a99d7379e4f852dea8 100644 --- a/Documentation/ABI/stable/sysfs-bus-usb +++ b/Documentation/ABI/stable/sysfs-bus-usb @@ -9,7 +9,7 @@ Description: hubs this facility is always enabled and their device directories will not contain this file. - For more information, see Documentation/usb/persist.txt. + For more information, see Documentation/driver-api/usb/persist.rst. 
What: /sys/bus/usb/devices/.../power/autosuspend Date: March 2007 diff --git a/Documentation/ABI/stable/vdso b/Documentation/ABI/stable/vdso index 7cdfc28cc2c6d93b861d6ec3acb05bc5aca8bc70..55406ec8a35a34d9c0f67713344e769d5fc76021 100644 --- a/Documentation/ABI/stable/vdso +++ b/Documentation/ABI/stable/vdso @@ -16,7 +16,8 @@ The vDSO uses symbol versioning; whenever you request a symbol from the vDSO, specify the version you are expecting. Programs that dynamically link to glibc will use the vDSO automatically. -Otherwise, you can use the reference parser in Documentation/vDSO/parse_vdso.c. +Otherwise, you can use the reference parser in +tools/testing/selftests/vDSO/parse_vdso.c. Unless otherwise noted, the set of symbols with any given version and the ABI of those symbols is considered stable. It may vary across architectures, diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 2da04ce6aeef482645bfd9edd924ce195275ea26..dea212db9df3531f3dfc69204c8e4fcac8ec2bb5 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -213,14 +213,8 @@ What: /sys/block//queue/discard_zeroes_data Date: May 2011 Contact: Martin K. Petersen Description: - Devices that support discard functionality may return - stale or random data when a previously discarded block - is read back. This can cause problems if the filesystem - expects discarded blocks to be explicitly cleared. If a - device reports that it deterministically returns zeroes - when a discarded area is read the discard_zeroes_data - parameter will be set to one. Otherwise it will be 0 and - the result of reading a discarded area is undefined. + Will always return 0. Don't rely on any specific behavior + for discards, and don't read this file. What: /sys/block//queue/write_same_max_bytes Date: January 2012 diff --git a/Documentation/ABI/testing/sysfs-bus-iio b/Documentation/ABI/testing/sysfs-bus-iio index 530809ccfacf336710a40cf501401ccae0bb6ae4..8c24d0892f61e36727fdeee50c5bd39313ecc04a 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio +++ b/Documentation/ABI/testing/sysfs-bus-iio @@ -55,6 +55,7 @@ Description: then it is to be found in the base device directory. What: /sys/bus/iio/devices/iio:deviceX/sampling_frequency_available +What: /sys/bus/iio/devices/iio:deviceX/in_proximity_sampling_frequency_available What: /sys/.../iio:deviceX/buffer/sampling_frequency_available What: /sys/bus/iio/devices/triggerX/sampling_frequency_available KernelVersion: 2.6.35 @@ -1593,7 +1594,7 @@ Description: can be processed to siemens per meter. What: /sys/bus/iio/devices/iio:deviceX/in_countY_raw -KernelVersion: 4.9 +KernelVersion: 4.10 Contact: linux-iio@vger.kernel.org Description: Raw counter device counts from channel Y. For quadrature @@ -1601,10 +1602,24 @@ Description: the counts of a single quadrature signal phase from channel Y. What: /sys/bus/iio/devices/iio:deviceX/in_indexY_raw -KernelVersion: 4.9 +KernelVersion: 4.10 Contact: linux-iio@vger.kernel.org Description: Raw counter device index value from channel Y. This attribute provides an absolute positional reference (e.g. a pulse once per revolution) which may be used to home positional systems as required. + +What: /sys/bus/iio/devices/iio:deviceX/in_count_count_direction_available +KernelVersion: 4.12 +Contact: linux-iio@vger.kernel.org +Description: + A list of possible counting directions which are: + - "up" : counter device is increasing. + - "down": counter device is decreasing. 
+ +What: /sys/bus/iio/devices/iio:deviceX/in_countY_count_direction +KernelVersion: 4.12 +Contact: linux-iio@vger.kernel.org +Description: + Raw counter device counters direction for channel Y. diff --git a/Documentation/ABI/testing/sysfs-bus-iio-adc-max9611 b/Documentation/ABI/testing/sysfs-bus-iio-adc-max9611 new file mode 100644 index 0000000000000000000000000000000000000000..6d2d2b094941d34f8dba2ce28473dcf5a22fcbad --- /dev/null +++ b/Documentation/ABI/testing/sysfs-bus-iio-adc-max9611 @@ -0,0 +1,17 @@ +What: /sys/bus/iio/devices/iio:deviceX/in_power_shunt_resistor +Date: March 2017 +KernelVersion: 4.12 +Contact: linux-iio@vger.kernel.org +Description: The value of the shunt resistor used to compute power drain on + common input voltage pin (RS+). In Ohms. + +What: /sys/bus/iio/devices/iio:deviceX/in_current_shunt_resistor +Date: March 2017 +KernelVersion: 4.12 +Contact: linux-iio@vger.kernel.org +Description: The value of the shunt resistor used to compute current flowing + between RS+ and RS- voltage sense inputs. In Ohms. + +These attributes describe a single physical component, exposed as two distinct +attributes as it is used to calculate two different values: power load and +current flowing between RS+ and RS- inputs. diff --git a/Documentation/ABI/testing/sysfs-bus-iio-counter-104-quad-8 b/Documentation/ABI/testing/sysfs-bus-iio-counter-104-quad-8 index ba676520b9530e7c6d955ef0cdd6e68a2fd93823..7fac2c268d9a9050e06d7248bcc33dbdf30978bb 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio-counter-104-quad-8 +++ b/Documentation/ABI/testing/sysfs-bus-iio-counter-104-quad-8 @@ -1,24 +1,16 @@ -What: /sys/bus/iio/devices/iio:deviceX/in_count_count_direction_available What: /sys/bus/iio/devices/iio:deviceX/in_count_count_mode_available What: /sys/bus/iio/devices/iio:deviceX/in_count_noise_error_available What: /sys/bus/iio/devices/iio:deviceX/in_count_quadrature_mode_available What: /sys/bus/iio/devices/iio:deviceX/in_index_index_polarity_available What: /sys/bus/iio/devices/iio:deviceX/in_index_synchronous_mode_available -KernelVersion: 4.9 +KernelVersion: 4.10 Contact: linux-iio@vger.kernel.org Description: Discrete set of available values for the respective counter configuration are listed in this file. -What: /sys/bus/iio/devices/iio:deviceX/in_countY_count_direction -KernelVersion: 4.9 -Contact: linux-iio@vger.kernel.org -Description: - Read-only attribute that indicates whether the counter for - channel Y is counting up or down. - What: /sys/bus/iio/devices/iio:deviceX/in_countY_count_mode -KernelVersion: 4.9 +KernelVersion: 4.10 Contact: linux-iio@vger.kernel.org Description: Count mode for channel Y. Four count modes are available: @@ -52,7 +44,7 @@ Description: continuously throughout. What: /sys/bus/iio/devices/iio:deviceX/in_countY_noise_error -KernelVersion: 4.9 +KernelVersion: 4.10 Contact: linux-iio@vger.kernel.org Description: Read-only attribute that indicates whether excessive noise is @@ -60,14 +52,14 @@ Description: irrelevant in non-quadrature clock mode. What: /sys/bus/iio/devices/iio:deviceX/in_countY_preset -KernelVersion: 4.9 +KernelVersion: 4.10 Contact: linux-iio@vger.kernel.org Description: If the counter device supports preset registers, the preset count for channel Y is provided by this attribute. 
What: /sys/bus/iio/devices/iio:deviceX/in_countY_quadrature_mode -KernelVersion: 4.9 +KernelVersion: 4.10 Contact: linux-iio@vger.kernel.org Description: Configure channel Y counter for non-quadrature or quadrature @@ -88,7 +80,7 @@ Description: decoded for UP/DN clock. What: /sys/bus/iio/devices/iio:deviceX/in_countY_set_to_preset_on_index -KernelVersion: 4.9 +KernelVersion: 4.10 Contact: linux-iio@vger.kernel.org Description: Whether to set channel Y counter with channel Y preset value @@ -96,14 +88,14 @@ Description: Valid attribute values are boolean. What: /sys/bus/iio/devices/iio:deviceX/in_indexY_index_polarity -KernelVersion: 4.9 +KernelVersion: 4.10 Contact: linux-iio@vger.kernel.org Description: Active level of channel Y index input; irrelevant in non-synchronous load mode. What: /sys/bus/iio/devices/iio:deviceX/in_indexY_synchronous_mode -KernelVersion: 4.9 +KernelVersion: 4.10 Contact: linux-iio@vger.kernel.org Description: Configure channel Y counter for non-synchronous or synchronous diff --git a/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32 b/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32 index 6534a60037ffa6129c7a1f90d7dca62f68b23c54..230020e06677d73a2b8b24c031962e3fbda19c45 100644 --- a/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32 +++ b/Documentation/ABI/testing/sysfs-bus-iio-timer-stm32 @@ -3,11 +3,15 @@ KernelVersion: 4.11 Contact: benjamin.gaignard@st.com Description: Reading returns the list possible master modes which are: - - "reset" : The UG bit from the TIMx_EGR register is used as trigger output (TRGO). - - "enable" : The Counter Enable signal CNT_EN is used as trigger output. + - "reset" : The UG bit from the TIMx_EGR register is + used as trigger output (TRGO). + - "enable" : The Counter Enable signal CNT_EN is used + as trigger output. - "update" : The update event is selected as trigger output. - For instance a master timer can then be used as a prescaler for a slave timer. - - "compare_pulse" : The trigger output send a positive pulse when the CC1IF flag is to be set. + For instance a master timer can then be used + as a prescaler for a slave timer. + - "compare_pulse" : The trigger output send a positive pulse + when the CC1IF flag is to be set. - "OC1REF" : OC1REF signal is used as trigger output. - "OC2REF" : OC2REF signal is used as trigger output. - "OC3REF" : OC3REF signal is used as trigger output. @@ -27,3 +31,62 @@ Description: Reading returns the current sampling frequency. Writing an value different of 0 set and start sampling. Writing 0 stop sampling. + +What: /sys/bus/iio/devices/iio:deviceX/in_count0_preset +KernelVersion: 4.12 +Contact: benjamin.gaignard@st.com +Description: + Reading returns the current preset value. + Writing sets the preset value. + When counting up the counter starts from 0 and fires an + event when reach preset value. + When counting down the counter start from preset value + and fire event when reach 0. + +What: /sys/bus/iio/devices/iio:deviceX/in_count_quadrature_mode_available +KernelVersion: 4.12 +Contact: benjamin.gaignard@st.com +Description: + Reading returns the list possible quadrature modes. + +What: /sys/bus/iio/devices/iio:deviceX/in_count0_quadrature_mode +KernelVersion: 4.12 +Contact: benjamin.gaignard@st.com +Description: + Configure the device counter quadrature modes: + channel_A: + Encoder A input servers as the count input and B as + the UP/DOWN direction control input. + + channel_B: + Encoder B input serves as the count input and A as + the UP/DOWN direction control input. 
+ + quadrature: + Encoder A and B inputs are mixed to get direction + and count with a scale of 0.25. + +What: /sys/bus/iio/devices/iio:deviceX/in_count_enable_mode_available +KernelVersion: 4.12 +Contact: benjamin.gaignard@st.com +Description: + Reading returns the list possible enable modes. + +What: /sys/bus/iio/devices/iio:deviceX/in_count0_enable_mode +KernelVersion: 4.12 +Contact: benjamin.gaignard@st.com +Description: + Configure the device counter enable modes, in all case + counting direction is set by in_count0_count_direction + attribute and the counter is clocked by the internal clock. + always: + Counter is always ON. + + gated: + Counting is enabled when connected trigger signal + level is high else counting is disabled. + + triggered: + Counting is enabled on rising edge of the connected + trigger, and remains enabled for the duration of this + selected mode. diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci index e4e90104d7c38ff89bae718df7485337ee3e9108..44d4b2be92fd4a56ab2420944f76669a2466a304 100644 --- a/Documentation/ABI/testing/sysfs-bus-pci +++ b/Documentation/ABI/testing/sysfs-bus-pci @@ -301,3 +301,25 @@ Contact: Emil Velikov Description: This file contains the revision field of the PCI device. The value comes from device config space. The file is read only. + +What: /sys/bus/pci/devices/.../sriov_drivers_autoprobe +Date: April 2017 +Contact: Bodong Wang +Description: + This file is associated with the PF of a device that + supports SR-IOV. It determines whether newly-enabled VFs + are immediately bound to a driver. It initially contains + 1, which means the kernel automatically binds VFs to a + compatible driver immediately after they are enabled. If + an application writes 0 to the file before enabling VFs, + the kernel will not bind VFs to a driver. + + A typical use case is to write 0 to this file, then enable + VFs, then assign the newly-created VFs to virtual machines. + Note that changing this file does not affect already- + enabled VFs. In this scenario, the user must first disable + the VFs, write 0 to sriov_drivers_autoprobe, then re-enable + the VFs. + + This is similar to /sys/bus/pci/drivers_autoprobe, but + affects only the VFs associated with a specific PF. diff --git a/Documentation/ABI/testing/sysfs-class-net-qmi b/Documentation/ABI/testing/sysfs-class-net-qmi index fa5a00bb114372bfbc67008e8d0ebe8d1566ed6b..7122d6264c49d6c02c2c0074e145f19b93fc63dd 100644 --- a/Documentation/ABI/testing/sysfs-class-net-qmi +++ b/Documentation/ABI/testing/sysfs-class-net-qmi @@ -21,3 +21,30 @@ Description: is responsible for coordination of driver and firmware link framing mode, changing this setting to 'Y' if the firmware is configured for 'raw-ip' mode. + +What: /sys/class/net//qmi/add_mux +Date: March 2017 +KernelVersion: 4.11 +Contact: Bjørn Mork +Description: + Unsigned integer. + + Write a number ranging from 1 to 127 to add a qmap mux + based network device, supported by recent Qualcomm based + modems. + + The network device will be called qmimux. + + Userspace is in charge of managing the qmux network device + activation and data stream setup on the modem side by + using the proper QMI protocol requests. + +What: /sys/class/net//qmi/del_mux +Date: March 2017 +KernelVersion: 4.11 +Contact: Bjørn Mork +Description: + Unsigned integer. + + Write a number ranging from 1 to 127 to delete a previously + created qmap mux based network device. 
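As a quick illustration of the add_mux/del_mux attributes documented above, here is a minimal user-space sketch in C. The interface name "wwan0" and the mux id are placeholders rather than anything taken from the patch, and, as the description notes, the QMI protocol requests that activate the mux on the modem side remain userspace's job.

/*
 * Sketch: create and remove a qmap mux netdev via the qmi sysfs
 * attributes. Assumes a qmi_wwan interface named "wwan0"; run as root.
 */
#include <stdio.h>

static int write_attr(const char *path, unsigned int val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%u\n", val);	/* valid mux ids are 1..127 */
	return fclose(f);
}

int main(void)
{
	write_attr("/sys/class/net/wwan0/qmi/add_mux", 1); /* creates qmimuxN */
	/* ... QMI setup for the new mux happens here, via userspace ... */
	write_attr("/sys/class/net/wwan0/qmi/del_mux", 1);
	return 0;
}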
diff --git a/Documentation/ABI/testing/sysfs-class-switchtec b/Documentation/ABI/testing/sysfs-class-switchtec new file mode 100644 index 0000000000000000000000000000000000000000..48cb4c15e430995c61f5af818368c0feeac8317b --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-switchtec @@ -0,0 +1,96 @@ +switchtec - Microsemi Switchtec PCI Switch Management Endpoint + +For details on this subsystem look at Documentation/switchtec.txt. + +What: /sys/class/switchtec +Date: 05-Jan-2017 +KernelVersion: v4.11 +Contact: Logan Gunthorpe +Description: The switchtec class subsystem folder. + Each registered switchtec driver is represented by a switchtecX + subfolder (X being an integer >= 0). + + +What: /sys/class/switchtec/switchtec[0-9]+/component_id +Date: 05-Jan-2017 +KernelVersion: v4.11 +Contact: Logan Gunthorpe +Description: Component identifier as stored in the hardware (eg. PM8543) + (read only) +Values: arbitrary string. + + +What: /sys/class/switchtec/switchtec[0-9]+/component_revision +Date: 05-Jan-2017 +KernelVersion: v4.11 +Contact: Logan Gunthorpe +Description: Component revision stored in the hardware (read only) +Values: integer. + + +What: /sys/class/switchtec/switchtec[0-9]+/component_vendor +Date: 05-Jan-2017 +KernelVersion: v4.11 +Contact: Logan Gunthorpe +Description: Component vendor as stored in the hardware (eg. MICROSEM) + (read only) +Values: arbitrary string. + + +What: /sys/class/switchtec/switchtec[0-9]+/device_version +Date: 05-Jan-2017 +KernelVersion: v4.11 +Contact: Logan Gunthorpe +Description: Device version as stored in the hardware (read only) +Values: integer. + + +What: /sys/class/switchtec/switchtec[0-9]+/fw_version +Date: 05-Jan-2017 +KernelVersion: v4.11 +Contact: Logan Gunthorpe +Description: Currently running firmware version (read only) +Values: integer (in hexadecimal). + + +What: /sys/class/switchtec/switchtec[0-9]+/partition +Date: 05-Jan-2017 +KernelVersion: v4.11 +Contact: Logan Gunthorpe +Description: Partition number for this device in the switch (read only) +Values: integer. + + +What: /sys/class/switchtec/switchtec[0-9]+/partition_count +Date: 05-Jan-2017 +KernelVersion: v4.11 +Contact: Logan Gunthorpe +Description: Total number of partitions in the switch (read only) +Values: integer. + + +What: /sys/class/switchtec/switchtec[0-9]+/product_id +Date: 05-Jan-2017 +KernelVersion: v4.11 +Contact: Logan Gunthorpe +Description: Product identifier as stored in the hardware (eg. PSX 48XG3) + (read only) +Values: arbitrary string. + + +What: /sys/class/switchtec/switchtec[0-9]+/product_revision +Date: 05-Jan-2017 +KernelVersion: v4.11 +Contact: Logan Gunthorpe +Description: Product revision stored in the hardware (eg. RevB) + (read only) +Values: arbitrary string. + + +What: /sys/class/switchtec/switchtec[0-9]+/product_vendor +Date: 05-Jan-2017 +KernelVersion: v4.11 +Contact: Logan Gunthorpe +Description: Product vendor as stored in the hardware (eg. MICROSEM) + (read only) +Values: arbitrary string. diff --git a/Documentation/ABI/testing/sysfs-class-typec b/Documentation/ABI/testing/sysfs-class-typec new file mode 100644 index 0000000000000000000000000000000000000000..d4a3d23eb09c94adbf51b60dc26b55646e4fceeb --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-typec @@ -0,0 +1,276 @@ +USB Type-C port devices (eg. /sys/class/typec/port0/) + +What: /sys/class/typec//data_role +Date: April 2017 +Contact: Heikki Krogerus +Description: + The supported USB data roles. This attribute can be used for + requesting data role swapping on the port. 
Swapping is supported + as synchronous operation, so write(2) to the attribute will not + return until the operation has finished. The attribute is + notified about role changes so that poll(2) on the attribute + wakes up. Change on the role will also generate uevent + KOBJ_CHANGE on the port. The current role is show in brackets, + for example "[host] device" when DRP port is in host mode. + + Valid values: host, device + +What: /sys/class/typec//power_role +Date: April 2017 +Contact: Heikki Krogerus +Description: + The supported power roles. This attribute can be used to request + power role swap on the port when the port supports USB Power + Delivery. Swapping is supported as synchronous operation, so + write(2) to the attribute will not return until the operation + has finished. The attribute is notified about role changes so + that poll(2) on the attribute wakes up. Change on the role will + also generate uevent KOBJ_CHANGE. The current role is show in + brackets, for example "[source] sink" when in source mode. + + Valid values: source, sink + +What: /sys/class/typec//vconn_source +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows is the port VCONN Source. This attribute can be used to + request VCONN swap to change the VCONN Source during connection + when both the port and the partner support USB Power Delivery. + Swapping is supported as synchronous operation, so write(2) to + the attribute will not return until the operation has finished. + The attribute is notified about VCONN source changes so that + poll(2) on the attribute wakes up. Change on VCONN source also + generates uevent KOBJ_CHANGE. + + Valid values: + - "no" when the port is not the VCONN Source + - "yes" when the port is the VCONN Source + +What: /sys/class/typec//power_operation_mode +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows the current power operational mode the port is in. The + power operation mode means current level for VBUS. In case USB + Power Delivery communication is used for negotiating the levels, + power operation mode should show "usb_power_delivery". + + Valid values: + - default + - 1.5A + - 3.0A + - usb_power_delivery + +What: /sys/class/typec//preferred_role +Date: April 2017 +Contact: Heikki Krogerus +Description: + The user space can notify the driver about the preferred role. + It should be handled as enabling of Try.SRC or Try.SNK, as + defined in USB Type-C specification, in the port drivers. By + default the preferred role should come from the platform. + + Valid values: source, sink, none (to remove preference) + +What: /sys/class/typec//supported_accessory_modes +Date: April 2017 +Contact: Heikki Krogerus +Description: + Space separated list of accessory modes, defined in the USB + Type-C specification, the port supports. + +What: /sys/class/typec//usb_power_delivery_revision +Date: April 2017 +Contact: Heikki Krogerus +Description: + Revision number of the supported USB Power Delivery + specification, or 0 when USB Power Delivery is not supported. + +What: /sys/class/typec//usb_typec_revision +Date: April 2017 +Contact: Heikki Krogerus +Description: + Revision number of the supported USB Type-C specification. + + +USB Type-C partner devices (eg. /sys/class/typec/port0-partner/) + +What: /sys/class/typec/-partner/accessory_mode +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows the Accessory Mode name when the partner is an Accessory. + The Accessory Modes are defined in USB Type-C Specification. 
+ +What: /sys/class/typec/-partner/supports_usb_power_delivery +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows if the partner supports USB Power Delivery communication: + Valid values: yes, no + +What: /sys/class/typec/-partner>/identity/ +Date: April 2017 +Contact: Heikki Krogerus +Description: + This directory appears only if the port device driver is capable + of showing the result of Discover Identity USB power delivery + command. That will not always be possible even when USB power + delivery is supported, for example when USB power delivery + communication for the port is mostly handled in firmware. If the + directory exists, it will have an attribute file for every VDO + in Discover Identity command result. + +What: /sys/class/typec/-partner/identity/id_header +Date: April 2017 +Contact: Heikki Krogerus +Description: + ID Header VDO part of Discover Identity command result. The + value will show 0 until Discover Identity command result becomes + available. The value can be polled. + +What: /sys/class/typec/-partner/identity/cert_stat +Date: April 2017 +Contact: Heikki Krogerus +Description: + Cert Stat VDO part of Discover Identity command result. The + value will show 0 until Discover Identity command result becomes + available. The value can be polled. + +What: /sys/class/typec/-partner/identity/product +Date: April 2017 +Contact: Heikki Krogerus +Description: + Product VDO part of Discover Identity command result. The value + will show 0 until Discover Identity command result becomes + available. The value can be polled. + + +USB Type-C cable devices (eg. /sys/class/typec/port0-cable/) + +Note: Electronically Marked Cables will have a device also for one cable plug +(eg. /sys/class/typec/port0-plug0). If the cable is active and has also SOP +Double Prime controller (USB Power Deliver specification ch. 2.4) it will have +second device also for the other plug. Both plugs may have alternate modes as +described in USB Type-C and USB Power Delivery specifications. + +What: /sys/class/typec/-cable/type +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows if the cable is active. + Valid values: active, passive + +What: /sys/class/typec/-cable/plug_type +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows type of the plug on the cable: + - type-a - Standard A + - type-b - Standard B + - type-c + - captive + +What: /sys/class/typec/-cable/identity/ +Date: April 2017 +Contact: Heikki Krogerus +Description: + This directory appears only if the port device driver is capable + of showing the result of Discover Identity USB power delivery + command. That will not always be possible even when USB power + delivery is supported. If the directory exists, it will have an + attribute for every VDO returned by Discover Identity command. + +What: /sys/class/typec/-cable/identity/id_header +Date: April 2017 +Contact: Heikki Krogerus +Description: + ID Header VDO part of Discover Identity command result. The + value will show 0 until Discover Identity command result becomes + available. The value can be polled. + +What: /sys/class/typec/-cable/identity/cert_stat +Date: April 2017 +Contact: Heikki Krogerus +Description: + Cert Stat VDO part of Discover Identity command result. The + value will show 0 until Discover Identity command result becomes + available. The value can be polled. + +What: /sys/class/typec/-cable/identity/product +Date: April 2017 +Contact: Heikki Krogerus +Description: + Product VDO part of Discover Identity command result. 
The value + will show 0 until Discover Identity command result becomes + available. The value can be polled. + + +Alternate Mode devices. + +The alternate modes will have Standard or Vendor ID (SVID) assigned by USB-IF. +The ports, partners and cable plugs can have alternate modes. A supported SVID +will consist of a set of modes. Every SVID a port/partner/plug supports will +have a device created for it, and every supported mode for a supported SVID will +have its own directory under that device. Below refers to the device for +the alternate mode. + +What: /sys/class/typec///svid +Date: April 2017 +Contact: Heikki Krogerus +Description: + The SVID (Standard or Vendor ID) assigned by USB-IF for this + alternate mode. + +What: /sys/class/typec///mode/ +Date: April 2017 +Contact: Heikki Krogerus +Description: + Every supported mode will have its own directory. The name of + a mode will be "mode" (for example mode1), where + is the actual index to the mode VDO returned by Discover Modes + USB power delivery command. + +What: /sys/class/typec///mode/description +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows description of the mode. The description is optional for + the drivers, just like with the Billboard Devices. + +What: /sys/class/typec///mode/vdo +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows the VDO in hexadecimal returned by Discover Modes command + for this mode. + +What: /sys/class/typec///mode/active +Date: April 2017 +Contact: Heikki Krogerus +Description: + Shows if the mode is active or not. The attribute can be used + for entering/exiting the mode with partners and cable plugs, and + with the port alternate modes it can be used for disabling + support for specific alternate modes. Entering/exiting modes is + supported as synchronous operation so write(2) to the attribute + does not return until the enter/exit mode operation has + finished. The attribute is notified when the mode is + entered/exited so poll(2) on the attribute wakes up. + Entering/exiting a mode will also generate uevent KOBJ_CHANGE. + + Valid values: yes, no + +What: /sys/class/typec///mode/supported_roles +Date: April 2017 +Contact: Heikki Krogerus +Description: + Space separated list of the supported roles. + + This attribute is available for the devices describing the + alternate modes a port supports, and it will not be exposed with + the devices presenting the alternate modes the partners or cable + plugs support. + + Valid values: source, sink diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 2a4a423d08e0d3ed8bce7cc0c9bcd36bb30bb19d..f3d5817c4ef0fae54150bc3e772e7cc535bde805 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -366,3 +366,10 @@ Contact: Linux ARM Kernel Mailing list Description: AArch64 CPU registers 'identification' directory exposes the CPU ID registers for identifying model and revision of the CPU. + +What: /sys/devices/system/cpu/cpu#/cpu_capacity +Date: December 2016 +Contact: Linux kernel mailing list +Description: information about CPUs heterogeneity. + + cpu_capacity: capacity of cpu#. 
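The cpu_capacity attribute above lends itself to a short reader. A minimal sketch follows; note the attribute is only present on kernels and platforms that expose CPU heterogeneity, so the loop simply stops at the first CPU that lacks it.

/*
 * Sketch: print the relative capacity of each CPU, stopping at the
 * first missing attribute file.
 */
#include <stdio.h>

int main(void)
{
	char path[64], buf[32];
	int cpu;

	for (cpu = 0; ; cpu++) {
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/cpu_capacity", cpu);
		f = fopen(path, "r");
		if (!f)
			break;
		if (fgets(buf, sizeof(buf), f))
			printf("cpu%d capacity: %s", cpu, buf);
		fclose(f);
	}
	return 0;
}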
diff --git a/Documentation/ABI/testing/sysfs-firmware-acpi b/Documentation/ABI/testing/sysfs-firmware-acpi index c7fc72d4495ca0cbd2203e40cfba522bdb2fcfa8..613f42a9d5cdcd7b6a9f754b59387065daa1576f 100644 --- a/Documentation/ABI/testing/sysfs-firmware-acpi +++ b/Documentation/ABI/testing/sysfs-firmware-acpi @@ -44,16 +44,6 @@ Description: or 0 (unset). Attempts to write any other values to it will cause -EINVAL to be returned. -What: /sys/firmware/acpi/hotplug/force_remove -Date: May 2013 -Contact: Rafael J. Wysocki -Description: - The number in this file (0 or 1) determines whether (1) or not - (0) the ACPI subsystem will allow devices to be hot-removed even - if they cannot be put offline gracefully (from the kernel's - viewpoint). That number can be changed by writing a boolean - value to this file. - What: /sys/firmware/acpi/interrupts/ Date: February 2008 Contact: Len Brown diff --git a/Documentation/ABI/testing/sysfs-kernel-livepatch b/Documentation/ABI/testing/sysfs-kernel-livepatch index da87f43aec584de74945714247c6849bebf0cfa7..d5d39748382f4dbdd9f4ad14163a9af29235fb56 100644 --- a/Documentation/ABI/testing/sysfs-kernel-livepatch +++ b/Documentation/ABI/testing/sysfs-kernel-livepatch @@ -25,6 +25,14 @@ Description: code is currently applied. Writing 0 will disable the patch while writing 1 will re-enable the patch. +What: /sys/kernel/livepatch//transition +Date: Feb 2017 +KernelVersion: 4.12.0 +Contact: live-patching@vger.kernel.org +Description: + An attribute which indicates whether the patch is currently in + transition. + What: /sys/kernel/livepatch// Date: Nov 2014 KernelVersion: 3.19.0 diff --git a/Documentation/ABI/testing/sysfs-platform-chipidea-usb2 b/Documentation/ABI/testing/sysfs-platform-chipidea-usb2 new file mode 100644 index 0000000000000000000000000000000000000000..b0f4684a83fe7ab6d72a9ff780565933833c64a0 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-platform-chipidea-usb2 @@ -0,0 +1,9 @@ +What: /sys/bus/platform/devices/ci_hdrc.0/role +Date: Mar 2017 +Contact: Peter Chen +Description: + It returns string "gadget" or "host" when read it, it indicates + current controller role. + + It will do role switch when write "gadget" or "host" to it. + Only controller at dual-role configuration supports writing. diff --git a/Documentation/ABI/testing/sysfs-platform-renesas_usb3 b/Documentation/ABI/testing/sysfs-platform-renesas_usb3 new file mode 100644 index 0000000000000000000000000000000000000000..5621c15d5dc0c30756f77971e44919211834a458 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-platform-renesas_usb3 @@ -0,0 +1,15 @@ +What: /sys/devices/platform//role +Date: March 2017 +KernelVersion: 4.13 +Contact: Yoshihiro Shimoda +Description: + This file can be read and write. + The file can show/change the drd mode of usb. + + Write the following string to change the mode: + "host" - switching mode from peripheral to host. + "peripheral" - switching mode from host to peripheral. + + Read the file, then it shows the following strings: + "host" - The mode is host now. + "peripheral" - The mode is peripheral now. 
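The chipidea and renesas_usb3 role attributes above follow the same read-then-write pattern. A hedged sketch of switching the renesas_usb3 DRD mode; the platform device name "ee020000.usb" is a placeholder, since the real name is board-specific.

/* Sketch: show the current role, then request a switch to peripheral. */
#include <stdio.h>

int main(void)
{
	const char *attr = "/sys/devices/platform/ee020000.usb/role";
	char buf[16];
	FILE *f = fopen(attr, "r");

	if (f) {
		if (fgets(buf, sizeof(buf), f))
			printf("current mode: %s", buf);
		fclose(f);
	}

	f = fopen(attr, "w");
	if (f) {
		fputs("peripheral\n", f);	/* or "host" */
		fclose(f);
	}
	return 0;
}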
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 164c1c76971f0962cf3a44bf9bc3afd4876597bb..85916f13d330b78f675ba198636b4c19d0b825f0 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -8,12 +8,11 @@ DOCBOOKS := z8530book.xml \ kernel-hacking.xml kernel-locking.xml \ - writing_usb_driver.xml networking.xml \ - kernel-api.xml filesystems.xml lsm.xml kgdb.xml \ - gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ - genericirq.xml s390-drivers.xml scsi.xml \ - sh.xml w1.xml \ - writing_musb_glue_layer.xml + networking.xml \ + filesystems.xml lsm.xml kgdb.xml \ + libata.xml mtdnand.xml librs.xml rapidio.xml \ + s390-drivers.xml scsi.xml \ + sh.xml w1.xml ifeq ($(DOCBOOKS),) @@ -62,11 +61,14 @@ MAN := $(patsubst %.xml, %.9, $(BOOKS)) mandocs: $(MAN) find $(obj)/man -name '*.9' | xargs gzip -nf +# Default location for installed man pages +export INSTALL_MAN_PATH = $(objtree)/usr + installmandocs: mandocs - mkdir -p /usr/local/man/man9/ + mkdir -p $(INSTALL_MAN_PATH)/man/man9/ find $(obj)/man -name '*.9.gz' -printf '%h %f\n' | \ sort -k 2 -k 1 | uniq -f 1 | sed -e 's: :/:' | \ - xargs install -m 644 -t /usr/local/man/man9/ + xargs install -m 644 -t $(INSTALL_MAN_PATH)/man/man9/ # no-op for the DocBook toolchain epubdocs: @@ -238,7 +240,9 @@ dochelp: @echo ' psdocs - Postscript' @echo ' xmldocs - XML DocBook' @echo ' mandocs - man pages' - @echo ' installmandocs - install man pages generated by mandocs' + @echo ' installmandocs - install man pages generated by mandocs to INSTALL_MAN_PATH'; \ + echo ' (default: $(INSTALL_MAN_PATH))'; \ + echo '' @echo ' cleandocs - clean all generated DocBook files' @echo @echo ' make DOCBOOKS="s1.xml s2.xml" [target] Generate only docs s1.xml s2.xml' diff --git a/Documentation/DocBook/gadget.tmpl b/Documentation/DocBook/gadget.tmpl deleted file mode 100644 index 64162922117651344d698ffd53ece06eafca3324..0000000000000000000000000000000000000000 --- a/Documentation/DocBook/gadget.tmpl +++ /dev/null @@ -1,793 +0,0 @@ - - - - - - USB Gadget API for Linux - 20 August 2004 - 20 August 2004 - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - - - 2003-2004 - David Brownell - - - - David - Brownell - -
dbrownell@users.sourceforge.net
- - - -Introduction - -This document presents a Linux-USB "Gadget" -kernel mode -API, for use within peripherals and other USB devices -that embed Linux. -It provides an overview of the API structure, -and shows how that fits into a system development project. -This is the first such API released on Linux to address -a number of important problems, including: - - - Supports USB 2.0, for high speed devices which - can stream data at several dozen megabytes per second. - - Handles devices with dozens of endpoints just as - well as ones with just two fixed-function ones. Gadget drivers - can be written so they're easy to port to new hardware. - - Flexible enough to expose more complex USB device - capabilities such as multiple configurations, multiple interfaces, - composite devices, - and alternate interface settings. - - USB "On-The-Go" (OTG) support, in conjunction - with updates to the Linux-USB host side. - - Sharing data structures and API models with the - Linux-USB host side API. This helps the OTG support, and - looks forward to more-symmetric frameworks (where the same - I/O model is used by both host and device side drivers). - - Minimalist, so it's easier to support new device - controller hardware. I/O processing doesn't imply large - demands for memory or CPU resources. - - - - -Most Linux developers will not be able to use this API, since they -have USB "host" hardware in a PC, workstation, or server. -Linux users with embedded systems are more likely to -have USB peripheral hardware. -To distinguish drivers running inside such hardware from the -more familiar Linux "USB device drivers", -which are host side proxies for the real USB devices, -a different term is used: -the drivers inside the peripherals are "USB gadget drivers". -In USB protocol interactions, the device driver is the master -(or "client driver") -and the gadget driver is the slave (or "function driver"). - - -The gadget API resembles the host side Linux-USB API in that both -use queues of request objects to package I/O buffers, and those requests -may be submitted or canceled. -They share common definitions for the standard USB -Chapter 9 messages, structures, and constants. -Also, both APIs bind and unbind drivers to devices. -The APIs differ in detail, since the host side's current -URB framework exposes a number of implementation details -and assumptions that are inappropriate for a gadget API. -While the model for control transfers and configuration -management is necessarily different (one side is a hardware-neutral master, -the other is a hardware-aware slave), the endpoint I/0 API used here -should also be usable for an overhead-reduced host side API. - - - - -Structure of Gadget Drivers - -A system running inside a USB peripheral -normally has at least three layers inside the kernel to handle -USB protocol processing, and may have additional layers in -user space code. -The "gadget" API is used by the middle layer to interact -with the lowest level (which directly handles hardware). - - -In Linux, from the bottom up, these layers are: - - - - - - USB Controller Driver - - - This is the lowest software level. - It is the only layer that talks to hardware, - through registers, fifos, dma, irqs, and the like. - The <linux/usb/gadget.h> API abstracts - the peripheral controller endpoint hardware. - That hardware is exposed through endpoint objects, which accept - streams of IN/OUT buffers, and through callbacks that interact - with gadget drivers. 
- Since normal USB devices only have one upstream - port, they only have one of these drivers. - The controller driver can support any number of different - gadget drivers, but only one of them can be used at a time. - - - Examples of such controller hardware include - the PCI-based NetChip 2280 USB 2.0 high speed controller, - the SA-11x0 or PXA-25x UDC (found within many PDAs), - and a variety of other products. - - - - - - Gadget Driver - - - The lower boundary of this driver implements hardware-neutral - USB functions, using calls to the controller driver. - Because such hardware varies widely in capabilities and restrictions, - and is used in embedded environments where space is at a premium, - the gadget driver is often configured at compile time - to work with endpoints supported by one particular controller. - Gadget drivers may be portable to several different controllers, - using conditional compilation. - (Recent kernels substantially simplify the work involved in - supporting new hardware, by autoconfiguring - endpoints automatically for many bulk-oriented drivers.) - Gadget driver responsibilities include: - - - handling setup requests (ep0 protocol responses) - possibly including class-specific functionality - - returning configuration and string descriptors - - (re)setting configurations and interface - altsettings, including enabling and configuring endpoints - - handling life cycle events, such as managing - bindings to hardware, - USB suspend/resume, remote wakeup, - and disconnection from the USB host. - - managing IN and OUT transfers on all currently - enabled endpoints - - - - - Such drivers may be modules of proprietary code, although - that approach is discouraged in the Linux community. - - - - - Upper Level - - - Most gadget drivers have an upper boundary that connects - to some Linux driver or framework in Linux. - Through that boundary flows the data which the gadget driver - produces and/or consumes through protocol transfers over USB. - Examples include: - - - user mode code, using generic (gadgetfs) - or application specific files in - /dev - - networking subsystem (for network gadgets, - like the CDC Ethernet Model gadget driver) - - data capture drivers, perhaps video4Linux or - a scanner driver; or test and measurement hardware. - - input subsystem (for HID gadgets) - - sound subsystem (for audio gadgets) - - file system (for PTP gadgets) - - block i/o subsystem (for usb-storage gadgets) - - ... and more - - - - - Additional Layers - - - Other layers may exist. - These could include kernel layers, such as network protocol stacks, - as well as user mode applications building on standard POSIX - system call APIs such as - open(), close(), - read() and write(). - On newer systems, POSIX Async I/O calls may be an option. - Such user mode code will not necessarily be subject to - the GNU General Public License (GPL). - - - - - - -OTG-capable systems will also need to include a standard Linux-USB -host side stack, -with usbcore, -one or more Host Controller Drivers (HCDs), -USB Device Drivers to support -the OTG "Targeted Peripheral List", -and so forth. -There will also be an OTG Controller Driver, -which is visible to gadget and device driver developers only indirectly. -That helps the host and device side USB controllers implement the -two new OTG protocols (HNP and SRP). -Roles switch (host to peripheral, or vice versa) using HNP -during USB suspend processing, and SRP can be viewed as a -more battery-friendly kind of device wakeup protocol. 
- - -Over time, reusable utilities are evolving to help make some -gadget driver tasks simpler. -For example, building configuration descriptors from vectors of -descriptors for the configurations interfaces and endpoints is -now automated, and many drivers now use autoconfiguration to -choose hardware endpoints and initialize their descriptors. - -A potential example of particular interest -is code implementing standard USB-IF protocols for -HID, networking, storage, or audio classes. -Some developers are interested in KDB or KGDB hooks, to let -target hardware be remotely debugged. -Most such USB protocol code doesn't need to be hardware-specific, -any more than network protocols like X11, HTTP, or NFS are. -Such gadget-side interface drivers should eventually be combined, -to implement composite devices. - - - - - -Kernel Mode Gadget API - -Gadget drivers declare themselves through a -struct usb_gadget_driver, which is responsible for -most parts of enumeration for a struct usb_gadget. -The response to a set_configuration usually involves -enabling one or more of the struct usb_ep objects -exposed by the gadget, and submitting one or more -struct usb_request buffers to transfer data. -Understand those four data types, and their operations, and -you will understand how this API works. - - -Incomplete Data Type Descriptions - -This documentation was prepared using the standard Linux -kernel docproc tool, which turns text -and in-code comments into SGML DocBook and then into usable -formats such as HTML or PDF. -Other than the "Chapter 9" data types, most of the significant -data types and functions are described here. - - -However, docproc does not understand all the C constructs -that are used, so some relevant information is likely omitted from -what you are reading. -One example of such information is endpoint autoconfiguration. -You'll have to read the header file, and use example source -code (such as that for "Gadget Zero"), to fully understand the API. - - -The part of the API implementing some basic -driver capabilities is specific to the version of the -Linux kernel that's in use. -The 2.6 kernel includes a driver model -framework that has no analogue on earlier kernels; -so those parts of the gadget API are not fully portable. -(They are implemented on 2.4 kernels, but in a different way.) -The driver model state is another part of this API that is -ignored by the kerneldoc tools. - - - -The core API does not expose -every possible hardware feature, only the most widely available ones. -There are significant hardware features, such as device-to-device DMA -(without temporary storage in a memory buffer) -that would be added using hardware-specific APIs. - - -This API allows drivers to use conditional compilation to handle -endpoint capabilities of different hardware, but doesn't require that. -Hardware tends to have arbitrary restrictions, relating to -transfer types, addressing, packet sizes, buffering, and availability. -As a rule, such differences only matter for "endpoint zero" logic -that handles device configuration and management. -The API supports limited run-time -detection of capabilities, through naming conventions for endpoints. -Many drivers will be able to at least partially autoconfigure -themselves. 
-In particular, driver init sections will often have endpoint -autoconfiguration logic that scans the hardware's list of endpoints -to find ones matching the driver requirements -(relying on those conventions), to eliminate some of the most -common reasons for conditional compilation. - - -Like the Linux-USB host side API, this API exposes -the "chunky" nature of USB messages: I/O requests are in terms -of one or more "packets", and packet boundaries are visible to drivers. -Compared to RS-232 serial protocols, USB resembles -synchronous protocols like HDLC -(N bytes per frame, multipoint addressing, host as the primary -station and devices as secondary stations) -more than asynchronous ones -(tty style: 8 data bits per frame, no parity, one stop bit). -So for example the controller drivers won't buffer -two single byte writes into a single two-byte USB IN packet, -although gadget drivers may do so when they implement -protocols where packet boundaries (and "short packets") -are not significant. - - -Driver Life Cycle - -Gadget drivers make endpoint I/O requests to hardware without -needing to know many details of the hardware, but driver -setup/configuration code needs to handle some differences. -Use the API like this: - - - - -Register a driver for the particular device side -usb controller hardware, -such as the net2280 on PCI (USB 2.0), -sa11x0 or pxa25x as found in Linux PDAs, -and so on. -At this point the device is logically in the USB ch9 initial state -("attached"), drawing no power and not usable -(since it does not yet support enumeration). -Any host should not see the device, since it's not -activated the data line pullup used by the host to -detect a device, even if VBUS power is available. - - -Register a gadget driver that implements some higher level -device function. That will then bind() to a usb_gadget, which -activates the data line pullup sometime after detecting VBUS. - - -The hardware driver can now start enumerating. -The steps it handles are to accept USB power and set_address requests. -Other steps are handled by the gadget driver. -If the gadget driver module is unloaded before the host starts to -enumerate, steps before step 7 are skipped. - - -The gadget driver's setup() call returns usb descriptors, -based both on what the bus interface hardware provides and on the -functionality being implemented. -That can involve alternate settings or configurations, -unless the hardware prevents such operation. -For OTG devices, each configuration descriptor includes -an OTG descriptor. - - -The gadget driver handles the last step of enumeration, -when the USB host issues a set_configuration call. -It enables all endpoints used in that configuration, -with all interfaces in their default settings. -That involves using a list of the hardware's endpoints, enabling each -endpoint according to its descriptor. -It may also involve using usb_gadget_vbus_draw -to let more power be drawn from VBUS, as allowed by that configuration. -For OTG devices, setting a configuration may also involve reporting -HNP capabilities through a user interface. - - -Do real work and perform data transfers, possibly involving -changes to interface settings or switching to new configurations, until the -device is disconnect()ed from the host. -Queue any number of transfer requests to each endpoint. -It may be suspended and resumed several times before being disconnected. -On disconnect, the drivers go back to step 3 (above). 
- - -When the gadget driver module is being unloaded, -the driver unbind() callback is issued. That lets the controller -driver be unloaded. - - - - -Drivers will normally be arranged so that just loading the -gadget driver module (or statically linking it into a Linux kernel) -allows the peripheral device to be enumerated, but some drivers -will defer enumeration until some higher level component (like -a user mode daemon) enables it. -Note that at this lowest level there are no policies about how -ep0 configuration logic is implemented, -except that it should obey USB specifications. -Such issues are in the domain of gadget drivers, -including knowing about implementation constraints -imposed by some USB controllers -or understanding that composite devices might happen to -be built by integrating reusable components. - - -Note that the lifecycle above can be slightly different -for OTG devices. -Other than providing an additional OTG descriptor in each -configuration, only the HNP-related differences are particularly -visible to driver code. -They involve reporting requirements during the SET_CONFIGURATION -request, and the option to invoke HNP during some suspend callbacks. -Also, SRP changes the semantics of -usb_gadget_wakeup -slightly. - - - - -USB 2.0 Chapter 9 Types and Constants - -Gadget drivers -rely on common USB structures and constants -defined in the -<linux/usb/ch9.h> -header file, which is standard in Linux 2.6 kernels. -These are the same types and constants used by host -side drivers (and usbcore). - - -!Iinclude/linux/usb/ch9.h - - -Core Objects and Methods - -These are declared in -<linux/usb/gadget.h>, -and are used by gadget drivers to interact with -USB peripheral controller drivers. - - - - -!Iinclude/linux/usb/gadget.h - - -Optional Utilities - -The core API is sufficient for writing a USB Gadget Driver, -but some optional utilities are provided to simplify common tasks. -These utilities include endpoint autoconfiguration. - - -!Edrivers/usb/gadget/usbstring.c -!Edrivers/usb/gadget/config.c - - - -Composite Device Framework - -The core API is sufficient for writing drivers for composite -USB devices (with more than one function in a given configuration), -and also multi-configuration devices (also more than one function, -but not necessarily sharing a given configuration). -There is however an optional framework which makes it easier to -reuse and combine functions. - - -Devices using this framework provide a struct -usb_composite_driver, which in turn provides one or -more struct usb_configuration instances. -Each such configuration includes at least one -struct usb_function, which packages a user -visible role such as "network link" or "mass storage device". -Management functions may also exist, such as "Device Firmware -Upgrade". - - -!Iinclude/linux/usb/composite.h -!Edrivers/usb/gadget/composite.c - - - -Composite Device Functions - -At this writing, a few of the current gadget drivers have -been converted to this framework. -Near-term plans include converting all of them, except for "gadgetfs". - - -!Edrivers/usb/gadget/function/f_acm.c -!Edrivers/usb/gadget/function/f_ecm.c -!Edrivers/usb/gadget/function/f_subset.c -!Edrivers/usb/gadget/function/f_obex.c -!Edrivers/usb/gadget/function/f_serial.c - - - - - - -Peripheral Controller Drivers - -The first hardware supporting this API was the NetChip 2280 -controller, which supports USB 2.0 high speed and is based on PCI. -This is the net2280 driver module. 
-The driver supports Linux kernel versions 2.4 and 2.6; -contact NetChip Technologies for development boards and product -information. - - -Other hardware working in the "gadget" framework includes: -Intel's PXA 25x and IXP42x series processors -(pxa2xx_udc), -Toshiba TC86c001 "Goku-S" (goku_udc), -Renesas SH7705/7727 (sh_udc), -MediaQ 11xx (mq11xx_udc), -Hynix HMS30C7202 (h7202_udc), -National 9303/4 (n9604_udc), -Texas Instruments OMAP (omap_udc), -Sharp LH7A40x (lh7a40x_udc), -and more. -Most of those are full speed controllers. - - -At this writing, there are people at work on drivers in -this framework for several other USB device controllers, -with plans to make many of them be widely available. - - - - -A partial USB simulator, -the dummy_hcd driver, is available. -It can act like a net2280, a pxa25x, or an sa11x0 in terms -of available endpoints and device speeds; and it simulates -control, bulk, and to some extent interrupt transfers. -That lets you develop some parts of a gadget driver on a normal PC, -without any special hardware, and perhaps with the assistance -of tools such as GDB running with User Mode Linux. -At least one person has expressed interest in adapting that -approach, hooking it up to a simulator for a microcontroller. -Such simulators can help debug subsystems where the runtime hardware -is unfriendly to software development, or is not yet available. - - -Support for other controllers is expected to be developed -and contributed -over time, as this driver framework evolves. - - - - -Gadget Drivers - -In addition to Gadget Zero -(used primarily for testing and development with drivers -for usb controller hardware), other gadget drivers exist. - - -There's an ethernet gadget -driver, which implements one of the most useful -Communications Device Class (CDC) models. -One of the standards for cable modem interoperability even -specifies the use of this ethernet model as one of two -mandatory options. -Gadgets using this code look to a USB host as if they're -an Ethernet adapter. -It provides access to a network where the gadget's CPU is one host, -which could easily be bridging, routing, or firewalling -access to other networks. -Since some hardware can't fully implement the CDC Ethernet -requirements, this driver also implements a "good parts only" -subset of CDC Ethernet. -(That subset doesn't advertise itself as CDC Ethernet, -to avoid creating problems.) - - -Support for Microsoft's RNDIS -protocol has been contributed by Pengutronix and Auerswald GmbH. -This is like CDC Ethernet, but it runs on more slightly USB hardware -(but less than the CDC subset). -However, its main claim to fame is being able to connect directly to -recent versions of Windows, using drivers that Microsoft bundles -and supports, making it much simpler to network with Windows. - - -There is also support for user mode gadget drivers, -using gadgetfs. -This provides a User Mode API that presents -each endpoint as a single file descriptor. I/O is done using -normal read() and read() calls. -Familiar tools like GDB and pthreads can be used to -develop and debug user mode drivers, so that once a robust -controller driver is available many applications for it -won't require new kernel mode software. -Linux 2.6 Async I/O (AIO) -support is available, so that user mode software -can stream data with only slightly more overhead -than a kernel driver. - - -There's a USB Mass Storage class driver, which provides -a different solution for interoperability with systems such -as MS-Windows and MacOS. 
-That Mass Storage driver uses a -file or block device as backing store for a drive, -like the loop driver. -The USB host uses the BBB, CB, or CBI versions of the mass -storage class specification, using transparent SCSI commands -to access the data from the backing store. - - -There's a "serial line" driver, useful for TTY style -operation over USB. -The latest version of that driver supports CDC ACM style -operation, like a USB modem, and so on most hardware it can -interoperate easily with MS-Windows. -One interesting use of that driver is in boot firmware (like a BIOS), -which can sometimes use that model with very small systems without -real serial lines. - - -Support for other kinds of gadget is expected to -be developed and contributed -over time, as this driver framework evolves. - - - - -USB On-The-GO (OTG) - -USB OTG support on Linux 2.6 was initially developed -by Texas Instruments for -OMAP 16xx and 17xx -series processors. -Other OTG systems should work in similar ways, but the -hardware level details could be very different. - - -Systems need specialized hardware support to implement OTG, -notably including a special Mini-AB jack -and associated transceiver to support Dual-Role -operation: -they can act either as a host, using the standard -Linux-USB host side driver stack, -or as a peripheral, using this "gadget" framework. -To do that, the system software relies on small additions -to those programming interfaces, -and on a new internal component (here called an "OTG Controller") -affecting which driver stack connects to the OTG port. -In each role, the system can re-use the existing pool of -hardware-neutral drivers, layered on top of the controller -driver interfaces (usb_bus or -usb_gadget). -Such drivers need at most minor changes, and most of the calls -added to support OTG can also benefit non-OTG products. - - - - Gadget drivers test the is_otg - flag, and use it to determine whether or not to include - an OTG descriptor in each of their configurations. - - Gadget drivers may need changes to support the - two new OTG protocols, exposed in new gadget attributes - such as b_hnp_enable flag. - HNP support should be reported through a user interface - (two LEDs could suffice), and is triggered in some cases - when the host suspends the peripheral. - SRP support can be user-initiated just like remote wakeup, - probably by pressing the same button. - - On the host side, USB device drivers need - to be taught to trigger HNP at appropriate moments, using - usb_suspend_device(). - That also conserves battery power, which is useful even - for non-OTG configurations. - - Also on the host side, a driver must support the - OTG "Targeted Peripheral List". That's just a whitelist, - used to reject peripherals not supported with a given - Linux OTG host. - This whitelist is product-specific; - each product must modify otg_whitelist.h - to match its interoperability specification. - - - Non-OTG Linux hosts, like PCs and workstations, - normally have some solution for adding drivers, so that - peripherals that aren't recognized can eventually be supported. - That approach is unreasonable for consumer products that may - never have their firmware upgraded, and where it's usually - unrealistic to expect traditional PC/workstation/server kinds - of support model to work. - For example, it's often impractical to change device firmware - once the product has been distributed, so driver bugs can't - normally be fixed if they're found after shipment. 
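To make the is_otg test above concrete, here is a minimal sketch (not part of the original text) of how a gadget driver might include an OTG descriptor only on OTG-capable hardware. The setup_config_descriptors() helper and the otg_desc array are hypothetical; usb_otg_descriptor, USB_DT_OTG, USB_OTG_SRP, USB_OTG_HNP and gadget_is_otg() come from the standard <linux/usb/ch9.h>, <linux/usb/gadget.h> and <linux/usb/composite.h> headers.

#include <linux/usb/ch9.h>
#include <linux/usb/gadget.h>
#include <linux/usb/composite.h>

/* Advertise SRP and HNP support; only used on OTG hardware. */
static struct usb_otg_descriptor otg_descriptor = {
	.bLength		= sizeof(otg_descriptor),
	.bDescriptorType	= USB_DT_OTG,
	.bmAttributes		= USB_OTG_SRP | USB_OTG_HNP,
};

static const struct usb_descriptor_header *otg_desc[] = {
	(struct usb_descriptor_header *) &otg_descriptor,
	NULL,
};

/* Hypothetical helper called while a configuration is being set up. */
static void setup_config_descriptors(struct usb_gadget *gadget,
				     struct usb_configuration *config)
{
	/* Only OTG devices carry an OTG descriptor in each configuration. */
	if (gadget_is_otg(gadget))
		config->descriptors = otg_desc;
}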
- - - - -Additional changes are needed below those hardware-neutral -usb_bus and usb_gadget -driver interfaces; those aren't discussed here in any detail. -Those affect the hardware-specific code for each USB Host or Peripheral -controller, and how the HCD initializes (since OTG can be active only -on a single port). -They also involve what may be called an OTG Controller -Driver, managing the OTG transceiver and the OTG state -machine logic as well as much of the root hub behavior for the -OTG port. -The OTG controller driver needs to activate and deactivate USB -controllers depending on the relevant device role. -Some related changes were needed inside usbcore, so that it -can identify OTG-capable devices and respond appropriately -to HNP or SRP protocols. - - - - -
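To tie together the composite framework described earlier, here is a hedged sketch of a skeletal usb_composite_driver providing one configuration; all my_* names and the zeroed ID values are hypothetical placeholders, while the structures, usb_add_config() and module_usb_composite_driver() are the interfaces declared in <linux/usb/composite.h>.

#include <linux/usb/composite.h>

static struct usb_device_descriptor my_device_desc = {
	.bLength		= USB_DT_DEVICE_SIZE,
	.bDescriptorType	= USB_DT_DEVICE,
	.bcdUSB			= cpu_to_le16(0x0200),
	.idVendor		= cpu_to_le16(0x0000),	/* hypothetical */
	.idProduct		= cpu_to_le16(0x0000),	/* hypothetical */
};

static int my_config_bind(struct usb_configuration *c)
{
	/* A real driver would usb_add_function() each struct
	 * usb_function packaging a role ("network link", ...) here. */
	return 0;
}

static struct usb_configuration my_config = {
	.label			= "my-config",
	.bConfigurationValue	= 1,
	.bmAttributes		= USB_CONFIG_ATT_ONE,
};

static int my_bind(struct usb_composite_dev *cdev)
{
	return usb_add_config(cdev, &my_config, my_config_bind);
}

static struct usb_composite_driver my_driver = {
	.name	= "my-gadget",
	.dev	= &my_device_desc,
	.bind	= my_bind,
};

module_usb_composite_driver(my_driver);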
- diff --git a/Documentation/DocBook/genericirq.tmpl b/Documentation/DocBook/genericirq.tmpl deleted file mode 100644 index 59fb5c077541b15f6f319a17cbf259195c9849f0..0000000000000000000000000000000000000000 --- a/Documentation/DocBook/genericirq.tmpl +++ /dev/null @@ -1,520 +0,0 @@ - - - - - - Linux generic IRQ handling - - - - Thomas - Gleixner - -
- tglx@linutronix.de -
-
-
- - Ingo - Molnar - -
- mingo@elte.hu -
-
-
-
- - - 2005-2010 - Thomas Gleixner - - - 2005-2006 - Ingo Molnar - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License version 2 as published by the Free Software Foundation. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - -
-
-2005-2010
-Thomas Gleixner
-
-2005-2006
-Ingo Molnar
-
-
-This documentation is free software; you can redistribute
-it and/or modify it under the terms of the GNU General Public
-License version 2 as published by the Free Software Foundation.
-
-
-This program is distributed in the hope that it will be
-useful, but WITHOUT ANY WARRANTY; without even the implied
-warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-See the GNU General Public License for more details.
-
-
-You should have received a copy of the GNU General Public
-License along with this program; if not, write to the Free
-Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
-MA 02111-1307 USA
-
-
-For more details see the file COPYING in the source
-distribution of Linux.
-
-
-Introduction
-
-The generic interrupt handling layer is designed to provide a
-complete abstraction of interrupt handling for device drivers.
-It is able to handle all the different types of interrupt controller
-hardware. Device drivers use generic API functions to request, enable,
-disable and free interrupts. The drivers do not have to know anything
-about interrupt hardware details, so they can be used on different
-platforms without code changes.
-
-This documentation is provided for developers who want to implement
-an interrupt subsystem for their architecture on top of the generic
-IRQ handling layer.
-
-
-Rationale
-
-The original implementation of interrupt handling in Linux uses
-the __do_IRQ() super-handler, which is able to deal with every
-type of interrupt logic.
-
-Originally, Russell King identified different types of handlers to
-build a quite universal set for the ARM interrupt handler
-implementation in Linux 2.5/2.6. He distinguished between:
-  Level type
-  Edge type
-  Simple type
-During the implementation we identified another type:
-  Fast EOI type
-In the SMP world of the __do_IRQ() super-handler another type
-was identified:
-  Per CPU type
-
-This split implementation of high-level IRQ handlers allows us to
-optimize the flow of the interrupt handling for each specific
-interrupt type. This reduces complexity in that particular code path
-and allows the optimized handling of a given type.
-
-The original general IRQ implementation used hw_interrupt_type
-structures and their ->ack(), ->end() [etc.] callbacks to
-differentiate the flow control in the super-handler. This leads to
-a mix of flow logic and low-level hardware logic, and it also leads
-to unnecessary code duplication: for example in i386, there is an
-ioapic_level_irq and an ioapic_edge_irq IRQ-type which share many
-of the low-level details but have different flow handling.
-
-A more natural abstraction is the clean separation of the
-'irq flow' and the 'chip details'.
-
-Analysing a couple of architectures' IRQ subsystem implementations
-reveals that most of them can use a generic set of 'irq flow'
-methods and only need to add the chip-level specific code.
-The separation is also valuable for (sub)architectures
-which need specific quirks in the IRQ flow itself but not in the
-chip details - and thus provides a more transparent IRQ subsystem
-design.
-
-Each interrupt descriptor is assigned its own high-level flow
-handler, which is normally one of the generic
-implementations. (This high-level flow handler implementation also
-makes it simple to provide demultiplexing handlers which can be
-found in embedded platforms on various architectures.)
-
-The separation makes the generic interrupt handling layer more
-flexible and extensible. For example, a (sub)architecture can
-use a generic IRQ-flow implementation for 'level type' interrupts
-and add a (sub)architecture specific 'edge type' implementation.
-
-To make the transition to the new model easier and prevent the
-breakage of existing implementations, the __do_IRQ() super-handler
-is still available. This leads to a kind of duality for the time
-being. Over time the new model should be used in more and more
-architectures, as it enables smaller and cleaner IRQ subsystems.
-It has been deprecated for three years now and is about to be removed.
-
-
-Known Bugs And Assumptions
-
-None (knock on wood).
-
-Abstraction layers
-
-There are three main levels of abstraction in the interrupt code:
-  High-level driver API
-  High-level IRQ flow handlers
-  Chip-level hardware encapsulation
-
-Interrupt control flow
-
-Each interrupt is described by an interrupt descriptor structure
-irq_desc. The interrupt is referenced by an 'unsigned int' numeric
-value which selects the corresponding interrupt description structure
-in the descriptor structures array.
-The descriptor structure contains status information and pointers
-to the interrupt flow method and the interrupt chip structure
-which are assigned to this interrupt.
-
-Whenever an interrupt triggers, the low-level architecture code calls
-into the generic interrupt code by calling desc->handle_irq().
-This high-level IRQ handling function only uses desc->irq_data.chip
-primitives referenced by the assigned chip descriptor structure.
-
-High-level Driver API
-
-The high-level Driver API consists of the following functions:
-  request_irq()
-  free_irq()
-  disable_irq()
-  enable_irq()
-  disable_irq_nosync() (SMP only)
-  synchronize_irq() (SMP only)
-  irq_set_irq_type()
-  irq_set_irq_wake()
-  irq_set_handler_data()
-  irq_set_chip()
-  irq_set_chip_data()
-See the autogenerated function documentation for details.
-
-High-level IRQ flow handlers
-
-The generic layer provides a set of pre-defined irq-flow methods:
-  handle_level_irq
-  handle_edge_irq
-  handle_fasteoi_irq
-  handle_simple_irq
-  handle_percpu_irq
-  handle_edge_eoi_irq
-  handle_bad_irq
-The interrupt flow handlers (either pre-defined or architecture
-specific) are assigned to specific interrupts by the architecture
-either during bootup or during device initialization.
-
-Default flow implementations
-
-Helper functions
-
-The helper functions call the chip primitives and
-are used by the default flow implementations.
-The following helper functions are implemented (simplified excerpt):
-
-default_enable(struct irq_data *data)
-{
-	desc->irq_data.chip->irq_unmask(data);
-}
-
-default_disable(struct irq_data *data)
-{
-	if (!delay_disable(data))
-		desc->irq_data.chip->irq_mask(data);
-}
-
-default_ack(struct irq_data *data)
-{
-	chip->irq_ack(data);
-}
-
-default_mask_ack(struct irq_data *data)
-{
-	if (chip->irq_mask_ack) {
-		chip->irq_mask_ack(data);
-	} else {
-		chip->irq_mask(data);
-		chip->irq_ack(data);
-	}
-}
-
-noop(struct irq_data *data)
-{
-}
-
-Default flow handler implementations
-
-Default Level IRQ flow handler
-
-handle_level_irq provides a generic implementation
-for level-triggered interrupts.
-
-The following control flow is implemented (simplified excerpt):
-
-desc->irq_data.chip->irq_mask_ack();
-handle_irq_event(desc->action);
-desc->irq_data.chip->irq_unmask();
-
-Default Fast EOI IRQ flow handler
-
-handle_fasteoi_irq provides a generic implementation
-for interrupts which only need an EOI at the end of
-the handler.
-
-The following control flow is implemented (simplified excerpt):
-
-handle_irq_event(desc->action);
-desc->irq_data.chip->irq_eoi();
-
-Default Edge IRQ flow handler
-
-handle_edge_irq provides a generic implementation
-for edge-triggered interrupts.
-
-The following control flow is implemented (simplified excerpt):
-
-if (desc->status & running) {
-	desc->irq_data.chip->irq_mask_ack();
-	desc->status |= pending | masked;
-	return;
-}
-desc->irq_data.chip->irq_ack();
-desc->status |= running;
-do {
-	if (desc->status & masked)
-		desc->irq_data.chip->irq_unmask();
-	desc->status &= ~pending;
-	handle_irq_event(desc->action);
-} while (status & pending);
-desc->status &= ~running;
-
-Default simple IRQ flow handler
-
-handle_simple_irq provides a generic implementation
-for simple interrupts.
-
-Note: The simple flow handler does not call any
-handler/chip primitives.
-
-The following control flow is implemented (simplified excerpt):
-
-handle_irq_event(desc->action);
-
-Default per CPU flow handler
-
-handle_percpu_irq provides a generic implementation
-for per CPU interrupts.
-
-Per CPU interrupts are only available on SMP and
-the handler provides a simplified version without
-locking.
-
-The following control flow is implemented (simplified excerpt):
-
-if (desc->irq_data.chip->irq_ack)
-	desc->irq_data.chip->irq_ack();
-handle_irq_event(desc->action);
-if (desc->irq_data.chip->irq_eoi)
-	desc->irq_data.chip->irq_eoi();
-
-EOI Edge IRQ flow handler
-
-handle_edge_eoi_irq provides an abomination of the edge
-handler which is solely used to tame a badly wrecked
-irq controller on powerpc/cell.
-
-Bad IRQ flow handler
-
-handle_bad_irq is used for spurious interrupts which
-have no real handler assigned.
-
-Quirks and optimizations
-
-The generic functions are intended for 'clean' architectures and chips,
-which have no platform-specific IRQ handling quirks. If an architecture
-needs to implement quirks on the 'flow' level then it can do so by
-overriding the high-level irq-flow handler.
-
-Delayed interrupt disable
-
-This per interrupt selectable feature, which was introduced by Russell
-King in the ARM interrupt implementation, does not mask an interrupt
-at the hardware level when disable_irq() is called. The interrupt is
-kept enabled and is masked in the flow handler when an interrupt event
-happens. This prevents losing edge interrupts on hardware which does
-not store an edge interrupt event while the interrupt is disabled at
-the hardware level. When an interrupt arrives while the IRQ_DISABLED
-flag is set, then the interrupt is masked at the hardware level and
-the IRQ_PENDING bit is set. When the interrupt is re-enabled by
-enable_irq() the pending bit is checked and if it is set, the
-interrupt is resent either via hardware or by a software resend
-mechanism. (It's necessary to enable CONFIG_HARDIRQS_SW_RESEND when
-you want to use the delayed interrupt disable feature and your
-hardware is not capable of retriggering an interrupt.)
-The delayed interrupt disable is not configurable.
-
-Chip-level hardware encapsulation
-
-The chip-level hardware descriptor structure irq_chip
-contains all the direct chip relevant functions, which
-can be utilized by the irq flow implementations.
-  irq_ack()
-  irq_mask_ack() - Optional, recommended for performance
-  irq_mask()
-  irq_unmask()
-  irq_eoi() - Optional, required for EOI flow handlers
-  irq_retrigger() - Optional
-  irq_set_type() - Optional
-  irq_set_wake() - Optional
-These primitives are strictly intended to mean what they say: ack means
-ACK, masking means masking of an IRQ line, etc. It is up to the flow
-handler(s) to use these basic units of low-level functionality.
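To make the chip/flow separation concrete, here is a hedged sketch (not part of the original document) of a minimal memory-mapped irq_chip wired to the level-type flow handler; the my_* names, register offsets and write-one-to-clear acknowledge behaviour are hypothetical, while struct irq_chip, irqd_to_hwirq(), irq_set_chip_and_handler() and handle_level_irq are the generic-layer interfaces described above.

#include <linux/irq.h>
#include <linux/io.h>
#include <linux/bitops.h>

/* Hypothetical register block of an interrupt controller. */
static void __iomem *my_base;
#define MY_MASK_REG	0x04
#define MY_ACK_REG	0x08

static void my_irq_mask(struct irq_data *d)
{
	/* Set the per-line mask bit; hwirq identifies the line. */
	u32 val = readl(my_base + MY_MASK_REG);

	writel(val | BIT(irqd_to_hwirq(d)), my_base + MY_MASK_REG);
}

static void my_irq_unmask(struct irq_data *d)
{
	u32 val = readl(my_base + MY_MASK_REG);

	writel(val & ~BIT(irqd_to_hwirq(d)), my_base + MY_MASK_REG);
}

static void my_irq_ack(struct irq_data *d)
{
	/* Write-one-to-clear acknowledge, a common hardware idiom. */
	writel(BIT(irqd_to_hwirq(d)), my_base + MY_ACK_REG);
}

static struct irq_chip my_irq_chip = {
	.name		= "my-intc",
	.irq_mask	= my_irq_mask,
	.irq_unmask	= my_irq_unmask,
	.irq_ack	= my_irq_ack,
};

/* During controller setup, each Linux irq number is bound to the
 * chip and to the level-type flow handler from the generic layer. */
static void my_intc_init_one(unsigned int irq)
{
	irq_set_chip_and_handler(irq, &my_irq_chip, handle_level_irq);
}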
- - - - - - __do_IRQ entry point - - The original implementation __do_IRQ() was an alternative entry - point for all types of interrupts. It no longer exists. - - - This handler turned out to be not suitable for all - interrupt hardware and was therefore reimplemented with split - functionality for edge/level/simple/percpu interrupts. This is not - only a functional optimization. It also shortens code paths for - interrupts. - - - - - Locking on SMP - - The locking of chip registers is up to the architecture that - defines the chip primitives. The per-irq structure is - protected via desc->lock, by the generic layer. - - - - - Generic interrupt chip - - To avoid copies of identical implementations of IRQ chips the - core provides a configurable generic interrupt chip - implementation. Developers should check carefully whether the - generic chip fits their needs before implementing the same - functionality slightly differently themselves. - -!Ekernel/irq/generic-chip.c - - - - Structures - - This chapter contains the autogenerated documentation of the structures which are - used in the generic IRQ layer. - -!Iinclude/linux/irq.h -!Iinclude/linux/interrupt.h - - - - Public Functions Provided - - This chapter contains the autogenerated documentation of the kernel API functions - which are exported. - -!Ekernel/irq/manage.c -!Ekernel/irq/chip.c - - - - Internal Functions Provided - - This chapter contains the autogenerated documentation of the internal functions. - -!Ikernel/irq/irqdesc.c -!Ikernel/irq/handle.c -!Ikernel/irq/chip.c - - - - Credits - - The following people have contributed to this document: - - Thomas Gleixnertglx@linutronix.de - Ingo Molnarmingo@elte.hu - - - -
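As a closing illustration of the high-level driver API listed earlier, here is a hedged sketch of the consumer side; struct mydev and its status register at offset 0 are hypothetical, while request_irq(), free_irq(), IRQF_SHARED and the irqreturn_t convention are the exported interfaces this document refers to.

#include <linux/interrupt.h>
#include <linux/io.h>

/* Hypothetical device context. */
struct mydev {
	void __iomem *regs;
	int irq;
};

static irqreturn_t my_handler(int irq, void *dev_id)
{
	struct mydev *dev = dev_id;

	/* Check whether our device raised this (possibly shared) line. */
	if (!readl(dev->regs))	/* hypothetical status register */
		return IRQ_NONE;

	/* ... handle the event ... */
	return IRQ_HANDLED;
}

static int my_setup(struct mydev *dev)
{
	/* The generic layer does the flow handling; the driver only
	 * supplies the handler and a cookie for free_irq() later. */
	return request_irq(dev->irq, my_handler, IRQF_SHARED, "mydev", dev);
}

static void my_teardown(struct mydev *dev)
{
	free_irq(dev->irq, dev);
}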
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl deleted file mode 100644 index ecfd0ea406619b0a2c76071b2024c7219b9c7a6a..0000000000000000000000000000000000000000 --- a/Documentation/DocBook/kernel-api.tmpl +++ /dev/null @@ -1,331 +0,0 @@ - - - - - - The Linux Kernel API - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - - - - - - - Data Types - Doubly Linked Lists -!Iinclude/linux/list.h - - - - - Basic C Library Functions - - - When writing drivers, you cannot in general use routines which are - from the C Library. Some of the functions have been found generally - useful and they are listed below. The behaviour of these functions - may vary slightly from those defined by ANSI, and these deviations - are noted in the text. - - - String Conversions -!Elib/vsprintf.c -!Finclude/linux/kernel.h kstrtol -!Finclude/linux/kernel.h kstrtoul -!Elib/kstrtox.c - - String Manipulation - -!Elib/string.c - - Bit Operations -!Iarch/x86/include/asm/bitops.h - - - - - Basic Kernel Library Functions - - - The Linux kernel provides more basic utility functions. - - - Bitmap Operations -!Elib/bitmap.c -!Ilib/bitmap.c - - - Command-line Parsing -!Elib/cmdline.c - - - CRC Functions -!Elib/crc7.c -!Elib/crc16.c -!Elib/crc-itu-t.c -!Elib/crc32.c -!Elib/crc-ccitt.c - - - idr/ida Functions -!Pinclude/linux/idr.h idr sync -!Plib/idr.c IDA description -!Elib/idr.c - - - - - Memory Management in Linux - The Slab Cache -!Iinclude/linux/slab.h -!Emm/slab.c -!Emm/util.c - - User Space Memory Access -!Iarch/x86/include/asm/uaccess_32.h -!Earch/x86/lib/usercopy_32.c - - More Memory Management Functions -!Emm/readahead.c -!Emm/filemap.c -!Emm/memory.c -!Emm/vmalloc.c -!Imm/page_alloc.c -!Emm/mempool.c -!Emm/dmapool.c -!Emm/page-writeback.c -!Emm/truncate.c - - - - - - Kernel IPC facilities - - IPC utilities -!Iipc/util.c - - - - - FIFO Buffer - kfifo interface -!Iinclude/linux/kfifo.h - - - - - relay interface support - - - Relay interface support - is designed to provide an efficient mechanism for tools and - facilities to relay large amounts of data from kernel space to - user space. - - - relay interface -!Ekernel/relay.c -!Ikernel/relay.c - - - - - Module Support - Module Loading -!Ekernel/kmod.c - - Inter Module support - - Refer to the file kernel/module.c for more information. 
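As a small illustration of the basic library facilities listed above, here is a hedged sketch combining the doubly linked list helpers with kstrtoul(); struct item and the decimal base are arbitrary choices made for the example.

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/slab.h>

/* Hypothetical list element. */
struct item {
	unsigned long val;
	struct list_head node;
};

static LIST_HEAD(my_list);

static int add_parsed_item(const char *s)
{
	struct item *it;
	unsigned long v;
	int ret;

	/* kstrtoul() rejects trailing garbage and reports overflow,
	 * which is why it is preferred over simple_strtoul(). */
	ret = kstrtoul(s, 10, &v);
	if (ret)
		return ret;

	it = kmalloc(sizeof(*it), GFP_KERNEL);
	if (!it)
		return -ENOMEM;

	it->val = v;
	list_add_tail(&it->node, &my_list);
	return 0;
}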
- - - - - - - Hardware Interfaces - Interrupt Handling -!Ekernel/irq/manage.c - - - DMA Channels -!Ekernel/dma.c - - - Resources Management -!Ikernel/resource.c -!Ekernel/resource.c - - - MTRR Handling -!Earch/x86/kernel/cpu/mtrr/main.c - - - PCI Support Library -!Edrivers/pci/pci.c -!Edrivers/pci/pci-driver.c -!Edrivers/pci/remove.c -!Edrivers/pci/search.c -!Edrivers/pci/msi.c -!Edrivers/pci/bus.c -!Edrivers/pci/access.c -!Edrivers/pci/irq.c -!Edrivers/pci/htirq.c - -!Edrivers/pci/probe.c -!Edrivers/pci/slot.c -!Edrivers/pci/rom.c -!Edrivers/pci/iov.c -!Idrivers/pci/pci-sysfs.c - - PCI Hotplug Support Library -!Edrivers/pci/hotplug/pci_hotplug_core.c - - - - - Firmware Interfaces - DMI Interfaces -!Edrivers/firmware/dmi_scan.c - - EDD Interfaces -!Idrivers/firmware/edd.c - - - - - Security Framework -!Isecurity/security.c -!Esecurity/inode.c - - - - Audit Interfaces -!Ekernel/audit.c -!Ikernel/auditsc.c -!Ikernel/auditfilter.c - - - - Accounting Framework -!Ikernel/acct.c - - - - Block Devices -!Eblock/blk-core.c -!Iblock/blk-core.c -!Eblock/blk-map.c -!Iblock/blk-sysfs.c -!Eblock/blk-settings.c -!Eblock/blk-exec.c -!Eblock/blk-flush.c -!Eblock/blk-lib.c -!Eblock/blk-tag.c -!Iblock/blk-tag.c -!Eblock/blk-integrity.c -!Ikernel/trace/blktrace.c -!Iblock/genhd.c -!Eblock/genhd.c - - - - Char devices -!Efs/char_dev.c - - - - Miscellaneous Devices -!Edrivers/char/misc.c - - - - Clock Framework - - - The clock framework defines programming interfaces to support - software management of the system clock tree. - This framework is widely used with System-On-Chip (SOC) platforms - to support power management and various devices which may need - custom clock rates. - Note that these "clocks" don't relate to timekeeping or real - time clocks (RTCs), each of which have separate frameworks. - These struct clk instances may be used - to manage for example a 96 MHz signal that is used to shift bits - into and out of peripherals or busses, or otherwise trigger - synchronous state machine transitions in system hardware. - - - - Power management is supported by explicit software clock gating: - unused clocks are disabled, so the system doesn't waste power - changing the state of transistors that aren't in active use. - On some systems this may be backed by hardware clock gating, - where clocks are gated without being disabled in software. - Sections of chips that are powered but not clocked may be able - to retain their last state. - This low power state is often called a retention - mode. - This mode still incurs leakage currents, especially with finer - circuit geometries, but for CMOS circuits power is mostly used - by clocked state changes. - - - - Power-aware drivers only enable their clocks when the device - they manage is in active use. Also, system sleep states often - differ according to which clock domains are active: while a - "standby" state may allow wakeup from several active domains, a - "mem" (suspend-to-RAM) state may require a more wholesale shutdown - of clocks derived from higher speed PLLs and oscillators, limiting - the number of possible wakeup event sources. A driver's suspend - method may need to be aware of system-specific clock constraints - on the target sleep state. - - - - Some platforms support programmable clock generators. These - can be used by external chips of various kinds, such as other - CPUs, multimedia codecs, and devices with strict requirements - for interface clocking. 
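A hedged sketch of the consumer side of this interface, using the clk API declared in <linux/clk.h>; the device argument and the "bus" connection name are hypothetical.

#include <linux/clk.h>
#include <linux/device.h>
#include <linux/err.h>

static int my_enable_bus_clock(struct device *dev)
{
	struct clk *clk;
	int ret;

	/* Look up the clock by its hypothetical connection name. */
	clk = clk_get(dev, "bus");
	if (IS_ERR(clk))
		return PTR_ERR(clk);

	/* prepare may sleep, enable is atomic; together they ungate
	 * the clock, implementing the explicit software clock gating
	 * described above. */
	ret = clk_prepare_enable(clk);
	if (ret) {
		clk_put(clk);
		return ret;
	}

	/* ... use the device; when idle:
	 * clk_disable_unprepare(clk); clk_put(clk); */
	return 0;
}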
- - -!Iinclude/linux/clk.h - - - diff --git a/Documentation/DocBook/rapidio.tmpl b/Documentation/DocBook/rapidio.tmpl index 50479360d84590640c0dd6deba02935a295239fb..ac3cca3399a1e4d4ee205bdbfa3435e8b02f74a7 100644 --- a/Documentation/DocBook/rapidio.tmpl +++ b/Documentation/DocBook/rapidio.tmpl @@ -128,9 +128,6 @@ Device model support !Idrivers/rapidio/rio-driver.c - - Sysfs support -!Idrivers/rapidio/rio-sysfs.c PPC32 support !Iarch/powerpc/sysdev/fsl_rio.c diff --git a/Documentation/DocBook/writing_musb_glue_layer.tmpl b/Documentation/DocBook/writing_musb_glue_layer.tmpl deleted file mode 100644 index 837eca77f274236464225ba280d32a4898640f54..0000000000000000000000000000000000000000 --- a/Documentation/DocBook/writing_musb_glue_layer.tmpl +++ /dev/null @@ -1,873 +0,0 @@ - - - - - - Writing an MUSB Glue Layer - - - - Apelete - Seketeli - -
- apelete at seketeli.net -
-
-
-
- - - 2014 - Apelete Seketeli - - - - - This documentation is free software; you can redistribute it - and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later version. - - - - This documentation is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public License - along with this documentation; if not, write to the Free Software - Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA - 02111-1307 USA - - - - For more details see the file COPYING in the Linux kernel source - tree. - - -
-
-2014
-Apelete Seketeli
-
-
-This documentation is free software; you can redistribute it
-and/or modify it under the terms of the GNU General Public
-License as published by the Free Software Foundation; either
-version 2 of the License, or (at your option) any later version.
-
-
-This documentation is distributed in the hope that it will be
-useful, but WITHOUT ANY WARRANTY; without even the implied
-warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
-See the GNU General Public License for more details.
-
-
-You should have received a copy of the GNU General Public License
-along with this documentation; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
-02111-1307 USA
-
-
-For more details see the file COPYING in the Linux kernel source
-tree.
-
-
-Introduction
-
-The Linux MUSB subsystem is part of the larger Linux USB
-subsystem. It provides support for embedded USB Device Controllers
-(UDC) that do not use the Universal Host Controller Interface (UHCI)
-or the Open Host Controller Interface (OHCI).
-
-Instead, these embedded UDCs rely on the USB On-the-Go (OTG)
-specification, which they implement at least partially. The silicon
-reference design used in most cases is the Multipoint USB
-Highspeed Dual-Role Controller (MUSB HDRC) found in the Mentor
-Graphics Inventra™ design.
-
-As a self-taught exercise I have written an MUSB glue layer for
-the Ingenic JZ4740 SoC, modelled after the many MUSB glue layers
-in the kernel source tree. This layer can be found at
-drivers/usb/musb/jz4740.c. In this documentation I will walk
-through the basics of the jz4740.c glue layer, explaining the
-different pieces and what needs to be done in order to write your
-own device glue layer.
-
-
-Linux MUSB Basics
-
-To get started on the topic, please read USB On-the-Go Basics (see
-Resources), which provides an introduction to USB OTG operation at
-the hardware level. A couple of wiki pages by Texas Instruments
-and Analog Devices also provide an overview of the Linux kernel
-MUSB configuration, albeit focused on some specific devices
-provided by these companies. Finally, getting acquainted with the
-USB specification at the USB home page may come in handy, with
-practical instances provided through the Writing USB Device Drivers
-documentation (again, see Resources).
-
-The Linux USB stack is a layered architecture in which the MUSB
-controller hardware sits at the lowest level. The MUSB controller
-driver abstracts the MUSB controller hardware to the Linux USB stack.
-
-      ------------------------
-      |                      | <------- drivers/usb/gadget
-      | Linux USB Core Stack | <------- drivers/usb/host
-      |                      | <------- drivers/usb/core
-      ------------------------
-                 ⬍
-      --------------------------
-      |                        | <------ drivers/usb/musb/musb_gadget.c
-      | MUSB Controller driver | <------ drivers/usb/musb/musb_host.c
-      |                        | <------ drivers/usb/musb/musb_core.c
-      --------------------------
-                 ⬍
-      ---------------------------------
-      | MUSB Platform Specific Driver |
-      |                               | <-- drivers/usb/musb/jz4740.c
-      |       aka "Glue Layer"        |
-      ---------------------------------
-                 ⬍
-      ---------------------------------
-      |   MUSB Controller Hardware    |
-      ---------------------------------
-
-As outlined above, the glue layer is actually the platform
-specific code sitting in between the controller driver and the
-controller hardware.
-
-Just like a Linux USB driver needs to register itself with the
-Linux USB subsystem, the MUSB glue layer first needs to register
-itself with the MUSB controller driver. This will allow the
-controller driver to know which devices the glue layer
-supports and which functions to call when a supported device is
-detected or released; remember we are talking about an embedded
-controller chip here, so no insertion or removal at run-time.
-
-All of this information is passed to the MUSB controller driver
-through a platform_driver structure defined in the glue layer as:
-
-static struct platform_driver jz4740_driver = {
-	.probe		= jz4740_probe,
-	.remove		= jz4740_remove,
-	.driver		= {
-		.name	= "musb-jz4740",
-	},
-};
-
-The probe and remove function pointers are called when a matching
-device is detected and, respectively, released. The name string
-describes the device supported by this glue layer.
-In the current case it matches a platform_device structure
-declared in arch/mips/jz4740/platform.c. Note that we are not
-using device tree bindings here.
-
-In order to register itself to the controller driver, the glue
-layer goes through a few steps, basically allocating the
-controller hardware resources and initialising a couple of
-circuits. To do so, it needs to keep track of the information used
-throughout these steps. This is done by defining a private
-jz4740_glue structure:
-
-struct jz4740_glue {
-	struct device		*dev;
-	struct platform_device	*musb;
-	struct clk		*clk;
-};
-
-The dev and musb members are both device structure variables. The
-first one holds generic information about the device, since it's
-the basic device structure, and the latter holds information more
-closely related to the subsystem the device is registered to. The
-clk variable keeps information related to the device clock
-operation.
-
-Let's go through the steps of the probe function that leads the
-glue layer to register itself to the controller driver.
-
-N.B.: For the sake of readability each function will be split into
-logical parts, each part being shown as if it was independent from
-the others.
-
-static int jz4740_probe(struct platform_device *pdev)
-{
-	struct platform_device	*musb;
-	struct jz4740_glue	*glue;
-	struct clk		*clk;
-	int			ret;
-
-	glue = devm_kzalloc(&pdev->dev, sizeof(*glue), GFP_KERNEL);
-	if (!glue)
-		return -ENOMEM;
-
-	musb = platform_device_alloc("musb-hdrc", PLATFORM_DEVID_AUTO);
-	if (!musb) {
-		dev_err(&pdev->dev, "failed to allocate musb device\n");
-		return -ENOMEM;
-	}
-
-	clk = devm_clk_get(&pdev->dev, "udc");
-	if (IS_ERR(clk)) {
-		dev_err(&pdev->dev, "failed to get clock\n");
-		ret = PTR_ERR(clk);
-		goto err_platform_device_put;
-	}
-
-	ret = clk_prepare_enable(clk);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to enable clock\n");
-		goto err_platform_device_put;
-	}
-
-	musb->dev.parent	= &pdev->dev;
-
-	glue->dev		= &pdev->dev;
-	glue->musb		= musb;
-	glue->clk		= clk;
-
-	return 0;
-
-err_platform_device_put:
-	platform_device_put(musb);
-	return ret;
-}
-
-The first few lines of the probe function allocate and assign the
-glue, musb and clk variables. The GFP_KERNEL flag (line 8) allows
-the allocation process to sleep and wait for memory, thus being
-usable in a blocking situation. The PLATFORM_DEVID_AUTO flag (line
-12) allows automatic allocation and management of device IDs in
-order to avoid device namespace collisions with explicit IDs. With
-devm_clk_get() (line 18) the glue layer allocates the clock -- the
-devm_ prefix indicates that clk_get() is
-managed: it automatically frees the allocated clock resource data
-when the device is released -- and enables it.
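For contrast, here is a hedged sketch of what the same clock handling would look like with the unmanaged clk_get(); this fragment is illustrative only and not part of the driver, but it shows the explicit clk_put() calls that the devm_ variant makes unnecessary.

	/* Unmanaged equivalent of devm_clk_get(): every error path
	 * and the remove path must release the clock explicitly. */
	clk = clk_get(&pdev->dev, "udc");
	if (IS_ERR(clk))
		return PTR_ERR(clk);

	ret = clk_prepare_enable(clk);
	if (ret) {
		clk_put(clk);
		return ret;
	}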
- - - Then comes the registration steps: - - -static int jz4740_probe(struct platform_device *pdev) -{ - struct musb_hdrc_platform_data *pdata = &jz4740_musb_platform_data; - - pdata->platform_ops = &jz4740_musb_ops; - - platform_set_drvdata(pdev, glue); - - ret = platform_device_add_resources(musb, pdev->resource, - pdev->num_resources); - if (ret) { - dev_err(&pdev->dev, "failed to add resources\n"); - goto err_clk_disable; - } - - ret = platform_device_add_data(musb, pdata, sizeof(*pdata)); - if (ret) { - dev_err(&pdev->dev, "failed to add platform_data\n"); - goto err_clk_disable; - } - - return 0; - -err_clk_disable: - clk_disable_unprepare(clk); -err_platform_device_put: - platform_device_put(musb); - return ret; -} - - - The first step is to pass the device data privately held by the - glue layer on to the controller driver through - platform_set_drvdata() (line 7). Next is passing on the device - resources information, also privately held at that point, through - platform_device_add_resources() (line 9). - - - Finally comes passing on the platform specific data to the - controller driver (line 16). Platform data will be discussed in - Chapter 4, but here - we are looking at the platform_ops function pointer (line 5) in - musb_hdrc_platform_data structure (line 3). This function - pointer allows the MUSB controller driver to know which function - to call for device operation: - - -static const struct musb_platform_ops jz4740_musb_ops = { - .init = jz4740_musb_init, - .exit = jz4740_musb_exit, -}; - - - Here we have the minimal case where only init and exit functions - are called by the controller driver when needed. Fact is the - JZ4740 MUSB controller is a basic controller, lacking some - features found in other controllers, otherwise we may also have - pointers to a few other functions like a power management function - or a function to switch between OTG and non-OTG modes, for - instance. - - - At that point of the registration process, the controller driver - actually calls the init function: - - -static int jz4740_musb_init(struct musb *musb) -{ - musb->xceiv = usb_get_phy(USB_PHY_TYPE_USB2); - if (!musb->xceiv) { - pr_err("HS UDC: no transceiver configured\n"); - return -ENODEV; - } - - /* Silicon does not implement ConfigData register. - * Set dyn_fifo to avoid reading EP config from hardware. - */ - musb->dyn_fifo = true; - - musb->isr = jz4740_musb_interrupt; - - return 0; -} - - - The goal of jz4740_musb_init() is to get hold of the transceiver - driver data of the MUSB controller hardware and pass it on to the - MUSB controller driver, as usual. The transceiver is the circuitry - inside the controller hardware responsible for sending/receiving - the USB data. Since it is an implementation of the physical layer - of the OSI model, the transceiver is also referred to as PHY. - - - Getting hold of the MUSB PHY driver data is done with - usb_get_phy() which returns a pointer to the structure - containing the driver instance data. The next couple of - instructions (line 12 and 14) are used as a quirk and to setup - IRQ handling respectively. Quirks and IRQ handling will be - discussed later in Chapter - 5 and Chapter 3. - - -static int jz4740_musb_exit(struct musb *musb) -{ - usb_put_phy(musb->xceiv); - - return 0; -} - - - Acting as the counterpart of init, the exit function releases the - MUSB PHY driver when the controller hardware itself is about to be - released. 
-
-Again, note that init and exit are fairly simple in this case due
-to the basic set of features of the JZ4740 controller hardware.
-When writing an MUSB glue layer for a more complex controller
-hardware, you might need to take care of more processing in those
-two functions.
-
-Returning from the init function, the MUSB controller driver jumps
-back into the probe function:
-
-static int jz4740_probe(struct platform_device *pdev)
-{
-	ret = platform_device_add(musb);
-	if (ret) {
-		dev_err(&pdev->dev, "failed to register musb device\n");
-		goto err_clk_disable;
-	}
-
-	return 0;
-
-err_clk_disable:
-	clk_disable_unprepare(clk);
-err_platform_device_put:
-	platform_device_put(musb);
-	return ret;
-}
-
-This is the last part of the device registration process where the
-glue layer adds the controller hardware device to the Linux kernel
-device hierarchy: at this stage, all known information about the
-device is passed on to the Linux USB core stack.
-
-static int jz4740_remove(struct platform_device *pdev)
-{
-	struct jz4740_glue	*glue = platform_get_drvdata(pdev);
-
-	platform_device_unregister(glue->musb);
-	clk_disable_unprepare(glue->clk);
-
-	return 0;
-}
-
-Acting as the counterpart of probe, the remove function unregisters
-the MUSB controller hardware (line 5) and disables the clock (line
-6), allowing it to be gated.
-
-
-Handling IRQs
-
-In addition to the basic setup and registration of the MUSB
-controller hardware, the glue layer is also responsible for
-handling the IRQs:
-
-static irqreturn_t jz4740_musb_interrupt(int irq, void *__hci)
-{
-	unsigned long	flags;
-	irqreturn_t	retval = IRQ_NONE;
-	struct musb	*musb = __hci;
-
-	spin_lock_irqsave(&musb->lock, flags);
-
-	musb->int_usb = musb_readb(musb->mregs, MUSB_INTRUSB);
-	musb->int_tx = musb_readw(musb->mregs, MUSB_INTRTX);
-	musb->int_rx = musb_readw(musb->mregs, MUSB_INTRRX);
-
-	/*
-	 * The controller is gadget only, the state of the host mode IRQ bits is
-	 * undefined. Mask them to make sure that the musb driver core will
-	 * never see them set
-	 */
-	musb->int_usb &= MUSB_INTR_SUSPEND | MUSB_INTR_RESUME |
-		MUSB_INTR_RESET | MUSB_INTR_SOF;
-
-	if (musb->int_usb || musb->int_tx || musb->int_rx)
-		retval = musb_interrupt(musb);
-
-	spin_unlock_irqrestore(&musb->lock, flags);
-
-	return retval;
-}
-
-Here the glue layer mostly has to read the relevant hardware
-registers and pass their values on to the controller driver which
-will handle the actual event that triggered the IRQ.
-
-The interrupt handler critical section is protected by the
-spin_lock_irqsave() and counterpart spin_unlock_irqrestore()
-functions (lines 7 and 24 respectively), which prevent the
-interrupt handler code from being run by two different threads at
-the same time.
-
-Then the relevant interrupt registers are read (lines 9 to 11):
-
-MUSB_INTRUSB: indicates which USB interrupts are currently
-active,
-
-MUSB_INTRTX: indicates which of the interrupts for TX
-endpoints are currently active,
-
-MUSB_INTRRX: indicates which of the interrupts for RX
-endpoints are currently active.
-
-Note that musb_readb() is used to read 8-bit registers at most,
-while musb_readw() allows us to read at most 16-bit registers.
-There are other functions that can be used depending on the size
-of your device registers. See musb_io.h for more information.
-
-The instruction on line 18 is another quirk specific to the JZ4740
-USB device controller, which will be discussed later in Chapter 5.
- - - The glue layer still needs to register the IRQ handler though. - Remember the instruction on line 14 of the init function: - - -static int jz4740_musb_init(struct musb *musb) -{ - musb->isr = jz4740_musb_interrupt; - - return 0; -} - - - This instruction sets a pointer to the glue layer IRQ handler - function, in order for the controller hardware to call the handler - back when an IRQ comes from the controller hardware. The interrupt - handler is now implemented and registered. - - - - - Device Platform Data - - In order to write an MUSB glue layer, you need to have some data - describing the hardware capabilities of your controller hardware, - which is called the platform data. - - - Platform data is specific to your hardware, though it may cover a - broad range of devices, and is generally found somewhere in the - arch/ directory, depending on your device architecture. - - - For instance, platform data for the JZ4740 SoC is found in - arch/mips/jz4740/platform.c. In the platform.c file each device of - the JZ4740 SoC is described through a set of structures. - - - Here is the part of arch/mips/jz4740/platform.c that covers the - USB Device Controller (UDC): - - -/* USB Device Controller */ -struct platform_device jz4740_udc_xceiv_device = { - .name = "usb_phy_gen_xceiv", - .id = 0, -}; - -static struct resource jz4740_udc_resources[] = { - [0] = { - .start = JZ4740_UDC_BASE_ADDR, - .end = JZ4740_UDC_BASE_ADDR + 0x10000 - 1, - .flags = IORESOURCE_MEM, - }, - [1] = { - .start = JZ4740_IRQ_UDC, - .end = JZ4740_IRQ_UDC, - .flags = IORESOURCE_IRQ, - .name = "mc", - }, -}; - -struct platform_device jz4740_udc_device = { - .name = "musb-jz4740", - .id = -1, - .dev = { - .dma_mask = &jz4740_udc_device.dev.coherent_dma_mask, - .coherent_dma_mask = DMA_BIT_MASK(32), - }, - .num_resources = ARRAY_SIZE(jz4740_udc_resources), - .resource = jz4740_udc_resources, -}; - - - The jz4740_udc_xceiv_device platform device structure (line 2) - describes the UDC transceiver with a name and id number. - - - At the time of this writing, note that - "usb_phy_gen_xceiv" is the specific name to be used for - all transceivers that are either built-in with reference USB IP or - autonomous and doesn't require any PHY programming. You will need - to set CONFIG_NOP_USB_XCEIV=y in the kernel configuration to make - use of the corresponding transceiver driver. The id field could be - set to -1 (equivalent to PLATFORM_DEVID_NONE), -2 (equivalent to - PLATFORM_DEVID_AUTO) or start with 0 for the first device of this - kind if we want a specific id number. - - - The jz4740_udc_resources resource structure (line 7) defines the - UDC registers base addresses. - - - The first array (line 9 to 11) defines the UDC registers base - memory addresses: start points to the first register memory - address, end points to the last register memory address and the - flags member defines the type of resource we are dealing with. So - IORESOURCE_MEM is used to define the registers memory addresses. - The second array (line 14 to 17) defines the UDC IRQ registers - addresses. Since there is only one IRQ register available for the - JZ4740 UDC, start and end point at the same address. The - IORESOURCE_IRQ flag tells that we are dealing with IRQ resources, - and the name "mc" is in fact hard-coded in the MUSB core - in order for the controller driver to retrieve this IRQ resource - by querying it by its name. - - - Finally, the jz4740_udc_device platform device structure (line 21) - describes the UDC itself. 
-
-The "musb-jz4740" name (line 22) defines the MUSB
-driver that is used for this device; remember this is in fact
-the name that we used in the jz4740_driver platform driver
-structure in Chapter 2. The id field (line 23) is set to -1
-(equivalent to PLATFORM_DEVID_NONE) since we do not need an id for
-the device: the MUSB controller driver was already set to allocate
-an automatic id in Chapter 2. The dev field carries the DMA-related
-information here. The dma_mask field (line 25) defines the width of
-the DMA mask that is going to be used, and coherent_dma_mask (line
-26) has the same purpose but for the alloc_coherent DMA mappings:
-in both cases we are using a 32-bit mask. Then the resource field
-(line 29) is simply a pointer to the resource structure defined
-before, while the num_resources field (line 28) keeps track of
-the number of arrays defined in the resource structure (in this
-case there were two resource arrays defined before).
-
-With this quick overview of the UDC platform data at the arch/
-level now done, let's get back to the MUSB glue layer specific
-platform data in drivers/usb/musb/jz4740.c:
-
-static struct musb_hdrc_config jz4740_musb_config = {
-	/* Silicon does not implement USB OTG. */
-	.multipoint	= 0,
-	/* Max EPs scanned, driver will decide which EP can be used. */
-	.num_eps	= 4,
-	/* RAMbits needed to configure EPs from table */
-	.ram_bits	= 9,
-	.fifo_cfg	= jz4740_musb_fifo_cfg,
-	.fifo_cfg_size	= ARRAY_SIZE(jz4740_musb_fifo_cfg),
-};
-
-static struct musb_hdrc_platform_data jz4740_musb_platform_data = {
-	.mode		= MUSB_PERIPHERAL,
-	.config		= &jz4740_musb_config,
-};
-
-First the glue layer configures some aspects of the controller
-driver operation related to the controller hardware specifics.
-This is done through the jz4740_musb_config musb_hdrc_config
-structure.
-
-Defining the OTG capability of the controller hardware, the
-multipoint member (line 3) is set to 0 (equivalent to false)
-since the JZ4740 UDC is not OTG compatible. Then num_eps (line
-5) defines the number of USB endpoints of the controller
-hardware, including endpoint 0: here we have 3 endpoints +
-endpoint 0. Next is ram_bits (line 7) which is the width of the
-RAM address bus for the MUSB controller hardware. This
-information is needed when the controller driver cannot
-automatically configure endpoints by reading the relevant
-controller hardware registers. This issue will be discussed when
-we get to device quirks in Chapter 5. The last two fields (lines 8
-and 9) are also about device quirks: fifo_cfg points to the USB
-endpoints configuration table and fifo_cfg_size keeps track of
-the number of entries in that configuration table. More on that
-later in Chapter 5.
-
-Then this configuration is embedded inside the
-jz4740_musb_platform_data musb_hdrc_platform_data structure (line
-11): config is a pointer to the configuration structure itself,
-and mode tells the controller driver if the controller hardware
-may be used as MUSB_HOST only, MUSB_PERIPHERAL only or MUSB_OTG
-which is a dual mode.
-
-Remember that jz4740_musb_platform_data is then used to convey
-platform data information as we have seen in the probe function
-in Chapter 2.
-
-
-Device Quirks
-
-Beyond providing the platform data specific to your device, you may
-also need to write some code in the glue layer to work around some
-device-specific limitations.
-These quirks may be due to some hardware bugs, or simply be the
-result of an incomplete implementation of the USB On-the-Go
-specification.
-
-The JZ4740 UDC exhibits such quirks, some of which we will discuss
-here for the sake of insight even though these might not be found
-in the controller hardware you are working on.
-
-Let's get back to the init function first:
-
-static int jz4740_musb_init(struct musb *musb)
-{
-	musb->xceiv = usb_get_phy(USB_PHY_TYPE_USB2);
-	if (!musb->xceiv) {
-		pr_err("HS UDC: no transceiver configured\n");
-		return -ENODEV;
-	}
-
-	/* Silicon does not implement ConfigData register.
-	 * Set dyn_fifo to avoid reading EP config from hardware.
-	 */
-	musb->dyn_fifo = true;
-
-	musb->isr = jz4740_musb_interrupt;
-
-	return 0;
-}
-
-The instruction on line 12 helps the MUSB controller driver to work
-around the fact that the controller hardware is missing registers
-that are used for USB endpoints configuration.
-
-Without these registers, the controller driver is unable to read
-the endpoints configuration from the hardware, so we use the
-instruction on line 12 to bypass reading the configuration from
-silicon, and rely on a hard-coded table that describes the
-endpoints configuration instead:
-
-static struct musb_fifo_cfg jz4740_musb_fifo_cfg[] = {
-{ .hw_ep_num = 1, .style = FIFO_TX, .maxpacket = 512, },
-{ .hw_ep_num = 1, .style = FIFO_RX, .maxpacket = 512, },
-{ .hw_ep_num = 2, .style = FIFO_TX, .maxpacket = 64, },
-};
-
-Looking at the configuration table above, we see that each
-endpoint is described by three fields: hw_ep_num is the endpoint
-number, style is its direction (either FIFO_TX for the controller
-driver to send packets in the controller hardware, or FIFO_RX to
-receive packets from hardware), and maxpacket defines the maximum
-size of each data packet that can be transmitted over that
-endpoint. Reading from the table, the controller driver knows that
-endpoint 1 can be used to send and receive USB data packets of 512
-bytes at once (this is in fact a bulk in/out endpoint), and
-endpoint 2 can be used to send data packets of 64 bytes at once
-(this is in fact an interrupt endpoint).
-
-Note that there is no information about endpoint 0 here: that one
-is implemented by default in every silicon design, with a
-predefined configuration according to the USB specification. For
-more examples of endpoint configuration tables, see musb_core.c.
-
-Let's now get back to the interrupt handler function:
-
-static irqreturn_t jz4740_musb_interrupt(int irq, void *__hci)
-{
-	unsigned long	flags;
-	irqreturn_t	retval = IRQ_NONE;
-	struct musb	*musb = __hci;
-
-	spin_lock_irqsave(&musb->lock, flags);
-
-	musb->int_usb = musb_readb(musb->mregs, MUSB_INTRUSB);
-	musb->int_tx = musb_readw(musb->mregs, MUSB_INTRTX);
-	musb->int_rx = musb_readw(musb->mregs, MUSB_INTRRX);
-
-	/*
-	 * The controller is gadget only, the state of the host mode IRQ bits is
-	 * undefined. Mask them to make sure that the musb driver core will
-	 * never see them set
-	 */
-	musb->int_usb &= MUSB_INTR_SUSPEND | MUSB_INTR_RESUME |
-		MUSB_INTR_RESET | MUSB_INTR_SOF;
-
-	if (musb->int_usb || musb->int_tx || musb->int_rx)
-		retval = musb_interrupt(musb);
-
-	spin_unlock_irqrestore(&musb->lock, flags);
-
-	return retval;
-}
-
-The instruction on line 18 above is a way for the controller driver
-to work around the fact that some interrupt bits used for USB host
-mode operation are missing in the MUSB_INTRUSB register, thus left
-in an undefined hardware state, since this MUSB controller
-hardware is used in peripheral mode only. As a consequence, the
-glue layer masks these missing bits out to avoid spurious
-interrupts by doing a logical AND operation between the value read
-from MUSB_INTRUSB and the bits that are actually implemented in
-the register.
-
-These are only a couple of the quirks found in the JZ4740 USB
-device controller. Some others were directly addressed in the MUSB
-core since the fixes were generic enough to provide a better
-handling of the issues for other controller hardware eventually.
-
-
-Conclusion
-
-Writing a Linux MUSB glue layer should now be a more accessible
-task, as this documentation tries to show the ins and outs of this
-exercise.
-
-The JZ4740 USB device controller being fairly simple, I hope its
-glue layer serves as a good example for the curious mind. Used
-with the current MUSB glue layers, this documentation should
-provide enough guidance to get started; should anything get out
-of hand, the linux-usb mailing list archive is another helpful
-resource to browse through.
-
-
-Acknowledgements
-
-Many thanks to Lars-Peter Clausen and Maarten ter Huurne for
-answering my questions while I was writing the JZ4740 glue layer
-and for helping me out getting the code in good shape.
-
-I would also like to thank the Qi-Hardware community at large for
-its cheerful guidance and support.
-
-
-Resources
-
-USB Home Page: http://www.usb.org
-
-linux-usb Mailing List Archives: http://marc.info/?l=linux-usb
-
-USB On-the-Go Basics:
-http://www.maximintegrated.com/app-notes/index.mvp/id/1822
-
-Writing USB Device Drivers:
-https://www.kernel.org/doc/htmldocs/writing_usb_driver/index.html
-
-Texas Instruments USB Configuration Wiki Page:
-http://processors.wiki.ti.com/index.php/Usbgeneralpage
-
-Analog Devices Blackfin MUSB Configuration:
-http://docs.blackfin.uclinux.org/doku.php?id=linux-kernel:drivers:musb
diff --git a/Documentation/DocBook/writing_usb_driver.tmpl b/Documentation/DocBook/writing_usb_driver.tmpl deleted file mode 100644 index 3210dcf741c9743c53924e56d3f2768353674157..0000000000000000000000000000000000000000 --- a/Documentation/DocBook/writing_usb_driver.tmpl +++ /dev/null @@ -1,412 +0,0 @@ - - - - - - Writing USB Device Drivers - - - - Greg - Kroah-Hartman - -
- greg@kroah.com -
-
-
-
- - - 2001-2002 - Greg Kroah-Hartman - - - - - This documentation is free software; you can redistribute - it and/or modify it under the terms of the GNU General Public - License as published by the Free Software Foundation; either - version 2 of the License, or (at your option) any later - version. - - - - This program is distributed in the hope that it will be - useful, but WITHOUT ANY WARRANTY; without even the implied - warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - See the GNU General Public License for more details. - - - - You should have received a copy of the GNU General Public - License along with this program; if not, write to the Free - Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, - MA 02111-1307 USA - - - - For more details see the file COPYING in the source - distribution of Linux. - - - - This documentation is based on an article published in - Linux Journal Magazine, October 2001, Issue 90. - - -
- - - - - Introduction - - The Linux USB subsystem has grown from supporting only two different - types of devices in the 2.2.7 kernel (mice and keyboards), to over 20 - different types of devices in the 2.4 kernel. Linux currently supports - almost all USB class devices (standard types of devices like keyboards, - mice, modems, printers and speakers) and an ever-growing number of - vendor-specific devices (such as USB to serial converters, digital - cameras, Ethernet devices and MP3 players). For a full list of the - different USB devices currently supported, see Resources. - - - The remaining kinds of USB devices that do not have support on Linux are - almost all vendor-specific devices. Each vendor decides to implement a - custom protocol to talk to their device, so a custom driver usually needs - to be created. Some vendors are open with their USB protocols and help - with the creation of Linux drivers, while others do not publish them, and - developers are forced to reverse-engineer. See Resources for some links - to handy reverse-engineering tools. - - - Because each different protocol causes a new driver to be created, I have - written a generic USB driver skeleton, modelled after the pci-skeleton.c - file in the kernel source tree upon which many PCI network drivers have - been based. This USB skeleton can be found at drivers/usb/usb-skeleton.c - in the kernel source tree. In this article I will walk through the basics - of the skeleton driver, explaining the different pieces and what needs to - be done to customize it to your specific device. - - - - - Linux USB Basics - - If you are going to write a Linux USB driver, please become familiar with - the USB protocol specification. It can be found, along with many other - useful documents, at the USB home page (see Resources). An excellent - introduction to the Linux USB subsystem can be found at the USB Working - Devices List (see Resources). It explains how the Linux USB subsystem is - structured and introduces the reader to the concept of USB urbs - (USB Request Blocks), which are essential to USB drivers. - - - The first thing a Linux USB driver needs to do is register itself with - the Linux USB subsystem, giving it some information about which devices - the driver supports and which functions to call when a device supported - by the driver is inserted or removed from the system. All of this - information is passed to the USB subsystem in the usb_driver structure. - The skeleton driver declares a usb_driver as: - - -static struct usb_driver skel_driver = { - .name = "skeleton", - .probe = skel_probe, - .disconnect = skel_disconnect, - .fops = &skel_fops, - .minor = USB_SKEL_MINOR_BASE, - .id_table = skel_table, -}; - - - The variable name is a string that describes the driver. It is used in - informational messages printed to the system log. The probe and - disconnect function pointers are called when a device that matches the - information provided in the id_table variable is either seen or removed. - - - The fops and minor variables are optional. Most USB drivers hook into - another kernel subsystem, such as the SCSI, network or TTY subsystem. - These types of drivers register themselves with the other kernel - subsystem, and any user-space interactions are provided through that - interface. But for drivers that do not have a matching kernel subsystem, - such as MP3 players or scanners, a method of interacting with user space - is needed. 
The USB subsystem provides a way to register a minor device - number and a set of file_operations function pointers that enable this - user-space interaction. The skeleton driver needs this kind of interface, - so it provides a minor starting number and a pointer to its - file_operations functions. - - - The USB driver is then registered with a call to usb_register, usually in - the driver's init function, as shown here: - - -static int __init usb_skel_init(void) -{ - int result; - - /* register this driver with the USB subsystem */ - result = usb_register(&skel_driver); - if (result < 0) { - err("usb_register failed for the "__FILE__ "driver." - "Error number %d", result); - return -1; - } - - return 0; -} -module_init(usb_skel_init); - - - When the driver is unloaded from the system, it needs to deregister - itself with the USB subsystem. This is done with the usb_deregister - function: - - -static void __exit usb_skel_exit(void) -{ - /* deregister this driver with the USB subsystem */ - usb_deregister(&skel_driver); -} -module_exit(usb_skel_exit); - - - To enable the linux-hotplug system to load the driver automatically when - the device is plugged in, you need to create a MODULE_DEVICE_TABLE. The - following code tells the hotplug scripts that this module supports a - single device with a specific vendor and product ID: - - -/* table of devices that work with this driver */ -static struct usb_device_id skel_table [] = { - { USB_DEVICE(USB_SKEL_VENDOR_ID, USB_SKEL_PRODUCT_ID) }, - { } /* Terminating entry */ -}; -MODULE_DEVICE_TABLE (usb, skel_table); - - - There are other macros that can be used in describing a usb_device_id for - drivers that support a whole class of USB drivers. See usb.h for more - information on this. - - - - - Device operation - - When a device is plugged into the USB bus that matches the device ID - pattern that your driver registered with the USB core, the probe function - is called. The usb_device structure, interface number and the interface ID - are passed to the function: - - -static int skel_probe(struct usb_interface *interface, - const struct usb_device_id *id) - - - The driver now needs to verify that this device is actually one that it - can accept. If so, it returns 0. - If not, or if any error occurs during initialization, an errorcode - (such as -ENOMEM or -ENODEV) - is returned from the probe function. - - - In the skeleton driver, we determine what end points are marked as bulk-in - and bulk-out. We create buffers to hold the data that will be sent and - received from the device, and a USB urb to write data to the device is - initialized. - - - Conversely, when the device is removed from the USB bus, the disconnect - function is called with the device pointer. The driver needs to clean any - private data that has been allocated at this time and to shut down any - pending urbs that are in the USB system. - - - Now that the device is plugged into the system and the driver is bound to - the device, any of the functions in the file_operations structure that - were passed to the USB subsystem will be called from a user program trying - to talk to the device. The first function called will be open, as the - program tries to open the device for I/O. We increment our private usage - count and save a pointer to our internal structure in the file - structure. This is done so that future calls to file operations will - enable the driver to determine which device the user is addressing. 
All - of this is done with the following code: - - -/* increment our usage count for the module */ -++skel->open_count; - -/* save our object in the file's private structure */ -file->private_data = dev; - - - After the open function is called, the read and write functions are called - to receive and send data to the device. In the skel_write function, we - receive a pointer to some data that the user wants to send to the device - and the size of the data. The function determines how much data it can - send to the device based on the size of the write urb it has created (this - size depends on the size of the bulk out end point that the device has). - Then it copies the data from user space to kernel space, points the urb to - the data and submits the urb to the USB subsystem. This can be seen in - the following code: - - -/* we can only write as much as 1 urb will hold */ -bytes_written = (count > skel->bulk_out_size) ? skel->bulk_out_size : count; - -/* copy the data from user space into our urb */ -copy_from_user(skel->write_urb->transfer_buffer, buffer, bytes_written); - -/* set up our urb */ -usb_fill_bulk_urb(skel->write_urb, - skel->dev, - usb_sndbulkpipe(skel->dev, skel->bulk_out_endpointAddr), - skel->write_urb->transfer_buffer, - bytes_written, - skel_write_bulk_callback, - skel); - -/* send the data out the bulk port */ -result = usb_submit_urb(skel->write_urb); -if (result) { - err("Failed submitting write urb, error %d", result); -} - - - When the write urb is filled up with the proper information using the - usb_fill_bulk_urb function, we point the urb's completion callback to call our - own skel_write_bulk_callback function. This function is called when the - urb is finished by the USB subsystem. The callback function is called in - interrupt context, so caution must be taken not to do very much processing - at that time. Our implementation of skel_write_bulk_callback merely - reports if the urb was completed successfully or not and then returns. - - - The read function works a bit differently from the write function in that - we do not use an urb to transfer data from the device to the driver. - Instead we call the usb_bulk_msg function, which can be used to send or - receive data from a device without having to create urbs and handle - urb completion callback functions. We call the usb_bulk_msg function, - giving it a buffer into which to place any data received from the device - and a timeout value. If the timeout period expires without receiving any - data from the device, the function will fail and return an error message. - This can be shown with the following code: - - -/* do an immediate bulk read to get data from the device */ -retval = usb_bulk_msg (skel->dev, - usb_rcvbulkpipe (skel->dev, - skel->bulk_in_endpointAddr), - skel->bulk_in_buffer, - skel->bulk_in_size, - &count, HZ*10); -/* if the read was successful, copy the data to user space */ -if (!retval) { - if (copy_to_user (buffer, skel->bulk_in_buffer, count)) - retval = -EFAULT; - else - retval = count; -} - - - The usb_bulk_msg function can be very useful for doing single reads or - writes to a device; however, if you need to read or write constantly to a - device, it is recommended to set up your own urbs and submit them to the - USB subsystem. - - - When the user program releases the file handle that it has been using to - talk to the device, the release function in the driver is called. 
In this - function we decrement our private usage count and wait for possible - pending writes: - - -/* decrement our usage count for the device */ ---skel->open_count; - - - One of the more difficult problems that USB drivers must be able to handle - smoothly is the fact that the USB device may be removed from the system at - any point in time, even if a program is currently talking to it. The driver - needs to be able to shut down any current reads and writes and to notify the - user-space programs that the device is no longer there. The following - code (function skel_delete) - is an example of how to do this: - -static inline void skel_delete (struct usb_skel *dev) -{ - kfree (dev->bulk_in_buffer); - if (dev->bulk_out_buffer != NULL) - usb_free_coherent (dev->udev, dev->bulk_out_size, - dev->bulk_out_buffer, - dev->write_urb->transfer_dma); - usb_free_urb (dev->write_urb); - kfree (dev); -} - - - If a program currently has an open handle to the device, we reset the flag - device_present. For - every read, write, release and other functions that expect a device to be - present, the driver first checks this flag to see if the device is - still present. If not, it realizes that the device has disappeared, and a - -ENODEV error is returned to the user-space program (a minimal sketch of - this check appears just after the Resources section below). When the release - function is eventually called, it determines if there is no device - and, if so, it does the cleanup that the skel_disconnect - function normally does if there are no open files on the device (see - Listing 5). - - - - - Isochronous Data - - This usb-skeleton driver does not have any examples of interrupt or - isochronous data being sent to or from the device. Interrupt data is sent - almost exactly as bulk data is, with a few minor exceptions. Isochronous - data works differently, with continuous streams of data being sent to or - from the device. The audio and video camera drivers are very good examples - of drivers that handle isochronous data and will be useful if you also - need to do this. - - - - - Conclusion - - Writing Linux USB device drivers is not a difficult task, as the - usb-skeleton driver shows. This driver, combined with the other current - USB drivers, should provide enough examples to help a beginning author - create a working driver in a minimal amount of time. The linux-usb-devel - mailing list archives also contain a lot of helpful information. - - - - - Resources - - The Linux USB Project: http://www.linux-usb.org/ - - - Linux Hotplug Project: http://linux-hotplug.sourceforge.net/ - - - Linux USB Working Devices List: http://www.qbik.ch/usb/devices/ - - - linux-usb-devel Mailing List Archives: http://marc.theaimsgroup.com/?l=linux-usb-devel - - - Programming Guide for Linux USB Device Drivers: http://usb.cs.tum.edu/usbdoc - - - USB Home Page: http://www.usb.org - - - -
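To make the hot-unplug handling described under Device operation concrete, here is a minimal sketch of the device-present check. It is an illustration, not code from usb-skeleton.c: the io_mutex lock and the exact field names are assumptions, and error handling is abbreviated.

static void skel_disconnect(struct usb_interface *interface)
{
	struct usb_skel *dev = usb_get_intfdata(interface);

	/* mark the device gone so that in-flight I/O can bail out */
	mutex_lock(&dev->io_mutex);
	dev->device_present = 0;
	mutex_unlock(&dev->io_mutex);

	/* ... kill pending urbs and release resources as described above ... */
}

static ssize_t skel_read(struct file *file, char __user *buffer,
			 size_t count, loff_t *ppos)
{
	struct usb_skel *dev = file->private_data;
	ssize_t retval = 0;

	mutex_lock(&dev->io_mutex);
	if (!dev->device_present) {
		/* disconnect() already ran; tell user space the device left */
		mutex_unlock(&dev->io_mutex);
		return -ENODEV;
	}

	/* ... perform the usb_bulk_msg() read shown earlier ... */

	mutex_unlock(&dev->io_mutex);
	return retval;
}

The same check guards write and release; whichever path observes device_present == 0 simply fails with -ENODEV instead of touching the vanished hardware.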
diff --git a/Documentation/PCI/00-INDEX b/Documentation/PCI/00-INDEX index 147231f1613ef3a0a8df8d9addc2a754d5ac69af..00c9a90b6f3854050f1778961be557b10c25fa0d 100644 --- a/Documentation/PCI/00-INDEX +++ b/Documentation/PCI/00-INDEX @@ -12,3 +12,13 @@ pci.txt - info on the PCI subsystem for device driver authors pcieaer-howto.txt - the PCI Express Advanced Error Reporting Driver Guide HOWTO +endpoint/pci-endpoint.txt + - guide to adding an endpoint controller driver and an endpoint function driver. +endpoint/pci-endpoint-cfs.txt + - guide to using configfs to configure the PCI endpoint function. +endpoint/pci-test-function.txt + - specification of the *PCI test* function device. +endpoint/pci-test-howto.txt + - user guide for the PCI endpoint test function. +endpoint/function/binding/ + - binding documentation for PCI endpoint functions diff --git a/Documentation/PCI/endpoint/function/binding/pci-test.txt b/Documentation/PCI/endpoint/function/binding/pci-test.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b68b955fb503374b0426e75de246971111dd463 --- /dev/null +++ b/Documentation/PCI/endpoint/function/binding/pci-test.txt @@ -0,0 +1,17 @@ +PCI TEST ENDPOINT FUNCTION + +name: Should be "pci_epf_test" to bind to the pci_epf_test driver. + +Configurable Fields: +vendorid : should be 0x104c +deviceid : should be 0xb500 for DRA74x and 0xb501 for DRA72x +revid : don't care +progif_code : don't care +subclass_code : don't care +baseclass_code : should be 0xff +cache_line_size : don't care +subsys_vendor_id : don't care +subsys_id : don't care +interrupt_pin : Should be 1 - INTA, 2 - INTB, 3 - INTC, 4 - INTD +msi_interrupts : Should be 1 to 32 depending on the number of MSI interrupts + to test diff --git a/Documentation/PCI/endpoint/pci-endpoint-cfs.txt b/Documentation/PCI/endpoint/pci-endpoint-cfs.txt new file mode 100644 index 0000000000000000000000000000000000000000..d740f29960a47edc21e1811d3f496496c8865ecd --- /dev/null +++ b/Documentation/PCI/endpoint/pci-endpoint-cfs.txt @@ -0,0 +1,105 @@ + CONFIGURING PCI ENDPOINT USING CONFIGFS + Kishon Vijay Abraham I + +The PCI Endpoint Core exposes a configfs entry (pci_ep) to configure the +PCI endpoint function and to bind the endpoint function +with the endpoint controller. (For other mechanisms to +configure the PCI Endpoint Function, refer to [1].) + +*) Mounting configfs + +The PCI Endpoint Core layer creates a pci_ep directory in the mounted configfs +directory. configfs can be mounted using the following command. + + mount -t configfs none /sys/kernel/config + +*) Directory Structure + +The pci_ep configfs has two directories at its root: controllers and +functions. Every EPC device present in the system will have an entry in +the *controllers* directory and every EPF driver present in the system +will have an entry in the *functions* directory. + +/sys/kernel/config/pci_ep/ + .. controllers/ + .. functions/ + +*) Creating EPF Device + +Every registered EPF driver will be listed in the functions directory. The +entries corresponding to the EPF driver will be created by the EPF core. + +/sys/kernel/config/pci_ep/functions/ + .. <EPF driver 1>/ + ... <EPF device 11>/ + ... <EPF device 21>/ + .. <EPF driver 2>/ + ... <EPF device 12>/ + ... <EPF device 22>/ + +In order to create a <EPF device> of the type probed by <EPF driver>, the +user has to create a directory inside <EPF driver>. + +Every <EPF device> directory consists of the following entries that can be +used to configure the standard configuration header of the endpoint function. +(These entries are created by the framework when any new <EPF device> is +created) + + .. <EPF driver>/ + ... <EPF device>/ + ... vendorid + ... deviceid + ... revid + ... 
progif_code + ... subclass_code + ... baseclass_code + ... cache_line_size + ... subsys_vendor_id + ... subsys_id + ... interrupt_pin + +*) EPC Device + +Every registered EPC device will be listed in the controllers directory. The +entries corresponding to the EPC device will be created by the EPC core. + +/sys/kernel/config/pci_ep/controllers/ + .. <EPC device 1>/ + ... <EPF device 11>/ + ... <EPF device 21>/ + ... start + .. <EPC device 2>/ + ... <EPF device 12>/ + ... <EPF device 22>/ + ... start + +The <EPC device> directory will have a list of symbolic links to +<EPF device>. These symbolic links should be created by the user to +represent the functions present in the endpoint device. + +The <EPC device> directory will also have a *start* field. Once +"1" is written to this field, the endpoint device will be ready to +establish the link with the host. This is usually done after +all the EPF devices are created and linked with the EPC device. + + + | controllers/ + | <EPC device>/ + | <Symlink to EPF device> + | start + | functions/ + | <EPF driver>/ + | <EPF device>/ + | vendorid + | deviceid + | revid + | progif_code + | subclass_code + | baseclass_code + | cache_line_size + | subsys_vendor_id + | subsys_id + | interrupt_pin + | function + +[1] -> Documentation/PCI/endpoint/pci-endpoint.txt diff --git a/Documentation/PCI/endpoint/pci-endpoint.txt b/Documentation/PCI/endpoint/pci-endpoint.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b1d668292904373a8a15b563c5cb41903699416 --- /dev/null +++ b/Documentation/PCI/endpoint/pci-endpoint.txt @@ -0,0 +1,215 @@ + PCI ENDPOINT FRAMEWORK + Kishon Vijay Abraham I + +This document is a guide to using the PCI Endpoint Framework to create +an endpoint controller driver and an endpoint function driver, and to +using the configfs interface to bind the function driver to the +controller driver. + +1. Introduction + +Linux has a comprehensive PCI subsystem to support PCI controllers that +operate in Root Complex mode. The subsystem has the capability to scan the +PCI bus, assign memory and IRQ resources, load PCI drivers (based on +vendor ID and device ID), and support other services like hot-plug, power +management, advanced error reporting and virtual channels. + +However, the PCI controller IP integrated in some SoCs is capable of operating +either in Root Complex mode or in Endpoint mode. The PCI Endpoint Framework +adds endpoint-mode support to Linux. This makes it possible to run Linux in an +EP system, which has a wide variety of use cases, from testing and +validation to co-processor acceleration. + +2. PCI Endpoint Core + +The PCI Endpoint Core layer comprises three components: the Endpoint Controller +library, the Endpoint Function library, and the configfs layer to bind the +endpoint function with the endpoint controller. + +2.1 PCI Endpoint Controller (EPC) Library + +The EPC library provides APIs to be used by the controller that can operate +in endpoint mode. It also provides APIs to be used by the function +driver/library in order to implement a particular endpoint function. + +2.1.1 APIs for the PCI Controller Driver + +This section lists the APIs that the PCI Endpoint core provides to be used +by the PCI controller driver.
+ +*) devm_pci_epc_create()/pci_epc_create() + + The PCI controller driver should implement the following ops: + * write_header: ops to populate configuration space header + * set_bar: ops to configure the BAR + * clear_bar: ops to reset the BAR + * alloc_addr_space: ops to allocate in PCI controller address space + * free_addr_space: ops to free the allocated address space + * raise_irq: ops to raise a legacy or MSI interrupt + * start: ops to start the PCI link + * stop: ops to stop the PCI link + + The PCI controller driver can then create a new EPC device by invoking + devm_pci_epc_create()/pci_epc_create(). + +*) devm_pci_epc_destroy()/pci_epc_destroy() + + The PCI controller driver can destroy the EPC device created by either + devm_pci_epc_create() or pci_epc_create() using devm_pci_epc_destroy() or + pci_epc_destroy(). + +*) pci_epc_linkup() + + In order to notify all the function devices that the EPC device to which + they are linked has established a link with the host, the PCI controller + driver should invoke pci_epc_linkup(). + +*) pci_epc_mem_init() + + Initialize the pci_epc_mem structure used for allocating EPC addr space. + +*) pci_epc_mem_exit() + + Cleanup the pci_epc_mem structure allocated during pci_epc_mem_init(). + +2.1.2 APIs for the PCI Endpoint Function Driver + +This section lists the APIs that the PCI Endpoint core provides to be used +by the PCI endpoint function driver. + +*) pci_epc_write_header() + + The PCI endpoint function driver should use pci_epc_write_header() to + write the standard configuration header to the endpoint controller. + +*) pci_epc_set_bar() + + The PCI endpoint function driver should use pci_epc_set_bar() to configure + the Base Address Register in order for the host to assign PCI addr space. + Register space of the function driver is usually configured + using this API. + +*) pci_epc_clear_bar() + + The PCI endpoint function driver should use pci_epc_clear_bar() to reset + the BAR. + +*) pci_epc_raise_irq() + + The PCI endpoint function driver should use pci_epc_raise_irq() to raise + Legacy Interrupt or MSI Interrupt. + +*) pci_epc_mem_alloc_addr() + + The PCI endpoint function driver should use pci_epc_mem_alloc_addr(), to + allocate memory address from EPC addr space which is required to access + RC's buffer + +*) pci_epc_mem_free_addr() + + The PCI endpoint function driver should use pci_epc_mem_free_addr() to + free the memory space allocated using pci_epc_mem_alloc_addr(). + +2.1.3 Other APIs + +There are other APIs provided by the EPC library. These are used for binding +the EPF device with EPC device. pci-ep-cfs.c can be used as reference for +using these APIs. + +*) pci_epc_get() + + Get a reference to the PCI endpoint controller based on the device name of + the controller. + +*) pci_epc_put() + + Release the reference to the PCI endpoint controller obtained using + pci_epc_get() + +*) pci_epc_add_epf() + + Add a PCI endpoint function to a PCI endpoint controller. A PCIe device + can have up to 8 functions according to the specification. + +*) pci_epc_remove_epf() + + Remove the PCI endpoint function from PCI endpoint controller. + +*) pci_epc_start() + + The PCI endpoint function driver should invoke pci_epc_start() once it + has configured the endpoint function and wants to start the PCI link. + +*) pci_epc_stop() + + The PCI endpoint function driver should invoke pci_epc_stop() to stop + the PCI LINK. 
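As a rough illustration of the registration flow described in section 2.1.1, the sketch below shows a platform driver handing a set of pci_epc_ops to the EPC core. The my_epc_* names are hypothetical, only ops named above are shown, and the exact callback prototypes are abbreviated rather than copied from the EPC headers:

static const struct pci_epc_ops my_epc_ops = {
	.write_header	= my_epc_write_header,	/* populate config space header */
	.set_bar	= my_epc_set_bar,	/* configure a BAR */
	.clear_bar	= my_epc_clear_bar,	/* reset a BAR */
	.raise_irq	= my_epc_raise_irq,	/* legacy or MSI interrupt */
	.start		= my_epc_start,		/* start the PCI link */
	.stop		= my_epc_stop,		/* stop the PCI link */
};

static int my_epc_probe(struct platform_device *pdev)
{
	struct pci_epc *epc;

	/* hand the ops to the EPC core; the EPC device is device-managed */
	epc = devm_pci_epc_create(&pdev->dev, &my_epc_ops);
	if (IS_ERR(epc))
		return PTR_ERR(epc);

	/* ... map controller registers and call pci_epc_mem_init() ... */
	return 0;
}

Once probe succeeds, the controller shows up under /sys/class/pci_epc/ and can be bound to function devices through configfs as described in Documentation/PCI/endpoint/pci-endpoint-cfs.txt.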
+ +2.2 PCI Endpoint Function (EPF) Library + +The EPF library provides APIs to be used by the function driver and the EPC +library to provide endpoint mode functionality. + +2.2.1 APIs for the PCI Endpoint Function Driver + +This section lists the APIs that the PCI Endpoint core provides to be used +by the PCI endpoint function driver. + +*) pci_epf_register_driver() + + The PCI Endpoint Function driver should implement the following ops: + * bind: ops to perform when an EPC device has been bound to the EPF device + * unbind: ops to perform when the binding between the EPC device + and the EPF device is lost + * linkup: ops to perform when the EPC device has established a + connection with a host system + + The PCI Function driver can then register the PCI EPF driver by using + pci_epf_register_driver(). + +*) pci_epf_unregister_driver() + + The PCI Function driver can unregister the PCI EPF driver by using + pci_epf_unregister_driver(). + +*) pci_epf_alloc_space() + + The PCI Function driver can allocate space for a particular BAR using + pci_epf_alloc_space(). + +*) pci_epf_free_space() + + The PCI Function driver can free the allocated space + (using pci_epf_alloc_space) by invoking pci_epf_free_space(). + +2.2.2 APIs for the PCI Endpoint Controller Library +This section lists the APIs that the PCI Endpoint core provides to be used +by the PCI endpoint controller library. + +*) pci_epf_linkup() + + The PCI endpoint controller library invokes pci_epf_linkup() when the + EPC device has established the connection to the host. + +2.2.3 Other APIs +There are other APIs provided by the EPF library. These are used to notify +the function driver when the EPF device is bound to the EPC device. +pci-ep-cfs.c can be used as reference for using these APIs. + +*) pci_epf_create() + + Create a new PCI EPF device by passing the name of the PCI EPF device. + This name will be used to bind the EPF device to an EPF driver. + +*) pci_epf_destroy() + + Destroy the created PCI EPF device. + +*) pci_epf_bind() + + pci_epf_bind() should be invoked when the EPF device has been bound to + an EPC device. + +*) pci_epf_unbind() + + pci_epf_unbind() should be invoked when the binding between the EPC device + and the EPF device is lost. diff --git a/Documentation/PCI/endpoint/pci-test-function.txt b/Documentation/PCI/endpoint/pci-test-function.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c519c9bf94a35802bff554004ab23021e1382a6 --- /dev/null +++ b/Documentation/PCI/endpoint/pci-test-function.txt @@ -0,0 +1,66 @@ + PCI TEST + Kishon Vijay Abraham I + +Traditionally, the PCI RC has been validated by using standard +PCI cards like Ethernet, USB or SATA PCI cards. +However, with the addition of the EP core in the Linux kernel, it is possible +to configure a PCI controller that can operate in EP mode to work as +a test device. + +The PCI endpoint test device is a virtual device (defined in software) +used to test the endpoint functionality and serve as a sample driver +for other PCI endpoint devices (to use the EP framework). + +The PCI endpoint test device has the following registers: + + 1) PCI_ENDPOINT_TEST_MAGIC + 2) PCI_ENDPOINT_TEST_COMMAND + 3) PCI_ENDPOINT_TEST_STATUS + 4) PCI_ENDPOINT_TEST_SRC_ADDR + 5) PCI_ENDPOINT_TEST_DST_ADDR + 6) PCI_ENDPOINT_TEST_SIZE + 7) PCI_ENDPOINT_TEST_CHECKSUM + +*) PCI_ENDPOINT_TEST_MAGIC + +This register will be used to test BAR0. A known pattern will be written +to and read back from the MAGIC register to verify BAR0.
+ +*) PCI_ENDPOINT_TEST_COMMAND: + +This register will be used by the host driver to indicate the operation +that the endpoint device must perform. + +Bitfield Description: + Bit 0 : raise legacy IRQ + Bit 1 : raise MSI IRQ + Bit 2 - 7 : MSI interrupt number + Bit 8 : read command (read data from RC buffer) + Bit 9 : write command (write data to RC buffer) + Bit 10 : copy command (copy data from one RC buffer to another + RC buffer) + +*) PCI_ENDPOINT_TEST_STATUS + +This register reflects the status of the PCI endpoint device. + +Bitfield Description: + Bit 0 : read success + Bit 1 : read fail + Bit 2 : write success + Bit 3 : write fail + Bit 4 : copy success + Bit 5 : copy fail + Bit 6 : IRQ raised + Bit 7 : source address is invalid + Bit 8 : destination address is invalid + +*) PCI_ENDPOINT_TEST_SRC_ADDR + +This register contains the source address (RC buffer address) for the +COPY/READ command. + +*) PCI_ENDPOINT_TEST_DST_ADDR + +This register contains the destination address (RC buffer address) for +the COPY/WRITE command. diff --git a/Documentation/PCI/endpoint/pci-test-howto.txt b/Documentation/PCI/endpoint/pci-test-howto.txt new file mode 100644 index 0000000000000000000000000000000000000000..75f48c3bb19135b11300d23d09b5446de431bf5e --- /dev/null +++ b/Documentation/PCI/endpoint/pci-test-howto.txt @@ -0,0 +1,179 @@ + PCI TEST USER GUIDE + Kishon Vijay Abraham I + +This document is a guide to help users use the pci-epf-test function driver +and the pci_endpoint_test host driver for testing PCI. The list of steps to +be followed on the host side and the EP side is given below. + +1. Endpoint Device + +1.1 Endpoint Controller Devices + +To find the list of endpoint controller devices in the system: + + # ls /sys/class/pci_epc/ + 51000000.pcie_ep + +If PCI_ENDPOINT_CONFIGFS is enabled: + # ls /sys/kernel/config/pci_ep/controllers + 51000000.pcie_ep + +1.2 Endpoint Function Drivers + +To find the list of endpoint function drivers in the system: + + # ls /sys/bus/pci-epf/drivers + pci_epf_test + +If PCI_ENDPOINT_CONFIGFS is enabled: + # ls /sys/kernel/config/pci_ep/functions + pci_epf_test + +1.3 Creating pci-epf-test Device + +A PCI endpoint function device can be created using configfs. To create a +pci-epf-test device, the following commands can be used: + + # mount -t configfs none /sys/kernel/config + # cd /sys/kernel/config/pci_ep/ + # mkdir functions/pci_epf_test/func1 + +The "mkdir func1" above creates the pci-epf-test function device that will +be probed by the pci_epf_test driver. + +The PCI endpoint framework populates the directory with the following +configurable fields. + + # ls functions/pci_epf_test/func1 + baseclass_code interrupt_pin revid subsys_vendor_id + cache_line_size msi_interrupts subclass_code vendorid + deviceid progif_code subsys_id + +The PCI endpoint function driver populates these entries with default values +when the device is bound to the driver. The pci-epf-test driver populates +vendorid with 0xffff and interrupt_pin with 0x0001: + + # cat functions/pci_epf_test/func1/vendorid + 0xffff + # cat functions/pci_epf_test/func1/interrupt_pin + 0x0001 + +1.4 Configuring pci-epf-test Device + +The user can configure the pci-epf-test device using its configfs entries. In +order to change the vendorid and the number of MSI interrupts used by the +function device, the following commands can be used.
+ + # echo 0x104c > functions/pci_epf_test/func1/vendorid + # echo 0xb500 > functions/pci_epf_test/func1/deviceid + # echo 16 > functions/pci_epf_test/func1/msi_interrupts + +1.5 Binding pci-epf-test Device to EP Controller + +In order for the endpoint function device to be useful, it has to be bound to +a PCI endpoint controller driver. Use configfs to bind the function +device to one of the controller drivers present in the system. + + # ln -s functions/pci_epf_test/func1 controllers/51000000.pcie_ep/ + +Once the above step is completed, the PCI endpoint is ready to establish a link +with the host. + +1.6 Start the Link + +In order for the endpoint device to establish a link with the host, the _start_ +field should be populated with '1'. + + # echo 1 > controllers/51000000.pcie_ep/start + +2. Root Complex Device + +2.1 lspci Output + +Note that the devices listed here correspond to the values populated in 1.4 above + + 00:00.0 PCI bridge: Texas Instruments Device 8888 (rev 01) + 01:00.0 Unassigned class [ff00]: Texas Instruments Device b500 + +2.2 Using the Endpoint Test Function Device + +pcitest.sh added in tools/pci/ can be used to run all the default PCI endpoint +tests. Before pcitest.sh can be used, pcitest.c should be compiled using the +following commands. + + cd <kernel-dir> + make headers_install ARCH=arm + arm-linux-gnueabihf-gcc -Iusr/include tools/pci/pcitest.c -o pcitest + cp pcitest <rootfs>/usr/sbin/ + cp tools/pci/pcitest.sh <rootfs>/usr/sbin/ + +2.2.1 pcitest.sh Output + # ./pcitest.sh + BAR tests + + BAR0: OKAY + BAR1: OKAY + BAR2: OKAY + BAR3: OKAY + BAR4: NOT OKAY + BAR5: NOT OKAY + + Interrupt tests + + LEGACY IRQ: NOT OKAY + MSI1: OKAY + MSI2: OKAY + MSI3: OKAY + MSI4: OKAY + MSI5: OKAY + MSI6: OKAY + MSI7: OKAY + MSI8: OKAY + MSI9: OKAY + MSI10: OKAY + MSI11: OKAY + MSI12: OKAY + MSI13: OKAY + MSI14: OKAY + MSI15: OKAY + MSI16: OKAY + MSI17: NOT OKAY + MSI18: NOT OKAY + MSI19: NOT OKAY + MSI20: NOT OKAY + MSI21: NOT OKAY + MSI22: NOT OKAY + MSI23: NOT OKAY + MSI24: NOT OKAY + MSI25: NOT OKAY + MSI26: NOT OKAY + MSI27: NOT OKAY + MSI28: NOT OKAY + MSI29: NOT OKAY + MSI30: NOT OKAY + MSI31: NOT OKAY + MSI32: NOT OKAY + + Read Tests + + READ ( 1 bytes): OKAY + READ ( 1024 bytes): OKAY + READ ( 1025 bytes): OKAY + READ (1024000 bytes): OKAY + READ (1024001 bytes): OKAY + + Write Tests + + WRITE ( 1 bytes): OKAY + WRITE ( 1024 bytes): OKAY + WRITE ( 1025 bytes): OKAY + WRITE (1024000 bytes): OKAY + WRITE (1024001 bytes): OKAY + + Copy Tests + + COPY ( 1 bytes): OKAY + COPY ( 1024 bytes): OKAY + COPY ( 1025 bytes): OKAY + COPY (1024000 bytes): OKAY + COPY (1024001 bytes): OKAY diff --git a/Documentation/PCI/pci-error-recovery.txt b/Documentation/PCI/pci-error-recovery.txt index da3b2176d5da74a4e279266c7b0326e27bfdf8cf..0b6bb3ef449ee7a8ee45afe9b4be61a2859ac8e3 100644 --- a/Documentation/PCI/pci-error-recovery.txt +++ b/Documentation/PCI/pci-error-recovery.txt @@ -11,7 +11,7 @@ Many PCI bus controllers are able to detect a variety of hardware PCI errors on the bus, such as parity errors on the data and address -busses, as well as SERR and PERR errors. Some of the more advanced +buses, as well as SERR and PERR errors. Some of the more advanced chipsets are able to deal with these errors; these include PCI-E chipsets, and the PCI-host bridges found on IBM Power4, Power5 and Power6-based pSeries boxes. A typical action taken is to disconnect the affected device, @@ -173,7 +173,7 @@ is STEP 6 (Permanent Failure). >>> a value of 0xff on read, and writes will be dropped. 
If more than >>> EEH_MAX_FAILS I/O's are attempted to a frozen adapter, EEH >>> assumes that the device driver has gone into an infinite loop ->>> and prints an error to syslog. A reboot is then required to +>>> and prints an error to syslog. A reboot is then required to >>> get the device working again. STEP 2: MMIO Enabled @@ -231,14 +231,14 @@ proceeds to STEP 4 (Slot Reset) STEP 3: Link Reset ------------------ The platform resets the link. This is a PCI-Express specific step -and is done whenever a non-fatal error has been detected that can be +and is done whenever a fatal error has been detected that can be "solved" by resetting the link. STEP 4: Slot Reset ------------------ In response to a return value of PCI_ERS_RESULT_NEED_RESET, the -the platform will perform a slot reset on the requesting PCI device(s). +the platform will perform a slot reset on the requesting PCI device(s). The actual steps taken by a platform to perform a slot reset will be platform-dependent. Upon completion of slot reset, the platform will call the device slot_reset() callback. @@ -258,7 +258,7 @@ configuration registers to initialize to their default conditions. For most PCI devices, a soft reset will be sufficient for recovery. Optional fundamental reset is provided to support a limited number -of PCI Express PCI devices for which a soft reset is not sufficient +of PCI Express devices for which a soft reset is not sufficient for recovery. If the platform supports PCI hotplug, then the reset might be @@ -303,7 +303,7 @@ driver performs device init only from PCI function 0: Same as above. Drivers for PCI Express cards that require a fundamental reset must -set the needs_freset bit in the pci_dev structure in their probe function. +set the needs_freset bit in the pci_dev structure in their probe function. For example, the QLogic qla2xxx driver sets the needs_freset bit for certain PCI card types: diff --git a/Documentation/PCI/pci-iov-howto.txt b/Documentation/PCI/pci-iov-howto.txt index 2d91ae25198295dc8559d2ef8d00cae1b5314989..d2a84151e99c0a37242d5dd754a3e571ab58ac53 100644 --- a/Documentation/PCI/pci-iov-howto.txt +++ b/Documentation/PCI/pci-iov-howto.txt @@ -68,6 +68,18 @@ To disable SR-IOV capability: echo 0 > \ /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_numvfs +To enable auto probing of VFs by a compatible driver on the host, run the +command below before enabling SR-IOV capabilities. This is the +default behavior. + echo 1 > \ + /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe + +To disable auto probing of VFs by a compatible driver on the host, run the +command below before enabling SR-IOV capabilities. Updating this +entry will not affect VFs which are already probed. + echo 0 > \ + /sys/bus/pci/devices/<DOMAIN:BUS:DEVICE.FUNCTION>/sriov_drivers_autoprobe + 3.2 Usage example The following piece of code illustrates the usage of the SR-IOV API.
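The listing itself falls outside the hunk quoted above. For orientation, a PF driver typically wires the API up roughly as follows; this is a sketch with hypothetical dev_* names, not the howto's verbatim example:

static int dev_sriov_configure(struct pci_dev *dev, int numvfs)
{
	int rc;

	if (numvfs > 0) {
		/* enable the requested number of VFs */
		rc = pci_enable_sriov(dev, numvfs);
		return rc ? rc : numvfs;
	}

	/* numvfs == 0 requests that SR-IOV be disabled */
	pci_disable_sriov(dev);
	return 0;
}

static struct pci_driver dev_driver = {
	.name		 = "my-pf-driver",
	.id_table	 = dev_id_table,
	.probe		 = dev_probe,
	.remove		 = dev_remove,
	.sriov_configure = dev_sriov_configure,	/* backs sriov_numvfs */
};

Writing N to sriov_numvfs invokes the driver's sriov_configure() callback with numvfs == N; writing 0 disables the VFs again.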
diff --git a/Documentation/RCU/00-INDEX b/Documentation/RCU/00-INDEX index f773a264ae02918ef6b79f002ff17d0bd4e5df65..1672573b037a73ebe4b169f7044e525b7a4bf246 100644 --- a/Documentation/RCU/00-INDEX +++ b/Documentation/RCU/00-INDEX @@ -17,7 +17,7 @@ rcu_dereference.txt rcubarrier.txt - RCU and Unloadable Modules rculist_nulls.txt - - RCU list primitives for use with SLAB_DESTROY_BY_RCU + - RCU list primitives for use with SLAB_TYPESAFE_BY_RCU rcuref.txt - Reference-count design for elements of lists/arrays protected by RCU rcu.txt diff --git a/Documentation/RCU/Design/Data-Structures/Data-Structures.html b/Documentation/RCU/Design/Data-Structures/Data-Structures.html index d583c653a703f0645c10c11cd8c1dfe761095cd6..38d6d800761f7cd944c7b290eeea19dbc0d3866e 100644 --- a/Documentation/RCU/Design/Data-Structures/Data-Structures.html +++ b/Documentation/RCU/Design/Data-Structures/Data-Structures.html @@ -19,6 +19,8 @@ to each other. The rcu_state Structure
  • The rcu_node Structure +
  • + The rcu_segcblist Structure
  • The rcu_data Structure
  • @@ -841,6 +843,134 @@ for lockdep lock-class names. Finally, lines 64-66 produce an error if the maximum number of CPUs is too large for the specified fanout. +

    +The rcu_segcblist Structure

    + +The rcu_segcblist structure maintains a segmented list of +callbacks as follows: + +
    + 1 #define RCU_DONE_TAIL        0
    + 2 #define RCU_WAIT_TAIL        1
    + 3 #define RCU_NEXT_READY_TAIL  2
    + 4 #define RCU_NEXT_TAIL        3
    + 5 #define RCU_CBLIST_NSEGS     4
    + 6
    + 7 struct rcu_segcblist {
    + 8   struct rcu_head *head;
    + 9   struct rcu_head **tails[RCU_CBLIST_NSEGS];
    +10   unsigned long gp_seq[RCU_CBLIST_NSEGS];
    +11   long len;
    +12   long len_lazy;
    +13 };
    +
    + +

    +The segments are as follows: + +

      +
    1. RCU_DONE_TAIL: Callbacks whose grace periods have elapsed. + These callbacks are ready to be invoked. +
    2. RCU_WAIT_TAIL: Callbacks that are waiting for the + current grace period. + Note that different CPUs can have different ideas about which + grace period is current, hence the ->gp_seq field. +
    3. RCU_NEXT_READY_TAIL: Callbacks waiting for the next + grace period to start. +
    4. RCU_NEXT_TAIL: Callbacks that have not yet been + associated with a grace period. +
    + +

    +The ->head pointer references the first callback or +is NULL if the list contains no callbacks (which is +not the same as being empty). +Each element of the ->tails[] array references the +->next pointer of the last callback in the corresponding +segment of the list, or the list's ->head pointer if +that segment and all previous segments are empty. +If the corresponding segment is empty but some previous segment is +not empty, then the array element is identical to its predecessor. +Older callbacks are closer to the head of the list, and new callbacks +are added at the tail. +This relationship between the ->head pointer, the +->tails[] array, and the callbacks is shown in this +diagram: + +

    nxtlist.svg + +

    In this figure, the ->head pointer references the +first +RCU callback in the list. +The ->tails[RCU_DONE_TAIL] array element references +the ->head pointer itself, indicating that none +of the callbacks is ready to invoke. +The ->tails[RCU_WAIT_TAIL] array element references callback +CB 2's ->next pointer, which indicates that +CB 1 and CB 2 are both waiting on the current grace period, +give or take possible disagreements about exactly which grace period +is the current one. +The ->tails[RCU_NEXT_READY_TAIL] array element +references the same RCU callback that ->tails[RCU_WAIT_TAIL] +does, which indicates that there are no callbacks waiting on the next +RCU grace period. +The ->tails[RCU_NEXT_TAIL] array element references +CB 4's ->next pointer, indicating that all the +remaining RCU callbacks have not yet been assigned to an RCU grace +period. +Note that the ->tails[RCU_NEXT_TAIL] array element +always references the last RCU callback's ->next pointer +unless the callback list is empty, in which case it references +the ->head pointer. + +
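The role of the ->tails[] array is easiest to see in the enqueue path. The following simplified sketch is modeled on the kernel's rcu_segcblist enqueue code but is not a verbatim copy; in particular, ->len_lazy maintenance and memory-ordering details are omitted:

static void segcblist_enqueue_sketch(struct rcu_segcblist *rsclp,
				     struct rcu_head *rhp)
{
	rsclp->len++;		/* the real code also maintains ->len_lazy */
	rhp->next = NULL;
	/* splice the callback onto the end of the overall list ... */
	*rsclp->tails[RCU_NEXT_TAIL] = rhp;
	/* ... and keep ->tails[RCU_NEXT_TAIL] referencing the last ->next */
	rsclp->tails[RCU_NEXT_TAIL] = &rhp->next;
}

Note that this works even on an empty list: ->tails[RCU_NEXT_TAIL] then references the ->head pointer itself, so the first store sets ->head.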

    +There is one additional important special case for the +->tails[RCU_NEXT_TAIL] array element: It can be NULL +when this list is disabled. +Lists are disabled when the corresponding CPU is offline or when +the corresponding CPU's callbacks are offloaded to a kthread, +both of which are described elsewhere. + +

    CPUs advance their callbacks from the +RCU_NEXT_TAIL to the RCU_NEXT_READY_TAIL to the +RCU_WAIT_TAIL to the RCU_DONE_TAIL list segments +as grace periods advance. + +

    The ->gp_seq[] array records grace-period +numbers corresponding to the list segments. +This is what allows different CPUs to have different ideas as to +which is the current grace period while still avoiding premature +invocation of their callbacks. +In particular, this allows CPUs that go idle for extended periods +to determine which of their callbacks are ready to be invoked after +reawakening. + +

    The ->len counter contains the number of +callbacks in ->head, and the +->len_lazy contains the number of those callbacks that +are known to only free memory, and whose invocation can therefore +be safely deferred. + +

    Important note: It is the ->len field that +determines whether or not there are callbacks associated with +this rcu_segcblist structure, not the ->head +pointer. +The reason for this is that all the ready-to-invoke callbacks +(that is, those in the RCU_DONE_TAIL segment) are extracted +all at once at callback-invocation time. +If callback invocation must be postponed, for example, because a +high-priority process just woke up on this CPU, then the remaining +callbacks are placed back on the RCU_DONE_TAIL segment. +Either way, the ->len and ->len_lazy counts +are adjusted after the corresponding callbacks have been invoked, and so +again it is the ->len count that accurately reflects whether +or not there are callbacks associated with this rcu_segcblist +structure. +Of course, off-CPU sampling of the ->len count requires +the use of appropriate synchronization, for example, memory barriers. +This synchronization can be a bit subtle, particularly in the case +of rcu_barrier(). +
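In code form, this distinction gives two different predicates. The sketch below mirrors, in simplified form, the helpers that accompany the rcu_segcblist structure (the names here are illustrative):

/* Is the list physically empty right now?  (->head-based test.) */
static bool segcblist_list_empty_sketch(struct rcu_segcblist *rsclp)
{
	return !rsclp->head;
}

/* Are any callbacks still associated with this structure?  (->len-based.) */
static long segcblist_n_cbs_sketch(struct rcu_segcblist *rsclp)
{
	return READ_ONCE(rsclp->len);	/* off-CPU sampling; see above */
}

During callback invocation the first predicate can be true while the second still reports outstanding callbacks, which is exactly the window discussed above.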

    The rcu_data Structure

    @@ -983,62 +1113,18 @@ choice. as follows:
    - 1 struct rcu_head *nxtlist;
    - 2 struct rcu_head **nxttail[RCU_NEXT_SIZE];
    - 3 unsigned long nxtcompleted[RCU_NEXT_SIZE];
    - 4 long qlen_lazy;
    - 5 long qlen;
    - 6 long qlen_last_fqs_check;
    + 1 struct rcu_segcblist cblist;
    + 2 long qlen_last_fqs_check;
    + 3 unsigned long n_cbs_invoked;
    + 4 unsigned long n_nocbs_invoked;
    + 5 unsigned long n_cbs_orphaned;
    + 6 unsigned long n_cbs_adopted;
      7 unsigned long n_force_qs_snap;
    - 8 unsigned long n_cbs_invoked;
    - 9 unsigned long n_cbs_orphaned;
    -10 unsigned long n_cbs_adopted;
    -11 long blimit;
    + 8 long blimit;
     
    -

    The ->nxtlist pointer and the -->nxttail[] array form a four-segment list with -older callbacks near the head and newer ones near the tail. -Each segment contains callbacks with the corresponding relationship -to the current grace period. -The pointer out of the end of each of the four segments is referenced -by the element of the ->nxttail[] array indexed by -RCU_DONE_TAIL (for callbacks handled by a prior grace period), -RCU_WAIT_TAIL (for callbacks waiting on the current grace period), -RCU_NEXT_READY_TAIL (for callbacks that will wait on the next -grace period), and -RCU_NEXT_TAIL (for callbacks that are not yet associated -with a specific grace period) -respectively, as shown in the following figure. - -

    nxtlist.svg - -

    In this figure, the ->nxtlist pointer references the -first -RCU callback in the list. -The ->nxttail[RCU_DONE_TAIL] array element references -the ->nxtlist pointer itself, indicating that none -of the callbacks is ready to invoke. -The ->nxttail[RCU_WAIT_TAIL] array element references callback -CB 2's ->next pointer, which indicates that -CB 1 and CB 2 are both waiting on the current grace period. -The ->nxttail[RCU_NEXT_READY_TAIL] array element -references the same RCU callback that ->nxttail[RCU_WAIT_TAIL] -does, which indicates that there are no callbacks waiting on the next -RCU grace period. -The ->nxttail[RCU_NEXT_TAIL] array element references -CB 4's ->next pointer, indicating that all the -remaining RCU callbacks have not yet been assigned to an RCU grace -period. -Note that the ->nxttail[RCU_NEXT_TAIL] array element -always references the last RCU callback's ->next pointer -unless the callback list is empty, in which case it references -the ->nxtlist pointer. - -

    CPUs advance their callbacks from the -RCU_NEXT_TAIL to the RCU_NEXT_READY_TAIL to the -RCU_WAIT_TAIL to the RCU_DONE_TAIL list segments -as grace periods advance. +

    The ->cblist structure is the segmented callback list +described earlier. The CPU advances the callbacks in its rcu_data structure whenever it notices that another RCU grace period has completed. The CPU detects the completion of an RCU grace period by noticing @@ -1049,16 +1135,7 @@ Recall that each rcu_node structure's ->completed field is updated at the end of each grace period. -

    The ->nxtcompleted[] array records grace-period -numbers corresponding to the list segments. -This allows CPUs that go idle for extended periods to determine -which of their callbacks are ready to be invoked after reawakening. - -

    The ->qlen counter contains the number of -callbacks in ->nxtlist, and the -->qlen_lazy contains the number of those callbacks that -are known to only free memory, and whose invocation can therefore -be safely deferred. +

    The ->qlen_last_fqs_check and ->n_force_qs_snap coordinate the forcing of quiescent states from call_rcu() and friends when callback @@ -1069,6 +1146,10 @@ lists grow excessively long. fields count the number of callbacks invoked, sent to other CPUs when this CPU goes offline, and received from other CPUs when those other CPUs go offline. +The ->n_nocbs_invoked is used when the CPU's callbacks +are offloaded to a kthread. + +

    Finally, the ->blimit counter is the maximum number of RCU callbacks that may be invoked at a given time. @@ -1104,6 +1185,9 @@ Its fields are as follows: 1 int dynticks_nesting; 2 int dynticks_nmi_nesting; 3 atomic_t dynticks; + 4 bool rcu_need_heavy_qs; + 5 unsigned long rcu_qs_ctr; + 6 bool rcu_urgent_qs;

    The ->dynticks_nesting field counts the @@ -1117,11 +1201,32 @@ NMIs are counted by the ->dynticks_nmi_nesting field, except that NMIs that interrupt non-dyntick-idle execution are not counted. -

    Finally, the ->dynticks field counts the corresponding +

    The ->dynticks field counts the corresponding CPU's transitions to and from dyntick-idle mode, so that this counter has an even value when the CPU is in dyntick-idle mode and an odd value otherwise. +

    The ->rcu_need_heavy_qs field is used +to record the fact that the RCU core code would really like to +see a quiescent state from the corresponding CPU, so much so that +it is willing to call for heavy-weight dyntick-counter operations. +This flag is checked by RCU's context-switch and cond_resched() +code, which provide a momentary idle sojourn in response. + +

    The ->rcu_qs_ctr field is used to record +quiescent states from cond_resched(). +Because cond_resched() can execute quite frequently, this +must be quite lightweight, as in a non-atomic increment of this +per-CPU field. + +
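Schematically, the fast-path report is nothing more than a non-atomic per-CPU increment; a sketch, not the exact kernel statement:

	/* from the cond_resched() path: cheap, owner-CPU-only update */
	this_cpu_inc(rcu_dynticks.rcu_qs_ctr);

Because only the owning CPU updates the counter, no atomic read-modify-write instruction or memory barrier is needed here.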

    Finally, the ->rcu_urgent_qs field is used to record +the fact that the RCU core code would really like to see a quiescent +state from the corresponding CPU, with the various other fields indicating +just how badly RCU wants this quiescent state. +This flag is checked by RCU's context-switch and cond_resched() +code, which, if nothing else, non-atomically increment ->rcu_qs_ctr +in response. + diff --git a/Documentation/RCU/Design/Data-Structures/nxtlist.svg b/Documentation/RCU/Design/Data-Structures/nxtlist.svg index abc4cc73a0977195317d7a0397f1f5e7c52f35b3..0223e79c38e0036373dcfc919b7296bd87fa300e 100644 --- a/Documentation/RCU/Design/Data-Structures/nxtlist.svg +++ b/Documentation/RCU/Design/Data-Structures/nxtlist.svg @@ -19,7 +19,7 @@ id="svg2" version="1.1" inkscape:version="0.48.4 r9939" - sodipodi:docname="nxtlist.fig"> + sodipodi:docname="segcblist.svg"> @@ -28,7 +28,7 @@ image/svg+xml - + @@ -241,61 +241,51 @@ xml:space="preserve" x="225" y="675" - fill="#000000" - font-family="Courier" font-style="normal" font-weight="bold" font-size="324" - text-anchor="start" - id="text64">nxtlist + id="text64" + style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->headnxttail[RCU_DONE_TAIL] + id="text66" + style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_DONE_TAIL]nxttail[RCU_WAIT_TAIL] + id="text68" + style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_WAIT_TAIL]nxttail[RCU_NEXT_READY_TAIL] + id="text70" + style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_NEXT_READY_TAIL]nxttail[RCU_NEXT_TAIL] + id="text72" + style="font-size:324px;font-style:normal;font-weight:bold;text-anchor:start;fill:#000000;font-family:Courier">->tails[RCU_NEXT_TAIL] Funnel locking and wait/wakeup.
  • Use of Workqueues.
  • Stall warnings. +
  • Mid-boot operation.

    Idle-CPU Checks

@@ -524,7 +525,7 @@ their grace periods and carrying out their wakeups. In earlier implementations, the task requesting the expedited grace period also drove it to completion. This straightforward approach had the disadvantage of needing to -account for signals sent to user tasks, +account for POSIX signals sent to user tasks, so more recent implementations use the Linux kernel's workqueues. @@ -533,8 +534,8 @@ The requesting task still does counter snapshotting and funnel-lock processing, but the task reaching the top of the funnel lock does a schedule_work() (from _synchronize_rcu_expedited()) so that a workqueue kthread does the actual grace-period processing. -Because workqueue kthreads do not accept signals, grace-period-wait -processing need not allow for signals. +Because workqueue kthreads do not accept POSIX signals, grace-period-wait +processing need not allow for POSIX signals. In addition, this approach allows wakeups for the previous expedited grace period to be overlapped with processing for the next expedited @@ -586,6 +587,46 @@ blocking the current grace period are printed. Each stall warning results in another pass through the loop, but the second and subsequent passes use longer stall times. +

    Mid-boot operation

    + +

+The use of workqueues has the advantage that the expedited +grace-period code need not worry about POSIX signals. +Unfortunately, it has the +corresponding disadvantage that workqueues cannot be used until +they are initialized, which does not happen until some time after +the scheduler spawns the first task. +Given that there are parts of the kernel that really do want to +execute grace periods during this mid-boot “dead zone”, +expedited grace periods must do something else during this time. + +

    +What they do is to fall back to the old practice of requiring that the +requesting task drive the expedited grace period, as was the case +before the use of workqueues. +However, the requesting task is only required to drive the grace period +during the mid-boot dead zone. +Before mid-boot, a synchronous grace period is a no-op. +Some time after mid-boot, workqueues are used. + +

    +Non-expedited non-SRCU synchronous grace periods must also operate +normally during mid-boot. +This is handled by causing non-expedited grace periods to take the +expedited code path during mid-boot. + +

    +The current code assumes that there are no POSIX signals during +the mid-boot dead zone. +However, if an overwhelming need for POSIX signals somehow arises, +appropriate adjustments can be made to the expedited stall-warning code. +One such adjustment would reinstate the pre-workqueue stall-warning +checks, but only during the mid-boot dead zone. + +

    +With this refinement, synchronous grace periods can now be used from +task context pretty much any time during the life of the kernel. +
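Schematically, the expedited entry point chooses between the two modes as follows. This is a simplified sketch of the logic rather than the exact kernel code; drive_expedited_gp() stands in for the real grace-period-driving helper, and rew is the request structure handed to the workqueue:

	if (rcu_scheduler_active == RCU_SCHEDULER_INIT) {
		/* mid-boot dead zone: the requesting task drives the GP */
		drive_expedited_gp();
	} else {
		/* normal operation: a workqueue kthread drives the GP,
		 * so no POSIX-signal handling is needed while waiting */
		INIT_WORK_ONSTACK(&rew.rew_work, wait_rcu_exp_gp);
		schedule_work(&rew.rew_work);
	}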

    Summary

    diff --git a/Documentation/RCU/Design/Requirements/Requirements.html b/Documentation/RCU/Design/Requirements/Requirements.html index 21593496aca6f617957b667f4f3aa5d627e86c4d..f60adf112663aad038e928e7b7cce04a65752277 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.html +++ b/Documentation/RCU/Design/Requirements/Requirements.html @@ -659,8 +659,9 @@ systems with more than one CPU: In other words, a given instance of synchronize_rcu() can avoid waiting on a given RCU read-side critical section only if it can prove that synchronize_rcu() started first. + -

    +

    A related question is “When rcu_read_lock() doesn't generate any code, why does it matter how it relates to a grace period?” @@ -675,8 +676,9 @@ systems with more than one CPU: within the critical section, in which case none of the accesses within the critical section may observe the effects of any access following the grace period. + -

    +

    As of late 2016, mathematical models of RCU take this viewpoint, for example, see slides 62 and 63 of the @@ -1616,8 +1618,8 @@ CPUs should at least make reasonable forward progress. In return for its shorter latencies, synchronize_rcu_expedited() is permitted to impose modest degradation of real-time latency on non-idle online CPUs. -That said, it will likely be necessary to take further steps to reduce this -degradation, hopefully to roughly that of a scheduling-clock interrupt. +Here, “modest” means roughly the same latency +degradation as a scheduling-clock interrupt.

    There are a number of situations where even @@ -1913,12 +1915,9 @@ This requirement is another factor driving batching of grace periods, but it is also the driving force behind the checks for large numbers of queued RCU callbacks in the call_rcu() code path. Finally, high update rates should not delay RCU read-side critical -sections, although some read-side delays can occur when using +sections, although some small read-side delays can occur when using synchronize_rcu_expedited(), courtesy of this function's use -of try_stop_cpus(). -(In the future, synchronize_rcu_expedited() will be -converted to use lighter-weight inter-processor interrupts (IPIs), -but this will still disturb readers, though to a much smaller degree.) +of smp_call_function_single().

    Although all three of these corner cases were understood in the early @@ -2154,7 +2153,8 @@ as will rcu_assign_pointer().

    Although call_rcu() may be invoked at any time during boot, callbacks are not guaranteed to be invoked until after -the scheduler is fully up and running. +all of RCU's kthreads have been spawned, which occurs at +early_initcall() time. This delay in callback invocation is due to the fact that RCU does not invoke callbacks until it is fully initialized, and this full initialization cannot occur until after the scheduler has initialized itself to the @@ -2167,8 +2167,10 @@ on what operations those callbacks could invoke. Perhaps surprisingly, synchronize_rcu(), synchronize_rcu_bh() (discussed below), -and -synchronize_sched() +synchronize_sched(), +synchronize_rcu_expedited(), +synchronize_rcu_bh_expedited(), and +synchronize_sched_expedited() will all operate normally during very early boot, the reason being that there is only one CPU and preemption is disabled. @@ -2178,45 +2180,59 @@ state and thus a grace period, so the early-boot implementation can be a no-op.

    -Both synchronize_rcu_bh() and synchronize_sched() -continue to operate normally through the remainder of boot, courtesy -of the fact that preemption is disabled across their RCU read-side -critical sections and also courtesy of the fact that there is still -only one CPU. -However, once the scheduler starts initializing, preemption is enabled. -There is still only a single CPU, but the fact that preemption is enabled -means that the no-op implementation of synchronize_rcu() no -longer works in CONFIG_PREEMPT=y kernels. -Therefore, as soon as the scheduler starts initializing, the early-boot -fastpath is disabled. -This means that synchronize_rcu() switches to its runtime -mode of operation where it posts callbacks, which in turn means that -any call to synchronize_rcu() will block until the corresponding -callback is invoked. -Unfortunately, the callback cannot be invoked until RCU's runtime -grace-period machinery is up and running, which cannot happen until -the scheduler has initialized itself sufficiently to allow RCU's -kthreads to be spawned. -Therefore, invoking synchronize_rcu() during scheduler -initialization can result in deadlock. +However, once the scheduler has spawned its first kthread, this early +boot trick fails for synchronize_rcu() (as well as for +synchronize_rcu_expedited()) in CONFIG_PREEMPT=y +kernels. +The reason is that an RCU read-side critical section might be preempted, +which means that a subsequent synchronize_rcu() really does have +to wait for something, as opposed to simply returning immediately. +Unfortunately, synchronize_rcu() can't do this until all of +its kthreads are spawned, which doesn't happen until some time during +early_initcalls() time. +But this is no excuse: RCU is nevertheless required to correctly handle +synchronous grace periods during this time period. +Once all of its kthreads are up and running, RCU starts running +normally.

  •  
    Quick Quiz:
     
    Quick Quiz:
    - So what happens with synchronize_rcu() during - scheduler initialization for CONFIG_PREEMPT=n - kernels? + How can RCU possibly handle grace periods before all of its + kthreads have been spawned???
    Answer:
    - In CONFIG_PREEMPT=n kernel, synchronize_rcu() - maps directly to synchronize_sched(). - Therefore, synchronize_rcu() works normally throughout - boot in CONFIG_PREEMPT=n kernels. - However, your code must also work in CONFIG_PREEMPT=y kernels, - so it is still necessary to avoid invoking synchronize_rcu() - during scheduler initialization. + Very carefully! + + +

    + During the “dead zone” between the time that the + scheduler spawns the first task and the time that all of RCU's + kthreads have been spawned, all synchronous grace periods are + handled by the expedited grace-period mechanism. + At runtime, this expedited mechanism relies on workqueues, but + during the dead zone the requesting task itself drives the + desired expedited grace period. + Because dead-zone execution takes place within task context, + everything works. + Once the dead zone ends, expedited grace periods go back to + using workqueues, as is required to avoid problems that would + otherwise occur when a user task received a POSIX signal while + driving an expedited grace period. + + +

    + And yes, this does mean that it is unhelpful to send POSIX + signals to random tasks between the time that the scheduler + spawns its first kthread and the time that RCU's kthreads + have all been spawned. + If there ever turns out to be a good reason for sending POSIX + signals during that time, appropriate adjustments will be made. + (If it turns out that POSIX signals are sent during this time for + no good reason, other adjustments will be made, appropriate + or otherwise.)

     
    @@ -2295,12 +2311,61 @@ situation, and Dipankar Sarma incorporated rcu_barrier() into RCU. The need for rcu_barrier() for module unloading became apparent later. +

    +Important note: The rcu_barrier() function is not, +repeat, not, obligated to wait for a grace period. +It is instead only required to wait for RCU callbacks that have +already been posted. +Therefore, if there are no RCU callbacks posted anywhere in the system, +rcu_barrier() is within its rights to return immediately. +Even if there are callbacks posted, rcu_barrier() does not +necessarily need to wait for a grace period. + + + + + + + + +
     
    Quick Quiz:
+ Wait a minute! + Each RCU callback must wait for a grace period to complete, + and rcu_barrier() must wait for each pre-existing + callback to be invoked. + Doesn't rcu_barrier() therefore need to wait for + a full grace period if there is even one callback posted anywhere + in the system? +
    Answer:
    + Absolutely not!!! + + +

+ Yes, each RCU callback must wait for a grace period to complete, + but it might well be partly (or even completely) finished waiting + by the time rcu_barrier() is invoked. + In that case, rcu_barrier() need only wait for the + remaining portion of the grace period to elapse. + So even if there are quite a few callbacks posted, + rcu_barrier() might well return quite quickly. + +

    + So if you need to wait for a grace period as well as for all + pre-existing callbacks, you will need to invoke both + synchronize_rcu() and rcu_barrier(). + If latency is a concern, you can always use workqueues + to invoke them concurrently. +

     
    +
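For example, the concurrent combination suggested in the answer above can be built from an ordinary work item. A minimal sketch:

static void sync_rcu_work_fn(struct work_struct *work)
{
	synchronize_rcu();		/* wait for a full grace period */
}

static void wait_for_gp_and_callbacks(void)
{
	struct work_struct sync_work;

	INIT_WORK_ONSTACK(&sync_work, sync_rcu_work_fn);
	schedule_work(&sync_work);	/* grace period waited on here ... */
	rcu_barrier();			/* ... while we wait for callbacks */
	flush_work(&sync_work);		/* now wait for the grace period too */
	destroy_work_on_stack(&sync_work);
}

The two waits overlap, so the total latency is roughly the maximum of the two rather than their sum.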

    Hotplug CPU

    The Linux kernel supports CPU hotplug, which means that CPUs can come and go. -It is of course illegal to use any RCU API member from an offline CPU. +It is of course illegal to use any RCU API member from an offline CPU, +with the exception of SRCU read-side +critical sections. This requirement was present from day one in DYNIX/ptx, but on the other hand, the Linux kernel's CPU-hotplug implementation is “interesting.” @@ -2310,19 +2375,18 @@ The Linux-kernel CPU-hotplug implementation has notifiers that are used to allow the various kernel subsystems (including RCU) to respond appropriately to a given CPU-hotplug operation. Most RCU operations may be invoked from CPU-hotplug notifiers, -including even normal synchronous grace-period operations -such as synchronize_rcu(). -However, expedited grace-period operations such as -synchronize_rcu_expedited() are not supported, -due to the fact that current implementations block CPU-hotplug -operations, which could result in deadlock. +including even synchronous grace-period operations such as +synchronize_rcu() and synchronize_rcu_expedited().

    -In addition, all-callback-wait operations such as +However, all-callback-wait operations such as rcu_barrier() are also not supported, due to the fact that there are phases of CPU-hotplug operations where the outgoing CPU's callbacks will not be invoked until after the CPU-hotplug operation ends, which could also result in deadlock. +Furthermore, rcu_barrier() blocks CPU-hotplug operations +during its execution, which results in another type of deadlock +when invoked from a CPU-hotplug notifier.

    Scheduler and RCU

    @@ -2863,6 +2927,27 @@ It also motivates the smp_mb__after_srcu_read_unlock() API, which, in combination with srcu_read_unlock(), guarantees a full memory barrier. +

    +Also unlike other RCU flavors, SRCU's callbacks-wait function +srcu_barrier() may be invoked from CPU-hotplug notifiers, +though this is not necessarily a good idea. +The reason that this is possible is that SRCU is insensitive +to whether or not a CPU is online, which means that srcu_barrier() +need not exclude CPU-hotplug operations. + +

+As of v4.12, SRCU's callbacks are maintained per-CPU, eliminating +a locking bottleneck present in prior kernel versions. +Although this will allow users to put much heavier stress on +call_srcu(), it is important to note that SRCU does not +yet take any special steps to deal with callback flooding. +So if you are posting (say) 10,000 SRCU callbacks per second per CPU, +you are probably totally OK, but if you intend to post (say) 1,000,000 +SRCU callbacks per second per CPU, please run some tests first. +SRCU just might need a few adjustments to deal with that sort of load. +Of course, your mileage may vary based on the speed of your CPUs and +the size of your memory. +
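For reference, a minimal sketch of posting an SRCU callback; the structure, srcu_struct, and reclaim function are hypothetical, while call_srcu() and srcu_barrier() are the real APIs:

	struct foo {
		struct rcu_head rh;
		/* ... payload ... */
	};
	DEFINE_SRCU(my_srcu);

	static void foo_reclaim(struct rcu_head *rhp)
	{
		kfree(container_of(rhp, struct foo, rh));
	}

	static void foo_free(struct foo *p)
	{
		/* Post a callback; a later srcu_barrier(&my_srcu) waits
		 * for all callbacks posted before it was invoked. */
		call_srcu(&my_srcu, &p->rh, foo_reclaim);
	}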

    The SRCU API @@ -3021,8 +3106,8 @@ to do some redesign to avoid this scalability problem.

RCU disables CPU hotplug in a few places, perhaps most notably in the -expedited grace-period and rcu_barrier() operations. +rcu_barrier() operations. -If there is a strong reason to use expedited grace periods in CPU-hotplug +If there is a strong reason to use rcu_barrier() in CPU-hotplug notifiers, it will be necessary to avoid disabling CPU hotplug. This would introduce some complexity, so there had better be a very good reason. @@ -3096,9 +3181,5 @@ Andy Lutomirski for their help in rendering this article human readable, and to Michelle Rankin for her support of this effort. Other contributions are acknowledged in the Linux kernel's git archive. -The cartoon is copyright (c) 2013 by Melissa Broussard, -and is provided -under the terms of the Creative Commons Attribution-Share Alike 3.0 -United States license. diff --git a/Documentation/RCU/rcu_dereference.txt b/Documentation/RCU/rcu_dereference.txt index c0bf2441a2baf5f254ed64d6e4c117dda808fd2c..b2a613f16d747828e35fd182a0c9fe06c1107d0f 100644 --- a/Documentation/RCU/rcu_dereference.txt +++ b/Documentation/RCU/rcu_dereference.txt @@ -138,6 +138,15 @@ o Be very careful about comparing pointers obtained from This sort of comparison occurs frequently when scanning RCU-protected circular linked lists. + Note that if checks for being within an RCU read-side + critical section are not required and the pointer is never + dereferenced, rcu_access_pointer() should be used in place + of rcu_dereference(). The rcu_access_pointer() primitive + does not require an enclosing read-side critical section, + and also omits the smp_read_barrier_depends() included in + rcu_dereference(), which in turn should provide a small + performance gain on some CPUs (e.g., the DEC Alpha). + o The comparison is against a pointer that references memory that was initialized "a long time ago." The reason this is safe is that even if misordering occurs, the diff --git a/Documentation/RCU/rculist_nulls.txt b/Documentation/RCU/rculist_nulls.txt index 18f9651ff23d411e96737ec070d4fc6bc29c50fe..8151f0195f7688386e8057ad34fdc9cc153b7fb8 100644 --- a/Documentation/RCU/rculist_nulls.txt +++ b/Documentation/RCU/rculist_nulls.txt @@ -1,5 +1,5 @@ Using hlist_nulls to protect read-mostly linked lists and -objects using SLAB_DESTROY_BY_RCU allocations. +objects using SLAB_TYPESAFE_BY_RCU allocations. Please read the basics in Documentation/RCU/listRCU.txt @@ -7,7 +7,7 @@ Using special markers (called 'nulls') is a convenient way to solve the following problem: A typical RCU linked list managing objects which are -allocated with SLAB_DESTROY_BY_RCU kmem_cache can +allocated with SLAB_TYPESAFE_BY_RCU kmem_cache can use the following algos: 1) Lookup algo @@ -96,7 +96,7 @@ unlock_chain(); // typically a spin_unlock() 3) Remove algo -------------- Nothing special here, we can use a standard RCU hlist deletion. -But thanks to SLAB_DESTROY_BY_RCU, beware a deleted object can be reused +But thanks to SLAB_TYPESAFE_BY_RCU, beware: a deleted object can be reused very, very fast (before the end of the RCU grace period) if (put_last_reference_on(obj)) { diff --git a/Documentation/RCU/stallwarn.txt b/Documentation/RCU/stallwarn.txt index e93d04133fe7ae58cfb6c3da3e0717b0640f3b9a..96a3d81837e1b120098eccfd1b95b0bfc4947be9 100644 --- a/Documentation/RCU/stallwarn.txt +++ b/Documentation/RCU/stallwarn.txt @@ -1,9 +1,102 @@ Using RCU's CPU Stall Detector -The rcu_cpu_stall_suppress module parameter enables RCU's CPU stall -detector, which detects conditions that unduly delay RCU grace periods.
-This module parameter enables CPU stall detection by default, but -may be overridden via boot-time parameter or at runtime via sysfs. +This document first discusses what sorts of issues RCU's CPU stall +detector can locate, and then discusses kernel parameters and Kconfig +options that can be used to fine-tune the detector's operation. Finally, +this document explains the stall detector's "splat" format. + + +What Causes RCU CPU Stall Warnings? + +So your kernel printed an RCU CPU stall warning. The next question is +"What caused it?" The following problems can result in RCU CPU stall +warnings: + +o A CPU looping in an RCU read-side critical section. + +o A CPU looping with interrupts disabled. + +o A CPU looping with preemption disabled. This condition can + result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh + stalls. + +o A CPU looping with bottom halves disabled. This condition can + result in RCU-sched and RCU-bh stalls. + +o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the + kernel without invoking schedule(). Note that cond_resched() + does not necessarily prevent RCU CPU stall warnings. Therefore, + if the looping in the kernel is really expected and desirable + behavior, you might need to replace some of the cond_resched() + calls with calls to cond_resched_rcu_qs(). + +o Booting Linux using a console connection that is too slow to + keep up with the boot-time console-message rate. For example, + a 115Kbaud serial console can be -way- too slow to keep up + with boot-time message rates, and will frequently result in + RCU CPU stall warning messages. Especially if you have added + debug printk()s. + +o Anything that prevents RCU's grace-period kthreads from running. + This can result in the "All QSes seen" console-log message. + This message will include information on when the kthread last + ran and how often it should be expected to run. + +o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might + happen to preempt a low-priority task in the middle of an RCU + read-side critical section. This is especially damaging if + that low-priority task is not permitted to run on any other CPU, + in which case the next RCU grace period can never complete, which + will eventually cause the system to run out of memory and hang. + While the system is in the process of running itself out of + memory, you might see stall-warning messages. + +o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that + is running at a higher priority than the RCU softirq threads. + This will prevent RCU callbacks from ever being invoked, + and in a CONFIG_PREEMPT_RCU kernel will further prevent + RCU grace periods from ever completing. Either way, the + system will eventually run out of memory and hang. In the + CONFIG_PREEMPT_RCU case, you might see stall-warning + messages. + +o A hardware or software issue shuts off the scheduler-clock + interrupt on a CPU that is not in dyntick-idle mode. This + problem really has happened, and seems to be most likely to + result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. + +o A bug in the RCU implementation. + +o A hardware failure. This is quite unlikely, but has occurred + at least once in real life. A CPU failed in a running system, + becoming unresponsive, but not causing an immediate crash. + This resulted in a series of RCU CPU stall warnings, eventually + leading to the realization that the CPU had failed. + +The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall +warnings.
Note that SRCU does -not- have CPU stall warnings. Please note +that RCU only detects CPU stalls when there is a grace period in progress. +No grace period, no CPU stall warnings. + +To diagnose the cause of the stall, inspect the stack traces. +The offending function will usually be near the top of the stack. +If you have a series of stall warnings from a single extended stall, +comparing the stack traces can often help determine where the stall +is occurring, which will usually be in the function nearest the top of +that portion of the stack which remains the same from trace to trace. +If you can reliably trigger the stall, ftrace can be quite helpful. + +RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE +and with RCU's event tracing. For information on RCU's event tracing, +see include/trace/events/rcu.h. + + +Fine-Tuning the RCU CPU Stall Detector + +The rcupdate.rcu_cpu_stall_suppress module parameter disables RCU's +CPU stall detector, which detects conditions that unduly delay RCU grace +periods. CPU stall detection is enabled by default, but it may be +disabled via this boot-time parameter or at runtime via sysfs. The stall detector's idea of what constitutes "unduly delayed" is controlled by a set of kernel configuration variables and cpp macros: @@ -56,6 +149,9 @@ rcupdate.rcu_task_stall_timeout And continues with the output of sched_show_task() for each task stalling the current RCU-tasks grace period. + +Interpreting RCU's CPU Stall-Detector "Splats" + For non-RCU-tasks flavors of RCU, when a CPU detects that it is stalling, it will print a message similar to the following: @@ -178,89 +274,3 @@ grace period is in flight. It is entirely possible to see stall warnings from normal and from expedited grace periods at about the same time from the same run. - - -What Causes RCU CPU Stall Warnings? - -So your kernel printed an RCU CPU stall warning. The next question is -"What caused it?" The following problems can result in RCU CPU stall -warnings: - -o A CPU looping in an RCU read-side critical section. - -o A CPU looping with interrupts disabled. This condition can - result in RCU-sched and RCU-bh stalls. - -o A CPU looping with preemption disabled. This condition can - result in RCU-sched stalls and, if ksoftirqd is in use, RCU-bh - stalls. - -o A CPU looping with bottom halves disabled. This condition can - result in RCU-sched and RCU-bh stalls. - -o For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the - kernel without invoking schedule(). Note that cond_resched() - does not necessarily prevent RCU CPU stall warnings. Therefore, - if the looping in the kernel is really expected and desirable - behavior, you might need to replace some of the cond_resched() - calls with calls to cond_resched_rcu_qs(). - -o Booting Linux using a console connection that is too slow to - keep up with the boot-time console-message rate. For example, - a 115Kbaud serial console can be -way- too slow to keep up - with boot-time message rates, and will frequently result in - RCU CPU stall warning messages. Especially if you have added - debug printk()s. - -o Anything that prevents RCU's grace-period kthreads from running. - This can result in the "All QSes seen" console-log message. - This message will include information on when the kthread last - ran and how often it should be expected to run. - -o A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might - happen to preempt a low-priority task in the middle of an RCU - read-side critical section.
This is especially damaging if - that low-priority task is not permitted to run on any other CPU, - in which case the next RCU grace period can never complete, which - will eventually cause the system to run out of memory and hang. - While the system is in the process of running itself out of - memory, you might see stall-warning messages. - -o A CPU-bound real-time task in a CONFIG_PREEMPT_RT kernel that - is running at a higher priority than the RCU softirq threads. - This will prevent RCU callbacks from ever being invoked, - and in a CONFIG_PREEMPT_RCU kernel will further prevent - RCU grace periods from ever completing. Either way, the - system will eventually run out of memory and hang. In the - CONFIG_PREEMPT_RCU case, you might see stall-warning - messages. - -o A hardware or software issue shuts off the scheduler-clock - interrupt on a CPU that is not in dyntick-idle mode. This - problem really has happened, and seems to be most likely to - result in RCU CPU stall warnings for CONFIG_NO_HZ_COMMON=n kernels. - -o A bug in the RCU implementation. - -o A hardware failure. This is quite unlikely, but has occurred - at least once in real life. A CPU failed in a running system, - becoming unresponsive, but not causing an immediate crash. - This resulted in a series of RCU CPU stall warnings, eventually - leading the realization that the CPU had failed. - -The RCU, RCU-sched, RCU-bh, and RCU-tasks implementations have CPU stall -warning. Note that SRCU does -not- have CPU stall warnings. Please note -that RCU only detects CPU stalls when there is a grace period in progress. -No grace period, no CPU stall warnings. - -To diagnose the cause of the stall, inspect the stack traces. -The offending function will usually be near the top of the stack. -If you have a series of stall warnings from a single extended stall, -comparing the stack traces can often help determine where the stall -is occurring, which will usually be in the function nearest the top of -that portion of the stack which remains the same from trace to trace. -If you can reliably trigger the stall, ftrace can be quite helpful. - -RCU bugs can often be debugged with the help of CONFIG_RCU_TRACE -and with RCU's event tracing. For information on RCU's event tracing, -see include/trace/events/rcu.h. diff --git a/Documentation/RCU/whatisRCU.txt b/Documentation/RCU/whatisRCU.txt index 5cbd8b2395b811489c68acb7da4616b4a1ef9523..8ed6c9f6133c45a54c442d04ed0814dc2dcd1a45 100644 --- a/Documentation/RCU/whatisRCU.txt +++ b/Documentation/RCU/whatisRCU.txt @@ -562,7 +562,9 @@ This section presents a "toy" RCU implementation that is based on familiar locking primitives. Its overhead makes it a non-starter for real-life use, as does its lack of scalability. It is also unsuitable for realtime use, since it allows scheduling latency to "bleed" from -one read-side critical section to another. +one read-side critical section to another. It also assumes recursive +reader-writer locks: If you try this with non-recursive locks, and +you allow nested rcu_read_lock() calls, you can deadlock. However, it is probably the easiest implementation to relate to, so is a good starting point. @@ -587,20 +589,21 @@ It is extremely simple: write_unlock(&rcu_gp_mutex); } -[You can ignore rcu_assign_pointer() and rcu_dereference() without -missing much. But here they are anyway. And whatever you do, don't -forget about them when submitting patches making use of RCU!] +[You can ignore rcu_assign_pointer() and rcu_dereference() without missing +much. 
But here are simplified versions anyway. And whatever you do, +don't forget about them when submitting patches making use of RCU!] - #define rcu_assign_pointer(p, v) ({ \ - smp_wmb(); \ - (p) = (v); \ - }) + #define rcu_assign_pointer(p, v) \ + ({ \ + smp_store_release(&(p), (v)); \ + }) - #define rcu_dereference(p) ({ \ - typeof(p) _________p1 = p; \ - smp_read_barrier_depends(); \ - (_________p1); \ - }) + #define rcu_dereference(p) \ + ({ \ + typeof(p) _________p1 = p; \ + smp_read_barrier_depends(); \ + (_________p1); \ + }) The rcu_read_lock() and rcu_read_unlock() primitive read-acquire @@ -925,7 +928,8 @@ d. Do you need RCU grace periods to complete even in the face e. Is your workload too update-intensive for normal use of RCU, but inappropriate for other synchronization mechanisms? - If so, consider SLAB_DESTROY_BY_RCU. But please be careful! + If so, consider SLAB_TYPESAFE_BY_RCU (which was originally + named SLAB_DESTROY_BY_RCU). But please be careful! f. Do you need read-side critical sections that are respected even though they are in the middle of the idle loop, during diff --git a/Documentation/acpi/aml-debugger.txt b/Documentation/acpi/aml-debugger.txt index 5f62aa4a493b60ef9ab8043dbf29eb12f7537361..e851cc5de63f29e8e8b89bc9c5f95e660f51f856 100644 --- a/Documentation/acpi/aml-debugger.txt +++ b/Documentation/acpi/aml-debugger.txt @@ -15,7 +15,7 @@ kernel. CONFIG_ACPI_DEBUGGER=y CONFIG_ACPI_DEBUGGER_USER=m - The userspace utlities can be built from the kernel source tree using + The userspace utilities can be built from the kernel source tree using the following commands: $ cd tools diff --git a/Documentation/acpi/dsd/graph.txt b/Documentation/acpi/dsd/graph.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac09e3138b791213c9bdd4e74f698d818a6ecbd6 --- /dev/null +++ b/Documentation/acpi/dsd/graph.txt @@ -0,0 +1,162 @@ +Graphs + + +_DSD +---- + +_DSD (Device Specific Data) [7] is a predefined ACPI device +configuration object that can be used to convey information on +hardware features which are not specifically covered by the ACPI +specification [1][6]. There are two _DSD extensions that are relevant +for graphs: property [4] and hierarchical data extensions [5]. The +property extension provides generic key-value pairs whereas the +hierarchical data extension supports nodes with references to other +nodes, forming a tree. The nodes in the tree may contain properties as +defined by the property extension. The two extensions together provide +a tree-like structure with zero or more properties (key-value pairs) +in each node of the tree. + +The data structure may be accessed at runtime by using the device_* +and fwnode_* functions defined in include/linux/fwnode.h . + +Fwnode represents a generic firmware node object. It is independent of +the firmware type. In ACPI, fwnodes are _DSD hierarchical data +extension objects. A device's _DSD object is represented by an +fwnode. + +The data structure may be referenced elsewhere in the ACPI tables +by using a hard reference to the device itself and an index to the +hierarchical data extension array at each depth. + + +Ports and endpoints +------------------- + +The port and endpoint concepts are very similar to those in Devicetree +[3]. A port represents an interface in a device, and an endpoint +represents a connection to that interface. + +All port nodes are located under the device's "_DSD" node in the +hierarchical data extension tree.
The property extension related to +each port node must contain the key "port" and an integer value which +is the number of the port. The object it refers to should be called "PRTX", +where "X" is the number of the port. + +Further on, endpoints are located under the individual port nodes. The +first hierarchical data extension package list entry of the endpoint +nodes must begin with "endpoint" and must be followed by the number +of the endpoint. The object it refers to should be called "EPXY", where +"X" is the number of the port and "Y" is the number of the endpoint. + +Each port node contains a property extension key "port", the value of +which is the number of the port node. Each endpoint is similarly numbered +with a property extension key "endpoint". Port numbers must be unique within a +device and endpoint numbers must be unique within a port. + +The endpoint reference uses a property extension with the "remote-endpoint" property +name followed by a reference in the same package. Such references consist of +the remote device reference, the number of the port in the device and finally the +number of the endpoint in that port. Individual references thus appear as: + + Package() { device, port_number, endpoint_number } + +References to endpoints must always be made both ways, to the +remote endpoint and back from the referenced remote endpoint node. + +A simple example of this is shown below: + + Scope (\_SB.PCI0.I2C2) + { + Device (CAM0) + { + Name (_DSD, Package () { + ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), + Package () { + Package () { "compatible", Package () { "nokia,smia" } }, + }, + ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), + Package () { + Package () { "port0", "PRT0" }, + } + }) + Name (PRT0, Package() { + ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), + Package () { + Package () { "port", 0 }, + }, + ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), + Package () { + Package () { "endpoint0", "EP00" }, + } + }) + Name (EP00, Package() { + ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), + Package () { + Package () { "endpoint", 0 }, + Package () { "remote-endpoint", Package() { \_SB.PCI0.ISP, 4, 0 } }, + } + }) + } + } + + Scope (\_SB.PCI0) + { + Device (ISP) + { + Name (_DSD, Package () { + ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), + Package () { + Package () { "port4", "PRT4" }, + } + }) + + Name (PRT4, Package() { + ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), + Package () { + Package () { "port", 4 }, /* CSI-2 port number */ + }, + ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), + Package () { + Package () { "endpoint0", "EP40" }, + } + }) + + Name (EP40, Package() { + ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), + Package () { + Package () { "endpoint", 0 }, + Package () { "remote-endpoint", Package () { \_SB.PCI0.I2C2.CAM0, 0, 0 } }, + } + }) + } + } + +Here, port 0 of the "CAM0" device is connected to port 4 of +the "ISP" device and vice versa. + + +References +---------- + +[1] _DSD (Device Specific Data) Implementation Guide. + , + referenced 2016-10-03. + +[2] Devicetree. , referenced 2016-10-03. + +[3] Documentation/devicetree/bindings/graph.txt + +[4] Device Properties UUID For _DSD. + , + referenced 2016-10-04. + +[5] Hierarchical Data Extension UUID For _DSD. + , + referenced 2016-10-04. + +[6] Advanced Configuration and Power Interface Specification. + , + referenced 2016-10-04. + +[7] _DSD Device Properties Usage Rules.
+ Documentation/acpi/DSD-properties-rules.txt diff --git a/Documentation/acpi/enumeration.txt b/Documentation/acpi/enumeration.txt index 209a5eba6b8711e34000b102f68814e76bfcded8..7bcf9c3d9fbe27520f2167e77cba45c294cd9e89 100644 --- a/Documentation/acpi/enumeration.txt +++ b/Documentation/acpi/enumeration.txt @@ -367,10 +367,10 @@ resulting child platform device. Device Tree namespace link device ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The Device Tree protocol uses device indentification based on the "compatible" +The Device Tree protocol uses device identification based on the "compatible" property whose value is a string or an array of strings recognized as device identifiers by drivers and the driver core. The set of all those strings may be -regarded as a device indentification namespace analogous to the ACPI/PNP device +regarded as a device identification namespace analogous to the ACPI/PNP device ID namespace. Consequently, in principle it should not be necessary to allocate a new (and arguably redundant) ACPI/PNP device ID for a device with an existing identification string in the Device Tree (DT) namespace, especially if that ID @@ -381,7 +381,7 @@ In ACPI, the device identification object called _CID (Compatible ID) is used to list the IDs of devices the given one is compatible with, but those IDs must belong to one of the namespaces prescribed by the ACPI specification (see Section 6.1.2 of ACPI 6.0 for details) and the DT namespace is not one of them. -Moreover, the specification mandates that either a _HID or an _ADR identificaion +Moreover, the specification mandates that either a _HID or an _ADR identification object be present for all ACPI objects representing devices (Section 6.1 of ACPI 6.0). For non-enumerable bus types that object must be _HID and its value must be a device ID from one of the namespaces prescribed by the specification too. diff --git a/Documentation/acpi/linuxized-acpica.txt b/Documentation/acpi/linuxized-acpica.txt index defe2eec5331066a0eeb4206dda20ff84b7124e7..3ad7b0dfb083377d043573de0de94206881853f4 100644 --- a/Documentation/acpi/linuxized-acpica.txt +++ b/Documentation/acpi/linuxized-acpica.txt @@ -24,7 +24,7 @@ upstream. The homepage of the ACPICA project is www.acpica.org; it is maintained and supported by Intel Corporation. - The following figure depicts the Linux ACPI subystem where the ACPICA + The following figure depicts the Linux ACPI subsystem where the ACPICA adaptation is included: +---------------------------------------------------------+ @@ -110,7 +110,7 @@ upstream. Linux patches. The patches generated by this process are referred to as "linuxized ACPICA patches". The release process is carried out on a local copy of the ACPICA git repository. Each commit in the monthly release is - converted into a linuxized ACPICA patch. Together, they form the montly + converted into a linuxized ACPICA patch. Together, they form the monthly ACPICA release patchset for the Linux ACPI community. This process is illustrated in the following figure: @@ -165,7 +165,7 @@ upstream. . Before the linuxized ACPICA patches are sent to the Linux ACPI community - for review, there is a quality ensurance build test process to reduce + for review, there is a quality assurance build test process to reduce porting issues. Currently this build process only takes care of the following kernel configuration options: CONFIG_ACPI/CONFIG_ACPI_DEBUG/CONFIG_ACPI_DEBUGGER @@ -195,12 +195,12 @@ upstream. release utilities (please refer to Section 4 below for the details). 3.
Linux specific features - Sometimes it's impossible to use the current ACPICA APIs to implement features required by the Linux kernel, - so Linux developers occasionaly have to change ACPICA code directly. + so Linux developers occasionally have to change ACPICA code directly. Those changes may not be acceptable by ACPICA upstream and in such cases they are left as committed ACPICA divergences unless the ACPICA side can implement new mechanisms as replacements for them. 4. ACPICA release fixups - ACPICA only tests commits using a set of the - user space simulation utilies, thus the linuxized ACPICA patches may + user space simulation utilities, thus the linuxized ACPICA patches may break the Linux kernel, leaving us build/boot failures. In order to avoid breaking Linux bisection, fixes are applied directly to the linuxized ACPICA patches during the release process. When the release diff --git a/Documentation/admin-guide/README.rst b/Documentation/admin-guide/README.rst index 697a00ccec25eebf2d4b063fc4820fc6fd1d71c7..b96e80f79e853109abdb0d73a6a688453f9689cc 100644 --- a/Documentation/admin-guide/README.rst +++ b/Documentation/admin-guide/README.rst @@ -27,7 +27,7 @@ On what hardware does it run? today Linux also runs on (at least) the Compaq Alpha AXP, Sun SPARC and UltraSPARC, Motorola 68000, PowerPC, PowerPC64, ARM, Hitachi SuperH, Cell, IBM S/390, MIPS, HP PA-RISC, Intel IA-64, DEC VAX, AMD x86-64, AXIS CRIS, - Xtensa, Tilera TILE, AVR32, ARC and Renesas M32R architectures. + Xtensa, Tilera TILE, ARC and Renesas M32R architectures. Linux is easily portable to most general-purpose 32- or 64-bit architectures as long as they have a paged memory management unit (PMMU) and a port of the @@ -362,7 +362,7 @@ If something goes wrong as is, otherwise you will have to use the ``ksymoops`` program to make sense of the dump (but compiling with CONFIG_KALLSYMS is usually preferred). This utility can be downloaded from - ftp://ftp..kernel.org/pub/linux/utils/kernel/ksymoops/ . + https://www.kernel.org/pub/linux/utils/kernel/ksymoops/ . Alternatively, you can do the dump lookup by hand: - In debugging dumps like the above, it helps enormously if you can diff --git a/Documentation/admin-guide/index.rst b/Documentation/admin-guide/index.rst index 8ddae4e4299aa10e84f27ab480a3ec3f79c97261..8c60a8a32a1a1c82a4d539021a2dbd6449340f4f 100644 --- a/Documentation/admin-guide/index.rst +++ b/Documentation/admin-guide/index.rst @@ -60,6 +60,7 @@ configure specific aspects of kernel behavior to your liking. mono java ras + pm/index .. only:: subproject and html diff --git a/Documentation/admin-guide/kernel-parameters.rst b/Documentation/admin-guide/kernel-parameters.rst index b516164999a801abf47eae22f90a28116f20eb2b..d76ab3907e2bd3b07cba39de90434f6e0490d0e7 100644 --- a/Documentation/admin-guide/kernel-parameters.rst +++ b/Documentation/admin-guide/kernel-parameters.rst @@ -1,3 +1,5 @@ +.. _kernelparameters: + The kernel's command-line parameters ==================================== @@ -86,7 +88,6 @@ parameter is applicable:: APIC APIC support is enabled. APM Advanced Power Management support is enabled. ARM ARM architecture is enabled. - AVR32 AVR32 architecture is enabled. AX25 Appropriate AX.25 support is enabled. BLACKFIN Blackfin architecture is enabled. CLK Common clock infrastructure is enabled. @@ -197,7 +198,7 @@ and is between 256 and 4096 characters. It is defined in the file Finally, the [KMG] suffix is commonly described after a number of kernel parameter values. 
These 'K', 'M', and 'G' letters represent the _binary_ -multipliers 'Kilo', 'Mega', and 'Giga', equalling 2^10, 2^20, and 2^30 +multipliers 'Kilo', 'Mega', and 'Giga', equaling 2^10, 2^20, and 2^30 bytes respectively. Such letter suffixes can also be entirely omitted: .. include:: kernel-parameters.txt diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index facc20a3f96280472396ad3f7d2e8f2dba62fecc..15f79c27748df1611b1643b77ca68e2a5e7cfaab 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -531,7 +531,6 @@ [ACPI] acpi_pm [ARM] imx_timer1,OSTS,netx_timer,mpu_timer2, pxa_timer,timer3,32k_counter,timer0_1 - [AVR32] avr32 [X86-32] pit,hpet,tsc; scx200_hrt on Geode; cyclone on IBM x440 [MIPS] MIPS @@ -973,7 +972,7 @@ A valid base address must be provided, and the serial port must already be setup and configured. - armada3700_uart, + ar3700_uart, Start an early, polled-mode console on the Armada 3700 serial port at the specified address. The serial port must already be setup @@ -989,6 +988,7 @@ earlyprintk=ttySn[,baudrate] earlyprintk=dbgp[debugController#] earlyprintk=pciserial,bus:device.function[,baudrate] + earlyprintk=xdbc[xhciController#] earlyprintk is useful when the kernel crashes before the normal console is initialized. It is not enabled by @@ -1578,6 +1578,15 @@ extended tables themselves, and also PASID support. With this option set, extended tables will not be used even on hardware which claims to support them. + tboot_noforce [Default Off] + Do not force the Intel IOMMU enabled under tboot. + By default, tboot will force Intel IOMMU on, which + could harm performance of some high-throughput + devices like 40GBit network cards, even if identity + mapping is enabled. + Note that using this option lowers the security + provided by tboot because it makes the system + vulnerable to DMA attacks. intel_idle.max_cstate= [KNL,HW,ACPI,X86] 0 disables intel_idle and fall back on acpi_idle. @@ -1644,6 +1653,12 @@ nobypass [PPC/POWERNV] Disable IOMMU bypass, using IOMMU for PCI devices. + iommu.passthrough= + [ARM64] Configure DMA to bypass the IOMMU by default. + Format: { "0" | "1" } + 0 - Use IOMMU translation for DMA. + 1 - Bypass the IOMMU for DMA. + unset - Use IOMMU translation for DMA. io7= [HW] IO7 for Marvel based alpha systems See comment before marvel_specify_io7 in @@ -2419,13 +2434,7 @@ and gids from such clients. This is intended to ease migration from NFSv2/v3. - objlayoutdriver.osd_login_prog= - [NFS] [OBJLAYOUT] sets the pathname to the program which - is used to automatically discover and login into new - osd-targets. Please see: - Documentation/filesystems/pnfs.txt for more explanations - - nmi_debug= [KNL,AVR32,SH] Specify one or more actions to take + nmi_debug= [KNL,SH] Specify one or more actions to take when a NMI is triggered. Format: [state][,regs][,debounce][,die] @@ -3178,6 +3187,12 @@ ramdisk_size= [RAM] Sizes of RAM disks in kilobytes See Documentation/blockdev/ramdisk.txt. + ras=option[,option,...] [KNL] RAS-specific options + + cec_disable [X86] + Disable the Correctable Errors Collector, + see CONFIG_RAS_CEC help text. + rcu_nocbs= [KNL] The argument is a cpu list, as described above. 
@@ -3779,6 +3794,14 @@ spia_pedr= spia_peddr= + srcutree.exp_holdoff [KNL] + Specifies how many nanoseconds must elapse + since the end of the last SRCU grace period for + a given srcu_struct until the next normal SRCU + grace period will be considered for automatic + expediting. Set to zero to disable automatic + expediting. + stacktrace [FTRACE] Enabled the stack tracer on boot up. @@ -4121,6 +4144,9 @@ usbhid.mousepoll= [USBHID] The interval which mice are to be polled at. + usbhid.jspoll= + [USBHID] The interval which joysticks are to be polled at. + usb-storage.delay_use= [UMS] The delay in seconds before a new device is scanned for Logical Units (default 1). diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index 1e61bf50595c84c936cfe8788bb37114a6f7d5f0..84de718f24a469ee455b6e5e3aab5b6608f3f1dc 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -276,14 +276,14 @@ All md devices contain: array creation it will default to 0, though starting the array as ``clean`` will set it much larger. - new_dev + new_dev This file can be written but not read. The value written should be a block device number as major:minor. e.g. 8:0 This will cause that device to be attached to the array, if it is available. It will then appear at md/dev-XXX (depending on the name of the device) and further configuration is then possible. - safe_mode_delay + safe_mode_delay When an md array has seen no write requests for a certain period of time, it will be marked as ``clean``. When another write request arrives, the array is marked as ``dirty`` before the write @@ -292,7 +292,7 @@ All md devices contain: period as a number of seconds. The default is 200msec (0.200). Writing a value of 0 disables safemode. - array_state + array_state This file contains a single word which describes the current state of the array. In many cases, the state can be set by writing the word for the desired state, however some states @@ -401,7 +401,30 @@ All md devices contain: once the array becomes non-degraded, and this fact has been recorded in the metadata. + consistency_policy + This indicates how the array maintains consistency in case of unexpected + shutdown. It can be: + none + Array has no redundancy information, e.g. raid0, linear. + + resync + Full resync is performed and all redundancy is regenerated when the + array is started after unclean shutdown. + + bitmap + Resync assisted by a write-intent bitmap. + + journal + For raid4/5/6, journal device is used to log transactions and replay + after unclean shutdown. + + ppl + For raid5 only, Partial Parity Log is used to close the write hole and + eliminate resync. + + The accepted values when writing to this file are ``ppl`` and ``resync``, + used to enable and disable PPL. As component devices are added to an md array, they appear in the ``md`` @@ -563,6 +586,9 @@ Each directory contains: adds bad blocks without acknowledging them. This is largely for testing. + ppl_sector, ppl_size + Location and size (in sectors) of the space used for Partial Parity Log + on this device. An active md device will also contain an entry for each active device diff --git a/Documentation/admin-guide/pm/cpufreq.rst b/Documentation/admin-guide/pm/cpufreq.rst new file mode 100644 index 0000000000000000000000000000000000000000..289c80f7760ebb966b13d91b5627e61963e5b3aa --- /dev/null +++ b/Documentation/admin-guide/pm/cpufreq.rst @@ -0,0 +1,700 @@ +.. 
|struct cpufreq_policy| replace:: :c:type:`struct cpufreq_policy ` + +======================= +CPU Performance Scaling +======================= + +:: + + Copyright (c) 2017 Intel Corp., Rafael J. Wysocki + +The Concept of CPU Performance Scaling +====================================== + +The majority of modern processors are capable of operating in a number of +different clock frequency and voltage configurations, often referred to as +Operating Performance Points or P-states (in ACPI terminology). As a rule, +the higher the clock frequency and the higher the voltage, the more instructions +can be retired by the CPU over a unit of time, but also the higher the clock +frequency and the higher the voltage, the more energy is consumed over a unit of +time (or the more power is drawn) by the CPU in the given P-state. Therefore +there is a natural tradeoff between the CPU capacity (the number of instructions +that can be executed over a unit of time) and the power drawn by the CPU. + +In some situations it is desirable or even necessary to run the program as fast +as possible and then there is no reason to use any P-states different from the +highest one (i.e. the highest-performance frequency/voltage configuration +available). In some other cases, however, it may not be necessary to execute +instructions so quickly and maintaining the highest available CPU capacity for a +relatively long time without utilizing it entirely may be regarded as wasteful. +It also may not be physically possible to maintain maximum CPU capacity for too +long for thermal or power supply capacity reasons or similar. To cover those +cases, there are hardware interfaces allowing CPUs to be switched between +different frequency/voltage configurations or (in the ACPI terminology) to be +put into different P-states. + +Typically, they are used along with algorithms to estimate the required CPU +capacity, so as to decide which P-states to put the CPUs into. Of course, since +the utilization of the system generally changes over time, that has to be done +repeatedly on a regular basis. The activity by which this happens is referred +to as CPU performance scaling or CPU frequency scaling (because it involves +adjusting the CPU clock frequency). + + +CPU Performance Scaling in Linux +================================ + +The Linux kernel supports CPU performance scaling by means of the ``CPUFreq`` +(CPU Frequency scaling) subsystem that consists of three layers of code: the +core, scaling governors and scaling drivers. + +The ``CPUFreq`` core provides the common code infrastructure and user space +interfaces for all platforms that support CPU performance scaling. It defines +the basic framework in which the other components operate. + +Scaling governors implement algorithms to estimate the required CPU capacity. +As a rule, each governor implements one, possibly parametrized, scaling +algorithm. + +Scaling drivers talk to the hardware. They provide scaling governors with +information on the available P-states (or P-state ranges in some cases) and +access platform-specific hardware interfaces to change CPU P-states as requested +by scaling governors. + +In principle, all available scaling governors can be used with every scaling +driver. 
That design is based on the observation that the information used by +performance scaling algorithms for P-state selection can be represented in a +platform-independent form in the majority of cases, so it should be possible +to use the same performance scaling algorithm implemented in exactly the same +way regardless of which scaling driver is used. Consequently, the same set of +scaling governors should be suitable for every supported platform. + +However, that observation may not hold for performance scaling algorithms +based on information provided by the hardware itself, for example through +feedback registers, as that information is typically specific to the hardware +interface it comes from and may not be easily represented in an abstract, +platform-independent way. For this reason, ``CPUFreq`` allows scaling drivers +to bypass the governor layer and implement their own performance scaling +algorithms. That is done by the ``intel_pstate`` scaling driver. + + +``CPUFreq`` Policy Objects +========================== + +In some cases the hardware interface for P-state control is shared by multiple +CPUs. That is, for example, the same register (or set of registers) is used to +control the P-state of multiple CPUs at the same time and writing to it affects +all of those CPUs simultaneously. + +Sets of CPUs sharing hardware P-state control interfaces are represented by +``CPUFreq`` as |struct cpufreq_policy| objects. For consistency, +|struct cpufreq_policy| is also used when there is only one CPU in the given +set. + +The ``CPUFreq`` core maintains a pointer to a |struct cpufreq_policy| object for +every CPU in the system, including CPUs that are currently offline. If multiple +CPUs share the same hardware P-state control interface, all of the pointers +corresponding to them point to the same |struct cpufreq_policy| object. + +``CPUFreq`` uses |struct cpufreq_policy| as its basic data type and the design +of its user space interface is based on the policy concept. + + +CPU Initialization +================== + +First of all, a scaling driver has to be registered for ``CPUFreq`` to work. +It is only possible to register one scaling driver at a time, so the scaling +driver is expected to be able to handle all CPUs in the system. + +The scaling driver may be registered before or after CPU registration. If +CPUs are registered earlier, the driver core invokes the ``CPUFreq`` core to +take a note of all of the already registered CPUs during the registration of the +scaling driver. In turn, if any CPUs are registered after the registration of +the scaling driver, the ``CPUFreq`` core will be invoked to take note of them +at their registration time. + +In any case, the ``CPUFreq`` core is invoked to take note of any logical CPU it +has not seen so far as soon as it is ready to handle that CPU. [Note that the +logical CPU may be a physical single-core processor, or a single core in a +multicore processor, or a hardware thread in a physical processor or processor +core. In what follows "CPU" always means "logical CPU" unless explicitly stated +otherwise and the word "processor" is used to refer to the physical part +possibly including multiple logical CPUs.] + +Once invoked, the ``CPUFreq`` core checks if the policy pointer is already set +for the given CPU and if so, it skips the policy object creation. 
Otherwise, +a new policy object is created and initialized, which involves the creation of +a new policy directory in ``sysfs``, and the policy pointer corresponding to +the given CPU is set to the new policy object's address in memory. + +Next, the scaling driver's ``->init()`` callback is invoked with the policy +pointer of the new CPU passed to it as the argument. That callback is expected +to initialize the performance scaling hardware interface for the given CPU (or, +more precisely, for the set of CPUs sharing the hardware interface it belongs +to, represented by its policy object) and, if the policy object it has been +called for is new, to set parameters of the policy, like the minimum and maximum +frequencies supported by the hardware, the table of available frequencies (if +the set of supported P-states is not a continuous range), and the mask of CPUs +that belong to the same policy (including both online and offline CPUs). That +mask is then used by the core to populate the policy pointers for all of the +CPUs in it. + +The next major initialization step for a new policy object is to attach a +scaling governor to it (to begin with, that is the default scaling governor +determined by the kernel configuration, but it may be changed later +via ``sysfs``). First, a pointer to the new policy object is passed to the +governor's ``->init()`` callback, which is expected to initialize all of the +data structures necessary to handle the given policy and, possibly, to add +a governor ``sysfs`` interface to it. Next, the governor is started by +invoking its ``->start()`` callback. + +That callback is expected to register per-CPU utilization update callbacks for +all of the online CPUs belonging to the given policy with the CPU scheduler. +The utilization update callbacks will be invoked by the CPU scheduler on +important events, like task enqueue and dequeue, on every iteration of the +scheduler tick or generally whenever the CPU utilization may change (from the +scheduler's perspective). They are expected to carry out computations needed +to determine the P-state to use for the given policy going forward and to +invoke the scaling driver to make changes to the hardware in accordance with +the P-state selection. The scaling driver may be invoked directly from +scheduler context or asynchronously, via a kernel thread or workqueue, depending +on the configuration and capabilities of the scaling driver and the governor. + +Similar steps are taken for policy objects that are not new, but were "inactive" +previously, meaning that all of the CPUs belonging to them were offline. The +only practical difference in that case is that the ``CPUFreq`` core will attempt +to use the scaling governor previously used with the policy that became +"inactive" (and is re-initialized now) instead of the default governor. + +In turn, if a previously offline CPU is being brought back online, but some +other CPUs sharing the policy object with it are online already, there is no +need to re-initialize the policy object at all. In that case, it is only +necessary to restart the scaling governor so that it can take the new online CPU +into account. That is achieved by invoking the governor's ``->stop()`` and +``->start()`` callbacks, in this order, for the entire policy. + +As mentioned before, the ``intel_pstate`` scaling driver bypasses the scaling +governor layer of ``CPUFreq`` and provides its own P-state selection algorithms.
+Consequently, if ``intel_pstate`` is used, scaling governors are not attached to +new policy objects. Instead, the driver's ``->setpolicy()`` callback is invoked +to register per-CPU utilization update callbacks for each policy. These +callbacks are invoked by the CPU scheduler in the same way as for scaling +governors, but in the ``intel_pstate`` case they both determine the P-state to +use and change the hardware configuration accordingly in one go from scheduler +context. + +The policy objects created during CPU initialization and other data structures +associated with them are torn down when the scaling driver is unregistered +(which happens when the kernel module containing it is unloaded, for example) or +when the last CPU belonging to the given policy is unregistered. + + +Policy Interface in ``sysfs`` +============================= + +During the initialization of the kernel, the ``CPUFreq`` core creates a +``sysfs`` directory (kobject) called ``cpufreq`` under +:file:`/sys/devices/system/cpu/`. + +That directory contains a ``policyX`` subdirectory (where ``X`` represents an +integer number) for every policy object maintained by the ``CPUFreq`` core. +Each ``policyX`` directory is pointed to by ``cpufreq`` symbolic links +under :file:`/sys/devices/system/cpu/cpuY/` (where ``Y`` represents an integer +that may be different from the one represented by ``X``) for all of the CPUs +associated with (or belonging to) the given policy. The ``policyX`` directories +in :file:`/sys/devices/system/cpu/cpufreq` each contain policy-specific +attributes (files) to control ``CPUFreq`` behavior for the corresponding policy +objects (that is, for all of the CPUs associated with them). + +Some of those attributes are generic. They are created by the ``CPUFreq`` core +and their behavior generally does not depend on what scaling driver is in use +and what scaling governor is attached to the given policy. Some scaling drivers +also add driver-specific attributes to the policy directories in ``sysfs`` to +control policy-specific aspects of driver behavior. + +The generic attributes under :file:`/sys/devices/system/cpu/cpufreq/policyX/` +are the following: + +``affected_cpus`` + List of online CPUs belonging to this policy (i.e. sharing the hardware + performance scaling interface represented by the ``policyX`` policy + object). + +``bios_limit`` + If the platform firmware (BIOS) tells the OS to apply an upper limit to + CPU frequencies, that limit will be reported through this attribute (if + present). + + The existence of the limit may be a result of some (often unintentional) + BIOS settings, restrictions coming from a service processor or other + BIOS/HW-based mechanisms. + + This does not cover ACPI thermal limitations, which can be discovered + through a generic thermal driver. + + This attribute is not present if the scaling driver in use does not + support it. + +``cpuinfo_max_freq`` + Maximum possible operating frequency the CPUs belonging to this policy + can run at (in kHz). + +``cpuinfo_min_freq`` + Minimum possible operating frequency the CPUs belonging to this policy + can run at (in kHz). + +``cpuinfo_transition_latency`` + The time it takes to switch the CPUs belonging to this policy from one + P-state to another, in nanoseconds. + + If unknown or if known to be so high that the scaling driver does not + work with the `ondemand`_ governor, -1 (:c:macro:`CPUFREQ_ETERNAL`) + will be returned by reads from this attribute.
+ +``related_cpus`` + List of all (online and offline) CPUs belonging to this policy. + +``scaling_available_governors`` + List of ``CPUFreq`` scaling governors present in the kernel that can + be attached to this policy or (if the ``intel_pstate`` scaling driver is + in use) list of scaling algorithms provided by the driver that can be + applied to this policy. + + [Note that some governors are modular and it may be necessary to load a + kernel module for the governor held by it to become available and be + listed by this attribute.] + +``scaling_cur_freq`` + Current frequency of all of the CPUs belonging to this policy (in kHz). + + For the majority of scaling drivers, this is the frequency of the last + P-state requested by the driver from the hardware using the scaling + interface provided by it, which may or may not reflect the frequency + the CPU is actually running at (due to hardware design and other + limitations). + + Some scaling drivers (e.g. ``intel_pstate``) attempt to provide + information more precisely reflecting the current CPU frequency through + this attribute, but that still may not be the exact current CPU + frequency as seen by the hardware at the moment. + +``scaling_driver`` + The scaling driver currently in use. + +``scaling_governor`` + The scaling governor currently attached to this policy or (if the + ``intel_pstate`` scaling driver is in use) the scaling algorithm + provided by the driver that is currently applied to this policy. + + This attribute is read-write and writing to it will cause a new scaling + governor to be attached to this policy or a new scaling algorithm + provided by the scaling driver to be applied to it (in the + ``intel_pstate`` case), as indicated by the string written to this + attribute (which must be one of the names listed by the + ``scaling_available_governors`` attribute described above). + +``scaling_max_freq`` + Maximum frequency the CPUs belonging to this policy are allowed to be + running at (in kHz). + + This attribute is read-write and writing a string representing an + integer to it will cause a new limit to be set (it must not be lower + than the value of the ``scaling_min_freq`` attribute). + +``scaling_min_freq`` + Minimum frequency the CPUs belonging to this policy are allowed to be + running at (in kHz). + + This attribute is read-write and writing a string representing a + non-negative integer to it will cause a new limit to be set (it must not + be higher than the value of the ``scaling_max_freq`` attribute). + +``scaling_setspeed`` + This attribute is functional only if the `userspace`_ scaling governor + is attached to the given policy. + + It returns the last frequency requested by the governor (in kHz) or can + be written to in order to set a new frequency for the policy. + + +Generic Scaling Governors +========================= + +``CPUFreq`` provides generic scaling governors that can be used with all +scaling drivers. As stated before, each of them implements a single, possibly +parametrized, performance scaling algorithm. + +Scaling governors are attached to policy objects and different policy objects +can be handled by different scaling governors at the same time (although that +may lead to suboptimal results in some cases). + +The scaling governor for a given policy object can be changed at any time with +the help of the ``scaling_governor`` policy attribute in ``sysfs``. + +Some governors expose ``sysfs`` attributes to control or fine-tune the scaling +algorithms implemented by them. 
Those attributes, referred to as governor +tunables, can be either global (system-wide) or per-policy, depending on the +scaling driver in use. If the driver requires governor tunables to be +per-policy, they are located in a subdirectory of each policy directory. +Otherwise, they are located in a subdirectory under +:file:`/sys/devices/system/cpu/cpufreq/`. In either case the name of the +subdirectory containing the governor tunables is the name of the governor +providing them. + +``performance`` +--------------- + +When attached to a policy object, this governor causes the highest frequency, +within the ``scaling_max_freq`` policy limit, to be requested for that policy. + +The request is made once, at the time the governor for the policy is set to +``performance``, and whenever the ``scaling_max_freq`` or ``scaling_min_freq`` +policy limits change after that. + +``powersave`` +------------- + +When attached to a policy object, this governor causes the lowest frequency, +within the ``scaling_min_freq`` policy limit, to be requested for that policy. + +The request is made once, at the time the governor for the policy is set to +``powersave``, and whenever the ``scaling_max_freq`` or ``scaling_min_freq`` +policy limits change after that. + +``userspace`` +------------- + +This governor does not do anything by itself. Instead, it allows user space +to set the CPU frequency for the policy it is attached to by writing to the +``scaling_setspeed`` attribute of that policy. + +``schedutil`` +------------- + +This governor uses CPU utilization data available from the CPU scheduler. It +generally is regarded as a part of the CPU scheduler, so it can access the +scheduler's internal data structures directly. + +It runs entirely in scheduler context, although in some cases it may need to +invoke the scaling driver asynchronously when it decides that the CPU frequency +should be changed for a given policy (that depends on whether or not the driver +is capable of changing the CPU frequency from scheduler context). + +The actions of this governor for a particular CPU depend on the scheduling class +invoking its utilization update callback for that CPU. If it is invoked by the +RT or deadline scheduling classes, the governor will increase the frequency to +the allowed maximum (that is, the ``scaling_max_freq`` policy limit). In turn, +if it is invoked by the CFS scheduling class, the governor will use the +Per-Entity Load Tracking (PELT) metric for the root control group of the +given CPU as the CPU utilization estimate (see the `Per-entity load tracking`_ +LWN.net article for a description of the PELT mechanism). Then, the new +CPU frequency to apply is computed in accordance with the formula + + f = 1.25 * ``f_0`` * ``util`` / ``max`` + +where ``util`` is the PELT number, ``max`` is the theoretical maximum of +``util``, and ``f_0`` is either the maximum possible CPU frequency for the given +policy (if the PELT number is frequency-invariant), or the current CPU frequency +(otherwise). + +This governor also employs a mechanism allowing it to temporarily bump up the +CPU frequency for tasks that have been waiting on I/O most recently, called +"IO-wait boosting". That happens when the :c:macro:`SCHED_CPUFREQ_IOWAIT` flag +is passed by the scheduler to the governor callback which causes the frequency +to go up to the allowed maximum immediately and then draw back to the value +returned by the above formula over time.
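As a non-authoritative illustration of the formula above (not the kernel's
actual implementation), the computation can be written as the following C
helper, using the fact that multiplying by 1.25 is the same as adding a
quarter::

    /* Hypothetical helper mirroring f = 1.25 * f_0 * util / max. */
    static unsigned int example_next_freq(unsigned int f_0,
                                          unsigned long util,
                                          unsigned long max)
    {
            /* 1.25 * f_0 == f_0 + f_0 / 4, in integer arithmetic. */
            return (f_0 + (f_0 >> 2)) * util / max;
    }

For example, with ``f_0`` equal to 2000000 (kHz) and ``util`` equal to half
of ``max``, the helper returns 1250000 (kHz).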
+ +This governor exposes only one tunable: + +``rate_limit_us`` + Minimum time (in microseconds) that has to pass between two consecutive + runs of governor computations (default: 1000 times the scaling driver's + transition latency). + + The purpose of this tunable is to reduce the scheduler context overhead + of the governor, which might be excessive without it. + +This governor generally is regarded as a replacement for the older `ondemand`_ +and `conservative`_ governors (described below), as it is simpler and more +tightly integrated with the CPU scheduler, its overhead in terms of CPU context +switches and similar is less significant, and it uses the scheduler's own CPU +utilization metric, so in principle its decisions should not contradict the +decisions made by the other parts of the scheduler. + +``ondemand`` +------------ + +This governor uses CPU load as a CPU frequency selection metric. + +In order to estimate the current CPU load, it measures the time elapsed between +consecutive invocations of its worker routine and computes the fraction of that +time in which the given CPU was not idle. The ratio of the non-idle (active) +time to the total CPU time is taken as an estimate of the load. + +If this governor is attached to a policy shared by multiple CPUs, the load is +estimated for all of them and the greatest result is taken as the load estimate +for the entire policy. + +The worker routine of this governor has to run in process context, so it is +invoked asynchronously (via a workqueue) and CPU P-states are updated from +there if necessary. As a result, the scheduler context overhead from this +governor is minimal, but it causes additional CPU context switches to happen +relatively often and the CPU P-state updates triggered by it can be relatively +irregular. Also, it affects its own CPU load metric by running code that +reduces the CPU idle time (even though the CPU idle time is only reduced very +slightly by it). + +It generally selects CPU frequencies proportional to the estimated load, so that +the value of the ``cpuinfo_max_freq`` policy attribute corresponds to the load of +1 (or 100%), and the value of the ``cpuinfo_min_freq`` policy attribute +corresponds to the load of 0, except when the load exceeds a (configurable) +speedup threshold, in which case it will go straight for the highest frequency +it is allowed to use (the ``scaling_max_freq`` policy limit). + +This governor exposes the following tunables: + +``sampling_rate`` + This is how often the governor's worker routine should run, in + microseconds. + + Typically, it is set to values of the order of 10000 (10 ms). Its + default value is equal to the value of ``cpuinfo_transition_latency`` + for each policy this governor is attached to (but since the unit here + is greater by 1000, this means that the time represented by + ``sampling_rate`` is 1000 times greater than the transition latency by + default). + + If this tunable is per-policy, the following shell command sets the time + represented by it to be 750 times as high as the transition latency:: + + # echo $(($(cat cpuinfo_transition_latency) * 750 / 1000)) > ondemand/sampling_rate + + +``min_sampling_rate`` + The minimum value of ``sampling_rate``. + + Equal to 10000 (10 ms) if :c:macro:`CONFIG_NO_HZ_COMMON` and + :c:data:`tick_nohz_active` are both set or to 20 times the value of + :c:data:`jiffies` in microseconds otherwise.
+ +``up_threshold`` + If the estimated CPU load is above this value (in percent), the governor + will set the frequency to the maximum value allowed for the policy. + Otherwise, the selected frequency will be proportional to the estimated + CPU load. + +``ignore_nice_load`` + If set to 1 (default 0), it will cause the CPU load estimation code to + treat the CPU time spent on executing tasks with "nice" levels greater + than 0 as CPU idle time. + + This may be useful if there are tasks in the system that should not be + taken into account when deciding what frequency to run the CPUs at. + Then, to make that happen, it is sufficient to increase the "nice" level + of those tasks above 0 and set this attribute to 1. + +``sampling_down_factor`` + Temporary multiplier, between 1 (default) and 100 inclusive, to apply to + the ``sampling_rate`` value if the CPU load goes above ``up_threshold``. + + This causes the next execution of the governor's worker routine (after + setting the frequency to the allowed maximum) to be delayed, so the + frequency stays at the maximum level for a longer time. + + Frequency fluctuations in some bursty workloads may be avoided this way + at the cost of additional energy spent on maintaining the maximum CPU + capacity. + +``powersave_bias`` + Reduction factor to apply to the original frequency target of the + governor (including the maximum value used when the ``up_threshold`` + value is exceeded by the estimated CPU load) or sensitivity threshold + for the AMD frequency sensitivity powersave bias driver + (:file:`drivers/cpufreq/amd_freq_sensitivity.c`), between 0 and 1000 + inclusive. + + If the AMD frequency sensitivity powersave bias driver is not loaded, + the effective frequency to apply is given by + + f * (1 - ``powersave_bias`` / 1000) + + where f is the governor's original frequency target. The default value + of this attribute is 0 in that case. + + If the AMD frequency sensitivity powersave bias driver is loaded, the + value of this attribute is 400 by default and it is used in a different + way. + + On Family 16h (and later) AMD processors there is a mechanism to get a + measured workload sensitivity, between 0 and 100% inclusive, from the + hardware. That value can be used to estimate how the performance of the + workload running on a CPU will change in response to frequency changes. + + The performance of a workload with a sensitivity of 0 (memory-bound or + IO-bound) is not expected to increase at all as a result of increasing + the CPU frequency, whereas workloads with a sensitivity of 100% + (CPU-bound) are expected to perform much better if the CPU frequency is + increased. + + If the workload sensitivity is less than the threshold represented by + the ``powersave_bias`` value, the frequency sensitivity powersave bias + driver will cause the governor to select a frequency lower than its + original target, so as to avoid over-provisioning workloads that will + not benefit from running at higher CPU frequencies. + +``conservative`` +---------------- + +This governor uses CPU load as a CPU frequency selection metric. + +It estimates the CPU load in the same way as the `ondemand`_ governor described +above, but the CPU frequency selection algorithm implemented by it is different. + +Namely, it avoids significant frequency changes over short time intervals, which +may not be suitable for systems with limited power supply capacity (e.g. +battery-powered).
To achieve that, it changes the frequency in relatively +small steps, one step at a time, up or down - depending on whether or not a +(configurable) threshold has been exceeded by the estimated CPU load. + +This governor exposes the following tunables: + +``freq_step`` + Frequency step in percent of the maximum frequency the governor is + allowed to set (the ``scaling_max_freq`` policy limit), between 0 and + 100 (5 by default). + + This is how much the frequency is allowed to change in one go. Setting + it to 0 will cause the default frequency step (5 percent) to be used + and setting it to 100 effectively causes the governor to periodically + switch the frequency between the ``scaling_min_freq`` and + ``scaling_max_freq`` policy limits. + +``down_threshold`` + Threshold value (in percent, 20 by default) used to determine the + frequency change direction. + + If the estimated CPU load is greater than this value, the frequency will + go up (by ``freq_step``). If the load is less than this value (and the + ``sampling_down_factor`` mechanism is not in effect), the frequency will + go down. Otherwise, the frequency will not be changed. + +``sampling_down_factor`` + Frequency decrease deferral factor, between 1 (default) and 10 + inclusive. + + It effectively causes the frequency to go down ``sampling_down_factor`` + times slower than it ramps up. + + +Frequency Boost Support +======================= + +Background +---------- + +Some processors support a mechanism to raise the operating frequency of some +cores in a multicore package temporarily (and above the sustainable frequency +threshold for the whole package) under certain conditions, for example if the +whole chip is not fully utilized and below its intended thermal or power budget. + +Different names are used by different vendors to refer to this functionality. +For Intel processors it is referred to as "Turbo Boost", AMD calls it +"Turbo-Core" or (in technical documentation) "Core Performance Boost" and so on. +As a rule, it also is implemented differently by different vendors. The simple +term "frequency boost" is used here for brevity to refer to all of those +implementations. + +The frequency boost mechanism may be either hardware-based or software-based. +If it is hardware-based (e.g. on x86), the decision to trigger the boosting is +made by the hardware (although in general it requires the hardware to be put +into a special state in which it can control the CPU frequency within certain +limits). If it is software-based (e.g. on ARM), the scaling driver decides +whether or not to trigger boosting and when to do that. + +The ``boost`` File in ``sysfs`` +------------------------------- + +This file is located under :file:`/sys/devices/system/cpu/cpufreq/` and controls +the "boost" setting for the whole system. It is not present if the underlying +scaling driver does not support the frequency boost mechanism (or supports it, +but provides a driver-specific interface for controlling it, like +``intel_pstate``). + +If the value in this file is 1, the frequency boost mechanism is enabled. This +means that either the hardware can be put into states in which it is able to +trigger boosting (in the hardware-based case), or the software is allowed to +trigger boosting (in the software-based case). It does not mean that boosting +is actually in use at the moment on any CPUs in the system. It only means that +use of the frequency boost mechanism is permitted (and the mechanism still may +never be used, for other reasons).
+ +If the value in this file is 0, the frequency boost mechanism is disabled and +cannot be used at all. + +The only values that can be written to this file are 0 and 1. + +Rationale for Boost Control Knob +-------------------------------- + +The frequency boost mechanism is generally intended to help to achieve optimum +CPU performance on time scales below software resolution (e.g. below the +scheduler tick interval) and it is demonstrably suitable for many workloads, but +it may lead to problems in certain situations. + +For this reason, many systems make it possible to disable the frequency boost +mechanism in the platform firmware (BIOS) setup, but that requires the system to +be restarted whenever the setting is to be changed, which may not be practical. +For example: + + 1. Boosting means overclocking the processor, although under controlled + conditions. Generally, the processor's energy consumption increases + as a result of increasing its frequency and voltage, even temporarily. + That may not be desirable on systems that switch to power sources of + limited capacity, such as batteries, so the ability to disable the boost + mechanism while the system is running may help there (but that depends on + the workload too). + + 2. In some situations deterministic behavior is more important than + performance or energy consumption (or both) and the ability to disable + boosting while the system is running may be useful then. + + 3. To examine the impact of the frequency boost mechanism itself, it is useful + to be able to run tests with and without boosting, preferably without + restarting the system in the meantime. + + 4. Reproducible results are important when running benchmarks. Since + the boosting functionality depends on the load of the whole package, + single-thread performance may vary because of it, which may lead to + unreproducible results sometimes. That can be avoided by disabling the + frequency boost mechanism before running benchmarks sensitive to that + issue. + +Legacy AMD ``cpb`` Knob +----------------------- + +The AMD powernow-k8 scaling driver supports a ``sysfs`` knob very similar to +the global ``boost`` one. It is used for disabling/enabling the "Core +Performance Boost" feature of some AMD processors. + +If present, that knob is located in every ``CPUFreq`` policy directory in +``sysfs`` (:file:`/sys/devices/system/cpu/cpufreq/policyX/`) and is called +``cpb``, which suggests a more fine-grained control interface. The actual +implementation, however, works on a system-wide basis and setting that knob +for one policy causes the same value of it to be set for all of the other +policies at the same time. + +That knob is still supported on AMD processors that support its underlying +hardware feature, but it may be configured out of the kernel (via the +:c:macro:`CONFIG_X86_ACPI_CPUFREQ_CPB` configuration option) and the global +``boost`` knob is present regardless. Thus it is always possible to use the +``boost`` knob instead of the ``cpb`` one, which is highly recommended, as that +is more consistent with what all of the other systems do (and the ``cpb`` knob +may not be supported in the future). + +The ``cpb`` knob is never present for any processors without the underlying +hardware feature (e.g. all Intel ones), even if the +:c:macro:`CONFIG_X86_ACPI_CPUFREQ_CPB` configuration option is set. + + +..
_Per-entity load tracking: https://lwn.net/Articles/531853/ diff --git a/Documentation/admin-guide/pm/index.rst b/Documentation/admin-guide/pm/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..c80f087321fcb704e549254c70ccb927f9659b4c --- /dev/null +++ b/Documentation/admin-guide/pm/index.rst @@ -0,0 +1,15 @@ +================ +Power Management +================ + +.. toctree:: + :maxdepth: 2 + + cpufreq + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/admin-guide/ras.rst b/Documentation/admin-guide/ras.rst index 1b90c6f00a9290b14e365c5e07eed13b57942ac0..8c7bbf2c88d23d2c690ecfbde43e6b309f796803 100644 --- a/Documentation/admin-guide/ras.rst +++ b/Documentation/admin-guide/ras.rst @@ -8,7 +8,7 @@ RAS concepts ************ Reliability, Availability and Serviceability (RAS) is a concept used on -servers meant to measure their robusteness. +servers meant to measure their robustness. Reliability is the probability that a system will produce correct outputs. @@ -42,13 +42,13 @@ Among the monitoring measures, the most usual ones include: * CPU – detect errors at instruction execution and at L1/L2/L3 caches; * Memory – add error correction logic (ECC) to detect and correct errors; -* I/O – add CRC checksums for tranfered data; +* I/O – add CRC checksums for transferred data; * Storage – RAID, journal file systems, checksums, Self-Monitoring, Analysis and Reporting Technology (SMART). By monitoring the number of occurrences of error detections, it is possible to identify if the probability of hardware errors is increasing, and, on such -case, do a preventive maintainance to replace a degrated component while +case, do a preventive maintenance to replace a degraded component while those errors are correctable. Types of errors @@ -121,7 +121,7 @@ using the ``dmidecode`` tool. For example, on a desktop machine, it shows:: On the above example, a DDR4 SO-DIMM memory module is located at the system's memory labeled as "BANK 0", as given by the *bank locator* field. Please notice that, on such system, the *total width* is equal to the -*data witdh*. It means that such memory module doesn't have error +*data width*. It means that such memory module doesn't have error detection/correction mechanisms. Unfortunately, not all systems use the same field to specify the memory @@ -145,7 +145,7 @@ bank. On this example, from an older server, ``dmidecode`` shows:: There, the DDR3 RDIMM memory module is located at the system's memory labeled as "DIMM_A1", as given by the *locator* field. Please notice that this -memory module has 64 bits of *data witdh* and 72 bits of *total width*. So, +memory module has 64 bits of *data width* and 72 bits of *total width*. So, it has 8 extra bits to be used by error detection and correction mechanisms. Such kind of memory is called Error-correcting code memory (ECC memory). @@ -186,7 +186,7 @@ Architecture (MCA)\ [#f3]_. .. [#f1] Please notice that several memory controllers allow operation on a mode called "Lock-Step", where it groups two memory modules together, doing 128-bit reads/writes. That gives 16 bits for error correction, with - significatively improves the error correction mechanism, at the expense + significantly improves the error correction mechanism, at the expense that, when an error happens, there's no way to know what memory module is to blame. So, it has to blame both memory modules. 
diff --git a/Documentation/admin-guide/sysrq.rst b/Documentation/admin-guide/sysrq.rst index d1712ea2d314da0f0405ec00f5ad410f1014b2db..7b9035c01a2e1c01d68336d891402f54362cdb13 100644 --- a/Documentation/admin-guide/sysrq.rst +++ b/Documentation/admin-guide/sysrq.rst @@ -212,7 +212,8 @@ I hit SysRq, but nothing seems to happen, what's wrong? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ There are some keyboards that produce a different keycode for SysRq than the -pre-defined value of 99 (see ``KEY_SYSRQ`` in ``include/linux/input.h``), or +pre-defined value of 99 +(see ``KEY_SYSRQ`` in ``include/uapi/linux/input-event-codes.h``), or which don't have a SysRq key at all. In these cases, run ``showkey -s`` to find an appropriate scancode sequence, and use ``setkeycodes 99`` to map this sequence to the usual SysRq code (e.g., ``setkeycodes e05b 99``). It's diff --git a/Documentation/arm/mem_alignment b/Documentation/arm/mem_alignment index c7c7a114c78c795c702f8543c57c7558466a9a4b..6335fcacbba986df418c1665410617d7b662398c 100644 --- a/Documentation/arm/mem_alignment +++ b/Documentation/arm/mem_alignment @@ -48,7 +48,7 @@ Note that not all combinations are supported - only values 0 through 5. For example, the following will turn on the warnings, but without fixing up or sending SIGBUS signals: - echo 1 > /proc/sys/debug/alignment + echo 1 > /proc/cpu/alignment You can also read the content of the same file to get statistical information on unaligned access occurrences plus the current mode of diff --git a/Documentation/arm/stm32/stm32h743-overview.txt b/Documentation/arm/stm32/stm32h743-overview.txt new file mode 100644 index 0000000000000000000000000000000000000000..3031cbae31a506b4ab2305d29f45d6550dc2d041 --- /dev/null +++ b/Documentation/arm/stm32/stm32h743-overview.txt @@ -0,0 +1,30 @@ + STM32H743 Overview + ================== + + Introduction + ------------ + The STM32H743 is a Cortex-M7 MCU aimed at various applications. + It features: + - Cortex-M7 core running at up to 400 MHz + - 2 MB internal flash, 1 MB internal RAM + - FMC controller to connect SDRAM, NOR and NAND memories + - Dual mode QSPI + - SD/MMC/SDIO support + - Ethernet controller + - USB OTG FS & HS controllers + - I2C, SPI and CAN bus support + - Several 16- and 32-bit general purpose timers + - Serial Audio interface + - LCD controller + - HDMI-CEC + - SPDIFRX + - DFSDM + + Resources + --------- + Datasheet and reference manual are publicly available on ST website: + - http://www.st.com/en/microcontrollers/stm32h7x3.html?querycriteria=productId=LN2033 + + Document Author + --------------- + Alexandre Torgue diff --git a/Documentation/arm64/cpu-feature-registers.txt b/Documentation/arm64/cpu-feature-registers.txt index 61ca21ebef1a27ea8d633c56e9afef45224f776d..d1c97f9f51cc3bd3fd21ad181253bb354e7771b1 100644 --- a/Documentation/arm64/cpu-feature-registers.txt +++ b/Documentation/arm64/cpu-feature-registers.txt @@ -169,6 +169,18 @@ infrastructure: as available on the CPU where it is fetched and is not a system wide safe value.
+ 4) ID_AA64ISAR1_EL1 - Instruction set attribute register 1 + + x--------------------------------------------------x + | Name | bits | visible | + |--------------------------------------------------| + | LRCPC | [23-20] | y | + |--------------------------------------------------| + | FCMA | [19-16] | y | + |--------------------------------------------------| + | JSCVT | [15-12] | y | + x--------------------------------------------------x + Appendix I: Example --------------------------- diff --git a/Documentation/arm64/silicon-errata.txt b/Documentation/arm64/silicon-errata.txt index 2f66683500b8e44e0ceb44bc877acccae39b35d7..10f2dddbf449475ae5bdc79945330d09cf5f6683 100644 --- a/Documentation/arm64/silicon-errata.txt +++ b/Documentation/arm64/silicon-errata.txt @@ -54,6 +54,7 @@ stable kernels. | ARM | Cortex-A57 | #852523 | N/A | | ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 | | ARM | Cortex-A72 | #853709 | N/A | +| ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 | | ARM | MMU-500 | #841119,#826419 | N/A | | | | | | | Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 | diff --git a/Documentation/arm64/tagged-pointers.txt b/Documentation/arm64/tagged-pointers.txt index d9995f1f51b3eb9e678a557a471e4d387db49ca8..a25a99e82bb1c9811ee3e8cb111d356fa53fc979 100644 --- a/Documentation/arm64/tagged-pointers.txt +++ b/Documentation/arm64/tagged-pointers.txt @@ -11,24 +11,56 @@ in AArch64 Linux. The kernel configures the translation tables so that translations made via TTBR0 (i.e. userspace mappings) have the top byte (bits 63:56) of the virtual address ignored by the translation hardware. This frees up -this byte for application use, with the following caveats: +this byte for application use. - (1) The kernel requires that all user addresses passed to EL1 - are tagged with tag 0x00. This means that any syscall - parameters containing user virtual addresses *must* have - their top byte cleared before trapping to the kernel. - (2) Non-zero tags are not preserved when delivering signals. - This means that signal handlers in applications making use - of tags cannot rely on the tag information for user virtual - addresses being maintained for fields inside siginfo_t. - One exception to this rule is for signals raised in response - to watchpoint debug exceptions, where the tag information - will be preserved. +Passing tagged addresses to the kernel +-------------------------------------- - (3) Special care should be taken when using tagged pointers, - since it is likely that C compilers will not hazard two - virtual addresses differing only in the upper byte. +All interpretation of userspace memory addresses by the kernel assumes +an address tag of 0x00. + +This includes, but is not limited to, addresses found in: + + - pointer arguments to system calls, including pointers in structures + passed to system calls, + + - the stack pointer (sp), e.g. when interpreting it to deliver a + signal, + + - the frame pointer (x29) and frame records, e.g. when interpreting + them to generate a backtrace or call graph. + +Using non-zero address tags in any of these locations may result in an +error code being returned, a (fatal) signal being raised, or other modes +of failure. + +For these reasons, passing non-zero address tags to the kernel via +system calls is forbidden, and using a non-zero address tag for sp is +strongly discouraged. + +Programs maintaining a frame pointer and frame records that use non-zero +address tags may suffer impaired or inaccurate debug and profiling +visibility. 
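+
+As an illustration (a user-space sketch, not a kernel interface; the
+helper name is made up), an application experimenting with tagged
+pointers could clear the tag before passing an address to a system
+call, for example:
+
+	#include <stdint.h>
+
+	/* Clear the top byte (bits 63:56) so the kernel sees tag 0x00. */
+	static inline void *untag_pointer(void *p)
+	{
+		return (void *)((uintptr_t)p & ~(0xffULL << 56));
+	}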
+ + +Preserving tags +--------------- + +Non-zero tags are not preserved when delivering signals. This means that +signal handlers in applications making use of tags cannot rely on the +tag information for user virtual addresses being maintained for fields +inside siginfo_t. One exception to this rule is for signals raised in +response to watchpoint debug exceptions, where the tag information will +be preserved. The architecture prevents the use of a tagged PC, so the upper byte will be set to a sign-extension of bit 55 on exception return. + + +Other considerations +-------------------- + +Special care should be taken when using tagged pointers, since it is +likely that C compilers will not hazard two virtual addresses differing +only in the upper byte. diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX index e55103ace382a093050ab70f5e0a6b92e4faa89a..8d55b4bbb5e2ef03344f3f4f8e920ad01f7d0e04 100644 --- a/Documentation/block/00-INDEX +++ b/Documentation/block/00-INDEX @@ -1,5 +1,7 @@ 00-INDEX - This file +bfq-iosched.txt + - BFQ IO scheduler and its tunables biodoc.txt - Notes on the Generic Block Layer Rewrite in Linux 2.5 biovecs.txt diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt new file mode 100644 index 0000000000000000000000000000000000000000..05e2822a80b34d0f31649cfb486ade2b4679f9e8 --- /dev/null +++ b/Documentation/block/bfq-iosched.txt @@ -0,0 +1,546 @@ +BFQ (Budget Fair Queueing) +========================== + +BFQ is a proportional-share I/O scheduler, with some extra +low-latency capabilities. In addition to cgroups support (blkio or io +controllers), BFQ's main features are: +- BFQ guarantees high system and application responsiveness, and + low latency for time-sensitive applications, such as audio or video + players; +- BFQ distributes bandwidth, and not just time, among processes or + groups (switching back to time distribution when needed to keep + throughput high). + +In its default configuration, BFQ privileges latency over +throughput. So, when needed for achieving a lower latency, BFQ builds +schedules that may lead to a lower throughput. If your main or only +goal, for a given device, is to achieve the maximum-possible +throughput at all times, then switch off all low-latency heuristics +for that device, by setting low_latency to 0. Full details in Section 3. + +On average CPUs, the current version of BFQ can handle devices +performing at most ~30 KIOPS; at most ~50 KIOPS on faster CPUs. As a +reference, 30-50 KIOPS correspond to very high bandwidths with +sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and +to 120-200 MB/s with 4KB random I/O. BFQ has not yet been tested on +multi-queue devices. + +The table of contents follows. Impatient readers can jump straight to +Section 3. + +CONTENTS + +1. When may BFQ be useful? + 1-1 Personal systems + 1-2 Server systems +2. How does BFQ work? +3. What are BFQ's tunables? +4. Group scheduling with BFQ + 4-1 Service guarantees provided + 4-2 Interface + +1. When may BFQ be useful? +========================== + +BFQ provides the following benefits on personal and server systems. + +1-1 Personal systems +-------------------- + +Low latency for interactive applications + +Regardless of the actual background workload, BFQ guarantees that, for +interactive tasks, the storage device is virtually as responsive as if +it were idle.
For example, even if one or more of the following +background workloads are being executed: +- one or more large files are being read, written or copied, +- a tree of source files is being compiled, +- one or more virtual machines are performing I/O, +- a software update is in progress, +- indexing daemons are scanning filesystems and updating their + databases, +starting an application or loading a file from within an application +takes about the same time as if the storage device were idle. As a +comparison, with CFQ, NOOP or DEADLINE, and under the same conditions, +applications experience high latencies, or even become unresponsive +until the background workload terminates (also on SSDs). + +Low latency for soft real-time applications + +Soft real-time applications, such as audio and video +players/streamers, also enjoy low latency and a low drop rate, regardless +of the background I/O workload. As a consequence, these applications +suffer from almost no glitches due to the background workload. + +Higher speed for code-development tasks + +If some additional workload happens to be executed in parallel, then +BFQ executes the I/O-related components of typical code-development +tasks (compilation, checkout, merge, ...) much more quickly than CFQ, +NOOP or DEADLINE. + +High throughput + +On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and +up to 150% higher throughput than DEADLINE and NOOP, with all the +sequential workloads considered in our tests. With random workloads, +and with all the workloads on flash-based devices, BFQ achieves, +instead, about the same throughput as the other schedulers. + +Strong fairness, bandwidth and delay guarantees + +BFQ distributes the device throughput, and not just the device time, +among I/O-bound applications in proportion to their weights, with any +workload and regardless of the device parameters. From these bandwidth +guarantees, it is possible to compute tight per-I/O-request delay +guarantees by a simple formula. If not configured for strict service +guarantees, BFQ switches to time-based resource sharing (only) for +applications that would otherwise cause a throughput loss. + +1-2 Server systems +------------------ + +Most benefits for server systems follow from the same service +properties as above. In particular, regardless of whether additional, +possibly heavy workloads are being served, BFQ guarantees: + +. audio and video streaming with zero or very low jitter and drop + rate; + +. fast retrieval of web pages and embedded objects; + +. real-time recording of data in live-dumping applications (e.g., + packet logging); + +. responsiveness in local and remote access to a server. + + +2. How does BFQ work? +===================== + +BFQ is a proportional-share I/O scheduler, whose general structure, +plus a lot of code, are borrowed from CFQ. + +- Each process doing I/O on a device is associated with a weight and a + (bfq_)queue. + +- BFQ grants exclusive access to the device, for a while, to one queue + (process) at a time, and implements this service model by + associating every queue with a budget, measured in number of + sectors. + + - After a queue is granted access to the device, the budget of the + queue is decremented, on each request dispatch, by the size of the + request. + + - The in-service queue is expired, i.e., its service is suspended, + only if one of the following events occurs: 1) the queue finishes + its budget, 2) the queue empties, 3) a "budget timeout" fires.
+ + - The budget timeout prevents processes doing random I/O from + holding the device for too long and dramatically reducing + throughput. + + - Actually, as in CFQ, a queue associated with a process issuing + sync requests may not be expired immediately when it empties. + Instead, BFQ may idle the device for a short time interval, + giving the process the chance to go on being served if it issues + a new request in time. Device idling typically boosts the + throughput on rotational devices, if processes do synchronous + and sequential I/O. In addition, under BFQ, device idling is + also instrumental in guaranteeing the desired throughput + fraction to processes issuing sync requests (see the description + of the slice_idle tunable in this document, or [1, 2], for more + details). + + - With respect to idling for service guarantees, if several + processes are competing for the device at the same time, but + all processes (and groups, after the following commit) have + the same weight, then BFQ guarantees the expected throughput + distribution without ever idling the device. Throughput is + thus as high as possible in this common scenario. + + - If low-latency mode is enabled (default configuration), BFQ + executes some special heuristics to detect interactive and soft + real-time applications (e.g., video or audio players/streamers), + and to reduce their latency. The most important action taken to + achieve this goal is to give the queues associated with these + applications more than their fair share of the device + throughput. For brevity, we call "weight-raising" the whole + set of actions taken by BFQ to privilege these queues. In + particular, BFQ provides a milder form of weight-raising for + interactive applications, and a stronger form for soft real-time + applications. + + - BFQ automatically deactivates idling for queues born in a burst of + queue creations. In fact, these queues are usually associated with + the processes of applications and services that benefit mostly + from high throughput. Examples are systemd during boot, or git + grep. + + - Like CFQ, BFQ merges queues performing interleaved I/O, i.e., + performing random I/O that becomes mostly sequential if + merged. In contrast to CFQ, BFQ achieves this goal with a more + reactive mechanism, called Early Queue Merge (EQM). EQM is so + responsive in detecting interleaved I/O (cooperating processes) + that it enables BFQ to achieve a high throughput, by queue + merging, even for queues for which CFQ needs a different + mechanism, preemption, to get a high throughput. As such, EQM is a + unified mechanism to achieve a high throughput with interleaved + I/O. + + - Queues are scheduled according to a variant of WF2Q+, named + B-WF2Q+, and implemented using an augmented rb-tree to preserve an + O(log N) overall complexity. See [2] for more details. B-WF2Q+ is + also ready for hierarchical scheduling. However, for a cleaner + logical breakdown, the code that enables and completes + hierarchical support is provided in the next commit, which focuses + exactly on this feature. + + - B-WF2Q+ guarantees a tight deviation with respect to an ideal, + perfectly fair, and smooth service. In particular, B-WF2Q+ + guarantees that each queue receives a fraction of the device + throughput proportional to its weight, even if the throughput + fluctuates, and regardless of: the device parameters, the current + workload and the budgets assigned to the queue.
+ + - The last, budget-independence, property (although perhaps + counterintuitive at first) is definitely beneficial, for + the following reasons: + + - First, with any proportional-share scheduler, the maximum + deviation with respect to an ideal service is proportional to + the maximum budget (slice) assigned to queues. As a consequence, + BFQ can keep this deviation tight not only because of the + accurate service of B-WF2Q+, but also because BFQ *does not* + need to assign a larger budget to a queue to let the queue + receive a higher fraction of the device throughput. + + - Second, BFQ is free to choose, for every process (queue), the + budget that best fits the needs of the process, or best + leverages the I/O pattern of the process. In particular, BFQ + updates queue budgets with a simple feedback-loop algorithm that + allows a high throughput to be achieved, while still providing + tight latency guarantees to time-sensitive applications. When + the in-service queue expires, this algorithm computes the next + budget of the queue so as to: + + - Let large budgets be eventually assigned to the queues + associated with I/O-bound applications performing sequential + I/O: in fact, the longer these applications are served once + they get access to the device, the higher the throughput is. + + - Let small budgets be eventually assigned to the queues + associated with time-sensitive applications (which typically + perform sporadic and short I/O), because the smaller the + budget assigned to a queue waiting for service is, the sooner + B-WF2Q+ will serve that queue (Subsec 3.3 in [2]). + +- If several processes are competing for the device at the same time, + but all processes and groups have the same weight, then BFQ + guarantees the expected throughput distribution without ever idling + the device. It uses preemption instead. Throughput is then much + higher in this common scenario. + +- ioprio classes are served in strict priority order, i.e., + lower-priority queues are not served as long as there are + higher-priority queues. Among queues in the same class, the + bandwidth is distributed in proportion to the weight of each + queue. A very small amount of extra bandwidth is, however, guaranteed + to the Idle class, to prevent it from starving. + + +3. What are BFQ's tunables? +=========================== + +The tunables back_seek_max, back_seek_penalty, fifo_expire_async and +fifo_expire_sync below are the same as in CFQ. Their description is +just copied from that for CFQ. Some considerations in the description +of slice_idle are copied from CFQ too. + +per-process ioprio and weight +----------------------------- + +Unless the cgroups interface is used (see "4. Group scheduling with +BFQ"), weights can be assigned to processes only indirectly, through I/O +priorities, and according to the relation: +weight = (IOPRIO_BE_NR - ioprio) * 10. + +Beware that, if low-latency is set, then BFQ automatically raises the +weight of the queues associated with interactive and soft real-time +applications. Unset this tunable if you need/want to control weights. + +slice_idle +---------- + +This parameter specifies how long BFQ should idle for the next I/O +request, when certain sync BFQ queues become empty. By default +slice_idle is a non-zero value. Idling has a double purpose: boosting +throughput and making sure that the desired throughput distribution is +respected (see the description of how BFQ works, and, if needed, the +papers referred to there).
+ +As for throughput, idling can be very helpful on highly seeky media +like single-spindle SATA/SAS disks, where it cuts down the overall +number of seeks and improves throughput. + +Setting slice_idle to 0 will remove all the idling on queues and one +should see overall improved throughput on faster storage devices +like multiple SATA/SAS disks in a hardware RAID configuration. + +So depending on storage and workload, it might be useful to set +slice_idle=0. In general, for SATA/SAS disks and software RAID of +SATA/SAS disks, keeping slice_idle enabled should be useful. For any +configuration where there are multiple spindles behind a single LUN +(host-based hardware RAID controller or storage arrays), setting +slice_idle=0 might result in better throughput and acceptable +latencies. + +Idling is, however, necessary to have service guarantees enforced in +case of differentiated weights or differentiated I/O-request lengths. +To see why, suppose that a given BFQ queue A must get several I/O +requests served for each request served for another queue B. Idling +ensures that, if A makes a new I/O request slightly after becoming +empty, then no request of B is dispatched in the middle, and thus A +does not lose the chance to get more than one request dispatched +before the next request of B is dispatched. Note that idling +guarantees the desired differentiated treatment of queues only in +terms of I/O-request dispatches. To guarantee that the actual service +order then corresponds to the dispatch order, the strict_guarantees +tunable must be set too. + +There is an important flip side to idling: apart from the above cases, +where it is beneficial for throughput as well, idling can severely impact +throughput. One important case is random workloads. Because of this +issue, BFQ tends to avoid idling as much as possible when it does not +also benefit throughput. As a consequence of this behavior, and +of further issues described for the strict_guarantees tunable, +short-term service guarantees may be occasionally violated. And, in +some cases, these guarantees may be more important than guaranteeing +maximum throughput. For example, in video playing/streaming, a very +low drop rate may be more important than maximum throughput. In these +cases, consider setting the strict_guarantees parameter. + +strict_guarantees +----------------- + +If this parameter is set (default: unset), then BFQ + +- always performs idling when the in-service queue becomes empty; + +- forces the device to serve one I/O request at a time, by dispatching a + new request only if there is no outstanding request. + +In the presence of differentiated weights or I/O-request sizes, both +of the above conditions are needed to guarantee that every BFQ queue +receives its allotted share of the bandwidth. The first condition is +needed for the reasons explained in the description of the slice_idle +tunable. The second condition is needed because all modern storage +devices reorder internally-queued requests, which may trivially break +the service guarantees enforced by the I/O scheduler. + +Setting strict_guarantees may evidently affect throughput. + +back_seek_max +------------- + +This specifies, in Kbytes, the maximum "distance" for backward seeking. +The distance is the amount of space from the current head location to the +sectors that are backward in terms of distance.
+ +This parameter allows the scheduler to anticipate requests in the "backward" +direction and consider them as being the "next" if they are within this +distance from the current head location. + +back_seek_penalty +----------------- + +This parameter is used to compute the cost of backward seeking. If the +backward distance of a request is just 1/back_seek_penalty of the distance +of a "front" request, then the seek costs of the two requests are considered +equivalent. + +So the scheduler will not bias toward one or the other request (otherwise it +would bias toward the front request). The default value of back_seek_penalty +is 2. + +fifo_expire_async +----------------- + +This parameter is used to set the timeout of asynchronous requests. The +default value is 248 ms. + +fifo_expire_sync +---------------- + +This parameter is used to set the timeout of synchronous requests. The +default value is 124 ms. To favor synchronous requests over asynchronous +ones, this value should be decreased relative to fifo_expire_async. + +low_latency +----------- + +This parameter is used to enable/disable BFQ's low latency mode. By +default, low latency mode is enabled. If enabled, interactive and soft +real-time applications are privileged and experience a lower latency, +as explained in more detail in the description of how BFQ works. + +DISABLE this mode if you need full control over bandwidth +distribution. In fact, if it is enabled, then BFQ automatically +increases the bandwidth share of privileged applications, as the main +means to guarantee them a lower latency. + +In addition, as already highlighted at the beginning of this document, +DISABLE this mode if your only goal is to achieve a high throughput. +In fact, privileging the I/O of some application over the rest may +entail a lower throughput. To achieve the highest possible throughput +on a non-rotational device, setting slice_idle to 0 may be needed too +(at the cost of giving up any strong guarantee on fairness and low +latency). + +timeout_sync +------------ + +Maximum amount of device time that can be given to a task (queue) once +it has been selected for service. On devices with costly seeks, +increasing this time usually increases maximum throughput. On the +other hand, increasing this time coarsens the granularity of the +short-term bandwidth and latency guarantees, especially if the +following parameter is set to zero. + +max_budget +---------- + +Maximum amount of service, measured in sectors, that can be provided +to a BFQ queue once it is set in service (of course within the limits +of the above timeout). As explained in the description of the +algorithm, larger values increase the throughput in proportion to +the percentage of sequential I/O requests issued. The price of larger +values is that they coarsen the granularity of short-term bandwidth +and latency guarantees. + +The default value is 0, which enables auto-tuning: BFQ sets max_budget +to the maximum number of sectors that can be served during +timeout_sync, according to the estimated peak rate. + +weights +------- + +Read-only parameter, used to show the weights of the currently active +BFQ queues. + + +wr_ tunables +------------ + +BFQ exports a few parameters to control/tune the behavior of +low-latency heuristics. + +wr_coeff + +Factor by which the weight of a weight-raised queue is multiplied. If +the queue is deemed soft real-time, then the weight is further +multiplied by an additional, constant factor.
+ +wr_max_time + +Maximum duration of a weight-raising period for an interactive task +(ms). If set to zero (default value), then this value is computed +automatically, as a function of the peak rate of the device. In any +case, when the value of this parameter is read, it always reports the +current duration, regardless of whether it has been set manually or +computed automatically. + +wr_max_softrt_rate + +Maximum service rate below which a queue is deemed to be associated +with a soft real-time application, and is then weight-raised +accordingly (sectors/sec). + +wr_min_idle_time + +Minimum idle period after which interactive weight-raising may be +reactivated for a queue (in ms). + +wr_rt_max_time + +Maximum weight-raising duration for soft real-time queues (in ms). The +start time from which this duration is considered is automatically +moved forward if the queue is detected to be still soft real-time +before the current soft real-time weight-raising period finishes. + +wr_min_inter_arr_async + +Minimum period between I/O request arrivals after which weight-raising +may be reactivated for an already busy async queue (in ms). + + +4. Group scheduling with BFQ +============================ + +BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely +blkio and io. In particular, BFQ supports weight-based proportional +share. To activate cgroups support, set BFQ_GROUP_IOSCHED. + +4-1 Service guarantees provided +------------------------------- + +With BFQ, proportional share means true proportional share of the +device bandwidth, according to group weights. For example, a group +with weight 200 gets twice the bandwidth, and not just twice the time, +of a group with weight 100. + +BFQ supports hierarchies (group trees) of any depth. Bandwidth is +distributed among groups and processes in the expected way: for each +group, the children of the group share the whole bandwidth of the +group in proportion to their weights. In particular, this implies +that, for each leaf group, every process of the group receives the +same share of the whole group bandwidth, unless the ioprio of the +process is modified. + +The resource-sharing guarantee for a group may partially or totally +switch from bandwidth to time, if providing bandwidth guarantees to +the group lowers the throughput too much. This switch occurs on a +per-process basis: if a process of a leaf group would cause a throughput +loss if served in such a way as to receive its share of the bandwidth, +then BFQ switches back to just time-based proportional share for that +process. + +4-2 Interface +------------- + +To get proportional sharing of bandwidth with BFQ for a given device, +BFQ must of course be the active scheduler for that device. + +Within each group directory, the names of the files associated with +BFQ-specific cgroup parameters and stats begin with the "bfq." +prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for +BFQ-specific files is "blkio.bfq." or "io.bfq.". For example, the group +parameter to set the weight of a group with BFQ is blkio.bfq.weight +or io.bfq.weight. + +Parameters to set +----------------- + +For each group, there is only the following parameter to set. + +weight (namely blkio.bfq.weight or io.bfq.weight): the weight of the +group inside its parent. Available values: 1..10000 (default 100). The +linear mapping between ioprio and weights, described at the beginning +of the tunable section, is still valid, but all weights higher than +IOPRIO_BE_NR*10 are mapped to ioprio 0.
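+
+For example, a minimal user-space sketch (hedged: the cgroup-v2 mount
+point and the group name below are illustrative, and the group
+directory is assumed to already exist) that gives a group twice the
+default share:
+
+	#include <fcntl.h>
+	#include <unistd.h>
+
+	int set_bfq_weight(void)
+	{
+		/* io.bfq.weight of 200 = twice the default weight of 100. */
+		int fd = open("/sys/fs/cgroup/grp/io.bfq.weight", O_WRONLY);
+
+		if (fd < 0)
+			return -1;
+		if (write(fd, "200", 3) != 3) {
+			close(fd);
+			return -1;
+		}
+		return close(fd);
+	}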
+ +Recall that, if low-latency is set, then BFQ automatically raises the +weight of the queues associated with interactive and soft real-time +applications. Unset this tunable if you need/want to control weights. + + +[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O + Scheduler", Proceedings of the First Workshop on Mobile System + Technologies (MST-2015), May 2015. + http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf + +[2] P. Valente and M. Andreolini, "Improving Application + Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of + the 5th Annual International Systems and Storage Conference + (SYSTOR '12), June 2012. + Slightly extended version: + http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite- + results.pdf diff --git a/Documentation/block/kyber-iosched.txt b/Documentation/block/kyber-iosched.txt new file mode 100644 index 0000000000000000000000000000000000000000..e94feacd7edcd7d8156ea6a5702e1fe781164bb1 --- /dev/null +++ b/Documentation/block/kyber-iosched.txt @@ -0,0 +1,14 @@ +Kyber I/O scheduler tunables +============================ + +The only two tunables for the Kyber scheduler are the target latencies for +reads and synchronous writes. Kyber will throttle requests in order to meet +these target latencies. + +read_lat_nsec +------------- +Target latency for reads (in nanoseconds). + +write_lat_nsec +-------------- +Target latency for synchronous writes (in nanoseconds). diff --git a/Documentation/block/queue-sysfs.txt b/Documentation/block/queue-sysfs.txt index c0a3bb5a6e4eb291d077f10633001c439563ccc2..2c1e67058fd3bdf02336a71dc34ec885b9b84c2d 100644 --- a/Documentation/block/queue-sysfs.txt +++ b/Documentation/block/queue-sysfs.txt @@ -43,11 +43,6 @@ large discards are issued, setting this value lower will make Linux issue smaller discards and potentially help reduce latencies induced by large discard operations. -discard_zeroes_data (RO) ------------------------- -When read, this file will show if the discarded block are zeroed by the -device or not. If its value is '1' the blocks are zeroed otherwise not. - hw_sector_size (RO) ------------------- This is the hardware sector size of the device, in bytes. @@ -192,5 +187,11 @@ scaling back writes. Writing a value of '0' to this file disables the feature. Writing a value of '-1' to this file resets the value to the default setting. +throttle_sample_time (RW) +------------------------- +This is the time window in which blk-throttle samples data, in milliseconds. +blk-throttle makes decisions based on the samples. A lower time means cgroups +have smoother throughput, but higher CPU overhead. This exists only when +CONFIG_BLK_DEV_THROTTLING_LOW is enabled. Jens Axboe , February 2009 diff --git a/Documentation/blockdev/mflash.txt b/Documentation/blockdev/mflash.txt deleted file mode 100644 index f7e05055148715f1eca44617fecaf20193c799dd..0000000000000000000000000000000000000000 --- a/Documentation/blockdev/mflash.txt +++ /dev/null @@ -1,84 +0,0 @@ -This document describes m[g]flash support in linux. - -Contents - 1. Overview - 2. Reserved area configuration - 3. Example of mflash platform driver registration - -1. Overview - -Mflash and gflash are embedded flash drive. The only difference is mflash is -MCP(Multi Chip Package) device. These two device operate exactly same way. -So the rest mflash repersents mflash and gflash altogether. - -Internally, mflash has nand flash and other hardware logics and supports -2 different operation (ATA, IO) modes.
ATA mode doesn't need any new -driver and currently works well under standard IDE subsystem. Actually it's -one chip SSD. IO mode is ATA-like custom mode for the host that doesn't have -IDE interface. - -Following are brief descriptions about IO mode. -A. IO mode based on ATA protocol and uses some custom command. (read confirm, -write confirm) -B. IO mode uses SRAM bus interface. -C. IO mode supports 4kB boot area, so host can boot from mflash. - -2. Reserved area configuration -If host boot from mflash, usually needs raw area for boot loader image. All of -the mflash's block device operation will be taken this value as start offset. -Note that boot loader's size of reserved area and kernel configuration value -must be same. - -3. Example of mflash platform driver registration -Working mflash is very straight forward. Adding platform device stuff to board -configuration file is all. Here is some pseudo example. - -static struct mg_drv_data mflash_drv_data = { - /* If you want to polling driver set to 1 */ - .use_polling = 0, - /* device attribution */ - .dev_attr = MG_BOOT_DEV -}; - -static struct resource mg_mflash_rsc[] = { - /* Base address of mflash */ - [0] = { - .start = 0x08000000, - .end = 0x08000000 + SZ_64K - 1, - .flags = IORESOURCE_MEM - }, - /* mflash interrupt pin */ - [1] = { - .start = IRQ_GPIO(84), - .end = IRQ_GPIO(84), - .flags = IORESOURCE_IRQ - }, - /* mflash reset pin */ - [2] = { - .start = 43, - .end = 43, - .name = MG_RST_PIN, - .flags = IORESOURCE_IO - }, - /* mflash reset-out pin - * If you use mflash as storage device (i.e. other than MG_BOOT_DEV), - * should assign this */ - [3] = { - .start = 51, - .end = 51, - .name = MG_RSTOUT_PIN, - .flags = IORESOURCE_IO - } -}; - -static struct platform_device mflash_dev = { - .name = MG_DEV_NAME, - .id = -1, - .dev = { - .platform_data = &mflash_drv_data, - }, - .num_resources = ARRAY_SIZE(mg_mflash_rsc), - .resource = mg_mflash_rsc -}; - -platform_device_register(&mflash_dev); diff --git a/Documentation/cgroup-v2.txt b/Documentation/cgroup-v2.txt index 49d7c997fa1ee7f759b5ba319bb57be464f0bd47..dc5e2dcdbef40b344f373614220460ebe079f768 100644 --- a/Documentation/cgroup-v2.txt +++ b/Documentation/cgroup-v2.txt @@ -871,6 +871,11 @@ PAGE_SIZE multiple when read back. Amount of memory used in network transmission buffers + shmem + + Amount of cached filesystem data that is swap-backed, + such as tmpfs, shm segments, shared anonymous mmap()s + file_mapped Amount of cached filesystem data mapped with mmap() @@ -913,6 +918,18 @@ PAGE_SIZE multiple when read back. 
Number of major page faults incurred + workingset_refault + + Number of refaults of previously evicted pages + + workingset_activate + + Number of refaulted pages that were immediately activated + + workingset_nodereclaim + + Number of times a shadow node has been reclaimed + memory.swap.current + A read-only single value file which exists on non-root diff --git a/Documentation/conf.py b/Documentation/conf.py index f2b9161583773defb6fd4f5b6b9f76aea7c21748..bacf9d337c89a067c8a9187d4cfeba6fe6582fcd 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -17,7 +17,7 @@ import os import sphinx # Get Sphinx version -major, minor, patch = map(int, sphinx.__version__.split(".")) +major, minor, patch = sphinx.version_info[:3] # If extensions (or modules to document with autodoc) are in another directory, @@ -29,7 +29,7 @@ from load_config import loadConfig # -- General configuration ------------------------------------------------ # If your documentation needs a minimal Sphinx version, state it here. -#needs_sphinx = '1.0' +needs_sphinx = '1.2' # Add any Sphinx extension module names here, as strings. They can be # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom @@ -348,6 +348,8 @@ latex_documents = [ 'The kernel development community', 'manual'), ('driver-api/index', 'driver-api.tex', 'The kernel driver API manual', 'The kernel development community', 'manual'), + ('input/index', 'linux-input.tex', 'The Linux input driver subsystem', + 'The kernel development community', 'manual'), ('kernel-documentation', 'kernel-documentation.tex', 'The Linux Kernel Documentation', 'The kernel development community', 'manual'), ('process/index', 'development-process.tex', 'Linux Kernel Development Documentation', diff --git a/Documentation/core-api/flexible-arrays.rst b/Documentation/core-api/flexible-arrays.rst new file mode 100644 index 0000000000000000000000000000000000000000..b6b85a1b518ed5c6fd38a25f74fa967c3666125c --- /dev/null +++ b/Documentation/core-api/flexible-arrays.rst @@ -0,0 +1,130 @@ + +=================================== +Using flexible arrays in the kernel +=================================== + +Large contiguous memory allocations can be unreliable in the Linux kernel. +Kernel programmers will sometimes respond to this problem by allocating +pages with :c:func:`vmalloc()`. This solution is not ideal, though. On 32-bit +systems, memory from vmalloc() must be mapped into a relatively small address +space; it's easy to run out. On SMP systems, the page table changes required +by vmalloc() allocations can require expensive cross-processor interrupts on +all CPUs. And, on all systems, use of space in the vmalloc() range increases +pressure on the translation lookaside buffer (TLB), reducing the performance +of the system. + +In many cases, the need for memory from vmalloc() can be eliminated by piecing +together an array from smaller parts; the flexible array library exists to make +this task easier. + +A flexible array holds an arbitrary (within limits) number of fixed-sized +objects, accessed via an integer index. Sparse arrays are handled +reasonably well. Only single-page allocations are made, so memory +allocation failures should be relatively rare. The down sides are that the +arrays cannot be indexed directly, individual object size cannot exceed the +system page size, and putting data into a flexible array requires a copy +operation.
It's also worth noting that flexible arrays do no internal +locking at all; if concurrent access to an array is possible, then the +caller must arrange for appropriate mutual exclusion. + +The creation of a flexible array is done with :c:func:`flex_array_alloc()`:: + + #include <linux/flex_array.h> + + struct flex_array *flex_array_alloc(int element_size, + unsigned int total, + gfp_t flags); + +The individual object size is provided by ``element_size``, while ``total`` is +the maximum number of objects which can be stored in the array. The flags +argument is passed directly to the internal memory allocation calls. With +the current code, using flags to ask for high memory is likely to lead to +notably unpleasant side effects. + +It is also possible to define flexible arrays at compile time with:: + + DEFINE_FLEX_ARRAY(name, element_size, total); + +This macro will result in a definition of an array with the given name; the +element size and total will be checked for validity at compile time. + +Storing data into a flexible array is accomplished with a call to +:c:func:`flex_array_put()`:: + + int flex_array_put(struct flex_array *array, unsigned int element_nr, + void *src, gfp_t flags); + +This call will copy the data from src into the array, in the position +indicated by ``element_nr`` (which must be less than the maximum specified when +the array was created). If any memory allocations must be performed, flags +will be used. The return value is zero on success, a negative error code +otherwise. + +There might be a need to store data into a flexible array while +running in some sort of atomic context; in this situation, sleeping in the +memory allocator would be a bad thing. That can be avoided by using +``GFP_ATOMIC`` for the flags value, but, often, there is a better way. The +trick is to ensure that any needed memory allocations are done before +entering atomic context, using :c:func:`flex_array_prealloc()`:: + + int flex_array_prealloc(struct flex_array *array, unsigned int start, + unsigned int nr_elements, gfp_t flags); + +This function will ensure that memory for the elements indexed in the range +defined by ``start`` and ``nr_elements`` has been allocated. Thereafter, a +``flex_array_put()`` call on an element in that range is guaranteed not to +block. + +Getting data back out of the array is done with :c:func:`flex_array_get()`:: + + void *flex_array_get(struct flex_array *fa, unsigned int element_nr); + +The return value is a pointer to the data element, or NULL if that +particular element has never been allocated. + +Note that it is possible to get back a valid pointer for an element which +has never been stored in the array. Memory for array elements is allocated +one page at a time; a single allocation could provide memory for several +adjacent elements. Flexible array elements are normally initialized to the +value ``FLEX_ARRAY_FREE`` (defined as 0x6c in <linux/poison.h>), so errors +involving that number probably result from use of unstored array entries. +Note that, if array elements are allocated with ``__GFP_ZERO``, they will be +initialized to zero and this poisoning will not happen. + +Individual elements in the array can be cleared with +:c:func:`flex_array_clear()`:: + + int flex_array_clear(struct flex_array *array, unsigned int element_nr); + +This function will set the given element to ``FLEX_ARRAY_FREE`` and return +zero. If storage for the indicated element is not allocated for the array, +``flex_array_clear()`` will return ``-EINVAL`` instead.
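+
+Putting these calls together, a minimal usage sketch (kernel context; the
+element type and values are made up, error handling is abbreviated, and
+``flex_array_free()`` is described just below)::
+
+	#include <linux/errno.h>
+	#include <linux/flex_array.h>
+
+	struct item { int val; };
+
+	static int flex_demo(void)
+	{
+		struct flex_array *fa;
+		struct item in = { .val = 42 }, *out;
+		int ret;
+
+		/* Room for up to 1000 items, allocated page by page. */
+		fa = flex_array_alloc(sizeof(struct item), 1000, GFP_KERNEL);
+		if (!fa)
+			return -ENOMEM;
+
+		ret = flex_array_put(fa, 10, &in, GFP_KERNEL);	/* copies in */
+		if (!ret) {
+			out = flex_array_get(fa, 10);	/* pointer to the copy */
+			if (!out || out->val != 42)
+				ret = -EINVAL;
+			flex_array_clear(fa, 10);	/* mark the slot free */
+		}
+
+		flex_array_free(fa);
+		return ret;
+	}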
+As of this writing, there are no users of flexible arrays in the mainline
+kernel. The functions described here are also not exported to modules;
+that will probably be fixed when somebody comes up with a need for it.
+
+
+Flexible array functions
+------------------------
+
+.. kernel-doc:: include/linux/flex_array.h
diff --git a/Documentation/core-api/genericirq.rst b/Documentation/core-api/genericirq.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0054bd48be849035146239cb6efc0d7dba0c0a62
--- /dev/null
+++ b/Documentation/core-api/genericirq.rst
@@ -0,0 +1,440 @@
+.. include:: <isonum.txt>
+
+==========================
+Linux generic IRQ handling
+==========================
+
+:Copyright: |copy| 2005-2010: Thomas Gleixner
+:Copyright: |copy| 2005-2006: Ingo Molnar
+
+Introduction
+============
+
+The generic interrupt handling layer is designed to provide a complete
+abstraction of interrupt handling for device drivers. It is able to
+handle all the different types of interrupt controller hardware. Device
+drivers use generic API functions to request, enable, disable and free
+interrupts. The drivers do not have to know anything about interrupt
+hardware details, so they can be used on different platforms without
+code changes.
+
+This documentation is provided to developers who want to implement an
+interrupt subsystem for their architecture, with the help of the
+generic IRQ handling layer.
+
+Rationale
+=========
+
+The original implementation of interrupt handling in Linux uses the
+:c:func:`__do_IRQ` super-handler, which is able to deal with every type of
+interrupt logic.
+
+Originally, Russell King identified different types of handlers to build
+a quite universal set for the ARM interrupt handler implementation in
+Linux 2.5/2.6. He distinguished between:
+
+- Level type
+
+- Edge type
+
+- Simple type
+
+During the implementation we identified another type:
+
+- Fast EOI type
+
+In the SMP world of the :c:func:`__do_IRQ` super-handler another type was
+identified:
+
+- Per CPU type
+
+This split implementation of high-level IRQ handlers allows us to
+optimize the flow of the interrupt handling for each specific interrupt
+type. This reduces complexity in that particular code path and allows
+the optimized handling of a given type.
+
+The original general IRQ implementation used hw_interrupt_type
+structures and their ``->ack``, ``->end`` [etc.] callbacks to differentiate
+the flow control in the super-handler.
+This leads to a mix of flow logic and low-level hardware logic, and it
+also leads to unnecessary code duplication: for example in i386, there
+is an ``ioapic_level_irq`` and an ``ioapic_edge_irq`` IRQ-type which share
+many of the low-level details but have different flow handling.
+
+A more natural abstraction is the clean separation of the 'irq flow' and
+the 'chip details'.
+
+Analysing a couple of architectures' IRQ subsystem implementations
+reveals that most of them can use a generic set of 'irq flow' methods
+and only need to add the chip-level specific code. The separation is
+also valuable for (sub)architectures which need specific quirks in the
+IRQ flow itself but not in the chip details - and thus provides a more
+transparent IRQ subsystem design.
+
+Each interrupt descriptor is assigned its own high-level flow handler,
+which is normally one of the generic implementations. (This high-level
+flow handler implementation also makes it simple to provide
+demultiplexing handlers which can be found in embedded platforms on
+various architectures.)
+
+The separation makes the generic interrupt handling layer more flexible
+and extensible. For example, a (sub)architecture can use a generic
+IRQ-flow implementation for 'level type' interrupts and add a
+(sub)architecture-specific 'edge type' implementation.
+
+To make the transition to the new model easier and prevent the breakage
+of existing implementations, the :c:func:`__do_IRQ` super-handler is still
+available. This leads to a kind of duality for the time being. Over time
+the new model should be used in more and more architectures, as it
+enables smaller and cleaner IRQ subsystems. It has been deprecated for
+three years now and is about to be removed.
+
+Known Bugs And Assumptions
+==========================
+
+None (knock on wood).
+
+Abstraction layers
+==================
+
+There are three main levels of abstraction in the interrupt code:
+
+1. High-level driver API
+
+2. High-level IRQ flow handlers
+
+3. Chip-level hardware encapsulation
+
+Interrupt control flow
+----------------------
+
+Each interrupt is described by an interrupt descriptor structure
+irq_desc. The interrupt is referenced by an 'unsigned int' numeric
+value which selects the corresponding interrupt description structure in
+the descriptor structures array. The descriptor structure contains
+status information and pointers to the interrupt flow method and the
+interrupt chip structure which are assigned to this interrupt.
+
+Whenever an interrupt triggers, the low-level architecture code calls
+into the generic interrupt code by calling :c:func:`desc->handle_irq`. This
+high-level IRQ handling function only uses desc->irq_data.chip
+primitives referenced by the assigned chip descriptor structure.
+
+High-level Driver API
+---------------------
+
+The high-level Driver API consists of the following functions:
+
+- :c:func:`request_irq`
+
+- :c:func:`free_irq`
+
+- :c:func:`disable_irq`
+
+- :c:func:`enable_irq`
+
+- :c:func:`disable_irq_nosync` (SMP only)
+
+- :c:func:`synchronize_irq` (SMP only)
+
+- :c:func:`irq_set_irq_type`
+
+- :c:func:`irq_set_irq_wake`
+
+- :c:func:`irq_set_handler_data`
+
+- :c:func:`irq_set_chip`
+
+- :c:func:`irq_set_chip_data`
+
+See the autogenerated function documentation for details.
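+
+As a usage sketch (the ``foo`` device and its helper are invented; the
+calls, flags and return values are the real API)::
+
+    static irqreturn_t foo_interrupt(int irq, void *dev_id)
+    {
+        struct foo_device *foo = dev_id;
+
+        if (!foo_irq_pending(foo))  /* hypothetical helper */
+            return IRQ_NONE;        /* shared line, not our device */
+
+        /* ... handle the event and quiesce the device ... */
+        return IRQ_HANDLED;
+    }
+
+    /* in the probe path */
+    ret = request_irq(foo->irq, foo_interrupt, IRQF_SHARED, "foo", foo);
+    if (ret)
+        return ret;
+
+    /* in the remove path */
+    free_irq(foo->irq, foo);
+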
+
+High-level IRQ flow handlers
+----------------------------
+
+The generic layer provides a set of pre-defined irq-flow methods:
+
+- :c:func:`handle_level_irq`
+
+- :c:func:`handle_edge_irq`
+
+- :c:func:`handle_fasteoi_irq`
+
+- :c:func:`handle_simple_irq`
+
+- :c:func:`handle_percpu_irq`
+
+- :c:func:`handle_edge_eoi_irq`
+
+- :c:func:`handle_bad_irq`
+
+The interrupt flow handlers (either pre-defined or architecture
+specific) are assigned to specific interrupts by the architecture either
+during bootup or during device initialization.
+
+Default flow implementations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Helper functions
+^^^^^^^^^^^^^^^^
+
+The helper functions call the chip primitives and are used by the
+default flow implementations. The following helper functions are
+implemented (simplified excerpt)::
+
+    default_enable(struct irq_data *data)
+    {
+        desc->irq_data.chip->irq_unmask(data);
+    }
+
+    default_disable(struct irq_data *data)
+    {
+        if (!delay_disable(data))
+            desc->irq_data.chip->irq_mask(data);
+    }
+
+    default_ack(struct irq_data *data)
+    {
+        chip->irq_ack(data);
+    }
+
+    default_mask_ack(struct irq_data *data)
+    {
+        if (chip->irq_mask_ack) {
+            chip->irq_mask_ack(data);
+        } else {
+            chip->irq_mask(data);
+            chip->irq_ack(data);
+        }
+    }
+
+    noop(struct irq_data *data)
+    {
+    }
+
+
+
+Default flow handler implementations
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Default Level IRQ flow handler
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+handle_level_irq provides a generic implementation for level-triggered
+interrupts.
+
+The following control flow is implemented (simplified excerpt)::
+
+    desc->irq_data.chip->irq_mask_ack();
+    handle_irq_event(desc->action);
+    desc->irq_data.chip->irq_unmask();
+
+
+Default Fast EOI IRQ flow handler
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+handle_fasteoi_irq provides a generic implementation for interrupts,
+which only need an EOI at the end of the handler.
+
+The following control flow is implemented (simplified excerpt)::
+
+    handle_irq_event(desc->action);
+    desc->irq_data.chip->irq_eoi();
+
+
+Default Edge IRQ flow handler
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+handle_edge_irq provides a generic implementation for edge-triggered
+interrupts.
+
+The following control flow is implemented (simplified excerpt)::
+
+    if (desc->status & running) {
+        desc->irq_data.chip->irq_mask_ack();
+        desc->status |= pending | masked;
+        return;
+    }
+    desc->irq_data.chip->irq_ack();
+    desc->status |= running;
+    do {
+        if (desc->status & masked)
+            desc->irq_data.chip->irq_unmask();
+        desc->status &= ~pending;
+        handle_irq_event(desc->action);
+    } while (desc->status & pending);
+    desc->status &= ~running;
+
+
+Default simple IRQ flow handler
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+handle_simple_irq provides a generic implementation for simple
+interrupts.
+
+.. note::
+
+   The simple flow handler does not call any handler/chip primitives.
+
+The following control flow is implemented (simplified excerpt)::
+
+    handle_irq_event(desc->action);
+
+
+Default per CPU flow handler
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+handle_percpu_irq provides a generic implementation for per CPU
+interrupts.
+
+Per CPU interrupts are only available on SMP and the handler provides a
+simplified version without locking.
+
+The following control flow is implemented (simplified excerpt)::
+
+    if (desc->irq_data.chip->irq_ack)
+        desc->irq_data.chip->irq_ack();
+    handle_irq_event(desc->action);
+    if (desc->irq_data.chip->irq_eoi)
+        desc->irq_data.chip->irq_eoi();
+
+
+EOI Edge IRQ flow handler
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+handle_edge_eoi_irq provides an abomination of the edge handler
+which is solely used to tame a badly wrecked IRQ controller on
+powerpc/cell.
+
+Bad IRQ flow handler
+^^^^^^^^^^^^^^^^^^^^
+
+handle_bad_irq is used for spurious interrupts which have no real
+handler assigned.
+
+Quirks and optimizations
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The generic functions are intended for 'clean' architectures and chips,
+which have no platform-specific IRQ handling quirks. If an architecture
+needs to implement quirks on the 'flow' level then it can do so by
+overriding the high-level irq-flow handler.
+
+Delayed interrupt disable
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This per-interrupt selectable feature, which was introduced by Russell
+King in the ARM interrupt implementation, does not mask an interrupt at
+the hardware level when :c:func:`disable_irq` is called. The interrupt is kept
+enabled and is masked in the flow handler when an interrupt event
+happens. This prevents losing edge interrupts on hardware which does not
+store an edge interrupt event while the interrupt is disabled at the
+hardware level. When an interrupt arrives while the IRQ_DISABLED flag
+is set, then the interrupt is masked at the hardware level and the
+IRQ_PENDING bit is set. When the interrupt is re-enabled by
+:c:func:`enable_irq` the pending bit is checked and if it is set, the interrupt
+is resent either via hardware or by a software resend mechanism. (It's
+necessary to enable CONFIG_HARDIRQS_SW_RESEND when you want to use
+the delayed interrupt disable feature and your hardware is not capable
+of retriggering an interrupt.) The delayed interrupt disable is not
+configurable.
+
+Chip-level hardware encapsulation
+---------------------------------
+
+The chip-level hardware descriptor structure :c:type:`irq_chip` contains all
+the direct chip-relevant functions, which can be utilized by the irq flow
+implementations.
+
+- ``irq_ack``
+
+- ``irq_mask_ack`` - Optional, recommended for performance
+
+- ``irq_mask``
+
+- ``irq_unmask``
+
+- ``irq_eoi`` - Optional, required for EOI flow handlers
+
+- ``irq_retrigger`` - Optional
+
+- ``irq_set_type`` - Optional
+
+- ``irq_set_wake`` - Optional
+
+These primitives are strictly intended to mean what they say: ack means
+ACK, masking means masking of an IRQ line, etc. It is up to the flow
+handler(s) to use these basic units of low-level functionality.
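+
+As an illustration, a minimal chip implementation might look like the
+sketch below; the ``foo`` names and register offsets are invented, while
+the ``irq_chip`` fields and the helpers used are the real API::
+
+    static void foo_irq_mask(struct irq_data *d)
+    {
+        struct foo_intc *intc = irq_data_get_irq_chip_data(d);
+
+        /* set the bit for this line in the mask-set register */
+        writel(BIT(d->hwirq), intc->base + FOO_REG_MASK_SET);
+    }
+
+    static void foo_irq_unmask(struct irq_data *d)
+    {
+        struct foo_intc *intc = irq_data_get_irq_chip_data(d);
+
+        writel(BIT(d->hwirq), intc->base + FOO_REG_MASK_CLR);
+    }
+
+    static void foo_irq_ack(struct irq_data *d)
+    {
+        struct foo_intc *intc = irq_data_get_irq_chip_data(d);
+
+        writel(BIT(d->hwirq), intc->base + FOO_REG_ACK);
+    }
+
+    static struct irq_chip foo_irq_chip = {
+        .name       = "FOO",
+        .irq_mask   = foo_irq_mask,
+        .irq_unmask = foo_irq_unmask,
+        .irq_ack    = foo_irq_ack,
+    };
+
+The chip is then bound to an interrupt, together with a flow handler::
+
+    irq_set_chip_data(irq, intc);
+    irq_set_chip_and_handler(irq, &foo_irq_chip, handle_level_irq);
+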
+
+__do_IRQ entry point
+====================
+
+The original implementation :c:func:`__do_IRQ` was an alternative entry point
+for all types of interrupts. It no longer exists.
+
+This handler turned out not to be suitable for all interrupt hardware
+and was therefore reimplemented with split functionality for
+edge/level/simple/percpu interrupts. This is not only a functional
+optimization. It also shortens code paths for interrupts.
+
+Locking on SMP
+==============
+
+The locking of chip registers is up to the architecture that defines the
+chip primitives. The per-irq structure is protected via desc->lock, by
+the generic layer.
+
+Generic interrupt chip
+======================
+
+To avoid copies of identical implementations of IRQ chips the core
+provides a configurable generic interrupt chip implementation.
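+
+For example, a simple memory-mapped controller with one mask register
+and one ack register might be set up roughly as follows; the register
+offsets and the controller itself are invented, the calls are the real
+generic-chip API::
+
+    struct irq_chip_generic *gc;
+    struct irq_chip_type *ct;
+
+    gc = irq_alloc_generic_chip("foo_intc", 1, irq_base,
+                                reg_base, handle_level_irq);
+    if (!gc)
+        return -ENOMEM;
+
+    ct = gc->chip_types;
+    ct->chip.irq_mask   = irq_gc_mask_set_bit;
+    ct->chip.irq_unmask = irq_gc_mask_clr_bit;
+    ct->chip.irq_ack    = irq_gc_ack_set_bit;
+    ct->regs.mask       = FOO_REG_MASK;     /* invented offsets */
+    ct->regs.ack        = FOO_REG_ACK;
+
+    irq_setup_generic_chip(gc, IRQ_MSK(32), IRQ_GC_INIT_MASK_CACHE,
+                           IRQ_NOREQUEST, 0);
+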
+Developers should check carefully whether the generic chip fits their +needs before implementing the same functionality slightly differently +themselves. + +.. kernel-doc:: kernel/irq/generic-chip.c + :export: + +Structures +========== + +This chapter contains the autogenerated documentation of the structures +which are used in the generic IRQ layer. + +.. kernel-doc:: include/linux/irq.h + :internal: + +.. kernel-doc:: include/linux/interrupt.h + :internal: + +Public Functions Provided +========================= + +This chapter contains the autogenerated documentation of the kernel API +functions which are exported. + +.. kernel-doc:: kernel/irq/manage.c + +.. kernel-doc:: kernel/irq/chip.c + +Internal Functions Provided +=========================== + +This chapter contains the autogenerated documentation of the internal +functions. + +.. kernel-doc:: kernel/irq/irqdesc.c + +.. kernel-doc:: kernel/irq/handle.c + +.. kernel-doc:: kernel/irq/chip.c + +Credits +======= + +The following people have contributed to this document: + +1. Thomas Gleixner tglx@linutronix.de + +2. Ingo Molnar mingo@elte.hu diff --git a/Documentation/core-api/index.rst b/Documentation/core-api/index.rst index 0d93d808913610c665eab7b7df7a5d2bbd848d6d..62abd36bfffbc1c1421c9212f40ac33d2f0fcf0c 100644 --- a/Documentation/core-api/index.rst +++ b/Documentation/core-api/index.rst @@ -11,11 +11,14 @@ Core utilities .. toctree:: :maxdepth: 1 + kernel-api assoc_array atomic_ops cpu_hotplug local_ops workqueue + genericirq + flexible-arrays Interfaces for kernel debugging =============================== diff --git a/Documentation/core-api/kernel-api.rst b/Documentation/core-api/kernel-api.rst new file mode 100644 index 0000000000000000000000000000000000000000..9ec8488319dcaacf1f595c5fea88580e97955483 --- /dev/null +++ b/Documentation/core-api/kernel-api.rst @@ -0,0 +1,346 @@ +==================== +The Linux Kernel API +==================== + +Data Types +========== + +Doubly Linked Lists +------------------- + +.. kernel-doc:: include/linux/list.h + :internal: + +Basic C Library Functions +========================= + +When writing drivers, you cannot in general use routines which are from +the C Library. Some of the functions have been found generally useful +and they are listed below. The behaviour of these functions may vary +slightly from those defined by ANSI, and these deviations are noted in +the text. + +String Conversions +------------------ + +.. kernel-doc:: lib/vsprintf.c + :export: + +.. kernel-doc:: include/linux/kernel.h + :functions: kstrtol + +.. kernel-doc:: include/linux/kernel.h + :functions: kstrtoul + +.. kernel-doc:: lib/kstrtox.c + :export: + +String Manipulation +------------------- + +.. kernel-doc:: lib/string.c + :export: + +Bit Operations +-------------- + +.. kernel-doc:: arch/x86/include/asm/bitops.h + :internal: + +Basic Kernel Library Functions +============================== + +The Linux kernel provides more basic utility functions. + +Bitmap Operations +----------------- + +.. kernel-doc:: lib/bitmap.c + :export: + +.. kernel-doc:: lib/bitmap.c + :internal: + +Command-line Parsing +-------------------- + +.. kernel-doc:: lib/cmdline.c + :export: + +CRC Functions +------------- + +.. kernel-doc:: lib/crc7.c + :export: + +.. kernel-doc:: lib/crc16.c + :export: + +.. kernel-doc:: lib/crc-itu-t.c + :export: + +.. kernel-doc:: lib/crc32.c + +.. kernel-doc:: lib/crc-ccitt.c + :export: + +idr/ida Functions +----------------- + +.. kernel-doc:: include/linux/idr.h + :doc: idr sync + +.. 
kernel-doc:: lib/idr.c + :doc: IDA description + +.. kernel-doc:: lib/idr.c + :export: + +Memory Management in Linux +========================== + +The Slab Cache +-------------- + +.. kernel-doc:: include/linux/slab.h + :internal: + +.. kernel-doc:: mm/slab.c + :export: + +.. kernel-doc:: mm/util.c + :export: + +User Space Memory Access +------------------------ + +.. kernel-doc:: arch/x86/include/asm/uaccess_32.h + :internal: + +.. kernel-doc:: arch/x86/lib/usercopy_32.c + :export: + +More Memory Management Functions +-------------------------------- + +.. kernel-doc:: mm/readahead.c + :export: + +.. kernel-doc:: mm/filemap.c + :export: + +.. kernel-doc:: mm/memory.c + :export: + +.. kernel-doc:: mm/vmalloc.c + :export: + +.. kernel-doc:: mm/page_alloc.c + :internal: + +.. kernel-doc:: mm/mempool.c + :export: + +.. kernel-doc:: mm/dmapool.c + :export: + +.. kernel-doc:: mm/page-writeback.c + :export: + +.. kernel-doc:: mm/truncate.c + :export: + +Kernel IPC facilities +===================== + +IPC utilities +------------- + +.. kernel-doc:: ipc/util.c + :internal: + +FIFO Buffer +=========== + +kfifo interface +--------------- + +.. kernel-doc:: include/linux/kfifo.h + :internal: + +relay interface support +======================= + +Relay interface support is designed to provide an efficient mechanism +for tools and facilities to relay large amounts of data from kernel +space to user space. + +relay interface +--------------- + +.. kernel-doc:: kernel/relay.c + :export: + +.. kernel-doc:: kernel/relay.c + :internal: + +Module Support +============== + +Module Loading +-------------- + +.. kernel-doc:: kernel/kmod.c + :export: + +Inter Module support +-------------------- + +Refer to the file kernel/module.c for more information. + +Hardware Interfaces +=================== + +Interrupt Handling +------------------ + +.. kernel-doc:: kernel/irq/manage.c + :export: + +DMA Channels +------------ + +.. kernel-doc:: kernel/dma.c + :export: + +Resources Management +-------------------- + +.. kernel-doc:: kernel/resource.c + :internal: + +.. kernel-doc:: kernel/resource.c + :export: + +MTRR Handling +------------- + +.. kernel-doc:: arch/x86/kernel/cpu/mtrr/main.c + :export: + +Security Framework +================== + +.. kernel-doc:: security/security.c + :internal: + +.. kernel-doc:: security/inode.c + :export: + +Audit Interfaces +================ + +.. kernel-doc:: kernel/audit.c + :export: + +.. kernel-doc:: kernel/auditsc.c + :internal: + +.. kernel-doc:: kernel/auditfilter.c + :internal: + +Accounting Framework +==================== + +.. kernel-doc:: kernel/acct.c + :internal: + +Block Devices +============= + +.. kernel-doc:: block/blk-core.c + :export: + +.. kernel-doc:: block/blk-core.c + :internal: + +.. kernel-doc:: block/blk-map.c + :export: + +.. kernel-doc:: block/blk-sysfs.c + :internal: + +.. kernel-doc:: block/blk-settings.c + :export: + +.. kernel-doc:: block/blk-exec.c + :export: + +.. kernel-doc:: block/blk-flush.c + :export: + +.. kernel-doc:: block/blk-lib.c + :export: + +.. kernel-doc:: block/blk-tag.c + :export: + +.. kernel-doc:: block/blk-tag.c + :internal: + +.. kernel-doc:: block/blk-integrity.c + :export: + +.. kernel-doc:: kernel/trace/blktrace.c + :internal: + +.. kernel-doc:: block/genhd.c + :internal: + +.. kernel-doc:: block/genhd.c + :export: + +Char devices +============ + +.. 
kernel-doc:: fs/char_dev.c
+   :export:
+
+Clock Framework
+===============
+
+The clock framework defines programming interfaces to support software
+management of the system clock tree. This framework is widely used with
+System-On-Chip (SOC) platforms to support power management and various
+devices which may need custom clock rates. Note that these "clocks"
+don't relate to timekeeping or real time clocks (RTCs), each of which
+has its own framework. These :c:type:`struct clk <clk>`
+instances may be used to manage, for example, a 96 MHz signal that is used
+to shift bits into and out of peripherals or busses, or otherwise
+trigger synchronous state machine transitions in system hardware.
+
+Power management is supported by explicit software clock gating: unused
+clocks are disabled, so the system doesn't waste power changing the
+state of transistors that aren't in active use. On some systems this may
+be backed by hardware clock gating, where clocks are gated without being
+disabled in software. Sections of chips that are powered but not clocked
+may be able to retain their last state. This low power state is often
+called a *retention mode*. This mode still incurs leakage currents,
+especially with finer circuit geometries, but for CMOS circuits power is
+mostly used by clocked state changes.
+
+Power-aware drivers only enable their clocks when the device they manage
+is in active use. Also, system sleep states often differ according to
+which clock domains are active: while a "standby" state may allow wakeup
+from several active domains, a "mem" (suspend-to-RAM) state may require
+a more wholesale shutdown of clocks derived from higher speed PLLs and
+oscillators, limiting the number of possible wakeup event sources. A
+driver's suspend method may need to be aware of system-specific clock
+constraints on the target sleep state.
+
+Some platforms support programmable clock generators. These can be used
+by external chips of various kinds, such as other CPUs, multimedia
+codecs, and devices with strict requirements for interface clocking.
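+
+As a usage sketch (the connection id "bus" is illustrative; the calls
+are the standard consumer API from include/linux/clk.h)::
+
+    struct clk *clk;
+    int ret;
+
+    clk = devm_clk_get(dev, "bus");
+    if (IS_ERR(clk))
+        return PTR_ERR(clk);
+
+    /* ungate the clock before touching the hardware */
+    ret = clk_prepare_enable(clk);
+    if (ret)
+        return ret;
+
+    /* ... use the device ... */
+
+    /* allow the clock (and its parents) to be gated again */
+    clk_disable_unprepare(clk);
+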
+
+.. kernel-doc:: include/linux/clk.h
+   :internal:
diff --git a/Documentation/cpu-freq/boost.txt b/Documentation/cpu-freq/boost.txt
deleted file mode 100644
index dd62e1334f0a8b52b7a30a2d4f9daff0e90506a5..0000000000000000000000000000000000000000
--- a/Documentation/cpu-freq/boost.txt
+++ /dev/null
@@ -1,93 +0,0 @@
-Processor boosting control
-
-	- information for users -
-
-Quick guide for the impatient:
---------------------
-/sys/devices/system/cpu/cpufreq/boost
-controls the boost setting for the whole system. You can read and write
-that file with either "0" (boosting disabled) or "1" (boosting allowed).
-Reading or writing 1 does not mean that the system is boosting at this
-very moment, but only that the CPU _may_ raise the frequency at it's
-discretion.
---------------------
-
-Introduction
--------------
-Some CPUs support a functionality to raise the operating frequency of
-some cores in a multi-core package if certain conditions apply, mostly
-if the whole chip is not fully utilized and below it's intended thermal
-budget. The decision about boost disable/enable is made either at hardware
-(e.g. x86) or software (e.g ARM).
-On Intel CPUs this is called "Turbo Boost", AMD calls it "Turbo-Core",
-in technical documentation "Core performance boost". In Linux we use
-the term "boost" for convenience.
-
-Rationale for disable switch
-----------------------------
-
-Though the idea is to just give better performance without any user
-intervention, sometimes the need arises to disable this functionality.
-Most systems offer a switch in the (BIOS) firmware to disable the
-functionality at all, but a more fine-grained and dynamic control would
-be desirable:
-1. While running benchmarks, reproducible results are important. Since
-   the boosting functionality depends on the load of the whole package,
-   single thread performance can vary. By explicitly disabling the boost
-   functionality at least for the benchmark's run-time the system will run
-   at a fixed frequency and results are reproducible again.
-2. To examine the impact of the boosting functionality it is helpful
-   to do tests with and without boosting.
-3. Boosting means overclocking the processor, though under controlled
-   conditions. By raising the frequency and the voltage the processor
-   will consume more power than without the boosting, which may be
-   undesirable for instance for mobile users. Disabling boosting may
-   save power here, though this depends on the workload.
-
-
-User controlled switch
-----------------------
-
-To allow the user to toggle the boosting functionality, the cpufreq core
-driver exports a sysfs knob to enable or disable it. There is a file:
-/sys/devices/system/cpu/cpufreq/boost
-which can either read "0" (boosting disabled) or "1" (boosting enabled).
-The file is exported only when cpufreq driver supports boosting.
-Explicitly changing the permissions and writing to that file anyway will
-return EINVAL.
-
-On supported CPUs one can write either a "0" or a "1" into this file.
-This will either disable the boost functionality on all cores in the
-whole system (0) or will allow the software or hardware to boost at will
-(1).
-
-Writing a "1" does not explicitly boost the system, but just allows the
-CPU to boost at their discretion. Some implementations take external
-factors like the chip's temperature into account, so boosting once does
-not necessarily mean that it will occur every time even using the exact
-same software setup.
-
-
-AMD legacy cpb switch
----------------------
-The AMD powernow-k8 driver used to support a very similar switch to
-disable or enable the "Core Performance Boost" feature of some AMD CPUs.
-This switch was instantiated in each CPU's cpufreq directory
-(/sys/devices/system/cpu[0-9]*/cpufreq) and was called "cpb".
-Though the per CPU existence hints at a more fine grained control, the
-actual implementation only supported a system-global switch semantics,
-which was simply reflected into each CPU's file. Writing a 0 or 1 into it
-would pull the other CPUs to the same state.
-For compatibility reasons this file and its behavior is still supported
-on AMD CPUs, though it is now protected by a config switch
-(X86_ACPI_CPUFREQ_CPB). On Intel CPUs this file will never be created,
-even with the config option set.
-This functionality is considered legacy and will be removed in some future
-kernel version.
-
-More fine grained boosting control
-----------------------------------
-
-Technically it is possible to switch the boosting functionality at least
-on a per package basis, for some CPUs even per core. Currently the driver
-does not support it, but this may be implemented in the future.
diff --git a/Documentation/cpu-freq/governors.txt b/Documentation/cpu-freq/governors.txt deleted file mode 100644 index 61b3184b6c24e47c4a31c09763707e49a1d0de01..0000000000000000000000000000000000000000 --- a/Documentation/cpu-freq/governors.txt +++ /dev/null @@ -1,301 +0,0 @@ - CPU frequency and voltage scaling code in the Linux(TM) kernel - - - L i n u x C P U F r e q - - C P U F r e q G o v e r n o r s - - - information for users and developers - - - - Dominik Brodowski - some additions and corrections by Nico Golde - Rafael J. Wysocki - Viresh Kumar - - - - Clock scaling allows you to change the clock speed of the CPUs on the - fly. This is a nice method to save battery power, because the lower - the clock speed, the less power the CPU consumes. - - -Contents: ---------- -1. What is a CPUFreq Governor? - -2. Governors In the Linux Kernel -2.1 Performance -2.2 Powersave -2.3 Userspace -2.4 Ondemand -2.5 Conservative -2.6 Schedutil - -3. The Governor Interface in the CPUfreq Core - -4. References - - -1. What Is A CPUFreq Governor? -============================== - -Most cpufreq drivers (except the intel_pstate and longrun) or even most -cpu frequency scaling algorithms only allow the CPU frequency to be set -to predefined fixed values. In order to offer dynamic frequency -scaling, the cpufreq core must be able to tell these drivers of a -"target frequency". So these specific drivers will be transformed to -offer a "->target/target_index/fast_switch()" call instead of the -"->setpolicy()" call. For set_policy drivers, all stays the same, -though. - -How to decide what frequency within the CPUfreq policy should be used? -That's done using "cpufreq governors". - -Basically, it's the following flow graph: - -CPU can be set to switch independently | CPU can only be set - within specific "limits" | to specific frequencies - - "CPUfreq policy" - consists of frequency limits (policy->{min,max}) - and CPUfreq governor to be used - / \ - / \ - / the cpufreq governor decides - / (dynamically or statically) - / what target_freq to set within - / the limits of policy->{min,max} - / \ - / \ - Using the ->setpolicy call, Using the ->target/target_index/fast_switch call, - the limits and the the frequency closest - "policy" is set. to target_freq is set. - It is assured that it - is within policy->{min,max} - - -2. Governors In the Linux Kernel -================================ - -2.1 Performance ---------------- - -The CPUfreq governor "performance" sets the CPU statically to the -highest frequency within the borders of scaling_min_freq and -scaling_max_freq. - - -2.2 Powersave -------------- - -The CPUfreq governor "powersave" sets the CPU statically to the -lowest frequency within the borders of scaling_min_freq and -scaling_max_freq. - - -2.3 Userspace -------------- - -The CPUfreq governor "userspace" allows the user, or any userspace -program running with UID "root", to set the CPU to a specific frequency -by making a sysfs file "scaling_setspeed" available in the CPU-device -directory. - - -2.4 Ondemand ------------- - -The CPUfreq governor "ondemand" sets the CPU frequency depending on the -current system load. Load estimation is triggered by the scheduler -through the update_util_data->func hook; when triggered, cpufreq checks -the CPU-usage statistics over the last period and the governor sets the -CPU accordingly. The CPU must have the capability to switch the -frequency very quickly. 
- -Sysfs files: - -* sampling_rate: - - Measured in uS (10^-6 seconds), this is how often you want the kernel - to look at the CPU usage and to make decisions on what to do about the - frequency. Typically this is set to values of around '10000' or more. - It's default value is (cmp. with users-guide.txt): transition_latency - * 1000. Be aware that transition latency is in ns and sampling_rate - is in us, so you get the same sysfs value by default. Sampling rate - should always get adjusted considering the transition latency to set - the sampling rate 750 times as high as the transition latency in the - bash (as said, 1000 is default), do: - - $ echo `$(($(cat cpuinfo_transition_latency) * 750 / 1000)) > ondemand/sampling_rate - -* sampling_rate_min: - - The sampling rate is limited by the HW transition latency: - transition_latency * 100 - - Or by kernel restrictions: - - If CONFIG_NO_HZ_COMMON is set, the limit is 10ms fixed. - - If CONFIG_NO_HZ_COMMON is not set or nohz=off boot parameter is - used, the limits depend on the CONFIG_HZ option: - HZ=1000: min=20000us (20ms) - HZ=250: min=80000us (80ms) - HZ=100: min=200000us (200ms) - - The highest value of kernel and HW latency restrictions is shown and - used as the minimum sampling rate. - -* up_threshold: - - This defines what the average CPU usage between the samplings of - 'sampling_rate' needs to be for the kernel to make a decision on - whether it should increase the frequency. For example when it is set - to its default value of '95' it means that between the checking - intervals the CPU needs to be on average more than 95% in use to then - decide that the CPU frequency needs to be increased. - -* ignore_nice_load: - - This parameter takes a value of '0' or '1'. When set to '0' (its - default), all processes are counted towards the 'cpu utilisation' - value. When set to '1', the processes that are run with a 'nice' - value will not count (and thus be ignored) in the overall usage - calculation. This is useful if you are running a CPU intensive - calculation on your laptop that you do not care how long it takes to - complete as you can 'nice' it and prevent it from taking part in the - deciding process of whether to increase your CPU frequency. - -* sampling_down_factor: - - This parameter controls the rate at which the kernel makes a decision - on when to decrease the frequency while running at top speed. When set - to 1 (the default) decisions to reevaluate load are made at the same - interval regardless of current clock speed. But when set to greater - than 1 (e.g. 100) it acts as a multiplier for the scheduling interval - for reevaluating load when the CPU is at its top speed due to high - load. This improves performance by reducing the overhead of load - evaluation and helping the CPU stay at its top speed when truly busy, - rather than shifting back and forth in speed. This tunable has no - effect on behavior at lower speeds/lower CPU loads. - -* powersave_bias: - - This parameter takes a value between 0 to 1000. It defines the - percentage (times 10) value of the target frequency that will be - shaved off of the target. For example, when set to 100 -- 10%, when - ondemand governor would have targeted 1000 MHz, it will target - 1000 MHz - (10% of 1000 MHz) = 900 MHz instead. This is set to 0 - (disabled) by default. 
- - When AMD frequency sensitivity powersave bias driver -- - drivers/cpufreq/amd_freq_sensitivity.c is loaded, this parameter - defines the workload frequency sensitivity threshold in which a lower - frequency is chosen instead of ondemand governor's original target. - The frequency sensitivity is a hardware reported (on AMD Family 16h - Processors and above) value between 0 to 100% that tells software how - the performance of the workload running on a CPU will change when - frequency changes. A workload with sensitivity of 0% (memory/IO-bound) - will not perform any better on higher core frequency, whereas a - workload with sensitivity of 100% (CPU-bound) will perform better - higher the frequency. When the driver is loaded, this is set to 400 by - default -- for CPUs running workloads with sensitivity value below - 40%, a lower frequency is chosen. Unloading the driver or writing 0 - will disable this feature. - - -2.5 Conservative ----------------- - -The CPUfreq governor "conservative", much like the "ondemand" -governor, sets the CPU frequency depending on the current usage. It -differs in behaviour in that it gracefully increases and decreases the -CPU speed rather than jumping to max speed the moment there is any load -on the CPU. This behaviour is more suitable in a battery powered -environment. The governor is tweaked in the same manner as the -"ondemand" governor through sysfs with the addition of: - -* freq_step: - - This describes what percentage steps the cpu freq should be increased - and decreased smoothly by. By default the cpu frequency will increase - in 5% chunks of your maximum cpu frequency. You can change this value - to anywhere between 0 and 100 where '0' will effectively lock your CPU - at a speed regardless of its load whilst '100' will, in theory, make - it behave identically to the "ondemand" governor. - -* down_threshold: - - Same as the 'up_threshold' found for the "ondemand" governor but for - the opposite direction. For example when set to its default value of - '20' it means that if the CPU usage needs to be below 20% between - samples to have the frequency decreased. - -* sampling_down_factor: - - Similar functionality as in "ondemand" governor. But in - "conservative", it controls the rate at which the kernel makes a - decision on when to decrease the frequency while running in any speed. - Load for frequency increase is still evaluated every sampling rate. - - -2.6 Schedutil -------------- - -The "schedutil" governor aims at better integration with the Linux -kernel scheduler. Load estimation is achieved through the scheduler's -Per-Entity Load Tracking (PELT) mechanism, which also provides -information about the recent load [1]. This governor currently does -load based DVFS only for tasks managed by CFS. RT and DL scheduler tasks -are always run at the highest frequency. Unlike all the other -governors, the code is located under the kernel/sched/ directory. - -Sysfs files: - -* rate_limit_us: - - This contains a value in microseconds. The governor waits for - rate_limit_us time before reevaluating the load again, after it has - evaluated the load once. - -For an in-depth comparison with the other governors refer to [2]. - - -3. The Governor Interface in the CPUfreq Core -============================================= - -A new governor must register itself with the CPUfreq core using -"cpufreq_register_governor". 
The struct cpufreq_governor, which has to -be passed to that function, must contain the following values: - -governor->name - A unique name for this governor. -governor->owner - .THIS_MODULE for the governor module (if appropriate). - -plus a set of hooks to the functions implementing the governor's logic. - -The CPUfreq governor may call the CPU processor driver using one of -these two functions: - -int cpufreq_driver_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation); - -int __cpufreq_driver_target(struct cpufreq_policy *policy, - unsigned int target_freq, - unsigned int relation); - -target_freq must be within policy->min and policy->max, of course. -What's the difference between these two functions? When your governor is -in a direct code path of a call to governor callbacks, like -governor->start(), the policy->rwsem is still held in the cpufreq core, -and there's no need to lock it again (in fact, this would cause a -deadlock). So use __cpufreq_driver_target only in these cases. In all -other cases (for example, when there's a "daemonized" function that -wakes up every second), use cpufreq_driver_target to take policy->rwsem -before the command is passed to the cpufreq driver. - -4. References -============= - -[1] Per-entity load tracking: https://lwn.net/Articles/531853/ -[2] Improvements in CPU frequency management: https://lwn.net/Articles/682391/ - diff --git a/Documentation/cpu-freq/index.txt b/Documentation/cpu-freq/index.txt index ef1d39247b054f332fc671bb0f781146866258c3..03a7cee6ac73a4f2dd48248196994538d81f6a4e 100644 --- a/Documentation/cpu-freq/index.txt +++ b/Documentation/cpu-freq/index.txt @@ -21,8 +21,6 @@ Documents in this directory: amd-powernow.txt - AMD powernow driver specific file. -boost.txt - Frequency boosting support. - core.txt - General description of the CPUFreq core and of CPUFreq notifiers. @@ -32,17 +30,12 @@ cpufreq-nforce2.txt - nVidia nForce2 platform specific file. cpufreq-stats.txt - General description of sysfs cpufreq stats. -governors.txt - What are cpufreq governors and how to - implement them? - index.txt - File index, Mailing list and Links (this document) intel-pstate.txt - Intel pstate cpufreq driver specific file. pcc-cpufreq.txt - PCC cpufreq driver specific file. -user-guide.txt - User Guide to CPUFreq - Mailing List ------------ diff --git a/Documentation/cpu-freq/user-guide.txt b/Documentation/cpu-freq/user-guide.txt deleted file mode 100644 index 391da64e94925c5fcfb04f0fc7c56db91c2023fa..0000000000000000000000000000000000000000 --- a/Documentation/cpu-freq/user-guide.txt +++ /dev/null @@ -1,228 +0,0 @@ - CPU frequency and voltage scaling code in the Linux(TM) kernel - - - L i n u x C P U F r e q - - U S E R G U I D E - - - Dominik Brodowski - - - - Clock scaling allows you to change the clock speed of the CPUs on the - fly. This is a nice method to save battery power, because the lower - the clock speed, the less power the CPU consumes. - - -Contents: ---------- -1. Supported Architectures and Processors -1.1 ARM and ARM64 -1.2 x86 -1.3 sparc64 -1.4 ppc -1.5 SuperH -1.6 Blackfin - -2. "Policy" / "Governor"? -2.1 Policy -2.2 Governor - -3. How to change the CPU cpufreq policy and/or speed -3.1 Preferred interface: sysfs - - - -1. Supported Architectures and Processors -========================================= - -1.1 ARM and ARM64 ------------------ - -Almost all ARM and ARM64 platforms support CPU frequency scaling. 
- -1.2 x86 -------- - -The following processors for the x86 architecture are supported by cpufreq: - -AMD Elan - SC400, SC410 -AMD mobile K6-2+ -AMD mobile K6-3+ -AMD mobile Duron -AMD mobile Athlon -AMD Opteron -AMD Athlon 64 -Cyrix Media GXm -Intel mobile PIII and Intel mobile PIII-M on certain chipsets -Intel Pentium 4, Intel Xeon -Intel Pentium M (Centrino) -National Semiconductors Geode GX -Transmeta Crusoe -Transmeta Efficeon -VIA Cyrix 3 / C3 -various processors on some ACPI 2.0-compatible systems [*] -And many more - -[*] Only if "ACPI Processor Performance States" are available -to the ACPI<->BIOS interface. - - -1.3 sparc64 ------------ - -The following processors for the sparc64 architecture are supported by -cpufreq: - -UltraSPARC-III - - -1.4 ppc -------- - -Several "PowerBook" and "iBook2" notebooks are supported. -The following POWER processors are supported in powernv mode: -POWER8 -POWER9 - -1.5 SuperH ----------- - -All SuperH processors supporting rate rounding through the clock -framework are supported by cpufreq. - -1.6 Blackfin ------------- - -The following Blackfin processors are supported by cpufreq: - -BF522, BF523, BF524, BF525, BF526, BF527, Rev 0.1 or higher -BF531, BF532, BF533, Rev 0.3 or higher -BF534, BF536, BF537, Rev 0.2 or higher -BF561, Rev 0.3 or higher -BF542, BF544, BF547, BF548, BF549, Rev 0.1 or higher - - -2. "Policy" / "Governor" ? -========================== - -Some CPU frequency scaling-capable processor switch between various -frequencies and operating voltages "on the fly" without any kernel or -user involvement. This guarantees very fast switching to a frequency -which is high enough to serve the user's needs, but low enough to save -power. - - -2.1 Policy ----------- - -On these systems, all you can do is select the lower and upper -frequency limit as well as whether you want more aggressive -power-saving or more instantly available processing power. - - -2.2 Governor ------------- - -On all other cpufreq implementations, these boundaries still need to -be set. Then, a "governor" must be selected. Such a "governor" decides -what speed the processor shall run within the boundaries. One such -"governor" is the "userspace" governor. This one allows the user - or -a yet-to-implement userspace program - to decide what specific speed -the processor shall run at. - - -3. How to change the CPU cpufreq policy and/or speed -==================================================== - -3.1 Preferred Interface: sysfs ------------------------------- - -The preferred interface is located in the sysfs filesystem. If you -mounted it at /sys, the cpufreq interface is located in a subdirectory -"cpufreq" within the cpu-device directory -(e.g. /sys/devices/system/cpu/cpu0/cpufreq/ for the first CPU). - -affected_cpus : List of Online CPUs that require software - coordination of frequency. - -cpuinfo_cur_freq : Current frequency of the CPU as obtained from - the hardware, in KHz. This is the frequency - the CPU actually runs at. - -cpuinfo_min_freq : this file shows the minimum operating - frequency the processor can run at(in kHz) - -cpuinfo_max_freq : this file shows the maximum operating - frequency the processor can run at(in kHz) - -cpuinfo_transition_latency The time it takes on this CPU to - switch between two frequencies in nano - seconds. If unknown or known to be - that high that the driver does not - work with the ondemand governor, -1 - (CPUFREQ_ETERNAL) will be returned. 
- Using this information can be useful - to choose an appropriate polling - frequency for a kernel governor or - userspace daemon. Make sure to not - switch the frequency too often - resulting in performance loss. - -related_cpus : List of Online + Offline CPUs that need software - coordination of frequency. - -scaling_available_frequencies : List of available frequencies, in KHz. - -scaling_available_governors : this file shows the CPUfreq governors - available in this kernel. You can see the - currently activated governor in - -scaling_cur_freq : Current frequency of the CPU as determined by - the governor and cpufreq core, in KHz. This is - the frequency the kernel thinks the CPU runs - at. - -scaling_driver : this file shows what cpufreq driver is - used to set the frequency on this CPU - -scaling_governor, and by "echoing" the name of another - governor you can change it. Please note - that some governors won't load - they only - work on some specific architectures or - processors. - -scaling_min_freq and -scaling_max_freq show the current "policy limits" (in - kHz). By echoing new values into these - files, you can change these limits. - NOTE: when setting a policy you need to - first set scaling_max_freq, then - scaling_min_freq. - -scaling_setspeed This can be read to get the currently programmed - value by the governor. This can be written to - change the current frequency for a group of - CPUs, represented by a policy. This is supported - currently only by the userspace governor. - -bios_limit : If the BIOS tells the OS to limit a CPU to - lower frequencies, the user can read out the - maximum available frequency from this file. - This typically can happen through (often not - intended) BIOS settings, restrictions - triggered through a service processor or other - BIOS/HW based implementations. - This does not cover thermal ACPI limitations - which can be detected through the generic - thermal driver. - -If you have selected the "userspace" governor which allows you to -set the CPU operating frequency to a specific value, you can read out -the current frequency in - -scaling_setspeed. By "echoing" a new frequency into this - you can change the speed of the CPU, - but only within the limits of - scaling_min_freq and scaling_max_freq. diff --git a/Documentation/cputopology.txt b/Documentation/cputopology.txt index f722f227a73bbc2cc0bbbbd6e0e65ab0d2f8b23e..127c9d8c2174ab419351882c45ef16230fb73563 100644 --- a/Documentation/cputopology.txt +++ b/Documentation/cputopology.txt @@ -100,7 +100,7 @@ not defined by include/asm-XXX/topology.h: For architectures that don't support books (CONFIG_SCHED_BOOK) there are no default definitions for topology_book_id() and topology_book_cpumask(). -For architectures that don't support drawes (CONFIG_SCHED_DRAWER) there are +For architectures that don't support drawers (CONFIG_SCHED_DRAWER) there are no default definitions for topology_drawer_id() and topology_drawer_cpumask(). 
Additionally, CPU topology information is provided under
diff --git a/Documentation/crypto/api-samples.rst b/Documentation/crypto/api-samples.rst
index 0a10819f6107a706c391ee5c01baa519246516fa..d021fd96a76ded1fd3a70c1c3166e7258c5127c9 100644
--- a/Documentation/crypto/api-samples.rst
+++ b/Documentation/crypto/api-samples.rst
@@ -155,9 +155,9 @@ Code Example For Use of Operational State Memory With SHASH
         char ctx[];
     };
 
-    static struct sdescinit_sdesc(struct crypto_shash *alg)
+    static struct sdesc *init_sdesc(struct crypto_shash *alg)
     {
-        struct sdescsdesc;
+        struct sdesc *sdesc;
         int size;
 
         size = sizeof(struct shash_desc) + crypto_shash_descsize(alg);
@@ -172,7 +172,7 @@ Code Example For Use of Operational State Memory With SHASH
     static int calc_hash(struct crypto_shash *alg, const unsigned char *data,
                          unsigned int datalen, unsigned char *digest)
     {
-        struct sdescsdesc;
+        struct sdesc *sdesc;
         int ret;
 
         sdesc = init_sdesc(alg);
diff --git a/Documentation/crypto/asymmetric-keys.txt b/Documentation/crypto/asymmetric-keys.txt
index 2b7816dea370a2f4a30404f5bbc8542f1f231561..5ad6480e3fb96308eed5a0f3f5f80130bd7af61d 100644
--- a/Documentation/crypto/asymmetric-keys.txt
+++ b/Documentation/crypto/asymmetric-keys.txt
@@ -311,3 +311,54 @@ Functions are provided to register and unregister parsers:
 
 Parsers may not have the same name. The names are otherwise only used for
 displaying in debugging messages.
+
+
+=========================
+KEYRING LINK RESTRICTIONS
+=========================
+
+Keyrings created from userspace using add_key can be configured to check the
+signature of the key being linked.
+
+Several restriction methods are available:
+
+ (1) Restrict using the kernel builtin trusted keyring
+
+     - Option string used with KEYCTL_RESTRICT_KEYRING:
+       - "builtin_trusted"
+
+     The kernel builtin trusted keyring will be searched for the signing
+     key. The ca_keys kernel parameter also affects which keys are used for
+     signature verification.
+
+ (2) Restrict using the kernel builtin and secondary trusted keyrings
+
+     - Option string used with KEYCTL_RESTRICT_KEYRING:
+       - "builtin_and_secondary_trusted"
+
+     The kernel builtin and secondary trusted keyrings will be searched for the
+     signing key. The ca_keys kernel parameter also affects which keys are used
+     for signature verification.
+
+ (3) Restrict using a separate key or keyring
+
+     - Option string used with KEYCTL_RESTRICT_KEYRING:
+       - "key_or_keyring:<key or keyring serial number>[:chain]"
+
+     Whenever a key link is requested, the link will only succeed if the key
+     being linked is signed by one of the designated keys. This key may be
+     specified directly by providing a serial number for one asymmetric key, or
+     a group of keys may be searched for the signing key by providing the
+     serial number for a keyring.
+
+     When the "chain" option is provided at the end of the string, the keys
+     within the destination keyring will also be searched for signing keys.
+     This allows for verification of certificate chains by adding each
+     cert in order (starting closest to the root) to one keyring.
+
+In all of these cases, if the signing key is found the signature of the key to
+be linked will be verified using the signing key. The requested key is added
+to the keyring only if the signature is successfully verified. -ENOKEY is
+returned if the parent certificate could not be found, or -EKEYREJECTED is
+returned if the signature check fails or the key is blacklisted. Other errors
+may be returned if the signature check could not be performed.
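+
+As an example, a userspace sequence applying restriction (3) might look
+like the sketch below. It assumes the keyutils library in a version that
+defines KEYCTL_RESTRICT_KEYRING; the serial number is illustrative:
+
+	#include <keyutils.h>
+	#include <stdio.h>
+
+	key_serial_t keyring;
+
+	keyring = add_key("keyring", "verified-keys", NULL, 0,
+			  KEY_SPEC_SESSION_KEYRING);
+
+	/* From now on, only keys signed by key 123456789 (or, because
+	 * of "chain", by keys already linked into this keyring) may
+	 * be added. */
+	if (keyctl(KEYCTL_RESTRICT_KEYRING, keyring, "asymmetric",
+		   "key_or_keyring:123456789:chain") < 0)
+		perror("KEYCTL_RESTRICT_KEYRING");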
diff --git a/Documentation/debugging-via-ohci1394.txt b/Documentation/debugging-via-ohci1394.txt
index 03703afc4d302e7eeb7fb4031d494ab750233194..9ff026d22b751deb64f5f850631c0b6e7e93e4c1 100644
--- a/Documentation/debugging-via-ohci1394.txt
+++ b/Documentation/debugging-via-ohci1394.txt
@@ -100,8 +100,8 @@ Step-by-step instructions for using firescope with early OHCI initialization:
    CardBus and even some Express cards which are fully compliant to OHCI-1394
    specification are available. If it requires no driver for Windows operating
    systems, it most likely is. Only specialized shops have cards which are not
-   compliant, they are based on TI PCILynx chips and require drivers for Win-
-   dows operating systems.
+   compliant, they are based on TI PCILynx chips and require drivers for Windows
+   operating systems.
 
    The mentioned kernel log message contains the string "physUB" if the
    controller implements a writable Physical Upper Bound register. This is
diff --git a/Documentation/device-mapper/cache.txt b/Documentation/device-mapper/cache.txt
index f228604ddbcd6890c1db0ef2778422a499cbd480..cdfd0feb294e6f0be9851223b7669de3109d875a 100644
--- a/Documentation/device-mapper/cache.txt
+++ b/Documentation/device-mapper/cache.txt
@@ -290,7 +290,7 @@ message, which takes an arbitrary number of cblock ranges. Each cblock
 range's end value is "one past the end", meaning 5-10 expresses a range
 of values from 5 to 9. Each cblock must be expressed as a decimal
 value, in the future a variant message that takes cblock ranges
-expressed in hexidecimal may be needed to better support efficient
+expressed in hexadecimal may be needed to better support efficient
 invalidation of larger caches. The cache must be in passthrough mode
 when invalidate_cblocks is used.
diff --git a/Documentation/device-mapper/dm-crypt.txt b/Documentation/device-mapper/dm-crypt.txt
index ff1f87bf26e8ad10e7327ecd74a87cb3a6e64058..3b3e1de21c9ccb3f07b45aa5770e25e611d4e4d5 100644
--- a/Documentation/device-mapper/dm-crypt.txt
+++ b/Documentation/device-mapper/dm-crypt.txt
@@ -11,14 +11,31 @@ Parameters: <cipher> <key> <iv_offset> <device path> \
	      <offset> [<#opt_params> <opt_params>]
 
 <cipher>
-	Encryption cipher and an optional IV generation mode.
-	(In format cipher[:keycount]-chainmode-ivmode[:ivopts]).
+	Encryption cipher, encryption mode and Initial Vector (IV) generator.
+
+	The cipher specifications format is:
+	   cipher[:keycount]-chainmode-ivmode[:ivopts]
 	Examples:
-	   des
 	   aes-cbc-essiv:sha256
-	   twofish-ecb
+	   aes-xts-plain64
+	   serpent-xts-plain64
+
+	Cipher format also supports direct specification with kernel crypto API
+	format (selected by capi: prefix). The IV specification is the same
+	as for the first format type.
+	This format is mainly used for specification of authenticated modes.
 
-	/proc/crypto contains supported crypto modes
+	The crypto API cipher specifications format is:
+	   capi:cipher_api_spec-ivmode[:ivopts]
+	Examples:
+	   capi:cbc(aes)-essiv:sha256
+	   capi:xts(aes)-plain64
+	Examples of authenticated modes:
+	   capi:gcm(aes)-random
+	   capi:authenc(hmac(sha256),xts(aes))-random
+	   capi:rfc7539(chacha20,poly1305)-random
+
+	/proc/crypto contains a list of currently loaded crypto modes.
 
 <key>
 	Key used for encryption. It is encoded either as a hexadecimal number
@@ -93,6 +110,32 @@ submit_from_crypt_cpus
 	thread because it benefits CFQ to have writes submitted using the
 	same context.
 
+integrity:<bytes>:<type>
+	The device requires additional <bytes> metadata per-sector stored
+	in per-bio integrity structure. This metadata must be provided
+	by underlying dm-integrity target.
+
+	The <type> can be "none" if metadata is used only for persistent IV.
+
+	For Authenticated Encryption with Additional Data (AEAD)
+	the <type> is "aead". An AEAD mode additionally calculates and verifies
+	integrity for the encrypted device. The additional space is then
+	used for storing authentication tag (and persistent IV if needed).
+
+sector_size:<bytes>
+	Use <bytes> as the encryption unit instead of 512 bytes sectors.
+	This option can be in range 512 - 4096 bytes and must be power of two.
+	Virtual device will announce this size as a minimal IO and logical sector.
+
+iv_large_sectors
+	IV generators will use sector number counted in <sector_size> units
+	instead of default 512 bytes sectors.
+
+	For example, if <sector_size> is 4096 bytes, plain64 IV for the second
+	sector will be 8 (without flag) and 1 if iv_large_sectors is present.
+	The <device size> must be multiple of <sector_size> (in 512 bytes units)
+	if this flag is specified.
+
 Example scripts
 ===============
 LUKS (Linux Unified Key Setup) is now the preferred way to set up disk
diff --git a/Documentation/device-mapper/dm-integrity.txt b/Documentation/device-mapper/dm-integrity.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f33e3ade7a09bdaf9f6b338d5142acc512a94cb2
--- /dev/null
+++ b/Documentation/device-mapper/dm-integrity.txt
@@ -0,0 +1,199 @@
+The dm-integrity target emulates a block device that has additional
+per-sector tags that can be used for storing integrity information.
+
+A general problem with storing integrity tags with every sector is that
+writing the sector and the integrity tag must be atomic - i.e. in case of
+crash, either both sector and integrity tag or none of them is written.
+
+To guarantee write atomicity, the dm-integrity target uses a journal: it
+writes sector data and integrity tags into the journal, commits the journal
+and then copies the data and integrity tags to their respective location.
+
+The dm-integrity target can be used with the dm-crypt target - in this
+situation the dm-crypt target creates the integrity data and passes them
+to the dm-integrity target via bio_integrity_payload attached to the bio.
+In this mode, the dm-crypt and dm-integrity targets provide authenticated
+disk encryption - if the attacker modifies the encrypted device, an I/O
+error is returned instead of random data.
+
+The dm-integrity target can also be used as a standalone target, in this
+mode it calculates and verifies the integrity tag internally. In this
+mode, the dm-integrity target can be used to detect silent data
+corruption on the disk or in the I/O path.
+
+
+When loading the target for the first time, the kernel driver will format
+the device. But it will only format the device if the superblock contains
+zeroes. If the superblock is neither valid nor zeroed, the dm-integrity
+target can't be loaded.
+
+To use the target for the first time:
+1. overwrite the superblock with zeroes
+2. load the dm-integrity target with a one-sector size; the kernel driver
+   will format the device
+3. unload the dm-integrity target
+4. read the "provided_data_sectors" value from the superblock
+5. load the dm-integrity target with the target size
+   "provided_data_sectors"
+6. if you want to use dm-integrity with dm-crypt, load the dm-crypt target
+   with the size "provided_data_sectors"
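+
+For illustration, the procedure above might look as follows with dmsetup;
+the device name, the sizes and the use of internal_hash:crc32 are
+examples only:
+
+	# step 1: zero the superblock (4kiB at the start of the device)
+	dd if=/dev/zero of=/dev/sdb bs=512 count=8 oflag=direct
+	# steps 2 and 3: let the kernel driver format the device
+	dmsetup create tmp --table \
+		"0 1 integrity /dev/sdb 0 - J 1 internal_hash:crc32"
+	dmsetup remove tmp
+	# steps 4 and 5: load the target with the provided_data_sectors
+	# value read in step 4 (here assumed to be in $SECTORS)
+	dmsetup create dm-int --table \
+		"0 $SECTORS integrity /dev/sdb 0 - J 1 internal_hash:crc32"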
the size of the integrity tag (if "-" is used, the size is taken from + the internal-hash algorithm) + +4. mode: + D - direct writes (without journal) - in this mode, journaling is + not used and data sectors and integrity tags are written + separately. In case of crash, it is possible that the data + and integrity tag don't match. + J - journaled writes - data and integrity tags are written to the + journal and atomicity is guaranteed. In case of crash, + either both data and tag or none of them are written. The + journaled mode halves write throughput because the + data have to be written twice. + R - recovery mode - in this mode, the journal is not replayed, + checksums are not checked and writes to the device are not + allowed. This mode is useful for data recovery if the + device cannot be activated in any of the other standard + modes. + +5. the number of additional arguments + +Additional arguments: + +journal_sectors:number + The size of the journal; this argument is used only when formatting the + device. If the device is already formatted, the value from the + superblock is used. + +interleave_sectors:number + The number of interleaved sectors. This value is rounded down to + a power of two. If the device is already formatted, the value from + the superblock is used. + +buffer_sectors:number + The number of sectors in one buffer. The value is rounded down to + a power of two. + + The tag area is accessed using buffers; the buffer size is + configurable. A larger buffer size means that each I/O will + be larger, but fewer I/Os are issued. + +journal_watermark:number + The journal watermark in percent. When the size of the journal + exceeds this watermark, the thread that flushes the journal will + be started. + +commit_time:number + Commit time in milliseconds. When this time passes, the journal is + written. The journal is also written immediately if a FLUSH + request is received. + +internal_hash:algorithm(:key) (the key is optional) + Use an internal hash or crc. + When this argument is used, the dm-integrity target won't accept + integrity tags from the upper target, but it will automatically + generate and verify the integrity tags. + + You can use a crc algorithm (such as crc32); the integrity target + will then protect the data against accidental corruption. + You can also use an hmac algorithm (for example + "hmac(sha256):0123456789abcdef"); in this mode it will provide + cryptographic authentication of the data without encryption. + + When this argument is not used, the integrity tags are accepted + from an upper layer target, such as dm-crypt. The upper layer + target should check the validity of the integrity tags. + +journal_crypt:algorithm(:key) (the key is optional) + Encrypt the journal using the given algorithm to make sure that an + attacker can't read the journal. You can use a block cipher here + (such as "cbc(aes)") or a stream cipher (for example "chacha20", + "salsa20", "ctr(aes)" or "ecb(arc4)"). + + The journal contains a history of the last writes to the block device; + an attacker reading the journal could see the last sector numbers + that were written. From the sector numbers, the attacker can infer + the size of files that were written. To protect against this + situation, you can encrypt the journal. + +journal_mac:algorithm(:key) (the key is optional) + Protect sector numbers in the journal from accidental or malicious + modification.
To protect against accidental modification, use a + crc algorithm; to protect against malicious modification, use an + hmac algorithm with a key. + + This option is not needed when using internal-hash because in this + mode, the integrity of journal entries is checked when replaying + the journal. Thus, a modified sector number would be detected at + this stage. + +block_size:number + The size of a data block in bytes. The larger the block size, the + less overhead there is for per-block integrity metadata. + Supported values are 512, 1024, 2048 and 4096 bytes. If not + specified, the default block size is 512 bytes. + +The journal mode (D/J), buffer_sectors, journal_watermark and commit_time can +be changed when reloading the target (load an inactive table and swap the +tables with suspend and resume). The other arguments should not be changed +when reloading the target because the layout of the disk data depends on them +and the reloaded target would be non-functional. + + +The layout of the formatted block device: +* reserved sectors (they are not used by this target; they can be used for + storing LUKS metadata or for other purposes), the size of the reserved + area is specified in the target arguments +* superblock (4kiB) + * magic string - identifies that the device was formatted + * version + * log2(interleave sectors) + * integrity tag size + * the number of journal sections + * provided data sectors - the number of sectors that this target + provides (i.e. the size of the device minus the size of all + metadata and padding). The user of this target should not send + bios that access data beyond the "provided data sectors" limit. + * flags - a flag is set if journal_mac is used +* journal + The journal is divided into sections, each section contains: + * metadata area (4kiB), it contains journal entries + every journal entry contains: + * logical sector (specifies where the data and tag should + be written) + * last 8 bytes of data + * integrity tag (the size is specified in the superblock) + every metadata sector ends with + * mac (8 bytes); all the macs in 8 metadata sectors form a + 64-byte value. It is used to store the hmac of sector + numbers in the journal section, to protect against the + possibility that the attacker tampers with sector + numbers in the journal. + * commit id + * data area (the size is variable; it depends on how many journal + entries fit into the metadata area) + every sector in the data area contains: + * data (504 bytes of data; the last 8 bytes are stored in + the journal entry) + * commit id + To test if the whole journal section was written correctly, every + 512-byte sector of the journal ends with an 8-byte commit id. If the + commit id matches on all sectors in a journal section, then it is + assumed that the section was written correctly. If the commit id + doesn't match, the section was written partially and it should not + be replayed. +* one or more runs of interleaved tags and data. Each run contains: + * tag area - it contains integrity tags. There is one tag for each + sector in the data area + * data area - it contains data sectors. The number of data sectors + in one run must be a power of two. log2 of this value is stored + in the superblock.
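As an illustration of the first-time setup sequence described in dm-integrity.txt above, a minimal dmsetup sketch follows. The device name /dev/sdb, the map name "integ", the crc32c internal hash and the way provided_data_sectors is obtained are assumptions for the example, not part of the patch:

    # Zero the superblock (4kiB) so the driver formats the device on first load.
    dd if=/dev/zero of=/dev/sdb bs=4096 count=1

    # Load with a one-sector size; the driver formats the device. The table
    # follows the argument list above: start length integrity <device>
    # <reserved sectors> <tag size> <mode> <#args> [args].
    dmsetup create integ --table "0 1 integrity /dev/sdb 0 - J 1 internal_hash:crc32c"
    dmsetup remove integ

    # Reload with the provided_data_sectors value read from the superblock
    # (obtaining it is left schematic here).
    PROVIDED=...    # provided_data_sectors from the superblock
    dmsetup create integ --table "0 $PROVIDED integrity /dev/sdb 0 - J 1 internal_hash:crc32c"

A dm-crypt target can then be stacked on top of /dev/mapper/integ with the same size, as step 6 of the sequence describes.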
diff --git a/Documentation/device-mapper/dm-raid.txt b/Documentation/device-mapper/dm-raid.txt index cd2cb2fc85ead940394873d941b24a7858ea3336..7e06e65586d4ae110262a4dc8c0eef62d43d9dd5 100644 --- a/Documentation/device-mapper/dm-raid.txt +++ b/Documentation/device-mapper/dm-raid.txt @@ -170,6 +170,13 @@ The target is named "raid" and it accepts the following parameters: Takeover/reshape is not possible with a raid4/5/6 journal device; it has to be deconfigured before requesting these. + [journal_mode <mode>] + This option sets the caching mode on journaled raid4/5/6 raid sets + (see 'journal_dev <dev>' above) to 'writethrough' or 'writeback'. + If 'writeback' is selected the journal device has to be resilient + and must not suffer from the 'write hole' problem itself (e.g. use + raid1 or raid10) to avoid a single point of failure. + <#raid_devs>: The number of devices composing the array. Each device consists of two entries. The first is the device containing the metadata (if any); the second is the one containing the @@ -254,7 +261,8 @@ recovery. Here is a fuller description of the individual fields: <data_offset> The current data offset to the start of the user data on each component device of a raid set (see the respective raid parameter to support out-of-place reshaping). - 'A' - active raid4/5/6 journal device. + 'A' - active write-through journal device. + 'a' - active write-back journal device. 'D' - dead journal device. '-' - no journal device. @@ -331,3 +339,7 @@ Version History 'D' on the status line. If '- -' is passed into the constructor, emit '- -' on the table line and '-' as the status line health character. 1.10.0 Add support for raid4/5/6 journal device +1.10.1 Fix data corruption on reshape request +1.11.0 Fix table line argument order + (wrong raid10_copies/raid10_format sequence) +1.11.1 Add raid4/5/6 journal write-back support via journal_mode option diff --git a/Documentation/devicetree/bindings/arm/amlogic.txt b/Documentation/devicetree/bindings/arm/amlogic.txt index c246cd2730d90669787e0bb5a0f0f50ed5ad3376..bfd5b558477d95bd1c2fe42d47861d791313872e 100644 --- a/Documentation/devicetree/bindings/arm/amlogic.txt +++ b/Documentation/devicetree/bindings/arm/amlogic.txt @@ -43,8 +43,11 @@ Board compatible values: - "wetek,hub" (Meson gxbb) - "wetek,play2" (Meson gxbb) - "amlogic,p212" (Meson gxl s905x) + - "khadas,vim" (Meson gxl s905x) + - "amlogic,p230" (Meson gxl s905d) - "amlogic,p231" (Meson gxl s905d) + - "hwacom,amazetv" (Meson gxl s905x) - "amlogic,q200" (Meson gxm s912) - "amlogic,q201" (Meson gxm s912) - "nexbox,a95x" (Meson gxbb or Meson gxl s905x) diff --git a/Documentation/devicetree/bindings/arm/atmel-at91.txt b/Documentation/devicetree/bindings/arm/atmel-at91.txt index 29737b9b616e46920a6df2a6c9327499eda8b69e..799af90dd75b09b4c28e85bbcf36db0604d63bd2 100644 --- a/Documentation/devicetree/bindings/arm/atmel-at91.txt +++ b/Documentation/devicetree/bindings/arm/atmel-at91.txt @@ -217,7 +217,8 @@ memory, bridge implementations, processor and other functionality not controlled elsewhere. required properties: -- compatible: Should be "atmel,<chip>-sfr", "syscon". +- compatible: Should be "atmel,<chip>-sfr", "syscon" or + "atmel,<chip>-sfrbu", "syscon" <chip> can be "sama5d3", "sama5d4" or "sama5d2".
- reg: Should contain registers location and length diff --git a/Documentation/devicetree/bindings/arm/cavium-thunder2.txt b/Documentation/devicetree/bindings/arm/cavium-thunder2.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc5dd65cbce7a13a46eb4ef3e3ca172074509775 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/cavium-thunder2.txt @@ -0,0 +1,8 @@ +Cavium ThunderX2 CN99XX platform tree bindings +---------------------------------------------- + +Boards with the Cavium ThunderX2 CN99XX SoC shall have the root property: + compatible = "cavium,thunderx2-cn9900", "brcm,vulcan-soc"; + +These SoCs use the "cavium,thunder2" core, which is compatible +with "brcm,vulcan". diff --git a/Documentation/devicetree/bindings/arm/cpus.txt b/Documentation/devicetree/bindings/arm/cpus.txt index 698ad1f097fa34c835478a84a7ad9bddfc86e9b6..1030f5f50207f22baa245e9df9ca69df286c128e 100644 --- a/Documentation/devicetree/bindings/arm/cpus.txt +++ b/Documentation/devicetree/bindings/arm/cpus.txt @@ -170,6 +170,7 @@ nodes to be present and contain the properties described below. "brcm,brahma-b15" "brcm,vulcan" "cavium,thunder" + "cavium,thunder2" "faraday,fa526" "intel,sa110" "intel,sa1100" diff --git a/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.txt b/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.txt new file mode 100644 index 0000000000000000000000000000000000000000..d38834c67dffe910af59fa15531b9fcd2303a33b --- /dev/null +++ b/Documentation/devicetree/bindings/arm/firmware/linaro,optee-tz.txt @@ -0,0 +1,31 @@ +OP-TEE Device Tree Bindings + +OP-TEE is a piece of software using hardware features to provide a Trusted +Execution Environment. The security can be provided with ARM TrustZone, but +also by virtualization or a separate chip. + +We're using "linaro" as the first part of the compatible property for +the reference implementation maintained by Linaro. + +* OP-TEE based on ARM TrustZone required properties: + +- compatible : should contain "linaro,optee-tz" + +- method : The method of calling the OP-TEE Trusted OS.
Permitted + values are: + + "smc" : SMC #0, with the register assignments specified + in drivers/tee/optee/optee_smc.h + + "hvc" : HVC #0, with the register assignments specified + in drivers/tee/optee/optee_smc.h + + + +Example: + firmware { + optee { + compatible = "linaro,optee-tz"; + method = "smc"; + }; + }; diff --git a/Documentation/devicetree/bindings/arm/fsl.txt b/Documentation/devicetree/bindings/arm/fsl.txt index c9c567ae227f7a3f948ddc3a2ad66ba7d3664f05..cdb9dd705754a82928ffb90b7d67faf493104a24 100644 --- a/Documentation/devicetree/bindings/arm/fsl.txt +++ b/Documentation/devicetree/bindings/arm/fsl.txt @@ -179,6 +179,18 @@ LS1046A ARMv8 based RDB Board Required root node properties: - compatible = "fsl,ls1046a-rdb", "fsl,ls1046a"; +LS1088A SoC +Required root node properties: + - compatible = "fsl,ls1088a"; + +LS1088A ARMv8 based QDS Board +Required root node properties: + - compatible = "fsl,ls1088a-qds", "fsl,ls1088a"; + +LS1088A ARMv8 based RDB Board +Required root node properties: + - compatible = "fsl,ls1088a-rdb", "fsl,ls1088a"; + LS2080A SoC Required root node properties: - compatible = "fsl,ls2080a"; @@ -195,3 +207,14 @@ LS2080A ARMv8 based RDB Board Required root node properties: - compatible = "fsl,ls2080a-rdb", "fsl,ls2080a"; +LS2088A SoC +Required root node properties: + - compatible = "fsl,ls2088a"; + +LS2088A ARMv8 based QDS Board +Required root node properties: + - compatible = "fsl,ls2088a-qds", "fsl,ls2088a"; + +LS2088A ARMv8 based RDB Board +Required root node properties: + - compatible = "fsl,ls2088a-rdb", "fsl,ls2088a"; diff --git a/Documentation/devicetree/bindings/arm/gemini.txt b/Documentation/devicetree/bindings/arm/gemini.txt new file mode 100644 index 0000000000000000000000000000000000000000..0041eb031116af2ed8e306ffa6d30eedfc7a4aca --- /dev/null +++ b/Documentation/devicetree/bindings/arm/gemini.txt @@ -0,0 +1,86 @@ +Cortina Systems Gemini platforms + +The Gemini SoC is the project name for an ARMv4 FA525-based SoC originally +produced by Storlink Semiconductor around 2005. The company was later +renamed Storm Semiconductor. The chip product name is Storlink SL3516. +It was derived from earlier products from Storm named SL3316 (Centroid) and +SL3512 (Bulverde). + +Storm Semiconductor was acquired by Cortina Systems in 2008 and the SoC was +produced and used for NAS and similar use cases. In 2014 Cortina Systems was +in turn acquired by Inphi, who seem to have discontinued this product family. + +Many of the IP blocks used in the SoC come from Faraday Technology. + +Required properties (in root node): + compatible = "cortina,gemini"; + +Required nodes: + +- soc: the SoC should be represented by a simple bus encompassing all the + onchip devices; this is referred to as the soc bus node.
+ +- syscon: the soc bus node must have a system controller node pointing to the + global control registers, with the compatible string + "cortina,gemini-syscon", "syscon"; + +- timer: the soc bus node must have a timer node pointing to the SoC timer + block, with the compatible string "cortina,gemini-timer" + See: clocksource/cortina,gemini-timer.txt + +- interrupt-controller: the soc bus node must have an interrupt controller + node pointing to the SoC interrupt controller block, with the compatible + string "cortina,gemini-interrupt-controller" + See interrupt-controller/cortina,gemini-interrupt-controller.txt + +Example: + +/ { + model = "Foo Gemini Machine"; + compatible = "cortina,gemini"; + #address-cells = <1>; + #size-cells = <1>; + + memory { + device_type = "memory"; + reg = <0x00000000 0x8000000>; + }; + + soc { + #address-cells = <1>; + #size-cells = <1>; + ranges; + compatible = "simple-bus"; + interrupt-parent = <&intcon>; + + syscon: syscon@40000000 { + compatible = "cortina,gemini-syscon", "syscon"; + reg = <0x40000000 0x1000>; + }; + + uart0: serial@42000000 { + compatible = "ns16550a"; + reg = <0x42000000 0x100>; + clock-frequency = <48000000>; + interrupts = <18 IRQ_TYPE_LEVEL_HIGH>; + reg-shift = <2>; + }; + + timer@43000000 { + compatible = "cortina,gemini-timer"; + reg = <0x43000000 0x1000>; + interrupt-parent = <&intcon>; + interrupts = <14 IRQ_TYPE_EDGE_FALLING>, /* Timer 1 */ + <15 IRQ_TYPE_EDGE_FALLING>, /* Timer 2 */ + <16 IRQ_TYPE_EDGE_FALLING>; /* Timer 3 */ + syscon = <&syscon>; + }; + + intcon: interrupt-controller@48000000 { + compatible = "cortina,gemini-interrupt-controller"; + reg = <0x48000000 0x1000>; + interrupt-controller; + #interrupt-cells = <2>; + }; + }; +}; diff --git a/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt b/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt index f1c1e21a811099ddd980a566f064d2284729cfcd..2e732152064b5ae51ae33f38dd9f90e3c9fc90a1 100644 --- a/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt +++ b/Documentation/devicetree/bindings/arm/hisilicon/hisilicon.txt @@ -4,6 +4,14 @@ Hi3660 SoC Required root node properties: - compatible = "hisilicon,hi3660"; +Hi3798cv200 SoC +Required root node properties: + - compatible = "hisilicon,hi3798cv200"; + +Hi3798cv200 Poplar Board +Required root node properties: + - compatible = "hisilicon,hi3798cv200-poplar", "hisilicon,hi3798cv200"; + Hi4511 Board Required root node properties: - compatible = "hisilicon,hi3620-hi4511"; diff --git a/Documentation/devicetree/bindings/arm/i2se.txt b/Documentation/devicetree/bindings/arm/i2se.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbd54a3aa07da1f1cd9de71b2ece851803ed4709 --- /dev/null +++ b/Documentation/devicetree/bindings/arm/i2se.txt @@ -0,0 +1,22 @@ +I2SE Device Tree Bindings +------------------------- + +Duckbill Board +Required root node properties: + - compatible = "i2se,duckbill", "fsl,imx28"; + +Duckbill 2 Board +Required root node properties: + - compatible = "i2se,duckbill-2", "fsl,imx28"; + +Duckbill 2 485 Board +Required root node properties: + - compatible = "i2se,duckbill-2-485", "i2se,duckbill-2", "fsl,imx28"; + +Duckbill 2 EnOcean Board +Required root node properties: + - compatible = "i2se,duckbill-2-enocean", "i2se,duckbill-2", "fsl,imx28"; + +Duckbill 2 SPI Board +Required root node properties: + - compatible = "i2se,duckbill-2-spi", "i2se,duckbill-2", "fsl,imx28"; diff --git a/Documentation/devicetree/bindings/arm/l2c2x0.txt
b/Documentation/devicetree/bindings/arm/l2c2x0.txt index 917199f179658248baa37b7f84c20b737b3a5152..d9650c1788f4122199703abe8a89253dc6f1974f 100644 --- a/Documentation/devicetree/bindings/arm/l2c2x0.txt +++ b/Documentation/devicetree/bindings/arm/l2c2x0.txt @@ -90,6 +90,9 @@ Optional properties: - arm,standby-mode: L2 standby mode enable. Value <0> (forcibly disable), <1> (forcibly enable), property absent (OS specific behavior, preferably retain firmware settings) +- arm,early-bresp-disable : Disable the CA9 optimization Early BRESP (PL310) +- arm,full-line-zero-disable : Disable the CA9 optimization Full line of zero + write (PL310) Example: diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt index cb0054ac7121e54a747f147844b3f698c380c0e1..cd977db7630c50a6e118e60edd96256efa138ac9 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,apmixedsys.txt @@ -7,6 +7,7 @@ Required Properties: - compatible: Should be one of: - "mediatek,mt2701-apmixedsys" + - "mediatek,mt6797-apmixedsys" - "mediatek,mt8135-apmixedsys" - "mediatek,mt8173-apmixedsys" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,imgsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,imgsys.txt index f6a916686f4c4a94515d703ebcca0506343a4ba9..047b11ae5f45c0a7e020a231a4e87ce656b3842e 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,imgsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,imgsys.txt @@ -7,6 +7,7 @@ Required Properties: - compatible: Should be one of: - "mediatek,mt2701-imgsys", "syscon" + - "mediatek,mt6797-imgsys", "syscon" - "mediatek,mt8173-imgsys", "syscon" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt index 1620ec2a5a3f12872d828d18fa04e02a9eafd315..58d58e2006b83324502d3d39739ab6537552649b 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,infracfg.txt @@ -8,6 +8,7 @@ Required Properties: - compatible: Should be one of: - "mediatek,mt2701-infracfg", "syscon" + - "mediatek,mt6797-infracfg", "syscon" - "mediatek,mt8135-infracfg", "syscon" - "mediatek,mt8173-infracfg", "syscon" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.txt index 67dd2e473d25fbe7189d788f47edfb3f09d19dcd..70529e0b58e9a15927a552ada8106434c1627100 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,mmsys.txt @@ -7,6 +7,7 @@ Required Properties: - compatible: Should be one of: - "mediatek,mt2701-mmsys", "syscon" + - "mediatek,mt6797-mmsys", "syscon" - "mediatek,mt8173-mmsys", "syscon" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt index 9f2fe7860114d65cb1a07ed70a6ec3242b57303f..ec93ecbb9f3c2fb72bf461d8d57275d8c57ac79a 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,topckgen.txt @@ -7,6 +7,7 @@ Required 
Properties: - compatible: Should be one of: - "mediatek,mt2701-topckgen" + - "mediatek,mt6797-topckgen" - "mediatek,mt8135-topckgen" - "mediatek,mt8173-topckgen" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,vdecsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,vdecsys.txt index 2440f73450c365895a0ec62e6f87c21281d1b0ef..d150104f928a4f7c023a724a3e646eb11b49d83a 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,vdecsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,vdecsys.txt @@ -7,6 +7,7 @@ Required Properties: - compatible: Should be one of: - "mediatek,mt2701-vdecsys", "syscon" + - "mediatek,mt6797-vdecsys", "syscon" - "mediatek,mt8173-vdecsys", "syscon" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/mediatek/mediatek,vencsys.txt b/Documentation/devicetree/bindings/arm/mediatek/mediatek,vencsys.txt index 5bb2866a2b50acebabf714868386aa9223186542..8a93be643647d429c39ba3c978e6e1a02f94d1c4 100644 --- a/Documentation/devicetree/bindings/arm/mediatek/mediatek,vencsys.txt +++ b/Documentation/devicetree/bindings/arm/mediatek/mediatek,vencsys.txt @@ -5,7 +5,8 @@ The Mediatek vencsys controller provides various clocks to the system. Required Properties: -- compatible: Should be: +- compatible: Should be one of: + - "mediatek,mt6797-vencsys", "syscon" - "mediatek,mt8173-vencsys", "syscon" - #clock-cells: Must be 1 diff --git a/Documentation/devicetree/bindings/arm/rockchip.txt b/Documentation/devicetree/bindings/arm/rockchip.txt index cc4ace6397abda9ffa0a7aa02a0fce76eff5c6e0..c965d99e86c2b8413c5b5463d2bbc46966f53005 100644 --- a/Documentation/devicetree/bindings/arm/rockchip.txt +++ b/Documentation/devicetree/bindings/arm/rockchip.txt @@ -1,5 +1,8 @@ Rockchip platforms device tree bindings --------------------------------------- +- Asus Tinker board + Required root node properties: + - compatible = "asus,rk3288-tinker", "rockchip,rk3288"; - Kylin RK3036 board: Required root node properties: @@ -56,6 +59,17 @@ Rockchip platforms device tree bindings - compatible = "google,veyron-brain-rev0", "google,veyron-brain", "google,veyron", "rockchip,rk3288"; +- Google Gru (dev-board): + Required root node properties: + - compatible = "google,gru-rev15", "google,gru-rev14", + "google,gru-rev13", "google,gru-rev12", + "google,gru-rev11", "google,gru-rev10", + "google,gru-rev9", "google,gru-rev8", + "google,gru-rev7", "google,gru-rev6", + "google,gru-rev5", "google,gru-rev4", + "google,gru-rev3", "google,gru-rev2", + "google,gru", "rockchip,rk3399"; + - Google Jaq (Haier Chromebook 11 and more): Required root node properties: - compatible = "google,veyron-jaq-rev5", "google,veyron-jaq-rev4", @@ -70,6 +84,15 @@ Rockchip platforms device tree bindings "google,veyron-jerry-rev3", "google,veyron-jerry", "google,veyron", "rockchip,rk3288"; +- Google Kevin (Samsung Chromebook Plus): + Required root node properties: + - compatible = "google,kevin-rev15", "google,kevin-rev14", + "google,kevin-rev13", "google,kevin-rev12", + "google,kevin-rev11", "google,kevin-rev10", + "google,kevin-rev9", "google,kevin-rev8", + "google,kevin-rev7", "google,kevin-rev6", + "google,kevin", "google,gru", "rockchip,rk3399"; + - Google Mickey (Asus Chromebit CS10): Required root node properties: - compatible = "google,veyron-mickey-rev8", "google,veyron-mickey-rev7", @@ -103,6 +126,10 @@ Rockchip platforms device tree bindings Required root node properties: - compatible = "mqmaker,miqi", "rockchip,rk3288"; +- 
Phytec phyCORE-RK3288: Rapid Development Kit + Required root node properties: + - compatible = "phytec,rk3288-pcm-947", "phytec,rk3288-phycore-som", "rockchip,rk3288"; + - Rockchip PX3 Evaluation board: Required root node properties: - compatible = "rockchip,px3-evb", "rockchip,px3", "rockchip,rk3188"; @@ -134,6 +161,10 @@ Rockchip platforms device tree bindings Required root node properties: - compatible = "rockchip,rk3288-fennec", "rockchip,rk3288"; +- Rockchip RK3328 evb: + Required root node properties: + - compatible = "rockchip,rk3328-evb", "rockchip,rk3328"; + - Rockchip RK3399 evb: Required root node properties: - compatible = "rockchip,rk3399-evb", "rockchip,rk3399"; diff --git a/Documentation/devicetree/bindings/arm/shmobile.txt b/Documentation/devicetree/bindings/arm/shmobile.txt index c9502634316da3b184e883463973a64962fa252d..170fe0562c637eabb738978b909e930ba4fee87a 100644 --- a/Documentation/devicetree/bindings/arm/shmobile.txt +++ b/Documentation/devicetree/bindings/arm/shmobile.txt @@ -13,8 +13,12 @@ SoCs: compatible = "renesas,r8a73a4" - R-Mobile A1 (R8A77400) compatible = "renesas,r8a7740" + - RZ/G1H (R8A77420) + compatible = "renesas,r8a7742" - RZ/G1M (R8A77430) compatible = "renesas,r8a7743" + - RZ/G1N (R8A77440) + compatible = "renesas,r8a7744" - RZ/G1E (R8A77450) compatible = "renesas,r8a7745" - R-Car M1A (R8A77781) diff --git a/Documentation/devicetree/bindings/arm/sprd.txt b/Documentation/devicetree/bindings/arm/sprd.txt index 31a629dc75b8a959a3ad937a60364fa8dddccff0..3df034b13e284338df88af478101b4c4cd1542d0 100644 --- a/Documentation/devicetree/bindings/arm/sprd.txt +++ b/Documentation/devicetree/bindings/arm/sprd.txt @@ -1,11 +1,14 @@ Spreadtrum SoC Platforms Device Tree Bindings ---------------------------------------------------- -Sharkl64 is a Spreadtrum's SoC Platform which is based -on ARM 64-bit processor. +SC9836 openphone Board +Required root node properties: + - compatible = "sprd,sc9836-openphone", "sprd,sc9836"; -SC9836 openphone board with SC9836 SoC based on the -Sharkl64 Platform shall have the following properties. +SC9860 SoC +Required root node properties: + - compatible = "sprd,sc9860" +SP9860G 3GFHD Board Required root node properties: - - compatible = "sprd,sc9836-openphone", "sprd,sc9836"; + - compatible = "sprd,sp9860g-1h10", "sprd,sc9860"; diff --git a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.txt b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.txt new file mode 100644 index 0000000000000000000000000000000000000000..078a58b0302fb5009da58ebf7c76ec7d556098cc --- /dev/null +++ b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra186-pmc.txt @@ -0,0 +1,34 @@ +NVIDIA Tegra Power Management Controller (PMC) + +Required properties: +- compatible: Should contain one of the following: + - "nvidia,tegra186-pmc": for Tegra186 +- reg: Must contain an (offset, length) pair of the register set for each + entry in reg-names. +- reg-names: Must include the following entries: + - "pmc" + - "wake" + - "aotag" + - "scratch" + +Optional properties: +- nvidia,invert-interrupt: If present, inverts the PMU interrupt signal. 
+ +Example: + +SoC DTSI: + + pmc@c360000 { + compatible = "nvidia,tegra186-pmc"; + reg = <0 0x0c360000 0 0x10000>, + <0 0x0c370000 0 0x10000>, + <0 0x0c380000 0 0x10000>, + <0 0x0c390000 0 0x10000>; + reg-names = "pmc", "wake", "aotag", "scratch"; + }; + +Board DTS: + + pmc@c360000 { + nvidia,invert-interrupt; + }; diff --git a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-flowctrl.txt b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-flowctrl.txt index ccf0adddc8205e3f47df8ff99af43271f7792a99..a855c1bffc0f6695d5c0cd3f032019ea32d9d5e8 100644 --- a/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-flowctrl.txt +++ b/Documentation/devicetree/bindings/arm/tegra/nvidia,tegra20-flowctrl.txt @@ -1,7 +1,13 @@ NVIDIA Tegra Flow Controller Required properties: -- compatible: Should be "nvidia,tegra-flowctrl" +- compatible: Should contain one of the following: + - "nvidia,tegra20-flowctrl": for Tegra20 + - "nvidia,tegra30-flowctrl": for Tegra30 + - "nvidia,tegra114-flowctrl": for Tegra114 + - "nvidia,tegra124-flowctrl": for Tegra124 + - "nvidia,tegra132-flowctrl", "nvidia,tegra124-flowctrl": for Tegra132 + - "nvidia,tegra210-flowctrl": for Tegra210 - reg: Should contain one register range (address and length) Example: diff --git a/Documentation/devicetree/bindings/ata/ahci-dm816.txt b/Documentation/devicetree/bindings/ata/ahci-dm816.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8c535f3541f57d4cbf7da9e938e0cebaade608c --- /dev/null +++ b/Documentation/devicetree/bindings/ata/ahci-dm816.txt @@ -0,0 +1,21 @@ +Device tree binding for the TI DM816 AHCI SATA Controller +--------------------------------------------------------- + +Required properties: + - compatible: must be "ti,dm816-ahci" + - reg: physical base address and size of the register region used by + the controller (as defined by the AHCI 1.1 standard) + - interrupts: interrupt specifier (refer to the interrupt binding) + - clocks: list of phandle and clock specifier pairs (or only + phandles for clock providers with '0' defined for + #clock-cells); two clocks must be specified: the functional + clock and an external reference clock + +Example: + + sata: sata@4a140000 { + compatible = "ti,dm816-ahci"; + reg = <0x4a140000 0x10000>; + interrupts = <16>; + clocks = <&sysclk5_ck>, <&sata_refclk>; + }; diff --git a/Documentation/devicetree/bindings/auxdisplay/hit,hd44780.txt b/Documentation/devicetree/bindings/auxdisplay/hit,hd44780.txt new file mode 100644 index 0000000000000000000000000000000000000000..2aa24b8899236882e2c74c99657bfa0d64b091ba --- /dev/null +++ b/Documentation/devicetree/bindings/auxdisplay/hit,hd44780.txt @@ -0,0 +1,45 @@ +DT bindings for the Hitachi HD44780 Character LCD Controller + +The Hitachi HD44780 Character LCD Controller is commonly used on character LCDs +that can display one or more lines of text. It exposes an M6800 bus interface, +which can be used in either 4-bit or 8-bit mode.
+ +Required properties: + - compatible: Must contain "hit,hd44780", + - data-gpios: Must contain an array of either 4 or 8 GPIO specifiers, + referring to the GPIO pins connected to the data signal lines DB0-DB7 + (8-bit mode) or DB4-DB7 (4-bit mode) of the LCD Controller's bus interface, + - enable-gpios: Must contain a GPIO specifier, referring to the GPIO pin + connected to the "E" (Enable) signal line of the LCD Controller's bus + interface, + - rs-gpios: Must contain a GPIO specifier, referring to the GPIO pin + connected to the "RS" (Register Select) signal line of the LCD Controller's + bus interface, + - display-height-chars: Height of the display, in character cells, + - display-width-chars: Width of the display, in character cells. + +Optional properties: + - rw-gpios: Must contain a GPIO specifier, referring to the GPIO pin + connected to the "RW" (Read/Write) signal line of the LCD Controller's bus + interface, + - backlight-gpios: Must contain a GPIO specifier, referring to the GPIO pin + used for enabling the LCD's backlight, + - internal-buffer-width: Internal buffer width (default is 40 for displays + with 1 or 2 lines, and display-width-chars for displays with more than 2 + lines). + +Example: + + auxdisplay { + compatible = "hit,hd44780"; + + data-gpios = <&hc595 0 GPIO_ACTIVE_HIGH>, + <&hc595 1 GPIO_ACTIVE_HIGH>, + <&hc595 2 GPIO_ACTIVE_HIGH>, + <&hc595 3 GPIO_ACTIVE_HIGH>; + enable-gpios = <&hc595 4 GPIO_ACTIVE_HIGH>; + rs-gpios = <&hc595 5 GPIO_ACTIVE_HIGH>; + + display-height-chars = <2>; + display-width-chars = <16>; + }; diff --git a/Documentation/devicetree/bindings/chosen.txt b/Documentation/devicetree/bindings/chosen.txt index 6ae9d82d4c378844505c4b8184b60d429ba95a9e..b5e39af4ddc0d6330dbf19ed768d6f15e1d2ba6c 100644 --- a/Documentation/devicetree/bindings/chosen.txt +++ b/Documentation/devicetree/bindings/chosen.txt @@ -52,3 +52,48 @@ This property is set (currently only on PowerPC, and only needed on book3e) by some versions of kexec-tools to tell the new kernel that it is being booted by kexec, as the booting environment may differ (e.g. a different secondary CPU release mechanism) + +linux,usable-memory-range +------------------------- + +This property (arm64 only) holds a base address and size, describing a +limited region in which memory may be considered available for use by +the kernel. Memory outside of this range is not available for use. + +This property describes a limitation: memory within this range is only +valid when also described through another mechanism that the kernel +would otherwise use to determine available memory (e.g. memory nodes +or the EFI memory map). Valid memory may be sparse within the range. +e.g. + +/ { + chosen { + linux,usable-memory-range = <0x9 0xf0000000 0x0 0x10000000>; + }; +}; + +The main usage is for a crash dump kernel to identify its own usable +memory and exclude, at its boot time, any other memory areas that are +part of the panicked kernel's memory. + +While this property does not represent real hardware, the address +and the size are expressed in #address-cells and #size-cells, +respectively, of the root node. + +linux,elfcorehdr +---------------- + +This property (currently used only on arm64) holds the memory range, +the address and the size, of the elf core header, which mainly describes +the panicked kernel's memory layout as PT_LOAD segments of elf format. +e.g.
+ +/ { + chosen { + linux,elfcorehdr = <0x9 0xfffff000 0x0 0x800>; + }; +}; + +While this property does not represent real hardware, the address +and the size are expressed in #address-cells and #size-cells, +respectively, of the root node. diff --git a/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt b/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt index ce06435d28ed9a036cdb2283f159acd6b3ac8b4c..a09d627b5508a2f8c9905dc9c02c09387acfe9e6 100644 --- a/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt +++ b/Documentation/devicetree/bindings/clock/amlogic,gxbb-clkc.txt @@ -5,7 +5,8 @@ controllers within the SoC. Required Properties: -- compatible: should be "amlogic,gxbb-clkc" +- compatible: should be "amlogic,gxbb-clkc" for the GXBB SoC, + or "amlogic,gxl-clkc" for GXL and GXM SoCs. - reg: physical base address of the clock controller and length of memory mapped region. diff --git a/Documentation/devicetree/bindings/clock/armada3700-xtal-clock.txt b/Documentation/devicetree/bindings/clock/armada3700-xtal-clock.txt index a88f1f05fbd6394dcc451e5bacb77c1220a6f3c3..4c0807f28cfaa119a55a6f48fa061daca01f1f60 100644 --- a/Documentation/devicetree/bindings/clock/armada3700-xtal-clock.txt +++ b/Documentation/devicetree/bindings/clock/armada3700-xtal-clock.txt @@ -5,6 +5,7 @@ reading the gpio latch register. This node must be a subnode of the node exposing the register address of the GPIO block where the gpio latch is located. +See Documentation/devicetree/bindings/pinctrl/marvell,armada-37xx-pinctrl.txt Required properties: - compatible : shall be one of the following: @@ -16,9 +17,9 @@ Optional properties: output names ("xtal") Example: -gpio1: gpio@13800 { - compatible = "marvell,armada-3700-gpio", "syscon", "simple-mfd"; - reg = <0x13800 0x1000>; +pinctrl_nb: pinctrl-nb@13800 { - compatible = "armada3710-nb-pinctrl", "syscon", "simple-mfd"; + reg = <0x13800 0x100>, <0x13C00 0x20>; xtalclk: xtal-clk { compatible = "marvell,armada-3700-xtal-clock"; diff --git a/Documentation/devicetree/bindings/clock/idt,versaclock5.txt b/Documentation/devicetree/bindings/clock/idt,versaclock5.txt index 87e9c47a89a399fc3d8ef30623697d58cd956741..53d7e50ed875ae0db438af18f9f3354c2548c705 100644 --- a/Documentation/devicetree/bindings/clock/idt,versaclock5.txt +++ b/Documentation/devicetree/bindings/clock/idt,versaclock5.txt @@ -6,18 +6,21 @@ from 3 to 12 output clocks. ==I2C device node== Required properties: -- compatible: shall be one of "idt,5p49v5923" , "idt,5p49v5933". +- compatible: shall be one of "idt,5p49v5923", "idt,5p49v5933", + "idt,5p49v5935". - reg: i2c device address, shall be 0x68 or 0x6a. - #clock-cells: from common clock binding; shall be set to 1. - clocks: from common clock binding; list of parent clock handles, - 5p49v5923: (required) either or both of XTAL or CLKIN reference clock. - - 5p49v5933: (optional) property not present (internal + - 5p49v5933 and + - 5p49v5935: (optional) property not present (internal Xtal used) or CLKIN reference clock. - clock-names: from common clock binding; clock input names, can be - 5p49v5923: (required) either or both of "xin", "clkin". - - 5p49v5933: (optional) property not present or "clkin". + - 5p49v5933 and + - 5p49v5935: (optional) property not present or "clkin".
==Mapping between clock specifier and physical pins== @@ -34,6 +37,13 @@ clock specifier, the following mapping applies: 1 -- OUT1 2 -- OUT4 +5P49V5935: + 0 -- OUT0_SEL_I2CB + 1 -- OUT1 + 2 -- OUT2 + 3 -- OUT3 + 4 -- OUT4 + ==Example== /* 25MHz reference crystal */ diff --git a/Documentation/devicetree/bindings/clock/mvebu-core-clock.txt b/Documentation/devicetree/bindings/clock/mvebu-core-clock.txt index eb985a633d595b98fe3d861e06b4efe468e25d9c..796c260c183d6af6d202752d453beb6363109c97 100644 --- a/Documentation/devicetree/bindings/clock/mvebu-core-clock.txt +++ b/Documentation/devicetree/bindings/clock/mvebu-core-clock.txt @@ -31,6 +31,12 @@ The following is a list of provided IDs and clock names on Armada 39x: 4 = dclk (SDRAM Interface Clock) 5 = refclk (Reference Clock) +The following is a list of provided IDs and clock names on 98dx3236: + 0 = tclk (Internal Bus clock) + 1 = cpuclk (CPU clock) + 2 = ddrclk (DDR clock) + 3 = mpll (MPLL Clock) + The following is a list of provided IDs and clock names on Kirkwood and Dove: 0 = tclk (Internal Bus clock) 1 = cpuclk (CPU0 clock) @@ -49,6 +55,7 @@ Required properties: "marvell,armada-380-core-clock" - For Armada 380/385 SoC core clocks "marvell,armada-390-core-clock" - For Armada 39x SoC core clocks "marvell,armada-xp-core-clock" - For Armada XP SoC core clocks + "marvell,mv98dx3236-core-clock" - For 98dx3236 family SoC core clocks "marvell,dove-core-clock" - for Dove SoC core clocks "marvell,kirkwood-core-clock" - for Kirkwood SoC (except mv88f6180) "marvell,mv88f6180-core-clock" - for Kirkwood MV88f6180 SoC diff --git a/Documentation/devicetree/bindings/clock/mvebu-gated-clock.txt b/Documentation/devicetree/bindings/clock/mvebu-gated-clock.txt index 5142efc8099d24c927859cac7ca4d317853bfbdf..de562da2ae77db48639d1a775ca5efd489e5de04 100644 --- a/Documentation/devicetree/bindings/clock/mvebu-gated-clock.txt +++ b/Documentation/devicetree/bindings/clock/mvebu-gated-clock.txt @@ -119,6 +119,16 @@ ID Clock Peripheral 29 sata1lnk 30 sata1 SATA Host 1 +The following is a list of provided IDs for 98dx3236: +ID Clock Peripheral +----------------------------------- +3 ge1 Gigabit Ethernet 1 +4 ge0 Gigabit Ethernet 0 +5 pex0 PCIe Cntrl 0 +17 sdio SDHCI Host +18 usb0 USB Host 0 +22 xor0 XOR DMA 0 + The following is a list of provided IDs for Dove: ID Clock Peripheral ----------------------------------- @@ -169,6 +179,7 @@ Required properties: "marvell,armada-380-gating-clock" - for Armada 380/385 SoC clock gating "marvell,armada-390-gating-clock" - for Armada 39x SoC clock gating "marvell,armada-xp-gating-clock" - for Armada XP SoC clock gating + "marvell,mv98dx3236-gating-clock" - for 98dx3236 SoC clock gating "marvell,dove-gating-clock" - for Dove SoC clock gating "marvell,kirkwood-gating-clock" - for Kirkwood SoC clock gating - reg : shall be the register address of the Clock Gating Control register diff --git a/Documentation/devicetree/bindings/clock/qoriq-clock.txt b/Documentation/devicetree/bindings/clock/qoriq-clock.txt index aa3526f229a721435766bb7ac3d6a82dd14feaa9..6ed469c66b32f42c4bfb62849ce5451618d5067e 100644 --- a/Documentation/devicetree/bindings/clock/qoriq-clock.txt +++ b/Documentation/devicetree/bindings/clock/qoriq-clock.txt @@ -35,6 +35,7 @@ Required properties: * "fsl,ls1021a-clockgen" * "fsl,ls1043a-clockgen" * "fsl,ls1046a-clockgen" + * "fsl,ls1088a-clockgen" * "fsl,ls2080a-clockgen" Chassis-version clock strings include: * "fsl,qoriq-clockgen-1.0": for chassis 1.0 clocks diff --git 
a/Documentation/devicetree/bindings/clock/rockchip,rk1108-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rk1108-cru.txt deleted file mode 100644 index 4da126116cf007602bee3107c3222a537f7af856..0000000000000000000000000000000000000000 --- a/Documentation/devicetree/bindings/clock/rockchip,rk1108-cru.txt +++ /dev/null @@ -1,59 +0,0 @@ -* Rockchip RK1108 Clock and Reset Unit - -The RK1108 clock controller generates and supplies clock to various -controllers within the SoC and also implements a reset controller for SoC -peripherals. - -Required Properties: - -- compatible: should be "rockchip,rk1108-cru" -- reg: physical base address of the controller and length of memory mapped - region. -- #clock-cells: should be 1. -- #reset-cells: should be 1. - -Optional Properties: - -- rockchip,grf: phandle to the syscon managing the "general register files" - If missing pll rates are not changeable, due to the missing pll lock status. - -Each clock is assigned an identifier and client nodes can use this identifier -to specify the clock which they consume. All available clocks are defined as -preprocessor macros in the dt-bindings/clock/rk1108-cru.h headers and can be -used in device tree sources. Similar macros exist for the reset sources in -these files. - -External clocks: - -There are several clocks that are generated outside the SoC. It is expected -that they are defined using standard clock bindings with following -clock-output-names: - - "xin24m" - crystal input - required, - - "ext_vip" - external VIP clock - optional - - "ext_i2s" - external I2S clock - optional - - "ext_gmac" - external GMAC clock - optional - - "hdmiphy" - external clock input derived from HDMI PHY - optional - - "usbphy" - external clock input derived from USB PHY - optional - -Example: Clock controller node: - - cru: cru@20200000 { - compatible = "rockchip,rk1108-cru"; - reg = <0x20200000 0x1000>; - rockchip,grf = <&grf>; - - #clock-cells = <1>; - #reset-cells = <1>; - }; - -Example: UART controller node that consumes the clock generated by the clock - controller: - - uart0: serial@10230000 { - compatible = "rockchip,rk1108-uart", "snps,dw-apb-uart"; - reg = <0x10230000 0x100>; - interrupts = ; - reg-shift = <2>; - reg-io-width = <4>; - clocks = <&cru SCLK_UART0>; - }; diff --git a/Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.txt b/Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.txt new file mode 100644 index 0000000000000000000000000000000000000000..161326a4f9c1feebe41d0ee38e8089c769043dd1 --- /dev/null +++ b/Documentation/devicetree/bindings/clock/rockchip,rv1108-cru.txt @@ -0,0 +1,59 @@ +* Rockchip RV1108 Clock and Reset Unit + +The RV1108 clock controller generates and supplies clock to various +controllers within the SoC and also implements a reset controller for SoC +peripherals. + +Required Properties: + +- compatible: should be "rockchip,rv1108-cru" +- reg: physical base address of the controller and length of memory mapped + region. +- #clock-cells: should be 1. +- #reset-cells: should be 1. + +Optional Properties: + +- rockchip,grf: phandle to the syscon managing the "general register files" + If missing pll rates are not changeable, due to the missing pll lock status. + +Each clock is assigned an identifier and client nodes can use this identifier +to specify the clock which they consume. All available clocks are defined as +preprocessor macros in the dt-bindings/clock/rv1108-cru.h headers and can be +used in device tree sources. 
Similar macros exist for the reset sources in +these files. + +External clocks: + +There are several clocks that are generated outside the SoC. It is expected +that they are defined using standard clock bindings with following +clock-output-names: + - "xin24m" - crystal input - required, + - "ext_vip" - external VIP clock - optional + - "ext_i2s" - external I2S clock - optional + - "ext_gmac" - external GMAC clock - optional + - "hdmiphy" - external clock input derived from HDMI PHY - optional + - "usbphy" - external clock input derived from USB PHY - optional + +Example: Clock controller node: + + cru: cru@20200000 { + compatible = "rockchip,rv1108-cru"; + reg = <0x20200000 0x1000>; + rockchip,grf = <&grf>; + + #clock-cells = <1>; + #reset-cells = <1>; + }; + +Example: UART controller node that consumes the clock generated by the clock + controller: + + uart0: serial@10230000 { + compatible = "rockchip,rv1108-uart", "snps,dw-apb-uart"; + reg = <0x10230000 0x100>; + interrupts = ; + reg-shift = <2>; + reg-io-width = <4>; + clocks = <&cru SCLK_UART0>; + }; diff --git a/Documentation/devicetree/bindings/clock/sunxi-ccu.txt b/Documentation/devicetree/bindings/clock/sunxi-ccu.txt index bae5668cf427836642a81ebe2e1d9b8b386bee71..e9c5a1d9834af600de55821bc0077e6235ff0bff 100644 --- a/Documentation/devicetree/bindings/clock/sunxi-ccu.txt +++ b/Documentation/devicetree/bindings/clock/sunxi-ccu.txt @@ -7,9 +7,12 @@ Required properties : - "allwinner,sun8i-a23-ccu" - "allwinner,sun8i-a33-ccu" - "allwinner,sun8i-h3-ccu" + - "allwinner,sun8i-h3-r-ccu" - "allwinner,sun8i-v3s-ccu" - "allwinner,sun9i-a80-ccu" - "allwinner,sun50i-a64-ccu" + - "allwinner,sun50i-a64-r-ccu" + - "allwinner,sun50i-h5-ccu" - reg: Must contain the registers base address and length - clocks: phandle to the oscillators feeding the CCU. Two are needed: @@ -19,7 +22,10 @@ Required properties : - #clock-cells : must contain 1 - #reset-cells : must contain 1 -Example: +For the PRCM CCUs on H3/A64, one more clock is needed: +- "iosc": the SoC's internal frequency oscillator + +Example for generic CCU: ccu: clock@01c20000 { compatible = "allwinner,sun8i-h3-ccu"; reg = <0x01c20000 0x400>; @@ -28,3 +34,13 @@ ccu: clock@01c20000 { #clock-cells = <1>; #reset-cells = <1>; }; + +Example for PRCM CCU: +r_ccu: clock@01f01400 { + compatible = "allwinner,sun50i-a64-r-ccu"; + reg = <0x01f01400 0x100>; + clocks = <&osc24M>, <&osc32k>, <&iosc>; + clock-names = "hosc", "losc", "iosc"; + #clock-cells = <1>; + #reset-cells = <1>; +}; diff --git a/Documentation/devicetree/bindings/crypto/st,stm32-crc.txt b/Documentation/devicetree/bindings/crypto/st,stm32-crc.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ba92a5e9b3617e0ceb86af03f6cf288d278ae73 --- /dev/null +++ b/Documentation/devicetree/bindings/crypto/st,stm32-crc.txt @@ -0,0 +1,16 @@ +* STMicroelectronics STM32 CRC + +Required properties: +- compatible: Should be "st,stm32f7-crc". 
+- reg: The address and length of the peripheral registers space +- clocks: The input clock of the CRC instance + +Optional properties: none + +Example: + +crc: crc@40023000 { + compatible = "st,stm32f7-crc"; + reg = <0x40023000 0x400>; + clocks = <&rcc 0 12>; +}; diff --git a/Documentation/devicetree/bindings/devfreq/exynos-bus.txt b/Documentation/devicetree/bindings/devfreq/exynos-bus.txt index d085ef90d27c1f8b82645177169f71c3eb42d00f..f8e946471a58d7bb6ac07d22cf6a876d8c509410 100644 --- a/Documentation/devicetree/bindings/devfreq/exynos-bus.txt +++ b/Documentation/devicetree/bindings/devfreq/exynos-bus.txt @@ -202,23 +202,23 @@ Example2 : compatible = "operating-points-v2"; opp-shared; - opp@50000000 { + opp-50000000 { opp-hz = /bits/ 64 <50000000>; opp-microvolt = <800000>; }; - opp@100000000 { + opp-100000000 { opp-hz = /bits/ 64 <100000000>; opp-microvolt = <800000>; }; - opp@134000000 { + opp-134000000 { opp-hz = /bits/ 64 <134000000>; opp-microvolt = <800000>; }; - opp@200000000 { + opp-200000000 { opp-hz = /bits/ 64 <200000000>; opp-microvolt = <825000>; }; - opp@400000000 { + opp-400000000 { opp-hz = /bits/ 64 <400000000>; opp-microvolt = <875000>; }; @@ -292,23 +292,23 @@ Example2 : compatible = "operating-points-v2"; opp-shared; - opp@50000000 { + opp-50000000 { opp-hz = /bits/ 64 <50000000>; opp-microvolt = <900000>; }; - opp@80000000 { + opp-80000000 { opp-hz = /bits/ 64 <80000000>; opp-microvolt = <900000>; }; - opp@100000000 { + opp-100000000 { opp-hz = /bits/ 64 <100000000>; opp-microvolt = <1000000>; }; - opp@134000000 { + opp-134000000 { opp-hz = /bits/ 64 <134000000>; opp-microvolt = <1000000>; }; - opp@200000000 { + opp-200000000 { opp-hz = /bits/ 64 <200000000>; opp-microvolt = <1000000>; }; @@ -318,19 +318,19 @@ Example2 : compatible = "operating-points-v2"; opp-shared; - opp@50000000 { + opp-50000000 { opp-hz = /bits/ 64 <50000000>; }; - opp@80000000 { + opp-80000000 { opp-hz = /bits/ 64 <80000000>; }; - opp@100000000 { + opp-100000000 { opp-hz = /bits/ 64 <100000000>; }; - opp@200000000 { + opp-200000000 { opp-hz = /bits/ 64 <200000000>; }; - opp@400000000 { + opp-400000000 { opp-hz = /bits/ 64 <400000000>; }; }; @@ -339,19 +339,19 @@ Example2 : compatible = "operating-points-v2"; opp-shared; - opp@50000000 { + opp-50000000 { opp-hz = /bits/ 64 <50000000>; }; - opp@80000000 { + opp-80000000 { opp-hz = /bits/ 64 <80000000>; }; - opp@100000000 { + opp-100000000 { opp-hz = /bits/ 64 <100000000>; }; - opp@200000000 { + opp-200000000 { opp-hz = /bits/ 64 <200000000>; }; - opp@300000000 { + opp-300000000 { opp-hz = /bits/ 64 <300000000>; }; }; @@ -360,13 +360,13 @@ Example2 : compatible = "operating-points-v2"; opp-shared; - opp@50000000 { + opp-50000000 { opp-hz = /bits/ 64 <50000000>; }; - opp@80000000 { + opp-80000000 { opp-hz = /bits/ 64 <80000000>; }; - opp@100000000 { + opp-100000000 { opp-hz = /bits/ 64 <100000000>; }; }; diff --git a/Documentation/devicetree/bindings/display/atmel/hlcdc-dc.txt b/Documentation/devicetree/bindings/display/atmel/hlcdc-dc.txt index ebc1a914bda339922cd87d6f83883a0a18eaaa24..ec94468b35beca724152ab48c4f7630e3383ef2c 100644 --- a/Documentation/devicetree/bindings/display/atmel/hlcdc-dc.txt +++ b/Documentation/devicetree/bindings/display/atmel/hlcdc-dc.txt @@ -1,7 +1,7 @@ Device-Tree bindings for Atmel's HLCDC (High LCD Controller) DRM driver The Atmel HLCDC Display Controller is subdevice of the HLCDC MFD device. -See ../mfd/atmel-hlcdc.txt for more details. +See ../../mfd/atmel-hlcdc.txt for more details. 
Required properties: - compatible: value should be "atmel,hlcdc-display-controller" diff --git a/Documentation/devicetree/bindings/display/imx/fsl,imx-fb.txt b/Documentation/devicetree/bindings/display/imx/fsl,imx-fb.txt index 7a5c0e204c8edf031c263a4129966445fdd9d0f9..e5a8b363d82972dc64133b69be7cc190f4470f56 100644 --- a/Documentation/devicetree/bindings/display/imx/fsl,imx-fb.txt +++ b/Documentation/devicetree/bindings/display/imx/fsl,imx-fb.txt @@ -13,6 +13,8 @@ Required nodes: Additionally, the display node has to define properties: - bits-per-pixel: Bits per pixel - fsl,pcr: LCDC PCR value + A display node may optionally define + - fsl,aus-mode: boolean to enable AUS mode (only for imx21) Optional properties: - lcd-supply: Regulator for LCD supply voltage. diff --git a/Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt b/Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt index b82c004494681341bdef251e9323bb1df4773581..57a8d061006276730b07ff1ebf9867f52841e673 100644 --- a/Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt +++ b/Documentation/devicetree/bindings/display/sunxi/sun4i-drm.txt @@ -94,6 +94,7 @@ Required properties: * allwinner,sun6i-a31-display-backend * allwinner,sun8i-a33-display-backend - reg: base address and size of the memory-mapped region. + - interrupts: interrupt associated with this IP - clocks: phandles to the clocks feeding the frontend and backend * ahb: the backend interface clock * mod: the backend module clock @@ -265,6 +266,7 @@ fe0: display-frontend@1e00000 { be0: display-backend@1e60000 { compatible = "allwinner,sun5i-a13-display-backend"; reg = <0x01e60000 0x10000>; + interrupts = <47>; clocks = <&ahb_gates 44>, <&de_be_clk>, <&dram_gates 26>; clock-names = "ahb", "mod", diff --git a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt index 0fad7ed2ea191201af1a136d1fee19bd94e67ff2..74e1e8add5a1264bfc2ed8db1b26667ed8906903 100644 --- a/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt +++ b/Documentation/devicetree/bindings/display/tegra/nvidia,tegra20-host1x.txt @@ -249,6 +249,19 @@ of the following host1x client modules: See ../pinctrl/nvidia,tegra124-dpaux-padctl.txt for information regarding the DPAUX pad controller bindings. +- vic: Video Image Compositor + - compatible : "nvidia,tegra-vic" + - reg: Physical base address and length of the controller's registers. + - interrupts: The interrupt outputs from the controller. + - clocks: Must contain an entry for each entry in clock-names. + See ../clocks/clock-bindings.txt for details. + - clock-names: Must include the following entries: + - vic: clock input for the VIC hardware + - resets: Must contain an entry for each entry in reset-names. + See ../reset/reset.txt for details. + - reset-names: Must include the following entries: + - vic + Example: / { diff --git a/Documentation/devicetree/bindings/firmware/coreboot.txt b/Documentation/devicetree/bindings/firmware/coreboot.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c955703cea87f733494d7ac84d87d1198cf0fb7 --- /dev/null +++ b/Documentation/devicetree/bindings/firmware/coreboot.txt @@ -0,0 +1,33 @@ +COREBOOT firmware information + +This node communicates the location of coreboot's memory-resident +bookkeeping structures to the kernel.
Since coreboot itself cannot boot a +device-tree-based kernel (yet), this node needs to be inserted by a +second-stage bootloader (a coreboot "payload"). + +Required properties: + - compatible: Should be "coreboot" + - reg: Address and length of the following two memory regions, in order: + 1.) The coreboot table. This is a list of variable-sized descriptors + that contain various compile- and run-time generated firmware + parameters. It is identified by the magic string "LBIO" in its first + four bytes. + See coreboot's src/commonlib/include/commonlib/coreboot_tables.h for + details. + 2.) The CBMEM area. This is a downward-growing memory region used by + coreboot to dynamically allocate data structures that remain resident. + It may or may not include the coreboot table as one of its members. It + is identified by a root node descriptor with the magic number + 0xc0389481 that resides in the topmost 8 bytes of the area. + See coreboot's src/include/imd.h for details. + +Example: + firmware { + ranges; + + coreboot { + compatible = "coreboot"; + reg = <0xfdfea000 0x264>, + <0xfdfea000 0x16000>; + }; + }; diff --git a/Documentation/devicetree/bindings/fpga/altera-pr-ip.txt b/Documentation/devicetree/bindings/fpga/altera-pr-ip.txt new file mode 100644 index 0000000000000000000000000000000000000000..52a294cf2730537c0b925cb16b0845b85c0195b1 --- /dev/null +++ b/Documentation/devicetree/bindings/fpga/altera-pr-ip.txt @@ -0,0 +1,12 @@ +Altera Arria10 Partial Reconfiguration IP + +Required properties: +- compatible : should contain "altr,a10-pr-ip" +- reg : base address and size for memory mapped io. + +Example: + + fpga_mgr: fpga-mgr@ff20c000 { + compatible = "altr,a10-pr-ip"; + reg = <0xff20c000 0x10>; + }; diff --git a/Documentation/devicetree/bindings/fpga/fpga-region.txt b/Documentation/devicetree/bindings/fpga/fpga-region.txt index 3b32ba15a71730838fb33bebb1365013b3c3c74d..6db8aeda461aebc0155326ae68a136fce19e2898 100644 --- a/Documentation/devicetree/bindings/fpga/fpga-region.txt +++ b/Documentation/devicetree/bindings/fpga/fpga-region.txt @@ -186,12 +186,15 @@ Optional properties: otherwise full reconfiguration is done. - external-fpga-config : boolean, set if the FPGA has already been configured prior to OS boot up. +- encrypted-fpga-config : boolean, set if the bitstream is encrypted - region-unfreeze-timeout-us : The maximum time in microseconds to wait for bridges to successfully become enabled after the region has been programmed. - region-freeze-timeout-us : The maximum time in microseconds to wait for bridges to successfully become disabled before the region has been programmed. +- config-complete-timeout-us : The maximum time in microseconds for the + FPGA to go to operating mode after the region has been programmed. - child nodes : devices in the FPGA after programming.
In the example below, when an overlay is applied targeting fpga-region0, diff --git a/Documentation/devicetree/bindings/fpga/lattice-ice40-fpga-mgr.txt b/Documentation/devicetree/bindings/fpga/lattice-ice40-fpga-mgr.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dc412437b086d6470038f60480b7af03ddf7e3d --- /dev/null +++ b/Documentation/devicetree/bindings/fpga/lattice-ice40-fpga-mgr.txt @@ -0,0 +1,21 @@ +Lattice iCE40 FPGA Manager + +Required properties: +- compatible: Should contain "lattice,ice40-fpga-mgr" +- reg: SPI chip select +- spi-max-frequency: Maximum SPI frequency (>=1000000, <=25000000) +- cdone-gpios: GPIO input connected to CDONE pin +- reset-gpios: Active-low GPIO output connected to CRESET_B pin. Note + that unless the GPIO is held low during startup, the + FPGA will enter Master SPI mode and drive SCK with a + clock signal, potentially jamming other devices on the + bus until the firmware is loaded. + +Example: + fpga: fpga@0 { + compatible = "lattice,ice40-fpga-mgr"; + reg = <0>; + spi-max-frequency = <1000000>; + cdone-gpios = <&gpio 24 GPIO_ACTIVE_HIGH>; + reset-gpios = <&gpio 22 GPIO_ACTIVE_LOW>; + }; diff --git a/Documentation/devicetree/bindings/fpga/xilinx-slave-serial.txt b/Documentation/devicetree/bindings/fpga/xilinx-slave-serial.txt new file mode 100644 index 0000000000000000000000000000000000000000..9766f7472f512fec9652fb15d8d74f1f1f5133c7 --- /dev/null +++ b/Documentation/devicetree/bindings/fpga/xilinx-slave-serial.txt @@ -0,0 +1,44 @@ +Xilinx Slave Serial SPI FPGA Manager + +Xilinx Spartan-6 FPGAs support a method of loading the bitstream over +what is referred to as a "slave serial" interface. +The slave serial link is not technically SPI, and might require extra +circuits in order to play nicely with other SPI slaves on the same bus.
+ +See https://www.xilinx.com/support/documentation/user_guides/ug380.pdf + +Required properties: +- compatible: should contain "xlnx,fpga-slave-serial" +- reg: spi chip select of the FPGA +- prog_b-gpios: config pin (referred to as PROGRAM_B in the manual) +- done-gpios: config status pin (referred to as DONE in the manual) + +Example for full FPGA configuration: + + fpga-region0 { + compatible = "fpga-region"; + fpga-mgr = <&fpga_mgr_spi>; + #address-cells = <0x1>; + #size-cells = <0x1>; + }; + + spi1: spi@10680 { + compatible = "marvell,armada-xp-spi", "marvell,orion-spi"; + pinctrl-0 = <&spi0_pins>; + pinctrl-names = "default"; + #address-cells = <1>; + #size-cells = <0>; + cell-index = <1>; + interrupts = <92>; + clocks = <&coreclk 0>; + status = "okay"; + + fpga_mgr_spi: fpga-mgr@0 { + compatible = "xlnx,fpga-slave-serial"; + spi-max-frequency = <60000000>; + spi-cpha; + reg = <0>; + done-gpios = <&gpio0 9 GPIO_ACTIVE_HIGH>; + prog_b-gpios = <&gpio0 29 GPIO_ACTIVE_LOW>; + }; + }; diff --git a/Documentation/devicetree/bindings/gpio/cortina,gemini-gpio.txt b/Documentation/devicetree/bindings/gpio/cortina,gemini-gpio.txt deleted file mode 100644 index 5c9246c054e5bae6c0c552c52e76f84afa3eceb4..0000000000000000000000000000000000000000 --- a/Documentation/devicetree/bindings/gpio/cortina,gemini-gpio.txt +++ /dev/null @@ -1,24 +0,0 @@ -Cortina Systems Gemini GPIO Controller - -Required properties: - -- compatible : Must be "cortina,gemini-gpio" -- reg : Should contain registers location and length -- interrupts : Should contain the interrupt line for the GPIO block -- gpio-controller : marks this as a GPIO controller -- #gpio-cells : Should be 2, see gpio/gpio.txt -- interrupt-controller : marks this as an interrupt controller -- #interrupt-cells : a standard two-cell interrupt flag, see - interrupt-controller/interrupts.txt - -Example: - -gpio@4d000000 { - compatible = "cortina,gemini-gpio"; - reg = <0x4d000000 0x100>; - interrupts = <22 IRQ_TYPE_LEVEL_HIGH>; - gpio-controller; - #gpio-cells = <2>; - interrupt-controller; - #interrupt-cells = <2>; -}; diff --git a/Documentation/devicetree/bindings/gpio/faraday,ftgpio010.txt b/Documentation/devicetree/bindings/gpio/faraday,ftgpio010.txt new file mode 100644 index 0000000000000000000000000000000000000000..d04236558619d84b0b1b99351ac72803ac4b2ca9 --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/faraday,ftgpio010.txt @@ -0,0 +1,27 @@ +Faraday Technology FTGPIO010 GPIO Controller + +Required properties: + +- compatible : Should be one of + "cortina,gemini-gpio", "faraday,ftgpio010" + "moxa,moxart-gpio", "faraday,ftgpio010" + "faraday,ftgpio010" +- reg : Should contain registers location and length +- interrupts : Should contain the interrupt line for the GPIO block +- gpio-controller : marks this as a GPIO controller +- #gpio-cells : Should be 2, see gpio/gpio.txt +- interrupt-controller : marks this as an interrupt controller +- #interrupt-cells : a standard two-cell interrupt flag, see + interrupt-controller/interrupts.txt + +Example: + +gpio@4d000000 { + compatible = "cortina,gemini-gpio", "faraday,ftgpio010"; + reg = <0x4d000000 0x100>; + interrupts = <22 IRQ_TYPE_LEVEL_HIGH>; + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; +}; diff --git a/Documentation/devicetree/bindings/gpio/gpio-aspeed.txt b/Documentation/devicetree/bindings/gpio/gpio-aspeed.txt index 393bb2ed8a774276bf1922ce63e018ab21985ca2..c756afa88cc62359323cfe3f0cc1171a31e955f4 100644 --- 
a/Documentation/devicetree/bindings/gpio/gpio-aspeed.txt +++ b/Documentation/devicetree/bindings/gpio/gpio-aspeed.txt @@ -17,7 +17,8 @@ Required properties: Optional properties: -- interrupt-parent : The parent interrupt controller, optional if inherited +- interrupt-parent : The parent interrupt controller, optional if inherited +- clocks : A phandle to the HPLL clock node for debounce timings The gpio and interrupt properties are further described in their respective bindings documentation: diff --git a/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt b/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt index a6f3bec1da7d0f46d6f0ecf22fcec20232b354dc..42c3bb2d53e88b651a7d39efe00e278c24d9c9b3 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt +++ b/Documentation/devicetree/bindings/gpio/gpio-mvebu.txt @@ -38,6 +38,24 @@ Required properties: - #gpio-cells: Should be two. The first cell is the pin number. The second cell is reserved for flags, unused at the moment. +Optional properties: + +In order to use the GPIO lines in PWM mode, some additional optional +properties are required. Only Armada 370 and XP support these properties. + +- compatible: Must contain "marvell,armada-370-xp-gpio" + +- reg: an additional register set is needed, for the GPIO Blink + Counter on/off registers. + +- reg-names: Must contain an entry "pwm" corresponding to the + additional register range needed for PWM operation. + +- #pwm-cells: Should be two. The first cell is the GPIO line number. The + second cell is the period in nanoseconds. + +- clocks: Must be a phandle to the clock for the GPIO controller. + Example: gpio0: gpio@d0018100 { @@ -51,3 +69,17 @@ Example: #interrupt-cells = <2>; interrupts = <16>, <17>, <18>, <19>; }; + + gpio1: gpio@18140 { + compatible = "marvell,armada-370-xp-gpio"; + reg = <0x18140 0x40>, <0x181c8 0x08>; + reg-names = "gpio", "pwm"; + ngpios = <17>; + gpio-controller; + #gpio-cells = <2>; + #pwm-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; + interrupts = <87>, <88>, <89>; + clocks = <&coreclk 0>; + }; diff --git a/Documentation/devicetree/bindings/gpio/gpio-pca953x.txt b/Documentation/devicetree/bindings/gpio/gpio-pca953x.txt index e6393571001144bba965d0d72e3d4c50a0b9da8e..7f57271df2bc96b41168d337c911511ccb726e33 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-pca953x.txt +++ b/Documentation/devicetree/bindings/gpio/gpio-pca953x.txt @@ -26,6 +26,7 @@ Required properties: ti,tca6416 ti,tca6424 ti,tca9539 + ti,tca9554 onsemi,pca9654 exar,xra1202 diff --git a/Documentation/devicetree/bindings/gpio/gpio-pcf857x.txt b/Documentation/devicetree/bindings/gpio/gpio-pcf857x.txt index ada4e29733234aa0f5c0afe8cf20e2ac4b8de173..7d3bd631d0114850d3598a9a64824e8af906d6ee 100644 --- a/Documentation/devicetree/bindings/gpio/gpio-pcf857x.txt +++ b/Documentation/devicetree/bindings/gpio/gpio-pcf857x.txt @@ -25,7 +25,6 @@ Required Properties: - "nxp,pcf8574": For the NXP PCF8574 - "nxp,pcf8574a": For the NXP PCF8574A - "nxp,pcf8575": For the NXP PCF8575 - - "ti,tca9554": For the TI TCA9554 - reg: I2C slave address. diff --git a/Documentation/devicetree/bindings/gpio/gpio-thunderx.txt b/Documentation/devicetree/bindings/gpio/gpio-thunderx.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f883ae29d116887e702ead20b26a25f9d2349d5 --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/gpio-thunderx.txt @@ -0,0 +1,27 @@ +Cavium ThunderX/OCTEON-TX GPIO controller bindings + +Required Properties: +- reg: The controller bus address. 
+- gpio-controller: Marks the device node as a GPIO controller. +- #gpio-cells: Must be 2. + - First cell is the GPIO pin number relative to the controller. + - Second cell is a standard generic flag bitfield as described in gpio.txt. + +Optional Properties: +- compatible: "cavium,thunder-8890-gpio", unused as PCI driver binding is used. +- interrupt-controller: Marks the device node as an interrupt controller. +- #interrupt-cells: Must be present and have value of 2 if + "interrupt-controller" is present. + - First cell is the GPIO pin number relative to the controller. + - Second cell is triggering flags as defined in interrupts.txt. + +Example: + +gpio_6_0: gpio@6,0 { + compatible = "cavium,thunder-8890-gpio"; + reg = <0x3000 0 0 0 0>; /* DEVFN = 0x30 (6:0) */ + gpio-controller; + #gpio-cells = <2>; + interrupt-controller; + #interrupt-cells = <2>; +}; diff --git a/Documentation/devicetree/bindings/gpio/gpio-xra1403.txt b/Documentation/devicetree/bindings/gpio/gpio-xra1403.txt new file mode 100644 index 0000000000000000000000000000000000000000..e13cc399b363f24aae0083fc34b09bd434aa78cd --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/gpio-xra1403.txt @@ -0,0 +1,46 @@ +GPIO Driver for XRA1403 16-BIT GPIO Expander With Reset Input from EXAR + +The XRA1403 is a 16-bit GPIO expander with an SPI interface. Features available: + - Individually programmable inputs: + - Internal pull-up resistors + - Polarity inversion + - Individual interrupt enable + - Rising edge and/or Falling edge interrupt + - Input filter + - Individually programmable outputs + - Output Level Control + - Output Three-State Control + +Properties +---------- +Check documentation for SPI and GPIO controllers regarding properties needed to configure the node. + + - compatible = "exar,xra1403". + - reg - SPI id of the device. + - gpio-controller - marks the node as a GPIO controller. + - #gpio-cells - should be two where the first cell is the pin number + and the second one is used for optional parameters. + +Optional properties: +------------------- + - reset-gpios: if available, used to control the device reset line. + - interrupt-controller - marks the node as an interrupt controller. + - #interrupt-cells - should be two and represents the number of cells + needed to encode an interrupt source.
+ +Example +-------- + + gpioxra0: gpio@2 { + compatible = "exar,xra1403"; + reg = <2>; + + gpio-controller; + #gpio-cells = <2>; + + interrupt-controller; + #interrupt-cells = <2>; + + reset-gpios = <&gpio3 6 GPIO_ACTIVE_LOW>; + spi-max-frequency = <1000000>; + }; diff --git a/Documentation/devicetree/bindings/gpio/moxa,moxart-gpio.txt b/Documentation/devicetree/bindings/gpio/moxa,moxart-gpio.txt deleted file mode 100644 index f8e8f185a3dbacbd60641cd2b70da3b0c2c1e32a..0000000000000000000000000000000000000000 --- a/Documentation/devicetree/bindings/gpio/moxa,moxart-gpio.txt +++ /dev/null @@ -1,19 +0,0 @@ -MOXA ART GPIO Controller - -Required properties: - -- #gpio-cells : Should be 2, The first cell is the pin number, - the second cell is used to specify polarity: - 0 = active high - 1 = active low -- compatible : Must be "moxa,moxart-gpio" -- reg : Should contain registers location and length - -Example: - - gpio: gpio@98700000 { - gpio-controller; - #gpio-cells = <2>; - compatible = "moxa,moxart-gpio"; - reg = <0x98700000 0xC>; - }; diff --git a/Documentation/devicetree/bindings/gpio/ni,169445-nand-gpio.txt b/Documentation/devicetree/bindings/gpio/ni,169445-nand-gpio.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca2f8c745a279cfea1375dc68dceff29a63f8de5 --- /dev/null +++ b/Documentation/devicetree/bindings/gpio/ni,169445-nand-gpio.txt @@ -0,0 +1,38 @@ +Bindings for the National Instruments 169445 GPIO NAND controller + +The 169445 GPIO NAND controller has two memory mapped GPIO registers, one +for input (the ready signal) and one for output (control signals). It is +intended to be used with the GPIO NAND driver. + +Required properties: + - compatible: should be "ni,169445-nand-gpio" + - reg-names: must contain + "dat" - data register + - reg: address + size pairs describing the GPIO register sets; + order must correspond with the order of entries in reg-names + - #gpio-cells: must be set to 2. The first cell is the pin number and + the second cell is used to specify the gpio polarity: + 0 = active high + 1 = active low + - gpio-controller: Marks the device node as a gpio controller. 
+ +Optional properties: + - no-output: disables driving output on the pins + +Examples: + gpio1: nand-gpio-out@1f300010 { + compatible = "ni,169445-nand-gpio"; + reg = <0x1f300010 0x4>; + reg-names = "dat"; + gpio-controller; + #gpio-cells = <2>; + }; + + gpio2: nand-gpio-in@1f300014 { + compatible = "ni,169445-nand-gpio"; + reg = <0x1f300014 0x4>; + reg-names = "dat"; + gpio-controller; + #gpio-cells = <2>; + no-output; + }; diff --git a/Documentation/devicetree/bindings/gpu/arm,mali-utgard.txt b/Documentation/devicetree/bindings/gpu/arm,mali-utgard.txt index 476f5ea6c6276a15c52ffc87a7a1033c6d173a26..2b6243e730f690f9bbf2f0c50295b91c0699129f 100644 --- a/Documentation/devicetree/bindings/gpu/arm,mali-utgard.txt +++ b/Documentation/devicetree/bindings/gpu/arm,mali-utgard.txt @@ -35,6 +35,14 @@ Optional properties: - interrupt-names and interrupts: * pmu: Power Management Unit interrupt, if implemented in hardware + - memory-region: + Memory region to allocate from, as defined in + Documentation/devicetree/bindings/reserved-memory/reserved-memory.txt + + - operating-points-v2: + Operating Points for the GPU, as defined in + Documentation/devicetree/bindings/opp/opp.txt + Vendor-specific bindings ------------------------ diff --git a/Documentation/devicetree/bindings/gpu/nvidia,gk20a.txt b/Documentation/devicetree/bindings/gpu/nvidia,gk20a.txt index ff3db65e50def8224096107eaf05827df884ca25..b7e4c74445108dcd2c966529a85a4a930b40e831 100644 --- a/Documentation/devicetree/bindings/gpu/nvidia,gk20a.txt +++ b/Documentation/devicetree/bindings/gpu/nvidia,gk20a.txt @@ -5,6 +5,7 @@ Required properties: Currently recognized values: - nvidia,gk20a - nvidia,gm20b + - nvidia,gp10b - reg: Physical base address and length of the controller's registers. Must contain two entries: - first entry for bar0 @@ -14,7 +15,8 @@ Required properties: - interrupt-names: Must include the following entries: - stall - nonstall -- vdd-supply: regulator for supply voltage. +- vdd-supply: regulator for supply voltage. Only required for GPUs not using + power domains. - clocks: Must contain an entry for each entry in clock-names. See ../clocks/clock-bindings.txt for details. - clock-names: Must include the following entries: @@ -27,6 +29,8 @@ is also required: See ../reset/reset.txt for details. - reset-names: Must include the following entries: - gpu +- power-domains: GPUs that make use of power domains can define this property + instead of vdd-supply. Currently "nvidia,gp10b" makes use of this. Optional properties: - iommus: A reference to the IOMMU. See ../iommu/iommu.txt for details.
@@ -68,3 +72,22 @@ Example for GM20B: iommus = <&mc TEGRA_SWGROUP_GPU>; status = "disabled"; }; + +Example for GP10B: + + gpu@17000000 { + compatible = "nvidia,gp10b"; + reg = <0x0 0x17000000 0x0 0x1000000>, + <0x0 0x18000000 0x0 0x1000000>; + interrupts = ; + interrupt-names = "stall", "nonstall"; + clocks = <&bpmp TEGRA186_CLK_GPCCLK>, + <&bpmp TEGRA186_CLK_GPU>; + clock-names = "gpu", "pwr"; + resets = <&bpmp TEGRA186_RESET_GPU>; + reset-names = "gpu"; + power-domains = <&bpmp TEGRA186_POWER_DOMAIN_GPU>; + iommus = <&smmu TEGRA186_SID_GPU>; + status = "disabled"; + }; diff --git a/Documentation/devicetree/bindings/hwmon/ads7828.txt b/Documentation/devicetree/bindings/hwmon/ads7828.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe0cc4ad7ea9f289d650e71895145fbaeb2c7b02 --- /dev/null +++ b/Documentation/devicetree/bindings/hwmon/ads7828.txt @@ -0,0 +1,25 @@ +ads7828 properties + +Required properties: +- compatible: Should be one of + ti,ads7828 + ti,ads7830 +- reg: I2C address + +Optional properties: + +- ti,differential-input + Set to use the device in differential mode. +- vref-supply + The external reference on the device is set to this regulator's output. If it + does not exist, the internal reference will be used and output by the ads78xx + on the "external vref" pin. + + Example ADS7828 node: + + ads7828: ads@48 { + compatible = "ti,ads7828"; + reg = <0x48>; + vref-supply = <&vref>; + ti,differential-input; + }; diff --git a/Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt b/Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf4460564adb20941f127008fdb0ec235dc5f7ff --- /dev/null +++ b/Documentation/devicetree/bindings/hwmon/aspeed-pwm-tacho.txt @@ -0,0 +1,68 @@ +ASPEED AST2400/AST2500 PWM and Fan Tacho controller device driver + +The ASPEED PWM controller can support up to 8 PWM outputs. The ASPEED Fan Tacho +controller can support up to 16 Fan tachometer inputs. + +Up to 8 fans can be supported. Each fan can have one PWM output and +one/two Fan tach inputs. + +Required properties for pwm-tacho node: +- #address-cells : should be 1. + +- #size-cells : should be 1. + +- reg : address and length of the register set for the device. + +- pinctrl-names : a pinctrl state named "default" must be defined. + +- pinctrl-0 : phandle referencing pin configuration of the PWM ports. + +- compatible : should be "aspeed,ast2400-pwm-tacho" for AST2400 and + "aspeed,ast2500-pwm-tacho" for AST2500. + +- clocks : a fixed clock providing input clock frequency (PWM + and Fan Tach clock) + +fan subnode format: +=================== +Under the fan subnode there can be up to 8 child nodes, with each child node +representing a fan. If there are 8 fans, each fan can have one PWM port and +one/two Fan tach inputs. + +Required properties for each child node: +- reg : should specify PWM source port. + integer value in the range 0 to 7 with 0 indicating PWM port A and + 7 indicating PWM port H. + +- aspeed,fan-tach-ch : should specify the Fan tach input channel. + integer value in the range 0 through 15, with 0 indicating + Fan tach channel 0 and 15 indicating Fan tach channel 15. + At least one Fan tach input channel is required.
+ +Examples: + +pwm_tacho_fixed_clk: fixedclk { + compatible = "fixed-clock"; + #clock-cells = <0>; + clock-frequency = <24000000>; +}; + +pwm_tacho: pwmtachocontroller@1e786000 { + #address-cells = <1>; + #size-cells = <1>; + reg = <0x1E786000 0x1000>; + compatible = "aspeed,ast2500-pwm-tacho"; + clocks = <&pwm_tacho_fixed_clk>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_pwm0_default &pinctrl_pwm1_default>; + + fan@0 { + reg = <0x00>; + aspeed,fan-tach-ch = /bits/ 8 <0x00>; + }; + + fan@1 { + reg = <0x01>; + aspeed,fan-tach-ch = /bits/ 8 <0x01 0x02>; + }; +}; diff --git a/Documentation/devicetree/bindings/hwmon/lm87.txt b/Documentation/devicetree/bindings/hwmon/lm87.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1b79903f2043d6e3c16876b6651bb263afbb9fd --- /dev/null +++ b/Documentation/devicetree/bindings/hwmon/lm87.txt @@ -0,0 +1,30 @@ +*LM87 hwmon sensor. + +Required properties: +- compatible: Should be + "ti,lm87" + +- reg: I2C address + +Optional properties: +- has-temp3: This configures pins 18 and 19 to be used as a second + remote temperature sensing channel. By default the pins + are configured as voltage input pins in0 and in5. + +- has-in6: When set, pin 5 is configured to be used as voltage input + in6. Otherwise the pin is set as FAN1 input. + +- has-in7: When set, pin 6 is configured to be used as voltage input + in7. Otherwise the pin is set as FAN2 input. + +- vcc-supply: a phandle for the regulator supplying power; can be + configured to measure a 5.0V power supply. Default is 3.3V. + +Example: + +lm87@2e { + compatible = "ti,lm87"; + reg = <0x2e>; + has-temp3; + vcc-supply = <&reg_5v0>; +}; diff --git a/Documentation/devicetree/bindings/i2c/i2c-meson.txt b/Documentation/devicetree/bindings/i2c/i2c-meson.txt index 386357d1aab05de652675bf6ed19c0dc22b50709..611b934c7e104d4ec0b844bbca59b54c45471009 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-meson.txt +++ b/Documentation/devicetree/bindings/i2c/i2c-meson.txt @@ -8,6 +8,8 @@ Required properties: - #address-cells: should be <1> - #size-cells: should be <0> +For details regarding the following core I2C bindings see also i2c.txt. + Optional properties: - clock-frequency: the desired I2C bus clock frequency in Hz; in absence of this property the default value is used (100 kHz). diff --git a/Documentation/devicetree/bindings/i2c/i2c-mux-ltc4306.txt b/Documentation/devicetree/bindings/i2c/i2c-mux-ltc4306.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e98c6b3a721bfe6b92386678027a38641865ab8 --- /dev/null +++ b/Documentation/devicetree/bindings/i2c/i2c-mux-ltc4306.txt @@ -0,0 +1,61 @@ +* Linear Technology / Analog Devices I2C bus switch + +Required Properties: + + - compatible: Must contain one of the following. + "lltc,ltc4305", "lltc,ltc4306" + - reg: The I2C address of the device. + + The following required properties are defined externally: + + - Standard I2C mux properties. See i2c-mux.txt in this directory. + - I2C child bus nodes. See i2c-mux.txt in this directory. + +Optional Properties: + + - enable-gpios: Reference to the GPIO connected to the enable input. + - i2c-mux-idle-disconnect: Boolean; if defined, forces mux to disconnect all + children in idle state. This is necessary, for example, if there are several + multiplexers on the bus and the devices behind them use the same I2C addresses. + - gpio-controller: Marks the device node as a GPIO Controller. + - #gpio-cells: Should be two.
The first cell is the pin number and + the second cell is used to specify flags. + See ../gpio/gpio.txt for more information. + - ltc,downstream-accelerators-enable: Enables the rise time accelerators + on the downstream port. + - ltc,upstream-accelerators-enable: Enables the rise time accelerators + on the upstream port. + +Example: + + ltc4306: i2c-mux@4a { + compatible = "lltc,ltc4306"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0x4a>; + + gpio-controller; + #gpio-cells = <2>; + + i2c@0 { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + + eeprom@50 { + compatible = "at,24c02"; + reg = <0x50>; + }; + }; + + i2c@1 { + #address-cells = <1>; + #size-cells = <0>; + reg = <1>; + + eeprom@50 { + compatible = "at,24c02"; + reg = <0x50>; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/i2c/i2c-rk3x.txt b/Documentation/devicetree/bindings/i2c/i2c-rk3x.txt index bbc5a1ed5fa188557a4c45828ea8ba5d65f46893..e18445d0980cc6ab5d2b0d6b3b10fb35e6d3cff3 100644 --- a/Documentation/devicetree/bindings/i2c/i2c-rk3x.txt +++ b/Documentation/devicetree/bindings/i2c/i2c-rk3x.txt @@ -11,6 +11,7 @@ Required properties : - "rockchip,rk3188-i2c": for rk3188 - "rockchip,rk3228-i2c": for rk3228 - "rockchip,rk3288-i2c": for rk3288 + - "rockchip,rk3328-i2c", "rockchip,rk3399-i2c": for rk3328 - "rockchip,rk3399-i2c": for rk3399 - interrupts : interrupt number - clocks: See ../clock/clock-bindings.txt diff --git a/Documentation/devicetree/bindings/i2c/trivial-devices.txt b/Documentation/devicetree/bindings/i2c/trivial-devices.txt deleted file mode 100644 index ad10fbe61562506015c93e3e6fe52dba751665a3..0000000000000000000000000000000000000000 --- a/Documentation/devicetree/bindings/i2c/trivial-devices.txt +++ /dev/null @@ -1,174 +0,0 @@ -This is a list of trivial i2c devices that have simple device tree -bindings, consisting only of a compatible field, an address and -possibly an interrupt line. - -If a device needs more specific bindings, such as properties to -describe some aspect of it, there needs to be a specific binding -document for it just like any other devices. 
- - -Compatible Vendor / Chip -========== ============= -abracon,abb5zes3 AB-RTCMC-32.768kHz-B5ZE-S3: Real Time Clock/Calendar Module with I2C Interface -ad,ad7414 SMBus/I2C Digital Temperature Sensor in 6-Pin SOT with SMBus Alert and Over Temperature Pin -ad,adm9240 ADM9240: Complete System Hardware Monitor for uProcessor-Based Systems -adi,adt7461 +/-1C TDM Extended Temp Range I.C -adt7461 +/-1C TDM Extended Temp Range I.C -adi,adt7473 +/-1C TDM Extended Temp Range I.C -adi,adt7475 +/-1C TDM Extended Temp Range I.C -adi,adt7476 +/-1C TDM Extended Temp Range I.C -adi,adt7490 +/-1C TDM Extended Temp Range I.C -adi,adxl345 Three-Axis Digital Accelerometer -adi,adxl346 Three-Axis Digital Accelerometer (backward-compatibility value "adi,adxl345" must be listed too) -ams,iaq-core AMS iAQ-Core VOC Sensor -at,24c08 i2c serial eeprom (24cxx) -atmel,at97sc3204t i2c trusted platform module (TPM) -capella,cm32181 CM32181: Ambient Light Sensor -capella,cm3232 CM3232: Ambient Light Sensor -cirrus,cs42l51 Cirrus Logic CS42L51 audio codec -dallas,ds1307 64 x 8, Serial, I2C Real-Time Clock -dallas,ds1338 I2C RTC with 56-Byte NV RAM -dallas,ds1340 I2C RTC with Trickle Charger -dallas,ds1374 I2C, 32-Bit Binary Counter Watchdog RTC with Trickle Charger and Reset Input/Output -dallas,ds1631 High-Precision Digital Thermometer -dallas,ds1682 Total-Elapsed-Time Recorder with Alarm -dallas,ds1775 Tiny Digital Thermometer and Thermostat -dallas,ds3232 Extremely Accurate I²C RTC with Integrated Crystal and SRAM -dallas,ds4510 CPU Supervisor with Nonvolatile Memory and Programmable I/O -dallas,ds75 Digital Thermometer and Thermostat -devantech,srf08 Devantech SRF08 ultrasonic ranger -dlg,da9053 DA9053: flexible system level PMIC with multicore support -dlg,da9063 DA9063: system PMIC for quad-core application processors -domintech,dmard09 DMARD09: 3-axis Accelerometer -domintech,dmard10 DMARD10: 3-axis Accelerometer -epson,rx8010 I2C-BUS INTERFACE REAL TIME CLOCK MODULE -epson,rx8025 High-Stability. 
I2C-Bus INTERFACE REAL TIME CLOCK MODULE -epson,rx8581 I2C-BUS INTERFACE REAL TIME CLOCK MODULE -fsl,mag3110 MAG3110: Xtrinsic High Accuracy, 3D Magnetometer -fsl,mc13892 MC13892: Power Management Integrated Circuit (PMIC) for i.MX35/51 -fsl,mma7660 MMA7660FC: 3-Axis Orientation/Motion Detection Sensor -fsl,mma8450 MMA8450Q: Xtrinsic Low-power, 3-axis Xtrinsic Accelerometer -fsl,mpl3115 MPL3115: Absolute Digital Pressure Sensor -fsl,mpr121 MPR121: Proximity Capacitive Touch Sensor Controller -fsl,sgtl5000 SGTL5000: Ultra Low-Power Audio Codec -gmt,g751 G751: Digital Temperature Sensor and Thermal Watchdog with Two-Wire Interface -infineon,slb9635tt Infineon SLB9635 (Soft-) I2C TPM (old protocol, max 100khz) -infineon,slb9645tt Infineon SLB9645 I2C TPM (new protocol, max 400khz) -isil,isl29028 Intersil ISL29028 Ambient Light and Proximity Sensor -maxim,ds1050 5 Bit Programmable, Pulse-Width Modulator -maxim,max1237 Low-Power, 4-/12-Channel, 2-Wire Serial, 12-Bit ADCs -maxim,max6625 9-Bit/12-Bit Temperature Sensors with I²C-Compatible Serial Interface -mc,rv3029c2 Real Time Clock Module with I2C-Bus -mcube,mc3230 mCube 3-axis 8-bit digital accelerometer -memsic,mxc6225 MEMSIC 2-axis 8-bit digital accelerometer -microchip,mcp4531-502 Microchip 7-bit Single I2C Digital Potentiometer (5k) -microchip,mcp4531-103 Microchip 7-bit Single I2C Digital Potentiometer (10k) -microchip,mcp4531-503 Microchip 7-bit Single I2C Digital Potentiometer (50k) -microchip,mcp4531-104 Microchip 7-bit Single I2C Digital Potentiometer (100k) -microchip,mcp4532-502 Microchip 7-bit Single I2C Digital Potentiometer (5k) -microchip,mcp4532-103 Microchip 7-bit Single I2C Digital Potentiometer (10k) -microchip,mcp4532-503 Microchip 7-bit Single I2C Digital Potentiometer (50k) -microchip,mcp4532-104 Microchip 7-bit Single I2C Digital Potentiometer (100k) -microchip,mcp4541-502 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (5k) -microchip,mcp4541-103 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (10k) -microchip,mcp4541-503 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (50k) -microchip,mcp4541-104 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (100k) -microchip,mcp4542-502 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (5k) -microchip,mcp4542-103 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (10k) -microchip,mcp4542-503 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (50k) -microchip,mcp4542-104 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (100k) -microchip,mcp4551-502 Microchip 8-bit Single I2C Digital Potentiometer (5k) -microchip,mcp4551-103 Microchip 8-bit Single I2C Digital Potentiometer (10k) -microchip,mcp4551-503 Microchip 8-bit Single I2C Digital Potentiometer (50k) -microchip,mcp4551-104 Microchip 8-bit Single I2C Digital Potentiometer (100k) -microchip,mcp4552-502 Microchip 8-bit Single I2C Digital Potentiometer (5k) -microchip,mcp4552-103 Microchip 8-bit Single I2C Digital Potentiometer (10k) -microchip,mcp4552-503 Microchip 8-bit Single I2C Digital Potentiometer (50k) -microchip,mcp4552-104 Microchip 8-bit Single I2C Digital Potentiometer (100k) -microchip,mcp4561-502 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (5k) -microchip,mcp4561-103 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (10k) -microchip,mcp4561-503 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (50k) -microchip,mcp4561-104 Microchip 8-bit Single I2C 
Digital Potentiometer with NV Memory (100k) -microchip,mcp4562-502 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (5k) -microchip,mcp4562-103 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (10k) -microchip,mcp4562-503 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (50k) -microchip,mcp4562-104 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (100k) -microchip,mcp4631-502 Microchip 7-bit Dual I2C Digital Potentiometer (5k) -microchip,mcp4631-103 Microchip 7-bit Dual I2C Digital Potentiometer (10k) -microchip,mcp4631-503 Microchip 7-bit Dual I2C Digital Potentiometer (50k) -microchip,mcp4631-104 Microchip 7-bit Dual I2C Digital Potentiometer (100k) -microchip,mcp4632-502 Microchip 7-bit Dual I2C Digital Potentiometer (5k) -microchip,mcp4632-103 Microchip 7-bit Dual I2C Digital Potentiometer (10k) -microchip,mcp4632-503 Microchip 7-bit Dual I2C Digital Potentiometer (50k) -microchip,mcp4632-104 Microchip 7-bit Dual I2C Digital Potentiometer (100k) -microchip,mcp4641-502 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (5k) -microchip,mcp4641-103 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (10k) -microchip,mcp4641-503 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (50k) -microchip,mcp4641-104 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (100k) -microchip,mcp4642-502 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (5k) -microchip,mcp4642-103 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (10k) -microchip,mcp4642-503 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (50k) -microchip,mcp4642-104 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (100k) -microchip,mcp4651-502 Microchip 8-bit Dual I2C Digital Potentiometer (5k) -microchip,mcp4651-103 Microchip 8-bit Dual I2C Digital Potentiometer (10k) -microchip,mcp4651-503 Microchip 8-bit Dual I2C Digital Potentiometer (50k) -microchip,mcp4651-104 Microchip 8-bit Dual I2C Digital Potentiometer (100k) -microchip,mcp4652-502 Microchip 8-bit Dual I2C Digital Potentiometer (5k) -microchip,mcp4652-103 Microchip 8-bit Dual I2C Digital Potentiometer (10k) -microchip,mcp4652-503 Microchip 8-bit Dual I2C Digital Potentiometer (50k) -microchip,mcp4652-104 Microchip 8-bit Dual I2C Digital Potentiometer (100k) -microchip,mcp4661-502 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (5k) -microchip,mcp4661-103 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (10k) -microchip,mcp4661-503 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (50k) -microchip,mcp4661-104 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (100k) -microchip,mcp4662-502 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (5k) -microchip,mcp4662-103 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (10k) -microchip,mcp4662-503 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (50k) -microchip,mcp4662-104 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (100k) -microchip,tc654 PWM Fan Speed Controller With Fan Fault Detection -microchip,tc655 PWM Fan Speed Controller With Fan Fault Detection -miramems,da226 MiraMEMS DA226 2-axis 14-bit digital accelerometer -miramems,da280 MiraMEMS DA280 3-axis 14-bit digital accelerometer -miramems,da311 MiraMEMS DA311 3-axis 12-bit digital accelerometer -national,lm63 Temperature sensor with integrated fan control -national,lm75 I2C TEMP SENSOR -national,lm80 Serial Interface 
ACPI-Compatible Microprocessor System Hardware Monitor -national,lm85 Temperature sensor with integrated fan control -national,lm92 ±0.33°C Accurate, 12-Bit + Sign Temperature Sensor and Thermal Window Comparator with Two-Wire Interface -nuvoton,npct501 i2c trusted platform module (TPM) -nuvoton,npct601 i2c trusted platform module (TPM2) -nxp,pca9556 Octal SMBus and I2C registered interface -nxp,pca9557 8-bit I2C-bus and SMBus I/O port with reset -nxp,pcf2127 Real-time clock -nxp,pcf2129 Real-time clock -nxp,pcf8563 Real-time clock/calendar -nxp,pcf85063 Tiny Real-Time Clock -oki,ml86v7667 OKI ML86V7667 video decoder -ovti,ov5642 OV5642: Color CMOS QSXGA (5-megapixel) Image Sensor with OmniBSI and Embedded TrueFocus -pericom,pt7c4338 Real-time Clock Module -plx,pex8648 48-Lane, 12-Port PCI Express Gen 2 (5.0 GT/s) Switch -pulsedlight,lidar-lite-v2 Pulsedlight LIDAR range-finding sensor -ricoh,r2025sd I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC -ricoh,r2221tl I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC -ricoh,rs5c372a I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC -ricoh,rs5c372b I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC -ricoh,rv5c386 I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC -ricoh,rv5c387a I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC -samsung,24ad0xd1 S524AD0XF1 (128K/256K-bit Serial EEPROM for Low Power) -sgx,vz89x SGX Sensortech VZ89X Sensors -sii,s35390a 2-wire CMOS real-time clock -silabs,si7020 Relative Humidity and Temperature Sensors -skyworks,sky81452 Skyworks SKY81452: Six-Channel White LED Driver with Touch Panel Bias Supply -st,24c256 i2c serial eeprom (24cxx) -st,m41t00 Serial real-time clock (RTC) -st,m41t62 Serial real-time clock (RTC) with alarm -st,m41t80 M41T80 - SERIAL ACCESS RTC WITH ALARMS -taos,tsl2550 Ambient Light Sensor with SMBUS/Two Wire Serial Interface -ti,ads7828 8-Channels, 12-bit ADC -ti,ads7830 8-Channels, 8-bit ADC -ti,tsc2003 I2C Touch-Screen Controller -ti,tmp102 Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface -ti,tmp103 Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface -ti,tmp275 Digital Temperature Sensor -winbond,w83793 Winbond/Nuvoton H/W Monitor -winbond,wpct301 i2c trusted platform module (TPM) diff --git a/Documentation/devicetree/bindings/iio/accel/adxl345.txt b/Documentation/devicetree/bindings/iio/accel/adxl345.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7111b02c02c46ba7635d61f2f4b0abf2ccc047c --- /dev/null +++ b/Documentation/devicetree/bindings/iio/accel/adxl345.txt @@ -0,0 +1,38 @@ +Analog Devices ADXL345 3-Axis, +/-(2g/4g/8g/16g) Digital Accelerometer + +http://www.analog.com/en/products/mems/accelerometers/adxl345.html + +Required properties: + - compatible : should be "adi,adxl345" + - reg : the I2C address or SPI chip select number of the sensor + +Required properties for SPI bus usage: + - spi-max-frequency : set maximum clock frequency, must be 5000000 + - spi-cpol and spi-cpha : must be defined for adxl345 to enable SPI mode 3 + +Optional properties: + - interrupt-parent : phandle to the parent interrupt controller as documented + in Documentation/devicetree/bindings/interrupt-controller/interrupts.txt + - interrupts: interrupt mapping for IRQ as documented in + Documentation/devicetree/bindings/interrupt-controller/interrupts.txt + +Example for an I2C device node: + + accelerometer@53 { + compatible = "adi,adxl345"; + reg = <0x53>; + interrupt-parent = <&gpio1>; + interrupts = <0 IRQ_TYPE_LEVEL_HIGH>; + }; + +Example for an SPI device node: +
accelerometer@0 { + compatible = "adi,adxl345"; + reg = <0>; + spi-max-frequency = <5000000>; + spi-cpol; + spi-cpha; + interrupt-parent = <&gpio1>; + interrupts = <0 IRQ_TYPE_LEVEL_HIGH>; + }; diff --git a/Documentation/devicetree/bindings/iio/adc/amlogic,meson-saradc.txt b/Documentation/devicetree/bindings/iio/adc/amlogic,meson-saradc.txt index f9e3ff2c656e538c40cdc6e9826c6f72eda70b9b..047189192aec2691f93cb28120bd4fe1a5746ef6 100644 --- a/Documentation/devicetree/bindings/iio/adc/amlogic,meson-saradc.txt +++ b/Documentation/devicetree/bindings/iio/adc/amlogic,meson-saradc.txt @@ -7,6 +7,7 @@ Required properties: - "amlogic,meson-gxm-saradc" for GXM along with the generic "amlogic,meson-saradc" - reg: the physical base address and length of the registers +- interrupts: the interrupt indicating end of sampling - clocks: phandle and clock identifier (see clock-names) - clock-names: mandatory clocks: - "clkin" for the reference clock (typically XTAL) @@ -23,6 +24,7 @@ Example: compatible = "amlogic,meson-gxl-saradc", "amlogic,meson-saradc"; #io-channel-cells = <1>; reg = <0x0 0x8680 0x0 0x34>; + interrupts = ; clocks = <&xtal>, <&clkc CLKID_SAR_ADC>, <&clkc CLKID_SANA>, diff --git a/Documentation/devicetree/bindings/iio/adc/aspeed_adc.txt b/Documentation/devicetree/bindings/iio/adc/aspeed_adc.txt new file mode 100644 index 0000000000000000000000000000000000000000..674e133b7cd7d72aeb81efb4f7cb304ce0204644 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/adc/aspeed_adc.txt @@ -0,0 +1,20 @@ +Aspeed ADC + +This device is a 10-bit converter for 16 voltage channels. All inputs are +single ended. + +Required properties: +- compatible: Should be "aspeed,ast2400-adc" or "aspeed,ast2500-adc" +- reg: memory window mapping address and length +- clocks: Input clock used to derive the sample clock. Expected to be the + SoC's APB clock. +- #io-channel-cells: Must be set to <1> to indicate channels are selected + by index. 
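Given #io-channel-cells = <1>, consumers reference ADC channels by a single index cell. As an illustrative sketch (not part of this patch; it assumes the provider node in the example below is labelled "adc"), a generic iio-hwmon consumer could map the first two channels like this:

	iio-hwmon {
		compatible = "iio-hwmon";
		io-channels = <&adc 0>, <&adc 1>;
	};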
+ +Example: + adc@1e6e9000 { + compatible = "aspeed,ast2400-adc"; + reg = <0x1e6e9000 0xb0>; + clocks = <&clk_apb>; + #io-channel-cells = <1>; + }; diff --git a/Documentation/devicetree/bindings/iio/adc/cpcap-adc.txt b/Documentation/devicetree/bindings/iio/adc/cpcap-adc.txt new file mode 100644 index 0000000000000000000000000000000000000000..487ea966858e94eb8a7d6ffe9420cadd1a8bdc77 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/adc/cpcap-adc.txt @@ -0,0 +1,18 @@ +Motorola CPCAP PMIC ADC binding + +Required properties: +- compatible: Should be "motorola,cpcap-adc" or "motorola,mapphone-cpcap-adc" +- interrupt-parent: The interrupt controller +- interrupts: The interrupt number for the ADC device +- interrupt-names: Should be "adcdone" +- #io-channel-cells: Number of cells in an IIO specifier + +Example: + +cpcap_adc: adc { + compatible = "motorola,mapphone-cpcap-adc"; + interrupt-parent = <&cpcap>; + interrupts = <8 IRQ_TYPE_NONE>; + interrupt-names = "adcdone"; + #io-channel-cells = <1>; +}; diff --git a/Documentation/devicetree/bindings/iio/adc/ltc2497.txt b/Documentation/devicetree/bindings/iio/adc/ltc2497.txt new file mode 100644 index 0000000000000000000000000000000000000000..a237ed99c0d8d5041a4105b80a8d2523e37bc4b7 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/adc/ltc2497.txt @@ -0,0 +1,13 @@ +* Linear Technology / Analog Devices LTC2497 ADC + +Required properties: + - compatible: Must be "lltc,ltc2497" + - reg: Must contain the ADC I2C address + - vref-supply: The regulator supply for ADC reference voltage + +Example: + ltc2497: adc@76 { + compatible = "lltc,ltc2497"; + reg = <0x76>; + vref-supply = <&ltc2497_reg>; + }; diff --git a/Documentation/devicetree/bindings/iio/adc/max1118.txt b/Documentation/devicetree/bindings/iio/adc/max1118.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf33d0b15a6d714024512b6d2f9e881f1ea9f6b1 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/adc/max1118.txt @@ -0,0 +1,21 @@ +* MAX1117/MAX1118/MAX1119 8-bit, dual-channel ADCs + +Required properties: + - compatible: Should be one of + * "maxim,max1117" + * "maxim,max1118" + * "maxim,max1119" + - reg: spi chip select number for the device + - (max1118 only) vref-supply: The regulator supply for ADC reference voltage + +Recommended properties: + - spi-max-frequency: Definition as per + Documentation/devicetree/bindings/spi/spi-bus.txt + +Example: +adc@0 { + compatible = "maxim,max1118"; + reg = <0>; + vref-supply = <&vdd_supply>; + spi-max-frequency = <1000000>; +}; diff --git a/Documentation/devicetree/bindings/iio/adc/max9611.txt b/Documentation/devicetree/bindings/iio/adc/max9611.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab4f43145ae578f37f030bbdeb4080a138762a0f --- /dev/null +++ b/Documentation/devicetree/bindings/iio/adc/max9611.txt @@ -0,0 +1,27 @@ +* Maxim max9611/max9612 current sense amplifier with 12-bit ADC interface + +Maxim max9611/max9612 is a high-side current sense amplifier with an integrated +12-bit ADC communicating over the I2C bus. +The device node for this driver shall be a child of an I2C controller.
+ +Required properties: + - compatible: Should be "maxim,max9611" or "maxim,max9612" + - reg: The 7-bit I2C address of the device + - shunt-resistor-micro-ohms: Value, in micro Ohms, of the current sense shunt + resistor + +Example: + +&i2c4 { + csa: adc@7c { + compatible = "maxim,max9611"; + reg = <0x7c>; + + shunt-resistor-micro-ohms = <5000>; + }; +}; + +This device node describes a current sense amplifier sitting on the I2C4 bus +with address 0x7c (read address is 0xf9, write address is 0xf8). +A sense resistor of 0.005 Ohm is installed between RS+ and RS- current-sensing +inputs. diff --git a/Documentation/devicetree/bindings/iio/adc/qcom,pm8xxx-xoadc.txt b/Documentation/devicetree/bindings/iio/adc/qcom,pm8xxx-xoadc.txt index 53cd146d8096becd5f85a6bdac7f06351a60c331..3ae06127789e30654792851c79b51fe3f53863e0 100644 --- a/Documentation/devicetree/bindings/iio/adc/qcom,pm8xxx-xoadc.txt +++ b/Documentation/devicetree/bindings/iio/adc/qcom,pm8xxx-xoadc.txt @@ -19,32 +19,42 @@ Required properties: with PMIC variant but is typically something like 2.2 or 1.8V. The following required properties are standard for IO channels, see -iio-bindings.txt for more details: +iio-bindings.txt for more details, but notice that this particular +ADC has a special addressing scheme that requires two cells for +identifying each ADC channel: -- #address-cells: should be set to <1> +- #address-cells: should be set to <2>, the first cell is the + prescaler (on PM8058) or premux (on PM8921) with two valid bits + so legal values are 0x00, 0x01 or 0x02. The second cell + is the main analog mux setting (0x00..0x0f). The combination + of prescaler/premux and analog mux uniquely addresses a hardware + channel on all systems. - #size-cells: should be set to <0> -- #io-channel-cells: should be set to <1> +- #io-channel-cells: should be set to <2>, again the cells are + prescaler or premux followed by the analog muxing line. - interrupts: should refer to the parent PMIC interrupt controller and reference the proper ADC interrupt. Required subnodes: -The ADC channels are configured as subnodes of the ADC. Since some of -them are used for calibrating the ADC, these nodes are compulsory: +The ADC channels are configured as subnodes of the ADC. + +Since some of them are used for calibrating the ADC, these nodes are +compulsory: adc-channel@c { - reg = <0x0c>; + reg = <0x00 0x0c>; }; adc-channel@d { - reg = <0x0d>; + reg = <0x00 0x0d>; }; adc-channel@f { - reg = <0x0f>; + reg = <0x00 0x0f>; }; These three nodes are used for absolute and ratiometric calibration @@ -52,13 +62,13 @@ and only need to have these reg values: they are by hardware definition 1:1 ratio converters that sample 625, 1250 and 0 milliV and create an interpolation calibration for all other ADCs. -Optional subnodes: any channels other than channel 0x0c, 0x0d and -0x0f are optional. +Optional subnodes: any channels other than channels [0x00 0x0c], +[0x00 0x0d] and [0x00 0x0f] are optional. Required channel node properties: - reg: should contain the hardware channel number in the range - 0 .. 0x0f (4 bits). The hardware only supports 16 channels. + 0 .. 0xff (8 bits).
Optional channel node properties: @@ -94,56 +104,54 @@ Example: xoadc: xoadc@197 { compatible = "qcom,pm8058-adc"; reg = <0x197>; - interrupt-parent = <&pm8058>; - interrupts = <76 1>; - #address-cells = <1>; + interrupts-extended = <&pm8058 76 IRQ_TYPE_EDGE_RISING>; + #address-cells = <2>; #size-cells = <0>; - #io-channel-cells = <1>; + #io-channel-cells = <2>; vcoin: adc-channel@0 { - reg = <0x00>; + reg = <0x00 0x00>; }; vbat: adc-channel@1 { - reg = <0x01>; + reg = <0x00 0x01>; }; dcin: adc-channel@2 { - reg = <0x02>; + reg = <0x00 0x02>; }; ichg: adc-channel@3 { - reg = <0x03>; + reg = <0x00 0x03>; }; vph_pwr: adc-channel@4 { - reg = <0x04>; + reg = <0x00 0x04>; }; usb_vbus: adc-channel@a { - reg = <0x0a>; + reg = <0x00 0x0a>; }; die_temp: adc-channel@b { - reg = <0x0b>; + reg = <0x00 0x0b>; }; ref_625mv: adc-channel@c { - reg = <0x0c>; + reg = <0x00 0x0c>; }; ref_1250mv: adc-channel@d { - reg = <0x0d>; + reg = <0x00 0x0d>; }; ref_325mv: adc-channel@e { - reg = <0x0e>; + reg = <0x00 0x0e>; }; ref_muxoff: adc-channel@f { - reg = <0x0f>; + reg = <0x00 0x0f>; }; }; - /* IIO client node */ iio-hwmon { compatible = "iio-hwmon"; - io-channels = <&xoadc 0x01>, /* Battery */ - <&xoadc 0x02>, /* DC in (charger) */ - <&xoadc 0x04>, /* VPH the main system voltage */ - <&xoadc 0x0b>, /* Die temperature */ - <&xoadc 0x0c>, /* Reference voltage 1.25V */ - <&xoadc 0x0d>, /* Reference voltage 0.625V */ - <&xoadc 0x0e>; /* Reference voltage 0.325V */ + io-channels = <&xoadc 0x00 0x01>, /* Battery */ + <&xoadc 0x00 0x02>, /* DC in (charger) */ + <&xoadc 0x00 0x04>, /* VPH the main system voltage */ + <&xoadc 0x00 0x0b>, /* Die temperature */ + <&xoadc 0x00 0x0c>, /* Reference voltage 1.25V */ + <&xoadc 0x00 0x0d>, /* Reference voltage 0.625V */ + <&xoadc 0x00 0x0e>; /* Reference voltage 0.325V */ }; diff --git a/Documentation/devicetree/bindings/iio/adc/rockchip-saradc.txt b/Documentation/devicetree/bindings/iio/adc/rockchip-saradc.txt index 205593f56fe759706ea6fd321c08e66059ebd489..e0a9b9d6d6fdc1b84f74a81fe9516fb343be3c9e 100644 --- a/Documentation/devicetree/bindings/iio/adc/rockchip-saradc.txt +++ b/Documentation/devicetree/bindings/iio/adc/rockchip-saradc.txt @@ -4,6 +4,7 @@ Required properties: - compatible: should be "rockchip,-saradc" or "rockchip,rk3066-tsadc" - "rockchip,saradc": for rk3188, rk3288 - "rockchip,rk3066-tsadc": for rk3036 + - "rockchip,rk3328-saradc", "rockchip,rk3399-saradc": for rk3328 - "rockchip,rk3399-saradc": for rk3399 - reg: physical base address of the controller and length of memory mapped diff --git a/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt b/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt index 5dfc88ec24a49ad7a0ae3c950b0ed5adc26d7a18..e35f9f1b320042c6b89714d32d3998db58e2a4e3 100644 --- a/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt +++ b/Documentation/devicetree/bindings/iio/adc/st,stm32-adc.txt @@ -57,6 +57,9 @@ Optional properties: - dmas: Phandle to dma channel for this ADC instance. See ../../dma/dma.txt for details. - dma-names: Must be "rx" when dmas property is being used. +- assigned-resolution-bits: Resolution (bits) to use for conversions. Must + match device available resolutions (e.g. can be 6, 8, 10 or 12 on stm32f4). + Default is maximum resolution if unset. Example: adc: adc@40012000 { @@ -84,6 +87,7 @@ Example: st,adc-channels = <8>; dmas = <&dma2 0 0 0x400 0x0>; dma-names = "rx"; + assigned-resolution-bits = <8>; }; ... other adc child nodes follow... 
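(As a worked illustration of the new property: with assigned-resolution-bits = <8> as in this example, raw conversion values span 0..255, i.e. 2^8 - 1, instead of the stm32f4 default 12-bit range 0..4095.)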
diff --git a/Documentation/devicetree/bindings/iio/dac/ltc2632.txt b/Documentation/devicetree/bindings/iio/dac/ltc2632.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb911e5a8ab443dc6364713f20de0f0fca5ae2b1 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/dac/ltc2632.txt @@ -0,0 +1,23 @@ +Linear Technology LTC2632 DAC device driver + +Required properties: + - compatible: Has to contain one of the following: + lltc,ltc2632-l12 + lltc,ltc2632-l10 + lltc,ltc2632-l8 + lltc,ltc2632-h12 + lltc,ltc2632-h10 + lltc,ltc2632-h8 + +Property rules described in Documentation/devicetree/bindings/spi/spi-bus.txt +apply. In particular, "reg" and "spi-max-frequency" properties must be given. + +Example: + + spi_master { + dac: ltc2632@0 { + compatible = "lltc,ltc2632-l12"; + reg = <0>; /* CS0 */ + spi-max-frequency = <1000000>; + }; + }; diff --git a/Documentation/devicetree/bindings/iio/dac/st,stm32-dac.txt b/Documentation/devicetree/bindings/iio/dac/st,stm32-dac.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcee71f808d00d96dac04954f7ba464282460236 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/dac/st,stm32-dac.txt @@ -0,0 +1,61 @@ +STMicroelectronics STM32 DAC + +The STM32 DAC is a 12-bit voltage output digital-to-analog converter. The DAC +may be configured in 8 or 12-bit mode. It has two output channels, each with +its own converter. +It has a built-in noise and triangle waveform generator and supports external +triggers for conversions. The DAC's output buffer allows a high drive output +current. + +Contents of a stm32 dac root node: +----------------------------------- +Required properties: +- compatible: Must be "st,stm32h7-dac-core". +- reg: Offset and length of the device's register set. +- clocks: Must contain an entry for pclk (which feeds the peripheral bus + interface) +- clock-names: Must be "pclk". +- vref-supply: Phandle to the vref+ input analog reference supply. +- #address-cells = <1>; +- #size-cells = <0>; + +Optional properties: +- resets: Must contain the phandle to the reset controller. +- A pinctrl state named "default" for each DAC channel may be defined to set + the DAC_OUTx pin in its mode of operation for analog output on an external pin. + +Contents of a stm32 dac child node: +----------------------------------- +The DAC core node should contain at least one subnode, representing a +DAC instance/channel available on the machine. + +Required properties: +- compatible: Must be "st,stm32-dac".
+- reg: Must be either 1 or 2, to define (single) channel in use +- #io-channel-cells = <1>: See the IIO bindings section "IIO consumers" in + Documentation/devicetree/bindings/iio/iio-bindings.txt + +Example: + dac: dac@40007400 { + compatible = "st,stm32h7-dac-core"; + reg = <0x40007400 0x400>; + clocks = <&clk>; + clock-names = "pclk"; + vref-supply = <&reg_vref>; + pinctrl-names = "default"; + pinctrl-0 = <&dac_out1 &dac_out2>; + #address-cells = <1>; + #size-cells = <0>; + + dac1: dac@1 { + compatible = "st,stm32-dac"; + #io-channel-cells = <1>; + reg = <1>; + }; + + dac2: dac@2 { + compatible = "st,stm32-dac"; + #io-channel-cells = <1>; + reg = <2>; + }; + }; diff --git a/Documentation/devicetree/bindings/iio/health/max30102.txt b/Documentation/devicetree/bindings/iio/health/max30102.txt new file mode 100644 index 0000000000000000000000000000000000000000..c695e7cbeefb9d29d7d97602341e2cc3b82d8916 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/health/max30102.txt @@ -0,0 +1,30 @@ +Maxim MAX30102 heart rate and pulse oximeter sensor + +* https://datasheets.maximintegrated.com/en/ds/MAX30102.pdf + +Required properties: + - compatible: must be "maxim,max30102" + - reg: the I2C address of the sensor + - interrupt-parent: should be the phandle for the interrupt controller + - interrupts: the sole interrupt generated by the device + + Refer to interrupt-controller/interrupts.txt for generic + interrupt client node bindings. + +Optional properties: + - maxim,red-led-current-microamp: configuration for RED LED current + - maxim,ir-led-current-microamp: configuration for IR LED current + + Note that each step is approximately 200 microamps, ranging from 0 uA to + 50800 uA. + +Example: + +max30102@57 { + compatible = "maxim,max30102"; + reg = <0x57>; + maxim,red-led-current-microamp = <7000>; + maxim,ir-led-current-microamp = <7000>; + interrupt-parent = <&gpio1>; + interrupts = <16 2>; +}; diff --git a/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt b/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt index a9fc11e43b45df1f490ee0b643f311e89c7dbf65..2b4514592f833c9be71e0522ca8e839082470030 100644 --- a/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt +++ b/Documentation/devicetree/bindings/iio/imu/inv_mpu6050.txt @@ -3,14 +3,21 @@ InvenSense MPU-6050 Six-Axis (Gyro + Accelerometer) MEMS MotionTracking Device http://www.invensense.com/mems/gyro/mpu6050.html Required properties: - - compatible : should be "invensense,mpu6050" + - compatible : should be one of + "invensense,mpu6050" + "invensense,mpu6500" + "invensense,mpu9150" + "invensense,mpu9250" + "invensense,icm20608" - reg : the I2C address of the sensor - interrupt-parent : should be the phandle for the interrupt controller - interrupts : interrupt mapping for GPIO IRQ Optional properties: - mount-matrix: an optional 3x3 mounting rotation matrix - + - i2c-gate node. These devices also support an auxiliary i2c bus. This is + simple enough to be described using the i2c-gate binding. See + i2c/i2c-gate.txt for more details.
Example: mpu6050@68 { @@ -28,3 +35,19 @@ Example: "0", /* y2 */ "0.984807753012208"; /* z2 */ }; + + + mpu9250@68 { + compatible = "invensense,mpu9250"; + reg = <0x68>; + interrupt-parent = <&gpio3>; + interrupts = <21 1>; + i2c-gate { + #address-cells = <1>; + #size-cells = <0>; + ak8975@c { + compatible = "ak,ak8975"; + reg = <0x0c>; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/iio/imu/st_lsm6dsx.txt b/Documentation/devicetree/bindings/iio/imu/st_lsm6dsx.txt index cf81afdf7803e2d1c49c70efe2a0cf63347739fa..8305fb05ffda433f807e9097b62118995daf90aa 100644 --- a/Documentation/devicetree/bindings/iio/imu/st_lsm6dsx.txt +++ b/Documentation/devicetree/bindings/iio/imu/st_lsm6dsx.txt @@ -3,6 +3,8 @@ Required properties: - compatible: must be one of: "st,lsm6ds3" + "st,lsm6ds3h" + "st,lsm6dsl" "st,lsm6dsm" - reg: i2c address of the sensor / spi cs line diff --git a/Documentation/devicetree/bindings/iio/light/vl6180.txt b/Documentation/devicetree/bindings/iio/light/vl6180.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c52952715a0e279596067b14554c0462e3a2e9b --- /dev/null +++ b/Documentation/devicetree/bindings/iio/light/vl6180.txt @@ -0,0 +1,15 @@ +STMicro VL6180 - ALS, range and proximity sensor + +Link to datasheet: http://www.st.com/resource/en/datasheet/vl6180x.pdf + +Required properties: + + -compatible: should be "st,vl6180" + -reg: the I2C address of the sensor + +Example: + +vl6180@29 { + compatible = "st,vl6180"; + reg = <0x29>; +}; diff --git a/Documentation/devicetree/bindings/iio/proximity/devantech-srf04.txt b/Documentation/devicetree/bindings/iio/proximity/devantech-srf04.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4dc7a227e2ed8840cf2ca861661dd9bedd1cdee --- /dev/null +++ b/Documentation/devicetree/bindings/iio/proximity/devantech-srf04.txt @@ -0,0 +1,28 @@ +* Devantech SRF04 ultrasonic range finder + Bit-banging driver using two GPIOs + +Required properties: + - compatible: Should be "devantech,srf04" + + - trig-gpios: Definition of the GPIO for the triggering (output) + This GPIO is set for about 10 us by the driver to tell the + device it should initiate the measurement cycle. + + - echo-gpios: Definition of the GPIO for the echo (input) + This GPIO is set by the device as soon as an ultrasonic + burst is sent out and reset when the first echo is + received. + Thus this GPIO is set while the ultrasonic waves are doing + one round trip. + It needs to be a GPIO which is able to deliver an + interrupt because the time between two interrupts is + measured in the driver. + See Documentation/devicetree/bindings/gpio/gpio.txt for + information on how to specify a consumer gpio. + +Example: +srf04@0 { + compatible = "devantech,srf04"; + trig-gpios = <&gpio1 15 GPIO_ACTIVE_HIGH>; + echo-gpios = <&gpio2 6 GPIO_ACTIVE_HIGH>; +}; diff --git a/Documentation/devicetree/bindings/input/cpcap-pwrbutton.txt b/Documentation/devicetree/bindings/input/cpcap-pwrbutton.txt new file mode 100644 index 0000000000000000000000000000000000000000..0dd0076daf71f17463e0d06e856677e1d2d1bc44 --- /dev/null +++ b/Documentation/devicetree/bindings/input/cpcap-pwrbutton.txt @@ -0,0 +1,20 @@ +Motorola CPCAP on key + +This module is part of the CPCAP. For more details about the whole +chip see Documentation/devicetree/bindings/mfd/motorola-cpcap.txt. + +This module provides a simple power button event via an interrupt.
+ +Required properties: +- compatible: should be one of the following + - "motorola,cpcap-pwrbutton" +- interrupts: irq specifier for CPCAP's ON IRQ + +Example: + +&cpcap { + cpcap_pwrbutton: pwrbutton { + compatible = "motorola,cpcap-pwrbutton"; + interrupts = <23 IRQ_TYPE_NONE>; + }; +}; diff --git a/Documentation/devicetree/bindings/input/gpio-matrix-keypad.txt b/Documentation/devicetree/bindings/input/gpio-matrix-keypad.txt index d0ea09ba249fc5ea484625564e81a815abd7f014..570dc10f0cd74b7da9c1ced0c2dfb73e55e452a2 100644 --- a/Documentation/devicetree/bindings/input/gpio-matrix-keypad.txt +++ b/Documentation/devicetree/bindings/input/gpio-matrix-keypad.txt @@ -24,6 +24,8 @@ Optional Properties: - debounce-delay-ms: debounce interval in milliseconds - col-scan-delay-us: delay, measured in microseconds, that is needed before we can scan keypad after activating column gpio +- drive-inactive-cols: drive inactive columns during scan, + default is to turn inactive columns into inputs. Example: matrix-keypad { diff --git a/Documentation/devicetree/bindings/input/hid-over-i2c.txt b/Documentation/devicetree/bindings/input/hid-over-i2c.txt index 488edcb264c4fea49441bb21ea7deb02a0056882..28e8bd8b7d640b34dceadae6b7fe5cf36d392527 100644 --- a/Documentation/devicetree/bindings/input/hid-over-i2c.txt +++ b/Documentation/devicetree/bindings/input/hid-over-i2c.txt @@ -17,6 +17,22 @@ Required properties: - interrupt-parent: the phandle for the interrupt controller - interrupts: interrupt line +Additional optional properties: + +Some devices may support additional optional properties to help with, e.g., +power sequencing. The following properties can be supported by one or more +device-specific compatible properties, which should be used in addition to the +"hid-over-i2c" string. + +- compatible: + * "wacom,w9013" (Wacom W9013 digitizer). Supports: + - vdd-supply + - post-power-on-delay-ms + +- vdd-supply: phandle of the regulator that provides the supply voltage. +- post-power-on-delay-ms: time required by the device after enabling its regulators + before it is ready for communication. Must be used with 'vdd-supply'. 
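For illustration, a device using the "wacom,w9013" compatible together with the optional power-sequencing properties above could be sketched as below. This snippet is not part of the patch; the regulator phandle, HID descriptor address, interrupt wiring and delay value are assumptions made up for the example.

	digitizer@9 {
		compatible = "wacom,w9013", "hid-over-i2c";
		reg = <0x09>;
		hid-descr-addr = <0x1>;
		interrupt-parent = <&gpio1>;
		interrupts = <3 IRQ_TYPE_LEVEL_LOW>;
		/* assumed board regulator feeding the digitizer */
		vdd-supply = <&pp3300_dig>;
		/* assumed settle time after enabling vdd-supply */
		post-power-on-delay-ms = <100>;
	};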
+ Example: i2c-hid-dev@2c { diff --git a/Documentation/devicetree/bindings/input/pwm-beeper.txt b/Documentation/devicetree/bindings/input/pwm-beeper.txt index 529408b4431a62e44ddfb4ee0bd6795182c2d792..8fc0e48c20db2b2b31795e8f75c4b24257f9272f 100644 --- a/Documentation/devicetree/bindings/input/pwm-beeper.txt +++ b/Documentation/devicetree/bindings/input/pwm-beeper.txt @@ -8,6 +8,7 @@ Required properties: Optional properties: - amp-supply: phandle to a regulator that acts as an amplifier for the beeper +- beeper-hz: bell frequency in Hz Example: diff --git a/Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.txt b/Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.txt index 4ed467b1e4021601e89254d715cc579a12676883..64bb990075c31c3c51a1b1b1aabd4b8a3083fc30 100644 --- a/Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.txt +++ b/Documentation/devicetree/bindings/input/qcom,pm8xxx-vib.txt @@ -7,6 +7,7 @@ PROPERTIES Value type: Definition: must be one of: "qcom,pm8058-vib" + "qcom,pm8916-vib" "qcom,pm8921-vib" - reg: diff --git a/Documentation/devicetree/bindings/input/rotary-encoder.txt b/Documentation/devicetree/bindings/input/rotary-encoder.txt index e85ce3dea4806bfbf9769d27514458603166e632..f99fe5cdeaec6b6c4d841bfa49603e84495a1b16 100644 --- a/Documentation/devicetree/bindings/input/rotary-encoder.txt +++ b/Documentation/devicetree/bindings/input/rotary-encoder.txt @@ -12,7 +12,7 @@ Optional properties: - rotary-encoder,relative-axis: register a relative axis rather than an absolute one. Relative axis will only generate +1/-1 events on the input device, hence no steps need to be passed. -- rotary-encoder,rollover: Automatic rollove when the rotary value becomes +- rotary-encoder,rollover: Automatic rollover when the rotary value becomes greater than the specified steps or smaller than 0. For absolute axis only. - rotary-encoder,steps-per-period: Number of steps (stable states) per period. 
The values have the following meaning:
diff --git a/Documentation/devicetree/bindings/input/touchscreen/ad7879.txt b/Documentation/devicetree/bindings/input/touchscreen/ad7879.txt index e3f22d23fc8fa8b91db6521afe540a46daeef5a8..3c8614c451f265558175cae62cf10e4016301d0e 100644 --- a/Documentation/devicetree/bindings/input/touchscreen/ad7879.txt +++ b/Documentation/devicetree/bindings/input/touchscreen/ad7879.txt @@ -35,6 +35,7 @@ Optional properties: - adi,conversion-interval: : 0 : convert one time only 1-255: 515us + val * 35us (up to 9.440ms) This property has to be a '/bits/ 8' value +- gpio-controller : Switch AUX/VBAT/GPIO pin to GPIO mode Example: @@ -51,3 +52,21 @@ Example: adi,averaging = /bits/ 8 <1>; adi,conversion-interval = /bits/ 8 <255>; }; + + ad7879@1 { + compatible = "adi,ad7879"; + spi-max-frequency = <5000000>; + reg = <1>; + spi-cpol; + spi-cpha; + gpio-controller; + interrupt-parent = <&gpio1>; + interrupts = <13 IRQ_TYPE_EDGE_FALLING>; + touchscreen-max-pressure = <4096>; + adi,resistance-plate-x = <120>; + adi,first-conversion-delay = /bits/ 8 <3>; + adi,acquisition-time = /bits/ 8 <1>; + adi,median-filter-size = /bits/ 8 <2>; + adi,averaging = /bits/ 8 <1>; + adi,conversion-interval = /bits/ 8 <255>; + };
diff --git a/Documentation/devicetree/bindings/input/ads7846.txt b/Documentation/devicetree/bindings/input/touchscreen/ads7846.txt similarity index 100% rename from Documentation/devicetree/bindings/input/ads7846.txt rename to Documentation/devicetree/bindings/input/touchscreen/ads7846.txt
diff --git a/Documentation/devicetree/bindings/input/touchscreen/ar1021.txt b/Documentation/devicetree/bindings/input/touchscreen/ar1021.txt new file mode 100644 index 0000000000000000000000000000000000000000..e459e8546f34cdb4f70ce709a29d943e4964057f --- /dev/null +++ b/Documentation/devicetree/bindings/input/touchscreen/ar1021.txt @@ -0,0 +1,16 @@ +* Microchip AR1020 and AR1021 touchscreen interface (I2C) + +Required properties: +- compatible : "microchip,ar1021-i2c" +- reg : I2C slave address +- interrupt-parent : the phandle for the interrupt controller +- interrupts : touch controller interrupt + +Example: + + touchscreen@4d { + compatible = "microchip,ar1021-i2c"; + reg = <0x4d>; + interrupt-parent = <&gpio3>; + interrupts = <11 IRQ_TYPE_LEVEL_HIGH>; + };
diff --git a/Documentation/devicetree/bindings/input/touchscreen/max11801-ts.txt b/Documentation/devicetree/bindings/input/touchscreen/max11801-ts.txt new file mode 100644 index 0000000000000000000000000000000000000000..40ac0fe94df64f30db111f177493c7e296762431 --- /dev/null +++ b/Documentation/devicetree/bindings/input/touchscreen/max11801-ts.txt @@ -0,0 +1,18 @@ +* MAXIM MAX11801 Resistive touch screen controller with i2c interface + +Required properties: +- compatible: must be "maxim,max11801" +- reg: i2c slave address +- interrupt-parent: the phandle for the interrupt controller +- interrupts: touch controller interrupt + +Example: + +&i2c1 { + max11801: touchscreen@48 { + compatible = "maxim,max11801"; + reg = <0x48>; + interrupt-parent = <&gpio3>; + interrupts = <31 IRQ_TYPE_EDGE_FALLING>; + }; +};
diff --git a/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt b/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt index ce85ee508238f27514172ecf9559e04a8710f908..6aa625e0cb8dafec75d57d38fbc8eca06ae22bf6 100644 --- a/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt +++ b/Documentation/devicetree/bindings/input/touchscreen/silead_gsl1680.txt @@ -1,7 +1,12 @@ * GSL1680 touchscreen controller Required properties: -- compatible : "silead,gsl1680" +- compatible : Must be one of the following, depending on the model: + "silead,gsl1680" + "silead,gsl1688" + "silead,gsl3670" + "silead,gsl3675" + "silead,gsl3692" - reg : I2C slave address of the chip (0x40) - interrupt-parent : a phandle pointing to the interrupt controller serving the interrupt for this chip
diff --git a/Documentation/devicetree/bindings/input/touchscreen/sun4i.txt b/Documentation/devicetree/bindings/input/touchscreen/sun4i.txt deleted file mode 100644 index 89abecd938cb98b55d82070ed108eb92bf914b8d..0000000000000000000000000000000000000000 --- a/Documentation/devicetree/bindings/input/touchscreen/sun4i.txt +++ /dev/null @@ -1,38 +0,0 @@ -sun4i resistive touchscreen controller --------------------------------------- - -Required properties: - - compatible: "allwinner,sun4i-a10-ts", "allwinner,sun5i-a13-ts" or - "allwinner,sun6i-a31-ts" - - reg: mmio address range of the chip - - interrupts: interrupt to which the chip is connected - - #thermal-sensor-cells: shall be 0 - -Optional properties: - - allwinner,ts-attached : boolean indicating that an actual touchscreen - is attached to the controller - - allwinner,tp-sensitive-adjust : integer (4 bits) - adjust sensitivity of pen down detection - between 0 (least sensitive) and 15 - (defaults to 15) - - allwinner,filter-type : integer (2 bits) - select median and averaging filter - samples used for median / averaging filter - 0: 4/2 - 1: 5/3 - 2: 8/4 - 3: 16/8 - (defaults to 1) - -Example: - - rtp: rtp@01c25000 { - compatible = "allwinner,sun4i-a10-ts"; - reg = <0x01c25000 0x100>; - interrupts = <29>; - allwinner,ts-attached; - #thermal-sensor-cells = <0>; - /* sensitive/noisy touch panel */ - allwinner,tp-sensitive-adjust = <0>; - allwinner,filter-type = <3>; - };
diff --git a/Documentation/devicetree/bindings/interrupt-controller/arm,nvic.txt b/Documentation/devicetree/bindings/interrupt-controller/arm,nvic.txt new file mode 100644 index 0000000000000000000000000000000000000000..386ab37a383fa229de70a31b4398f408fb791edc --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/arm,nvic.txt @@ -0,0 +1,36 @@ +* ARM Nested Vector Interrupt Controller (NVIC) + +The NVIC provides an interrupt controller that is tightly coupled to +Cortex-M based processor cores. The NVIC implementations on different SoCs +vary in the number of interrupts and priority bits per interrupt. + +Main node required properties: + +- compatible : should be one of: + "arm,v6m-nvic" + "arm,v7m-nvic" + "arm,v8m-nvic" +- interrupt-controller : Identifies the node as an interrupt controller +- #interrupt-cells : Specifies the number of cells needed to encode an + interrupt source. The type shall be a <u32> and the value shall be 2. + + The 1st cell contains the interrupt number for the interrupt type. + + The 2nd cell is the priority of the interrupt. + +- reg : Specifies base physical address(es) and size of the NVIC registers. + This is at a fixed address (0xe000e100) and size (0xc00).
+ +- arm,num-irq-priority-bits: The number of priority bits implemented by the + given SoC + +Example: + + intc: interrupt-controller@e000e100 { + compatible = "arm,v7m-nvic"; + #interrupt-cells = <2>; + #address-cells = <1>; + interrupt-controller; + reg = <0xe000e100 0xc00>; + arm,num-irq-priority-bits = <4>; + };
diff --git a/Documentation/devicetree/bindings/interrupt-controller/cortina,gemini-interrupt-controller.txt b/Documentation/devicetree/bindings/interrupt-controller/cortina,gemini-interrupt-controller.txt deleted file mode 100644 index 97c1167fa5335164b0d2f98a0daf8c86a6c46ee0..0000000000000000000000000000000000000000 --- a/Documentation/devicetree/bindings/interrupt-controller/cortina,gemini-interrupt-controller.txt +++ /dev/null @@ -1,22 +0,0 @@ -* Cortina Systems Gemini interrupt controller - -This interrupt controller is found on the Gemini SoCs. - -Required properties: -- compatible: must be "cortina,gemini-interrupt-controller" -- reg: The register bank for the interrupt controller. -- interrupt-controller: Identifies the node as an interrupt controller -- #interrupt-cells: The number of cells to define the interrupts. - Must be 2 as the controller can specify level or rising edge - IRQs. The bindings follows the standard binding for controllers - with two cells specified in - interrupt-controller/interrupts.txt - -Example: - -interrupt-controller@48000000 { - compatible = "cortina,gemini-interrupt-controller"; - reg = <0x48000000 0x1000>; - interrupt-controller; - #interrupt-cells = <2>; -};
diff --git a/Documentation/devicetree/bindings/interrupt-controller/faraday,ftintc010.txt b/Documentation/devicetree/bindings/interrupt-controller/faraday,ftintc010.txt new file mode 100644 index 0000000000000000000000000000000000000000..24428d47f4872c2e8114336d3559c92636b4af56 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/faraday,ftintc010.txt @@ -0,0 +1,25 @@ +* Faraday Technology FTINTC010 interrupt controller + +This interrupt controller is a stock IP block from Faraday Technology found +in the Gemini SoCs and other designs. + +Required properties: +- compatible: must be one of + "faraday,ftintc010" + "cortina,gemini-interrupt-controller" (deprecated) +- reg: The register bank for the interrupt controller. +- interrupt-controller: Identifies the node as an interrupt controller +- #interrupt-cells: The number of cells to define the interrupts. + Must be 2 as the controller can specify level or rising edge + IRQs. The binding follows the standard binding for controllers + with two cells specified in + interrupt-controller/interrupts.txt + +Example: + +interrupt-controller@48000000 { + compatible = "faraday,ftintc010"; + reg = <0x48000000 0x1000>; + interrupt-controller; + #interrupt-cells = <2>; +};
diff --git a/Documentation/devicetree/bindings/interrupt-controller/mediatek,cirq.txt b/Documentation/devicetree/bindings/interrupt-controller/mediatek,cirq.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7efdbc3de5be3f7d0be225bd6916be2ac69a120 --- /dev/null +++ b/Documentation/devicetree/bindings/interrupt-controller/mediatek,cirq.txt @@ -0,0 +1,35 @@ +* Mediatek 27xx cirq + +In Mediatek SOCs, the CIRQ is a low power interrupt controller designed to +work outside MCUSYS, which comprises the Cortex-Ax cores, CCI and GIC. +The external interrupts (outside MCUSYS) will feed through CIRQ and connect +to GIC in MCUSYS.
When CIRQ is enabled, it will record the edge-sensitive +interrupts and generate a pulse signal to parent interrupt controller when +flush command is executed. With CIRQ, MCUSYS can be completely turned off +to improve the system power consumption without losing interrupts. + +Required properties: +- compatible: should be one of + - "mediatek,mt2701-cirq" for mt2701 CIRQ + - "mediatek,mt8135-cirq" for mt8135 CIRQ + - "mediatek,mt8173-cirq" for mt8173 CIRQ + and "mediatek,mtk-cirq" as a fallback. +- interrupt-controller : Identifies the node as an interrupt controller. +- #interrupt-cells : Use the same format as specified by GIC in arm,gic.txt. +- interrupt-parent: phandle of irq parent for cirq. The parent must + use the same interrupt-cells format as GIC. +- reg: Physical base address of the cirq registers and length of memory + mapped region. +- mediatek,ext-irq-range: Identifies external irq number range in different + SOCs. + +Example: + cirq: interrupt-controller@10204000 { + compatible = "mediatek,mt2701-cirq", + "mediatek,mtk-cirq"; + interrupt-controller; + #interrupt-cells = <3>; + interrupt-parent = <&sysirq>; + reg = <0 0x10204000 0 0x400>; + mediatek,ext-irq-range = <32 200>; + };
diff --git a/Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt b/Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt index 9d1d72c65489e1045d76291bcb8e7724c7c3cc43..a89c03bb1a81549c6029e8b4e7aeb48e3d726a4a 100644 --- a/Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt +++ b/Documentation/devicetree/bindings/interrupt-controller/mediatek,sysirq.txt @@ -21,13 +21,16 @@ Required properties: - interrupt-parent: phandle of irq parent for sysirq. The parent must use the same interrupt-cells format as GIC. - reg: Physical base address of the intpol registers and length of memory - mapped region. + mapped region. There can be multiple register bases, e.g. mt6797 needs 2 + regs while others need 1. Example: - sysirq: interrupt-controller@10200100 { - compatible = "mediatek,mt6589-sysirq", "mediatek,mt6577-sysirq"; + sysirq: intpol-controller@10200620 { + compatible = "mediatek,mt6797-sysirq", + "mediatek,mt6577-sysirq"; interrupt-controller; #interrupt-cells = <3>; interrupt-parent = <&gic>; - reg = <0 0x10200100 0 0x1c>; + reg = <0 0x10220620 0 0x20>, + <0 0x10220690 0 0x10>; };
diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.txt b/Documentation/devicetree/bindings/iommu/arm,smmu.txt index 6cdf32d037fcbedc1a25d7f283026ff89f73b361..8a6ffce12af5305e8c630724fa86f735c1884f4d 100644 --- a/Documentation/devicetree/bindings/iommu/arm,smmu.txt +++ b/Documentation/devicetree/bindings/iommu/arm,smmu.txt @@ -60,6 +60,17 @@ conditions. aliases of secure registers have to be used during SMMU configuration. +- stream-match-mask : For SMMUs supporting stream matching and using + #iommu-cells = <1>, specifies a mask of bits to ignore + when matching stream IDs (e.g. this may be programmed + into the SMRn.MASK field of every stream match register + used). For cases where it is desirable to ignore some + portion of every Stream ID (e.g. for certain MMU-500 + configurations given globally unique input IDs). This + property is not valid for SMMUs using stream indexing, + or using stream matching with #iommu-cells = <2>, and + may be ignored if present in such cases. + ** Deprecated properties: - mmu-masters (deprecated in favour of the generic "iommus" binding) : @@ -109,3 +120,20 @@ conditions.
master3 { iommus = <&smmu2 1 0x30>; }; + + + /* ARM MMU-500 with 10-bit stream ID input configuration */ + smmu3: iommu { + compatible = "arm,mmu-500", "arm,smmu-v2"; + ... + #iommu-cells = <1>; + /* always ignore appended 5-bit TBU number */ + stream-match-mask = <0x7c00>; + }; + + bus { + /* bus whose child devices emit one unique 10-bit stream + ID each, but may master through multiple SMMU TBUs */ + iommu-map = <0 &smmu3 0 0x400>; + ... + };
diff --git a/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt b/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt index 6f28969af9dcd03048ea75f82858664570531489..028268fd99eeba926424ae90ac73eab6b818f4be 100644 --- a/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt +++ b/Documentation/devicetree/bindings/ipmi/aspeed,ast2400-ibt-bmc.txt @@ -6,7 +6,9 @@ perform in-band IPMI communication with their host. Required properties: -- compatible : should be "aspeed,ast2400-ibt-bmc" +- compatible : should be one of + "aspeed,ast2400-ibt-bmc" + "aspeed,ast2500-ibt-bmc" - reg: physical address and size of the registers Optional properties:
diff --git a/Documentation/devicetree/bindings/leds/backlight/arcxcnn_bl.txt b/Documentation/devicetree/bindings/leds/backlight/arcxcnn_bl.txt new file mode 100644 index 0000000000000000000000000000000000000000..230abdefd6e7be20b470c3790e74c4d26d084ee8 --- /dev/null +++ b/Documentation/devicetree/bindings/leds/backlight/arcxcnn_bl.txt @@ -0,0 +1,33 @@ +Binding for ArcticSand arc2c0608 LED driver + +Required properties: +- compatible: should be "arc,arc2c0608" +- reg: slave address + +Optional properties: +- default-brightness: brightness value on boot, value from: 0-4095 +- label: The name of the backlight device + See Documentation/devicetree/bindings/leds/common.txt +- led-sources: List of enabled channels from 0 to 5. + See Documentation/devicetree/bindings/leds/common.txt + +- arc,led-config-0: setting for register ILED_CONFIG_0 +- arc,led-config-1: setting for register ILED_CONFIG_1 +- arc,dim-freq: PWM mode frequency setting (bits [3:0] used) +- arc,comp-config: setting for register CONFIG_COMP +- arc,filter-config: setting for register FILTER_CONFIG +- arc,trim-config: setting for register IMAXTUNE + +Note: Optional properties not specified will default to values in IC EPROM + +Example: + +arc2c0608@30 { + compatible = "arc,arc2c0608"; + reg = <0x30>; + default-brightness = <500>; + label = "lcd-backlight"; + linux,default-trigger = "backlight"; + led-sources = <0 1 2 5>; +}; +
diff --git a/Documentation/devicetree/bindings/leds/leds-cpcap.txt b/Documentation/devicetree/bindings/leds/leds-cpcap.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebf7cdc7f70c2d545db6669d5547c5128878251a --- /dev/null +++ b/Documentation/devicetree/bindings/leds/leds-cpcap.txt @@ -0,0 +1,29 @@ +Motorola CPCAP PMIC LEDs +------------------------ + +This module is part of the CPCAP. For more details about the whole +chip see Documentation/devicetree/bindings/mfd/motorola-cpcap.txt.
+ +Required node properties: +- compatible: should be one of + * "motorola,cpcap-led-mdl" (Main Display Lighting) + * "motorola,cpcap-led-kl" (Keyboard Lighting) + * "motorola,cpcap-led-adl" (Aux Display Lighting) + * "motorola,cpcap-led-red" (Red Triode) + * "motorola,cpcap-led-green" (Green Triode) + * "motorola,cpcap-led-blue" (Blue Triode) + * "motorola,cpcap-led-cf" (Camera Flash) + * "motorola,cpcap-led-bt" (Bluetooth) + * "motorola,cpcap-led-cp" (Camera Privacy LED) +- label: see Documentation/devicetree/bindings/leds/common.txt +- vdd-supply: A phandle to the regulator powering the LED + +Example: + +&cpcap { + cpcap_led_red: red-led { + compatible = "motorola,cpcap-led-red"; + label = "cpcap:red"; + vdd-supply = <&sw5>; + }; +};
diff --git a/Documentation/devicetree/bindings/leds/leds-mt6323.txt b/Documentation/devicetree/bindings/leds/leds-mt6323.txt new file mode 100644 index 0000000000000000000000000000000000000000..45bf9f7d85f37685932f558aa9807bf5cf7ac8c8 --- /dev/null +++ b/Documentation/devicetree/bindings/leds/leds-mt6323.txt @@ -0,0 +1,60 @@ +Device Tree Bindings for LED support on MT6323 PMIC + +The MT6323 LED controller is a subfunction of the MT6323 PMIC, so the LED +controllers are defined as subnodes of the MT6323 PMIC node, which is one kind +of Multi-Function Device (MFD) that uses a shared bus called the PMIC wrapper +to let each subfunction access the remote MT6323 PMIC hardware. + +For MT6323 MFD bindings see: +Documentation/devicetree/bindings/mfd/mt6397.txt +For MediaTek PMIC wrapper bindings see: +Documentation/devicetree/bindings/soc/mediatek/pwrap.txt + +Required properties: +- compatible : Must be "mediatek,mt6323-led" +- #address-cells : Must be 1 +- #size-cells : Must be 0 + +Each LED is represented as a child node of the mediatek,mt6323-led node that +describes the LED's initial behavior; currently at most four LED child nodes +are supported. + +Required properties for the LED child node: +- reg : LED channel number (0..3) + +Optional properties for the LED child node: +- label : See Documentation/devicetree/bindings/leds/common.txt +- linux,default-trigger : See Documentation/devicetree/bindings/leds/common.txt +- default-state: See Documentation/devicetree/bindings/leds/common.txt + +Example: + + mt6323: pmic { + compatible = "mediatek,mt6323"; + + ... + + mt6323led: leds { + compatible = "mediatek,mt6323-led"; + #address-cells = <1>; + #size-cells = <0>; + + led@0 { + reg = <0>; + label = "LED0"; + linux,default-trigger = "timer"; + default-state = "on"; + }; + led@1 { + reg = <1>; + label = "LED1"; + default-state = "off"; + }; + led@2 { + reg = <2>; + label = "LED2"; + default-state = "on"; + }; + }; + };
diff --git a/Documentation/devicetree/bindings/leds/leds-pca9532.txt b/Documentation/devicetree/bindings/leds/leds-pca9532.txt index 198f3ba0e01f822dc8c148158a01551205571072..f769c52e36439205572e33d74c4c9e0f9a09b808 100644 --- a/Documentation/devicetree/bindings/leds/leds-pca9532.txt +++ b/Documentation/devicetree/bindings/leds/leds-pca9532.txt @@ -17,6 +17,8 @@ Optional sub-node properties: - label: see Documentation/devicetree/bindings/leds/common.txt - type: Output configuration, see dt-bindings/leds/leds-pca9532.h (default NONE) - linux,default-trigger: see Documentation/devicetree/bindings/leds/common.txt + - default-state: see Documentation/devicetree/bindings/leds/common.txt + This property is only valid for sub-nodes of type <PCA9532_TYPE_LED>.
Example: #include <dt-bindings/leds/leds-pca9532.h> @@ -33,6 +35,14 @@ Example: label = "pca:green:power"; type = <PCA9532_TYPE_LED>; }; + kernel-booting { + type = <PCA9532_TYPE_LED>; + default-state = "on"; + }; + sys-stat { + type = <PCA9532_TYPE_LED>; + default-state = "keep"; // don't touch, was set by U-Boot + }; }; For more product information please see the link below:
diff --git a/Documentation/devicetree/bindings/mailbox/brcm,iproc-flexrm-mbox.txt b/Documentation/devicetree/bindings/mailbox/brcm,iproc-flexrm-mbox.txt new file mode 100644 index 0000000000000000000000000000000000000000..752ae6b00d26a7c39f914637d37149f93efb5b36 --- /dev/null +++ b/Documentation/devicetree/bindings/mailbox/brcm,iproc-flexrm-mbox.txt @@ -0,0 +1,59 @@ +Broadcom FlexRM Ring Manager +============================ +The Broadcom FlexRM ring manager provides a set of rings which can be +used to submit work to offload engines. An SoC may have multiple FlexRM +hardware blocks. There is one device tree entry per FlexRM block. The +FlexRM driver will create a mailbox-controller instance for given FlexRM +hardware block where each mailbox channel is a separate FlexRM ring. + +Required properties: +-------------------- +- compatible: Should be "brcm,iproc-flexrm-mbox" +- reg: Specifies base physical address and size of the FlexRM + ring registers +- msi-parent: Phandles (and potential Device IDs) to MSI controllers + The FlexRM engine will send MSIs (instead of wired + interrupts) to CPU. There is one MSI for each FlexRM ring. + Refer to devicetree/bindings/interrupt-controller/msi.txt +- #mbox-cells: Specifies the number of cells needed to encode a mailbox + channel. This should be 3. + + The 1st cell is the mailbox channel number. + + The 2nd cell contains MSI completion threshold. This is the + number of completion messages for which FlexRM will inject + one MSI interrupt to CPU. + + The 3rd cell contains MSI timer value representing time for + which FlexRM will wait to accumulate N completion messages + where N is the value specified by 2nd cell above. If FlexRM + does not get required number of completion messages in time + specified by this cell then it will inject one MSI interrupt + to CPU provided at least one completion message is available. + +Optional properties: +-------------------- +- dma-coherent: Present if DMA operations made by the FlexRM engine (such + as DMA descriptor access, access to buffers pointed by DMA + descriptors and read/write pointer updates to DDR) are + cache coherent with the CPU. + +Example: +-------- +crypto_mbox: mbox@67000000 { + compatible = "brcm,iproc-flexrm-mbox"; + reg = <0x67000000 0x200000>; + msi-parent = <&gic_its 0x7f00>; + #mbox-cells = <3>; +}; + +crypto@672c0000 { + compatible = "brcm,spu2-v2-crypto"; + reg = <0x672c0000 0x1000>; + mboxes = <&crypto_mbox 0 0x1 0xffff>, + <&crypto_mbox 1 0x1 0xffff>, + <&crypto_mbox 16 0x1 0xffff>, + <&crypto_mbox 17 0x1 0xffff>, + <&crypto_mbox 30 0x1 0xffff>, + <&crypto_mbox 31 0x1 0xffff>; +};
diff --git a/Documentation/devicetree/bindings/mailbox/brcm,iproc-pdc-mbox.txt b/Documentation/devicetree/bindings/mailbox/brcm,iproc-pdc-mbox.txt index 411ccf421584514b5d2128373115cd9443b6d3e8..0f3ee81d92c297022adcdbb1f9689f4b924ea143 100644 --- a/Documentation/devicetree/bindings/mailbox/brcm,iproc-pdc-mbox.txt +++ b/Documentation/devicetree/bindings/mailbox/brcm,iproc-pdc-mbox.txt @@ -1,9 +1,11 @@ The PDC driver manages data transfer to and from various offload engines on some Broadcom SoCs. An SoC may have multiple PDC hardware blocks. There is -one device tree entry per block.
On some chips, the PDC functionality is +handled by the FA2 (Northstar Plus). Required properties: -- compatible : Should be "brcm,iproc-pdc-mbox". +- compatible : Should be "brcm,iproc-pdc-mbox" or "brcm,iproc-fa2-mbox" for + FA2/Northstar Plus. - reg: Should contain PDC registers location and length. - interrupts: Should contain the IRQ line for the PDC. - #mbox-cells: 1 diff --git a/Documentation/devicetree/bindings/media/atmel-isi.txt b/Documentation/devicetree/bindings/media/atmel-isi.txt index 251f008f220cff32969b974f0b8ee8dd92320bd4..332513a151cc9d45ba299e4722cb19eb8a20a827 100644 --- a/Documentation/devicetree/bindings/media/atmel-isi.txt +++ b/Documentation/devicetree/bindings/media/atmel-isi.txt @@ -1,51 +1,66 @@ -Atmel Image Sensor Interface (ISI) SoC Camera Subsystem ----------------------------------------------- - -Required properties: -- compatible: must be "atmel,at91sam9g45-isi" -- reg: physical base address and length of the registers set for the device; -- interrupts: should contain IRQ line for the ISI; -- clocks: list of clock specifiers, corresponding to entries in - the clock-names property; -- clock-names: must contain "isi_clk", which is the isi peripherial clock. - -ISI supports a single port node with parallel bus. It should contain one +Atmel Image Sensor Interface (ISI) +---------------------------------- + +Required properties for ISI: +- compatible: must be "atmel,at91sam9g45-isi". +- reg: physical base address and length of the registers set for the device. +- interrupts: should contain IRQ line for the ISI. +- clocks: list of clock specifiers, corresponding to entries in the clock-names + property; please refer to clock-bindings.txt. +- clock-names: required elements: "isi_clk". +- pinctrl-names, pinctrl-0: please refer to pinctrl-bindings.txt. + +ISI supports a single port node with parallel bus. It shall contain one 'port' child node with child 'endpoint' node. Please refer to the bindings defined in Documentation/devicetree/bindings/media/video-interfaces.txt. -Example: - isi: isi@f0034000 { - compatible = "atmel,at91sam9g45-isi"; - reg = <0xf0034000 0x4000>; - interrupts = <37 IRQ_TYPE_LEVEL_HIGH 5>; - - clocks = <&isi_clk>; - clock-names = "isi_clk"; +Endpoint node properties +------------------------ - pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_isi>; +- bus-width: <8> or <10> (mandatory) +- hsync-active (default: active high) +- vsync-active (default: active high) +- pclk-sample (default: sample on falling edge) +- remote-endpoint: A phandle to the bus receiver's endpoint node (mandatory). 
- port { - #address-cells = <1>; - #size-cells = <0>; +Example: - isi_0: endpoint { - remote-endpoint = <&ov2640_0>; - bus-width = <8>; - }; +isi: isi@f0034000 { + compatible = "atmel,at91sam9g45-isi"; + reg = <0xf0034000 0x4000>; + interrupts = <37 IRQ_TYPE_LEVEL_HIGH 5>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_isi_data_0_7>; + clocks = <&isi_clk>; + clock-names = "isi_clk"; + port { + isi_0: endpoint { + remote-endpoint = <&ov2640_0>; + bus-width = <8>; + vsync-active = <1>; + hsync-active = <1>; }; }; +}; - i2c1: i2c@f0018000 { - ov2640: camera@0x30 { - compatible = "ovti,ov2640"; - reg = <0x30>; +i2c1: i2c@f0018000 { + ov2640: camera@30 { + compatible = "ovti,ov2640"; + reg = <0x30>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_pck0_as_isi_mck &pinctrl_sensor_power &pinctrl_sensor_reset>; + resetb-gpios = <&pioE 11 GPIO_ACTIVE_LOW>; + pwdn-gpios = <&pioE 13 GPIO_ACTIVE_HIGH>; + clocks = <&pck0>; + clock-names = "xvclk"; + assigned-clocks = <&pck0>; + assigned-clock-rates = <25000000>; - port { - ov2640_0: endpoint { - remote-endpoint = <&isi_0>; - bus-width = <8>; - }; + port { + ov2640_0: endpoint { + remote-endpoint = <&isi_0>; + bus-width = <8>; }; }; }; +};
diff --git a/Documentation/devicetree/bindings/media/i2c/ov2640.txt b/Documentation/devicetree/bindings/media/i2c/ov2640.txt index c429b5bdcaa05ef7486965814b5e55820f9d2dc3..989ce6cb6ac30489df01c8008ef0181e5e7bc637 100644 --- a/Documentation/devicetree/bindings/media/i2c/ov2640.txt +++ b/Documentation/devicetree/bindings/media/i2c/ov2640.txt @@ -1,8 +1,8 @@ * Omnivision OV2640 CMOS sensor -The Omnivision OV2640 sensor support multiple resolutions output, such as -CIF, SVGA, UXGA. It also can support YUV422/420, RGB565/555 or raw RGB -output format. +The Omnivision OV2640 sensor supports multiple output resolutions, such as +CIF, SVGA and UXGA. It can also support the YUV422/420, RGB565/555 or raw RGB +output formats. Required Properties: - compatible: should be "ovti,ov2640" @@ -20,26 +20,21 @@ Documentation/devicetree/bindings/media/video-interfaces.txt. Example: i2c1: i2c@f0018000 { - ov2640: camera@0x30 { + ov2640: camera@30 { compatible = "ovti,ov2640"; reg = <0x30>; - pinctrl-names = "default"; - pinctrl-0 = <&pinctrl_pck1 &pinctrl_ov2640_pwdn &pinctrl_ov2640_resetb>; - - resetb-gpios = <&pioE 24 GPIO_ACTIVE_LOW>; - pwdn-gpios = <&pioE 29 GPIO_ACTIVE_HIGH>; - - clocks = <&pck1>; + pinctrl-0 = <&pinctrl_pck0_as_isi_mck &pinctrl_sensor_power &pinctrl_sensor_reset>; + resetb-gpios = <&pioE 11 GPIO_ACTIVE_LOW>; + pwdn-gpios = <&pioE 13 GPIO_ACTIVE_HIGH>; + clocks = <&pck0>; clock-names = "xvclk"; - - assigned-clocks = <&pck1>; + assigned-clocks = <&pck0>; assigned-clock-rates = <25000000>; port { ov2640_0: endpoint { remote-endpoint = <&isi_0>; - bus-width = <8>; }; }; };
diff --git a/Documentation/devicetree/bindings/media/i2c/ov5645.txt b/Documentation/devicetree/bindings/media/i2c/ov5645.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd7aec9f8e247a1a507991aba3544713cd6b06f2 --- /dev/null +++ b/Documentation/devicetree/bindings/media/i2c/ov5645.txt @@ -0,0 +1,54 @@ +* Omnivision 1/4-Inch 5Mp CMOS Digital Image Sensor + +The Omnivision OV5645 is a 1/4-Inch CMOS active pixel digital image sensor with +an active array size of 2592H x 1944V. It is programmable through a serial I2C +interface. + +Required Properties: +- compatible: Value should be "ovti,ov5645". +- clocks: Reference to the xclk clock. +- clock-names: Should be "xclk". +- clock-frequency: Frequency of the xclk clock.
+- enable-gpios: Chip enable GPIO. Polarity is GPIO_ACTIVE_HIGH. This corresponds + to the hardware pin PWDNB which is physically active low. +- reset-gpios: Chip reset GPIO. Polarity is GPIO_ACTIVE_LOW. This corresponds to + the hardware pin RESETB. +- vdddo-supply: Chip digital IO regulator. +- vdda-supply: Chip analog regulator. +- vddd-supply: Chip digital core regulator. + +The device node must contain one 'port' child node for its digital output +video port, in accordance with the video interface bindings defined in +Documentation/devicetree/bindings/media/video-interfaces.txt. + +Example: + + &i2c1 { + ... + + ov5645: ov5645@78 { + compatible = "ovti,ov5645"; + reg = <0x78>; + + enable-gpios = <&gpio1 6 GPIO_ACTIVE_HIGH>; + reset-gpios = <&gpio5 20 GPIO_ACTIVE_LOW>; + pinctrl-names = "default"; + pinctrl-0 = <&camera_rear_default>; + + clocks = <&clks 200>; + clock-names = "xclk"; + clock-frequency = <23880000>; + + vdddo-supply = <&camera_dovdd_1v8>; + vdda-supply = <&camera_avdd_2v8>; + vddd-supply = <&camera_dvdd_1v2>; + + port { + ov5645_ep: endpoint { + clock-lanes = <1>; + data-lanes = <0 2>; + remote-endpoint = <&csi0_ep>; + }; + }; + }; + };
diff --git a/Documentation/devicetree/bindings/media/i2c/ov5647.txt b/Documentation/devicetree/bindings/media/i2c/ov5647.txt new file mode 100644 index 0000000000000000000000000000000000000000..22e44945b66108d20eb283cb00f25b6af1a47a34 --- /dev/null +++ b/Documentation/devicetree/bindings/media/i2c/ov5647.txt @@ -0,0 +1,35 @@ +Omnivision OV5647 raw image sensor +--------------------------------- + +OV5647 is a raw image sensor with MIPI CSI-2 and CCP2 image data interfaces +and CCI (I2C compatible) control bus. + +Required properties: + +- compatible : "ovti,ov5647". +- reg : I2C slave address of the sensor. +- clocks : Reference to the xclk clock. + +The common video interfaces bindings (see video-interfaces.txt) should be +used to specify link to the image data receiver. The OV5647 device +node should contain one 'port' child node with an 'endpoint' subnode. + +Endpoint node mandatory properties: + +- remote-endpoint: A phandle to the bus receiver's endpoint node. + +Example: + + i2c@2000 { + ... + ov: camera@36 { + compatible = "ovti,ov5647"; + reg = <0x36>; + clocks = <&camera_clk>; + port { + camera_1: endpoint { + remote-endpoint = <&csi1_ep1>; + }; + }; + }; + };
diff --git a/Documentation/devicetree/bindings/media/i2c/ov7670.txt b/Documentation/devicetree/bindings/media/i2c/ov7670.txt new file mode 100644 index 0000000000000000000000000000000000000000..826b6563b009ce7508d2598323f0648a0a8e3d66 --- /dev/null +++ b/Documentation/devicetree/bindings/media/i2c/ov7670.txt @@ -0,0 +1,43 @@ +* Omnivision OV7670 CMOS sensor + +The Omnivision OV7670 sensor supports multiple output resolutions, such as +CIF, SVGA and UXGA. It can also support the YUV422/420, RGB565/555 or raw RGB +output formats. + +Required Properties: +- compatible: should be "ovti,ov7670" +- clocks: reference to the xclk input clock. +- clock-names: should be "xclk". + +Optional Properties: +- reset-gpios: reference to the GPIO connected to the resetb pin, if any. + Active low. +- powerdown-gpios: reference to the GPIO connected to the pwdn pin, if any. + Active high. + +The device node must contain one 'port' child node for its digital output +video port, in accordance with the video interface bindings defined in +Documentation/devicetree/bindings/media/video-interfaces.txt.
+ +Example: + + i2c1: i2c@f0018000 { + ov7670: camera@21 { + compatible = "ovti,ov7670"; + reg = <0x21>; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_pck0_as_isi_mck &pinctrl_sensor_power &pinctrl_sensor_reset>; + reset-gpios = <&pioE 11 GPIO_ACTIVE_LOW>; + powerdown-gpios = <&pioE 13 GPIO_ACTIVE_HIGH>; + clocks = <&pck0>; + clock-names = "xclk"; + assigned-clocks = <&pck0>; + assigned-clock-rates = <25000000>; + + port { + ov7670_0: endpoint { + remote-endpoint = <&isi_0>; + }; + }; + }; + };
diff --git a/Documentation/devicetree/bindings/media/mediatek-jpeg-decoder.txt b/Documentation/devicetree/bindings/media/mediatek-jpeg-decoder.txt new file mode 100644 index 0000000000000000000000000000000000000000..3813947b4d4f02bfd27ccaf126f64e0ecb933ce2 --- /dev/null +++ b/Documentation/devicetree/bindings/media/mediatek-jpeg-decoder.txt @@ -0,0 +1,37 @@ +* Mediatek JPEG Decoder + +Mediatek JPEG Decoder is the JPEG decode hardware present in Mediatek SoCs + +Required properties: +- compatible : must be one of the following strings: + "mediatek,mt8173-jpgdec" + "mediatek,mt2701-jpgdec" +- reg : physical base address of the jpeg decoder registers and length of + memory mapped region. +- interrupts : interrupt number to the interrupt controller. +- clocks: device clocks, see + Documentation/devicetree/bindings/clock/clock-bindings.txt for details. +- clock-names: must contain "jpgdec-smi" and "jpgdec". +- power-domains: a phandle to the power domain, see + Documentation/devicetree/bindings/power/power_domain.txt for details. +- mediatek,larb: must contain the local arbiters in the current SoCs, see + Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.txt + for details. +- iommus: should point to the respective IOMMU block with master port as + argument, see Documentation/devicetree/bindings/iommu/mediatek,iommu.txt + for details. + +Example: + jpegdec: jpegdec@15004000 { + compatible = "mediatek,mt2701-jpgdec"; + reg = <0 0x15004000 0 0x1000>; + interrupts = ; + clocks = <&imgsys CLK_IMG_JPGDEC_SMI>, + <&imgsys CLK_IMG_JPGDEC>; + clock-names = "jpgdec-smi", + "jpgdec"; + power-domains = <&scpsys MT2701_POWER_DOMAIN_ISP>; + mediatek,larb = <&larb2>; + iommus = <&iommu MT2701_M4U_PORT_JPGDEC_WDMA>, + <&iommu MT2701_M4U_PORT_JPGDEC_BSDMA>; + };
diff --git a/Documentation/devicetree/bindings/media/s5p-cec.txt b/Documentation/devicetree/bindings/media/s5p-cec.txt index 925ab4d72eaab342f8f197b131c7d7ddcf6219de..4bb08d9d940be0473d937ad7c7c97d87c5d517d9 100644 --- a/Documentation/devicetree/bindings/media/s5p-cec.txt +++ b/Documentation/devicetree/bindings/media/s5p-cec.txt @@ -15,6 +15,7 @@ Required properties: - clock-names : from common clock binding: must contain "hdmicec", corresponding to entry in the clocks property.
- samsung,syscon-phandle - phandle to the PMU system controller + - hdmi-phandle - phandle to the HDMI controller Example: @@ -25,6 +26,7 @@ hdmicec: cec@100B0000 { clocks = <&clock CLK_HDMI_CEC>; clock-names = "hdmicec"; samsung,syscon-phandle = <&pmu_system_controller>; + hdmi-phandle = <&hdmi>; pinctrl-names = "default"; pinctrl-0 = <&hdmi_cec>; status = "okay";
diff --git a/Documentation/devicetree/bindings/media/s5p-mfc.txt b/Documentation/devicetree/bindings/media/s5p-mfc.txt index 2c901286d8185a0999a4ef6097bc79f905eab748..d3404b5d4d1795034342ae8640c5d9f9648591ae 100644 --- a/Documentation/devicetree/bindings/media/s5p-mfc.txt +++ b/Documentation/devicetree/bindings/media/s5p-mfc.txt @@ -28,7 +28,7 @@ Optional properties: - memory-region : from reserved memory binding: phandles to two reserved memory regions, first is for "left" mfc memory bus interfaces, second is for the "right" mfc memory bus, used when no SYSMMU - support is available + support is available; used only by MFC v5 present in Exynos4 SoCs Obsolete properties: - samsung,mfc-r, samsung,mfc-l : support removed, please use memory-region
diff --git a/Documentation/devicetree/bindings/media/stih-cec.txt b/Documentation/devicetree/bindings/media/stih-cec.txt index 71c4b2f4bcef1f0a8a31c3088250efe45677226d..289a08b336513db88957263aa6777d2b4372647e 100644 --- a/Documentation/devicetree/bindings/media/stih-cec.txt +++ b/Documentation/devicetree/bindings/media/stih-cec.txt @@ -9,6 +9,7 @@ Required properties: - pinctrl-names: Contains only one value - "default" - pinctrl-0: Specifies the pin control groups used for CEC hardware. - resets: Reference to a reset controller + - hdmi-phandle: Phandle to the HDMI controller Example for STIH407: @@ -22,4 +23,5 @@ sti-cec@094a087c { pinctrl-names = "default"; pinctrl-0 = <&pinctrl_cec0_default>; resets = <&softreset STIH407_LPM_SOFTRESET>; + hdmi-phandle = <&hdmi>; };
diff --git a/Documentation/devicetree/bindings/media/ti,da850-vpif.txt b/Documentation/devicetree/bindings/media/ti,da850-vpif.txt index 6d25d7f23d2621c13d4556a1f3e877bddac1e007..df7182a63e59aef757c58fd930279a400f7fb210 100644 --- a/Documentation/devicetree/bindings/media/ti,da850-vpif.txt +++ b/Documentation/devicetree/bindings/media/ti,da850-vpif.txt @@ -16,8 +16,10 @@ Required properties: Video Capture: VPIF has a 16-bit parallel bus input, supporting 2 8-bit channels or a -single 16-bit channel. It should contain at least one port child node -with child 'endpoint' node. Please refer to the bindings defined in +single 16-bit channel. It should contain one or two port child nodes +with child 'endpoint' node. If there are two ports then port@0 must +describe the input channels and port@1 the output channels. Please refer to the +bindings defined in Documentation/devicetree/bindings/media/video-interfaces.txt.
Example using 2 8-bit input channels, one of which is connected to an @@ -28,17 +30,24 @@ I2C-connected TVP5147 decoder: reg = <0x217000 0x1000>; interrupts = <92>; - port { - vpif_ch0: endpoint@0 { - reg = <0>; - bus-width = <8>; - remote-endpoint = <&composite>; + port@0 { + vpif_input_ch0: endpoint@0 { + reg = <0>; + bus-width = <8>; + remote-endpoint = <&composite_in>; + }; + + vpif_input_ch1: endpoint@1 { + reg = <1>; + bus-width = <8>; + data-shift = <8>; }; + }; - vpif_ch1: endpoint@1 { - reg = <1>; - bus-width = <8>; - data-shift = <8>; + port@1 { + vpif_output_ch0: endpoint { + bus-width = <8>; + remote-endpoint = <&composite_out>; }; }; }; @@ -53,13 +62,28 @@ I2C-connected TVP5147 decoder: status = "okay"; port { - composite: endpoint { + composite_in: endpoint { hsync-active = <1>; vsync-active = <1>; pclk-sample = <0>; /* VPIF channel 0 (lower 8-bits) */ - remote-endpoint = <&vpif_ch0>; + remote-endpoint = <&vpif_input_ch0>; + bus-width = <8>; + }; + }; + }; + + adv7343@2a { + compatible = "adi,adv7343"; + reg = <0x2a>; + + port { + composite_out: endpoint { + adi,dac-enable = <1 1 1>; + adi,sd-dac-enable = <1>; + + remote-endpoint = <&vpif_output_ch0>; bus-width = <8>; }; }; diff --git a/Documentation/devicetree/bindings/mfd/altera-a10sr.txt b/Documentation/devicetree/bindings/mfd/altera-a10sr.txt index ea151f295ad71de2f7c5de2efbe6270c6c651fff..c8a736554b4b3494220293e611eeaa33653da7f7 100644 --- a/Documentation/devicetree/bindings/mfd/altera-a10sr.txt +++ b/Documentation/devicetree/bindings/mfd/altera-a10sr.txt @@ -18,6 +18,7 @@ The A10SR consists of these sub-devices: Device Description ------ ---------- a10sr_gpio GPIO Controller +a10sr_rst Reset Controller Arria10 GPIO Required Properties: @@ -27,6 +28,11 @@ Required Properties: the second cell is used to specify flags. See ../gpio/gpio.txt for more information. +Arria10 Peripheral PHY Reset +Required Properties: +- compatible : Should be "altr,a10sr-reset" +- #reset-cells : Should be one. 
+ Example: resource-manager@0 { @@ -43,4 +49,9 @@ Example: gpio-controller; #gpio-cells = <2>; }; + + a10sr_rst: reset-controller { + compatible = "altr,a10sr-reset"; + #reset-cells = <1>; + }; };
diff --git a/Documentation/devicetree/bindings/mfd/atmel-hlcdc.txt b/Documentation/devicetree/bindings/mfd/atmel-hlcdc.txt index 670831b29565369279960e9eddb2e14b4267ccdd..eec40be7f79a007005a2e835c1d73e59298a02bc 100644 --- a/Documentation/devicetree/bindings/mfd/atmel-hlcdc.txt +++ b/Documentation/devicetree/bindings/mfd/atmel-hlcdc.txt @@ -15,7 +15,7 @@ Required properties: The HLCDC IP exposes two subdevices: - a PWM chip: see ../pwm/atmel-hlcdc-pwm.txt - - a Display Controller: see ../display/atmel-hlcdc-dc.txt + - a Display Controller: see ../display/atmel/hlcdc-dc.txt Example:
diff --git a/Documentation/devicetree/bindings/mfd/axp20x.txt b/Documentation/devicetree/bindings/mfd/axp20x.txt index 8f3ad9ab46372bd36631db5701589efb04d88e87..aca09af665142c21f9512136817fd2d8f856f490 100644 --- a/Documentation/devicetree/bindings/mfd/axp20x.txt +++ b/Documentation/devicetree/bindings/mfd/axp20x.txt @@ -6,12 +6,19 @@ axp202 (X-Powers) axp209 (X-Powers) axp221 (X-Powers) axp223 (X-Powers) +axp803 (X-Powers) axp809 (X-Powers) Required properties: -- compatible: "x-powers,axp152", "x-powers,axp202", "x-powers,axp209", - "x-powers,axp221", "x-powers,axp223", "x-powers,axp806", - "x-powers,axp809" +- compatible: should be one of: + * "x-powers,axp152" + * "x-powers,axp202" + * "x-powers,axp209" + * "x-powers,axp221" + * "x-powers,axp223" + * "x-powers,axp803" + * "x-powers,axp806" + * "x-powers,axp809" - reg: The I2C slave address or RSB hardware address for the AXP chip - interrupt-parent: The parent interrupt controller - interrupts: SoC NMI / GPIO interrupt connected to the PMIC's IRQ pin @@ -28,6 +35,9 @@ Optional properties: regulator to drive the OTG VBus, rather than as an input pin which signals whether the board is driving OTG VBus or not. +- x-powers,master-mode: Boolean (axp806 only). Set this when the PMIC is + wired for master mode. The default is slave mode. + - <input>-supply: a phandle to the regulator supply node. May be omitted if inputs are unregulated, such as using the IPSOUT output from the PMIC.
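As an illustrative sketch (not taken from this patch), an AXP806 wired for master mode would simply add the new boolean; the RSB address and interrupt wiring below are assumptions for the example:

	axp806: pmic@3a3 {
		compatible = "x-powers,axp806";
		reg = <0x3a3>;
		/* assumed interrupt wiring for this board sketch */
		interrupt-parent = <&r_intc>;
		interrupts = <0 IRQ_TYPE_LEVEL_LOW>;
		interrupt-controller;
		#interrupt-cells = <1>;
		x-powers,master-mode;
	};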
@@ -86,6 +96,33 @@ LDO_IO1 : LDO : ips-supply : GPIO 1 RTC_LDO : LDO : ips-supply : always on DRIVEVBUS : Enable output : drivevbus-supply : external regulator +AXP803 regulators, type, and corresponding input supply names: + +Regulator Type Supply Name Notes +--------- ---- ----------- ----- +DCDC1 : DC-DC buck : vin1-supply +DCDC2 : DC-DC buck : vin2-supply : poly-phase capable +DCDC3 : DC-DC buck : vin3-supply : poly-phase capable +DCDC4 : DC-DC buck : vin4-supply +DCDC5 : DC-DC buck : vin5-supply : poly-phase capable +DCDC6 : DC-DC buck : vin6-supply : poly-phase capable +DC1SW : On/Off Switch : : DCDC1 secondary output +ALDO1 : LDO : aldoin-supply : shared supply +ALDO2 : LDO : aldoin-supply : shared supply +ALDO3 : LDO : aldoin-supply : shared supply +DLDO1 : LDO : dldoin-supply : shared supply +DLDO2 : LDO : dldoin-supply : shared supply +DLDO3 : LDO : dldoin-supply : shared supply +DLDO4 : LDO : dldoin-supply : shared supply +ELDO1 : LDO : eldoin-supply : shared supply +ELDO2 : LDO : eldoin-supply : shared supply +ELDO3 : LDO : eldoin-supply : shared supply +FLDO1 : LDO : fldoin-supply : shared supply +FLDO2 : LDO : fldoin-supply : shared supply +LDO_IO0 : LDO : ips-supply : GPIO 0 +LDO_IO1 : LDO : ips-supply : GPIO 1 +RTC_LDO : LDO : ips-supply : always on + AXP806 regulators, type, and corresponding input supply names: Regulator Type Supply Name Notes diff --git a/Documentation/devicetree/bindings/mfd/da9062.txt b/Documentation/devicetree/bindings/mfd/da9062.txt index 38802b54d48a4aef15a33c632050549fb9eb3707..c0a418c27e9d94796cb1d3b0a1d15845432ceb93 100644 --- a/Documentation/devicetree/bindings/mfd/da9062.txt +++ b/Documentation/devicetree/bindings/mfd/da9062.txt @@ -1,22 +1,39 @@ * Dialog DA9062 Power Management Integrated Circuit (PMIC) -DA9062 consists of a large and varied group of sub-devices: +Product information for the DA9062 and DA9061 devices can be found here: +- http://www.dialog-semiconductor.com/products/da9062 +- http://www.dialog-semiconductor.com/products/da9061 + +The DA9062 PMIC consists of: Device Supply Names Description ------ ------------ ----------- da9062-regulator : : LDOs & BUCKs da9062-rtc : : Real-Time Clock +da9062-onkey : : On Key +da9062-watchdog : : Watchdog Timer +da9062-thermal : : Thermal + +The DA9061 PMIC consists of: + +Device Supply Names Description +------ ------------ ----------- +da9062-regulator : : LDOs & BUCKs +da9062-onkey : : On Key da9062-watchdog : : Watchdog Timer +da9062-thermal : : Thermal ====== Required properties: -- compatible : Should be "dlg,da9062". +- compatible : Should be + "dlg,da9062" for DA9062 + "dlg,da9061" for DA9061 - reg : Specifies the I2C slave address (this defaults to 0x58 but it can be modified to match the chip's OTP settings). - interrupt-parent : Specifies the reference to the interrupt controller for - the DA9062. + the DA9062 or DA9061. - interrupts : IRQ line information. - interrupt-controller @@ -25,8 +42,8 @@ further information on IRQ bindings. Sub-nodes: -- regulators : This node defines the settings for the LDOs and BUCKs. The - DA9062 regulators are bound using their names listed below: +- regulators : This node defines the settings for the LDOs and BUCKs. 
+ The DA9062 regulators are bound using their names listed below: buck1 : BUCK_1 buck2 : BUCK_2 @@ -37,19 +54,29 @@ Sub-nodes: ldo3 : LDO_3 ldo4 : LDO_4 + The DA9061 regulators are bound using their names listed below: + + buck1 : BUCK_1 + buck2 : BUCK_2 + buck3 : BUCK_3 + ldo1 : LDO_1 + ldo2 : LDO_2 + ldo3 : LDO_3 + ldo4 : LDO_4 + The component follows the standard regulator framework and the binding details of the individual regulator devices can be found in: Documentation/devicetree/bindings/regulator/regulator.txt - - rtc : This node defines settings required for the Real-Time Clock associated with the DA9062. There are currently no entries in this binding, however compatible = "dlg,da9062-rtc" should be added if a node is created. -- watchdog: This node defines the settings for the watchdog driver associated - with the DA9062 PMIC. The compatible = "dlg,da9062-watchdog" should be added - if a node is created. +- onkey : See ../input/da9062-onkey.txt + +- watchdog: See ../watchdog/da9062-watchdog.txt +- thermal : See ../thermal/da9062-thermal.txt Example: @@ -64,10 +91,6 @@ Example: compatible = "dlg,da9062-rtc"; }; - watchdog { - compatible = "dlg,da9062-watchdog"; - }; - regulators { DA9062_BUCK1: buck1 { regulator-name = "BUCK1";
diff --git a/Documentation/devicetree/bindings/mfd/mt6397.txt b/Documentation/devicetree/bindings/mfd/mt6397.txt index c568d52af5afc094c107f418859ae092fcbe6b0f..522a3bbf1bac0482018970f966a06b58aa43faf5 100644 --- a/Documentation/devicetree/bindings/mfd/mt6397.txt +++ b/Documentation/devicetree/bindings/mfd/mt6397.txt @@ -6,6 +6,7 @@ MT6397/MT6323 is a multifunction device with the following sub modules: - Audio codec - GPIO - Clock +- LED It is interfaced to host controller using SPI interface by a proprietary hardware called PMIC wrapper or pwrap. MT6397/MT6323 MFD is a child device of pwrap.
diff --git a/Documentation/devicetree/bindings/iio/adc/mxs-lradc.txt b/Documentation/devicetree/bindings/mfd/mxs-lradc.txt similarity index 100% rename from Documentation/devicetree/bindings/iio/adc/mxs-lradc.txt rename to Documentation/devicetree/bindings/mfd/mxs-lradc.txt
diff --git a/Documentation/devicetree/bindings/mfd/samsung,exynos5433-lpass.txt b/Documentation/devicetree/bindings/mfd/samsung,exynos5433-lpass.txt index c110e118b79feeb5745b9e57c0eee7c99585289b..df664018c148edf237fcc89e0a85b68c94c299e7 100644 --- a/Documentation/devicetree/bindings/mfd/samsung,exynos5433-lpass.txt +++ b/Documentation/devicetree/bindings/mfd/samsung,exynos5433-lpass.txt @@ -5,7 +5,10 @@ Required properties: - compatible : "samsung,exynos5433-lpass" - reg : should contain the LPASS top SFR region location and size - - samsung,pmu-syscon : the phandle to the Power Management Unit node + - clock-names : should contain the following required clocks: "sfr0_ctrl" + - clocks : should contain clock specifiers of all clocks whose + input names have been specified in the clock-names + property, in the same order.
- #address-cells : should be 1 - #size-cells : should be 1 - ranges : must be present @@ -25,7 +28,8 @@ Example: audio-subsystem { compatible = "samsung,exynos5433-lpass"; reg = <0x11400000 0x100>, <0x11500000 0x08>; - samsung,pmu-syscon = <&pmu_system_controller>; + clocks = <&cmu_aud CLK_PCLK_SFR0_CTRL>; + clock-names = "sfr0_ctrl"; #address-cells = <1>; #size-cells = <1>; ranges; diff --git a/Documentation/devicetree/bindings/mfd/sun4i-gpadc.txt b/Documentation/devicetree/bindings/mfd/sun4i-gpadc.txt new file mode 100644 index 0000000000000000000000000000000000000000..badff3611a986e9d726a4e9851b840b9ac413806 --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/sun4i-gpadc.txt @@ -0,0 +1,59 @@ +Allwinner SoCs' GPADC Device Tree bindings +------------------------------------------ +The Allwinner SoCs all have an ADC that can also act as a thermal sensor +and sometimes as a touchscreen controller. + +Required properties: + - compatible: "allwinner,sun8i-a33-ths", + - reg: mmio address range of the chip, + - #thermal-sensor-cells: shall be 0, + - #io-channel-cells: shall be 0, + +Example: + ths: ths@01c25000 { + compatible = "allwinner,sun8i-a33-ths"; + reg = <0x01c25000 0x100>; + #thermal-sensor-cells = <0>; + #io-channel-cells = <0>; + }; + +sun4i, sun5i and sun6i SoCs are also supported via the older binding: + +sun4i resistive touchscreen controller +-------------------------------------- + +Required properties: + - compatible: "allwinner,sun4i-a10-ts", "allwinner,sun5i-a13-ts" or + "allwinner,sun6i-a31-ts" + - reg: mmio address range of the chip + - interrupts: interrupt to which the chip is connected + - #thermal-sensor-cells: shall be 0 + +Optional properties: + - allwinner,ts-attached : boolean indicating that an actual touchscreen + is attached to the controller + - allwinner,tp-sensitive-adjust : integer (4 bits) + adjust sensitivity of pen down detection + between 0 (least sensitive) and 15 + (defaults to 15) + - allwinner,filter-type : integer (2 bits) + select median and averaging filter + samples used for median / averaging filter + 0: 4/2 + 1: 5/3 + 2: 8/4 + 3: 16/8 + (defaults to 1) + +Example: + + rtp: rtp@01c25000 { + compatible = "allwinner,sun4i-a10-ts"; + reg = <0x01c25000 0x100>; + interrupts = <29>; + allwinner,ts-attached; + #thermal-sensor-cells = <0>; + /* sensitive/noisy touch panel */ + allwinner,tp-sensitive-adjust = <0>; + allwinner,filter-type = <3>; + }; diff --git a/Documentation/devicetree/bindings/mfd/ti-lmu.txt b/Documentation/devicetree/bindings/mfd/ti-lmu.txt new file mode 100644 index 0000000000000000000000000000000000000000..c885cf89b8ce83ce29b9d603011859f5962493a4 --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/ti-lmu.txt @@ -0,0 +1,243 @@ +TI LMU (Lighting Management Unit) device tree bindings + +TI LMU driver supports lighting devices below. + + Name Child nodes + ------ --------------------------------- + LM3532 Backlight + LM3631 Backlight and regulator + LM3632 Backlight and regulator + LM3633 Backlight, LED and fault monitor + LM3695 Backlight + LM3697 Backlight and fault monitor + +Required properties: + - compatible: Should be one of: + "ti,lm3532" + "ti,lm3631" + "ti,lm3632" + "ti,lm3633" + "ti,lm3695" + "ti,lm3697" + - reg: I2C slave address. + 0x11 for LM3632 + 0x29 for LM3631 + 0x36 for LM3633, LM3697 + 0x38 for LM3532 + 0x63 for LM3695 + +Optional property: + - enable-gpios: A GPIO specifier for hardware enable pin. + +Required node: + - backlight: All LMU devices have backlight child nodes. 
+ For the properties, please refer to [1]. + +Optional nodes: + - fault-monitor: Hardware fault monitoring driver for LM3633 and LM3697. + Required properties: + - compatible: Should be one of: + "ti,lm3633-fault-monitor" + "ti,lm3697-fault-monitor" + - leds: LED properties for LM3633. Please refer to [2]. + - regulators: Regulator properties for LM3631 and LM3632. + Please refer to [3]. + +[1] ../leds/backlight/ti-lmu-backlight.txt +[2] ../leds/leds-lm3633.txt +[3] ../regulator/lm363x-regulator.txt + +lm3532@38 { + compatible = "ti,lm3532"; + reg = <0x38>; + + enable-gpios = <&pioC 2 GPIO_ACTIVE_HIGH>; + + backlight { + compatible = "ti,lm3532-backlight"; + + lcd { + led-sources = <0 1 2>; + ramp-up-msec = <30>; + ramp-down-msec = <0>; + }; + }; +}; + +lm3631@29 { + compatible = "ti,lm3631"; + reg = <0x29>; + + regulators { + compatible = "ti,lm363x-regulator"; + + vboost { + regulator-name = "lcd_boost"; + regulator-min-microvolt = <4500000>; + regulator-max-microvolt = <6350000>; + regulator-always-on; + }; + + vcont { + regulator-name = "lcd_vcont"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + }; + + voref { + regulator-name = "lcd_voref"; + regulator-min-microvolt = <4000000>; + regulator-max-microvolt = <6000000>; + }; + + vpos { + regulator-name = "lcd_vpos"; + regulator-min-microvolt = <4000000>; + regulator-max-microvolt = <6000000>; + regulator-boot-on; + }; + + vneg { + regulator-name = "lcd_vneg"; + regulator-min-microvolt = <4000000>; + regulator-max-microvolt = <6000000>; + regulator-boot-on; + }; + }; + + backlight { + compatible = "ti,lm3631-backlight"; + + lcd_bl { + led-sources = <0 1>; + ramp-up-msec = <300>; + }; + }; +}; + +lm3632@11 { + compatible = "ti,lm3632"; + reg = <0x11>; + + enable-gpios = <&pioC 2 GPIO_ACTIVE_HIGH>; /* PC2 */ + + regulators { + compatible = "ti,lm363x-regulator"; + + ti,lcm-en1-gpio = <&pioC 0 GPIO_ACTIVE_HIGH>; /* PC0 */ + ti,lcm-en2-gpio = <&pioC 1 GPIO_ACTIVE_HIGH>; /* PC1 */ + + vboost { + regulator-name = "lcd_boost"; + regulator-min-microvolt = <4500000>; + regulator-max-microvolt = <6400000>; + regulator-always-on; + }; + + vpos { + regulator-name = "lcd_vpos"; + regulator-min-microvolt = <4000000>; + regulator-max-microvolt = <6000000>; + }; + + vneg { + regulator-name = "lcd_vneg"; + regulator-min-microvolt = <4000000>; + regulator-max-microvolt = <6000000>; + }; + }; + + backlight { + compatible = "ti,lm3632-backlight"; + + pwms = <&pwm0 0 10000 0>; /* pwm number, period, polarity */ + pwm-names = "lmu-backlight"; + + lcd { + led-sources = <0 1>; + pwm-period = <10000>; + }; + }; +}; + +lm3633@36 { + compatible = "ti,lm3633"; + reg = <0x36>; + + enable-gpios = <&pioC 2 GPIO_ACTIVE_HIGH>; + + backlight { + compatible = "ti,lm3633-backlight"; + + main { + label = "main_lcd"; + led-sources = <1 2>; + ramp-up-msec = <500>; + ramp-down-msec = <500>; + }; + + front { + label = "front_lcd"; + led-sources = <0>; + ramp-up-msec = <1000>; + ramp-down-msec = <0>; + }; + }; + + leds { + compatible = "ti,lm3633-leds"; + + chan1 { + label = "status"; + led-sources = <1>; + led-max-microamp = <6000>; + }; + + chan345 { + label = "rgb"; + led-sources = <3 4 5>; + led-max-microamp = <10000>; + }; + }; + + fault-monitor { + compatible = "ti,lm3633-fault-monitor"; + }; +}; + +lm3695@63 { + compatible = "ti,lm3695"; + reg = <0x63>; + + enable-gpios = <&pioC 2 GPIO_ACTIVE_HIGH>; + + backlight { + compatible = "ti,lm3695-backlight"; + + lcd { + label = "bl"; + led-sources = <0 1>; + }; + }; +}; + +lm3697@36 { + 
compatible = "ti,lm3697"; + reg = <0x36>; + + enable-gpios = <&pioC 2 GPIO_ACTIVE_HIGH>; + + backlight { + compatible = "ti,lm3697-backlight"; + + lcd { + led-sources = <0 1 2>; + ramp-up-msec = <200>; + ramp-down-msec = <200>; + }; + }; + + fault-monitor { + compatible = "ti,lm3697-fault-monitor"; + }; +}; diff --git a/Documentation/devicetree/bindings/mfd/wm831x.txt b/Documentation/devicetree/bindings/mfd/wm831x.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f8b7430673c922b2d7f857a8c558e8345a2e831 --- /dev/null +++ b/Documentation/devicetree/bindings/mfd/wm831x.txt @@ -0,0 +1,81 @@ +Cirrus Logic/Wolfson Microelectronics wm831x PMICs + +System PMICs with a wide range of additional features. + +Required properties: + + - compatible : One of the following chip-specific strings: + "wlf,wm8310" + "wlf,wm8311" + "wlf,wm8312" + "wlf,wm8320" + "wlf,wm8321" + "wlf,wm8325" + "wlf,wm8326" + + - reg : I2C slave address when connected using I2C, chip select number + when using SPI. + + - gpio-controller : Indicates this device is a GPIO controller. + - #gpio-cells : Must be 2. The first cell is the pin number and the + second cell is used to specify optional parameters (currently unused). + + - interrupts : The interrupt line the IRQ signal for the device is + connected to. + - interrupt-parent : The parent interrupt controller. + + - interrupt-controller : wm831x devices contain interrupt controllers and + may provide interrupt services to other devices. + - #interrupt-cells: Must be 2. The first cell is the IRQ number, and the + second cell is the flags, encoded as the trigger masks from + ../interrupt-controller/interrupts.txt + +Optional sub-nodes: + - regulators : Contains sub-nodes for each of the regulators supplied by + the device. The regulators are bound using their names listed below: + + dcdc1 : DCDC1 + dcdc2 : DCDC2 + dcdc3 : DCDC3 + dcdc4 : DCDC3 + isink1 : ISINK1 + isink2 : ISINK2 + ldo1 : LDO1 + ldo2 : LDO2 + ldo3 : LDO3 + ldo4 : LDO4 + ldo5 : LDO5 + ldo7 : LDO7 + ldo11 : LDO11 + + The bindings details of each regulator can be found in: + ../regulator/regulator.txt + +Example: + +wm8310: pmic@36 { + compatible = "wlf,wm8310"; + reg = <0x36>; + + gpio-controller; + #gpio-cells = <2>; + + interrupts = <347>; + interrupt-parent = <&gic>; + + interrupt-controller; + #interrupt-cells = <2>; + + regulators { + dcdc1: dcdc1 { + regulator-name = "DCDC1"; + regulator-min-microvolt = <600000>; + regulator-max-microvolt = <600000>; + }; + ldo1: ldo1 { + regulator-name = "LDO1"; + regulator-min-microvolt = <1700000>; + regulator-max-microvolt = <1700000>; + }; + }; +}; diff --git a/Documentation/devicetree/bindings/mmc/brcm,bcm2835-sdhost.txt b/Documentation/devicetree/bindings/mmc/brcm,bcm2835-sdhost.txt new file mode 100644 index 0000000000000000000000000000000000000000..d876580ae3b837a9174e454d223c6fa4df7ce94e --- /dev/null +++ b/Documentation/devicetree/bindings/mmc/brcm,bcm2835-sdhost.txt @@ -0,0 +1,23 @@ +Broadcom BCM2835 SDHOST controller + +This file documents differences between the core properties described +by mmc.txt and the properties that represent the BCM2835 controller. + +Required properties: +- compatible: Should be "brcm,bcm2835-sdhost". +- clocks: The clock feeding the SDHOST controller. + +Optional properties: +- dmas: DMA channel for read and write. 
+ See Documentation/devicetree/bindings/dma/dma.txt for details
+
+Example:
+
+sdhost: mmc@7e202000 {
+	compatible = "brcm,bcm2835-sdhost";
+	reg = <0x7e202000 0x100>;
+	interrupts = <2 24>;
+	clocks = <&clocks BCM2835_CLOCK_VPU>;
+	dmas = <&dma 13>;
+	dma-names = "rx-tx";
+};
diff --git a/Documentation/devicetree/bindings/mmc/cavium-mmc.txt b/Documentation/devicetree/bindings/mmc/cavium-mmc.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1433e6201dff3c1b4bb4a79f80b8ccf47a33fdaf
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/cavium-mmc.txt
@@ -0,0 +1,57 @@
+* Cavium Octeon & ThunderX MMC controller
+
+The high-speed MMC host controller on Cavium's SoCs provides an interface
+for MMC and SD types of memory cards.
+
+Supported maximum speeds are those of the eMMC standard 4.41 as well
+as the speed of the SD standard 4.0. Only 3.3 Volt is supported.
+
+Required properties:
+ - compatible : should be one of:
+   cavium,octeon-6130-mmc
+   cavium,octeon-7890-mmc
+   cavium,thunder-8190-mmc
+   cavium,thunder-8390-mmc
+   mmc-slot
+ - reg : mmc controller base registers
+ - clocks : phandle
+
+Optional properties:
+ - for cd, bus-width and additional generic mmc parameters
+   please refer to mmc.txt within this directory
+ - cavium,cmd-clk-skew : number of coprocessor clocks before sampling command
+ - cavium,dat-clk-skew : number of coprocessor clocks before sampling data
+
+Deprecated properties:
+- spi-max-frequency : use max-frequency instead
+- cavium,bus-max-width : use bus-width instead
+- power-gpios : use vmmc-supply instead
+- cavium,octeon-6130-mmc-slot : use mmc-slot instead
+
+Examples:
+	mmc_1_4: mmc@1,4 {
+		compatible = "cavium,thunder-8390-mmc";
+		reg = <0x0c00 0 0 0 0>;	/* DEVFN = 0x0c (1:4) */
+		#address-cells = <1>;
+		#size-cells = <0>;
+		clocks = <&sclk>;
+
+		mmc-slot@0 {
+			compatible = "mmc-slot";
+			reg = <0>;
+			vmmc-supply = <&mmc_supply_3v3>;
+			max-frequency = <42000000>;
+			bus-width = <4>;
+			cap-sd-highspeed;
+		};
+
+		mmc-slot@1 {
+			compatible = "mmc-slot";
+			reg = <1>;
+			vmmc-supply = <&mmc_supply_3v3>;
+			max-frequency = <42000000>;
+			bus-width = <8>;
+			cap-mmc-highspeed;
+			non-removable;
+		};
+	};
diff --git a/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt b/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b878a1e305af6e143a646b3473be522c82a62e8a
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/marvell,xenon-sdhci.txt
@@ -0,0 +1,170 @@
+Marvell Xenon SDHCI Controller device tree bindings
+This file documents differences between the core mmc properties
+described by mmc.txt and the properties used by the Xenon implementation.
+
+Multiple SDHCs might be put into a single Xenon IP, to save size and cost.
+Each SDHC is independent and owns independent resources, such as register sets,
+clock and PHY.
+Each SDHC should have an independent device tree node.
+
+Required Properties:
+- compatible: should be one of the following
+  - "marvell,armada-3700-sdhci": For controllers on Armada-3700 SoC.
+    Must provide a second register area and marvell,pad-type.
+  - "marvell,armada-ap806-sdhci": For controllers on Armada AP806.
+  - "marvell,armada-cp110-sdhci": For controllers on Armada CP110.
+
+- clocks:
+  Array of clocks required for SDHC.
+  Requires at least the input clock for the Xenon IP core.
+
+- clock-names:
+  Array of names corresponding to clocks property.
+  The input clock for the Xenon IP core should be named "core".
+
+- reg:
+  * For "marvell,armada-3700-sdhci", two register areas.
+    The first one for Xenon IP register. The second one for the Armada 3700 SoC
+    PHY PAD Voltage Control register.
+    Please follow the examples with compatible "marvell,armada-3700-sdhci"
+    below.
+    Please also check the property marvell,pad-type below.
+
+  * For other compatible strings, one register area for Xenon IP.
+
+Optional Properties:
+- marvell,xenon-sdhc-id:
+  Indicate the corresponding bit index of current SDHC in
+  SDHC System Operation Control Register Bit[7:0].
+  Set/clear the corresponding bit to enable/disable current SDHC.
+  If Xenon IP contains only one SDHC, this property is optional.
+
+- marvell,xenon-phy-type:
+  Xenon supports multiple types of PHYs.
+  To select the eMMC 5.1 PHY, set:
+  marvell,xenon-phy-type = "emmc 5.1 phy"
+  The eMMC 5.1 PHY is the default choice if this property is not provided.
+  To select the eMMC 5.0 PHY, set:
+  marvell,xenon-phy-type = "emmc 5.0 phy"
+
+  All those types of PHYs can support eMMC, SD and SDIO.
+  Please note that this property only presents the type of PHY.
+  It doesn't stand for the entire SDHC type or property.
+  For example, "emmc 5.1 phy" doesn't mean that this Xenon SDHC only
+  supports eMMC 5.1.
+
+- marvell,xenon-phy-znr:
+  Set PHY ZNR value.
+  Only available for eMMC PHY.
+  Valid range = [0:0x1F].
+  ZNR is set as 0xF by default if this property is not provided.
+
+- marvell,xenon-phy-zpr:
+  Set PHY ZPR value.
+  Only available for eMMC PHY.
+  Valid range = [0:0x1F].
+  ZPR is set as 0xF by default if this property is not provided.
+
+- marvell,xenon-phy-nr-success-tun:
+  Set the number of required consecutive successful sampling points
+  used to identify a valid sampling window, in the tuning process.
+  Valid range = [1:7].
+  Set as 0x4 by default if this property is not provided.
+
+- marvell,xenon-phy-tun-step-divider:
+  Set the divider for calculating TUN_STEP.
+  Set as 64 by default if this property is not provided.
+
+- marvell,xenon-phy-slow-mode:
+  If this property is selected, transfers will bypass the PHY.
+  Only available when the bus frequency is lower than 55MHz in SDR mode.
+  Disabled by default. Please only try this property if timing issues
+  always occur with the PHY enabled in eMMC HS SDR, SD SDR12, SD SDR25,
+  SD Default Speed and HS mode and eMMC legacy speed mode.
+
+- marvell,xenon-tun-count:
+  Xenon SDHC SoCs usually don't provide a re-tuning counter in
+  Capabilities Register 3 Bit[11:8].
+  This property provides the re-tuning counter.
+  If this property is not set, the default re-tuning counter will
+  be set as 0x9 in the driver.
+
+- marvell,pad-type:
+  Type of Armada 3700 SoC PHY PAD Voltage Controller register.
+  Only valid when "marvell,armada-3700-sdhci" is selected.
+  Two types: "sd" and "fixed-1-8v".
+  If "sd" is selected, the SoC PHY PAD is set to 3.3V at the beginning and is
+  switched to 1.8V later, in higher speed modes.
+  If "fixed-1-8v" is selected, the SoC PHY PAD is fixed at 1.8V, such as for eMMC.
+  Please follow the examples with compatible "marvell,armada-3700-sdhci"
+  below.
+
+Example:
+- For eMMC:
+
+	sdhci@aa0000 {
+		compatible = "marvell,armada-ap806-sdhci";
+		reg = <0xaa0000 0x1000>;
+		interrupts =
+		clocks = <&emmc_clk>;
+		clock-names = "core";
+		bus-width = <4>;
+		marvell,xenon-phy-slow-mode;
+		marvell,xenon-tun-count = <11>;
+		non-removable;
+		no-sd;
+		no-sdio;
+
+		/* Vmmc and Vqmmc are both fixed */
+	};
+
+- For SD/SDIO:
+
+	sdhci@ab0000 {
+		compatible = "marvell,armada-cp110-sdhci";
+		reg = <0xab0000 0x1000>;
+		interrupts =
+		vqmmc-supply = <&sd_vqmmc_regulator>;
+		vmmc-supply = <&sd_vmmc_regulator>;
+		clocks = <&sdclk>;
+		clock-names = "core";
+		bus-width = <4>;
+		marvell,xenon-tun-count = <9>;
+	};
+
+- For eMMC with compatible "marvell,armada-3700-sdhci":
+
+	sdhci@aa0000 {
+		compatible = "marvell,armada-3700-sdhci";
+		reg = <0xaa0000 0x1000>,
+		      ;
+		interrupts =
+		clocks = <&emmcclk>;
+		clock-names = "core";
+		bus-width = <8>;
+		mmc-ddr-1_8v;
+		mmc-hs400-1_8v;
+		non-removable;
+		no-sd;
+		no-sdio;
+
+		/* Vmmc and Vqmmc are both fixed */
+
+		marvell,pad-type = "fixed-1-8v";
+	};
+
+- For SD/SDIO with compatible "marvell,armada-3700-sdhci":
+
+	sdhci@ab0000 {
+		compatible = "marvell,armada-3700-sdhci";
+		reg = <0xab0000 0x1000>,
+		      ;
+		interrupts =
+		vqmmc-supply = <&sd_regulator>;
+		/* Vmmc is fixed */
+		clocks = <&sdclk>;
+		clock-names = "core";
+		bus-width = <4>;
+
+		marvell,pad-type = "sd";
+	};
diff --git a/Documentation/devicetree/bindings/mmc/mtk-sd.txt b/Documentation/devicetree/bindings/mmc/mtk-sd.txt
index 0120c7f1109cb2cf830b002f9e77e774ddbdef79..4182ea36ca5b1ca2e1282f93a2d398cacb1fc357 100644
--- a/Documentation/devicetree/bindings/mmc/mtk-sd.txt
+++ b/Documentation/devicetree/bindings/mmc/mtk-sd.txt
@@ -21,6 +21,15 @@ Optional properties:
- assigned-clocks: PLL of the source clock
- assigned-clock-parents: parent of source clock, used for HS400 mode to get 400MHz source clock
- hs400-ds-delay: HS400 DS delay setting
+- mediatek,hs200-cmd-int-delay: HS200 command internal delay setting.
+  This field has 32 stages in total.
+  The value is an integer from 0 to 31.
+- mediatek,hs400-cmd-int-delay: HS400 command internal delay setting.
+  This field has 32 stages in total.
+  The value is an integer from 0 to 31.
+- mediatek,hs400-cmd-resp-sel-rising: HS400 command response sample selection.
+  If present, HS400 command responses are sampled on rising edges.
+  If not present, HS400 command responses are sampled on falling edges.

Examples:
mmc0: mmc@11230000 {
@@ -38,4 +47,7 @@ mmc0: mmc@11230000 {
	assigned-clocks = <&topckgen CLK_TOP_MSDC50_0_SEL>;
	assigned-clock-parents = <&topckgen CLK_TOP_MSDCPLL_D2>;
	hs400-ds-delay = <0x14015>;
+	mediatek,hs200-cmd-int-delay = <26>;
+	mediatek,hs400-cmd-int-delay = <14>;
+	mediatek,hs400-cmd-resp-sel-rising;
};
diff --git a/Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.txt b/Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.txt
index 15b8368ee1f260a4f55d4226b6fccfea40e46162..9bce57862ed6ead2e693ee5bc2bcb6ef58f4c9f5 100644
--- a/Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.txt
+++ b/Documentation/devicetree/bindings/mmc/nvidia,tegra20-sdhci.txt
@@ -7,11 +7,13 @@ This file documents differences between the core properties described
by mmc.txt and the properties used by the sdhci-tegra driver.

Required properties:
-- compatible : For Tegra20, must contain "nvidia,tegra20-sdhci".
- For Tegra30, must contain "nvidia,tegra30-sdhci". For Tegra114,
- must contain "nvidia,tegra114-sdhci". For Tegra124, must contain
- "nvidia,tegra124-sdhci".
Otherwise, must contain "nvidia,<chip>-sdhci",
- plus one of the above, where <chip> is tegra132 or tegra210.
+- compatible : should be one of:
+  - "nvidia,tegra20-sdhci": for Tegra20
+  - "nvidia,tegra30-sdhci": for Tegra30
+  - "nvidia,tegra114-sdhci": for Tegra114
+  - "nvidia,tegra124-sdhci": for Tegra124 and Tegra132
+  - "nvidia,tegra210-sdhci": for Tegra210
+  - "nvidia,tegra186-sdhci": for Tegra186
- clocks : Must contain one entry, for the module clock.
  See ../clocks/clock-bindings.txt for details.
- resets : Must contain an entry for each entry in reset-names.
diff --git a/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt b/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt
index e4ba92aa035e2314255037ae5ef844ea34d3a406..c32dc5a9dbe6cf2e434e049c4149ff0c2c3e1f4f 100644
--- a/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt
+++ b/Documentation/devicetree/bindings/mmc/renesas,mmcif.txt
@@ -8,6 +8,7 @@ Required properties:
- compatible: should be "renesas,mmcif-<soctype>", "renesas,sh-mmcif" as a
  fallback. Examples with <soctype> are:
+  - "renesas,mmcif-r7s72100" for the MMCIF found in r7s72100 SoCs
  - "renesas,mmcif-r8a73a4" for the MMCIF found in r8a73a4 SoCs
  - "renesas,mmcif-r8a7740" for the MMCIF found in r8a7740 SoCs
  - "renesas,mmcif-r8a7778" for the MMCIF found in r8a7778 SoCs
@@ -17,6 +18,13 @@ Required properties:
  - "renesas,mmcif-r8a7794" for the MMCIF found in r8a7794 SoCs
  - "renesas,mmcif-sh73a0" for the MMCIF found in sh73a0 SoCs

+- interrupts: Some SoCs have only 1 shared interrupt, while others have either
+  2 or 3 individual interrupts (error, int, card detect). Below is the number
+  of interrupts for each SoC:
+    1: r8a73a4, r8a7778, r8a7790, r8a7791, r8a7793, r8a7794
+    2: r8a7740, sh73a0
+    3: r7s72100
+
- clocks: reference to the functional clock

- dmas: reference to the DMA channels, one per channel name listed in the
diff --git a/Documentation/devicetree/bindings/mmc/samsung,s3cmci.txt b/Documentation/devicetree/bindings/mmc/samsung,s3cmci.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5f68feb9f9d61f2bd0cadfe38e8b5cd6d834cbec
--- /dev/null
+++ b/Documentation/devicetree/bindings/mmc/samsung,s3cmci.txt
@@ -0,0 +1,42 @@
+* Samsung's S3C24XX MMC/SD/SDIO controller device tree bindings
+
+Samsung's S3C24XX MMC/SD/SDIO controller is used as a connectivity interface
+with external MMC, SD and SDIO storage mediums.
+
+This file documents differences between the core mmc properties described by
+mmc.txt and the properties used by the Samsung S3C24XX MMC/SD/SDIO controller
+implementation.
+
+Required SoC Specific Properties:
+- compatible: should be one of the following
+  - "samsung,s3c2410-sdi": for controllers compatible with s3c2410
+  - "samsung,s3c2412-sdi": for controllers compatible with s3c2412
+  - "samsung,s3c2440-sdi": for controllers compatible with s3c2440
+- reg: register location and length
+- interrupts: mmc controller interrupt
+- clocks: Should reference the controller clock
+- clock-names: Should contain "sdi"
+
+Required Board Specific Properties:
+- pinctrl-0: Should specify pin control groups used for this controller.
+- pinctrl-names: Should contain only one value - "default".
+
+Optional Properties:
+- bus-width: number of data lines (see mmc.txt)
+- cd-gpios: gpio for card detection (see mmc.txt)
+- wp-gpios: gpio for write protection (see mmc.txt)
+
+Example:
+
+	mmc0: mmc@5a000000 {
+		compatible = "samsung,s3c2440-sdi";
+		pinctrl-names = "default";
+		pinctrl-0 = <&sdi_pins>;
+		reg = <0x5a000000 0x100000>;
+		interrupts = <0 0 21 3>;
+		clocks = <&clocks PCLK_SDI>;
+		clock-names = "sdi";
+		bus-width = <4>;
+		cd-gpios = <&gpg 8 GPIO_ACTIVE_LOW>;
+		wp-gpios = <&gph 8 GPIO_ACTIVE_LOW>;
+	};
diff --git a/Documentation/devicetree/bindings/mmc/sdhci-cadence.txt b/Documentation/devicetree/bindings/mmc/sdhci-cadence.txt
index c0f37cb41a9b47516f29cafa573d7886d103a2a7..fa423c2778535bb50b89f805d1a295d1babbe210 100644
--- a/Documentation/devicetree/bindings/mmc/sdhci-cadence.txt
+++ b/Documentation/devicetree/bindings/mmc/sdhci-cadence.txt
@@ -19,6 +19,53 @@ if supported. See mmc.txt for details.
- mmc-hs400-1_8v
- mmc-hs400-1_2v

+Some PHY delays can be configured by the following properties.
+PHY DLL input delays:
+They are used to delay the data valid window, and align the window
+to the sampling clock. The delay starts from 5ns (for a delay parameter
+equal to 0) and is increased by 2.5ns in each step.
+- cdns,phy-input-delay-sd-highspeed:
+  Value of the delay in the input path for SD high-speed timing
+  Valid range = [0:0x1F].
+- cdns,phy-input-delay-legacy:
+  Value of the delay in the input path for legacy timing
+  Valid range = [0:0x1F].
+- cdns,phy-input-delay-sd-uhs-sdr12:
+  Value of the delay in the input path for SD UHS SDR12 timing
+  Valid range = [0:0x1F].
+- cdns,phy-input-delay-sd-uhs-sdr25:
+  Value of the delay in the input path for SD UHS SDR25 timing
+  Valid range = [0:0x1F].
+- cdns,phy-input-delay-sd-uhs-sdr50:
+  Value of the delay in the input path for SD UHS SDR50 timing
+  Valid range = [0:0x1F].
+- cdns,phy-input-delay-sd-uhs-ddr50:
+  Value of the delay in the input path for SD UHS DDR50 timing
+  Valid range = [0:0x1F].
+- cdns,phy-input-delay-mmc-highspeed:
+  Value of the delay in the input path for MMC high-speed timing
+  Valid range = [0:0x1F].
+- cdns,phy-input-delay-mmc-ddr:
+  Value of the delay in the input path for eMMC high-speed DDR timing
+  Valid range = [0:0x1F].
+
+PHY DLL clock delays:
+Each delay property represents the fraction of the clock period.
+The approximate delay value will be
+(<delay property value>/128)*sdmclk_clock_period.
+- cdns,phy-dll-delay-sdclk:
+  Value of the delay introduced on the sdclk output
+  for all modes except HS200, HS400 and HS400_ES.
+  Valid range = [0:0x7F].
+- cdns,phy-dll-delay-sdclk-hsmmc:
+  Value of the delay introduced on the sdclk output
+  for HS200, HS400 and HS400_ES speed modes.
+  Valid range = [0:0x7F].
+- cdns,phy-dll-delay-strobe:
+  Value of the delay introduced on the dat_strobe input
+  used in HS400 / HS400_ES speed modes.
+  Valid range = [0:0x7F].
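+
+As a rough worked example (the clock rate here is an illustrative
+assumption, not part of the binding): with a 200MHz sdmclk, i.e. a 5ns
+clock period, the setting
+
+	cdns,phy-dll-delay-sdclk = <64>;
+
+introduces approximately (64/128)*5ns = 2.5ns of delay on the sdclk
+output.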
+
 Example:
	emmc: sdhci@5a000000 {
		compatible = "socionext,uniphier-sd4hc", "cdns,sd4hc";
@@ -29,4 +76,5 @@ Example:
		mmc-ddr-1_8v;
		mmc-hs200-1_8v;
		mmc-hs400-1_8v;
+		cdns,phy-dll-delay-sdclk = <0>;
	};
diff --git a/Documentation/devicetree/bindings/mtd/atmel-nand.txt b/Documentation/devicetree/bindings/mtd/atmel-nand.txt
index 3e7ee99d3949ab07b5515e3afff350aebc2e0e71..f6bee57e453aa6dcb9114a188538e1c69860b4b2 100644
--- a/Documentation/devicetree/bindings/mtd/atmel-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/atmel-nand.txt
@@ -1,4 +1,109 @@
-Atmel NAND flash
+Atmel NAND flash controller bindings
+
+The NAND flash controller node should be defined under the EBI bus (see
+Documentation/devicetree/bindings/memory-controllers/atmel,ebi.txt).
+One or several NAND devices can be defined under this NAND controller.
+The NAND controller might be connected to an ECC engine.
+
+* NAND controller bindings:
+
+Required properties:
+- compatible: should be one of the following
+  "atmel,at91rm9200-nand-controller"
+  "atmel,at91sam9260-nand-controller"
+  "atmel,at91sam9261-nand-controller"
+  "atmel,at91sam9g45-nand-controller"
+  "atmel,sama5d3-nand-controller"
+- ranges: empty ranges property to forward EBI ranges definitions.
+- #address-cells: should be set to 2.
+- #size-cells: should be set to 1.
+- atmel,nfc-io: phandle to the NFC IO block. Only required for sama5d3
+  controllers.
+- atmel,nfc-sram: phandle to the NFC SRAM block. Only required for sama5d3
+  controllers.
+
+Optional properties:
+- ecc-engine: phandle to the PMECC block. Only meaningful if the SoC embeds
+  a PMECC engine.
+
+* NAND device/chip bindings:
+
+Required properties:
+- reg: describes the CS lines assigned to the NAND device. If the NAND device
+  exposes multiple CS lines (multi-die chips), the reg property will
+  contain X tuples of 3 entries.
+  1st entry: the CS line this NAND chip is connected to
+  2nd entry: the base offset of the memory region assigned to this
+	     device (always 0)
+  3rd entry: the memory region size (always 0x800000)
+
+Optional properties:
+- rb-gpios: the GPIO(s) used to check the Ready/Busy status of the NAND.
+- cs-gpios: the GPIO(s) used to control the CS line.
+- det-gpios: the GPIO used to detect if a Smartmedia Card is present.
+- atmel,rb: an integer identifying the native Ready/Busy pin. Only meaningful
+  on sama5 SoCs.
+
+All generic properties described in
+Documentation/devicetree/bindings/mtd/{common,nand}.txt also apply to the NAND
+device node, and NAND partitions should be defined under the NAND node as
+described in Documentation/devicetree/bindings/mtd/partition.txt.
+
+* ECC engine (PMECC) bindings:
+
+Required properties:
+- compatible: should be one of the following
+  "atmel,at91sam9g45-pmecc"
+  "atmel,sama5d4-pmecc"
+  "atmel,sama5d2-pmecc"
+- reg: should contain 2 register ranges. The first one points to the PMECC
+  block, and the second one to the PMECC_ERRLOC block.
+ +Example: + + pmecc: ecc-engine@ffffc070 { + compatible = "atmel,at91sam9g45-pmecc"; + reg = <0xffffc070 0x490>, + <0xffffc500 0x100>; + }; + + ebi: ebi@10000000 { + compatible = "atmel,sama5d3-ebi"; + #address-cells = <2>; + #size-cells = <1>; + atmel,smc = <&hsmc>; + reg = <0x10000000 0x10000000 + 0x40000000 0x30000000>; + ranges = <0x0 0x0 0x10000000 0x10000000 + 0x1 0x0 0x40000000 0x10000000 + 0x2 0x0 0x50000000 0x10000000 + 0x3 0x0 0x60000000 0x10000000>; + clocks = <&mck>; + + nand_controller: nand-controller { + compatible = "atmel,sama5d3-nand-controller"; + atmel,nfc-sram = <&nfc_sram>; + atmel,nfc-io = <&nfc_io>; + ecc-engine = <&pmecc>; + #address-cells = <2>; + #size-cells = <1>; + ranges; + + nand@3 { + reg = <0x3 0x0 0x800000>; + atmel,rb = <0>; + + /* + * Put generic NAND/MTD properties and + * subnodes here. + */ + }; + }; + }; + +----------------------------------------------------------------------- + +Deprecated bindings (should not be used in new device trees): Required properties: - compatible: The possible values are: diff --git a/Documentation/devicetree/bindings/mtd/denali-nand.txt b/Documentation/devicetree/bindings/mtd/denali-nand.txt index b04d03a1d49951a0fe6fbecde446fb523919f811..e593bbeb2115deb92966d593c18f002e5f269e41 100644 --- a/Documentation/devicetree/bindings/mtd/denali-nand.txt +++ b/Documentation/devicetree/bindings/mtd/denali-nand.txt @@ -1,11 +1,11 @@ * Denali NAND controller Required properties: - - compatible : should be "denali,denali-nand-dt" + - compatible : should be one of the following: + "altr,socfpga-denali-nand" - for Altera SOCFPGA - reg : should contain registers location and length for data and reg. - reg-names: Should contain the reg names "nand_data" and "denali_reg" - interrupts : The interrupt number. - - dm-mask : DMA bit mask The device tree may optionally contain sub-nodes describing partitions of the address space. See partition.txt for more detail. @@ -15,9 +15,8 @@ Examples: nand: nand@ff900000 { #address-cells = <1>; #size-cells = <1>; - compatible = "denali,denali-nand-dt"; + compatible = "altr,socfpga-denali-nand"; reg = <0xff900000 0x100000>, <0xffb80000 0x10000>; reg-names = "nand_data", "denali_reg"; interrupts = <0 144 4>; - dma-mask = <0xffffffff>; }; diff --git a/Documentation/devicetree/bindings/mtd/gpio-control-nand.txt b/Documentation/devicetree/bindings/mtd/gpio-control-nand.txt index af8915b41ccf4962f9aa3a232b5bd268910ae3b7..486a17d533d7a33cce0ac8814a51979ebc7bc0ca 100644 --- a/Documentation/devicetree/bindings/mtd/gpio-control-nand.txt +++ b/Documentation/devicetree/bindings/mtd/gpio-control-nand.txt @@ -12,7 +12,7 @@ Required properties: - #address-cells, #size-cells : Must be present if the device has sub-nodes representing partitions. - gpios : Specifies the GPIO pins to control the NAND device. The order of - GPIO references is: RDY, nCE, ALE, CLE, and an optional nWP. + GPIO references is: RDY, nCE, ALE, CLE, and nWP. nCE and nWP are optional. Optional properties: - bank-width : Width (in bytes) of the device. 
If not present, the width
@@ -36,7 +36,7 @@ gpio-nand@1,0 {
	#address-cells = <1>;
	#size-cells = <1>;
	gpios = <&banka 1 0>,	/* RDY */
-		<&banka 2 0>,	/* nCE */
+		<0>,		/* nCE */
		<&banka 3 0>,	/* ALE */
		<&banka 4 0>,	/* CLE */
		<0>;		/* nWP */
diff --git a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
index 3e920ec5c4d36d48148e069ccb9de617dbee2326..9ce35af8507c1da0a208fc1823b5d1d487909114 100644
--- a/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
+++ b/Documentation/devicetree/bindings/mtd/jedec,spi-nor.txt
@@ -40,6 +40,7 @@ Required properties:
                 w25x80
                 w25x32
                 w25q32
+                w25q64
                 w25q32dw
                 w25q80bl
                 w25q128
diff --git a/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt b/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ddd18c1351486d579606123aa900683848c29e65
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/stm32-quadspi.txt
@@ -0,0 +1,43 @@
+* STMicroelectronics Quad Serial Peripheral Interface (QuadSPI)
+
+Required properties:
+- compatible: should be "st,stm32f469-qspi"
+- reg: the first entry contains the register location and length,
+  the second entry contains the memory mapping address and length
+- reg-names: should contain the reg names "qspi" "qspi_mm"
+- interrupts: should contain the interrupt for the device
+- clocks: the phandle of the clock needed by the QSPI controller
+- A pinctrl must be defined to set the pins in the mode of operation for
+  QSPI transfer
+
+Optional properties:
+- resets: must contain the phandle to the reset controller.
+
+An SPI flash must be a child of the nor_flash node and can have some
+properties. Also see jedec,spi-nor.txt.
+
+Required properties:
+- reg: chip-select number (the QSPI controller may connect 2 NOR flashes)
+- spi-max-frequency: max frequency of the spi bus
+
+Optional property:
+- spi-rx-bus-width: see ../spi/spi-bus.txt for the description
+
+Example:
+
+qspi: spi@a0001000 {
+	compatible = "st,stm32f469-qspi";
+	reg = <0xa0001000 0x1000>, <0x90000000 0x10000000>;
+	reg-names = "qspi", "qspi_mm";
+	interrupts = <91>;
+	resets = <&rcc STM32F4_AHB3_RESET(QSPI)>;
+	clocks = <&rcc 0 STM32F4_AHB3_CLOCK(QSPI)>;
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_qspi0>;
+
+	flash@0 {
+		reg = <0>;
+		spi-rx-bus-width = <4>;
+		spi-max-frequency = <108000000>;
+		...
+	};
+};
diff --git a/Documentation/devicetree/bindings/net/brcm,bcmgenet.txt b/Documentation/devicetree/bindings/net/brcm,bcmgenet.txt
index 10587bdadbbe5a8e8fe703e23c194420e3701f9f..26c77d985fafe06092467c5a2ea8a0e176d0d460 100644
--- a/Documentation/devicetree/bindings/net/brcm,bcmgenet.txt
+++ b/Documentation/devicetree/bindings/net/brcm,bcmgenet.txt
@@ -2,11 +2,14 @@

Required properties:
- compatible: should contain one of "brcm,genet-v1", "brcm,genet-v2",
-  "brcm,genet-v3", "brcm,genet-v4".
+  "brcm,genet-v3", "brcm,genet-v4", "brcm,genet-v5".
- reg: address and length of the register set for the device
-- interrupts: must be two cells, the first cell is the general purpose
-  interrupt line, while the second cell is the interrupt for the ring
-  RX and TX queues operating in ring mode
+- interrupts and/or interrupts-extended: must be two cells, the first cell
+  is the general purpose interrupt line, while the second cell is the
+  interrupt for the ring RX and TX queues operating in ring mode. An
+  optional third interrupt cell for Wake-on-LAN can be specified.
+  See Documentation/devicetree/bindings/interrupt-controller/interrupts.txt
+  for information on the property specifics.
- phy-mode: see ethernet.txt file in the same directory
- #address-cells: should be 1
- #size-cells: should be 1
@@ -29,15 +32,15 @@ Optional properties:

Required child nodes:

-- mdio bus node: this node should always be present regarless of the PHY
+- mdio bus node: this node should always be present regardless of the PHY
  configuration of the GENET instance

MDIO bus node required properties:

- compatible: should contain one of "brcm,genet-mdio-v1", "brcm,genet-mdio-v2"
-  "brcm,genet-mdio-v3", "brcm,genet-mdio-v4", the version has to match the
-  parent node compatible property (e.g: brcm,genet-v4 pairs with
-  brcm,genet-mdio-v4)
+  "brcm,genet-mdio-v3", "brcm,genet-mdio-v4", "brcm,genet-mdio-v5", the version
+  has to match the parent node compatible property (e.g: brcm,genet-v4 pairs
+  with brcm,genet-mdio-v4)
- reg: address and length relative to the parent node base register address
- #address-cells: address cell for MDIO bus addressing, should be 1
- #size-cells: size of the cells for MDIO bus addressing, should be 0
diff --git a/Documentation/devicetree/bindings/net/brcm,unimac-mdio.txt b/Documentation/devicetree/bindings/net/brcm,unimac-mdio.txt
index ab0bb4247d14950959044cc138e71e7c736f85b9..4648948f7c3b8f26391292b06aa01743100e8796 100644
--- a/Documentation/devicetree/bindings/net/brcm,unimac-mdio.txt
+++ b/Documentation/devicetree/bindings/net/brcm,unimac-mdio.txt
@@ -2,8 +2,9 @@

Required properties:
- compatible: should be one of "brcm,genet-mdio-v1", "brcm,genet-mdio-v2",
-  "brcm,genet-mdio-v3", "brcm,genet-mdio-v4" or "brcm,unimac-mdio"
-- reg: address and length of the regsiter set for the device, first one is the
+  "brcm,genet-mdio-v3", "brcm,genet-mdio-v4", "brcm,genet-mdio-v5" or
+  "brcm,unimac-mdio"
+- reg: address and length of the register set for the device, first one is the
  base register, and the second one is optional and for indirect accesses to
  larger than 16-bits MDIO transactions
- reg-names: name(s) of the register must be "mdio" and optional "mdio_indir_rw"
diff --git a/Documentation/devicetree/bindings/net/can/holt_hi311x.txt b/Documentation/devicetree/bindings/net/can/holt_hi311x.txt
new file mode 100644
index 0000000000000000000000000000000000000000..23aa94eab2076f142dea3a0c878aebb15d46d7fd
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/can/holt_hi311x.txt
@@ -0,0 +1,24 @@
+* Holt HI-311X stand-alone CAN controller device tree bindings
+
+Required properties:
+ - compatible: Should be one of the following:
+   - "holt,hi3110" for HI-3110
+ - reg: SPI chip select.
+ - clocks: The clock feeding the CAN controller.
+ - interrupt-parent: The parent interrupt controller.
+ - interrupts: Should contain IRQ line for the CAN controller.
+
+Optional properties:
+ - vdd-supply: Regulator that powers the CAN controller.
+ - xceiver-supply: Regulator that powers the CAN transceiver.
+
+Example:
+	can0: can@1 {
+		compatible = "holt,hi3110";
+		reg = <1>;
+		clocks = <&clk32m>;
+		interrupt-parent = <&gpio4>;
+		interrupts = <13 IRQ_TYPE_EDGE_RISING>;
+		vdd-supply = <&reg5v0>;
+		xceiver-supply = <&reg5v0>;
+	};
diff --git a/Documentation/devicetree/bindings/net/can/ti_hecc.txt b/Documentation/devicetree/bindings/net/can/ti_hecc.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e0f0a7cfe3293893ae42fae3ca14571c29288a01
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/can/ti_hecc.txt
@@ -0,0 +1,32 @@
+Texas Instruments High End CAN Controller (HECC)
+================================================
+
+This file describes what the device node for the HECC interface contains.
+
+Required properties:
+- compatible: "ti,am3517-hecc"
+- reg: addresses and lengths of the register spaces for 'hecc', 'hecc-ram'
+  and 'mbx'
+- reg-names: "hecc", "hecc-ram", "mbx"
+- interrupts: interrupt mapping for the hecc interrupt sources
+- clocks: clock phandles (see clock bindings for details)
+
+Optional properties:
+- ti,use-hecc1int: if provided, configures HECC to produce all interrupts
+  on the HECC1INT interrupt line. By default the HECC0INT interrupt
+  line will be used.
+- xceiver-supply: regulator that powers the CAN transceiver
+
+Example:
+
+For the am3517evm board:
+	hecc: can@5c050000 {
+		compatible = "ti,am3517-hecc";
+		reg = <0x5c050000 0x80>,
+		      <0x5c053000 0x180>,
+		      <0x5c052000 0x200>;
+		reg-names = "hecc", "hecc-ram", "mbx";
+		interrupts = <24>;
+		clocks = <&hecc_ck>;
+	};
diff --git a/Documentation/devicetree/bindings/net/dsa/lan9303.txt b/Documentation/devicetree/bindings/net/dsa/lan9303.txt
new file mode 100644
index 0000000000000000000000000000000000000000..04f2965a446768920ae90f846419d438773a633d
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/dsa/lan9303.txt
@@ -0,0 +1,105 @@
+SMSC/MicroChip LAN9303 three port ethernet switch
+-------------------------------------------------
+
+Required properties:
+
+- compatible: should be
+  - "smsc,lan9303-i2c" for I2C managed mode
+    or
+  - "smsc,lan9303-mdio" for mdio managed mode
+
+Optional properties:
+
+- reset-gpios: GPIO to be used to reset the whole device
+- reset-duration: reset duration in milliseconds, defaults to 200 ms
+
+Subnodes:
+
+The integrated switch subnode should be specified according to the binding
+described in dsa/dsa.txt. The CPU port of this switch is always port 0.
+
+Note: always use 'reg = <0/1/2>;' for the three DSA ports, even if the device is
+configured to use 1/2/3 instead. This hardware configuration will be
+auto-detected and mapped accordingly.
+
+Example:
+
+I2C managed mode:
+
+	master: masterdevice@X {
+		status = "okay";
+
+		fixed-link { /* RMII fixed link to LAN9303 */
+			speed = <100>;
+			full-duplex;
+		};
+	};
+
+	switch: switch@a {
+		compatible = "smsc,lan9303-i2c";
+		reg = <0xa>;
+		status = "okay";
+		reset-gpios = <&gpio7 6 GPIO_ACTIVE_LOW>;
+		reset-duration = <200>;
+
+		ports {
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			port@0 { /* RMII fixed link to master */
+				reg = <0>;
+				label = "cpu";
+				ethernet = <&master>;
+			};
+
+			port@1 { /* external port 1 */
+				reg = <1>;
+				label = "lan1";
+			};
+
+			port@2 { /* external port 2 */
+				reg = <2>;
+				label = "lan2";
+			};
+		};
+	};
+
+MDIO managed mode:
+
+	master: masterdevice@X {
+		status = "okay";
+		phy-handle = <&switch>;
+
+		mdio {
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			switch: switch-phy@0 {
+				compatible = "smsc,lan9303-mdio";
+				reg = <0>;
+				reset-gpios = <&gpio7 6 GPIO_ACTIVE_LOW>;
+				reset-duration = <100>;
+
+				ports {
+					#address-cells = <1>;
+					#size-cells = <0>;
+
+					port@0 {
+						reg = <0>;
+						label = "cpu";
+						ethernet = <&master>;
+					};
+
+					port@1 { /* external port 1 */
+						reg = <1>;
+						label = "lan1";
+					};
+
+					port@2 { /* external port 2 */
+						reg = <2>;
+						label = "lan2";
+					};
+				};
+			};
+		};
+	};
diff --git a/Documentation/devicetree/bindings/net/dsa/mt7530.txt b/Documentation/devicetree/bindings/net/dsa/mt7530.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a9bc27b93ee326fe2ac665ffc143b0e3e345c0a0
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/dsa/mt7530.txt
@@ -0,0 +1,92 @@
+Mediatek MT7530 Ethernet switch
+================================
+
+Required properties:
+
+- compatible: Must be compatible = "mediatek,mt7530";
+- #address-cells: Must be 1.
+- #size-cells: Must be 0.
+- mediatek,mcm: Boolean; if defined, indicates that MT7530 is either the part
+  on the multi-chip module belonging to MT7623A, or the standalone chip
+  connected remotely, as on the MT7623N reference board.
+- core-supply: Phandle to the regulator node necessary for the core power.
+- io-supply: Phandle to the regulator node necessary for the I/O power.
+  See Documentation/devicetree/bindings/regulator/mt6323-regulator.txt
+  for details for the regulator setup on these boards.
+
+If the property mediatek,mcm isn't defined, the following property is required:
+
+- reset-gpios: Should be a gpio specifier for a reset line.
+
+Otherwise, the following properties are required:
+
+- resets : Phandle pointing to the system reset controller with
+  line index for the ethsys.
+- reset-names : Should be set to "mcm".
+
+Required properties for the child nodes within ports container:
+
+- reg: Port address described must be 6 for CPU port and from 0 to 5 for
+  user ports.
+- phy-mode: String, must be either "trgmii" or "rgmii" for port labeled
+  "cpu".
+
+See Documentation/devicetree/bindings/dsa/dsa.txt for a list of additional
+required, optional properties and how the integrated switch subnodes must
+be specified.
+ +Example: + + &mdio0 { + switch@0 { + compatible = "mediatek,mt7530"; + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + + core-supply = <&mt6323_vpa_reg>; + io-supply = <&mt6323_vemc3v3_reg>; + reset-gpios = <&pio 33 0>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + reg = <0>; + port@0 { + reg = <0>; + label = "lan0"; + }; + + port@1 { + reg = <1>; + label = "lan1"; + }; + + port@2 { + reg = <2>; + label = "lan2"; + }; + + port@3 { + reg = <3>; + label = "lan3"; + }; + + port@4 { + reg = <4>; + label = "wan"; + }; + + port@6 { + reg = <6>; + label = "cpu"; + ethernet = <&gmac0>; + phy-mode = "trgmii"; + fixed-link { + speed = <1000>; + full-duplex; + }; + }; + }; + }; + }; diff --git a/Documentation/devicetree/bindings/net/faraday,ftmac.txt b/Documentation/devicetree/bindings/net/faraday,ftmac.txt new file mode 100644 index 0000000000000000000000000000000000000000..be4f55e23bf778297f6e813f1889f205219038fe --- /dev/null +++ b/Documentation/devicetree/bindings/net/faraday,ftmac.txt @@ -0,0 +1,24 @@ +Faraday Ethernet Controller + +Required properties: + +- compatible : Must contain "faraday,ftmac", as well as one of + the SoC specific identifiers: + "andestech,atmac100" + "moxa,moxart-mac" +- reg : Should contain register location and length +- interrupts : Should contain the mac interrupt number + +Example: + + mac0: mac@90900000 { + compatible = "moxa,moxart-mac"; + reg = <0x90900000 0x100>; + interrupts = <25 0>; + }; + + mac1: mac@92000000 { + compatible = "moxa,moxart-mac"; + reg = <0x92000000 0x100>; + interrupts = <27 0>; + }; diff --git a/Documentation/devicetree/bindings/net/ftgmac100.txt b/Documentation/devicetree/bindings/net/ftgmac100.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1ce1680246f0134f08a8b1957d29d0f49f655ca --- /dev/null +++ b/Documentation/devicetree/bindings/net/ftgmac100.txt @@ -0,0 +1,35 @@ +* Faraday Technology FTGMAC100 gigabit ethernet controller + +Required properties: +- compatible: "faraday,ftgmac100" + + Must also contain one of these if used as part of an Aspeed AST2400 + or 2500 family SoC as they have some subtle tweaks to the + implementation: + + - "aspeed,ast2400-mac" + - "aspeed,ast2500-mac" + +- reg: Address and length of the register set for the device +- interrupts: Should contain ethernet controller interrupt + +Optional properties: +- phy-mode: See ethernet.txt file in the same directory. If the property is + absent, "rgmii" is assumed. Supported values are "rgmii*" and "rmii" for + aspeed parts. Other (unknown) parts will accept any value. +- use-ncsi: Use the NC-SI stack instead of an MDIO PHY. Currently assumes + rmii (100bT) but kept as a separate property in case NC-SI grows support + for a gigabit link. +- no-hw-checksum: Used to disable HW checksum support. Here for backward + compatibility as the driver now should have correct defaults based on + the SoC. 
+
+Example:
+
+	mac0: ethernet@1e660000 {
+		compatible = "aspeed,ast2500-mac", "faraday,ftgmac100";
+		reg = <0x1e660000 0x180>;
+		interrupts = <2>;
+		status = "okay";
+		use-ncsi;
+	};
diff --git a/Documentation/devicetree/bindings/net/ieee802154/ca8210.txt b/Documentation/devicetree/bindings/net/ieee802154/ca8210.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a1046e636fa14b494963f6770ffa578359dcd224
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/ieee802154/ca8210.txt
@@ -0,0 +1,28 @@
+* CA8210 IEEE 802.15.4 *
+
+Required properties:
+ - compatible: Should be "cascoda,ca8210"
+ - reg: Controlling chip select
+ - spi-max-frequency: Maximum clock speed, should be *less than*
+   4000000
+ - spi-cpol: Requires inverted clock polarity
+ - reset-gpio: GPIO attached to reset
+ - irq-gpio: GPIO attached to IRQ
+Optional properties:
+ - extclock-enable: Include for the ca8210 to route its 16MHz clock
+   to an output
+ - extclock-freq: Frequency in Hz of the external clock
+ - extclock-gpio: GPIO of the ca8210 to output the clock on
+
+Example:
+	ca8210@0 {
+		compatible = "cascoda,ca8210";
+		reg = <0>;
+		spi-max-frequency = <3000000>;
+		spi-cpol;
+		reset-gpio = <&gpio1 1 GPIO_ACTIVE_HIGH>;
+		irq-gpio = <&gpio1 2 GPIO_ACTIVE_HIGH>;
+		extclock-enable;
+		extclock-freq = <16000000>;
+		extclock-gpio = <2>;
+	};
diff --git a/Documentation/devicetree/bindings/net/marvell,prestera.txt b/Documentation/devicetree/bindings/net/marvell,prestera.txt
index 5fbab29718e8ee087ffd2a3d6dc13e973d1bf48d..c329608fa887b1fad655f7c778ca539b4c5d36c6 100644
--- a/Documentation/devicetree/bindings/net/marvell,prestera.txt
+++ b/Documentation/devicetree/bindings/net/marvell,prestera.txt
@@ -32,19 +32,16 @@

DFX Server bindings
-------------------

Required properties:
-- compatible: must be "marvell,dfx-server"
+- compatible: must be "marvell,dfx-server", "simple-bus"
+- ranges: describes the address mapping of a memory-mapped bus.
- reg: address and length of the register set for the device.

Example:

-dfx-registers {
-	compatible = "simple-bus";
+dfx-server {
+	compatible = "marvell,dfx-server", "simple-bus";
	#address-cells = <1>;
	#size-cells = <1>;
	ranges = <0 MBUS_ID(0x08, 0x00) 0 0x100000>;
-
-	dfx: dfx@0 {
-		compatible = "marvell,dfx-server";
-		reg = <0 0x100000>;
-	};
+	reg = ;
};
diff --git a/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt b/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
index 9417e54c26c0eba82004047d06fda4f3dd4d02b8..ccdabdcc8618c24e6d6801845aee62c54306466c 100644
--- a/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
+++ b/Documentation/devicetree/bindings/net/marvell-orion-mdio.txt
@@ -7,17 +7,20 @@ interface.

Required properties:
- compatible: "marvell,orion-mdio"
-- reg: address and length of the SMI register
+- reg: address and length of the MDIO registers. When an interrupt is
+  not present, the length is the size of the SMI register (4 bytes),
+  otherwise it must be 0x84 bytes to cover the interrupt control
+  registers.

Optional properties:
- interrupts: interrupt line number for the SMI error/done interrupt
-- clocks: Phandle to the clock control device and gate bit
+- clocks: phandle for up to three required clocks for the MDIO instance

The child nodes of the MDIO driver are the individual PHY devices
connected to this MDIO bus. They must have a "reg" property given the
PHY address on the MDIO bus.
-Example at the SoC level: +Example at the SoC level without an interrupt property: mdio { #address-cells = <1>; @@ -26,6 +29,16 @@ mdio { reg = <0xd0072004 0x4>; }; +Example with an interrupt property: + +mdio { + #address-cells = <1>; + #size-cells = <0>; + compatible = "marvell,orion-mdio"; + reg = <0xd0072004 0x84>; + interrupts = <30>; +}; + And at the board level: mdio { diff --git a/Documentation/devicetree/bindings/net/marvell-pp2.txt b/Documentation/devicetree/bindings/net/marvell-pp2.txt index 4754364df4c66adfc53e2546e31e0d124070dca1..6b4956beff8c42c3214906c83d94072fca8e9083 100644 --- a/Documentation/devicetree/bindings/net/marvell-pp2.txt +++ b/Documentation/devicetree/bindings/net/marvell-pp2.txt @@ -1,17 +1,28 @@ -* Marvell Armada 375 Ethernet Controller (PPv2) +* Marvell Armada 375 Ethernet Controller (PPv2.1) + Marvell Armada 7K/8K Ethernet Controller (PPv2.2) Required properties: -- compatible: should be "marvell,armada-375-pp2" +- compatible: should be one of: + "marvell,armada-375-pp2" + "marvell,armada-7k-pp2" - reg: addresses and length of the register sets for the device. - Must contain the following register sets: + For "marvell,armada-375-pp2", must contain the following register + sets: - common controller registers - LMS registers - In addition, at least one port register set is required. -- clocks: a pointer to the reference clocks for this device, consequently: - - main controller clock - - GOP clock -- clock-names: names of used clocks, must be "pp_clk" and "gop_clk". + - one register area per Ethernet port + For "marvell,armada-7k-pp2", must contain the following register + sets: + - packet processor registers + - networking interfaces registers + +- clocks: pointers to the reference clocks for this device, consequently: + - main controller clock (for both armada-375-pp2 and armada-7k-pp2) + - GOP clock (for both armada-375-pp2 and armada-7k-pp2) + - MG clock (only for armada-7k-pp2) +- clock-names: names of used clocks, must be "pp_clk", "gop_clk" and + "mg_clk" (the latter only for armada-7k-pp2). The ethernet ports are represented by subnodes. At least one port is required. @@ -19,8 +30,10 @@ required. Required properties (port): - interrupts: interrupt for the port -- port-id: should be '0' or '1' for ethernet ports, and '2' for the - loopback port +- port-id: ID of the port from the MAC point of view +- gop-port-id: only for marvell,armada-7k-pp2, ID of the port from the + GOP (Group Of Ports) point of view. This ID is used to index the + per-port registers in the second register area. - phy-mode: See ethernet.txt file in the same directory Optional properties (port): @@ -29,7 +42,7 @@ Optional properties (port): - phy: a phandle to a phy node defining the PHY address (as the reg property, a single integer). 
-Example:
+Example for marvell,armada-375-pp2:

ethernet@f0000 {
	compatible = "marvell,armada-375-pp2";
@@ -57,3 +70,30 @@ ethernet@f0000 {
		phy-mode = "gmii";
	};
};
+
+Example for marvell,armada-7k-pp2:
+
+cpm_ethernet: ethernet@0 {
+	compatible = "marvell,armada-7k-pp22";
+	reg = <0x0 0x100000>, <0x129000 0xb000>;
+	clocks = <&cpm_syscon0 1 3>, <&cpm_syscon0 1 9>, <&cpm_syscon0 1 5>;
+	clock-names = "pp_clk", "gop_clk", "mg_clk";
+
+	eth0: eth0 {
+		interrupts = ;
+		port-id = <0>;
+		gop-port-id = <0>;
+	};
+
+	eth1: eth1 {
+		interrupts = ;
+		port-id = <1>;
+		gop-port-id = <2>;
+	};
+
+	eth2: eth2 {
+		interrupts = ;
+		port-id = <2>;
+		gop-port-id = <3>;
+	};
+};
diff --git a/Documentation/devicetree/bindings/net/mdio.txt b/Documentation/devicetree/bindings/net/mdio.txt
new file mode 100644
index 0000000000000000000000000000000000000000..96a53f89aa6e2fa7f9a0d4c6c3b18d03a3b75994
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/mdio.txt
@@ -0,0 +1,37 @@
+Common MDIO bus properties.
+
+These are generic properties that can apply to any MDIO bus.
+
+Optional properties:
+- reset-gpios: One GPIO that controls the RESET lines of all PHYs on that MDIO
+  bus.
+- reset-delay-us: RESET pulse width in microseconds.
+
+A list of child nodes, one per device on the bus, is expected. These
+should follow the generic phy.txt, or a device-specific binding document.
+
+The 'reset-delay-us' indicates the RESET signal pulse width in microseconds and
+applies to all PHY devices. It must therefore be appropriately determined based
+on all PHY requirements (maximum value of all per-PHY RESET pulse widths).
+
+Example :
+This example shows these optional properties, plus other properties
+required for the TI Davinci MDIO driver.
+
+	davinci_mdio: ethernet@0x5c030000 {
+		compatible = "ti,davinci_mdio";
+		reg = <0x5c030000 0x1000>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		reset-gpios = <&gpio2 5 GPIO_ACTIVE_LOW>;
+		reset-delay-us = <2>;
+
+		ethphy0: ethernet-phy@1 {
+			reg = <1>;
+		};
+
+		ethphy1: ethernet-phy@3 {
+			reg = <3>;
+		};
+	};
diff --git a/Documentation/devicetree/bindings/net/moxa,moxart-mac.txt b/Documentation/devicetree/bindings/net/moxa,moxart-mac.txt
deleted file mode 100644
index 583418b2c127ebe74f2f8e904ea2aa8258f458d3..0000000000000000000000000000000000000000
--- a/Documentation/devicetree/bindings/net/moxa,moxart-mac.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-MOXA ART Ethernet Controller
-
-Required properties:
-
-- compatible : Must be "moxa,moxart-mac"
-- reg : Should contain register location and length
-- interrupts : Should contain the mac interrupt number
-
-Example:
-
-	mac0: mac@90900000 {
-		compatible = "moxa,moxart-mac";
-		reg = <0x90900000 0x100>;
-		interrupts = <25 0>;
-	};
-
-	mac1: mac@92000000 {
-		compatible = "moxa,moxart-mac";
-		reg = <0x92000000 0x100>;
-		interrupts = <27 0>;
-	};
diff --git a/Documentation/devicetree/bindings/net/nfc/trf7970a.txt b/Documentation/devicetree/bindings/net/nfc/trf7970a.txt
index 32b35a07abe4224c298e601eb09ad50f1c0c0b2f..c627bbb3009e5a3f54472d58c2f5ef8f37f871fa 100644
--- a/Documentation/devicetree/bindings/net/nfc/trf7970a.txt
+++ b/Documentation/devicetree/bindings/net/nfc/trf7970a.txt
@@ -5,8 +5,8 @@ Required properties:
- spi-max-frequency: Maximum SPI frequency (<= 2000000).
- interrupt-parent: phandle of parent interrupt handler.
- interrupts: A single interrupt specifier.
-- ti,enable-gpios: Two GPIO entries used for 'EN' and 'EN2' pins on the
-  TRF7970A.
+- ti,enable-gpios: One or two GPIO entries used for 'EN' and 'EN2' pins on the + TRF7970A. EN2 is optional. - vin-supply: Regulator for supply voltage to VIN pin Optional SoC Specific Properties: @@ -21,6 +21,8 @@ Optional SoC Specific Properties: - t5t-rmb-extra-byte-quirk: Specify that the trf7970a has the erratum where an extra byte is returned by Read Multiple Block commands issued to Type 5 tags. +- vdd-io-supply: Regulator specifying voltage for vdd-io +- clock-frequency: Set to specify that the input frequency to the trf7970a is 13560000Hz or 27120000Hz Example (for ARM-based BeagleBone with TRF7970A on SPI1): @@ -39,10 +41,12 @@ Example (for ARM-based BeagleBone with TRF7970A on SPI1): <&gpio2 5 GPIO_ACTIVE_LOW>; vin-supply = <&ldo3_reg>; vin-voltage-override = <5000000>; + vdd-io-supply = <&ldo2_reg>; autosuspend-delay = <30000>; irq-status-read-quirk; en2-rf-quirk; t5t-rmb-extra-byte-quirk; + clock-frequency = <27120000>; status = "okay"; }; }; diff --git a/Documentation/devicetree/bindings/net/nokia-bluetooth.txt b/Documentation/devicetree/bindings/net/nokia-bluetooth.txt new file mode 100644 index 0000000000000000000000000000000000000000..42be7dc9a70ba7cb0f653270e4941dc85046b6e8 --- /dev/null +++ b/Documentation/devicetree/bindings/net/nokia-bluetooth.txt @@ -0,0 +1,51 @@ +Nokia Bluetooth Chips +--------------------- + +Nokia phones often come with UART connected bluetooth chips from different +vendors and modified device API. Those devices speak a protocol named H4+ +(also known as h4p) by Nokia, which is similar to the H4 protocol from the +Bluetooth standard. In addition to the H4 protocol it specifies two more +UART status lines for wakeup of UART transceivers to improve power management +and a few new packet types used to negotiate uart speed. 
+
+Required properties:
+
+ - compatible: should contain "nokia,h4p-bluetooth" as well as one of the following:
+   * "brcm,bcm2048-nokia"
+   * "ti,wl1271-bluetooth-nokia"
+ - reset-gpios: GPIO specifier, used to reset the BT module (active low)
+ - bluetooth-wakeup-gpios: GPIO specifier, used to wakeup the BT module (active high)
+ - host-wakeup-gpios: GPIO specifier, used to wakeup the host processor (active high)
+ - clock-names: should be "sysclk"
+ - clocks: should contain a clock specifier for every name in clock-names
+
+Optional properties:
+
+ - None
+
+Example:
+
+/ {
+	/* controlled (enabled/disabled) directly by BT module */
+	bluetooth_clk: vctcxo {
+		compatible = "fixed-clock";
+		#clock-cells = <0>;
+		clock-frequency = <38400000>;
+	};
+};
+
+&uart2 {
+	pinctrl-names = "default";
+	pinctrl-0 = <&uart2_pins>;
+
+	bluetooth {
+		compatible = "ti,wl1271-bluetooth-nokia", "nokia,h4p-bluetooth";
+
+		reset-gpios = <&gpio1 26 GPIO_ACTIVE_LOW>; /* gpio26 */
+		host-wakeup-gpios = <&gpio4 5 GPIO_ACTIVE_HIGH>; /* gpio101 */
+		bluetooth-wakeup-gpios = <&gpio2 5 GPIO_ACTIVE_HIGH>; /* gpio37 */
+
+		clocks = <&bluetooth_clk>;
+		clock-names = "sysclk";
+	};
+};
diff --git a/Documentation/devicetree/bindings/net/stmmac.txt b/Documentation/devicetree/bindings/net/stmmac.txt
index d3bfc2b30fb5ecc07493b4c5510d5cdc32b3ff3a..c3a7be6615c547a5001c67996e18458b7f8e5b5d 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -7,9 +7,12 @@ Required properties:
- interrupt-parent: Should be the phandle for the interrupt controller
  that services interrupts for this device
- interrupts: Should contain the STMMAC interrupts
-- interrupt-names: Should contain the interrupt names "macirq"
-  "eth_wake_irq" if this interrupt is supported in the "interrupts"
-  property
+- interrupt-names: Should contain a list of interrupt names corresponding to
+  the interrupts in the interrupts property, if available.
+  Valid interrupt names are:
+  - "macirq" (combined signal for various interrupt events)
+  - "eth_wake_irq" (the interrupt to manage the remote wake-up packet detection)
+  - "eth_lpi" (the interrupt that occurs when Tx or Rx enters/exits LPI state)
- phy-mode: See ethernet.txt file in the same directory.
- snps,reset-gpio gpio number for phy reset.
- snps,reset-active-low boolean flag to indicate if phy reset is active low.
@@ -28,9 +31,9 @@ Optional properties:
  clocks may be specified in derived bindings.
- clock-names: One name for each entry in the clocks property, the first
  one should be "stmmaceth" and the second one should be "pclk".
-- clk_ptp_ref: this is the PTP reference clock; in case of the PTP is
-  available this clock is used for programming the Timestamp Addend Register.
-  If not passed then the system clock will be used and this is fine on some
+- ptp_ref: this is the PTP reference clock; if PTP is available, this clock
+  is used for programming the Timestamp Addend Register. If not passed, the
+  system clock will be used, and this is fine on some
  platforms.
- tx-fifo-depth: See ethernet.txt file in the same directory
- rx-fifo-depth: See ethernet.txt file in the same directory
@@ -72,7 +75,45 @@ Optional properties:
		- snps,mb: mixed-burst
		- snps,rb: rebuild INCRx Burst
- mdio: with compatible = "snps,dwmac-mdio", create and register mdio bus.
- +- Multiple RX Queues parameters: below the list of all the parameters to + configure the multiple RX queues: + - snps,rx-queues-to-use: number of RX queues to be used in the driver + - Choose one of these RX scheduling algorithms: + - snps,rx-sched-sp: Strict priority + - snps,rx-sched-wsp: Weighted Strict priority + - For each RX queue + - Choose one of these modes: + - snps,dcb-algorithm: Queue to be enabled as DCB + - snps,avb-algorithm: Queue to be enabled as AVB + - snps,map-to-dma-channel: Channel to map + - Specifiy specific packet routing: + - snps,route-avcp: AV Untagged Control packets + - snps,route-ptp: PTP Packets + - snps,route-dcbcp: DCB Control Packets + - snps,route-up: Untagged Packets + - snps,route-multi-broad: Multicast & Broadcast Packets + - snps,priority: RX queue priority (Range: 0x0 to 0xF) +- Multiple TX Queues parameters: below the list of all the parameters to + configure the multiple TX queues: + - snps,tx-queues-to-use: number of TX queues to be used in the driver + - Choose one of these TX scheduling algorithms: + - snps,tx-sched-wrr: Weighted Round Robin + - snps,tx-sched-wfq: Weighted Fair Queuing + - snps,tx-sched-dwrr: Deficit Weighted Round Robin + - snps,tx-sched-sp: Strict priority + - For each TX queue + - snps,weight: TX queue weight (if using a DCB weight algorithm) + - Choose one of these modes: + - snps,dcb-algorithm: TX queue will be working in DCB + - snps,avb-algorithm: TX queue will be working in AVB + [Attention] Queue 0 is reserved for legacy traffic + and so no AVB is available in this queue. + - Configure Credit Base Shaper (if AVB Mode selected): + - snps,send_slope: enable Low Power Interface + - snps,idle_slope: unlock on WoL + - snps,high_credit: max write outstanding req. limit + - snps,low_credit: max read outstanding req. 
+		- snps,priority: TX queue priority (Range: 0x0 to 0xF)
 
 Examples:
 
	stmmac_axi_setup: stmmac-axi-config {
@@ -81,12 +122,41 @@ Examples:
 		snps,blen = <256 128 64 32 0 0 0>;
 	};
 
+	mtl_rx_setup: rx-queues-config {
+		snps,rx-queues-to-use = <1>;
+		snps,rx-sched-sp;
+		queue0 {
+			snps,dcb-algorithm;
+			snps,map-to-dma-channel = <0x0>;
+			snps,priority = <0x0>;
+		};
+	};
+
+	mtl_tx_setup: tx-queues-config {
+		snps,tx-queues-to-use = <2>;
+		snps,tx-sched-wrr;
+		queue0 {
+			snps,weight = <0x10>;
+			snps,dcb-algorithm;
+			snps,priority = <0x0>;
+		};
+
+		queue1 {
+			snps,avb-algorithm;
+			snps,send_slope = <0x1000>;
+			snps,idle_slope = <0x1000>;
+			snps,high_credit = <0x3E800>;
+			snps,low_credit = <0xFFC18000>;
+			snps,priority = <0x1>;
+		};
+	};
+
 	gmac0: ethernet@e0800000 {
 		compatible = "st,spear600-gmac";
 		reg = <0xe0800000 0x8000>;
 		interrupt-parent = <&vic1>;
-		interrupts = <24 23>;
-		interrupt-names = "macirq", "eth_wake_irq";
+		interrupts = <24 23 22>;
+		interrupt-names = "macirq", "eth_wake_irq", "eth_lpi";
 		mac-address = [000000000000]; /* Filled in by U-Boot */
 		max-frame-size = <3800>;
 		phy-mode = "gmii";
@@ -104,4 +174,6 @@ Examples:
 		phy1: ethernet-phy@0 {
 		};
 	};
+	snps,mtl-rx-config = <&mtl_rx_setup>;
+	snps,mtl-tx-config = <&mtl_tx_setup>;
 };
diff --git a/Documentation/devicetree/bindings/net/ti,wilink-st.txt b/Documentation/devicetree/bindings/net/ti,wilink-st.txt
new file mode 100644
index 0000000000000000000000000000000000000000..cbad73a84ac43250865fd6e47dc47a388e082342
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/ti,wilink-st.txt
@@ -0,0 +1,35 @@
+TI WiLink 7/8 (wl12xx/wl18xx) Shared Transport BT/FM/GPS devices
+
+TI WiLink devices have a UART interface for providing Bluetooth, FM radio,
+and GPS over what's called "shared transport". The shared transport is
+standard BT HCI protocol with additional channels for the other functions.
+
+These devices also have a separate WiFi interface as described in
+wireless/ti,wlcore.txt.
+
+This binding follows the UART slave device binding in
+../serial/slave-device.txt.
+
+Required properties:
+ - compatible: should be one of the following:
+    "ti,wl1271-st"
+    "ti,wl1273-st"
+    "ti,wl1831-st"
+    "ti,wl1835-st"
+    "ti,wl1837-st"
+
+Optional properties:
+ - enable-gpios : GPIO signal controlling enabling of BT. Active high.
+ - vio-supply : Vio input supply (1.8V)
+ - vbat-supply : Vbat input supply (2.9-4.8V)
+
+Example:
+
+&serial0 {
+	compatible = "ns16550a";
+	...
+ bluetooth { + compatible = "ti,wl1835-st"; + enable-gpios = <&gpio1 7 GPIO_ACTIVE_HIGH>; + }; +}; diff --git a/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt b/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt index d543ed3f5363251d0c22b5ddac8a5b078605de93..ef06d061913cfc1aa2d488073b53f9800fd33cd9 100644 --- a/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt +++ b/Documentation/devicetree/bindings/nvmem/allwinner,sunxi-sid.txt @@ -1,7 +1,11 @@ Allwinner sunxi-sid Required properties: -- compatible: "allwinner,sun4i-a10-sid" or "allwinner,sun7i-a20-sid" +- compatible: Should be one of the following: + "allwinner,sun4i-a10-sid" + "allwinner,sun7i-a20-sid" + "allwinner,sun8i-h3-sid" + - reg: Should contain registers location and length = Data cells = diff --git a/Documentation/devicetree/bindings/nvmem/imx-iim.txt b/Documentation/devicetree/bindings/nvmem/imx-iim.txt new file mode 100644 index 0000000000000000000000000000000000000000..1978c5bcd96d1ff6f6c83520e9ad536349112b07 --- /dev/null +++ b/Documentation/devicetree/bindings/nvmem/imx-iim.txt @@ -0,0 +1,22 @@ +Freescale i.MX IC Identification Module (IIM) device tree bindings + +This binding represents the IC Identification Module (IIM) found on +i.MX25, i.MX27, i.MX31, i.MX35, i.MX51 and i.MX53 SoCs. + +Required properties: +- compatible: should be one of + "fsl,imx25-iim", "fsl,imx27-iim", + "fsl,imx31-iim", "fsl,imx35-iim", + "fsl,imx51-iim", "fsl,imx53-iim", +- reg: Should contain the register base and length. +- interrupts: Should contain the interrupt for the IIM +- clocks: Should contain a phandle pointing to the gated peripheral clock. + +Example: + + iim: iim@63f98000 { + compatible = "fsl,imx53-iim", "fsl,imx27-iim"; + reg = <0x63f98000 0x4000>; + interrupts = <69>; + clocks = <&clks IMX5_CLK_IIM_GATE>; + }; diff --git a/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt b/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt index 966a72ecc6bd8586a528b15119dd453b4d88d13b..70d791b03ea13cf52936d6542289a52c991c38a1 100644 --- a/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt +++ b/Documentation/devicetree/bindings/nvmem/imx-ocotp.txt @@ -9,14 +9,19 @@ Required properties: "fsl,imx6sl-ocotp" (i.MX6SL), or "fsl,imx6sx-ocotp" (i.MX6SX), "fsl,imx6ul-ocotp" (i.MX6UL), + "fsl,imx7d-ocotp" (i.MX7D/S), followed by "syscon". - reg: Should contain the register base and length. - clocks: Should contain a phandle pointing to the gated peripheral clock. +Optional properties: +- read-only: disable write access + Example: ocotp: ocotp@021bc000 { compatible = "fsl,imx6q-ocotp", "syscon"; reg = <0x021bc000 0x4000>; clocks = <&clks IMX6QDL_CLK_IIM>; + read-only; }; diff --git a/Documentation/devicetree/bindings/pci/designware-pcie.txt b/Documentation/devicetree/bindings/pci/designware-pcie.txt index 1392c705cecae84c1bdbd601e27d91d78dc60980..b2480dd38c1135747c6bcceb5153df94a6a36326 100644 --- a/Documentation/devicetree/bindings/pci/designware-pcie.txt +++ b/Documentation/devicetree/bindings/pci/designware-pcie.txt @@ -6,30 +6,40 @@ Required properties: - reg-names: Must be "config" for the PCIe configuration space. (The old way of getting the configuration address space from "ranges" is deprecated and should be avoided.) 
+- num-lanes: number of lanes to use
+RC mode:
 - #address-cells: set to <3>
 - #size-cells: set to <2>
 - device_type: set to "pci"
 - ranges: ranges for the PCI memory and I/O regions
 - #interrupt-cells: set to <1>
-- interrupt-map-mask and interrupt-map: standard PCI properties
-	to define the mapping of the PCIe interface to interrupt
+- interrupt-map-mask and interrupt-map: standard PCI
+  properties to define the mapping of the PCIe interface to interrupt
   numbers.
-- num-lanes: number of lanes to use
+EP mode:
+- num-ib-windows: number of inbound address translation
+  windows
+- num-ob-windows: number of outbound address translation
+  windows
 
 Optional properties:
-- num-viewport: number of view ports configured in hardware.  If a platform
-  does not specify it, the driver assumes 2.
 - num-lanes: number of lanes to use (this property should be specified unless
   the link is brought already up in BIOS)
 - reset-gpio: gpio pin number of power good signal
-- bus-range: PCI bus numbers covered (it is recommended for new devicetrees to
-  specify this property, to keep backwards compatibility a range of 0x00-0xff
-  is assumed if not present)
 - clocks: Must contain an entry for each entry in clock-names.
 	See ../clocks/clock-bindings.txt for details.
 - clock-names: Must include the following entries:
	- "pcie"
	- "pcie_bus"
+RC mode:
+- num-viewport: number of view ports configured in
+  hardware. If a platform does not specify it, the driver assumes 2.
+- bus-range: PCI bus numbers covered (it is recommended
+  for new devicetrees to specify this property, to keep backwards
+  compatibility a range of 0x00-0xff is assumed if not present)
+EP mode:
+- max-functions: maximum number of functions that can be
+  configured
 
 Example configuration:
 
diff --git a/Documentation/devicetree/bindings/pci/faraday,ftpci100.txt b/Documentation/devicetree/bindings/pci/faraday,ftpci100.txt
new file mode 100644
index 0000000000000000000000000000000000000000..35d4a979bb7b2012fc653ba23c622a76db42451e
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/faraday,ftpci100.txt
@@ -0,0 +1,129 @@
+Faraday Technology FTPCI100 PCI Host Bridge
+
+This PCI bridge is found inside the Cortina Systems Gemini SoC platform and
+is a generic IP block from Faraday Technology. It exists in two variants:
+plain and dual PCI. The plain version embeds a cascading interrupt controller
+into the host bridge. The dual version routes the interrupts to the host
+chip's interrupt controller.
+
+The host controller appears on the PCI bus with vendor ID 0x159b (Faraday
+Technology) and product ID 0x4321.
+
+Mandatory properties:
+
+- compatible: ranging from specific to generic, should be one of
+  "cortina,gemini-pci", "faraday,ftpci100"
+  "cortina,gemini-pci-dual", "faraday,ftpci100-dual"
+  "faraday,ftpci100"
+  "faraday,ftpci100-dual"
+- reg: memory base and size for the host bridge
+- #address-cells: set to <3>
+- #size-cells: set to <2>
+- #interrupt-cells: set to <1>
+- bus-range: set to <0x00 0xff>
+- device_type, set to "pci"
+- ranges: see pci.txt
+- interrupt-map-mask: see pci.txt
+- interrupt-map: see pci.txt
+- dma-ranges: three ranges for the inbound memory region. The ranges must
+  be aligned to a 1MB boundary, and may be 1MB, 2MB, 4MB, 8MB, 16MB, 32MB, 64MB,
+  128MB, 256MB, 512MB, 1GB or 2GB in size. The memory should be marked as
+  pre-fetchable.
+
+Mandatory subnodes:
+- For "faraday,ftpci100" a node representing the interrupt-controller inside the
+  host bridge is mandatory.
It has the following mandatory properties:
+  - interrupts: see interrupt-controller/interrupts.txt
+  - interrupt-parent: see interrupt-controller/interrupts.txt
+  - interrupt-controller: see interrupt-controller/interrupts.txt
+  - #address-cells: set to <0>
+  - #interrupt-cells: set to <1>
+
+I/O space considerations:
+
+The plain variant has 128MiB of non-prefetchable memory space, whereas the
+"dual" variant has 64MiB. Take this into account when describing the ranges.
+
+Interrupt map considerations:
+
+The "dual" variant will get INT A, B, C, D from the system interrupt controller
+and should point to respective interrupt in that controller in its
+interrupt-map.
+
+The driver code, which is the only documentation of how the Faraday PCI (the
+non-dual variant) assigns its default interrupt mapping/swizzling, has
+typically looked like this, doing the swizzling on the interrupt controller
+side rather than in the interconnect:
+
+interrupt-map-mask = <0xf800 0 0 7>;
+interrupt-map =
+	<0x4800 0 0 1 &pci_intc 0>, /* Slot 9 */
+	<0x4800 0 0 2 &pci_intc 1>,
+	<0x4800 0 0 3 &pci_intc 2>,
+	<0x4800 0 0 4 &pci_intc 3>,
+	<0x5000 0 0 1 &pci_intc 1>, /* Slot 10 */
+	<0x5000 0 0 2 &pci_intc 2>,
+	<0x5000 0 0 3 &pci_intc 3>,
+	<0x5000 0 0 4 &pci_intc 0>,
+	<0x5800 0 0 1 &pci_intc 2>, /* Slot 11 */
+	<0x5800 0 0 2 &pci_intc 3>,
+	<0x5800 0 0 3 &pci_intc 0>,
+	<0x5800 0 0 4 &pci_intc 1>,
+	<0x6000 0 0 1 &pci_intc 3>, /* Slot 12 */
+	<0x6000 0 0 2 &pci_intc 0>,
+	<0x6000 0 0 3 &pci_intc 1>,
+	<0x6000 0 0 4 &pci_intc 2>;
+
+Example:
+
+pci@50000000 {
+	compatible = "cortina,gemini-pci", "faraday,ftpci100";
+	reg = <0x50000000 0x100>;
+	interrupts = <8 IRQ_TYPE_LEVEL_HIGH>, /* PCI A */
+		<26 IRQ_TYPE_LEVEL_HIGH>, /* PCI B */
+		<27 IRQ_TYPE_LEVEL_HIGH>, /* PCI C */
+		<28 IRQ_TYPE_LEVEL_HIGH>; /* PCI D */
+	#address-cells = <3>;
+	#size-cells = <2>;
+	#interrupt-cells = <1>;
+
+	bus-range = <0x00 0xff>;
+	ranges = /* 1MiB I/O space 0x50000000-0x500fffff */
+		 <0x01000000 0 0 0x50000000 0 0x00100000>,
+		 /* 128MiB non-prefetchable memory 0x58000000-0x5fffffff */
+		 <0x02000000 0 0x58000000 0x58000000 0 0x08000000>;
+
+	/* DMA ranges */
+	dma-ranges =
+	/* 128MiB at 0x00000000-0x07ffffff */
+	<0x02000000 0 0x00000000 0x00000000 0 0x08000000>,
+	/* 64MiB at 0x00000000-0x03ffffff */
+	<0x02000000 0 0x00000000 0x00000000 0 0x04000000>,
+	/* 64MiB at 0x00000000-0x03ffffff */
+	<0x02000000 0 0x00000000 0x00000000 0 0x04000000>;
+
+	interrupt-map-mask = <0xf800 0 0 7>;
+	interrupt-map =
+		<0x4800 0 0 1 &pci_intc 0>, /* Slot 9 */
+		<0x4800 0 0 2 &pci_intc 1>,
+		<0x4800 0 0 3 &pci_intc 2>,
+		<0x4800 0 0 4 &pci_intc 3>,
+		<0x5000 0 0 1 &pci_intc 1>, /* Slot 10 */
+		<0x5000 0 0 2 &pci_intc 2>,
+		<0x5000 0 0 3 &pci_intc 3>,
+		<0x5000 0 0 4 &pci_intc 0>,
+		<0x5800 0 0 1 &pci_intc 2>, /* Slot 11 */
+		<0x5800 0 0 2 &pci_intc 3>,
+		<0x5800 0 0 3 &pci_intc 0>,
+		<0x5800 0 0 4 &pci_intc 1>,
+		<0x6000 0 0 1 &pci_intc 3>, /* Slot 12 */
+		<0x6000 0 0 2 &pci_intc 0>,
+		<0x6000 0 0 3 &pci_intc 1>,
+		<0x6000 0 0 4 &pci_intc 2>;
+	pci_intc: interrupt-controller {
+		interrupt-parent = <&intcon>;
+		interrupt-controller;
+		#address-cells = <0>;
+		#interrupt-cells = <1>;
+	};
+};
diff --git a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt
index 83aeb1f5a645ce2fe494e14dedce27dc7085155f..e3d5680875b123326c7cc5527c8f85d7219fe691 100644
--- a/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/fsl,imx6q-pcie.txt
@@ -4,7
+4,11 @@ This PCIe host controller is based on the Synopsis Designware PCIe IP
 and thus inherits all the common properties defined in designware-pcie.txt.
 
 Required properties:
-- compatible: "fsl,imx6q-pcie", "fsl,imx6sx-pcie", "fsl,imx6qp-pcie"
+- compatible:
+	- "fsl,imx6q-pcie"
+	- "fsl,imx6sx-pcie"
+	- "fsl,imx6qp-pcie"
+	- "fsl,imx7d-pcie"
 - reg: base address and length of the PCIe controller
 - interrupts: A list of interrupt outputs of the controller. Must contain an
   entry for each entry in the interrupt-names property.
@@ -34,6 +38,14 @@ Additional required properties for imx6sx-pcie:
 - clock names: Must include the following additional entries:
	- "pcie_inbound_axi"
 
+Additional required properties for imx7d-pcie:
+- power-domains: Must be set to a phandle pointing to PCIE_PHY power domain
+- resets: Must contain phandles to PCIe-related reset lines exposed by SRC
+  IP block
+- reset-names: Must contain the following entries:
+	- "pciephy"
+	- "apps"
+
 Example:
 
	pcie@0x01000000 {
diff --git a/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt b/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
index b7fa3b97986d5fff391bf4ff1a01bdd25ce096e9..a339dbb154933282ee06aaeec0571e0d95a72b42 100644
--- a/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
+++ b/Documentation/devicetree/bindings/pci/hisilicon-pcie.txt
@@ -44,13 +44,19 @@ Hip05 Example (note that Hip06 is the same except compatible):
	};
 
 HiSilicon Hip06/Hip07 PCIe host bridge DT (almost-ECAM) description.
+
+Some BIOSes place the host controller in a mode where it is ECAM
+compliant for all devices other than the root complex. In such cases,
+the host controller should be described as below.
+
 The properties and their meanings are identical to those described in
 host-generic-pci.txt except as listed below.
 
 Properties of the host controller node that differ from
 host-generic-pci.txt:
 
-- compatible     : Must be "hisilicon,pcie-almost-ecam"
+- compatible     : Must be "hisilicon,hip06-pcie-ecam", or
+		   "hisilicon,hip07-pcie-ecam"
 
 - reg            : Two entries: First the ECAM configuration space for any
		   other bus underneath the root bus. Second, the base
@@ -59,7 +65,7 @@ host-generic-pci.txt:
 Example:
	pcie0: pcie@a0090000 {
-		compatible = "hisilicon,pcie-almost-ecam";
+		compatible = "hisilicon,hip06-pcie-ecam";
		reg = <0 0xb0000000 0 0x2000000>,  /* ECAM configuration space */
		      <0 0xa0090000 0 0x10000>; /* host bridge registers */
		bus-range = <0  31>;
diff --git a/Documentation/devicetree/bindings/pci/ti-pci.txt b/Documentation/devicetree/bindings/pci/ti-pci.txt
index 60e25161f35181bfd026b48519f43666d33f091e..6a07c96227e0a0a9bd96530c2833df195657e7dc 100644
--- a/Documentation/devicetree/bindings/pci/ti-pci.txt
+++ b/Documentation/devicetree/bindings/pci/ti-pci.txt
@@ -1,17 +1,22 @@
 TI PCI Controllers
 
 PCIe Designware Controller
- - compatible: Should be "ti,dra7-pcie""
- - reg : Two register ranges as listed in the reg-names property
- - reg-names : The first entry must be "ti-conf" for the TI specific registers
-	       The second entry must be "rc-dbics" for the designware pcie
-	       registers
-	       The third entry must be "config" for the PCIe configuration space
+ - compatible: Should be "ti,dra7-pcie" for RC
+	       Should be "ti,dra7-pcie-ep" for EP
 - phys : list of PHY specifiers (used by generic PHY framework)
 - phy-names : must be "pcie-phy0", "pcie-phy1", "pcie-phyN".. based on the
   number of PHYs as specified in *phys* property.
 - ti,hwmods : Name of the hwmod associated to the pcie, "pcie<X>",
   where <X> is the instance number of the pcie from the HW spec.
+ - num-lanes as specified in ../designware-pcie.txt
+
+HOST MODE
+=========
+ - reg : Two register ranges as listed in the reg-names property
+ - reg-names : The first entry must be "ti-conf" for the TI specific registers
+	       The second entry must be "rc-dbics" for the DesignWare PCIe
+	       registers
+	       The third entry must be "config" for the PCIe configuration space
 - interrupts : Two interrupt entries must be specified. The first one is for
	        main interrupt line and the second for MSI interrupt line.
 - #address-cells,
@@ -19,13 +24,36 @@ PCIe Designware Controller
   #interrupt-cells,
   device_type,
   ranges,
-  num-lanes,
   interrupt-map-mask,
   interrupt-map : as specified in ../designware-pcie.txt
 
+DEVICE MODE
+===========
+ - reg : Four register ranges as listed in the reg-names property
+ - reg-names : "ti-conf" for the TI specific registers
+	       "ep_dbics" for the standard configuration registers as
+	       they are locally accessed within the DIF CS space
+	       "ep_dbics2" for the standard configuration registers as
+	       they are locally accessed within the DIF CS2 space
+	       "addr_space" used to map remote RC address space
+ - interrupts : one interrupt entry must be specified for the main interrupt.
+ - num-ib-windows : number of inbound address translation windows
+ - num-ob-windows : number of outbound address translation windows
+ - ti,syscon-unaligned-access: phandle to the syscon DT node. The 1st argument
+			       should contain the register offset within syscon
+			       and the 2nd argument should contain the bit field
+			       for setting the bit to enable unaligned
+			       access.
+
 Optional Property:
 - gpios : Should be added if a gpio line is required to drive PERST# line
 
+NOTE: Two DT nodes may be added for each PCI controller; one for host
+mode and another for device mode. So in order for PCI to work in host
+mode, the EP mode DT node should be disabled, and in order for PCI to
+work in EP mode, the host mode DT node should be disabled. Host mode and EP
+mode are mutually exclusive.
+
 Example:
 axi {
	compatible = "simple-bus";
diff --git a/Documentation/devicetree/bindings/phy/phy-mt65xx-usb.txt b/Documentation/devicetree/bindings/phy/phy-mt65xx-usb.txt
index 33a2b1ee3f3eb20e09c8cefe400292468b3f6e43..0acc5a99fb79227262ed6ef1f8d8774fefcb914a 100644
--- a/Documentation/devicetree/bindings/phy/phy-mt65xx-usb.txt
+++ b/Documentation/devicetree/bindings/phy/phy-mt65xx-usb.txt
@@ -6,12 +6,11 @@ This binding describes a usb3.0 phy for mt65xx platforms of Medaitek SoC.
 Required properties (controller (parent) node):
 - compatible	: should be one of
		  "mediatek,mt2701-u3phy"
+		  "mediatek,mt2712-u3phy"
		  "mediatek,mt8173-u3phy"
- - reg		: offset and length of register for phy, exclude port's
-		  register.
- - clocks	: a list of phandle + clock-specifier pairs, one for each
-		  entry in clock-names
- - clock-names	: must contain
+ - clocks	: (deprecated, use port's clocks instead) a list of phandle +
+		  clock-specifier pairs, one for each entry in clock-names
+ - clock-names	: (deprecated, use port's one instead) must contain
		  "u3phya_ref": for reference clock of usb3.0 analog phy.
 
 Required nodes	: a sub-node is required for each port the controller
@@ -19,8 +18,19 @@ Required nodes	: a sub-node is required for each port the controller
		  'reg' property is used inside these nodes to describe
		  the controller's topology.
 
+Optional properties (controller (parent) node):
+ - reg		: offset and length of register shared by multiple ports,
+		  exclude port's private register. It is needed on mt2701
+		  and mt8173, but not on mt2712 (see the sketch below).
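+
+		  As a rough sketch (the node name and unit address here are
+		  illustrative only, not taken from a real mt2712 board), an
+		  mt2712 controller node can therefore omit reg entirely:
+
+		  u3phy: usb-phy@11290000 {
+			compatible = "mediatek,mt2712-u3phy";
+			#address-cells = <2>;
+			#size-cells = <2>;
+			ranges;
+			/* port sub-nodes with their own reg/clocks go here */
+		  };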
+
 Required properties (port (child) node):
 - reg		: address and length of the register set for the port.
+- clocks	: a list of phandle + clock-specifier pairs, one for each
+		  entry in clock-names
+- clock-names	: must contain
+		  "ref": 48M reference clock for HighSpeed analog phy; and 26M
+			reference clock for SuperSpeed analog phy, sometimes
+			24M, 25M or 27M, depending on the platform.
 - #phy-cells	: should be 1 (See second example)
		  cell after port phandle is phy type from:
		  - PHY_TYPE_USB2
		  - PHY_TYPE_USB3
@@ -31,21 +41,31 @@ Example:
 
 u3phy: usb-phy@11290000 {
	compatible = "mediatek,mt8173-u3phy";
	reg = <0 0x11290000 0 0x800>;
-	clocks = <&apmixedsys CLK_APMIXED_REF2USB_TX>;
-	clock-names = "u3phya_ref";
	#address-cells = <2>;
	#size-cells = <2>;
	ranges;
	status = "okay";
 
-	phy_port0: port@11290800 {
-		reg = <0 0x11290800 0 0x800>;
+	u2port0: usb-phy@11290800 {
+		reg = <0 0x11290800 0 0x100>;
+		clocks = <&apmixedsys CLK_APMIXED_REF2USB_TX>;
+		clock-names = "ref";
		#phy-cells = <1>;
		status = "okay";
	};
 
-	phy_port1: port@11291000 {
-		reg = <0 0x11291000 0 0x800>;
+	u3port0: usb-phy@11290900 {
+		reg = <0 0x11290900 0 0x700>;
+		clocks = <&clk26m>;
+		clock-names = "ref";
+		#phy-cells = <1>;
+		status = "okay";
+	};
+
+	u2port1: usb-phy@11291000 {
+		reg = <0 0x11291000 0 0x100>;
+		clocks = <&apmixedsys CLK_APMIXED_REF2USB_TX>;
+		clock-names = "ref";
		#phy-cells = <1>;
		status = "okay";
	};
@@ -64,7 +84,54 @@ Example:
 
 usb30: usb@11270000 {
	...
-	phys = <&phy_port0 PHY_TYPE_USB3>;
-	phy-names = "usb3-0";
+	phys = <&u2port0 PHY_TYPE_USB2>, <&u3port0 PHY_TYPE_USB3>;
+	phy-names = "usb2-0", "usb3-0";
	...
 };
+
+
+Layout differences of banks between mt8173/mt2701 and mt2712
+-------------------------------------------------------------
+mt8173 and mt2701:
+port		offset	bank
+shared		0x0000	SPLLC
+		0x0100	FMREG
+u2 port0	0x0800	U2PHY_COM
+u3 port0	0x0900	U3PHYD
+		0x0a00	U3PHYD_BANK2
+		0x0b00	U3PHYA
+		0x0c00	U3PHYA_DA
+u2 port1	0x1000	U2PHY_COM
+u3 port1	0x1100	U3PHYD
+		0x1200	U3PHYD_BANK2
+		0x1300	U3PHYA
+		0x1400	U3PHYA_DA
+u2 port2	0x1800	U2PHY_COM
+		...
+
+mt2712:
+port		offset	bank
+u2 port0	0x0000	MISC
+		0x0100	FMREG
+		0x0300	U2PHY_COM
+u3 port0	0x0700	SPLLC
+		0x0800	CHIP
+		0x0900	U3PHYD
+		0x0a00	U3PHYD_BANK2
+		0x0b00	U3PHYA
+		0x0c00	U3PHYA_DA
+u2 port1	0x1000	MISC
+		0x1100	FMREG
+		0x1300	U2PHY_COM
+u3 port1	0x1700	SPLLC
+		0x1800	CHIP
+		0x1900	U3PHYD
+		0x1a00	U3PHYD_BANK2
+		0x1b00	U3PHYA
+		0x1c00	U3PHYA_DA
+u2 port2	0x2000	MISC
+		...
+
+The SPLLC bank shared by u3 ports and the FMREG bank shared by u2 ports on
+mt8173/mt2701 are put back into each port on mt2712, and a new bank MISC for
+u2 ports and CHIP for u3 ports are added there.
diff --git a/Documentation/devicetree/bindings/phy/phy-rockchip-inno-usb2.txt b/Documentation/devicetree/bindings/phy/phy-rockchip-inno-usb2.txt
index 3c29c77a7018da4e52b87920965d41013dbd9ef2..e71a8d23f4a8c396b8377b9ec210bd10edbb9753 100644
--- a/Documentation/devicetree/bindings/phy/phy-rockchip-inno-usb2.txt
+++ b/Documentation/devicetree/bindings/phy/phy-rockchip-inno-usb2.txt
@@ -2,6 +2,7 @@ ROCKCHIP USB2.0 PHY WITH INNO IP BLOCK
 
 Required properties (phy (parent) node):
 - compatible : should be one of the listed compatibles:
+	* "rockchip,rk3328-usb2phy"
	* "rockchip,rk3366-usb2phy"
	* "rockchip,rk3399-usb2phy"
 - reg : the address offset of grf for usb-phy configuration.
@@ -11,6 +12,11 @@ Required properties (phy (parent) node):
 Optional properties:
 - clocks : phandle + phy specifier pair, for the input clock of phy.
 - clock-names : input clock name of phy, must be "phyclk".
+ - assigned-clocks : phandle of usb 480m clock.
+ - assigned-clock-parents : parent of usb 480m clock, select between
+		 usb-phy output 480m and xin24m.
+		 Refer to clk/clock-bindings.txt for generic clock
+		 consumer properties.
 
 Required nodes : a sub-node is required for each port the phy provides.
		 The sub-node name is used to identify host or otg port,
diff --git a/Documentation/devicetree/bindings/phy/qcom-qmp-phy.txt b/Documentation/devicetree/bindings/phy/qcom-qmp-phy.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e11c563a65ecac285bcc058e80774190c8b3e71b
--- /dev/null
+++ b/Documentation/devicetree/bindings/phy/qcom-qmp-phy.txt
@@ -0,0 +1,106 @@
+Qualcomm QMP PHY controller
+===========================
+
+The QMP phy controller supports physical layer functionality for a number of
+controllers on Qualcomm chipsets, such as PCIe, UFS, and USB.
+
+Required properties:
+ - compatible: compatible list, contains:
+	       "qcom,msm8996-qmp-pcie-phy" for 14nm PCIe phy on msm8996,
+	       "qcom,msm8996-qmp-usb3-phy" for 14nm USB3 phy on msm8996.
+
+ - reg: offset and length of register set for PHY's common serdes block.
+
+ - #clock-cells: must be 1
+    - The PHY pll outputs a number of clocks for Tx, Rx and Pipe
+      interface (for pipe based PHYs). These clocks are then gate-controlled
+      by gcc.
+ - #address-cells: must be 1
+ - #size-cells: must be 1
+ - ranges: must be present
+
+ - clocks: a list of phandles and clock-specifier pairs,
+	   one for each entry in clock-names.
+ - clock-names: "cfg_ahb" for phy config clock,
+		"aux" for phy aux clock,
+		"ref" for 19.2 MHz ref clk,
+		For "qcom,msm8996-qmp-pcie-phy" must contain:
+			"aux", "cfg_ahb", "ref".
+		For "qcom,msm8996-qmp-usb3-phy" must contain:
+			"aux", "cfg_ahb", "ref".
+
+ - resets: a list of phandles and reset controller specifier pairs,
+	   one for each entry in reset-names.
+ - reset-names: "phy" for reset of phy block,
+		"common" for phy common block reset,
+		"cfg" for phy's ahb cfg block reset (Optional).
+		For "qcom,msm8996-qmp-pcie-phy" must contain:
+			"phy", "common", "cfg".
+		For "qcom,msm8996-qmp-usb3-phy" must contain
+			"phy", "common".
+
+ - vdda-phy-supply: Phandle to a regulator supply to PHY core block.
+ - vdda-pll-supply: Phandle to 1.8V regulator supply to PHY refclk pll block.
+
+Optional properties:
+ - vddp-ref-clk-supply: Phandle to a regulator supply to any specific refclk
+			pll block.
+
+Required nodes:
+ - Each device node of QMP phy is required to have as many child nodes as
+   the number of lanes the PHY has.
+
+Required properties for child node:
+ - reg: list of offset and length pairs of register sets for PHY blocks -
+	tx, rx and pcs.
+
+ - #phy-cells: must be 0
+
+ - clocks: a list of phandles and clock-specifier pairs,
+	   one for each entry in clock-names.
+ - clock-names: Must contain following for pcie and usb qmp phys:
+		"pipe" for pipe clock specific to each lane.
+
+ - resets: a list of phandles and reset controller specifier pairs,
+	   one for each entry in reset-names.
+ - reset-names: Must contain following for pcie qmp phys:
+		"lane" for reset specific to each lane.
+
+Example:
+	phy@34000 {
+		compatible = "qcom,msm8996-qmp-pcie-phy";
+		reg = <0x34000 0x488>;
+		#clock-cells = <1>;
+		#address-cells = <1>;
+		#size-cells = <1>;
+		ranges;
+
+		clocks = <&gcc GCC_PCIE_PHY_AUX_CLK>,
+			<&gcc GCC_PCIE_PHY_CFG_AHB_CLK>,
+			<&gcc GCC_PCIE_CLKREF_CLK>;
+		clock-names = "aux", "cfg_ahb", "ref";
+
+		vdda-phy-supply = <&pm8994_l28>;
+		vdda-pll-supply = <&pm8994_l12>;
+
+		resets = <&gcc GCC_PCIE_PHY_BCR>,
+			<&gcc GCC_PCIE_PHY_COM_BCR>,
+			<&gcc GCC_PCIE_PHY_COM_NOCSR_BCR>;
+		reset-names = "phy", "common", "cfg";
+
+		pciephy_0: lane@35000 {
+			reg = <0x35000 0x130>,
+				<0x35200 0x200>,
+				<0x35400 0x1dc>;
+			#phy-cells = <0>;
+
+			clocks = <&gcc GCC_PCIE_0_PIPE_CLK>;
+			clock-names = "pipe0";
+			resets = <&gcc GCC_PCIE_0_PHY_BCR>;
+			reset-names = "lane0";
+		};
+
+		pciephy_1: lane@36000 {
+			...
+			...
+		};
diff --git a/Documentation/devicetree/bindings/phy/qcom-qusb2-phy.txt b/Documentation/devicetree/bindings/phy/qcom-qusb2-phy.txt
new file mode 100644
index 0000000000000000000000000000000000000000..aa0fcb05acb3e4d309224f21fff291c71a6286e9
--- /dev/null
+++ b/Documentation/devicetree/bindings/phy/qcom-qusb2-phy.txt
@@ -0,0 +1,43 @@
+Qualcomm QUSB2 phy controller
+=============================
+
+The QUSB2 controller supports LS/FS/HS usb connectivity on Qualcomm chipsets.
+
+Required properties:
+ - compatible: compatible list, contains "qcom,msm8996-qusb2-phy".
+ - reg: offset and length of the PHY register set.
+ - #phy-cells: must be 0.
+
+ - clocks: a list of phandles and clock-specifier pairs,
+	   one for each entry in clock-names.
+ - clock-names: must be "cfg_ahb" for phy config clock,
+		"ref" for 19.2 MHz ref clk,
+		"iface" for phy interface clock (Optional).
+
+ - vdda-pll-supply: Phandle to 1.8V regulator supply to PHY refclk pll block.
+ - vdda-phy-dpdm-supply: Phandle to 3.1V regulator supply to Dp/Dm port signals.
+
+ - resets: Phandle to the reset of the phy block.
+
+Optional properties:
+ - nvmem-cells: Phandle to nvmem cell that contains 'HS Tx trim'
+		tuning parameter value for qusb2 phy.
+
+ - qcom,tcsr-syscon: Phandle to TCSR syscon register region.
+
+Example:
+	hsusb_phy: phy@7411000 {
+		compatible = "qcom,msm8996-qusb2-phy";
+		reg = <0x7411000 0x180>;
+		#phy-cells = <0>;
+
+		clocks = <&gcc GCC_USB_PHY_CFG_AHB2PHY_CLK>,
+			<&gcc GCC_RX1_USB2_CLKREF_CLK>;
+		clock-names = "cfg_ahb", "ref";
+
+		vdda-pll-supply = <&pm8994_l12>;
+		vdda-phy-dpdm-supply = <&pm8994_l24>;
+
+		resets = <&gcc GCC_QUSB2PHY_PRIM_BCR>;
+		nvmem-cells = <&qusb2p_hstx_trim>;
+	};
diff --git a/Documentation/devicetree/bindings/phy/rockchip-usb-phy.txt b/Documentation/devicetree/bindings/phy/rockchip-usb-phy.txt
index 57dc388e2fa2fe7086b2abe186b46ef2703d7112..4ed569046daf919e487d0147b47720cdd44817aa 100644
--- a/Documentation/devicetree/bindings/phy/rockchip-usb-phy.txt
+++ b/Documentation/devicetree/bindings/phy/rockchip-usb-phy.txt
@@ -30,6 +30,7 @@ Optional Properties:
 - reset-names: Only allow the following entries:
	- phy-reset
 - resets: Must contain an entry for each entry in reset-names.
+- vbus-supply: power-supply phandle for vbus power source Example: diff --git a/Documentation/devicetree/bindings/phy/sun4i-usb-phy.txt b/Documentation/devicetree/bindings/phy/sun4i-usb-phy.txt index e423342581855deebdc2bb4c9effa3e502540061..005bc22938fffb76d73fcee9b8cb29731a1e4584 100644 --- a/Documentation/devicetree/bindings/phy/sun4i-usb-phy.txt +++ b/Documentation/devicetree/bindings/phy/sun4i-usb-phy.txt @@ -15,6 +15,7 @@ Required properties: - reg : a list of offset + length pairs - reg-names : * "phy_ctrl" + * "pmu0" for H3, V3s and A64 * "pmu1" * "pmu2" for sun4i, sun6i or sun7i - #phy-cells : from the generic phy bindings, must be 1 diff --git a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt index 2fd688c8dbdb5ad1de269e2df6ca6261df231148..b532244736727716b2a4fe069b4ee31608f3dd05 100644 --- a/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/allwinner,sunxi-pinctrl.txt @@ -23,7 +23,8 @@ Required properties: "allwinner,sun8i-h3-pinctrl" "allwinner,sun8i-h3-r-pinctrl" "allwinner,sun50i-a64-pinctrl" - "allwinner,sun50i-h5-r-pinctrl" + "allwinner,sun50i-a64-r-pinctrl" + "allwinner,sun50i-h5-pinctrl" "nextthing,gr8-pinctrl" - reg: Should contain the register physical address and length for the diff --git a/Documentation/devicetree/bindings/pinctrl/atmel,at91-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/atmel,at91-pinctrl.txt index 9a8a45d9d8abc284c3108adb4ef6d1355208c415..590e60378be30dd8ff6838b6a7923dfc2a761b09 100644 --- a/Documentation/devicetree/bindings/pinctrl/atmel,at91-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/atmel,at91-pinctrl.txt @@ -4,7 +4,7 @@ The AT91 Pinmux Controller, enables the IC to share one PAD to several functional blocks. The sharing is done by multiplexing the PAD input/output signals. For each PAD there are up to 8 muxing options (called periph modes). Since different modules require -different PAD settings (like pull up, keeper, etc) the contoller controls +different PAD settings (like pull up, keeper, etc) the controller controls also the PAD settings parameters. Please refer to pinctrl-bindings.txt in this directory for details of the diff --git a/Documentation/devicetree/bindings/pinctrl/axis,artpec6-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/axis,artpec6-pinctrl.txt new file mode 100644 index 0000000000000000000000000000000000000000..47284f85ec804fd038a9bd931cbcb867fa5fe793 --- /dev/null +++ b/Documentation/devicetree/bindings/pinctrl/axis,artpec6-pinctrl.txt @@ -0,0 +1,85 @@ +Axis ARTPEC-6 Pin Controller + +Required properties: +- compatible: "axis,artpec6-pinctrl". +- reg: Should contain the register physical address and length for the pin + controller. + +A pinctrl node should contain at least one subnode representing the pinctrl +groups available on the machine. Each subnode will list the mux function +required and what pin group it will use. Each subnode will also configure the +drive strength and bias pullup of the pin group. If either of these options is +not set, its actual value will be unspecified. + + +Required subnode-properties: +- function: Function to mux. +- groups: Name of the pin group to use for the function above. 
+
+  Available functions and groups (function: group0, group1...):
+  gpio: cpuclkoutgrp0, udlclkoutgrp0, i2c1grp0, i2c2grp0,
+	i2c3grp0, i2s0grp0, i2s1grp0, i2srefclkgrp0, spi0grp0,
+	spi1grp0, pciedebuggrp0, uart0grp0, uart0grp1, uart1grp0,
+	uart2grp0, uart2grp1, uart3grp0, uart4grp0, uart5grp0
+  cpuclkout: cpuclkoutgrp0
+  udlclkout: udlclkoutgrp0
+  i2c1: i2c1grp0
+  i2c2: i2c2grp0
+  i2c3: i2c3grp0
+  i2s0: i2s0grp0
+  i2s1: i2s1grp0
+  i2srefclk: i2srefclkgrp0
+  spi0: spi0grp0
+  spi1: spi1grp0
+  pciedebug: pciedebuggrp0
+  uart0: uart0grp0, uart0grp1
+  uart1: uart1grp0
+  uart2: uart2grp0, uart2grp1
+  uart3: uart3grp0
+  uart4: uart4grp0
+  uart5: uart5grp0
+  nand: nandgrp0
+  sdio0: sdio0grp0
+  sdio1: sdio1grp0
+  ethernet: ethernetgrp0
+
+
+Optional subnode-properties (see pinctrl-bindings.txt):
+- drive-strength: 4, 6, 8, 9 mA. For SD and NAND pins, this is for 3.3V VCCQ3.
+- bias-pull-up
+- bias-disable
+
+Examples:
+pinctrl@f801d000 {
+	compatible = "axis,artpec6-pinctrl";
+	reg = <0xf801d000 0x400>;
+
+	pinctrl_uart0: uart0grp {
+		function = "uart0";
+		groups = "uart0grp0";
+		drive-strength = <4>;
+		bias-pull-up;
+	};
+	pinctrl_uart3: uart3grp {
+		function = "uart3";
+		groups = "uart3grp0";
+	};
+};
+uart0: uart@f8036000 {
+	compatible = "arm,pl011", "arm,primecell";
+	reg = <0xf8036000 0x1000>;
+	interrupts = <0 104 IRQ_TYPE_LEVEL_HIGH>;
+	clocks = <&pll2div24>, <&apb_pclk>;
+	clock-names = "uart_clk", "apb_pclk";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_uart0>;
+};
+uart3: uart@f8039000 {
+	compatible = "arm,pl011", "arm,primecell";
+	reg = <0xf8039000 0x1000>;
+	interrupts = <0 128 IRQ_TYPE_LEVEL_HIGH>;
+	clocks = <&pll2div24>, <&apb_pclk>;
+	clock-names = "uart_clk", "apb_pclk";
+	pinctrl-names = "default";
+	pinctrl-0 = <&pinctrl_uart3>;
+};
diff --git a/Documentation/devicetree/bindings/pinctrl/marvell,armada-37xx-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/marvell,armada-37xx-pinctrl.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f64060908d5ae173e39ec3440f9bbdd3350065ef
--- /dev/null
+++ b/Documentation/devicetree/bindings/pinctrl/marvell,armada-37xx-pinctrl.txt
@@ -0,0 +1,183 @@
+* Marvell Armada 37xx SoC pin and gpio controller
+
+Each Armada 37xx SoC comes with two pin and gpio controllers, one for the
+south bridge and one for the north bridge.
+
+Inside this register set, the gpio latch exposes some configuration of the
+SoC, notably the clock frequency of the xtal. Hence, this node is represented
+as a syscon, allowing the registers to be shared between multiple hardware
+blocks.
+
+GPIO and pin controller:
+------------------------
+
+Main node:
+
+Refer to pinctrl-bindings.txt in this directory for details of the
+common pinctrl bindings used by client devices, including the meaning
+of the phrase "pin configuration node".
+
+Required properties for the pinctrl driver:
+
+- compatible: "marvell,armada3710-sb-pinctrl", "syscon", "simple-mfd"
+  for the south bridge
+  "marvell,armada3710-nb-pinctrl", "syscon", "simple-mfd"
+  for the north bridge
+- reg: The first set of registers is for pinctrl/gpio and the second
+  set is for the interrupt controller
+- interrupts: list of the interrupts used by the gpio controller
+
+Available groups and functions for the North bridge:
+
+group jtag
+ - pins 20-24
+ - functions jtag, gpio
+
+group sdio0
+ - pins 8-10
+ - functions sdio, gpio
+
+group emmc_nb
+ - pins 27-35
+ - functions emmc, gpio
+
+group pwm0
+ - pin 11 (GPIO1-11)
+ - functions pwm, gpio
+
+group pwm1
+ - pin 12
+ - functions pwm, gpio
+
+group pwm2
+ - pin 13
+ - functions pwm, gpio
+
+group pwm3
+ - pin 14
+ - functions pwm, gpio
+
+group pmic1
+ - pin 17
+ - functions pmic, gpio
+
+group pmic0
+ - pin 16
+ - functions pmic, gpio
+
+group i2c2
+ - pins 2-3
+ - functions i2c, gpio
+
+group i2c1
+ - pins 0-1
+ - functions i2c, gpio
+
+group spi_cs1
+ - pin 17
+ - functions spi, gpio
+
+group spi_cs2
+ - pin 18
+ - functions spi, gpio
+
+group spi_cs3
+ - pin 19
+ - functions spi, gpio
+
+group onewire
+ - pin 4
+ - functions onewire, gpio
+
+group uart1
+ - pins 25-26
+ - functions uart, gpio
+
+group spi_quad
+ - pins 15-16
+ - functions spi, gpio
+
+group uart_2
+ - pins 9-10
+ - functions uart, gpio
+
+Available groups and functions for the South bridge:
+
+group usb32_drvvbus0
+ - pin 36
+ - functions drvbus, gpio
+
+group usb2_drvvbus1
+ - pin 37
+ - functions drvbus, gpio
+
+group sdio_sb
+ - pins 60-64
+ - functions sdio, gpio
+
+group rgmii
+ - pins 42-55
+ - functions mii, gpio
+
+group pcie1
+ - pins 39-40
+ - functions pcie, gpio
+
+group ptp
+ - pins 56-58
+ - functions ptp, gpio
+
+group ptp_clk
+ - pin 57
+ - functions ptp, mii
+
+group ptp_trig
+ - pin 58
+ - functions ptp, mii
+
+group mii_col
+ - pin 59
+ - functions mii, mii_err
+
+GPIO subnode:
+
+Please refer to gpio.txt in this directory for details of the gpio-ranges
+property and the common GPIO bindings used by client devices.
+
+Required properties for the gpio driver under the gpio subnode:
+- interrupts: List of interrupt specifiers for the controller's interrupts.
+- gpio-controller: Marks the device node as a gpio controller.
+- #gpio-cells: Should be 2. The first cell is the GPIO number and the
+  second cell specifies GPIO flags, as defined in
+  <dt-bindings/gpio/gpio.h>. Only the GPIO_ACTIVE_HIGH and
+  GPIO_ACTIVE_LOW flags are supported.
+- gpio-ranges: Range of pins managed by the GPIO controller.
+
+Xtal Clock bindings for Marvell Armada 37xx SoCs
+------------------------------------------------
+
+see Documentation/devicetree/bindings/clock/armada3700-xtal-clock.txt
+
+
+Example:
+pinctrl_sb: pinctrl-sb@18800 {
+	compatible = "marvell,armada3710-sb-pinctrl", "syscon", "simple-mfd";
+	reg = <0x18800 0x100>, <0x18C00 0x20>;
+	gpio {
+		#gpio-cells = <2>;
+		gpio-ranges = <&pinctrl_sb 0 0 29>;
+		gpio-controller;
+		interrupts =
+		<GIC_SPI 160 IRQ_TYPE_LEVEL_HIGH>,
+		<GIC_SPI 159 IRQ_TYPE_LEVEL_HIGH>,
+		<GIC_SPI 158 IRQ_TYPE_LEVEL_HIGH>,
+		<GIC_SPI 157 IRQ_TYPE_LEVEL_HIGH>,
+		<GIC_SPI 156 IRQ_TYPE_LEVEL_HIGH>;
+	};
+
+	rgmii_pins: mii-pins {
+		groups = "rgmii";
+		function = "mii";
+	};
+
+};
diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt
index b98e6f030da84e0582b443124e305ed7fd6b0634..ca01710ee29afde12272a904907f49273403388d 100644
--- a/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt
+++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-aspeed.txt
@@ -34,13 +34,28 @@ Documentation/devicetree/bindings/mfd/syscon.txt
 Subnode Format
 ==============
 
-The required properties of child nodes are (as defined in pinctrl-bindings):
-- function
-- groups
+The required properties of pinmux child nodes are:
+- function: the mux function to select
+- groups : the list of groups to select with this function
 
-Each function has only one associated pin group. Each group is named by its
-function. The following values for the function and groups properties are
-supported:
+Required properties of pinconf child nodes are:
+- groups: A list of groups to select (either this or "pins" must be
+  specified)
+- pins : A list of ball names as strings, eg "D14" (either this or "groups"
+  must be specified)
+
+Optional properties of pinconf child nodes are:
+- bias-disable : disable any pin bias
+- bias-pull-down: pull down the pin
+- drive-strength: sink or source at most X mA
+
+Definitions are as specified in
+Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt, with any
+further limitations as described above.
+
+For pinmux, each mux function has only one associated pin group. Each group is
+named by its function. The following values for the function and groups
+properties are supported:
 
 aspeed,ast2400-pinctrl, aspeed,g4-pinctrl:
 
@@ -90,6 +105,11 @@ syscon: scu@1e6e2000 {
 			function = "I2C3";
 			groups = "I2C3";
 		};
+
+		pinctrl_gpioh0_unbiased_default: gpioh0 {
+			pins = "A8";
+			bias-disable;
+		};
 	};
 };
 
@@ -110,6 +130,11 @@ ahb {
 			function = "I2C3";
 			groups = "I2C3";
 		};
+
+		pinctrl_gpioh0_unbiased_default: gpioh0 {
+			pins = "A18";
+			bias-disable;
+		};
 	};
 };
 
@@ -143,6 +168,3 @@ ahb {
 		};
 	};
 };
-
-Please refer to pinctrl-bindings.txt in this directory for details of the
-common pinctrl bindings used by client devices.
diff --git a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
index bf3f7b014724676471a83ef06d3f75a6fb3e54fc..71a3c134af1b25d58613699aa465d17cff2c23d0 100644
--- a/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
+++ b/Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
@@ -162,8 +162,8 @@ state_2_node_a {
 	pins = "mfio29", "mfio30";
 };
 
-Optionally an altenative binding can be used if more suitable depending on the
-pin controller hardware. For hardaware where there is a large number of identical
+Optionally an alternative binding can be used if more suitable depending on the
+pin controller hardware. For hardware where there is a large number of identical
 pin controller instances, naming each pin and function can easily become
 unmaintainable.
This is especially the case if the same controller is used for different pins
and functions depending on the SoC revision and packaging.
 
@@ -198,6 +198,28 @@ registers, and must not be a virtual index of pin instances. The reason for
 this is to avoid mapping of the index in the dts files and the pin controller
 driver as it can change.
 
+For hardware where pin multiplexing configurations have to be specified for
+each single pin the number of required sub-nodes containing "pin" and
+"function" properties can quickly escalate and become hard to write and
+maintain.
+
+For cases like this, the pin controller driver may use the pinmux helper
+property, where the pin identifier is packed with mux configuration settings
+in a single integer.
+
+The pinmux property accepts an array of integers, each of them describing
+a single pin multiplexing configuration.
+
+pincontroller {
+	state_0_node_a {
+		pinmux = <PIN_ID_AND_MUX>, <PIN_ID_AND_MUX>, ...;
+	};
+};
+
+Each individual pin controller driver's bindings documentation shall specify
+how those values (pin IDs and pin multiplexing configuration) are defined and
+assembled together.
+
 == Generic pin configuration node content ==
 
 Many data items that are represented in a pin configuration node are common
@@ -210,18 +232,22 @@ structure of the DT nodes that contain these properties.
 Supported generic properties are:
 
 pins			- the list of pins that properties in the node
-			  apply to (either this or "group" has to be
+			  apply to (either this, "group" or "pinmux" has to be
			  specified)
 group			- the group to apply the properties to, if the driver
			  supports configuration of whole groups rather than
-			  individual pins (either this or "pins" has to be
-			  specified)
+			  individual pins (either this, "pins" or "pinmux" has
+			  to be specified)
+pinmux			- the list of numeric pin ids and their mux settings
+			  that properties in the node apply to (either this,
+			  "pins" or "group" has to be specified)
 bias-disable		- disable any pin bias
 bias-high-impedance	- high impedance mode ("third-state", "floating")
 bias-bus-hold		- latch weakly
 bias-pull-up		- pull up the pin
 bias-pull-down		- pull down the pin
 bias-pull-pin-default	- use pin-default pull state
+bi-directional		- pin supports simultaneous input/output operations
 drive-push-pull		- drive actively high and low
 drive-open-drain	- drive with open drain
 drive-open-source	- drive with open source
@@ -234,6 +260,7 @@ input-debounce		- debounce mode with debound time X
 power-source		- select between different power supplies
 low-power-enable	- enable low power mode
 low-power-disable	- disable low power mode
+output-enable		- enable output on pin regardless of output value
 output-low		- set the pin to output mode with low level
 output-high		- set the pin to output mode with high level
 slew-rate		- set the slew rate
@@ -258,6 +285,12 @@ state_2_node_a {
 		bias-pull-up;
 	};
 };
+state_3_node_a {
+	mux {
+		pinmux = <GPIOx_PINm_MUXn>, <GPIOx_PINj_MUXk>;
+		input-enable;
+	};
+};
 
 Some of the generic properties take arguments. For those that do, the
 arguments are described below.
@@ -266,6 +299,11 @@ arguments are described below.
   binding for the hardware defines:
   - Whether the entries are integers or strings, and their meaning.
 
+- pinmux takes a list of pin IDs and mux settings as required argument. The
+  specific bindings for the hardware define:
+  - How pin IDs and mux settings are defined and assembled together in a single
+    integer.
+
 - bias-pull-up, -down and -pin-default take as optional argument on hardware
   supporting it the pull strength in Ohm. bias-disable will disable the pull.
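+
+As an illustration of the optional pull strength argument, a hypothetical
+configuration node (the node and pin names are made up for this sketch) on
+hardware supporting it could look like:
+
+state_4_node_a {
+	pins = "pinA";
+	bias-pull-up = <47000>;	/* pull up with a 47 kOhm resistor */
+};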
diff --git a/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt index 4722bc61a1a29c62a909ed581b9727d94cf51f82..ee01ab58224d5e3907b3a115dcf6d0d8ed7c595f 100644 --- a/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/rockchip,pinctrl.txt @@ -19,11 +19,18 @@ The pins are grouped into up to 5 individual pin banks which need to be defined as gpio sub-nodes of the pinmux controller. Required properties for iomux controller: - - compatible: one of "rockchip,rk1108-pinctrl", "rockchip,rk2928-pinctrl" - "rockchip,rk3066a-pinctrl", "rockchip,rk3066b-pinctrl" - "rockchip,rk3188-pinctrl", "rockchip,rk3228-pinctrl" - "rockchip,rk3288-pinctrl", "rockchip,rk3368-pinctrl" - "rockchip,rk3399-pinctrl" + - compatible: should be + "rockchip,rv1108-pinctrl": for Rockchip RV1108 + "rockchip,rk2928-pinctrl": for Rockchip RK2928 + "rockchip,rk3066a-pinctrl": for Rockchip RK3066a + "rockchip,rk3066b-pinctrl": for Rockchip RK3066b + "rockchip,rk3188-pinctrl": for Rockchip RK3188 + "rockchip,rk3228-pinctrl": for Rockchip RK3228 + "rockchip,rk3288-pinctrl": for Rockchip RK3288 + "rockchip,rk3328-pinctrl": for Rockchip RK3328 + "rockchip,rk3368-pinctrl": for Rockchip RK3368 + "rockchip,rk3399-pinctrl": for Rockchip RK3399 + - rockchip,grf: phandle referencing a syscon providing the "general register files" diff --git a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt index eac20aa33907a647377571995cb3f863dc8df27d..d907a74f8dc0a5ee98d87118232fb20580c0fba3 100644 --- a/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt +++ b/Documentation/devicetree/bindings/pinctrl/st,stm32-pinctrl.txt @@ -9,6 +9,7 @@ Pin controller node: Required properies: - compatible: value should be one of the following: "st,stm32f429-pinctrl" + "st,stm32f469-pinctrl" "st,stm32f746-pinctrl" "st,stm32h743-pinctrl" - #address-cells: The value of this property must be 1 @@ -38,8 +39,6 @@ Optional properties: - st,syscfg: Should be phandle/offset pair. The phandle to the syscon node which includes IRQ mux selection register, and the offset of the IRQ mux selection register. - - ngpios: Number of gpios in a bank (to use if bank gpio numbers is less - than 16). - gpio-ranges: Define a dedicated mapping between a pin-controller and a gpio controller. Format is <&phandle a b c> with: -(phandle): phandle of pin-controller. diff --git a/Documentation/devicetree/bindings/power/fsl,imx-gpc.txt b/Documentation/devicetree/bindings/power/fsl,imx-gpc.txt index 65cc0345747d73f540d6c5a4e8b18acc1618953b..6c1498958d48c0221ff2d7731051f5f82ea5fe85 100644 --- a/Documentation/devicetree/bindings/power/fsl,imx-gpc.txt +++ b/Documentation/devicetree/bindings/power/fsl,imx-gpc.txt @@ -1,22 +1,42 @@ Freescale i.MX General Power Controller ======================================= -The i.MX6Q General Power Control (GPC) block contains DVFS load tracking -counters and Power Gating Control (PGC) for the CPU and PU (GPU/VPU) power -domains. +The i.MX6 General Power Control (GPC) block contains DVFS load tracking +counters and Power Gating Control (PGC). 
 
 Required properties:
-- compatible: Should be "fsl,imx6q-gpc" or "fsl,imx6sl-gpc"
+- compatible: Should be one of the following:
+  - fsl,imx6q-gpc
+  - fsl,imx6qp-gpc
+  - fsl,imx6sl-gpc
 - reg: should be register base and length as documented in the
   datasheet
-- interrupts: Should contain GPC interrupt request 1
-- pu-supply: Link to the LDO regulator powering the PU power domain
-- clocks: Clock phandles to devices in the PU power domain that need
-  to be enabled during domain power-up for reset propagation.
-- #power-domain-cells: Should be 1, see below:
+- interrupts: Should contain one interrupt specifier for the GPC interrupt
+- clocks: Must contain an entry for each entry in clock-names.
+  See Documentation/devicetree/bindings/clocks/clock-bindings.txt for details.
+- clock-names: Must include the following entries:
+  - ipg
 
-The gpc node is a power-controller as documented by the generic power domain
-bindings in Documentation/devicetree/bindings/power/power_domain.txt.
+The power domains are generic power domain providers as documented in
+Documentation/devicetree/bindings/power/power_domain.txt. They are described as
+subnodes of the power gating controller 'pgc' node of the GPC and should
+contain the following:
+
+Required properties:
+- reg: Must contain the DOMAIN_INDEX of this power domain
+  The following DOMAIN_INDEX values are valid for i.MX6Q:
+  ARM_DOMAIN     0
+  PU_DOMAIN      1
+  The following additional DOMAIN_INDEX value is valid for i.MX6SL:
+  DISPLAY_DOMAIN 2
+
+- #power-domain-cells: Should be 0
+
+Optional properties:
+- clocks: a number of phandles to clocks that need to be enabled during domain
+  power-up sequencing to ensure reset propagation into devices located inside
+  this power domain
+- power-supply: a phandle to the regulator powering this domain
 
 Example:
 
@@ -25,14 +45,30 @@ Example:
 		reg = <0x020dc000 0x4000>;
 		interrupts = <0 89 IRQ_TYPE_LEVEL_HIGH>,
 			     <0 90 IRQ_TYPE_LEVEL_HIGH>;
-		pu-supply = <&reg_pu>;
-		clocks = <&clks IMX6QDL_CLK_GPU3D_CORE>,
-			 <&clks IMX6QDL_CLK_GPU3D_SHADER>,
-			 <&clks IMX6QDL_CLK_GPU2D_CORE>,
-			 <&clks IMX6QDL_CLK_GPU2D_AXI>,
-			 <&clks IMX6QDL_CLK_OPENVG_AXI>,
-			 <&clks IMX6QDL_CLK_VPU_AXI>;
-		#power-domain-cells = <1>;
+		clocks = <&clks IMX6QDL_CLK_IPG>;
+		clock-names = "ipg";
+
+		pgc {
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			power-domain@0 {
+				reg = <0>;
+				#power-domain-cells = <0>;
+			};
+
+			pd_pu: power-domain@1 {
+				reg = <1>;
+				#power-domain-cells = <0>;
+				power-supply = <&reg_pu>;
+				clocks = <&clks IMX6QDL_CLK_GPU3D_CORE>,
+					 <&clks IMX6QDL_CLK_GPU3D_SHADER>,
+					 <&clks IMX6QDL_CLK_GPU2D_CORE>,
+					 <&clks IMX6QDL_CLK_GPU2D_AXI>,
+					 <&clks IMX6QDL_CLK_OPENVG_AXI>,
+					 <&clks IMX6QDL_CLK_VPU_AXI>;
+			};
+		};
 	};
 
@@ -40,20 +76,13 @@ Specifying power domain for IP modules
 ======================================
 
 IP cores belonging to a power domain should contain a 'power-domains' property
-that is a phandle pointing to the gpc device node and a DOMAIN_INDEX specifying
-the power domain the device belongs to.
+that is a phandle pointing to the power domain the device belongs to.
 
 Example of a device that is part of the PU power domain:
 
 	vpu: vpu@02040000 {
 		reg = <0x02040000 0x3c000>;
 		/* ... */
-		power-domains = <&gpc 1>;
+		power-domains = <&pd_pu>;
 		/* ...
 */
 	};
-
-The following DOMAIN_INDEX values are valid for i.MX6Q:
-ARM_DOMAIN 0
-PU_DOMAIN 1
-The following additional DOMAIN_INDEX value is valid for i.MX6SL:
-DISPLAY_DOMAIN 2
diff --git a/Documentation/devicetree/bindings/power/fsl,imx-gpcv2.txt b/Documentation/devicetree/bindings/power/fsl,imx-gpcv2.txt
new file mode 100644
index 0000000000000000000000000000000000000000..02f45c65fd87627007228b58dc6d7679ec3e5507
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/fsl,imx-gpcv2.txt
@@ -0,0 +1,71 @@
+Freescale i.MX General Power Controller v2
+==========================================
+
+The i.MX7S/D General Power Control (GPC) block contains Power Gating
+Control (PGC) for various power domains.
+
+Required properties:
+
+- compatible: Should be "fsl,imx7d-gpc"
+
+- reg: should be register base and length as documented in the
+  datasheet
+
+- interrupts: Should contain GPC interrupt request 1
+
+Power domains contained within the GPC node are generic power domain
+providers, documented in
+Documentation/devicetree/bindings/power/power_domain.txt, which are
+described as subnodes of the power gating controller 'pgc' node,
+which, in turn, is expected to contain the following:
+
+Required properties:
+
+- reg: Power domain index. Valid values are defined in
+  include/dt-bindings/power/imx7-power.h
+
+- #power-domain-cells: Should be 0
+
+Optional properties:
+
+- power-supply: Power supply used to power the domain
+
+Example:
+
+	gpc: gpc@303a0000 {
+		compatible = "fsl,imx7d-gpc";
+		reg = <0x303a0000 0x1000>;
+		interrupt-controller;
+		interrupts = <GIC_SPI 87 IRQ_TYPE_LEVEL_HIGH>;
+		#interrupt-cells = <3>;
+		interrupt-parent = <&intc>;
+
+		pgc {
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			pgc_pcie_phy: power-domain@3 {
+				#power-domain-cells = <0>;
+
+				reg = <IMX7_POWER_DOMAIN_PCIE_PHY>;
+				power-supply = <&reg_1p0d>;
+			};
+		};
+	};
+
+
+Specifying power domain for IP modules
+======================================
+
+IP cores belonging to a power domain should contain a 'power-domains'
+property that is a phandle for the PGC node representing the domain.
+
+Example of a device that is part of the PCIE_PHY power domain:
+
+	pcie: pcie@33800000 {
+		reg = <0x33800000 0x4000>,
+		      <0x4ff00000 0x80000>;
+		/* ... */
+		power-domains = <&pgc_pcie_phy>;
+		/* ... */
+	};
diff --git a/Documentation/devicetree/bindings/power/power_domain.txt b/Documentation/devicetree/bindings/power/power_domain.txt
index 723e1ad937da24edefa5c48b03ccb1f345c9919f..14bd9e945ff6453f3290d628b083a13e645e9967 100644
--- a/Documentation/devicetree/bindings/power/power_domain.txt
+++ b/Documentation/devicetree/bindings/power/power_domain.txt
@@ -31,7 +31,9 @@ Optional properties:
 - domain-idle-states : A phandle of an idle-state that shall be soaked into a
                        generic domain power state. The idle state definitions are
-                       compatible with domain-idle-state specified in [1].
+                       compatible with domain-idle-state specified in [1]. phandles
+                       that are not compatible with domain-idle-state will be
+                       ignored.
   The domain-idle-state property reflects the idle state of this PM domain and
   not the idle states of the devices or sub-domains in the PM domain.
Devices and sub-domains have their own idle-states independent of the parent @@ -79,7 +81,7 @@ Example 3: child: power-controller@12341000 { compatible = "foo,power-controller"; reg = <0x12341000 0x1000>; - power-domains = <&parent 0>; + power-domains = <&parent>; #power-domain-cells = <0>; domain-idle-states = <&DOMAIN_PWR_DN>; }; diff --git a/Documentation/devicetree/bindings/power/reset/gemini-poweroff.txt b/Documentation/devicetree/bindings/power/reset/gemini-poweroff.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fec3e100214c618bd2b5d1ba3a0bb01bc9df04e --- /dev/null +++ b/Documentation/devicetree/bindings/power/reset/gemini-poweroff.txt @@ -0,0 +1,17 @@ +* Device-Tree bindings for Cortina Systems Gemini Poweroff + +This is a special IP block in the Cortina Gemini SoC that only +deals with different ways to power the system down. + +Required properties: +- compatible: should be "cortina,gemini-power-controller" +- reg: should contain the physical memory base and size +- interrupts: should contain the power management interrupt + +Example: + +power-controller@4b000000 { + compatible = "cortina,gemini-power-controller"; + reg = <0x4b000000 0x100>; + interrupts = <26 IRQ_TYPE_EDGE_FALLING>; +}; diff --git a/Documentation/devicetree/bindings/power/reset/syscon-poweroff.txt b/Documentation/devicetree/bindings/power/reset/syscon-poweroff.txt index 1e2546f8b08a4aab989bbe8c04e76d6fd1df63e4..022ed1f3bc808351752ae1130dd226e7c7e7cb38 100644 --- a/Documentation/devicetree/bindings/power/reset/syscon-poweroff.txt +++ b/Documentation/devicetree/bindings/power/reset/syscon-poweroff.txt @@ -3,13 +3,20 @@ Generic SYSCON mapped register poweroff driver This is a generic poweroff driver using syscon to map the poweroff register. The poweroff is generally performed with a write to the poweroff register defined by the register map pointed by syscon reference plus the offset -with the mask defined in the poweroff node. +with the value and mask defined in the poweroff node. Required properties: - compatible: should contain "syscon-poweroff" - regmap: this is phandle to the register map node - offset: offset in the register map for the poweroff register (in bytes) -- mask: the poweroff value written to the poweroff register (32 bit access) +- value: the poweroff value written to the poweroff register (32 bit access) + +Optional properties: +- mask: update only the register bits defined by the mask (32 bit) + +Legacy usage: +If a node doesn't contain a value property but contains a mask property, the +mask property is used as the value. Default will be little endian mode, 32 bit access only. 
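+
+Example (a minimal sketch; the regmap phandle, offset, value and mask are
+placeholders rather than values for any real platform):
+
+	poweroff {
+		compatible = "syscon-poweroff";
+		regmap = <&pmu_regmap>;
+		offset = <0x24>;
+		value = <0x4d>;
+		mask = <0xff>;	/* update only the low byte of the register */
+	};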
diff --git a/Documentation/devicetree/bindings/power/rockchip-io-domain.txt b/Documentation/devicetree/bindings/power/rockchip-io-domain.txt
index d23dc002a87e94840c1e68952f184eb4e7167933..d3a5a93a65cd255ff8f5bc3cafcc2fd967d99147 100644
--- a/Documentation/devicetree/bindings/power/rockchip-io-domain.txt
+++ b/Documentation/devicetree/bindings/power/rockchip-io-domain.txt
@@ -33,6 +33,7 @@ Required properties:
 - compatible: should be one of:
   - "rockchip,rk3188-io-voltage-domain" for rk3188
   - "rockchip,rk3288-io-voltage-domain" for rk3288
+  - "rockchip,rk3328-io-voltage-domain" for rk3328
   - "rockchip,rk3368-io-voltage-domain" for rk3368
   - "rockchip,rk3368-pmu-io-voltage-domain" for rk3368 pmu-domains
   - "rockchip,rk3399-io-voltage-domain" for rk3399
diff --git a/Documentation/devicetree/bindings/power/supply/axp20x_battery.txt b/Documentation/devicetree/bindings/power/supply/axp20x_battery.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c24886676a6058522a04a2f0c73f4c7f0e77c712
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/supply/axp20x_battery.txt
@@ -0,0 +1,20 @@
+AXP20x and AXP22x battery power supply
+
+Required Properties:
+ - compatible, one of:
+			"x-powers,axp209-battery-power-supply"
+			"x-powers,axp221-battery-power-supply"
+
+This node is a subnode of the axp20x/axp22x PMIC.
+
+The AXP20X and AXP22X can read the battery voltage, charge and discharge
+currents of the battery by reading ADC channels from the AXP20X/AXP22X
+ADC.
+
+Example:
+
+&axp209 {
+	battery_power_supply: battery-power-supply {
+		compatible = "x-powers,axp209-battery-power-supply";
+	};
+};
diff --git a/Documentation/devicetree/bindings/power/supply/cpcap-charger.txt b/Documentation/devicetree/bindings/power/supply/cpcap-charger.txt
new file mode 100644
index 0000000000000000000000000000000000000000..80bd873c3b1de4c2db3612f0fa0fe302fca3c1fd
--- /dev/null
+++ b/Documentation/devicetree/bindings/power/supply/cpcap-charger.txt
@@ -0,0 +1,37 @@
+Motorola CPCAP PMIC battery charger binding
+
+Required properties:
+- compatible: Shall be "motorola,mapphone-cpcap-charger"
+- interrupts: Interrupt specifier for each name in interrupt-names
+- interrupt-names: Should contain the following entries:
+		   "chrg_det", "rvrs_chrg", "chrg_se1b", "se0conn",
+		   "rvrs_mode", "chrgcurr1", "vbusvld", "battdetb"
+- io-channels: IIO ADC channel specifier for each name in io-channel-names
+- io-channel-names: Should contain the following entries:
+		    "battdetb", "battp", "vbus", "chg_isense", "batti"
+
+Optional properties:
+- mode-gpios: Optionally, the CPCAP charger can have a companion wireless
+	      charge controller that is controlled with two GPIOs
+	      that are active low.
+ +Example: + +cpcap_charger: charger { + compatible = "motorola,mapphone-cpcap-charger"; + interrupts-extended = < + &cpcap 13 0 &cpcap 12 0 &cpcap 29 0 &cpcap 28 0 + &cpcap 22 0 &cpcap 20 0 &cpcap 19 0 &cpcap 54 0 + >; + interrupt-names = + "chrg_det", "rvrs_chrg", "chrg_se1b", "se0conn", + "rvrs_mode", "chrgcurr1", "vbusvld", "battdetb"; + mode-gpios = <&gpio3 29 GPIO_ACTIVE_LOW + &gpio3 23 GPIO_ACTIVE_LOW>; + io-channels = <&cpcap_adc 0 &cpcap_adc 1 + &cpcap_adc 2 &cpcap_adc 5 + &cpcap_adc 6>; + io-channel-names = "battdetb", "battp", + "vbus", "chg_isense", + "batti"; +}; diff --git a/Documentation/devicetree/bindings/power/supply/lego_ev3_battery.txt b/Documentation/devicetree/bindings/power/supply/lego_ev3_battery.txt new file mode 100644 index 0000000000000000000000000000000000000000..5485633b1faa4798909892c4a9d8c0b2de362c5c --- /dev/null +++ b/Documentation/devicetree/bindings/power/supply/lego_ev3_battery.txt @@ -0,0 +1,21 @@ +LEGO MINDSTORMS EV3 Battery +~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +LEGO MINDSTORMS EV3 has some built-in capability for monitoring the battery. +It uses 6 AA batteries or a special Li-ion rechargeable battery pack that is +detected by a key switch in the battery compartment. + +Required properties: + - compatible: Must be "lego,ev3-battery" + - io-channels: phandles to analog inputs for reading voltage and current + - io-channel-names: Must be "voltage", "current" + - rechargeable-gpios: phandle to the rechargeable battery indication gpio + +Example: + + battery { + compatible = "lego,ev3-battery"; + io-channels = <&adc 4>, <&adc 3>; + io-channel-names = "voltage", "current"; + rechargeable-gpios = <&gpio 136 GPIO_ACTIVE_LOW>; + }; diff --git a/Documentation/devicetree/bindings/power/supply/ltc2941.txt b/Documentation/devicetree/bindings/power/supply/ltc2941.txt index ea42ae12d92451579e0af57ee2bdfb886e60ad0a..a9d7aa60558b739ca312f86826b72cb247b111c0 100644 --- a/Documentation/devicetree/bindings/power/supply/ltc2941.txt +++ b/Documentation/devicetree/bindings/power/supply/ltc2941.txt @@ -6,8 +6,8 @@ temperature monitoring, and uses a slightly different conversion formula for the charge counter. Required properties: -- compatible: Should contain "ltc2941" or "ltc2943" which also indicates the - type of I2C chip attached. +- compatible: Should contain "lltc,ltc2941" or "lltc,ltc2943" which also + indicates the type of I2C chip attached. - reg: The 7-bit I2C address. - lltc,resistor-sense: The sense resistor value in milli-ohms. 
Can be a 32-bit negative value when the battery has been connected to the wrong end of the @@ -20,7 +20,7 @@ Required properties: Example from the Topic Miami Florida board: fuelgauge: ltc2943@64 { - compatible = "ltc2943"; + compatible = "lltc,ltc2943"; reg = <0x64>; lltc,resistor-sense = <15>; lltc,prescaler-exponent = <5>; /* 2^(2*5) = 1024 */ diff --git a/Documentation/devicetree/bindings/power/supply/max8925_batter.txt b/Documentation/devicetree/bindings/power/supply/max8925_battery.txt similarity index 100% rename from Documentation/devicetree/bindings/power/supply/max8925_batter.txt rename to Documentation/devicetree/bindings/power/supply/max8925_battery.txt diff --git a/Documentation/devicetree/bindings/powerpc/ibm,powerpc-cpu-features.txt b/Documentation/devicetree/bindings/powerpc/ibm,powerpc-cpu-features.txt new file mode 100644 index 0000000000000000000000000000000000000000..5af426e13334def9d1e7738e264953cae46fcb46 --- /dev/null +++ b/Documentation/devicetree/bindings/powerpc/ibm,powerpc-cpu-features.txt @@ -0,0 +1,248 @@ +*** NOTE *** +This document is copied from OPAL firmware +(skiboot/doc/device-tree/ibm,powerpc-cpu-features/binding.txt) + +There is more complete overview and documentation of features in that +source tree. All patches and modifications should go there. +************ + +ibm,powerpc-cpu-features binding +================================ + +This device tree binding describes CPU features available to software, with +enablement, privilege, and compatibility metadata. + +More general description of design and implementation of this binding is +found in design.txt, which also points to documentation of specific features. + + +/cpus/ibm,powerpc-cpu-features node binding +------------------------------------------- + +Node: ibm,powerpc-cpu-features + +Description: Container of CPU feature nodes. + +The node name must be "ibm,powerpc-cpu-features". + +It is implemented as a child of the node "/cpus", but this must not be +assumed by parsers. + +The node is optional but should be provided by new OPAL firmware. + +Properties: + +- compatible + Usage: required + Value type: string + Definition: "ibm,powerpc-cpu-features" + + This compatibility refers to backwards compatibility of the overall + design with parsers that behave according to these guidelines. This can + be extended in a backward compatible manner which would not warrant a + revision of the compatible property. + +- isa + Usage: required + Value type: + Definition: + + isa that the CPU is currently running in. This provides instruction set + compatibility, less the individual feature nodes. For example, an ISA v3.0 + implementation that lacks the "transactional-memory" cpufeature node + should not use transactional memory facilities. + + Value corresponds to the "Power ISA Version" multiplied by 1000. + For example, <3000> corresponds to Version 3.0, <2070> to Version 2.07. + The minor digit is available for revisions. + +- display-name + Usage: optional + Value type: string + Definition: + + A human readable name for the CPU. + +/cpus/ibm,powerpc-cpu-features/example-feature node bindings +---------------------------------------------------------------- + +Each child node of cpu-features represents a CPU feature / capability. + +Node: A string describing an architected CPU feature, e.g., "floating-point". + +Description: A feature or capability supported by the CPUs. + +The name of the node is a human readable string that forms the interface +used to describe features to software. 
Features are currently documented +in the code where they are implemented in skiboot/core/cpufeatures.c + +Presence of the node indicates the feature is available. + +Properties: + +- isa + Usage: required + Value type: + Definition: + + First level of the Power ISA that the feature appears in. + Software should filter out features when constraining the + environment to a particular ISA version. + + Value is defined similarly to /cpus/features/isa + +- usable-privilege + Usage: required + Value type: bit mask + Definition: + Bit numbers are LSB0 + bit 0 - PR (problem state / user mode) + bit 1 - OS (privileged state) + bit 2 - HV (hypervisor state) + All other bits reserved and should be zero. + + This property describes the privilege levels and/or software components + that can use the feature. + + If bit 0 is set, then the hwcap-bit-nr property will exist. + + +- hv-support + Usage: optional + Value type: bit mask + Definition: + Bit numbers are LSB0 + bit 0 - HFSCR + All other bits reserved and should be zero. + + This property describes the HV privilege support required to enable the + feature to lesser privilege levels. If the property does not exist then no + support is required. + + If no bits are set, the hypervisor must have explicit/custom support for + this feature. + + If the HFSCR bit is set, then the hfscr-bit-nr property will exist and + the feature may be enabled by setting this bit in the HFSCR register. + + +- os-support + Usage: optional + Value type: bit mask + Definition: + Bit numbers are LSB0 + bit 0 - FSCR + All other bits reserved and should be zero. + + This property describes the OS privilege support required to enable the + feature to lesser privilege levels. If the property does not exist then no + support is required. + + If no bits are set, the operating system must have explicit/custom support + for this feature. + + If the FSCR bit is set, then the fscr-bit-nr property will exist and + the feature may be enabled by setting this bit in the FSCR register. + + +- hfscr-bit-nr + Usage: optional + Value type: + Definition: HFSCR bit position (LSB0) + + This property exists when the hv-support property HFSCR bit is set. This + property describes the bit number in the HFSCR register that the + hypervisor must set in order to enable this feature. + + This property also exists if an HFSCR bit corresponds with this feature. + This makes CPU feature parsing slightly simpler. + + +- fscr-bit-nr + Usage: optional + Value type: + Definition: FSCR bit position (LSB0) + + This property exists when the os-support property FSCR bit is set. This + property describes the bit number in the FSCR register that the + operating system must set in order to enable this feature. + + This property also exists if an FSCR bit corresponds with this feature. + This makes CPU feature parsing slightly simpler. + + +- hwcap-bit-nr + Usage: optional + Value type: + Definition: Linux ELF AUX vector bit position (LSB0) + + This property may exist when the usable-privilege property value has PR bit set. + This property describes the bit number that should be set in the ELF AUX + hardware capability vectors in order to advertise this feature to userspace. + Bits 0-31 correspond to bits 0-31 in AT_HWCAP vector. Bits 32-63 correspond + to 0-31 in AT_HWCAP2 vector, and so on. Missing AT_HWCAPx vectors implies + that the feature is not enabled or can not be advertised. Operating systems + may provide a number of unassigned hardware capability bits to allow for new + features to be advertised. 
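+
+  As a worked illustration (not part of the original binding text): a feature
+  with hwcap-bit-nr = <34> would be advertised by setting bit 2 (34 - 32) of
+  the AT_HWCAP2 vector, while hwcap-bit-nr = <5> maps to bit 5 of AT_HWCAP.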
+ + Some properties representing features created before this binding are + advertised to userspace without a one-to-one hwcap bit number may not specify + this bit. Operating system will handle those bits specifically. All new + features usable by userspace will have a hwcap-bit-nr property. + + +- dependencies + Usage: optional + Value type: + Definition: + + If this property exists then it is a list of phandles to cpu feature + nodes that must be enabled for this feature to be enabled. + + +Example +------- + + /cpus/ibm,powerpc-cpu-features { + compatible = "ibm,powerpc-cpu-features"; + + isa = <3020>; + + darn { + isa = <3000>; + usable-privilege = <1 | 2 | 4>; + hwcap-bit-nr = ; + }; + + scv { + isa = <3000>; + usable-privilege = <1 | 2>; + os-support = <0>; + hwcap-bit-nr = ; + }; + + stop { + isa = <3000>; + usable-privilege = <2 | 4>; + hv-support = <0>; + os-support = <0>; + }; + + vsx2 (hypothetical) { + isa = <3010>; + usable-privilege = <1 | 2 | 4>; + hv-support = <0>; + os-support = <0>; + hwcap-bit-nr = ; + }; + + vsx2-newinsns { + isa = <3020>; + usable-privilege = <1 | 2 | 4>; + os-support = <1>; + fscr-bit-nr = ; + hwcap-bit-nr = ; + dependencies = <&vsx2>; + }; + + }; diff --git a/Documentation/devicetree/bindings/pwm/atmel-pwm.txt b/Documentation/devicetree/bindings/pwm/atmel-pwm.txt index 02331b904d4e73780da1999f807068915545ebb9..c8c831d7b0d1bb9d90cfc9b3022b57b9e6a2f74d 100644 --- a/Documentation/devicetree/bindings/pwm/atmel-pwm.txt +++ b/Documentation/devicetree/bindings/pwm/atmel-pwm.txt @@ -4,6 +4,7 @@ Required properties: - compatible: should be one of: - "atmel,at91sam9rl-pwm" - "atmel,sama5d3-pwm" + - "atmel,sama5d2-pwm" - reg: physical base address and length of the controller's registers - #pwm-cells: Should be 3. See pwm.txt in this directory for a description of the cells format. diff --git a/Documentation/devicetree/bindings/pwm/nvidia,tegra20-pwm.txt b/Documentation/devicetree/bindings/pwm/nvidia,tegra20-pwm.txt index b4e73778dda3a2e211ec210869d4d78807066122..c57e11b8d9375b85b882407a3477f7ff2ac58553 100644 --- a/Documentation/devicetree/bindings/pwm/nvidia,tegra20-pwm.txt +++ b/Documentation/devicetree/bindings/pwm/nvidia,tegra20-pwm.txt @@ -19,6 +19,19 @@ Required properties: - reset-names: Must include the following entries: - pwm +Optional properties: +============================ +In some of the interface like PWM based regulator device, it is required +to configure the pins differently in different states, especially in suspend +state of the system. The configuration of pin is provided via the pinctrl +DT node as detailed in the pinctrl DT binding document + Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt + +The PWM node will have following optional properties. +pinctrl-names: Pin state names. Must be "default" and "sleep". +pinctrl-0: phandle for the default/active state of pin configurations. +pinctrl-1: phandle for the sleep state of pin configurations. + Example: pwm: pwm@7000a000 { @@ -29,3 +42,35 @@ Example: resets = <&tegra_car 17>; reset-names = "pwm"; }; + + +Example with the pin configuration for suspend and resume: +========================================================= +Suppose pin PE7 (On Tegra210) interfaced with the regulator device and +it requires PWM output to be tristated when system enters suspend. 
+The following is the DT binding to achieve this:
+
+#include <dt-bindings/pinctrl/pinctrl-tegra.h>
+
+	pinmux@700008d4 {
+		pwm_active_state: pwm_active_state {
+			pe7 {
+				nvidia,pins = "pe7";
+				nvidia,tristate = <TEGRA_PIN_DISABLE>;
+			};
+		};
+
+		pwm_sleep_state: pwm_sleep_state {
+			pe7 {
+				nvidia,pins = "pe7";
+				nvidia,tristate = <TEGRA_PIN_ENABLE>;
+			};
+		};
+	};
+
+	pwm@7000a000 {
+		/* Mandatory PWM properties */
+		pinctrl-names = "default", "sleep";
+		pinctrl-0 = <&pwm_active_state>;
+		pinctrl-1 = <&pwm_sleep_state>;
+	};
diff --git a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
new file mode 100644
index 0000000000000000000000000000000000000000..54c59b0560ad701f9542da665f845c3bb1add186
--- /dev/null
+++ b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
@@ -0,0 +1,34 @@
+MediaTek PWM controller
+
+Required properties:
+ - compatible: should be "mediatek,<name>-pwm":
+   - "mediatek,mt7623-pwm": found on mt7623 SoC.
+ - reg: physical base address and length of the controller's registers.
+ - #pwm-cells: must be 2. See pwm.txt in this directory for a description of
+   the cell format.
+ - clocks: phandle and clock specifier of the PWM reference clock.
+ - clock-names: must contain the following:
+   - "top": the top clock generator
+   - "main": clock used by the PWM core
+   - "pwm1-5": the five per PWM clocks
+ - pinctrl-names: Must contain a "default" entry.
+ - pinctrl-0: One property must exist for each entry in pinctrl-names.
+   See pinctrl/pinctrl-bindings.txt for details of the property values.
+
+Example:
+	pwm0: pwm@11006000 {
+		compatible = "mediatek,mt7623-pwm";
+		reg = <0 0x11006000 0 0x1000>;
+		#pwm-cells = <2>;
+		clocks = <&topckgen CLK_TOP_PWM_SEL>,
+			 <&pericfg CLK_PERI_PWM>,
+			 <&pericfg CLK_PERI_PWM1>,
+			 <&pericfg CLK_PERI_PWM2>,
+			 <&pericfg CLK_PERI_PWM3>,
+			 <&pericfg CLK_PERI_PWM4>,
+			 <&pericfg CLK_PERI_PWM5>;
+		clock-names = "top", "main", "pwm1", "pwm2",
+			      "pwm3", "pwm4", "pwm5";
+		pinctrl-names = "default";
+		pinctrl-0 = <&pwm0_pins>;
+	};
diff --git a/Documentation/devicetree/bindings/regulator/anatop-regulator.txt b/Documentation/devicetree/bindings/regulator/anatop-regulator.txt
index 1d58c8cfdbc01d8fffad426774dd730a987af2d1..a3106c72fbeaa1a862f85b49693490cd74457963 100644
--- a/Documentation/devicetree/bindings/regulator/anatop-regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/anatop-regulator.txt
@@ -2,6 +2,7 @@ Anatop Voltage regulators
 
 Required properties:
 - compatible: Must be "fsl,anatop-regulator"
+- regulator-name: A string used as a descriptive name for regulator outputs
 - anatop-reg-offset: Anatop MFD register offset
 - anatop-vol-bit-shift: Bit shift for the register
 - anatop-vol-bit-width: Number of bits used in the register
diff --git a/Documentation/devicetree/bindings/regulator/lm363x-regulator.txt b/Documentation/devicetree/bindings/regulator/lm363x-regulator.txt
index 8f14df9d120555adcfca9008238d7653cc7a5675..cc5a6151d85ff40c966a88098236b9fee9a6ea74 100644
--- a/Documentation/devicetree/bindings/regulator/lm363x-regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/lm363x-regulator.txt
@@ -8,8 +8,8 @@ Required property:
 
 Optional properties:
   LM3632 has external enable pins for two LDOs.
-  - ti,lcm-en1-gpio: A GPIO specifier for Vpos control pin.
-  - ti,lcm-en2-gpio: A GPIO specifier for Vneg control pin.
+  - enable-gpios: Two GPIO specifiers for Vpos and Vneg control pins.
+    The first entry is the Vpos enable pin, the second the Vneg enable pin.
 
 Child nodes:
   LM3631
@@ -30,5 +30,79 @@ Child nodes:
 
 Examples: Please refer to ti-lmu dt-bindings [2].
+lm3631@29 {
+	compatible = "ti,lm3631";
+	reg = <0x29>;
+
+	regulators {
+		compatible = "ti,lm363x-regulator";
+
+		vboost {
+			regulator-name = "lcd_boost";
+			regulator-min-microvolt = <4500000>;
+			regulator-max-microvolt = <6350000>;
+			regulator-always-on;
+		};
+
+		vcont {
+			regulator-name = "lcd_vcont";
+			regulator-min-microvolt = <1800000>;
+			regulator-max-microvolt = <3300000>;
+		};
+
+		voref {
+			regulator-name = "lcd_voref";
+			regulator-min-microvolt = <4000000>;
+			regulator-max-microvolt = <6000000>;
+		};
+
+		vpos {
+			regulator-name = "lcd_vpos";
+			regulator-min-microvolt = <4000000>;
+			regulator-max-microvolt = <6000000>;
+			regulator-boot-on;
+		};
+
+		vneg {
+			regulator-name = "lcd_vneg";
+			regulator-min-microvolt = <4000000>;
+			regulator-max-microvolt = <6000000>;
+			regulator-boot-on;
+		};
+	};
+};
+
+lm3632@11 {
+	compatible = "ti,lm3632";
+	reg = <0x11>;
+
+	regulators {
+		compatible = "ti,lm363x-regulator";
+
+		/* GPIO1_16 for Vpos, GPIO1_28 is for Vneg */
+		enable-gpios = <&gpio1 16 GPIO_ACTIVE_HIGH>,
+			       <&gpio1 28 GPIO_ACTIVE_HIGH>;
+
+		vboost {
+			regulator-name = "lcd_boost";
+			regulator-min-microvolt = <4500000>;
+			regulator-max-microvolt = <6400000>;
+			regulator-always-on;
+		};
+
+		vpos {
+			regulator-name = "lcd_vpos";
+			regulator-min-microvolt = <4000000>;
+			regulator-max-microvolt = <6000000>;
+		};
+
+		vneg {
+			regulator-name = "lcd_vneg";
+			regulator-min-microvolt = <4000000>;
+			regulator-max-microvolt = <6000000>;
+		};
+	};
+};
+
 [1] ../regulator/regulator.txt
 [2] ../mfd/ti-lmu.txt
diff --git a/Documentation/devicetree/bindings/regulator/pfuze100.txt b/Documentation/devicetree/bindings/regulator/pfuze100.txt
index 9b40db88f637bd9ed7f06c5eb809e7b16ddd660d..444c47831a408c6b24d908f707117d7d3788cfde 100644
--- a/Documentation/devicetree/bindings/regulator/pfuze100.txt
+++ b/Documentation/devicetree/bindings/regulator/pfuze100.txt
@@ -13,7 +13,7 @@ Required child node:
 --PFUZE100
  sw1ab,sw1c,sw2,sw3a,sw3b,sw4,swbst,vsnvs,vrefddr,vgen1~vgen6
 --PFUZE200
- sw1ab,sw2,sw3a,sw3b,swbst,vsnvs,vrefddr,vgen1~vgen6
+ sw1ab,sw2,sw3a,sw3b,swbst,vsnvs,vrefddr,vgen1~vgen6,coin
 --PFUZE3000
  sw1a,sw1b,sw2,sw3,swbst,vsnvs,vrefddr,vldo1,vldo2,vccsd,v33,vldo3,vldo4
@@ -205,6 +205,12 @@ Example 2: PFUZE200
 			regulator-max-microvolt = <3300000>;
 			regulator-always-on;
 		};
+
+		coin_reg: coin {
+			regulator-min-microvolt = <2500000>;
+			regulator-max-microvolt = <3300000>;
+			regulator-always-on;
+		};
 	};
 };
diff --git a/Documentation/devicetree/bindings/regulator/regulator.txt b/Documentation/devicetree/bindings/regulator/regulator.txt
index 6ab5aef619d9fe73c1db7e62fa8c10c5c03d33f9..d18edb075e1c746debc19bf4f9bf7325aef0ecef 100644
--- a/Documentation/devicetree/bindings/regulator/regulator.txt
+++ b/Documentation/devicetree/bindings/regulator/regulator.txt
@@ -21,6 +21,9 @@ Optional properties:
   design requires. This property describes the total system ramp time
   required due to the combination of internal ramping of the regulator itself,
   and board design issues such as trace capacitance and load on the supply.
+- regulator-settling-time-us: Settling time, in microseconds, for a voltage
+  change, if the regulator has a constant settling time for any level of
+  voltage change. This is useful when the regulator's voltage change is
+  exponential.
 - regulator-soft-start: Enable soft start so that voltage ramps slowly
 - regulator-state-mem sub-root node for Suspend-to-RAM mode
   : suspend to memory, the device goes to sleep, but all data stored in memory,
diff --git a/Documentation/devicetree/bindings/regulator/tps65132-regulator.txt b/Documentation/devicetree/bindings/regulator/tps65132-regulator.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3a3505520c692b5fd9e3f5ff1f6a88d732954b43
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/tps65132-regulator.txt
@@ -0,0 +1,46 @@
+TPS65132 regulators
+
+Required properties:
+- compatible: "ti,tps65132"
+- reg: I2C slave address
+
+Optional Subnode:
+The device supports two regulators, OUTP and OUTN. A sub-node within the
+ device node describes the properties of these regulators. The sub-node
+ names must be as follows:
+	-For regulator outp, the sub-node name should be "outp".
+	-For regulator outn, the sub-node name should be "outn".
+
+-enable-gpios: (active high, output) Regulators are controlled by the input pins.
+	If they are connected to GPIOs through the host system then provide the
+	gpio numbers as per gpio.txt.
+-active-discharge-gpios: (active high, output) Some configurations use delay mechanisms
+	on the enable pin, to keep the regulator enabled for some time after
+	the enable signal goes low. This GPIO is used to actively discharge
+	the delay mechanism. Requires specification of ti,active-discharge-time-us.
+-ti,active-discharge-time-us: how long the active discharge gpio should be
+	asserted for during active discharge, in microseconds.
+
+Each regulator is defined using the standard binding for regulators.
+
+Example:
+
+	tps65132@3e {
+		compatible = "ti,tps65132";
+		reg = <0x3e>;
+
+		outp {
+			regulator-name = "outp";
+			regulator-boot-on;
+			regulator-always-on;
+			enable-gpios = <&gpio 23 0>;
+		};
+
+		outn {
+			regulator-name = "outn";
+			regulator-boot-on;
+			regulator-always-on;
+			regulator-active-discharge = <0>;
+			enable-gpios = <&gpio 40 0>;
+		};
+	};
diff --git a/Documentation/devicetree/bindings/regulator/vctrl.txt b/Documentation/devicetree/bindings/regulator/vctrl.txt
new file mode 100644
index 0000000000000000000000000000000000000000..601328d7fdbb725647d48e0e10d6b19dc5ca6254
--- /dev/null
+++ b/Documentation/devicetree/bindings/regulator/vctrl.txt
@@ -0,0 +1,49 @@
+Bindings for Voltage controlled regulators
+==========================================
+
+Required properties:
+--------------------
+- compatible : must be "vctrl-regulator".
+- regulator-min-microvolt : smallest voltage consumers may set
+- regulator-max-microvolt : largest voltage consumers may set
+- ctrl-supply : The regulator supplying the control voltage.
+- ctrl-voltage-range : an array of two integer values describing the range
+                       (min/max) of the control voltage. The values specify
+                       the control voltage needed to generate the corresponding
+                       regulator-min/max-microvolt output voltage.
+
+Optional properties:
+--------------------
+- ovp-threshold-percent : overvoltage protection (OVP) threshold of the
+                          regulator in percent. Some regulators have OVP
+                          circuitry which shuts down the regulator when the
+                          actual output voltage deviates beyond a certain
+                          margin from the expected value for a given control
+                          voltage. On larger voltage decreases this can occur
+                          undesirably since the output voltage does not adjust
+                          immediately to changes in the control voltage.
+                          To avoid this situation, the vctrl driver breaks down
+                          larger voltage decreases into multiple steps, where
+                          each step is within the OVP threshold.
+- min-slew-down-rate : Describes how slowly the regulator voltage will decay
+                       down in the worst case (lightest expected load).
+                       Specified in uV / us (like main regulator ramp rate).
+                       This value is required when ovp-threshold-percent is
+                       specified.
+
+Example:
+
+	vctrl-reg {
+		compatible = "vctrl-regulator";
+		regulator-name = "vctrl_reg";
+
+		ctrl-supply = <&ctrl_reg>;
+
+		regulator-min-microvolt = <800000>;
+		regulator-max-microvolt = <1500000>;
+
+		ctrl-voltage-range = <200000 500000>;
+
+		min-slew-down-rate = <225>;
+		ovp-threshold-percent = <16>;
+	};
diff --git a/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt b/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5e1afc3d8480d45c9b75be4f0fdf2007e47a3d65
--- /dev/null
+++ b/Documentation/devicetree/bindings/reset/fsl,imx7-src.txt
@@ -0,0 +1,47 @@
+Freescale i.MX7 System Reset Controller
+=======================================
+
+Please also refer to reset.txt in this directory for common reset
+controller binding usage.
+
+Required properties:
+- compatible: Should be "fsl,imx7d-src", "syscon"
+- reg: should be register base and length as documented in the
+  datasheet
+- interrupts: Should contain SRC interrupt
+- #reset-cells: 1, see below
+
+Example:
+
+src: reset-controller@30390000 {
+	compatible = "fsl,imx7d-src", "syscon";
+	reg = <0x30390000 0x2000>;
+	interrupts = ;
+	#reset-cells = <1>;
+};
+
+
+Specifying reset lines connected to IP modules
+==============================================
+
+The system reset controller can be used to reset various sets of
+peripherals. Device nodes that need access to reset lines should
+specify them as a reset phandle in their corresponding node as
+specified in reset.txt.
+
+Example:
+
+	pcie: pcie@33800000 {
+
+		...
+
+		resets = <&src IMX7_RESET_PCIEPHY>,
+			 <&src IMX7_RESET_PCIE_CTRL_APPS_EN>;
+		reset-names = "pciephy", "apps";
+
+		...
+	};
+
+
+For the list of all valid reset indices see
+<dt-bindings/reset/imx7-reset.h>
diff --git a/Documentation/devicetree/bindings/rng/amlogic,meson-rng.txt b/Documentation/devicetree/bindings/rng/amlogic,meson-rng.txt
index 202f2d09a23f2cb2d817c1733b88c1b93fb67690..4d403645ac9b3dcd513c5f55a7217542a1eb1d37 100644
--- a/Documentation/devicetree/bindings/rng/amlogic,meson-rng.txt
+++ b/Documentation/devicetree/bindings/rng/amlogic,meson-rng.txt
@@ -6,9 +6,16 @@ Required properties:
 
 - compatible : should be "amlogic,meson-rng"
 - reg : Specifies base physical address and size of the registers.
+Optional properties: + +- clocks : phandle to the following named clocks +- clock-names: Name of core clock, must be "core" + Example: rng { - compatible = "amlogic,meson-rng"; - reg = <0x0 0xc8834000 0x0 0x4>; + compatible = "amlogic,meson-rng"; + reg = <0x0 0xc8834000 0x0 0x4>; + clocks = <&clkc CLKID_RNG0>; + clock-names = "core"; }; diff --git a/Documentation/devicetree/bindings/rng/mtk-rng.txt b/Documentation/devicetree/bindings/rng/mtk-rng.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6d62a2abd39440ae680d10feb5e7340c5364558 --- /dev/null +++ b/Documentation/devicetree/bindings/rng/mtk-rng.txt @@ -0,0 +1,18 @@ +Device-Tree bindings for Mediatek random number generator +found in Mediatek SoC family + +Required properties: +- compatible : Should be "mediatek,mt7623-rng" +- clocks : list of clock specifiers, corresponding to + entries in clock-names property; +- clock-names : Should contain "rng" entries; +- reg : Specifies base physical address and size of the registers + +Example: + +rng: rng@1020f000 { + compatible = "mediatek,mt7623-rng"; + reg = <0 0x1020f000 0 0x1000>; + clocks = <&infracfg CLK_INFRA_TRNG>; + clock-names = "rng"; +}; diff --git a/Documentation/devicetree/bindings/rtc/cpcap-rtc.txt b/Documentation/devicetree/bindings/rtc/cpcap-rtc.txt new file mode 100644 index 0000000000000000000000000000000000000000..45750ff3112d26448506fdf1aa3f239cfe9f3f3c --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/cpcap-rtc.txt @@ -0,0 +1,18 @@ +Motorola CPCAP PMIC RTC +----------------------- + +This module is part of the CPCAP. For more details about the whole +chip see Documentation/devicetree/bindings/mfd/motorola-cpcap.txt. + +Requires node properties: +- compatible: should contain "motorola,cpcap-rtc" +- interrupts: An interrupt specifier for alarm and 1 Hz irq + +Example: + +&cpcap { + cpcap_rtc: rtc { + compatible = "motorola,cpcap-rtc"; + interrupts = <39 IRQ_TYPE_NONE>, <26 IRQ_TYPE_NONE>; + }; +}; diff --git a/Documentation/devicetree/bindings/rtc/rtc-sh.txt b/Documentation/devicetree/bindings/rtc/rtc-sh.txt new file mode 100644 index 0000000000000000000000000000000000000000..7676c7d2887403733965bac7c234e0665a2216c7 --- /dev/null +++ b/Documentation/devicetree/bindings/rtc/rtc-sh.txt @@ -0,0 +1,28 @@ +* Real Time Clock for Renesas SH and ARM SoCs + +Required properties: +- compatible: Should be "renesas,r7s72100-rtc" and "renesas,sh-rtc" as a + fallback. +- reg: physical base address and length of memory mapped region. +- interrupts: 3 interrupts for alarm, period, and carry. +- interrupt-names: The interrupts should be labeled as "alarm", "period", and + "carry". +- clocks: The functional clock source for the RTC controller must be listed + first (if exists). Additionally, potential clock counting sources are to be + listed. +- clock-names: The functional clock must be labeled as "fck". Other clocks + may be named in accordance to the SoC hardware manuals. 
+ + +Example: +rtc: rtc@fcff1000 { + compatible = "renesas,r7s72100-rtc", "renesas,sh-rtc"; + reg = <0xfcff1000 0x2e>; + interrupts = ; + interrupt-names = "alarm", "period", "carry"; + clocks = <&mstp6_clks R7S72100_CLK_RTC>, <&rtc_x1_clk>, + <&rtc_x3_clk>, <&extal_clk>; + clock-names = "fck", "rtc_x1", "rtc_x3", "extal"; +}; diff --git a/Documentation/devicetree/bindings/serial/sprd-uart.txt b/Documentation/devicetree/bindings/serial/sprd-uart.txt index 2aff0f22c9fa31837b9f49cd6d4213d14186f9e4..cab40f0f6f497ab7bc0b3ab9640bcc0563a8cddb 100644 --- a/Documentation/devicetree/bindings/serial/sprd-uart.txt +++ b/Documentation/devicetree/bindings/serial/sprd-uart.txt @@ -1,7 +1,19 @@ * Spreadtrum serial UART Required properties: -- compatible: must be "sprd,sc9836-uart" +- compatible: must be one of: + * "sprd,sc9836-uart" + * "sprd,sc9860-uart", "sprd,sc9836-uart" + - reg: offset and length of the register set for the device - interrupts: exactly one interrupt specifier - clocks: phandles to input clocks. + +Example: + uart0: serial@0 { + compatible = "sprd,sc9860-uart", + "sprd,sc9836-uart"; + reg = <0x0 0x100>; + interrupts = ; + clocks = <&ext_26m>; + }; diff --git a/Documentation/devicetree/bindings/soc/fsl/cpm_qe/gpio.txt b/Documentation/devicetree/bindings/soc/fsl/cpm_qe/gpio.txt index 349f79fd7076a63b3ad373e0d473f7e00f484c14..626e1afa64a6eb729f89e311ec67716167b45408 100644 --- a/Documentation/devicetree/bindings/soc/fsl/cpm_qe/gpio.txt +++ b/Documentation/devicetree/bindings/soc/fsl/cpm_qe/gpio.txt @@ -13,8 +13,17 @@ Required properties: - #gpio-cells : Should be two. The first cell is the pin number and the second cell is used to specify optional parameters (currently unused). - gpio-controller : Marks the port as GPIO controller. +Optional properties: +- fsl,cpm1-gpio-irq-mask : For banks having interrupt capability (like port C + on CPM1), this item tells which ports have an associated interrupt (ports are + listed in the same order as in PCINT register) +- interrupts : This property provides the list of interrupt for each GPIO having + one as described by the fsl,cpm1-gpio-irq-mask property. There should be as + many interrupts as number of ones in the mask property. The first interrupt in + the list corresponds to the most significant bit of the mask. +- interrupt-parent : Parent for the above interrupt property. 
-Example of three SOC GPIO banks defined as gpio-controller nodes: +Example of four SOC GPIO banks defined as gpio-controller nodes: CPM1_PIO_A: gpio-controller@950 { #gpio-cells = <2>; @@ -30,6 +39,16 @@ Example of three SOC GPIO banks defined as gpio-controller nodes: gpio-controller; }; + CPM1_PIO_C: gpio-controller@960 { + #gpio-cells = <2>; + compatible = "fsl,cpm1-pario-bank-c"; + reg = <0x960 0x10>; + fsl,cpm1-gpio-irq-mask = <0x0fff>; + interrupts = <1 2 6 9 10 11 14 15 23 24 26 31>; + interrupt-parent = <&CPM_PIC>; + gpio-controller; + }; + CPM1_PIO_E: gpio-controller@ac8 { #gpio-cells = <2>; compatible = "fsl,cpm1-pario-bank-e"; diff --git a/Documentation/devicetree/bindings/soc/rockchip/grf.txt b/Documentation/devicetree/bindings/soc/rockchip/grf.txt index a0685c2092184d225cbb25982c8af525f48259ce..cc9f05d3cbc1a0b08a2a278364332b8114b027a3 100644 --- a/Documentation/devicetree/bindings/soc/rockchip/grf.txt +++ b/Documentation/devicetree/bindings/soc/rockchip/grf.txt @@ -8,6 +8,8 @@ From RK3368 SoCs, the GRF is divided into two sections, - SGRF, used for general secure system, - PMUGRF, used for always on system +On RK3328 SoCs, the GRF adds a section for USB2PHYGRF, + Required Properties: - compatible: GRF should be one of the following: @@ -16,6 +18,7 @@ Required Properties: - "rockchip,rk3188-grf", "syscon": for rk3188 - "rockchip,rk3228-grf", "syscon": for rk3228 - "rockchip,rk3288-grf", "syscon": for rk3288 + - "rockchip,rk3328-grf", "syscon": for rk3328 - "rockchip,rk3368-grf", "syscon": for rk3368 - "rockchip,rk3399-grf", "syscon": for rk3399 - compatible: PMUGRF should be one of the following: @@ -23,6 +26,8 @@ Required Properties: - "rockchip,rk3399-pmugrf", "syscon": for rk3399 - compatible: SGRF should be one of the following - "rockchip,rk3288-sgrf", "syscon": for rk3288 +- compatible: USB2PHYGRF should be one of the followings + - "rockchip,rk3328-usb2phy-grf", "syscon": for rk3328 - reg: physical base address of the controller and length of memory mapped region. diff --git a/Documentation/devicetree/bindings/soc/ti/sci-pm-domain.txt b/Documentation/devicetree/bindings/soc/ti/sci-pm-domain.txt new file mode 100644 index 0000000000000000000000000000000000000000..c705db07d8206d74835c729cf6253d97a7227df0 --- /dev/null +++ b/Documentation/devicetree/bindings/soc/ti/sci-pm-domain.txt @@ -0,0 +1,57 @@ +Texas Instruments TI-SCI Generic Power Domain +--------------------------------------------- + +Some TI SoCs contain a system controller (like the PMMC, etc...) that is +responsible for controlling the state of the IPs that are present. +Communication between the host processor running an OS and the system +controller happens through a protocol known as TI-SCI [1]. + +[1] Documentation/devicetree/bindings/arm/keystone/ti,sci.txt + +PM Domain Node +============== +The PM domain node represents the global PM domain managed by the PMMC, which +in this case is the implementation as documented by the generic PM domain +bindings in Documentation/devicetree/bindings/power/power_domain.txt. Because +this relies on the TI SCI protocol to communicate with the PMMC it must be a +child of the pmmc node. + +Required Properties: +-------------------- +- compatible: should be "ti,sci-pm-domain" +- #power-domain-cells: Must be 1 so that an id can be provided in each + device node. + +Example (K2G): +------------- + pmmc: pmmc { + compatible = "ti,k2g-sci"; + ... 
+ + k2g_pds: power-controller { + compatible = "ti,sci-pm-domain"; + #power-domain-cells = <1>; + }; + }; + +PM Domain Consumers +=================== +Hardware blocks belonging to a PM domain should contain a "power-domains" +property that is a phandle pointing to the corresponding PM domain node +along with an index representing the device id to be passed to the PMMC +for device control. + +Required Properties: +-------------------- +- power-domains: phandle pointing to the corresponding PM domain node + and an ID representing the device. + +See dt-bindings/genpd/k2g.h for the list of valid identifiers for k2g. + +Example (K2G): +-------------------- + uart0: serial@02530c00 { + compatible = "ns16550a"; + ... + power-domains = <&k2g_pds K2G_DEV_UART0>; + }; diff --git a/Documentation/devicetree/bindings/sound/cs35l35.txt b/Documentation/devicetree/bindings/sound/cs35l35.txt new file mode 100644 index 0000000000000000000000000000000000000000..016b768bc722b6ca3e16600eb08ddb5a8b711a6d --- /dev/null +++ b/Documentation/devicetree/bindings/sound/cs35l35.txt @@ -0,0 +1,180 @@ +CS35L35 Boosted Speaker Amplifier + +Required properties: + + - compatible : "cirrus,cs35l35" + + - reg : the I2C address of the device for I2C + + - VA-supply, VP-supply : power supplies for the device, + as covered in + Documentation/devicetree/bindings/regulator/regulator.txt. + + - interrupt-parent : Specifies the phandle of the interrupt controller to + which the IRQs from CS35L35 are delivered to. + - interrupts : IRQ line info CS35L35. + (See Documentation/devicetree/bindings/interrupt-controller/interrupts.txt + for further information relating to interrupt properties) + +Optional properties: + - reset-gpios : gpio used to reset the amplifier + + - cirrus,stereo-config : Boolean to determine if there are 2 AMPs for a + Stereo configuration + + - cirrus,audio-channel : Set Location of Audio Signal on Serial Port + 0 = Data Packet received on Left I2S Channel + 1 = Data Packet received on Right I2S Channel + + - cirrus,advisory-channel : Set Location of Advisory Signal on Serial Port + 0 = Data Packet received on Left I2S Channel + 1 = Data Packet received on Right I2S Channel + + - cirrus,shared-boost : Boolean to enable ClassH tracking of Advisory Signal + if 2 Devices share Boost BST_CTL + + - cirrus,external-boost : Boolean to specify the device is using an external + boost supply, note that sharing a boost from another cs35l35 would constitute + using an external supply for the slave device + + - cirrus,sp-drv-strength : Value for setting the Serial Port drive strength + Table 3-10 of the datasheet lists drive-strength specifications + 0 = 1x (Default) + 1 = .5x + - cirrus,sp-drv-unused : Determines how unused slots should be driven on the + Serial Port. + 0 - Hi-Z + 2 - Drive 0's (Default) + 3 - Drive 1's + + - cirrus,bst-pdn-fet-on : Boolean to determine if the Boost PDN control + powers down with a rectification FET On or Off. If VSPK is supplied + externally then FET is off. + + - cirrus,boost-ctl-millivolt : Boost Voltage Value. Configures the boost + converter's output voltage in mV. The range is from 2600mV to 9000mV with + increments of 100mV. + (Default) VP + + - cirrus,boost-peak-milliamp : Boost-converter peak current limit in mA. + Configures the peak current by monitoring the current through the boost FET. + Range starts at 1680mA and goes to a maximum of 4480mA with increments of + 110mA. 
+ (Default) 2.46 Amps + + - cirrus,amp-gain-zc : Boolean to determine if to use Amplifier gain-change + zero-cross + +Optional H/G Algorithm sub-node: + + The cs35l35 node can have a single "cirrus,classh-internal-algo" sub-node + that will disable automatic control of the internal H/G Algorithm. + + It is strongly recommended that the Datasheet be referenced when adjusting + or using these Class H Algorithm controls over the internal Algorithm. + Serious damage can occur to the Device and surrounding components. + + - cirrus,classh-internal-algo : Sub-node for the Internal Class H Algorithm + See Section 4.3 Internal Class H Algorithm in the Datasheet. + If not used, the device manages the ClassH Algorithm internally. + +Optional properties for the "cirrus,classh-internal-algo" Sub-node + + Section 7.29 Class H Control + - cirrus,classh-bst-overide : Boolean + - cirrus,classh-bst-max-limit + - cirrus,classh-mem-depth + + Section 7.30 Class H Headroom Control + - cirrus,classh-headroom + + Section 7.31 Class H Release Rate + - cirrus,classh-release-rate + + Section 7.32 Class H Weak FET Drive Control + - cirrus,classh-wk-fet-disable + - cirrus,classh-wk-fet-delay + - cirrus,classh-wk-fet-thld + + Section 7.34 Class H VP Control + - cirrus,classh-vpch-auto + - cirrus,classh-vpch-rate + - cirrus,classh-vpch-man + +Optional Monitor Signal Format sub-node: + + The cs35l35 node can have a single "cirrus,monitor-signal-format" sub-node + for adjusting the Depth, Location and Frame of the Monitoring Signals + for Algorithms. + + See Sections 4.8.2 through 4.8.4 Serial-Port Control in the Datasheet + + -cirrus,monitor-signal-format : Sub-node for the Monitor Signaling Formating + on the I2S Port. Each of the 3 8 bit values in the array contain the settings + for depth, location, and frame. + + If not used, the defaults for the 6 monitor signals is used. + + Sections 7.44 - 7.53 lists values for the depth, location, and frame + for each monitoring signal. + + - cirrus,imon : 4 8 bit values to set the depth, location, frame and ADC + scale of the IMON monitor signal. + + - cirrus,vmon : 3 8 bit values to set the depth, location, and frame + of the VMON monitor signal. + + - cirrus,vpmon : 3 8 bit values to set the depth, location, and frame + of the VPMON monitor signal. 
+ + - cirrus,vbstmon : 3 8 bit values to set the depth, location, and frame + of the VBSTMON monitor signal + + - cirrus,vpbrstat : 3 8 bit values to set the depth, location, and frame + of the VPBRSTAT monitor signal + + - cirrus,zerofill : 3 8 bit values to set the depth, location, and frame\ + of the ZEROFILL packet in the monitor signal + +Example: + +cs35l35: cs35l35@20 { + compatible = "cirrus,cs35l35"; + reg = <0x20>; + VA-supply = <&dummy_vreg>; + VP-supply = <&dummy_vreg>; + reset-gpios = <&axi_gpio 54 0>; + interrupt-parent = <&gpio8>; + interrupts = <3 IRQ_TYPE_LEVEL_LOW>; + cirrus,boost-ctl-millivolt = <9000>; + + cirrus,stereo-config; + cirrus,audio-channel = <0x00>; + cirrus,advisory-channel = <0x01>; + cirrus,shared-boost; + + cirrus,classh-internal-algo { + cirrus,classh-bst-overide; + cirrus,classh-bst-max-limit = <0x01>; + cirrus,classh-mem-depth = <0x01>; + cirrus,classh-release-rate = <0x08>; + cirrus,classh-headroom-millivolt = <0x0B>; + cirrus,classh-wk-fet-disable = <0x01>; + cirrus,classh-wk-fet-delay = <0x04>; + cirrus,classh-wk-fet-thld = <0x01>; + cirrus,classh-vpch-auto = <0x01>; + cirrus,classh-vpch-rate = <0x02>; + cirrus,classh-vpch-man = <0x05>; + }; + + /* Depth, Location, Frame */ + cirrus,monitor-signal-format { + cirrus,imon = /bits/ 8 <0x03 0x00 0x01>; + cirrus,vmon = /bits/ 8 <0x03 0x00 0x00>; + cirrus,vpmon = /bits/ 8 <0x03 0x04 0x00>; + cirrus,vbstmon = /bits/ 8 <0x03 0x04 0x01>; + cirrus,vpbrstat = /bits/ 8 <0x00 0x04 0x00>; + cirrus,zerofill = /bits/ 8 <0x00 0x00 0x00>; + }; + +}; diff --git a/Documentation/devicetree/bindings/sound/dioo,dio2125.txt b/Documentation/devicetree/bindings/sound/dioo,dio2125.txt new file mode 100644 index 0000000000000000000000000000000000000000..63dbfe0f11d0cfb139f5fa03030ab3a809b6167b --- /dev/null +++ b/Documentation/devicetree/bindings/sound/dioo,dio2125.txt @@ -0,0 +1,12 @@ +DIO2125 Audio Driver + +Required properties: +- compatible : "dioo,dio2125" +- enable-gpios : the gpio connected to the enable pin of the dio2125 + +Example: + +amp: analog-amplifier { + compatible = "dioo,dio2125"; + enable-gpios = <&gpio GPIOH_3 0>; +}; diff --git a/Documentation/devicetree/bindings/sound/everest,es7134.txt b/Documentation/devicetree/bindings/sound/everest,es7134.txt new file mode 100644 index 0000000000000000000000000000000000000000..5495a3cb8b7bc2b2e6fe7925038caa9b86040ede --- /dev/null +++ b/Documentation/devicetree/bindings/sound/everest,es7134.txt @@ -0,0 +1,10 @@ +ES7134 i2s DA converter + +Required properties: +- compatible : "everest,es7134" or "everest,es7144" + +Example: + +i2s_codec: external-codec { + compatible = "everest,es7134"; +}; diff --git a/Documentation/devicetree/bindings/sound/fsl,ssi.txt b/Documentation/devicetree/bindings/sound/fsl,ssi.txt index 5b76be45d18bfecce403bf4b1f09730bb74a3c0c..d415888e131659d4b757ea8abba98c3ac4a49b2b 100644 --- a/Documentation/devicetree/bindings/sound/fsl,ssi.txt +++ b/Documentation/devicetree/bindings/sound/fsl,ssi.txt @@ -20,24 +20,8 @@ Required properties: have. - interrupt-parent: The phandle for the interrupt controller that services interrupts for this device. -- fsl,playback-dma: Phandle to a node for the DMA channel to use for - playback of audio. This is typically dictated by SOC - design. See the notes below. -- fsl,capture-dma: Phandle to a node for the DMA channel to use for - capture (recording) of audio. This is typically dictated - by SOC design. See the notes below. - fsl,fifo-depth: The number of elements in the transmit and receive FIFOs. 
This number is the maximum allowed value for SFCSR[TFWM0]. -- fsl,ssi-asynchronous: - If specified, the SSI is to be programmed in asynchronous - mode. In this mode, pins SRCK, STCK, SRFS, and STFS must - all be connected to valid signals. In synchronous mode, - SRCK and SRFS are ignored. Asynchronous mode allows - playback and capture to use different sample sizes and - sample rates. Some drivers may require that SRCK and STCK - be connected together, and SRFS and STFS be connected - together. This would still allow different sample sizes, - but not different sample rates. - clocks: "ipg" - Required clock for the SSI unit "baud" - Required clock for SSI master mode. Otherwise this clock is not used @@ -61,6 +45,24 @@ Optional properties: - fsl,mode: The operating mode for the AC97 interface only. "ac97-slave" - AC97 mode, SSI is clock slave "ac97-master" - AC97 mode, SSI is clock master +- fsl,ssi-asynchronous: + If specified, the SSI is to be programmed in asynchronous + mode. In this mode, pins SRCK, STCK, SRFS, and STFS must + all be connected to valid signals. In synchronous mode, + SRCK and SRFS are ignored. Asynchronous mode allows + playback and capture to use different sample sizes and + sample rates. Some drivers may require that SRCK and STCK + be connected together, and SRFS and STFS be connected + together. This would still allow different sample sizes, + but not different sample rates. +- fsl,playback-dma: Phandle to a node for the DMA channel to use for + playback of audio. This is typically dictated by SOC + design. See the notes below. + Only used on Power Architecture. +- fsl,capture-dma: Phandle to a node for the DMA channel to use for + capture (recording) of audio. This is typically dictated + by SOC design. See the notes below. + Only used on Power Architecture. Child 'codec' node required properties: - compatible: Compatible list, contains the name of the codec diff --git a/Documentation/devicetree/bindings/sound/hisilicon,hi6210-i2s.txt b/Documentation/devicetree/bindings/sound/hisilicon,hi6210-i2s.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a296784eb37489dbad5ca68fb2cf9217e69080d --- /dev/null +++ b/Documentation/devicetree/bindings/sound/hisilicon,hi6210-i2s.txt @@ -0,0 +1,42 @@ +* Hisilicon 6210 i2s controller + +Required properties: + +- compatible: should be one of the following: + - "hisilicon,hi6210-i2s" +- reg: physical base address of the i2s controller unit and length of + memory mapped region. +- interrupts: should contain the i2s interrupt. +- clocks: a list of phandle + clock-specifier pairs, one for each entry + in clock-names. +- clock-names: should contain following: + - "dacodec" + - "i2s-base" +- dmas: DMA specifiers for tx dma. 
See the DMA client binding, + Documentation/devicetree/bindings/dma/dma.txt +- dma-names: should be "tx" and "rx" +- hisilicon,sysctrl-syscon: phandle to sysctrl syscon +- #sound-dai-cells: Should be set to 1 (for multi-dai) + - The dai cell indexes reference the following interfaces: + 0: S2 interface + (Currently that is the only one available, but more may be + supported in the future) + +Example for the hi6210 i2s controller: + +i2s0: i2s@f7118000{ + compatible = "hisilicon,hi6210-i2s"; + reg = <0x0 0xf7118000 0x0 0x8000>; /* i2s unit */ + interrupts = ; /* 155 "DigACodec_intr"-32 */ + clocks = <&sys_ctrl HI6220_DACODEC_PCLK>, + <&sys_ctrl HI6220_BBPPLL0_DIV>; + clock-names = "dacodec", "i2s-base"; + dmas = <&dma0 15 &dma0 14>; + dma-names = "rx", "tx"; + hisilicon,sysctrl-syscon = <&sys_ctrl>; + #sound-dai-cells = <1>; +}; + +Then when referencing the i2s controller: + sound-dai = <&i2s0 0>; /* index 0 => S2 interface */ + diff --git a/Documentation/devicetree/bindings/sound/max98925.txt b/Documentation/devicetree/bindings/sound/max98925.txt deleted file mode 100644 index 27be63e2aa0de08c7160d8e37960d3816800a7ab..0000000000000000000000000000000000000000 --- a/Documentation/devicetree/bindings/sound/max98925.txt +++ /dev/null @@ -1,22 +0,0 @@ -max98925 audio CODEC - -This device supports I2C. - -Required properties: - - - compatible : "maxim,max98925" - - - vmon-slot-no : slot number used to send voltage information - - - imon-slot-no : slot number used to send current information - - - reg : the I2C address of the device for I2C - -Example: - -codec: max98925@1a { - compatible = "maxim,max98925"; - vmon-slot-no = <0>; - imon-slot-no = <2>; - reg = <0x1a>; -}; diff --git a/Documentation/devicetree/bindings/sound/max98926.txt b/Documentation/devicetree/bindings/sound/max98926.txt deleted file mode 100644 index 0b7f4e4d5f9a5c18b53398fe2291cc84e5e2b134..0000000000000000000000000000000000000000 --- a/Documentation/devicetree/bindings/sound/max98926.txt +++ /dev/null @@ -1,32 +0,0 @@ -max98926 audio CODEC - -This device supports I2C. - -Required properties: - - - compatible : "maxim,max98926" - - - vmon-slot-no : slot number used to send voltage information - or in inteleave mode this will be used as - interleave slot. - - - imon-slot-no : slot number used to send current information - - - interleave-mode : When using two MAX98926 in a system it is - possible to create ADC data that that will - overflow the frame size. Digital Audio Interleave - mode provides a means to output VMON and IMON data - from two devices on a single DOUT line when running - smaller frames sizes such as 32 BCLKS per LRCLK or - 48 BCLKS per LRCLK. - - - reg : the I2C address of the device for I2C - -Example: - -codec: max98926@1a { - compatible = "maxim,max98926"; - vmon-slot-no = <0>; - imon-slot-no = <2>; - reg = <0x1a>; -}; diff --git a/Documentation/devicetree/bindings/sound/max9892x.txt b/Documentation/devicetree/bindings/sound/max9892x.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6171591ddc61daa71f2fb58e84aff5c2744297e --- /dev/null +++ b/Documentation/devicetree/bindings/sound/max9892x.txt @@ -0,0 +1,41 @@ +Maxim Integrated MAX98925/MAX98926/MAX98927 Speaker Amplifier + +This device supports I2C. + +Required properties: + + - compatible : should be one of the following + - "maxim,max98925" + - "maxim,max98926" + - "maxim,max98927" + + - vmon-slot-no : slot number used to send voltage information + or in inteleave mode this will be used as + interleave slot. 
+ MAX98925/MAX98926 slot range : 0 ~ 30, Default : 0 + MAX98927 slot range : 0 ~ 15, Default : 0 + + - imon-slot-no : slot number used to send current information + MAX98925/MAX98926 slot range : 0 ~ 30, Default : 0 + MAX98927 slot range : 0 ~ 15, Default : 0 + + - interleave-mode : When using two MAX9892X in a system it is + possible to create ADC data that that will + overflow the frame size. Digital Audio Interleave + mode provides a means to output VMON and IMON data + from two devices on a single DOUT line when running + smaller frames sizes such as 32 BCLKS per LRCLK or + 48 BCLKS per LRCLK. + Range : 0 (off), 1 (on), Default : 0 + + - reg : the I2C address of the device for I2C + +Example: + +codec: max98927@3a { + compatible = "maxim,max98927"; + vmon-slot-no = <0>; + imon-slot-no = <1>; + interleave-mode = <0>; + reg = <0x3a>; +}; diff --git a/Documentation/devicetree/bindings/sound/mt2701-wm8960.txt b/Documentation/devicetree/bindings/sound/mt2701-wm8960.txt new file mode 100644 index 0000000000000000000000000000000000000000..809b609ea9d0b56e0761305f916bcf8c860e5f90 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/mt2701-wm8960.txt @@ -0,0 +1,24 @@ +MT2701 with WM8960 CODEC + +Required properties: +- compatible: "mediatek,mt2701-wm8960-machine" +- mediatek,platform: the phandle of MT2701 ASoC platform +- audio-routing: a list of the connections between audio +- mediatek,audio-codec: the phandles of wm8960 codec +- pinctrl-names: Should contain only one value - "default" +- pinctrl-0: Should specify pin control groups used for this controller. + +Example: + + sound:sound { + compatible = "mediatek,mt2701-wm8960-machine"; + mediatek,platform = <&afe>; + audio-routing = + "Headphone", "HP_L", + "Headphone", "HP_R", + "LINPUT1", "AMIC", + "RINPUT1", "AMIC"; + mediatek,audio-codec = <&wm8960>; + pinctrl-names = "default"; + pinctrl-0 = <&aud_pins_default>; + }; diff --git a/Documentation/devicetree/bindings/sound/nau8824.txt b/Documentation/devicetree/bindings/sound/nau8824.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0058b97e49a48240dee993beca7ae27ea994f8e --- /dev/null +++ b/Documentation/devicetree/bindings/sound/nau8824.txt @@ -0,0 +1,88 @@ +Nuvoton NAU8824 audio codec + +This device supports I2C only. + +Required properties: + - compatible : Must be "nuvoton,nau8824" + + - reg : the I2C address of the device. This is either 0x1a (CSB=0) or 0x1b (CSB=1). + +Optional properties: + - nuvoton,jkdet-polarity: JKDET pin polarity. 0 - active high, 1 - active low. + + - nuvoton,vref-impedance: VREF Impedance selection + 0 - Open + 1 - 25 kOhm + 2 - 125 kOhm + 3 - 2.5 kOhm + + - nuvoton,micbias-voltage: Micbias voltage level. + 0 - VDDA + 1 - VDDA + 2 - VDDA * 1.1 + 3 - VDDA * 1.2 + 4 - VDDA * 1.3 + 5 - VDDA * 1.4 + 6 - VDDA * 1.53 + 7 - VDDA * 1.53 + + - nuvoton,sar-threshold-num: Number of buttons supported + - nuvoton,sar-threshold: Impedance threshold for each button. Array that contains up to 8 buttons configuration. SAR value is calculated as + SAR = 255 * MICBIAS / SAR_VOLTAGE * R / (2000 + R) + where MICBIAS is configured by 'nuvoton,micbias-voltage', SAR_VOLTAGE is configured by 'nuvoton,sar-voltage', R - button impedance. + Refer datasheet section 10.2 for more information about threshold calculation. + + - nuvoton,sar-hysteresis: Button impedance measurement hysteresis. + + - nuvoton,sar-voltage: Reference voltage for button impedance measurement. 
+ 0 - VDDA + 1 - VDDA + 2 - VDDA * 1.1 + 3 - VDDA * 1.2 + 4 - VDDA * 1.3 + 5 - VDDA * 1.4 + 6 - VDDA * 1.53 + 7 - VDDA * 1.53 + + - nuvoton,sar-compare-time: SAR compare time + 0 - 500 ns + 1 - 1 us + 2 - 2 us + 3 - 4 us + + - nuvoton,sar-sampling-time: SAR sampling time + 0 - 2 us + 1 - 4 us + 2 - 8 us + 3 - 16 us + + - nuvoton,short-key-debounce: Button short key press debounce time. + 0 - 30 ms + 1 - 50 ms + 2 - 100 ms + + - nuvoton,jack-eject-debounce: Jack ejection debounce time. + 0 - 0 ms + 1 - 1 ms + 2 - 10 ms + + +Example: + + headset: nau8824@1a { + compatible = "nuvoton,nau8824"; + reg = <0x1a>; + interrupt-parent = <&gpio>; + interrupts = ; + nuvoton,vref-impedance = <2>; + nuvoton,micbias-voltage = <6>; + // Setup 4 buttons impedance according to Android specification + nuvoton,sar-threshold-num = <4>; + nuvoton,sar-threshold = <0xc 0x1e 0x38 0x60>; + nuvoton,sar-hysteresis = <0>; + nuvoton,sar-voltage = <6>; + nuvoton,sar-compare-time = <1>; + nuvoton,sar-sampling-time = <1>; + nuvoton,short-key-debounce = <0>; + nuvoton,jack-eject-debounce = <1>; + }; diff --git a/Documentation/devicetree/bindings/sound/rockchip-i2s.txt b/Documentation/devicetree/bindings/sound/rockchip-i2s.txt index a6600f6dea64dd7c22188e05bc1a38e3ff4a0df0..206aba1b34bb5a3f44e528d874a7520e5882c3fc 100644 --- a/Documentation/devicetree/bindings/sound/rockchip-i2s.txt +++ b/Documentation/devicetree/bindings/sound/rockchip-i2s.txt @@ -9,6 +9,7 @@ Required properties: - "rockchip,rk3066-i2s": for rk3066 - "rockchip,rk3188-i2s", "rockchip,rk3066-i2s": for rk3188 - "rockchip,rk3288-i2s", "rockchip,rk3066-i2s": for rk3288 + - "rockchip,rk3368-i2s", "rockchip,rk3066-i2s": for rk3368 - "rockchip,rk3399-i2s", "rockchip,rk3066-i2s": for rk3399 - reg: physical base address of the controller and length of memory mapped region. 
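For illustration, a sketch of an rk3368 controller node using the new
compatible might look as follows (the unit address, interrupt, clock and DMA
specifiers are placeholders rather than values from a real rk3368 device
tree):

	i2s: i2s@ff898000 {
		compatible = "rockchip,rk3368-i2s", "rockchip,rk3066-i2s";
		reg = <0x0 0xff898000 0x0 0x1000>;
		interrupts = <GIC_SPI 54 IRQ_TYPE_LEVEL_HIGH>;
		clocks = <&cru SCLK_I2S_2CH>, <&cru HCLK_I2S_2CH>;
		clock-names = "i2s_clk", "i2s_hclk";
		dmas = <&dmac_bus 6>, <&dmac_bus 7>;
		dma-names = "tx", "rx";
	};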
diff --git a/Documentation/devicetree/bindings/sound/samsung,odroid.txt b/Documentation/devicetree/bindings/sound/samsung,odroid.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1ac70cb0afb470d079d202a83fcbdc2ed227ce6 --- /dev/null +++ b/Documentation/devicetree/bindings/sound/samsung,odroid.txt @@ -0,0 +1,57 @@ +Samsung Exynos Odroid XU3/XU4 audio complex with MAX98090 codec + +Required properties: + + - compatible - "samsung,odroidxu3-audio" - for Odroid XU3 board, + "samsung,odroidxu4-audio" - for Odroid XU4 board + - model - the user-visible name of this sound complex + - 'cpu' subnode with a 'sound-dai' property containing the phandle of the I2S + controller + - 'codec' subnode with a 'sound-dai' property containing list of phandles + to the CODEC nodes, the first entry must correspond to the MAX98090 + CODEC and the second entry must be the phandle of the HDMI IP block node + - clocks - should contain entries matching clock names in the clock-names + property + - clock-names - should contain following entries: + - "epll" - indicating the EPLL output clock + - "i2s_rclk" - indicating the RCLK (root) clock of the I2S0 controller + - samsung,audio-widgets - this property specifies off-codec audio elements + like headphones or speakers, for details see widgets.txt + - samsung,audio-routing - a list of the connections between audio + components; each entry is a pair of strings, the first being the + connection's sink, the second being the connection's source; + valid names for sources and sinks are the MAX98090's pins (as + documented in its binding), and the jacks on the board + + For Odroid X2: + "Headphone Jack", "Mic Jack", "DMIC" + + For Odroid U3, XU3: + "Headphone Jack", "Speakers" + + For Odroid XU4: + no entries + +Example: + +sound { + compatible = "samsung,odroidxu3-audio"; + samsung,cpu-dai = <&i2s0>; + samsung,codec-dai = <&max98090>; + model = "Odroid-XU3"; + samsung,audio-routing = + "Headphone Jack", "HPL", + "Headphone Jack", "HPR", + "IN1", "Mic Jack", + "Mic Jack", "MICBIAS"; + + clocks = <&clock CLK_FOUT_EPLL>, <&i2s0 CLK_I2S_RCLK_SRC>; + clock-names = "epll", "i2s_rclk"; + + cpu { + sound-dai = <&i2s0 0>; + }; + codec { + sound-dai = <&hdmi>, <&max98090>; + }; +}; diff --git a/Documentation/devicetree/bindings/sound/sgtl5000.txt b/Documentation/devicetree/bindings/sound/sgtl5000.txt index 5666da7b86059f39c411a6acebf5b8dc758342ac..7a73a9d62015ecc600af3c3f1c69f001b180b410 100644 --- a/Documentation/devicetree/bindings/sound/sgtl5000.txt +++ b/Documentation/devicetree/bindings/sound/sgtl5000.txt @@ -26,6 +26,15 @@ Optional properties: If this node is not mentioned or the value is unknown, then the value is set to 1.25V. +- lrclk-strength: the LRCLK pad strength. Possible values are: +0, 1, 2 and 3 as per the table below: + +VDDIO 1.8V 2.5V 3.3V +0 = Disable +1 = 1.66 mA 2.87 mA 4.02 mA +2 = 3.33 mA 5.74 mA 8.03 mA +3 = 4.99 mA 8.61 mA 12.05 mA + Example: codec: sgtl5000@0a { diff --git a/Documentation/devicetree/bindings/sound/st,stm32-sai.txt b/Documentation/devicetree/bindings/sound/st,stm32-sai.txt new file mode 100644 index 0000000000000000000000000000000000000000..c59a3d779e067b9a56feba52962498dfb2e241fc --- /dev/null +++ b/Documentation/devicetree/bindings/sound/st,stm32-sai.txt @@ -0,0 +1,89 @@ +STMicroelectronics STM32 Serial Audio Interface (SAI). + +The SAI interface (Serial Audio Interface) offers a wide set of audio protocols +such as I2S standards, LSB or MSB-justified, PCM/DSP, TDM, and AC'97.
+The SAI contains two independent audio sub-blocks. Each sub-block has +its own clock generator and I/O lines controller. + +Required properties: + - compatible: Should be "st,stm32f4-sai" + - reg: Base address and size of SAI common register set. + - clocks: Must contain phandle and clock specifier pairs for each entry + in clock-names. + - clock-names: Must contain "x8k" and "x11k" + "x8k": SAI parent clock for sampling rates multiple of 8kHz. + "x11k": SAI parent clock for sampling rates multiple of 11.025kHz. + - interrupts: cpu DAI interrupt line shared by SAI sub-blocks + +Optional properties: + - resets: Reference to a reset controller asserting the SAI + +SAI subnodes: +Two subnodes corresponding to SAI sub-block instances A and B can be defined. +A subnode can be omitted for an unused sub-block. + +SAI subnodes required properties: + - compatible: Should be "st,stm32-sai-sub-a" or "st,stm32-sai-sub-b" + for SAI sub-block A or B respectively. + - reg: Base address and size of SAI sub-block register set. + - clocks: Must contain one phandle and clock specifier pair + for sai_ck which feeds the internal clock generator. + - clock-names: Must contain "sai_ck". + - dmas: see Documentation/devicetree/bindings/dma/stm32-dma.txt + - dma-names: identifier string for each DMA request line + "tx": if sai sub-block is configured as playback DAI + "rx": if sai sub-block is configured as capture DAI + - pinctrl-names: should contain only value "default" + - pinctrl-0: see Documentation/devicetree/bindings/pinctrl/pinctrl-stm32.txt + +Example: +sound_card { + compatible = "audio-graph-card"; + dais = <&sai1b_port>; +}; + +sai1: sai1@40015800 { + compatible = "st,stm32f4-sai"; + #address-cells = <1>; + #size-cells = <1>; + ranges; + reg = <0x40015800 0x4>; + clocks = <&rcc 1 CLK_SAIQ_PDIV>, <&rcc 1 CLK_I2SQ_PDIV>; + clock-names = "x8k", "x11k"; + interrupts = <87>; + + sai1b: audio-controller@40015824 { + #sound-dai-cells = <0>; + compatible = "st,stm32-sai-sub-b"; + reg = <0x40015824 0x1C>; + clocks = <&rcc 1 CLK_SAI2>; + clock-names = "sai_ck"; + dmas = <&dma2 5 0 0x400 0x0>; + dma-names = "tx"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_sai1b>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + sai1b_port: port@0 { + reg = <0>; + cpu_endpoint: endpoint { + remote-endpoint = <&codec_endpoint>; + audio-graph-card,format = "i2s"; + audio-graph-card,bitclock-master = <&codec_endpoint>; + audio-graph-card,frame-master = <&codec_endpoint>; + }; + }; + }; + }; +}; + +audio-codec { + codec_port: port { + codec_endpoint: endpoint { + remote-endpoint = <&cpu_endpoint>; + }; + }; +}; diff --git a/Documentation/devicetree/bindings/sound/tas2552.txt b/Documentation/devicetree/bindings/sound/tas2552.txt index c49992c0b62acb97f9d6bb11ad41812747242f84..2d71eb05c1d384c72e166d02a46fc2d695cea382 100644 --- a/Documentation/devicetree/bindings/sound/tas2552.txt +++ b/Documentation/devicetree/bindings/sound/tas2552.txt @@ -5,7 +5,8 @@ The tas2552 serial control bus communicates through I2C protocols Required properties: - compatible - One of: "ti,tas2552" - TAS2552 - - reg - I2C slave address + - reg - I2C slave address: it can be 0x40 if ADDR pin is 0 + or 0x41 if ADDR pin is 1.
- supply-*: Required supply regulators are: "vbat" battery voltage "iovdd" I/O Voltage @@ -14,17 +15,20 @@ Required properties: Optional properties: - enable-gpio - gpio pin to enable/disable the device -tas2552 can receive it's reference clock via MCLK, BCLK, IVCLKIN pin or use the +tas2552 can receive its reference clock via MCLK, BCLK, IVCLKIN pin or use the internal 1.8MHz. This CLKIN is used by the PLL. In addition to PLL, the PDM reference clock is also selectable: PLL, IVCLKIN, BCLK or MCLK. For system integration the dt-bindings/sound/tas2552.h header file provides -defined values to selct and configure the PLL and PDM reference clocks. +defined values to select and configure the PLL and PDM reference clocks. Example: tas2552: tas2552@41 { compatible = "ti,tas2552"; reg = <0x41>; + vbat-supply = <&reg_vbat>; + iovdd-supply = <&reg_iovdd>; + avdd-supply = <&reg_avdd>; enable-gpio = <&gpio4 2 GPIO_ACTIVE_HIGH>; }; diff --git a/Documentation/devicetree/bindings/sound/wm8903.txt b/Documentation/devicetree/bindings/sound/wm8903.txt index 94ec32c194bb1fc8f3dd4b6bbe10069008e4d673..afc51caf113705e4da112635385c205f290442ac 100644 --- a/Documentation/devicetree/bindings/sound/wm8903.txt +++ b/Documentation/devicetree/bindings/sound/wm8903.txt @@ -28,6 +28,14 @@ Optional properties: performed. If any entry has the value 0xffffffff, that GPIO's configuration will not be modified. + - AVDD-supply : Analog power supply regulator on the AVDD pin. + + - CPVDD-supply : Charge pump supply regulator on the CPVDD pin. + + - DBVDD-supply : Digital buffer supply regulator for the DBVDD pin. + + - DCVDD-supply : Digital core supply regulator for the DCVDD pin. + Pins on the device (for linking into audio routes): * IN1L @@ -54,6 +62,11 @@ codec: wm8903@1a { reg = <0x1a>; interrupts = < 347 >; + AVDD-supply = <&fooreg_a>; + CPVDD-supply = <&fooreg_b>; + DBVDD-supply = <&fooreg_c>; + DCVDD-supply = <&fooreg_d>; + gpio-controller; #gpio-cells = <2>; diff --git a/Documentation/devicetree/bindings/sound/zte,tdm.txt b/Documentation/devicetree/bindings/sound/zte,tdm.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a07ca655264c377a2d8a658b10f594d6206b34a --- /dev/null +++ b/Documentation/devicetree/bindings/sound/zte,tdm.txt @@ -0,0 +1,30 @@ +ZTE TDM DAI driver + +Required properties: + +- compatible : should be one of the following. + * zte,zx296718-tdm +- reg : physical base address of the controller and length of memory mapped + region. +- clocks : Pairs of phandle and specifier referencing the controller's clocks. +- clock-names: "wclk" for the wclk. + "pclk" for the pclk. +- #clock-cells: should be 1. +- zte,tdm-dma-sysctrl : Reference to the sysctrl controller controlling + the DMA. Includes: + the phandle of sysctrl, + the register offset in sysctrl for controlling the DMA, + the mask of the register bits to be written to sysctrl.
+ +Example: + + tdm: tdm@1487000 { + compatible = "zte,zx296718-tdm"; + reg = <0x01487000 0x1000>; + clocks = <&audiocrm AUDIO_TDM_WCLK>, <&audiocrm AUDIO_TDM_PCLK>; + clock-names = "wclk", "pclk"; + #clock-cells = <1>; + pinctrl-names = "default"; + pinctrl-0 = <&tdm_global_pin>; + zte,tdm-dma-sysctrl = <&sysctrl 0x10c 4>; + }; diff --git a/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt b/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt index 8bc95e2fc47fadf7eb75d215494f48af45107922..31b5b21598ff5755d3e12a50aac2abf51d6653f1 100644 --- a/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt +++ b/Documentation/devicetree/bindings/spi/fsl-imx-cspi.txt @@ -23,6 +23,12 @@ See the clock consumer binding, Obsolete properties: - fsl,spi-num-chipselects : Contains the number of the chipselect +Optional properties: +- fsl,spi-rdy-drctl: Integer, representing the value of DRCTL, the register +controlling the SPI_READY handling. Note that to enable the DRCTL consideration, +the SPI_READY mode-flag needs to be set too. +Valid values are: 0 (disabled), 1 (edge-triggered burst) and 2 (level-triggered burst). + Example: ecspi@70010000 { @@ -35,4 +41,5 @@ ecspi@70010000 { <&gpio3 25 0>; /* GPIO3_25 */ dmas = <&sdma 3 7 1>, <&sdma 4 7 2>; dma-names = "rx", "tx"; + fsl,spi-rdy-drctl = <1>; }; diff --git a/Documentation/devicetree/bindings/spi/spi-bcm63xx-hsspi.txt b/Documentation/devicetree/bindings/spi/spi-bcm63xx-hsspi.txt new file mode 100644 index 0000000000000000000000000000000000000000..37b29ee138600fe265b4caa1166e209a2714b3fa --- /dev/null +++ b/Documentation/devicetree/bindings/spi/spi-bcm63xx-hsspi.txt @@ -0,0 +1,33 @@ +Binding for Broadcom BCM6328 High Speed SPI controller + +Required properties: +- compatible: must contain "brcm,bcm6328-hsspi". +- reg: Base address and size of the controller's memory area. +- interrupts: Interrupt for the SPI block. +- clocks: phandles of the SPI clock and the PLL clock. +- clock-names: must be "hsspi", "pll". +- #address-cells: <1>, as required by generic SPI binding. +- #size-cells: <0>, also as required by generic SPI binding. + +Optional properties: +- num-cs: some controllers have fewer than 8 cs signals. Defaults to 8 + if absent. + +Child nodes as per the generic SPI binding. + +Example: + + spi@10001000 { + compatible = "brcm,bcm6328-hsspi"; + reg = <0x10001000 0x600>; + + interrupts = <29>; + + clocks = <&clkctl 9>, <&hsspi_pll>; + clock-names = "hsspi", "pll"; + + num-cs = <2>; + + #address-cells = <1>; + #size-cells = <0>; + }; diff --git a/Documentation/devicetree/bindings/spi/spi-bcm63xx.txt b/Documentation/devicetree/bindings/spi/spi-bcm63xx.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c16f66926137eb56817dd58ce360e585fece179 --- /dev/null +++ b/Documentation/devicetree/bindings/spi/spi-bcm63xx.txt @@ -0,0 +1,33 @@ +Binding for Broadcom BCM6348/BCM6358 SPI controller + +Required properties: +- compatible: must contain one of "brcm,bcm6348-spi", "brcm,bcm6358-spi". +- reg: Base address and size of the controller's memory area. +- interrupts: Interrupt for the SPI block. +- clocks: phandle of the SPI clock. +- clock-names: has to be "spi". +- #address-cells: <1>, as required by generic SPI binding. +- #size-cells: <0>, also as required by generic SPI binding. + +Optional properties: +- num-cs: some controllers have fewer than 8 cs signals. Defaults to 8 + if absent. + +Child nodes as per the generic SPI binding.
+ +Example: + + spi@10000800 { + compatible = "brcm,bcm6368-spi", "brcm,bcm6358-spi"; + reg = <0x10000800 0x70c>; + + interrupts = <1>; + + clocks = <&clkctl 9>; + clock-names = "spi"; + + num-cs = <5>; + + #address-cells = <1>; + #size-cells = <0>; + }; diff --git a/Documentation/devicetree/bindings/spi/spi_pl022.txt b/Documentation/devicetree/bindings/spi/spi_pl022.txt index 4d1673ca8cf802d4b9e13fb53538baa3d5372a48..7638b4968ddb4ad8adabbf36ab540751b6eec343 100644 --- a/Documentation/devicetree/bindings/spi/spi_pl022.txt +++ b/Documentation/devicetree/bindings/spi/spi_pl022.txt @@ -30,7 +30,10 @@ contain the following properties. 0: SPI 1: Texas Instruments Synchronous Serial Frame Format 2: Microwire (Half Duplex) -- pl022,com-mode : polling, interrupt or dma +- pl022,com-mode : specifies the transfer mode: + 0: interrupt mode + 1: polling mode (default mode if property not present) + 2: DMA mode - pl022,rx-level-trig : Rx FIFO watermark level - pl022,tx-level-trig : Tx FIFO watermark level - pl022,ctrl-len : Microwire interface: Control length @@ -56,9 +59,7 @@ Example: spi-max-frequency = <12000000>; spi-cpol; spi-cpha; - pl022,hierarchy = <0>; pl022,interface = <0>; - pl022,slave-tx-disable; pl022,com-mode = <0x2>; pl022,rx-level-trig = <0>; pl022,tx-level-trig = <0>; @@ -67,4 +68,3 @@ Example: pl022,duplex = <0>; }; }; - diff --git a/Documentation/devicetree/bindings/thermal/brcm,bcm2835-thermal.txt b/Documentation/devicetree/bindings/thermal/brcm,bcm2835-thermal.txt index 474531d2b2c5fc746d625a5725e9a09f3ec19b8a..da8c5b73ad105a5fa5bf1510f5ec1f2d1ab462f9 100644 --- a/Documentation/devicetree/bindings/thermal/brcm,bcm2835-thermal.txt +++ b/Documentation/devicetree/bindings/thermal/brcm,bcm2835-thermal.txt @@ -3,15 +3,39 @@ Binding for Thermal Sensor driver for BCM2835 SoCs. Required parameters: ------------------- -compatible: should be one of: "brcm,bcm2835-thermal", - "brcm,bcm2836-thermal" or "brcm,bcm2837-thermal" -reg: Address range of the thermal registers. -clocks: Phandle of the clock used by the thermal sensor. +compatible: should be one of: "brcm,bcm2835-thermal", + "brcm,bcm2836-thermal" or "brcm,bcm2837-thermal" +reg: Address range of the thermal registers. +clocks: Phandle of the clock used by the thermal sensor. +#thermal-sensor-cells: should be 0 (see thermal.txt) Example: +thermal-zones { + cpu_thermal: cpu-thermal { + polling-delay-passive = <0>; + polling-delay = <1000>; + + thermal-sensors = <&thermal>; + + trips { + cpu-crit { + temperature = <80000>; + hysteresis = <0>; + type = "critical"; + }; + }; + + coefficients = <(-538) 407000>; + + cooling-maps { + }; + }; +}; + thermal: thermal@7e212000 { + compatible = "brcm,bcm2835-thermal"; + reg = <0x7e212000 0x8>; + clocks = <&clocks BCM2835_CLOCK_TSENS>; + #thermal-sensor-cells = <0>; }; diff --git a/Documentation/devicetree/bindings/thermal/brcm,ns-thermal b/Documentation/devicetree/bindings/thermal/brcm,ns-thermal new file mode 100644 index 0000000000000000000000000000000000000000..68e047170039ecc08625fb1902d939aeb32bd58e --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/brcm,ns-thermal @@ -0,0 +1,37 @@ +* Broadcom Northstar Thermal + +This binding describes the thermal sensor that is part of Northstar's DMU (Device +Management Unit).
+ +Required properties: +- compatible : Must be "brcm,ns-thermal" +- reg : iomem address range of PVTMON registers +- #thermal-sensor-cells : Should be <0> + +Example: + +thermal: thermal@1800c2c0 { + compatible = "brcm,ns-thermal"; + reg = <0x1800c2c0 0x10>; + #thermal-sensor-cells = <0>; +}; + +thermal-zones { + cpu_thermal: cpu-thermal { + polling-delay-passive = <0>; + polling-delay = <1000>; + coefficients = <(-556) 418000>; + thermal-sensors = <&thermal>; + + trips { + cpu-crit { + temperature = <125000>; + hysteresis = <0>; + type = "critical"; + }; + }; + + cooling-maps { + }; + }; +}; diff --git a/Documentation/devicetree/bindings/thermal/da9062-thermal.txt b/Documentation/devicetree/bindings/thermal/da9062-thermal.txt new file mode 100644 index 0000000000000000000000000000000000000000..e241bb5a5584d2c5c0b4ff068926794264920f45 --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/da9062-thermal.txt @@ -0,0 +1,36 @@ +* Dialog DA9062/61 TJUNC Thermal Module + +This module is part of the DA9061/DA9062. For more details about the entire +DA9062 and DA9061 chips, see Documentation/devicetree/bindings/mfd/da9062.txt + +Junction temperature thermal module uses an interrupt signal to identify +high THERMAL_TRIP_HOT temperatures for the PMIC device. + +Required properties: + +- compatible: should be one of the following valid compatible string lines: + "dlg,da9061-thermal", "dlg,da9062-thermal" + "dlg,da9062-thermal" + +Optional properties: + +- polling-delay-passive : Specify the polling period, measured in + milliseconds, between thermal zone device update checks. + +Example: DA9062 + + pmic0: da9062@58 { + thermal { + compatible = "dlg,da9062-thermal"; + polling-delay-passive = <3000>; + }; + }; + +Example: DA9061 using a fall-back compatible for the DA9062 thermal driver + + pmic0: da9061@58 { + thermal { + compatible = "dlg,da9061-thermal", "dlg,da9062-thermal"; + polling-delay-passive = <3000>; + }; + }; diff --git a/Documentation/devicetree/bindings/timer/cortina,gemini-timer.txt b/Documentation/devicetree/bindings/timer/cortina,gemini-timer.txt deleted file mode 100644 index 16ea1d3b2e9e22f8c0bb1b2a2d28a0b765259c89..0000000000000000000000000000000000000000 --- a/Documentation/devicetree/bindings/timer/cortina,gemini-timer.txt +++ /dev/null @@ -1,22 +0,0 @@ -Cortina Systems Gemini timer - -This timer is embedded in the Cortina Systems Gemini SoCs. - -Required properties: - -- compatible : Must be "cortina,gemini-timer" -- reg : Should contain registers location and length -- interrupts : Should contain the three timer interrupts with - flags for rising edge -- syscon : a phandle to the global Gemini system controller - -Example: - -timer@43000000 { - compatible = "cortina,gemini-timer"; - reg = <0x43000000 0x1000>; - interrupts = <14 IRQ_TYPE_EDGE_RISING>, /* Timer 1 */ - <15 IRQ_TYPE_EDGE_RISING>, /* Timer 2 */ - <16 IRQ_TYPE_EDGE_RISING>; /* Timer 3 */ - syscon = <&syscon>; -}; diff --git a/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt b/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt new file mode 100644 index 0000000000000000000000000000000000000000..b73ca6cd07f80c5ff3296d0ee71e865b6d31688a --- /dev/null +++ b/Documentation/devicetree/bindings/timer/faraday,fttmr010.txt @@ -0,0 +1,33 @@ +Faraday Technology timer + +This timer is a generic IP block from Faraday Technology, embedded in the +Cortina Systems Gemini SoCs and other designs.
+ +Required properties: + +- compatible : Must be one of + "faraday,fttmr010" + "cortina,gemini-timer" +- reg : Should contain registers location and length +- interrupts : Should contain the three timer interrupts usually with + flags for falling edge + +Optionally required properties: + +- clocks : a clock to provide the tick rate for "faraday,fttmr010" +- clock-names : should be "EXTCLK" and "PCLK" for the external tick timer + and peripheral clock respectively, for "faraday,fttmr010" +- syscon : a phandle to the global Gemini system controller if the compatible + type is "cortina,gemini-timer" + +Example: + +timer@43000000 { + compatible = "faraday,fttmr010"; + reg = <0x43000000 0x1000>; + interrupts = <14 IRQ_TYPE_EDGE_FALLING>, /* Timer 1 */ + <15 IRQ_TYPE_EDGE_FALLING>, /* Timer 2 */ + <16 IRQ_TYPE_EDGE_FALLING>; /* Timer 3 */ + clocks = <&extclk>, <&pclk>; + clock-names = "EXTCLK", "PCLK"; +}; diff --git a/Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt b/Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt index a41b184d553843679323b36024c7d2653ca26f4a..16a5f4577a61886be16103aedf1a38539d4248b3 100644 --- a/Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt +++ b/Documentation/devicetree/bindings/timer/rockchip,rk-timer.txt @@ -1,9 +1,15 @@ Rockchip rk timer Required properties: -- compatible: shall be one of: - "rockchip,rk3288-timer" - for rk3066, rk3036, rk3188, rk322x, rk3288, rk3368 - "rockchip,rk3399-timer" - for rk3399 +- compatible: should be: + "rockchip,rk3036-timer", "rockchip,rk3288-timer": for Rockchip RK3036 + "rockchip,rk3066-timer", "rockchip,rk3288-timer": for Rockchip RK3066 + "rockchip,rk3188-timer", "rockchip,rk3288-timer": for Rockchip RK3188 + "rockchip,rk3228-timer", "rockchip,rk3288-timer": for Rockchip RK3228 + "rockchip,rk3229-timer", "rockchip,rk3288-timer": for Rockchip RK3229 + "rockchip,rk3288-timer": for Rockchip RK3288 + "rockchip,rk3368-timer", "rockchip,rk3288-timer": for Rockchip RK3368 + "rockchip,rk3399-timer": for Rockchip RK3399 - reg: base address of the timer register starting with TIMERS CONTROL register - interrupts: should contain the interrupts for Timer0 - clocks : must contain an entry for each entry in clock-names diff --git a/Documentation/devicetree/bindings/trivial-devices.txt b/Documentation/devicetree/bindings/trivial-devices.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e0a34c88e07759bdd076e98d43a4c988e1de4cc --- /dev/null +++ b/Documentation/devicetree/bindings/trivial-devices.txt @@ -0,0 +1,175 @@ +This is a list of trivial i2c devices that have simple device tree +bindings, consisting only of a compatible field, an address and +possibly an interrupt line. + +If a device needs more specific bindings, such as properties to +describe some aspect of it, there needs to be a specific binding +document for it just like any other device.
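Because a trivial device needs nothing beyond what is listed above, its node reduces to a compatible string and an address; a sketch of such a node, using one entry from the table that follows (the I2C bus label and unit address are illustrative placeholders):

	&i2c1 {
		temperature-sensor@48 {
			compatible = "ti,tmp102";
			reg = <0x48>;
		};
	};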
+ + +Compatible Vendor / Chip +========== ============= +abracon,abb5zes3 AB-RTCMC-32.768kHz-B5ZE-S3: Real Time Clock/Calendar Module with I2C Interface +ad,ad7414 SMBus/I2C Digital Temperature Sensor in 6-Pin SOT with SMBus Alert and Over Temperature Pin +ad,adm9240 ADM9240: Complete System Hardware Monitor for uProcessor-Based Systems +adi,adt7461 +/-1C TDM Extended Temp Range I.C +adt7461 +/-1C TDM Extended Temp Range I.C +adi,adt7473 +/-1C TDM Extended Temp Range I.C +adi,adt7475 +/-1C TDM Extended Temp Range I.C +adi,adt7476 +/-1C TDM Extended Temp Range I.C +adi,adt7490 +/-1C TDM Extended Temp Range I.C +adi,adxl345 Three-Axis Digital Accelerometer +adi,adxl346 Three-Axis Digital Accelerometer (backward-compatibility value "adi,adxl345" must be listed too) +ams,iaq-core AMS iAQ-Core VOC Sensor +at,24c08 i2c serial eeprom (24cxx) +atmel,at97sc3204t i2c trusted platform module (TPM) +capella,cm32181 CM32181: Ambient Light Sensor +capella,cm3232 CM3232: Ambient Light Sensor +cirrus,cs42l51 Cirrus Logic CS42L51 audio codec +dallas,ds1307 64 x 8, Serial, I2C Real-Time Clock +dallas,ds1338 I2C RTC with 56-Byte NV RAM +dallas,ds1340 I2C RTC with Trickle Charger +dallas,ds1374 I2C, 32-Bit Binary Counter Watchdog RTC with Trickle Charger and Reset Input/Output +dallas,ds1631 High-Precision Digital Thermometer +dallas,ds1682 Total-Elapsed-Time Recorder with Alarm +dallas,ds1775 Tiny Digital Thermometer and Thermostat +dallas,ds3232 Extremely Accurate I²C RTC with Integrated Crystal and SRAM +dallas,ds4510 CPU Supervisor with Nonvolatile Memory and Programmable I/O +dallas,ds75 Digital Thermometer and Thermostat +devantech,srf08 Devantech SRF08 ultrasonic ranger +dlg,da9053 DA9053: flexible system level PMIC with multicore support +dlg,da9063 DA9063: system PMIC for quad-core application processors +domintech,dmard09 DMARD09: 3-axis Accelerometer +domintech,dmard10 DMARD10: 3-axis Accelerometer +epson,rx8010 I2C-BUS INTERFACE REAL TIME CLOCK MODULE +epson,rx8025 High-Stability. 
I2C-Bus INTERFACE REAL TIME CLOCK MODULE +epson,rx8581 I2C-BUS INTERFACE REAL TIME CLOCK MODULE +fsl,mag3110 MAG3110: Xtrinsic High Accuracy, 3D Magnetometer +fsl,mc13892 MC13892: Power Management Integrated Circuit (PMIC) for i.MX35/51 +fsl,mma7660 MMA7660FC: 3-Axis Orientation/Motion Detection Sensor +fsl,mma8450 MMA8450Q: Xtrinsic Low-power, 3-axis Xtrinsic Accelerometer +fsl,mpl3115 MPL3115: Absolute Digital Pressure Sensor +fsl,mpr121 MPR121: Proximity Capacitive Touch Sensor Controller +fsl,sgtl5000 SGTL5000: Ultra Low-Power Audio Codec +gmt,g751 G751: Digital Temperature Sensor and Thermal Watchdog with Two-Wire Interface +infineon,slb9635tt Infineon SLB9635 (Soft-) I2C TPM (old protocol, max 100khz) +infineon,slb9645tt Infineon SLB9645 I2C TPM (new protocol, max 400khz) +isil,isl29028 Intersil ISL29028 Ambient Light and Proximity Sensor +maxim,ds1050 5 Bit Programmable, Pulse-Width Modulator +maxim,max1237 Low-Power, 4-/12-Channel, 2-Wire Serial, 12-Bit ADCs +maxim,max6625 9-Bit/12-Bit Temperature Sensors with I²C-Compatible Serial Interface +mc,rv3029c2 Real Time Clock Module with I2C-Bus +mcube,mc3230 mCube 3-axis 8-bit digital accelerometer +memsic,mxc6225 MEMSIC 2-axis 8-bit digital accelerometer +microchip,mcp4531-502 Microchip 7-bit Single I2C Digital Potentiometer (5k) +microchip,mcp4531-103 Microchip 7-bit Single I2C Digital Potentiometer (10k) +microchip,mcp4531-503 Microchip 7-bit Single I2C Digital Potentiometer (50k) +microchip,mcp4531-104 Microchip 7-bit Single I2C Digital Potentiometer (100k) +microchip,mcp4532-502 Microchip 7-bit Single I2C Digital Potentiometer (5k) +microchip,mcp4532-103 Microchip 7-bit Single I2C Digital Potentiometer (10k) +microchip,mcp4532-503 Microchip 7-bit Single I2C Digital Potentiometer (50k) +microchip,mcp4532-104 Microchip 7-bit Single I2C Digital Potentiometer (100k) +microchip,mcp4541-502 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (5k) +microchip,mcp4541-103 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (10k) +microchip,mcp4541-503 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (50k) +microchip,mcp4541-104 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (100k) +microchip,mcp4542-502 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (5k) +microchip,mcp4542-103 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (10k) +microchip,mcp4542-503 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (50k) +microchip,mcp4542-104 Microchip 7-bit Single I2C Digital Potentiometer with NV Memory (100k) +microchip,mcp4551-502 Microchip 8-bit Single I2C Digital Potentiometer (5k) +microchip,mcp4551-103 Microchip 8-bit Single I2C Digital Potentiometer (10k) +microchip,mcp4551-503 Microchip 8-bit Single I2C Digital Potentiometer (50k) +microchip,mcp4551-104 Microchip 8-bit Single I2C Digital Potentiometer (100k) +microchip,mcp4552-502 Microchip 8-bit Single I2C Digital Potentiometer (5k) +microchip,mcp4552-103 Microchip 8-bit Single I2C Digital Potentiometer (10k) +microchip,mcp4552-503 Microchip 8-bit Single I2C Digital Potentiometer (50k) +microchip,mcp4552-104 Microchip 8-bit Single I2C Digital Potentiometer (100k) +microchip,mcp4561-502 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (5k) +microchip,mcp4561-103 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (10k) +microchip,mcp4561-503 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (50k) +microchip,mcp4561-104 Microchip 8-bit Single I2C 
Digital Potentiometer with NV Memory (100k) +microchip,mcp4562-502 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (5k) +microchip,mcp4562-103 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (10k) +microchip,mcp4562-503 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (50k) +microchip,mcp4562-104 Microchip 8-bit Single I2C Digital Potentiometer with NV Memory (100k) +microchip,mcp4631-502 Microchip 7-bit Dual I2C Digital Potentiometer (5k) +microchip,mcp4631-103 Microchip 7-bit Dual I2C Digital Potentiometer (10k) +microchip,mcp4631-503 Microchip 7-bit Dual I2C Digital Potentiometer (50k) +microchip,mcp4631-104 Microchip 7-bit Dual I2C Digital Potentiometer (100k) +microchip,mcp4632-502 Microchip 7-bit Dual I2C Digital Potentiometer (5k) +microchip,mcp4632-103 Microchip 7-bit Dual I2C Digital Potentiometer (10k) +microchip,mcp4632-503 Microchip 7-bit Dual I2C Digital Potentiometer (50k) +microchip,mcp4632-104 Microchip 7-bit Dual I2C Digital Potentiometer (100k) +microchip,mcp4641-502 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (5k) +microchip,mcp4641-103 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (10k) +microchip,mcp4641-503 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (50k) +microchip,mcp4641-104 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (100k) +microchip,mcp4642-502 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (5k) +microchip,mcp4642-103 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (10k) +microchip,mcp4642-503 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (50k) +microchip,mcp4642-104 Microchip 7-bit Dual I2C Digital Potentiometer with NV Memory (100k) +microchip,mcp4651-502 Microchip 8-bit Dual I2C Digital Potentiometer (5k) +microchip,mcp4651-103 Microchip 8-bit Dual I2C Digital Potentiometer (10k) +microchip,mcp4651-503 Microchip 8-bit Dual I2C Digital Potentiometer (50k) +microchip,mcp4651-104 Microchip 8-bit Dual I2C Digital Potentiometer (100k) +microchip,mcp4652-502 Microchip 8-bit Dual I2C Digital Potentiometer (5k) +microchip,mcp4652-103 Microchip 8-bit Dual I2C Digital Potentiometer (10k) +microchip,mcp4652-503 Microchip 8-bit Dual I2C Digital Potentiometer (50k) +microchip,mcp4652-104 Microchip 8-bit Dual I2C Digital Potentiometer (100k) +microchip,mcp4661-502 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (5k) +microchip,mcp4661-103 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (10k) +microchip,mcp4661-503 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (50k) +microchip,mcp4661-104 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (100k) +microchip,mcp4662-502 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (5k) +microchip,mcp4662-103 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (10k) +microchip,mcp4662-503 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (50k) +microchip,mcp4662-104 Microchip 8-bit Dual I2C Digital Potentiometer with NV Memory (100k) +microchip,tc654 PWM Fan Speed Controller With Fan Fault Detection +microchip,tc655 PWM Fan Speed Controller With Fan Fault Detection +miramems,da226 MiraMEMS DA226 2-axis 14-bit digital accelerometer +miramems,da280 MiraMEMS DA280 3-axis 14-bit digital accelerometer +miramems,da311 MiraMEMS DA311 3-axis 12-bit digital accelerometer +national,lm63 Temperature sensor with integrated fan control +national,lm75 I2C TEMP SENSOR +national,lm80 Serial Interface 
ACPI-Compatible Microprocessor System Hardware Monitor +national,lm85 Temperature sensor with integrated fan control +national,lm92 ±0.33°C Accurate, 12-Bit + Sign Temperature Sensor and Thermal Window Comparator with Two-Wire Interface +nuvoton,npct501 i2c trusted platform module (TPM) +nuvoton,npct601 i2c trusted platform module (TPM2) +nxp,pca9556 Octal SMBus and I2C registered interface +nxp,pca9557 8-bit I2C-bus and SMBus I/O port with reset +nxp,pcf2127 Real-time clock +nxp,pcf2129 Real-time clock +nxp,pcf8563 Real-time clock/calendar +nxp,pcf85063 Tiny Real-Time Clock +oki,ml86v7667 OKI ML86V7667 video decoder +ovti,ov5642 OV5642: Color CMOS QSXGA (5-megapixel) Image Sensor with OmniBSI and Embedded TrueFocus +pericom,pt7c4338 Real-time Clock Module +plx,pex8648 48-Lane, 12-Port PCI Express Gen 2 (5.0 GT/s) Switch +pulsedlight,lidar-lite-v2 Pulsedlight LIDAR range-finding sensor +ricoh,r2025sd I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC +ricoh,r2221tl I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC +ricoh,rs5c372a I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC +ricoh,rs5c372b I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC +ricoh,rv5c386 I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC +ricoh,rv5c387a I2C bus SERIAL INTERFACE REAL-TIME CLOCK IC +samsung,24ad0xd1 S524AD0XF1 (128K/256K-bit Serial EEPROM for Low Power) +sgx,vz89x SGX Sensortech VZ89X Sensors +sii,s35390a 2-wire CMOS real-time clock +silabs,si7020 Relative Humidity and Temperature Sensors +skyworks,sky81452 Skyworks SKY81452: Six-Channel White LED Driver with Touch Panel Bias Supply +st,24c256 i2c serial eeprom (24cxx) +st,m41t0 Serial real-time clock (RTC) +st,m41t00 Serial real-time clock (RTC) +st,m41t62 Serial real-time clock (RTC) with alarm +st,m41t80 M41T80 - SERIAL ACCESS RTC WITH ALARMS +taos,tsl2550 Ambient Light Sensor with SMBUS/Two Wire Serial Interface +ti,ads7828 8-Channels, 12-bit ADC +ti,ads7830 8-Channels, 8-bit ADC +ti,tsc2003 I2C Touch-Screen Controller +ti,tmp102 Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface +ti,tmp103 Low Power Digital Temperature Sensor with SMBUS/Two Wire Serial Interface +ti,tmp275 Digital Temperature Sensor +winbond,w83793 Winbond/Nuvoton H/W Monitor +winbond,wpct301 i2c trusted platform module (TPM) diff --git a/Documentation/devicetree/bindings/usb/da8xx-usb.txt b/Documentation/devicetree/bindings/usb/da8xx-usb.txt index ccb844aba7d4b4d241affaba6d216c4494e0971c..717c5f6562373decc6eb91e7963d445b59739f59 100644 --- a/Documentation/devicetree/bindings/usb/da8xx-usb.txt +++ b/Documentation/devicetree/bindings/usb/da8xx-usb.txt @@ -18,10 +18,26 @@ Required properties: - phy-names: Should be "usb-phy" + - dmas: specifies the dma channels + + - dma-names: specifies the names of the channels. Use "rxN" for receive + and "txN" for transmit endpoints. N specifies the endpoint number. + Optional properties: ~~~~~~~~~~~~~~~~~~~~ - vbus-supply: Phandle to a regulator providing the USB bus power. +DMA +~~~ +- compatible: ti,da830-cppi41 +- reg: offset and length of the following register spaces: CPPI DMA Controller, + CPPI DMA Scheduler, Queue Manager +- reg-names: "controller", "scheduler", "queuemgr" +- #dma-cells: should be set to 2. The first number represents the + channel number (0 … 3 for endpoints 1 … 4). + The second number is 0 for RX and 1 for TX transfers. +- #dma-channels: should be set to 4 representing the 4 endpoints. 
+ Example: usb_phy: usb-phy { compatible = "ti,da830-usb-phy"; @@ -30,7 +46,10 @@ Example: }; usb0: usb@200000 { compatible = "ti,da830-musb"; - reg = <0x00200000 0x10000>; + reg = <0x00200000 0x1000>; + ranges; + #address-cells = <1>; + #size-cells = <1>; interrupts = <58>; interrupt-names = "mc"; @@ -39,5 +58,25 @@ Example: phys = <&usb_phy 0>; phy-names = "usb-phy"; + dmas = <&cppi41dma 0 0 &cppi41dma 1 0 + &cppi41dma 2 0 &cppi41dma 3 0 + &cppi41dma 0 1 &cppi41dma 1 1 + &cppi41dma 2 1 &cppi41dma 3 1>; + dma-names = + "rx1", "rx2", "rx3", "rx4", + "tx1", "tx2", "tx3", "tx4"; + status = "okay"; + + cppi41dma: dma-controller@201000 { + compatible = "ti,da830-cppi41"; + reg = <0x201000 0x1000 + 0x202000 0x1000 + 0x204000 0x4000>; + reg-names = "controller", "scheduler", "queuemgr"; + interrupts = <58>; + #dma-cells = <2>; + #dma-channels = <4>; + }; + }; diff --git a/Documentation/devicetree/bindings/usb/dwc2.txt b/Documentation/devicetree/bindings/usb/dwc2.txt index 6c7c2bce6d0c3c2b59c8726da4a40a9a1a4a345d..00bea038639e4ea341735c875040ea55a12e8be3 100644 --- a/Documentation/devicetree/bindings/usb/dwc2.txt +++ b/Documentation/devicetree/bindings/usb/dwc2.txt @@ -14,6 +14,10 @@ Required properties: - "amlogic,meson-gxbb-usb": The DWC2 USB controller instance in Amlogic S905 SoCs; - "amcc,dwc-otg": The DWC2 USB controller instance in AMCC Canyonlands 460EX SoCs; - snps,dwc2: A generic DWC2 USB controller with default parameters. + - "st,stm32f4x9-fsotg": The DWC2 USB FS/HS controller instance in STM32F4x9 SoCs + configured in FS mode; + - "st,stm32f4x9-hsotg": The DWC2 USB HS controller instance in STM32F4x9 SoCs + configured in HS mode; - reg : Should contain 1 register range (address and length) - interrupts : Should contain 1 interrupt - clocks: clock provider specifier diff --git a/Documentation/devicetree/bindings/usb/ehci-orion.txt b/Documentation/devicetree/bindings/usb/ehci-orion.txt index 17c3bc858b86dd2bbcdaba8c2b175c1fa0dc34d6..2855bae79fda60b2aa4e2ae7a6a7c54115e6cd60 100644 --- a/Documentation/devicetree/bindings/usb/ehci-orion.txt +++ b/Documentation/devicetree/bindings/usb/ehci-orion.txt @@ -1,7 +1,9 @@ * EHCI controller, Orion Marvell variants Required properties: -- compatible: must be "marvell,orion-ehci" +- compatible: must be one of the following + "marvell,orion-ehci" + "marvell,armada-3700-ehci" - reg: physical base address of the controller and length of memory mapped region. - interrupts: The EHCI interrupt diff --git a/Documentation/devicetree/bindings/usb/generic.txt b/Documentation/devicetree/bindings/usb/generic.txt index bfadeb1c3bab6212993a3efc2a01750409e9a798..0a74ab8dfdc2f369979232e4fc90b4c660cf6b3a 100644 --- a/Documentation/devicetree/bindings/usb/generic.txt +++ b/Documentation/devicetree/bindings/usb/generic.txt @@ -22,6 +22,7 @@ Optional properties: property is used if any real OTG features (HNP/SRP/ADP) are enabled, if ADP is required, otg-rev should be 0x0200 or above. + - companion: phandle of a companion controller - hnp-disable: tells OTG controllers we want to disable OTG HNP, normally HNP is the basic function of real OTG except you want it to be a srp-capable only B device.
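To show how the OTG-related optional properties above combine in practice, here is an editor's sketch (the node name, compatible and unit address are placeholders, not from this patch; only otg-rev, hnp-disable and companion come from this binding):

	usb_otg: usb@47401000 {
		compatible = "vendor,dual-role-controller"; /* placeholder */
		dr_mode = "otg";
		otg-rev = <0x0200>; /* 0x0200 or above is needed if ADP is required */
		hnp-disable;
		companion = <&usb_companion>; /* phandle of the companion controller */
	};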
diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt index aed180d8e5850fce7b6d1e2569b766831e764cdd..c03d201403661926164f1f9ba45ecd381a77b544 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.txt +++ b/Documentation/devicetree/bindings/vendor-prefixes.txt @@ -51,6 +51,7 @@ brcm Broadcom Corporation buffalo Buffalo, Inc. calxeda Calxeda capella Capella Microsystems, Inc +cascoda Cascoda, Ltd. cavium Cavium, Inc. cdns Cadence Design Systems Inc. ceva Ceva, Inc. @@ -79,6 +80,7 @@ denx Denx Software Engineering devantech Devantech, Ltd. digi Digi International Inc. digilent Digilent, Inc. +dioo Dioo Microcircuit Co., Ltd dlg Dialog Semiconductor dlink D-Link Corporation dmo Data Modul AG @@ -102,6 +104,7 @@ ettus NI Ettus Research eukrea Eukréa Electromatique everest Everest Semiconductor Co. Ltd. everspin Everspin Technologies, Inc. +exar Exar Corporation excito Excito ezchip EZchip Semiconductor faraday Faraday Technology Corporation @@ -136,6 +139,7 @@ holt Holt Integrated Circuits, Inc. honeywell Honeywell hp Hewlett Packard holtek Holtek Semiconductor, Inc. +hwacom HwaCom Systems Inc. i2se I2SE GmbH ibm International Business Machines (IBM) idt Integrated Device Technologies, Inc. @@ -159,6 +163,7 @@ jedec JEDEC Solid State Technology Association karo Ka-Ro electronics GmbH keithkoep Keith & Koep GmbH keymile Keymile GmbH +khadas Khadas kinetic Kinetic Technologies kosagi Sutajio Ko-Usagi PTE Ltd. kyo Kyocera Corporation @@ -168,6 +173,7 @@ lego LEGO Systems A/S lenovo Lenovo Group Ltd. lg LG Corporation licheepi Lichee Pi +linaro Linaro Limited linux Linux-specific binding lltc Linear Technology Corporation lsi LSI Corp. (LSI Logic) @@ -191,6 +197,7 @@ minix MINIX Technology Ltd. miramems MiraMEMS Sensing Technology Co., Ltd. mitsubishi Mitsubishi Electric Corporation mosaixtech Mosaix Technologies, Inc. +motorola Motorola, Inc. moxa Moxa mpl MPL AG mqmaker mqmaker Inc. @@ -213,6 +220,7 @@ newhaven Newhaven Display International ni National Instruments nintendo Nintendo nokia Nokia +nordic Nordic Semiconductor nuvoton Nuvoton Technology Corporation nvd New Vision Display nvidia NVIDIA @@ -259,6 +267,7 @@ richtek Richtek Technology Corporation ricoh Ricoh Co. Ltd. rikomagic Rikomagic Tech Corp. Ltd rockchip Fuzhou Rockchip Electronics Co., Ltd +rohm ROHM Semiconductor Co., Ltd samsung Samsung Semiconductor samtec Samtec/Softing company sandisk Sandisk Corporation @@ -266,6 +275,7 @@ sbs Smart Battery System schindler Schindler seagate Seagate Technology PLC semtech Semtech Corporation +sensirion Sensirion AG sgx SGX Sensortech sharp Sharp Corporation si-en Si-En Technology Ltd. diff --git a/Documentation/devicetree/bindings/watchdog/cortina,gemini-watchdog.txt b/Documentation/devicetree/bindings/watchdog/cortina,gemini-watchdog.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc4b865d178b4f386ef9fdb9bb4f81e14816d214 --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/cortina,gemini-watchdog.txt @@ -0,0 +1,17 @@ +Cortina Systems Gemini SoC Watchdog + +Required properties: +- compatible : must be "cortina,gemini-watchdog" +- reg : shall contain base register location and length +- interrupts : shall contain the interrupt for the watchdog + +Optional properties: +- timeout-sec : the default watchdog timeout in seconds.
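A board that wants a non-default timeout simply adds the optional property to the node shown in the example that follows, e.g. (the 30 second value is purely illustrative):

	timeout-sec = <30>;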
+ +Example: + +watchdog@41000000 { + compatible = "cortina,gemini-watchdog"; + reg = <0x41000000 0x1000>; + interrupts = <3 IRQ_TYPE_LEVEL_HIGH>; +}; diff --git a/Documentation/driver-api/80211/cfg80211.rst b/Documentation/driver-api/80211/cfg80211.rst index eca534ab617259a63bdf0d63f6a14319dedc7c0d..8ffac57e1f5b7ddd0b56547258f604d06816e9bd 100644 --- a/Documentation/driver-api/80211/cfg80211.rst +++ b/Documentation/driver-api/80211/cfg80211.rst @@ -2,6 +2,9 @@ cfg80211 subsystem ================== +.. kernel-doc:: include/net/cfg80211.h + :doc: Introduction + Device registration =================== @@ -179,6 +182,12 @@ Actions and configuration .. kernel-doc:: include/net/cfg80211.h :functions: cfg80211_ibss_joined +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_connect_resp_params + +.. kernel-doc:: include/net/cfg80211.h + :functions: cfg80211_connect_done + .. kernel-doc:: include/net/cfg80211.h :functions: cfg80211_connect_result diff --git a/Documentation/driver-api/basics.rst b/Documentation/driver-api/basics.rst index 935b9b8d456c8af606324f57c81a5ef4750ac6c6..472e7a664d13ce6ae6d09cc36ccdde60089de99a 100644 --- a/Documentation/driver-api/basics.rst +++ b/Documentation/driver-api/basics.rst @@ -7,6 +7,12 @@ Driver Entry and Exit points .. kernel-doc:: include/linux/init.h :internal: +Driver device table +------------------- + +.. kernel-doc:: include/linux/mod_devicetable.h + :internal: + Atomic and pointer manipulation ------------------------------- diff --git a/Documentation/driver-api/firmware/index.rst b/Documentation/driver-api/firmware/index.rst index 1abe0179303108008b90e9dba1b9c06e0d28fd86..29da39ec4b8a4c5ab60145ee4b3c72684e828ee6 100644 --- a/Documentation/driver-api/firmware/index.rst +++ b/Documentation/driver-api/firmware/index.rst @@ -7,6 +7,7 @@ Linux Firmware API introduction core request_firmware + other_interfaces .. only:: subproject and html diff --git a/Documentation/driver-api/firmware/other_interfaces.rst b/Documentation/driver-api/firmware/other_interfaces.rst new file mode 100644 index 0000000000000000000000000000000000000000..36c47b1e98242ce16bbfea4050ef2e8a2341fed9 --- /dev/null +++ b/Documentation/driver-api/firmware/other_interfaces.rst @@ -0,0 +1,15 @@ +Other Firmware Interfaces +========================= + +DMI Interfaces +-------------- + +.. kernel-doc:: drivers/firmware/dmi_scan.c + :export: + +EDD Interfaces +-------------- + +.. kernel-doc:: drivers/firmware/edd.c + :internal: + diff --git a/Documentation/driver-api/index.rst b/Documentation/driver-api/index.rst index 60db00d1532baede53509ce01eea748793d5264c..8058a87c1c74e1ba36c585dfb34bf4970841d412 100644 --- a/Documentation/driver-api/index.rst +++ b/Documentation/driver-api/index.rst @@ -26,7 +26,8 @@ available subsections can be seen below. regulator iio/index input - usb + usb/index + pci spi i2c hsi @@ -36,6 +37,7 @@ available subsections can be seen below. 80211/index uio-howto firmware/index + misc_devices .. only:: subproject and html diff --git a/Documentation/driver-api/misc_devices.rst b/Documentation/driver-api/misc_devices.rst new file mode 100644 index 0000000000000000000000000000000000000000..c7ee7b02ba889e5d6a6a72a15dc17974ba4303d9 --- /dev/null +++ b/Documentation/driver-api/misc_devices.rst @@ -0,0 +1,5 @@ +Miscellaneous Devices +===================== + +.. 
kernel-doc:: drivers/char/misc.c + :export: diff --git a/Documentation/driver-api/pci.rst b/Documentation/driver-api/pci.rst new file mode 100644 index 0000000000000000000000000000000000000000..01a6c8b7d3a7b59dcee9a11d828968e464af9c1c --- /dev/null +++ b/Documentation/driver-api/pci.rst @@ -0,0 +1,50 @@ +PCI Support Library +------------------- + +.. kernel-doc:: drivers/pci/pci.c + :export: + +.. kernel-doc:: drivers/pci/pci-driver.c + :export: + +.. kernel-doc:: drivers/pci/remove.c + :export: + +.. kernel-doc:: drivers/pci/search.c + :export: + +.. kernel-doc:: drivers/pci/msi.c + :export: + +.. kernel-doc:: drivers/pci/bus.c + :export: + +.. kernel-doc:: drivers/pci/access.c + :export: + +.. kernel-doc:: drivers/pci/irq.c + :export: + +.. kernel-doc:: drivers/pci/htirq.c + :export: + +.. kernel-doc:: drivers/pci/probe.c + :export: + +.. kernel-doc:: drivers/pci/slot.c + :export: + +.. kernel-doc:: drivers/pci/rom.c + :export: + +.. kernel-doc:: drivers/pci/iov.c + :export: + +.. kernel-doc:: drivers/pci/pci-sysfs.c + :internal: + +PCI Hotplug Support Library +--------------------------- + +.. kernel-doc:: drivers/pci/hotplug/pci_hotplug_core.c + :export: diff --git a/Documentation/driver-api/usb.rst b/Documentation/driver-api/usb.rst deleted file mode 100644 index 851cc40b66b523d8778e968bc0a6a9f459ac2b9a..0000000000000000000000000000000000000000 --- a/Documentation/driver-api/usb.rst +++ /dev/null @@ -1,748 +0,0 @@ -=========================== -The Linux-USB Host Side API -=========================== - -Introduction to USB on Linux -============================ - -A Universal Serial Bus (USB) is used to connect a host, such as a PC or -workstation, to a number of peripheral devices. USB uses a tree -structure, with the host as the root (the system's master), hubs as -interior nodes, and peripherals as leaves (and slaves). Modern PCs -support several such trees of USB devices, usually -a few USB 3.0 (5 GBit/s) or USB 3.1 (10 GBit/s) and some legacy -USB 2.0 (480 MBit/s) busses just in case. - -That master/slave asymmetry was designed-in for a number of reasons, one -being ease of use. It is not physically possible to mistake upstream and -downstream or it does not matter with a type C plug (or they are built into the -peripheral). Also, the host software doesn't need to deal with -distributed auto-configuration since the pre-designated master node -manages all that. - -Kernel developers added USB support to Linux early in the 2.2 kernel -series and have been developing it further since then. Besides support -for each new generation of USB, various host controllers gained support, -new drivers for peripherals have been added and advanced features for latency -measurement and improved power management introduced. - -Linux can run inside USB devices as well as on the hosts that control -the devices. But USB device drivers running inside those peripherals -don't do the same things as the ones running inside hosts, so they've -been given a different name: *gadget drivers*. This document does not -cover gadget drivers. - -USB Host-Side API Model -======================= - -Host-side drivers for USB devices talk to the "usbcore" APIs. There are -two. One is intended for *general-purpose* drivers (exposed through -driver frameworks), and the other is for drivers that are *part of the -core*. Such core drivers include the *hub* driver (which manages trees -of USB devices) and several different kinds of *host controller -drivers*, which control individual busses. 
- -The device model seen by USB drivers is relatively complex. - -- USB supports four kinds of data transfers (control, bulk, interrupt, - and isochronous). Two of them (control and bulk) use bandwidth as - it's available, while the other two (interrupt and isochronous) are - scheduled to provide guaranteed bandwidth. - -- The device description model includes one or more "configurations" - per device, only one of which is active at a time. Devices are supposed - to be capable of operating at lower than their top - speeds and may provide a BOS descriptor showing the lowest speed they - remain fully operational at. - -- From USB 3.0 on configurations have one or more "functions", which - provide a common functionality and are grouped together for purposes - of power management. - -- Configurations or functions have one or more "interfaces", each of which may have - "alternate settings". Interfaces may be standardized by USB "Class" - specifications, or may be specific to a vendor or device. - - USB device drivers actually bind to interfaces, not devices. Think of - them as "interface drivers", though you may not see many devices - where the distinction is important. *Most USB devices are simple, - with only one function, one configuration, one interface, and one alternate - setting.* - -- Interfaces have one or more "endpoints", each of which supports one - type and direction of data transfer such as "bulk out" or "interrupt - in". The entire configuration may have up to sixteen endpoints in - each direction, allocated as needed among all the interfaces. - -- Data transfer on USB is packetized; each endpoint has a maximum - packet size. Drivers must often be aware of conventions such as - flagging the end of bulk transfers using "short" (including zero - length) packets. - -- The Linux USB API supports synchronous calls for control and bulk - messages. It also supports asynchronous calls for all kinds of data - transfer, using request structures called "URBs" (USB Request - Blocks). - -Accordingly, the USB Core API exposed to device drivers covers quite a -lot of territory. You'll probably need to consult the USB 3.0 -specification, available online from www.usb.org at no cost, as well as -class or device specifications. - -The only host-side drivers that actually touch hardware (reading/writing -registers, handling IRQs, and so on) are the HCDs. In theory, all HCDs -provide the same functionality through the same API. In practice, that's -becoming more true, but there are still differences -that crop up especially with fault handling on the less common controllers. -Different controllers don't -necessarily report the same aspects of failures, and recovery from -faults (including software-induced ones like unlinking an URB) isn't yet -fully consistent. Device driver authors should make a point of doing -disconnect testing (while the device is active) with each different host -controller driver, to make sure drivers don't have bugs of their own as -well as to make sure they aren't relying on some HCD-specific behavior. - -USB-Standard Types -================== - -In ``<linux/usb/ch9.h>`` you will find the USB data types defined in -chapter 9 of the USB specification. These data types are used throughout -USB, and in APIs including this host side API, gadget APIs, and usbfs. - -.. kernel-doc:: include/linux/usb/ch9.h - :internal: - -Host-Side Data Types and Macros -=============================== - -The host side API exposes several layers to drivers, some of which are -more necessary than others.
These support lifecycle models for host side -drivers and devices, and support passing buffers through usbcore to some -HCD that performs the I/O for the device driver. - -.. kernel-doc:: include/linux/usb.h - :internal: - -USB Core APIs -============= - -There are two basic I/O models in the USB API. The most elemental one is -asynchronous: drivers submit requests in the form of an URB, and the -URB's completion callback handles the next step. All USB transfer types -support that model, although there are special cases for control URBs -(which always have setup and status stages, but may not have a data -stage) and isochronous URBs (which allow large packets and include -per-packet fault reports). Built on top of that is synchronous API -support, where a driver calls a routine that allocates one or more URBs, -submits them, and waits until they complete. There are synchronous -wrappers for single-buffer control and bulk transfers (which are awkward -to use in some driver disconnect scenarios), and for scatterlist based -streaming i/o (bulk or interrupt). - -USB drivers need to provide buffers that can be used for DMA, although -they don't necessarily need to provide the DMA mapping themselves. There -are APIs to use when allocating DMA buffers, which can prevent use -of bounce buffers on some systems. In some cases, drivers may be able to -rely on 64bit DMA to eliminate another kind of bounce buffer. - -.. kernel-doc:: drivers/usb/core/urb.c - :export: - -.. kernel-doc:: drivers/usb/core/message.c - :export: - -.. kernel-doc:: drivers/usb/core/file.c - :export: - -.. kernel-doc:: drivers/usb/core/driver.c - :export: - -.. kernel-doc:: drivers/usb/core/usb.c - :export: - -.. kernel-doc:: drivers/usb/core/hub.c - :export: - -Host Controller APIs -==================== - -These APIs are only for use by host controller drivers, most of which -implement standard register interfaces such as XHCI, EHCI, OHCI, or UHCI. UHCI -was one of the first interfaces, designed by Intel and also used by VIA; -it doesn't do much in hardware. OHCI was designed later, to have the -hardware do more work (bigger transfers, tracking protocol state, and so -on). EHCI was designed with USB 2.0; its design has features that -resemble OHCI (hardware does much more work) as well as UHCI (some parts -of ISO support, TD list processing). XHCI was designed with USB 3.0. It -continues to shift support for functionality into hardware. - -There are host controllers other than the "big three", although most PCI -based controllers (and a few non-PCI based ones) use one of those -interfaces. Not all host controllers use DMA; some use PIO, and there is -also a simulator and a virtual host controller to pipe USB over the network. - -The same basic APIs are available to drivers for all those controllers. -For historical reasons they are in two layers: :c:type:`struct -usb_bus <usb_bus>` is a rather thin layer that became available -in the 2.2 kernels, while :c:type:`struct usb_hcd <usb_hcd>` -is a more featureful layer -that lets HCDs share common code, to shrink driver size and -significantly reduce hcd-specific behaviors. - -.. kernel-doc:: drivers/usb/core/hcd.c - :export: - -.. kernel-doc:: drivers/usb/core/hcd-pci.c - :export: - -.. kernel-doc:: drivers/usb/core/buffer.c - :internal: - -The USB Filesystem (usbfs) -========================== - -This chapter presents the Linux *usbfs*. You may prefer to avoid writing -new kernel code for your USB driver; that's the problem that usbfs set -out to solve.
User mode device drivers are usually packaged as -applications or libraries, and may use usbfs through some programming -library that wraps it. Such libraries include -`libusb <http://libusb.sourceforge.net>`__ for C/C++, and -`jUSB <http://jusb.sourceforge.net>`__ for Java. - - **Note** - - This particular documentation is incomplete, especially with respect - to the asynchronous mode. As of kernel 2.5.66 the code and this - (new) documentation need to be cross-reviewed. - -Configure usbfs into Linux kernels by enabling the *USB filesystem* -option (CONFIG_USB_DEVICEFS), and you get basic support for user mode -USB device drivers. Until relatively recently it was often (confusingly) -called *usbdevfs* although it wasn't solving what *devfs* was. Every USB -device will appear in usbfs, regardless of whether or not it has a -kernel driver. - -What files are in "usbfs"? --------------------------- - -Conventionally mounted at ``/proc/bus/usb``, usbfs features include: - -- ``/proc/bus/usb/devices`` ... a text file showing each of the USB - devices known to the kernel, and their configuration descriptors. - You can also poll() this to learn about new devices. - -- ``/proc/bus/usb/BBB/DDD`` ... magic files exposing each device's - configuration descriptors, and supporting a series of ioctls for - making device requests, including I/O to devices. (Purely for access - by programs.) - -Each bus is given a number (BBB) based on when it was enumerated; within -each bus, each device is given a similar number (DDD). Those BBB/DDD -paths are not "stable" identifiers; expect them to change even if you -always leave the devices plugged in to the same hub port. *Don't even -think of saving these in application configuration files.* Stable -identifiers are available, for user mode applications that want to use -them. HID and networking devices expose these stable IDs, so that for -example you can be sure that you told the right UPS to power down its -second server. "usbfs" doesn't (yet) expose those IDs. - -Mounting and Access Control ---------------------------- - -There are a number of mount options for usbfs, which will be of most -interest to you if you need to override the default access control -policy. That policy is that only root may read or write device files -(``/proc/bus/BBB/DDD``) although anyone may read the ``devices`` or -``drivers`` files. I/O requests to the device also need the -CAP_SYS_RAWIO capability. - -The significance of that is that by default, all user mode device -drivers need super-user privileges. You can change modes or ownership in -a driver setup when the device hotplugs, or maybe just start the driver -right then, as a privileged server (or some activity within one). That's -the most secure approach for multi-user systems, but for single user -systems ("trusted" by that user) it's more convenient just to grant -everyone all access (using the *devmode=0666* option) so the driver can -start whenever it's needed. - -The mount options for usbfs, usable in /etc/fstab or in command line -invocations of *mount*, are: - -*busgid*\ =NNNNN - Controls the GID used for the /proc/bus/usb/BBB directories. - (Default: 0) - -*busmode*\ =MMM - Controls the file mode used for the /proc/bus/usb/BBB directories. - (Default: 0555) - -*busuid*\ =NNNNN - Controls the UID used for the /proc/bus/usb/BBB directories. - (Default: 0) - -*devgid*\ =NNNNN - Controls the GID used for the /proc/bus/usb/BBB/DDD files. (Default: - 0) - -*devmode*\ =MMM - Controls the file mode used for the /proc/bus/usb/BBB/DDD files.
- (Default: 0644) - -*devuid*\ =NNNNN - Controls the UID used for the /proc/bus/usb/BBB/DDD files. (Default: - 0) - -*listgid*\ =NNNNN - Controls the GID used for the /proc/bus/usb/devices and drivers - files. (Default: 0) - -*listmode*\ =MMM - Controls the file mode used for the /proc/bus/usb/devices and - drivers files. (Default: 0444) - -*listuid*\ =NNNNN - Controls the UID used for the /proc/bus/usb/devices and drivers - files. (Default: 0) - -Note that many Linux distributions hard-wire the mount options for usbfs -in their init scripts, such as ``/etc/rc.d/rc.sysinit``, rather than -making it easy to set this per-system policy in ``/etc/fstab``. - -/proc/bus/usb/devices ---------------------- - -This file is handy for status viewing tools in user mode, which can scan -the text format and ignore most of it. More detailed device status -(including class and vendor status) is available from device-specific -files. For information about the current format of this file, see the -``Documentation/usb/proc_usb_info.txt`` file in your Linux kernel -sources. - -This file, in combination with the poll() system call, can also be used -to detect when devices are added or removed: - -:: - - int fd; - struct pollfd pfd; - - fd = open("/proc/bus/usb/devices", O_RDONLY); - pfd = { fd, POLLIN, 0 }; - for (;;) { - /* The first time through, this call will return immediately. */ - poll(&pfd, 1, -1); - - /* To see what's changed, compare the file's previous and current - contents or scan the filesystem. (Scanning is more precise.) */ - } - -Note that this behavior is intended to be used for informational and -debug purposes. It would be more appropriate to use programs such as -udev or HAL to initialize a device or start a user-mode helper program, -for instance. - -/proc/bus/usb/BBB/DDD ---------------------- - -Use these files in one of these basic ways: - -*They can be read,* producing first the device descriptor (18 bytes) and -then the descriptors for the current configuration. See the USB 2.0 spec -for details about those binary data formats. You'll need to convert most -multibyte values from little endian format to your native host byte -order, although a few of the fields in the device descriptor (both of -the BCD-encoded fields, and the vendor and product IDs) will be -byteswapped for you. Note that configuration descriptors include -descriptors for interfaces, altsettings, endpoints, and maybe additional -class descriptors. - -*Perform USB operations* using *ioctl()* requests to make endpoint I/O -requests (synchronously or asynchronously) or manage the device. These -requests need the CAP_SYS_RAWIO capability, as well as filesystem -access permissions. Only one ioctl request can be made on one of these -device files at a time. This means that if you are synchronously reading -an endpoint from one thread, you won't be able to write to a different -endpoint from another thread until the read completes. This works for -*half duplex* protocols, but otherwise you'd use asynchronous i/o -requests. - -Life Cycle of User Mode Drivers -------------------------------- - -Such a driver first needs to find a device file for a device it knows -how to handle. Maybe it was told about it because a ``/sbin/hotplug`` -event handling agent chose that driver to handle the new device. Or -maybe it's an application that scans all the /proc/bus/usb device files, -and ignores most devices. 
In either case, it should :c:func:`read()` -all the descriptors from the device file, and check them against what it -knows how to handle. It might just reject everything except a particular -vendor and product ID, or need a more complex policy. - -Never assume there will only be one such device on the system at a time! -If your code can't handle more than one device at a time, at least -detect when there's more than one, and have your users choose which -device to use. - -Once your user mode driver knows what device to use, it interacts with -it in either of two styles. The simple style is to make only control -requests; some devices don't need more complex interactions than those. -(An example might be software using vendor-specific control requests for -some initialization or configuration tasks, with a kernel driver for the -rest.) - -More likely, you need a more complex style driver: one using non-control -endpoints, reading or writing data and claiming exclusive use of an -interface. *Bulk* transfers are easiest to use, but only their sibling -*interrupt* transfers work with low speed devices. Both interrupt and -*isochronous* transfers offer service guarantees because their bandwidth -is reserved. Such "periodic" transfers are awkward to use through usbfs, -unless you're using the asynchronous calls. However, interrupt transfers -can also be used in a synchronous "one shot" style. - -Your user-mode driver should never need to worry about cleaning up -request state when the device is disconnected, although it should close -its open file descriptors as soon as it starts seeing the ENODEV errors. - -The ioctl() Requests --------------------- - -To use these ioctls, you need to include the following headers in your -userspace program: - -:: - - #include - #include - #include - -The standard USB device model requests, from "Chapter 9" of the USB 2.0 -specification, are automatically included from the ```` -header. - -Unless noted otherwise, the ioctl requests described here will update -the modification time on the usbfs file to which they are applied -(unless they fail). A return of zero indicates success; otherwise, a -standard USB error code is returned. (These are documented in -``Documentation/usb/error-codes.txt`` in your kernel sources.) - -Each of these files multiplexes access to several I/O streams, one per -endpoint. Each device has one control endpoint (endpoint zero) which -supports a limited RPC style RPC access. Devices are configured by -hub_wq (in the kernel) setting a device-wide *configuration* that -affects things like power consumption and basic functionality. The -endpoints are part of USB *interfaces*, which may have *altsettings* -affecting things like which endpoints are available. Many devices only -have a single configuration and interface, so drivers for them will -ignore configurations and altsettings. - -Management/Status Requests -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -A number of usbfs requests don't deal very directly with device I/O. -They mostly relate to device management and status. These are all -synchronous requests. - -USBDEVFS_CLAIMINTERFACE - This is used to force usbfs to claim a specific interface, which has - not previously been claimed by usbfs or any other kernel driver. The - ioctl parameter is an integer holding the number of the interface - (bInterfaceNumber from descriptor). 
- - Note that if your driver doesn't claim an interface before trying to - use one of its endpoints, and no other driver has bound to it, then - the interface is automatically claimed by usbfs. - - This claim will be released by a RELEASEINTERFACE ioctl, or by - closing the file descriptor. File modification time is not updated - by this request. - -USBDEVFS_CONNECTINFO - Says whether the device is lowspeed. The ioctl parameter points to a - structure like this: - - :: - - struct usbdevfs_connectinfo { - unsigned int devnum; - unsigned char slow; - }; - - File modification time is not updated by this request. - - *You can't tell whether a "not slow" device is connected at high - speed (480 MBit/sec) or just full speed (12 MBit/sec).* You should - know the devnum value already, it's the DDD value of the device file - name. - -USBDEVFS_GETDRIVER - Returns the name of the kernel driver bound to a given interface (a - string). Parameter is a pointer to this structure, which is - modified: - - :: - - struct usbdevfs_getdriver { - unsigned int interface; - char driver[USBDEVFS_MAXDRIVERNAME + 1]; - }; - - File modification time is not updated by this request. - -USBDEVFS_IOCTL - Passes a request from userspace through to a kernel driver that has - an ioctl entry in the *struct usb_driver* it registered. - - :: - - struct usbdevfs_ioctl { - int ifno; - int ioctl_code; - void *data; - }; - - /* user mode call looks like this. - * 'request' becomes the driver->ioctl() 'code' parameter. - * the size of 'param' is encoded in 'request', and that data - * is copied to or from the driver->ioctl() 'buf' parameter. - */ - static int - usbdev_ioctl (int fd, int ifno, unsigned request, void *param) - { - struct usbdevfs_ioctl wrapper; - - wrapper.ifno = ifno; - wrapper.ioctl_code = request; - wrapper.data = param; - - return ioctl (fd, USBDEVFS_IOCTL, &wrapper); - } - - File modification time is not updated by this request. - - This request lets kernel drivers talk to user mode code through - filesystem operations even when they don't create a character or - block special device. It's also been used to do things like ask - devices what device special file should be used. Two pre-defined - ioctls are used to disconnect and reconnect kernel drivers, so that - user mode code can completely manage binding and configuration of - devices. - -USBDEVFS_RELEASEINTERFACE - This is used to release the claim usbfs made on interface, either - implicitly or because of a USBDEVFS_CLAIMINTERFACE call, before the - file descriptor is closed. The ioctl parameter is an integer holding - the number of the interface (bInterfaceNumber from descriptor); File - modification time is not updated by this request. - - **Warning** - - *No security check is made to ensure that the task which made - the claim is the one which is releasing it. This means that user - mode driver may interfere other ones.* - -USBDEVFS_RESETEP - Resets the data toggle value for an endpoint (bulk or interrupt) to - DATA0. The ioctl parameter is an integer endpoint number (1 to 15, - as identified in the endpoint descriptor), with USB_DIR_IN added - if the device's endpoint sends data to the host. - - **Warning** - - *Avoid using this request. It should probably be removed.* Using - it typically means the device and driver will lose toggle - synchronization. If you really lost synchronization, you likely - need to completely handshake with the device, using a request - like CLEAR_HALT or SET_INTERFACE. 
- -USBDEVFS_DROP_PRIVILEGES - This is used to relinquish the ability to do certain operations - which are considered to be privileged on a usbfs file descriptor. - This includes claiming arbitrary interfaces, resetting a device on - which there are currently claimed interfaces from other users, and - issuing USBDEVFS_IOCTL calls. The ioctl parameter is a 32 bit mask - of interfaces the user is allowed to claim on this file descriptor. - You may issue this ioctl more than one time to narrow said mask. - -Synchronous I/O Support -~~~~~~~~~~~~~~~~~~~~~~~ - -Synchronous requests involve the kernel blocking until the user mode -request completes, either by finishing successfully or by reporting an -error. In most cases this is the simplest way to use usbfs, although as -noted above it does prevent performing I/O to more than one endpoint at -a time. - -USBDEVFS_BULK - Issues a bulk read or write request to the device. The ioctl - parameter is a pointer to this structure: - - :: - - struct usbdevfs_bulktransfer { - unsigned int ep; - unsigned int len; - unsigned int timeout; /* in milliseconds */ - void *data; - }; - - The "ep" value identifies a bulk endpoint number (1 to 15, as - identified in an endpoint descriptor), masked with USB_DIR_IN when - referring to an endpoint which sends data to the host from the - device. The length of the data buffer is identified by "len"; Recent - kernels support requests up to about 128KBytes. *FIXME say how read - length is returned, and how short reads are handled.*. - -USBDEVFS_CLEAR_HALT - Clears endpoint halt (stall) and resets the endpoint toggle. This is - only meaningful for bulk or interrupt endpoints. The ioctl parameter - is an integer endpoint number (1 to 15, as identified in an endpoint - descriptor), masked with USB_DIR_IN when referring to an endpoint - which sends data to the host from the device. - - Use this on bulk or interrupt endpoints which have stalled, - returning *-EPIPE* status to a data transfer request. Do not issue - the control request directly, since that could invalidate the host's - record of the data toggle. - -USBDEVFS_CONTROL - Issues a control request to the device. The ioctl parameter points - to a structure like this: - - :: - - struct usbdevfs_ctrltransfer { - __u8 bRequestType; - __u8 bRequest; - __u16 wValue; - __u16 wIndex; - __u16 wLength; - __u32 timeout; /* in milliseconds */ - void *data; - }; - - The first eight bytes of this structure are the contents of the - SETUP packet to be sent to the device; see the USB 2.0 specification - for details. The bRequestType value is composed by combining a - USB_TYPE_\* value, a USB_DIR_\* value, and a USB_RECIP_\* - value (from **). If wLength is nonzero, it describes - the length of the data buffer, which is either written to the device - (USB_DIR_OUT) or read from the device (USB_DIR_IN). - - At this writing, you can't transfer more than 4 KBytes of data to or - from a device; usbfs has a limit, and some host controller drivers - have a limit. (That's not usually a problem.) *Also* there's no way - to say it's not OK to get a short read back from the device. - -USBDEVFS_RESET - Does a USB level device reset. The ioctl parameter is ignored. After - the reset, this rebinds all device interfaces. File modification - time is not updated by this request. - - **Warning** - - *Avoid using this call* until some usbcore bugs get fixed, since - it does not fully synchronize device, interface, and driver (not - just usbfs) state. 
- -USBDEVFS_SETINTERFACE - Sets the alternate setting for an interface. The ioctl parameter is - a pointer to a structure like this: - - :: - - struct usbdevfs_setinterface { - unsigned int interface; - unsigned int altsetting; - }; - - File modification time is not updated by this request. - - Those struct members are from some interface descriptor applying to - the current configuration. The interface number is the - bInterfaceNumber value, and the altsetting number is the - bAlternateSetting value. (This resets each endpoint in the - interface.) - -USBDEVFS_SETCONFIGURATION - Issues the :c:func:`usb_set_configuration()` call for the - device. The parameter is an integer holding the number of a - configuration (bConfigurationValue from descriptor). File - modification time is not updated by this request. - - **Warning** - - *Avoid using this call* until some usbcore bugs get fixed, since - it does not fully synchronize device, interface, and driver (not - just usbfs) state. - -Asynchronous I/O Support -~~~~~~~~~~~~~~~~~~~~~~~~ - -As mentioned above, there are situations where it may be important to -initiate concurrent operations from user mode code. This is particularly -important for periodic transfers (interrupt and isochronous), but it can -be used for other kinds of USB requests too. In such cases, the -asynchronous requests described here are essential. Rather than -submitting one request and having the kernel block until it completes, -the blocking is separate. - -These requests are packaged into a structure that resembles the URB used -by kernel device drivers. (No POSIX Async I/O support here, sorry.) It -identifies the endpoint type (USBDEVFS_URB_TYPE_\*), endpoint -(number, masked with USB_DIR_IN as appropriate), buffer and length, -and a user "context" value serving to uniquely identify each request. -(It's usually a pointer to per-request data.) Flags can modify requests -(not as many as supported for kernel drivers). - -Each request can specify a realtime signal number (between SIGRTMIN and -SIGRTMAX, inclusive) to request a signal be sent when the request -completes. - -When usbfs returns these urbs, the status value is updated, and the -buffer may have been modified. Except for isochronous transfers, the -actual_length is updated to say how many bytes were transferred; if the -USBDEVFS_URB_DISABLE_SPD flag is set ("short packets are not OK"), if -fewer bytes were read than were requested then you get an error report. - -:: - - struct usbdevfs_iso_packet_desc { - unsigned int length; - unsigned int actual_length; - unsigned int status; - }; - - struct usbdevfs_urb { - unsigned char type; - unsigned char endpoint; - int status; - unsigned int flags; - void *buffer; - int buffer_length; - int actual_length; - int start_frame; - int number_of_packets; - int error_count; - unsigned int signr; - void *usercontext; - struct usbdevfs_iso_packet_desc iso_frame_desc[]; - }; - -For these asynchronous requests, the file modification time reflects -when the request was initiated. This contrasts with their use with the -synchronous requests, where it reflects when requests complete. - -USBDEVFS_DISCARDURB - *TBS* File modification time is not updated by this request. - -USBDEVFS_DISCSIGNAL - *TBS* File modification time is not updated by this request. - -USBDEVFS_REAPURB - *TBS* File modification time is not updated by this request. - -USBDEVFS_REAPURBNDELAY - *TBS* File modification time is not updated by this request. 
-
-USBDEVFS_SUBMITURB
-    *TBS*
diff --git a/Documentation/driver-api/usb/URB.rst b/Documentation/driver-api/usb/URB.rst
new file mode 100644
index 0000000000000000000000000000000000000000..61a54da9fce94f4d952fdf67c8a311db810cfac0
--- /dev/null
+++ b/Documentation/driver-api/usb/URB.rst
@@ -0,0 +1,290 @@
+.. _usb-urb:
+
+USB Request Block (URB)
+~~~~~~~~~~~~~~~~~~~~~~~
+
+:Revised: 2000-Dec-05
+:Again: 2002-Jul-06
+:Again: 2005-Sep-19
+:Again: 2017-Mar-29
+
+
+.. note::
+
+   The USB subsystem now has a substantial section, :ref:`usb-hostside-api`,
+   generated from the current source code.
+   This particular documentation file isn't complete and may not be
+   updated to the latest version; don't rely on it except for a quick
+   overview.
+
+Basic concept or 'What is an URB?'
+==================================
+
+The basic idea of the USB driver model is message passing; the message itself
+is called a USB Request Block, or URB for short.
+
+- An URB consists of all relevant information to execute any USB transaction
+  and deliver the data and status back.
+
+- Execution of an URB is inherently an asynchronous operation, i.e. the
+  :c:func:`usb_submit_urb` call returns immediately after it has successfully
+  queued the requested action.
+
+- Transfers for one URB can be canceled with :c:func:`usb_unlink_urb`
+  at any time.
+
+- Each URB has a completion handler, which is called after the action
+  has been successfully completed or canceled. The URB also contains a
+  context-pointer for passing information to the completion handler.
+
+- Each endpoint for a device logically supports a queue of requests.
+  You can fill that queue, so that the USB hardware can still transfer
+  data to an endpoint while your driver handles completion of another.
+  This maximizes use of USB bandwidth, and supports seamless streaming
+  of data to (or from) devices when using periodic transfer modes.
+
+
+The URB structure
+=================
+
+Some of the fields in struct :c:type:`urb` are::
+
+    struct urb
+    {
+        // (IN) device and pipe specify the endpoint queue
+        struct usb_device *dev;       // pointer to associated USB device
+        unsigned int pipe;            // endpoint information
+
+        unsigned int transfer_flags;  // URB_ISO_ASAP, URB_SHORT_NOT_OK, etc.
+
+        // (IN) all urbs need completion routines
+        void *context;                // context for completion routine
+        usb_complete_t complete;      // pointer to completion routine
+
+        // (OUT) status after each completion
+        int status;                   // returned status
+
+        // (IN) buffer used for data transfers
+        void *transfer_buffer;        // associated data buffer
+        u32 transfer_buffer_length;   // data buffer length
+        int number_of_packets;        // size of iso_frame_desc
+
+        // (OUT) sometimes only part of CTRL/BULK/INTR transfer_buffer is used
+        u32 actual_length;            // actual data buffer length
+
+        // (IN) setup stage for CTRL (pass a struct usb_ctrlrequest)
+        unsigned char *setup_packet;  // setup packet (control only)
+
+        // Only for PERIODIC transfers (ISO, INTERRUPT)
+        // (IN/OUT) start_frame is set unless URB_ISO_ASAP is set
+        int start_frame;              // start frame
+        int interval;                 // polling interval
+
+        // ISO only: packets are only "best effort"; each can have errors
+        int error_count;              // number of errors
+        struct usb_iso_packet_descriptor iso_frame_desc[0];
+    };
+
+Your driver must create the "pipe" value using values from the appropriate
+endpoint descriptor in an interface that it has claimed.
+
+
+How to get an URB?
+==================
+
+URBs are allocated by calling :c:func:`usb_alloc_urb`::
+
+    struct urb *usb_alloc_urb(int isoframes, gfp_t mem_flags)
+
+Return value is a pointer to the allocated URB, NULL if allocation failed.
+The parameter isoframes specifies the number of isochronous transfer frames
+you want to schedule. For CTRL/BULK/INT, use 0. The mem_flags parameter
+holds standard memory allocation flags, letting you control (among other
+things) whether the underlying code may block or not.
+
+To free an URB, use :c:func:`usb_free_urb`::
+
+    void usb_free_urb(struct urb *urb)
+
+You may free an urb that you've submitted, but which hasn't yet been
+returned to you in a completion callback. It will automatically be
+deallocated when it is no longer in use.
+
+
+What has to be filled in?
+=========================
+
+Depending on the type of transaction, there are some inline functions
+defined in ``linux/usb.h`` to simplify the initialization, such as
+:c:func:`usb_fill_control_urb`, :c:func:`usb_fill_bulk_urb` and
+:c:func:`usb_fill_int_urb`. In general, they need the usb device pointer,
+the pipe (usual format from usb.h), the transfer buffer, the desired transfer
+length, the completion handler, and its context. Take a look at some
+existing drivers to see how they're used.
+
+Flags:
+
+- For ISO there are two startup behaviors: specified start_frame or ASAP.
+- For ASAP set ``URB_ISO_ASAP`` in transfer_flags.
+
+If short packets should NOT be tolerated, set ``URB_SHORT_NOT_OK`` in
+transfer_flags.
+
+
+How to submit an URB?
+=====================
+
+Just call :c:func:`usb_submit_urb`::
+
+    int usb_submit_urb(struct urb *urb, gfp_t mem_flags)
+
+The ``mem_flags`` parameter, such as ``GFP_ATOMIC``, controls memory
+allocation, such as whether the lower levels may block when memory is tight.
+
+It immediately returns, either with status 0 (request queued) or some
+error code, usually caused by the following:
+
+- Out of memory (``-ENOMEM``)
+- Unplugged device (``-ENODEV``)
+- Stalled endpoint (``-EPIPE``)
+- Too many queued ISO transfers (``-EAGAIN``)
+- Too many requested ISO frames (``-EFBIG``)
+- Invalid INT interval (``-EINVAL``)
+- More than one packet for INT (``-EINVAL``)
+
+After submission, ``urb->status`` is ``-EINPROGRESS``; however, you should
+never look at that value except in your completion callback.
+
+For isochronous endpoints, your completion handlers should (re)submit
+URBs to the same endpoint with the ``URB_ISO_ASAP`` flag, using
+multi-buffering, to get seamless ISO streaming.
+
+
+How to cancel an already running URB?
+=====================================
+
+There are two ways to cancel an URB you've submitted but which hasn't
+been returned to your driver yet. For an asynchronous cancel, call
+:c:func:`usb_unlink_urb`::
+
+    int usb_unlink_urb(struct urb *urb)
+
+It removes the urb from the internal list and frees all allocated
+HW descriptors. The status is changed to reflect unlinking. Note
+that the URB will not normally have finished when :c:func:`usb_unlink_urb`
+returns; you must still wait for the completion handler to be called.
+
+To cancel an URB synchronously, call :c:func:`usb_kill_urb`::
+
+    void usb_kill_urb(struct urb *urb)
+
+It does everything :c:func:`usb_unlink_urb` does, and in addition it waits
+until after the URB has been returned and the completion handler
+has finished. It also marks the URB as temporarily unusable, so
+that if the completion handler or anyone else tries to resubmit it
+they will get a ``-EPERM`` error. Thus you can be sure that when
+:c:func:`usb_kill_urb` returns, the URB is totally idle.
+
+There is a lifetime issue to consider. An URB may complete at any
+time, and the completion handler may free the URB. If this happens
+while :c:func:`usb_unlink_urb` or :c:func:`usb_kill_urb` is running, it will
+cause a memory-access violation. The driver is responsible for avoiding this,
+which often means some sort of lock will be needed to prevent the URB
+from being deallocated while it is still in use.
+
+On the other hand, since usb_unlink_urb may end up calling the
+completion handler, the handler must not take any lock that is held
+when usb_unlink_urb is invoked. The general solution to this problem
+is to increment the URB's reference count while holding the lock, then
+drop the lock and call usb_unlink_urb or usb_kill_urb, and then
+decrement the URB's reference count. You increment the reference
+count by calling :c:func:`usb_get_urb`::
+
+    struct urb *usb_get_urb(struct urb *urb)
+
+(ignore the return value; it is the same as the argument) and
+decrement the reference count by calling :c:func:`usb_free_urb`. Of course,
+none of this is necessary if there's no danger of the URB being freed
+by the completion handler.
+
+
+What about the completion handler?
+==================================
+
+The handler is of the following type::
+
+    typedef void (*usb_complete_t)(struct urb *)
+
+I.e., it gets the URB that caused the completion call. In the completion
+handler, you should have a look at ``urb->status`` to detect any USB errors.
+Since the context parameter is included in the URB, you can pass
+information to the completion handler.
+
+Note that even when an error (or unlink) is reported, data may have been
+transferred. That's because USB transfers are packetized; it might take
+sixteen packets to transfer your 1KByte buffer, and ten of them might
+have transferred successfully before the completion was called.
+
+
+.. warning::
+
+   NEVER SLEEP IN A COMPLETION HANDLER.
+
+   These are often called in atomic context.
+
+In the current kernel, completion handlers run with local interrupts
+disabled, but in the future this will be changed, so don't assume that
+local IRQs are always disabled inside completion handlers.
+
+How to do isochronous (ISO) transfers?
+======================================
+
+Besides the fields present on a bulk transfer, for ISO you also
+have to set ``urb->interval`` to say how often to make transfers; it's
+often one per frame (which is once every microframe for highspeed devices).
+The actual interval used will be a power of two that's no bigger than what
+you specify. You can use the :c:func:`usb_fill_int_urb` macro to fill
+most ISO transfer fields.
+
+For ISO transfers you also have to fill a :c:type:`usb_iso_packet_descriptor`
+structure, allocated at the end of the URB by :c:func:`usb_alloc_urb`, for
+each packet you want to schedule.
+
+The :c:func:`usb_submit_urb` call modifies ``urb->interval`` to the implemented
+interval value that is less than or equal to the requested interval value. If
+``URB_ISO_ASAP`` scheduling is used, ``urb->start_frame`` is also updated.
+
+For each entry you have to specify the data offset for this frame (base is
+transfer_buffer), and the length you want to write/expect to read.
+After completion, actual_length contains the actual transferred length and
+status contains the resulting status for the ISO transfer for this frame.
+It is allowed to specify a varying length from frame to frame (e.g. for
+audio synchronisation/adaptive transfer rates). You can also use the length
+0 to omit one or more frames (striping).
+
+For scheduling you can choose your own start frame or ``URB_ISO_ASAP``. As
+explained earlier, if you always keep at least one URB queued and your
+completion keeps (re)submitting a later URB, you'll get smooth ISO streaming
+(if usb bandwidth utilization allows).
+
+If you specify your own start frame, make sure it's several frames in advance
+of the current frame. You might want this model if you're synchronizing
+ISO data with some other event stream.
+
+
+How to start interrupt (INT) transfers?
+=======================================
+
+Interrupt transfers, like isochronous transfers, are periodic, and happen
+in intervals that are powers of two (1, 2, 4, etc.) units. Units are frames
+for full and low speed devices, and microframes for high speed ones.
+You can use the :c:func:`usb_fill_int_urb` macro to fill INT transfer fields.
+
+The :c:func:`usb_submit_urb` call modifies ``urb->interval`` to the implemented
+interval value that is less than or equal to the requested interval value.
+
+In Linux 2.6, unlike earlier versions, interrupt URBs are not automagically
+restarted when they complete. They end when the completion handler is
+called, just like other URBs. If you want an interrupt URB to be restarted,
+your completion handler must resubmit it.
diff --git a/Documentation/driver-api/usb/anchors.rst b/Documentation/driver-api/usb/anchors.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4b248e691bd6e76c838d84e3c778f6e06415c15e
--- /dev/null
+++ b/Documentation/driver-api/usb/anchors.rst
@@ -0,0 +1,83 @@
+USB Anchors
+~~~~~~~~~~~
+
+What is an anchor?
+==================
+
+A USB driver needs to support some callbacks requiring
+a driver to cease all IO to an interface. To do so, a
+driver has to keep track of the URBs it has submitted
+to know when they've all completed, or to call usb_kill_urb
+for them. The anchor is a data structure that takes care of
+keeping track of URBs and provides methods to deal with
+multiple URBs.
+
+Allocation and Initialisation
+=============================
+
+There's no API to allocate an anchor. It is simply declared
+as struct usb_anchor. :c:func:`init_usb_anchor` must be called to
+initialise the data structure.
+
+Deallocation
+============
+
+Once it has no more URBs associated with it, the anchor can be
+freed with normal memory management operations.
+
+Association and disassociation of URBs with anchors
+===================================================
+
+An association of URBs to an anchor is made by an explicit
+call to :c:func:`usb_anchor_urb`. The association is maintained until
+an URB is finished by (successful) completion. Thus disassociation
+is automatic. A function is provided to forcibly finish (kill)
+all URBs associated with an anchor.
+Furthermore, disassociation can be made with :c:func:`usb_unanchor_urb`.
+
+Operations on multitudes of URBs
+================================
+
+:c:func:`usb_kill_anchored_urbs`
+--------------------------------
+
+This function kills all URBs associated with an anchor. The URBs
+are killed in the reverse temporal order they were submitted.
+This way no data can be reordered.
+
+:c:func:`usb_unlink_anchored_urbs`
+----------------------------------
+
+This function unlinks all URBs associated with an anchor. The URBs
+are processed in the reverse temporal order they were submitted.
+This is similar to :c:func:`usb_kill_anchored_urbs`, but it will not sleep.
+Therefore no guarantee is made that the URBs have been unlinked when
+the call returns. They may be unlinked later, but will be unlinked in
+finite time.
+
+:c:func:`usb_scuttle_anchored_urbs`
+-----------------------------------
+
+All URBs of an anchor are unanchored en masse.
+
+:c:func:`usb_wait_anchor_empty_timeout`
+---------------------------------------
+
+This function waits for all URBs associated with an anchor to finish
+or a timeout, whichever comes first. Its return value will tell you
+whether the timeout was reached.
+
+:c:func:`usb_anchor_empty`
+--------------------------
+
+Returns true if no URBs are associated with an anchor. Locking
+is the caller's responsibility.
+
+:c:func:`usb_get_from_anchor`
+-----------------------------
+
+Returns the oldest anchored URB of an anchor. The URB is unanchored
+and returned with a reference. As you may mix URBs to several
+destinations in one anchor, you have no guarantee the chronologically
+first submitted URB is returned.
diff --git a/Documentation/driver-api/usb/bulk-streams.rst b/Documentation/driver-api/usb/bulk-streams.rst
new file mode 100644
index 0000000000000000000000000000000000000000..99b515babdeb2b41bc8053a562e77522b2403789
--- /dev/null
+++ b/Documentation/driver-api/usb/bulk-streams.rst
@@ -0,0 +1,83 @@
+USB bulk streams
+~~~~~~~~~~~~~~~~
+
+Background
+==========
+
+Bulk endpoint streams were added in the USB 3.0 specification. Streams allow a
+device driver to overload a bulk endpoint so that multiple transfers can be
+queued at once.
+
+Streams are defined in sections 4.4.6.4 and 8.12.1.4 of the Universal Serial Bus
+3.0 specification at http://www.usb.org/developers/docs/. The USB Attached SCSI
+Protocol, which uses streams to queue multiple SCSI commands, can be found on
+the T10 website (http://t10.org/).
+
+
+Device-side implications
+========================
+
+Once a buffer has been queued to a stream ring, the device is notified (through
+an out-of-band mechanism on another endpoint) that data is ready for that stream
+ID. The device then tells the host which "stream" it wants to start. The host
+can also initiate a transfer on a stream without the device asking, but the
+device can refuse that transfer. Devices can switch between streams at any
+time.
+
+
+Driver implications
+===================
+
+::
+
+    int usb_alloc_streams(struct usb_interface *interface,
+                          struct usb_host_endpoint **eps, unsigned int num_eps,
+                          unsigned int num_streams, gfp_t mem_flags);
+
+Device drivers will call this API to request that the host controller driver
+allocate memory so the driver can use up to num_streams stream IDs. They must
+pass an array of usb_host_endpoints that need to be set up with similar stream
+IDs. This is to ensure that a UASP driver will be able to use the same stream
+ID for the bulk IN and OUT endpoints used in a Bi-directional command sequence.
+
+The return value is an error condition (if one of the endpoints doesn't support
+streams, or the xHCI driver ran out of memory), or the number of streams the
+host controller allocated for this endpoint. The xHCI host controller hardware
+declares how many stream IDs it can support, and each bulk endpoint on a
+SuperSpeed device will say how many stream IDs it can handle. Therefore,
+drivers should be able to deal with being allocated fewer stream IDs than they
+requested.
+
+Do NOT call this function if you have URBs enqueued for any of the endpoints
+passed in as arguments. Do not call this function to request fewer than two
+streams.
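+
+As a rough sketch of the calling pattern under the rules above (the ``intf``,
+``bulk_in_ep`` and ``bulk_out_ep`` variables and the stream count here are
+hypothetical placeholders, not taken from a real driver)::
+
+    struct usb_host_endpoint *eps[] = { bulk_in_ep, bulk_out_ep };
+    int num;
+
+    /* Ask for up to 16 stream IDs on both endpoints at once. */
+    num = usb_alloc_streams(intf, eps, 2, 16, GFP_KERNEL);
+    if (num < 0)
+        return num;     /* no stream support, or out of memory */
+
+    /* Only stream IDs 1..num may be used from here on. */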
+
+Drivers will only be allowed to call this API once for the same endpoint
+without calling usb_free_streams(). This is a simplification for the xHCI host
+controller driver, and may change in the future.
+
+
+Picking new Stream IDs to use
+=============================
+
+Stream ID 0 is reserved, and should not be used to communicate with devices. If
+usb_alloc_streams() returns with a value of N, you may use streams 1 through N.
+To queue an URB for a specific stream, set the urb->stream_id value. If the
+endpoint does not support streams, an error will be returned.
+
+Note that a new API to choose the next stream ID will have to be added if the
+xHCI driver supports secondary stream IDs.
+
+
+Clean up
+========
+
+If a driver wishes to stop using streams to communicate with the device, it
+should call::
+
+    void usb_free_streams(struct usb_interface *interface,
+                          struct usb_host_endpoint **eps, unsigned int num_eps,
+                          gfp_t mem_flags);
+
+All stream IDs will be deallocated when the driver releases the interface, to
+ensure that drivers that don't support streams will be able to use the endpoint.
diff --git a/Documentation/driver-api/usb/callbacks.rst b/Documentation/driver-api/usb/callbacks.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2b80cf54bcc3118b956a8292aeae8e3af80acf7a
--- /dev/null
+++ b/Documentation/driver-api/usb/callbacks.rst
@@ -0,0 +1,157 @@
+USB core callbacks
+~~~~~~~~~~~~~~~~~~
+
+What callbacks will usbcore do?
+===============================
+
+Usbcore will call into a driver through callbacks defined in the driver
+structure and through the completion handler of URBs a driver submits.
+Only the former are in the scope of this document. These two kinds of
+callbacks are completely independent of each other. Information on the
+completion callback can be found in :ref:`usb-urb`.
+
+The callbacks defined in the driver structure are:
+
+1. Hotplugging callbacks:
+
+   - @probe:
+     Called to see if the driver is willing to manage a particular
+     interface on a device.
+
+   - @disconnect:
+     Called when the interface is no longer accessible, usually
+     because its device has been (or is being) disconnected or the
+     driver module is being unloaded.
+
+2. Odd backdoor through usbfs:
+
+   - @ioctl:
+     Used for drivers that want to talk to userspace through
+     the "usbfs" filesystem. This lets devices provide ways to
+     expose information to user space regardless of where they
+     do (or don't) show up otherwise in the filesystem.
+
+3. Power management (PM) callbacks:
+
+   - @suspend:
+     Called when the device is going to be suspended.
+
+   - @resume:
+     Called when the device is being resumed.
+
+   - @reset_resume:
+     Called when the suspended device has been reset instead
+     of being resumed.
+
+4. Device level operations:
+
+   - @pre_reset:
+     Called when the device is about to be reset.
+
+   - @post_reset:
+     Called after the device has been reset.
+
+The ioctl interface (2) should be used only if you have a very good
+reason. Sysfs is preferred these days. The PM callbacks are covered
+separately in :ref:`usb-power-management`.
+
+Calling conventions
+===================
+
+All callbacks are mutually exclusive. There's no need for locking
+against other USB callbacks. All callbacks are called from a task
+context. You may sleep. However, it is important that all sleeps have a
+small fixed upper limit in time. In particular you must not call out to
+user space and await results.
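+
+Before looking at the individual callbacks, here is a sketch of how they are
+wired up through struct usb_driver (the ``skel_*`` names and the ID table are
+hypothetical placeholders, not a real driver)::
+
+    static struct usb_driver skel_driver = {
+        .name       = "skeleton",
+        .id_table   = skel_id_table,
+        .probe      = skel_probe,
+        .disconnect = skel_disconnect,
+        .suspend    = skel_suspend,
+        .resume     = skel_resume,
+        .pre_reset  = skel_pre_reset,
+        .post_reset = skel_post_reset,
+    };
+
+    /* registers the driver and its callbacks with usbcore */
+    module_usb_driver(skel_driver);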
+
+Hotplugging callbacks
+=====================
+
+These callbacks are intended to associate and disassociate a driver with
+an interface. A driver's bond to an interface is exclusive.
+
+The probe() callback
+--------------------
+
+::
+
+    int (*probe) (struct usb_interface *intf,
+                  const struct usb_device_id *id);
+
+Accept or decline an interface. If you accept the interface, return 0;
+otherwise -ENODEV or -ENXIO. Other error codes should be used only if a
+genuine error occurred during initialisation which prevented a driver
+from accepting a device that would otherwise have been accepted.
+You are strongly encouraged to use usbcore's facility,
+usb_set_intfdata(), to associate a data structure with an interface, so
+that you know which internal state and identity you associate with a
+particular interface. The device will not be suspended and you may do IO
+to the interface you are called for and endpoint 0 of the device. Device
+initialisation that doesn't take too long is a good idea here.
+
+The disconnect() callback
+-------------------------
+
+::
+
+    void (*disconnect) (struct usb_interface *intf);
+
+This callback is a signal to break any connection with an interface.
+You are not allowed any IO to a device after returning from this
+callback. You also may not do any other operation that may interfere
+with another driver bound to the interface, e.g. a power management
+operation.
+If you are called due to a physical disconnection, all your URBs will be
+killed by usbcore. Note that in this case disconnect will be called some
+time after the physical disconnection. Thus your driver must be prepared
+to deal with failing IO even prior to the callback.
+
+Device level callbacks
+======================
+
+pre_reset
+---------
+
+::
+
+    int (*pre_reset)(struct usb_interface *intf);
+
+A driver or user space is triggering a reset on the device which
+contains the interface passed as an argument. Cease IO, wait for all
+outstanding URBs to complete, and save any device state you need to
+restore. No more URBs may be submitted until the post_reset method
+is called.
+
+If you need to allocate memory here, use GFP_NOIO, or GFP_ATOMIC if you
+are in atomic context.
+
+post_reset
+----------
+
+::
+
+    int (*post_reset)(struct usb_interface *intf);
+
+The reset has completed. Restore any saved device state and begin
+using the device again.
+
+If you need to allocate memory here, use GFP_NOIO, or GFP_ATOMIC if you
+are in atomic context.
+
+Call sequences
+==============
+
+No callbacks other than probe will be invoked for an interface
+that isn't bound to your driver.
+
+Probe will never be called for an interface bound to a driver.
+Hence following a successful probe, disconnect will be called
+before there is another probe for the same interface.
+
+Once your driver is bound to an interface, disconnect can be
+called at any time except in between pre_reset and post_reset.
+pre_reset is always followed by post_reset, even if the reset
+failed or the device has been unplugged.
+
+suspend is always followed by one of: resume, reset_resume, or
+disconnect.
diff --git a/Documentation/driver-api/usb/dma.rst b/Documentation/driver-api/usb/dma.rst
new file mode 100644
index 0000000000000000000000000000000000000000..59d5aee89e37e9dfd5a3b3019035fb3d1af2e5ed
--- /dev/null
+++ b/Documentation/driver-api/usb/dma.rst
@@ -0,0 +1,136 @@
+USB DMA
+~~~~~~~
+
+In Linux 2.5 kernels (and later), USB device drivers have additional control
+over how DMA may be used to perform I/O operations.
+The APIs are detailed in the kernel usb programming guide (kerneldoc, from
+the source code).
+
+API overview
+============
+
+The big picture is that USB drivers can continue to ignore most DMA issues,
+though they still must provide DMA-ready buffers (see
+``Documentation/DMA-API-HOWTO.txt``). That's how they worked through
+the 2.4 (and earlier) kernels; alternatively, they can now be DMA-aware.
+
+DMA-aware usb drivers:
+
+- New calls enable DMA-aware drivers, letting them allocate dma buffers and
+  manage dma mappings for existing dma-ready buffers (see below).
+
+- URBs have an additional "transfer_dma" field, as well as a transfer_flags
+  bit saying if it's valid. (Control requests also have "setup_dma", but
+  drivers must not use it.)
+
+- "usbcore" will map this DMA address, if a DMA-aware driver didn't do
+  it first and set ``URB_NO_TRANSFER_DMA_MAP``. HCDs
+  don't manage dma mappings for URBs.
+
+- There's a new "generic DMA API", parts of which are usable by USB device
+  drivers. Never use dma_set_mask() on any USB interface or device; that
+  would potentially break all devices sharing that bus.
+
+Eliminating copies
+==================
+
+It's good to avoid making CPUs copy data needlessly. The costs can add up,
+and effects like cache-thrashing can impose subtle penalties.
+
+- If you're doing lots of small data transfers from the same buffer all
+  the time, that can really burn up resources on systems which use an
+  IOMMU to manage the DMA mappings. It can cost MUCH more to set up and
+  tear down the IOMMU mappings with each request than to perform the I/O!
+
+  For those specific cases, USB has primitives to allocate less expensive
+  memory. They work like kmalloc and kfree versions that give you the right
+  kind of addresses to store in urb->transfer_buffer and urb->transfer_dma.
+  You'd also set ``URB_NO_TRANSFER_DMA_MAP`` in urb->transfer_flags::
+
+    void *usb_alloc_coherent (struct usb_device *dev, size_t size,
+                              gfp_t mem_flags, dma_addr_t *dma);
+
+    void usb_free_coherent (struct usb_device *dev, size_t size,
+                            void *addr, dma_addr_t dma);
+
+  Most drivers should **NOT** be using these primitives; they don't need
+  to use this type of memory ("dma-coherent"), and memory returned from
+  :c:func:`kmalloc` will work just fine.
+
+  The memory buffer returned is "dma-coherent"; sometimes you might need to
+  force a consistent memory access ordering by using memory barriers. It's
+  not using a streaming DMA mapping, so it's good for small transfers on
+  systems where the I/O would otherwise thrash an IOMMU mapping. (See
+  ``Documentation/DMA-API-HOWTO.txt`` for definitions of "coherent" and
+  "streaming" DMA mappings.)
+
+  Asking for 1/Nth of a page (as well as asking for N pages) is reasonably
+  space-efficient.
+
+  On most systems the memory returned will be uncached, because the
+  semantics of dma-coherent memory require either bypassing CPU caches
+  or using cache hardware with bus-snooping support. While x86 hardware
+  has such bus-snooping, many other systems use software to flush cache
+  lines to prevent DMA conflicts.
+
+- Devices on some EHCI controllers could handle DMA to/from high memory.
+
+  Unfortunately, the current Linux DMA infrastructure doesn't have a sane
+  way to expose these capabilities ... and in any case, HIGHMEM is mostly a
+  design wart specific to x86_32. So your best bet is to ensure you never
+  pass a highmem buffer into a USB driver. That's easy; it's the default
+  behavior. Just don't override it; e.g. with ``NETIF_F_HIGHDMA``.
+
+  This may force your callers to do some bounce buffering, copying from
+  high memory to "normal" DMA memory. If you can come up with a good way
+  to fix this issue (for x86_32 machines with over 1 GByte of memory),
+  feel free to submit patches.
+
+Working with existing buffers
+=============================
+
+Existing buffers aren't usable for DMA without first being mapped into the
+DMA address space of the device. However, most buffers passed to your
+driver can safely be used with such DMA mapping. (See the first section
+of Documentation/DMA-API-HOWTO.txt, titled "What memory is DMA-able?")
+
+- When you're using scatterlists, you can map everything at once. On some
+  systems, this kicks in an IOMMU and turns the scatterlists into single
+  DMA transactions::
+
+    int usb_buffer_map_sg (struct usb_device *dev, unsigned pipe,
+                           struct scatterlist *sg, int nents);
+
+    void usb_buffer_dmasync_sg (struct usb_device *dev, unsigned pipe,
+                                struct scatterlist *sg, int n_hw_ents);
+
+    void usb_buffer_unmap_sg (struct usb_device *dev, unsigned pipe,
+                              struct scatterlist *sg, int n_hw_ents);
+
+  It's probably easier to use the new ``usb_sg_*()`` calls, which do the DMA
+  mapping and apply other tweaks to make scatterlist I/O fast.
+
+- Some drivers may prefer to work with the model that they're mapping large
+  buffers, synchronizing their safe re-use. (If there's no re-use, then let
+  usbcore do the map/unmap.) Large periodic transfers make good examples
+  here, since it's cheaper to just synchronize the buffer than to unmap it
+  each time an urb completes and then re-map it during resubmission.
+
+  These calls all work with initialized urbs: ``urb->dev``, ``urb->pipe``,
+  ``urb->transfer_buffer``, and ``urb->transfer_buffer_length`` must all be
+  valid when these calls are used (``urb->setup_packet`` must be valid too
+  if urb is a control request)::
+
+    struct urb *usb_buffer_map (struct urb *urb);
+
+    void usb_buffer_dmasync (struct urb *urb);
+
+    void usb_buffer_unmap (struct urb *urb);
+
+  The calls manage ``urb->transfer_dma`` for you, and set
+  ``URB_NO_TRANSFER_DMA_MAP`` so that usbcore won't map or unmap the buffer.
+  They cannot be used for setup_packet buffers in control requests.
+
+Note that several of those interfaces are currently commented out, since
+they don't have current users. See the source code. Other than the dmasync
+calls (where the underlying DMA primitives have changed), most of them can
+easily be commented back in if you want to use them.
diff --git a/Documentation/driver-api/usb/error-codes.rst b/Documentation/driver-api/usb/error-codes.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a3e84bfac77649bc62a73a3575cb56f3e1645c5d
--- /dev/null
+++ b/Documentation/driver-api/usb/error-codes.rst
@@ -0,0 +1,207 @@
+.. _usb-error-codes:
+
+USB Error codes
+~~~~~~~~~~~~~~~
+
+:Revised: 2004-Oct-21
+
+This is the documentation of (hopefully) all possible error codes (and
+their interpretation) that can be returned from usbcore.
+
+Some of them are returned by the Host Controller Drivers (HCDs), which
+device drivers only see through usbcore. As a rule, all the HCDs should
+behave the same except for transfer speed dependent behaviors and the
+way certain faults are reported.
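+
+The sections below list the codes. As a hedged sketch of how a driver
+typically consumes the :c:func:`usb_submit_urb` return value (the ``urb``
+variable and the logging policy are illustrative only)::
+
+    int retval = usb_submit_urb(urb, GFP_KERNEL);
+
+    if (retval) {
+        /* -ENODEV and -ESHUTDOWN usually just mean the device is gone */
+        if (retval != -ENODEV && retval != -ESHUTDOWN)
+            dev_err(&urb->dev->dev,
+                    "urb submission failed, error %d\n", retval);
+        return retval;
+    }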
+
+
+Error codes returned by :c:func:`usb_submit_urb`
+================================================
+
+Non-USB-specific:
+
+
+=============== ===============================================
+0               URB submission went fine
+
+``-ENOMEM``     no memory for allocation of internal structures
+=============== ===============================================
+
+USB-specific:
+
+======================= =======================================================
+``-EBUSY``              The URB is already active.
+
+``-ENODEV``             specified USB-device or bus doesn't exist
+
+``-ENOENT``             specified interface or endpoint does not exist or
+                        is not enabled
+
+``-ENXIO``              host controller driver does not support queuing of
+                        this type of urb. (treat as a host controller bug.)
+
+``-EINVAL``             a) Invalid transfer type specified (or not supported)
+                        b) Invalid or unsupported periodic transfer interval
+                        c) ISO: attempted to change transfer interval
+                        d) ISO: ``number_of_packets`` is < 0
+                        e) various other cases
+
+``-EXDEV``              ISO: ``URB_ISO_ASAP`` wasn't specified and all the
+                        frames the URB would be scheduled in have already
+                        expired.
+
+``-EFBIG``              Host controller driver can't schedule that many ISO
+                        frames.
+
+``-EPIPE``              The pipe type specified in the URB doesn't match the
+                        endpoint's actual type.
+
+``-EMSGSIZE``           (a) endpoint maxpacket size is zero; it is not usable
+                        in the current interface altsetting.
+                        (b) ISO packet is larger than the endpoint maxpacket.
+                        (c) requested data transfer length is invalid: negative
+                        or too large for the host controller.
+
+``-ENOSPC``             This request would overcommit the usb bandwidth reserved
+                        for periodic transfers (interrupt, isochronous).
+
+``-ESHUTDOWN``          The device or host controller has been disabled due to
+                        some problem that could not be worked around.
+
+``-EPERM``              Submission failed because ``urb->reject`` was set.
+
+``-EHOSTUNREACH``       URB was rejected because the device is suspended.
+
+``-ENOEXEC``            A control URB doesn't contain a Setup packet.
+======================= =======================================================
+
+Error codes returned in ``urb->status`` or in ``iso_frame_desc[n].status`` (for ISO)
+====================================================================================
+
+USB device drivers may only test urb status values in completion handlers.
+This is because otherwise there would be a race between HCDs updating
+these values on one CPU, and device drivers testing them on another CPU.
+
+A transfer's actual_length may be positive even when an error has been
+reported. That's because transfers often involve several packets, so that
+one or more packets could finish before an error stops further endpoint I/O.
+
+For isochronous URBs, the urb status value is non-zero only if the URB is
+unlinked, the device is removed, the host controller is disabled, or the total
+transferred length is less than the requested length and the
+``URB_SHORT_NOT_OK`` flag is set. Completion handlers for isochronous URBs
+should only see ``urb->status`` set to zero, ``-ENOENT``, ``-ECONNRESET``,
+``-ESHUTDOWN``, or ``-EREMOTEIO``. Individual frame descriptor status fields
+may report more status codes.
+
+
+=============================== ===============================================
+0                               Transfer completed successfully
+
+``-ENOENT``                     URB was synchronously unlinked by
+                                :c:func:`usb_unlink_urb`
+
+``-EINPROGRESS``                URB still pending, no results yet
+                                (That is, if drivers see this it's a bug.)
+
+``-EPROTO`` [#f1]_, [#f2]_      a) bitstuff error
+                                b) no response packet received within the
+                                prescribed bus turn-around time
+                                c) unknown USB error
+
+``-EILSEQ`` [#f1]_, [#f2]_      a) CRC mismatch
+                                b) no response packet received within the
+                                prescribed bus turn-around time
+                                c) unknown USB error
+
+                                Note that often the controller hardware does
+                                not distinguish among cases a), b), and c), so
+                                a driver cannot tell whether there was a
+                                protocol error, a failure to respond (often
+                                caused by device disconnect), or some other
+                                fault.
+
+``-ETIME`` [#f2]_               No response packet received within the
+                                prescribed bus turn-around time. This error
+                                may instead be reported as
+                                ``-EPROTO`` or ``-EILSEQ``.
+
+``-ETIMEDOUT``                  Synchronous USB message functions use this code
+                                to indicate timeout expired before the transfer
+                                completed, and no other error was reported
+                                by HC.
+
+``-EPIPE`` [#f2]_               Endpoint stalled. For non-control endpoints,
+                                reset this status with
+                                :c:func:`usb_clear_halt`.
+
+``-ECOMM``                      During an IN transfer, the host controller
+                                received data from an endpoint faster than it
+                                could be written to system memory
+
+``-ENOSR``                      During an OUT transfer, the host controller
+                                could not retrieve data from system memory fast
+                                enough to keep up with the USB data rate
+
+``-EOVERFLOW`` [#f1]_           The amount of data returned by the endpoint was
+                                greater than either the max packet size of the
+                                endpoint or the remaining buffer size.
+                                "Babble".
+
+``-EREMOTEIO``                  The data read from the endpoint did not fill
+                                the specified buffer, and ``URB_SHORT_NOT_OK``
+                                was set in ``urb->transfer_flags``.
+
+``-ENODEV``                     Device was removed. Often preceded by a burst
+                                of other errors, since the hub driver doesn't
+                                detect device removal events immediately.
+
+``-EXDEV``                      ISO transfer only partially completed
+                                (only set in ``iso_frame_desc[n].status``,
+                                not ``urb->status``)
+
+``-EINVAL``                     ISO madness, if this happens: Log off and
+                                go home
+
+``-ECONNRESET``                 URB was asynchronously unlinked by
+                                :c:func:`usb_unlink_urb`
+
+``-ESHUTDOWN``                  The device or host controller has been
+                                disabled due to some problem that could not
+                                be worked around, such as a physical
+                                disconnect.
+=============================== ===============================================
+
+
+.. [#f1]
+
+   Error codes like ``-EPROTO``, ``-EILSEQ`` and ``-EOVERFLOW`` normally
+   indicate hardware problems such as bad devices (including firmware)
+   or cables.
+
+.. [#f2]
+
+   This is also one of several codes that different kinds of host
+   controller use to indicate a transfer has failed because of device
+   disconnect. In the interval before the hub driver starts disconnect
+   processing, devices may receive such fault reports for every request.
+
+
+
+Error codes returned by usbcore functions
+=========================================
+
+.. note:: expect also other submit and transfer status codes
+
+:c:func:`usb_register`:
+
+======================= ===================================
+``-EINVAL``             error during registering new driver
+======================= ===================================
+
+``usb_get_*/usb_set_*()``,
+:c:func:`usb_control_msg`,
+:c:func:`usb_bulk_msg`:
+
+======================= ==============================================
+``-ETIMEDOUT``          Timeout expired before the transfer completed.
+======================= ==============================================
diff --git a/Documentation/driver-api/usb/gadget.rst b/Documentation/driver-api/usb/gadget.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3e8a3809c0b82afb50551499b9ce090ce92f26b5
--- /dev/null
+++ b/Documentation/driver-api/usb/gadget.rst
@@ -0,0 +1,510 @@
+========================
+USB Gadget API for Linux
+========================
+
+:Author: David Brownell
+:Date: 20 August 2004
+
+Introduction
+============
+
+This document presents a Linux-USB "Gadget" kernel mode API, for use
+within peripherals and other USB devices that embed Linux. It provides
+an overview of the API structure, and shows how that fits into a system
+development project. This is the first such API released on Linux to
+address a number of important problems, including:
+
+- Supports USB 2.0, for high speed devices which can stream data at
+  several dozen megabytes per second.
+
+- Handles devices with dozens of endpoints just as well as ones with
+  just two fixed-function ones. Gadget drivers can be written so
+  they're easy to port to new hardware.
+
+- Flexible enough to expose more complex USB device capabilities such
+  as multiple configurations, multiple interfaces, composite devices,
+  and alternate interface settings.
+
+- USB "On-The-Go" (OTG) support, in conjunction with updates to the
+  Linux-USB host side.
+
+- Sharing data structures and API models with the Linux-USB host side
+  API. This helps the OTG support, and looks forward to more-symmetric
+  frameworks (where the same I/O model is used by both host and device
+  side drivers).
+
+- Minimalist, so it's easier to support new device controller hardware.
+  I/O processing doesn't imply large demands for memory or CPU
+  resources.
+
+Most Linux developers will not be able to use this API, since they have
+USB ``host`` hardware in a PC, workstation, or server. Linux users with
+embedded systems are more likely to have USB peripheral hardware. To
+distinguish drivers running inside such hardware from the more familiar
+Linux "USB device drivers", which are host side proxies for the real USB
+devices, a different term is used: the drivers inside the peripherals
+are "USB gadget drivers". In USB protocol interactions, the device
+driver is the master (or "client driver") and the gadget driver is the
+slave (or "function driver").
+
+The gadget API resembles the host side Linux-USB API in that both use
+queues of request objects to package I/O buffers, and those requests may
+be submitted or canceled. They share common definitions for the standard
+USB *Chapter 9* messages, structures, and constants. Also, both APIs
+bind and unbind drivers to devices. The APIs differ in detail, since the
+host side's current URB framework exposes a number of implementation
+details and assumptions that are inappropriate for a gadget API. While
+the model for control transfers and configuration management is
+necessarily different (one side is a hardware-neutral master, the other
+is a hardware-aware slave), the endpoint I/O API used here should also
+be usable for an overhead-reduced host side API.
+
+Structure of Gadget Drivers
+===========================
+
+A system running inside a USB peripheral normally has at least three
+layers inside the kernel to handle USB protocol processing, and may have
+additional layers in user space code. The ``gadget`` API is used by the
+middle layer to interact with the lowest level (which directly handles
+hardware).
+
+In Linux, from the bottom up, these layers are:
+
+*USB Controller Driver*
+    This is the lowest software level. It is the only layer that talks
+    to hardware, through registers, fifos, dma, irqs, and the like. The
+    ``<linux/usb/gadget.h>`` API abstracts the peripheral controller
+    endpoint hardware. That hardware is exposed through endpoint
+    objects, which accept streams of IN/OUT buffers, and through
+    callbacks that interact with gadget drivers. Since normal USB
+    devices only have one upstream port, they only have one of these
+    drivers. The controller driver can support any number of different
+    gadget drivers, but only one of them can be used at a time.
+
+    Examples of such controller hardware include the PCI-based NetChip
+    2280 USB 2.0 high speed controller, the SA-11x0 or PXA-25x UDC
+    (found within many PDAs), and a variety of other products.
+
+*Gadget Driver*
+    The lower boundary of this driver implements hardware-neutral USB
+    functions, using calls to the controller driver. Because such
+    hardware varies widely in capabilities and restrictions, and is used
+    in embedded environments where space is at a premium, the gadget
+    driver is often configured at compile time to work with endpoints
+    supported by one particular controller. Gadget drivers may be
+    portable to several different controllers, using conditional
+    compilation. (Recent kernels substantially simplify the work
+    involved in supporting new hardware, by *autoconfiguring* endpoints
+    automatically for many bulk-oriented drivers.) Gadget driver
+    responsibilities include:
+
+    - handling setup requests (ep0 protocol responses) possibly
+      including class-specific functionality
+
+    - returning configuration and string descriptors
+
+    - (re)setting configurations and interface altsettings, including
+      enabling and configuring endpoints
+
+    - handling life cycle events, such as managing bindings to
+      hardware, USB suspend/resume, remote wakeup, and disconnection
+      from the USB host.
+
+    - managing IN and OUT transfers on all currently enabled endpoints
+
+    Such drivers may be modules of proprietary code, although that
+    approach is discouraged in the Linux community.
+
+*Upper Level*
+    Most gadget drivers have an upper boundary that connects to some
+    Linux driver or framework in Linux. Through that boundary flows the
+    data which the gadget driver produces and/or consumes through
+    protocol transfers over USB. Examples include:
+
+    - user mode code, using generic (gadgetfs) or application specific
+      files in ``/dev``
+
+    - networking subsystem (for network gadgets, like the CDC Ethernet
+      Model gadget driver)
+
+    - data capture drivers, perhaps video4Linux or a scanner driver; or
+      test and measurement hardware.
+
+    - input subsystem (for HID gadgets)
+
+    - sound subsystem (for audio gadgets)
+
+    - file system (for PTP gadgets)
+
+    - block i/o subsystem (for usb-storage gadgets)
+
+    - ... and more
+
+*Additional Layers*
+    Other layers may exist. These could include kernel layers, such as
+    network protocol stacks, as well as user mode applications building
+    on standard POSIX system call APIs such as ``open()``, ``close()``,
+    ``read()`` and ``write()``. On newer systems, POSIX Async I/O calls may
+    be an option. Such user mode code will not necessarily be subject to
+    the GNU General Public License (GPL).
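+
+To make the *Gadget Driver* layer concrete, here is a minimal, hedged
+sketch of the shape such a driver takes. The ``demo_*`` names are
+hypothetical; a real driver must also provide descriptors and a full
+ep0 protocol implementation::
+
+    #include <linux/module.h>
+    #include <linux/usb/gadget.h>
+
+    static int demo_bind(struct usb_gadget *gadget,
+                         struct usb_gadget_driver *driver)
+    {
+        /* claim endpoints, allocate requests, prepare descriptors */
+        return 0;
+    }
+
+    static void demo_unbind(struct usb_gadget *gadget)
+    {
+        /* release endpoints and free requests */
+    }
+
+    static int demo_setup(struct usb_gadget *gadget,
+                          const struct usb_ctrlrequest *ctrl)
+    {
+        /* answer the ep0 requests the controller driver doesn't */
+        return -EOPNOTSUPP;     /* protocol stall for the rest */
+    }
+
+    static void demo_disconnect(struct usb_gadget *gadget) { }
+
+    static struct usb_gadget_driver demo_driver = {
+        .function   = "demo",
+        .max_speed  = USB_SPEED_HIGH,
+        .bind       = demo_bind,
+        .unbind     = demo_unbind,
+        .setup      = demo_setup,
+        .disconnect = demo_disconnect,
+        .driver     = {
+            .name   = "demo",
+            .owner  = THIS_MODULE,
+        },
+    };
+
+    /* module init would call usb_gadget_probe_driver(&demo_driver) */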
+
+OTG-capable systems will also need to include a standard Linux-USB host
+side stack, with ``usbcore``, one or more *Host Controller Drivers*
+(HCDs), *USB Device Drivers* to support the OTG "Targeted Peripheral
+List", and so forth. There will also be an *OTG Controller Driver*,
+which is visible to gadget and device driver developers only indirectly.
+That helps the host and device side USB controllers implement the two
+new OTG protocols (HNP and SRP). Roles switch (host to peripheral, or
+vice versa) using HNP during USB suspend processing, and SRP can be
+viewed as a more battery-friendly kind of device wakeup protocol.
+
+Over time, reusable utilities are evolving to help make some gadget
+driver tasks simpler. For example, building configuration descriptors
+from vectors of descriptors for the configuration's interfaces and
+endpoints is now automated, and many drivers now use autoconfiguration
+to choose hardware endpoints and initialize their descriptors. A
+potential example of particular interest is code implementing standard
+USB-IF protocols for HID, networking, storage, or audio classes. Some
+developers are interested in KDB or KGDB hooks, to let target hardware
+be remotely debugged. Most such USB protocol code doesn't need to be
+hardware-specific, any more than network protocols like X11, HTTP, or
+NFS are. Such gadget-side interface drivers should eventually be
+combined, to implement composite devices.
+
+Kernel Mode Gadget API
+======================
+
+Gadget drivers declare themselves through a struct
+:c:type:`usb_gadget_driver`, which is responsible for most parts of enumeration
+for a struct :c:type:`usb_gadget`. The response to a set_configuration usually
+involves enabling one or more of the struct :c:type:`usb_ep` objects exposed by
+the gadget, and submitting one or more struct :c:type:`usb_request` buffers to
+transfer data. Understand those four data types, and their operations,
+and you will understand how this API works.
+
+.. Note::
+
+    Other than the "Chapter 9" data types, most of the significant data
+    types and functions are described here.
+
+    However, some relevant information is likely omitted from what you
+    are reading. One example of such information is endpoint
+    autoconfiguration. You'll have to read the header file, and use
+    example source code (such as that for "Gadget Zero"), to fully
+    understand the API.
+
+    The part of the API implementing some basic driver capabilities is
+    specific to the version of the Linux kernel that's in use. The 2.6
+    and later kernel versions include a *driver model* framework that has
+    no analogue on earlier kernels; so those parts of the gadget API are
+    not fully portable. (They are implemented on 2.4 kernels, but in a
+    different way.) The driver model state is another part of this API that is
+    ignored by the kerneldoc tools.
+
+The core API does not expose every possible hardware feature, only the
+most widely available ones. There are significant hardware features,
+such as device-to-device DMA (without temporary storage in a memory
+buffer) that would be added using hardware-specific APIs.
+
+This API allows drivers to use conditional compilation to handle
+endpoint capabilities of different hardware, but doesn't require that.
+Hardware tends to have arbitrary restrictions, relating to transfer
+types, addressing, packet sizes, buffering, and availability. As a rule,
+such differences only matter for "endpoint zero" logic that handles
+device configuration and management.
The API supports limited run-time
+detection of capabilities, through naming conventions for endpoints.
+Many drivers will be able to at least partially autoconfigure
+themselves. In particular, driver init sections will often have endpoint
+autoconfiguration logic that scans the hardware's list of endpoints to
+find ones matching the driver requirements (relying on those
+conventions), to eliminate some of the most common reasons for
+conditional compilation.
+
+Like the Linux-USB host side API, this API exposes the "chunky" nature
+of USB messages: I/O requests are in terms of one or more "packets", and
+packet boundaries are visible to drivers. Compared to RS-232 serial
+protocols, USB resembles synchronous protocols like HDLC (N bytes per
+frame, multipoint addressing, host as the primary station and devices as
+secondary stations) more than asynchronous ones (tty style: 8 data bits
+per frame, no parity, one stop bit). So for example the controller
+drivers won't buffer two single byte writes into a single two-byte USB
+IN packet, although gadget drivers may do so when they implement
+protocols where packet boundaries (and "short packets") are not
+significant.
+
+Driver Life Cycle
+-----------------
+
+Gadget drivers make endpoint I/O requests to hardware without needing to
+know many details of the hardware, but driver setup/configuration code
+needs to handle some differences. Use the API like this:
+
+1. Register a driver for the particular device side usb controller
+   hardware, such as the net2280 on PCI (USB 2.0), sa11x0 or pxa25x as
+   found in Linux PDAs, and so on. At this point the device is logically
+   in the USB ch9 initial state (``attached``), drawing no power and not
+   usable (since it does not yet support enumeration). No host should
+   see the device, since it has not activated the data line pullup
+   used by the host to detect a device, even if VBUS power is available.
+
+2. Register a gadget driver that implements some higher level device
+   function. That will then bind() to a :c:type:`usb_gadget`, which activates
+   the data line pullup sometime after detecting VBUS.
+
+3. The hardware driver can now start enumerating. The steps it handles
+   are to accept USB ``power`` and ``set_address`` requests. Other steps are
+   handled by the gadget driver. If the gadget driver module is unloaded
+   before the host starts to enumerate, steps before step 7 are skipped.
+
+4. The gadget driver's ``setup()`` call returns usb descriptors, based both
+   on what the bus interface hardware provides and on the functionality
+   being implemented. That can involve alternate settings or
+   configurations, unless the hardware prevents such operation. For OTG
+   devices, each configuration descriptor includes an OTG descriptor.
+
+5. The gadget driver handles the last step of enumeration, when the USB
+   host issues a ``set_configuration`` call. It enables all endpoints used
+   in that configuration, with all interfaces in their default settings.
+   That involves using a list of the hardware's endpoints, enabling each
+   endpoint according to its descriptor. It may also involve using
+   ``usb_gadget_vbus_draw`` to let more power be drawn from VBUS, as
+   allowed by that configuration. For OTG devices, setting a
+   configuration may also involve reporting HNP capabilities through a
+   user interface.
+
+6. Do real work and perform data transfers, possibly involving changes
+   to interface settings or switching to new configurations, until the
+   device is disconnect()ed from the host.
Queue any number of transfer
+   requests to each endpoint. It may be suspended and resumed several
+   times before being disconnected. On disconnect, the drivers go back
+   to step 3 (above).
+
+7. When the gadget driver module is being unloaded, the driver unbind()
+   callback is issued. That lets the controller driver be unloaded.
+
+Drivers will normally be arranged so that just loading the gadget driver
+module (or statically linking it into a Linux kernel) allows the
+peripheral device to be enumerated, but some drivers will defer
+enumeration until some higher level component (like a user mode daemon)
+enables it. Note that at this lowest level there are no policies about
+how ep0 configuration logic is implemented, except that it should obey
+USB specifications. Such issues are in the domain of gadget drivers,
+including knowing about implementation constraints imposed by some USB
+controllers or understanding that composite devices might happen to be
+built by integrating reusable components.
+
+Note that the lifecycle above can be slightly different for OTG devices.
+Other than providing an additional OTG descriptor in each configuration,
+only the HNP-related differences are particularly visible to driver
+code. They involve reporting requirements during the ``SET_CONFIGURATION``
+request, and the option to invoke HNP during some suspend callbacks.
+Also, SRP changes the semantics of ``usb_gadget_wakeup`` slightly.
+
+USB 2.0 Chapter 9 Types and Constants
+-------------------------------------
+
+Gadget drivers rely on common USB structures and constants defined in
+the ``<linux/usb/ch9.h>`` header file, which is standard in
+Linux 2.6+ kernels. These are the same types and constants used by host side
+drivers (and usbcore).
+
+Core Objects and Methods
+------------------------
+
+These are declared in ``<linux/usb/gadget.h>``, and are used by gadget
+drivers to interact with USB peripheral controller drivers.
+
+.. kernel-doc:: include/linux/usb/gadget.h
+   :internal:
+
+Optional Utilities
+------------------
+
+The core API is sufficient for writing a USB Gadget Driver, but some
+optional utilities are provided to simplify common tasks. These
+utilities include endpoint autoconfiguration.
+
+.. kernel-doc:: drivers/usb/gadget/usbstring.c
+   :export:
+
+.. kernel-doc:: drivers/usb/gadget/config.c
+   :export:
+
+Composite Device Framework
+--------------------------
+
+The core API is sufficient for writing drivers for composite USB devices
+(with more than one function in a given configuration), and also
+multi-configuration devices (also more than one function, but not
+necessarily sharing a given configuration). There is however an optional
+framework which makes it easier to reuse and combine functions.
+
+Devices using this framework provide a struct :c:type:`usb_composite_driver`,
+which in turn provides one or more struct :c:type:`usb_configuration`
+instances. Each such configuration includes at least one struct
+:c:type:`usb_function`, which packages a user visible role such as "network
+link" or "mass storage device". Management functions may also exist,
+such as "Device Firmware Upgrade".
+
+.. kernel-doc:: include/linux/usb/composite.h
+   :internal:
+
+.. kernel-doc:: drivers/usb/gadget/composite.c
+   :export:
+
+Composite Device Functions
+--------------------------
+
+At this writing, a few of the current gadget drivers have been converted
+to this framework. Near-term plans include converting all of them,
+except for ``gadgetfs``.
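+
+As an illustration of the shapes involved, here is a minimal, hedged
+sketch of a composite driver skeleton. The ``demo_*`` names and the
+vendor/product IDs are placeholders; a real driver would add its
+functions to the configuration with ``usb_add_function()``::
+
+    #include <linux/module.h>
+    #include <linux/usb/composite.h>
+
+    static struct usb_device_descriptor demo_device_desc = {
+        .bLength         = USB_DT_DEVICE_SIZE,
+        .bDescriptorType = USB_DT_DEVICE,
+        .idVendor        = cpu_to_le16(0x0525),  /* placeholder IDs */
+        .idProduct       = cpu_to_le16(0xffff),
+    };
+
+    static struct usb_configuration demo_config = {
+        .label               = "demo",
+        .bConfigurationValue = 1,
+        .bmAttributes        = USB_CONFIG_ATT_SELFPOWER,
+    };
+
+    /* populate the configuration; a real driver would call
+     * usb_add_function() here for each usb_function it offers */
+    static int demo_do_config(struct usb_configuration *c)
+    {
+        return 0;
+    }
+
+    static int demo_bind(struct usb_composite_dev *cdev)
+    {
+        return usb_add_config(cdev, &demo_config, demo_do_config);
+    }
+
+    static struct usb_composite_driver demo_composite = {
+        .name      = "demo",
+        .dev       = &demo_device_desc,
+        .max_speed = USB_SPEED_HIGH,
+        .bind      = demo_bind,
+    };
+
+    module_usb_composite_driver(demo_composite);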
+
+Peripheral Controller Drivers
+=============================
+
+The first hardware supporting this API was the NetChip 2280 controller,
+which supports USB 2.0 high speed and is based on PCI. This is the
+``net2280`` driver module. The driver supports Linux kernel versions 2.4
+and 2.6; contact NetChip Technologies for development boards and product
+information.
+
+Other hardware working in the ``gadget`` framework includes: Intel's PXA
+25x and IXP42x series processors (``pxa2xx_udc``), Toshiba TC86c001
+"Goku-S" (``goku_udc``), Renesas SH7705/7727 (``sh_udc``), MediaQ 11xx
+(``mq11xx_udc``), Hynix HMS30C7202 (``h7202_udc``), National 9303/4
+(``n9604_udc``), Texas Instruments OMAP (``omap_udc``), Sharp LH7A40x
+(``lh7a40x_udc``), and more. Most of those are full speed controllers.
+
+At this writing, there are people at work on drivers in this framework
+for several other USB device controllers, with plans to make many of
+them be widely available.
+
+A partial USB simulator, the ``dummy_hcd`` driver, is available. It can
+act like a net2280, a pxa25x, or an sa11x0 in terms of available
+endpoints and device speeds; and it simulates control, bulk, and to some
+extent interrupt transfers. That lets you develop some parts of a gadget
+driver on a normal PC, without any special hardware, and perhaps with
+the assistance of tools such as GDB running with User Mode Linux. At
+least one person has expressed interest in adapting that approach,
+hooking it up to a simulator for a microcontroller. Such simulators can
+help debug subsystems where the runtime hardware is unfriendly to
+software development, or is not yet available.
+
+Support for other controllers is expected to be developed and
+contributed over time, as this driver framework evolves.
+
+Gadget Drivers
+==============
+
+In addition to *Gadget Zero* (used primarily for testing and development
+with drivers for usb controller hardware), other gadget drivers exist.
+
+There's an ``ethernet`` gadget driver, which implements one of the most
+useful *Communications Device Class* (CDC) models. One of the standards
+for cable modem interoperability even specifies the use of this ethernet
+model as one of two mandatory options. Gadgets using this code look to a
+USB host as if they're an Ethernet adapter. It provides access to a
+network where the gadget's CPU is one host, which could easily be
+bridging, routing, or firewalling access to other networks. Since some
+hardware can't fully implement the CDC Ethernet requirements, this
+driver also implements a "good parts only" subset of CDC Ethernet. (That
+subset doesn't advertise itself as CDC Ethernet, to avoid creating
+problems.)
+
+Support for Microsoft's ``RNDIS`` protocol has been contributed by
+Pengutronix and Auerswald GmbH. This is like CDC Ethernet, but it runs
+on slightly more USB hardware (but less than the CDC subset). However,
+its main claim to fame is being able to connect directly to recent
+versions of Windows, using drivers that Microsoft bundles and supports,
+making it much simpler to network with Windows.
+
+There is also support for user mode gadget drivers, using ``gadgetfs``.
+This provides a *User Mode API* that presents each endpoint as a single
+file descriptor. I/O is done using normal ``read()`` and ``write()`` calls.
+Familiar tools like GDB and pthreads can be used to develop and debug
+user mode drivers, so that once a robust controller driver is available
+many applications for it won't require new kernel mode software.
Linux
+2.6 *Async I/O (AIO)* support is available, so that user mode software
+can stream data with only slightly more overhead than a kernel driver.
+
+There's a USB Mass Storage class driver, which provides a different
+solution for interoperability with systems such as MS-Windows and MacOS.
+That *Mass Storage* driver uses a file or block device as backing store
+for a drive, like the ``loop`` driver. The USB host uses the BBB, CB, or
+CBI versions of the mass storage class specification, using transparent
+SCSI commands to access the data from the backing store.
+
+There's a "serial line" driver, useful for TTY style operation over USB.
+The latest version of that driver supports CDC ACM style operation, like
+a USB modem, and so on most hardware it can interoperate easily with
+MS-Windows. One interesting use of that driver is in boot firmware (like
+a BIOS), which can sometimes use that model with very small systems
+without real serial lines.
+
+Support for other kinds of gadget is expected to be developed and
+contributed over time, as this driver framework evolves.
+
+USB On-The-Go (OTG)
+===================
+
+USB OTG support on Linux 2.6 was initially developed by Texas
+Instruments for `OMAP <http://www.omap.com>`__ 16xx and 17xx series
+processors. Other OTG systems should work in similar ways, but the
+hardware level details could be very different.
+
+Systems need specialized hardware support to implement OTG, notably
+including a special *Mini-AB* jack and associated transceiver to support
+*Dual-Role* operation: they can act either as a host, using the standard
+Linux-USB host side driver stack, or as a peripheral, using this
+``gadget`` framework. To do that, the system software relies on small
+additions to those programming interfaces, and on a new internal
+component (here called an "OTG Controller") affecting which driver stack
+connects to the OTG port. In each role, the system can re-use the
+existing pool of hardware-neutral drivers, layered on top of the
+controller driver interfaces (:c:type:`usb_bus` or :c:type:`usb_gadget`).
+Such drivers need at most minor changes, and most of the calls added to
+support OTG can also benefit non-OTG products.
+
+- Gadget drivers test the ``is_otg`` flag, and use it to determine
+  whether or not to include an OTG descriptor in each of their
+  configurations.
+
+- Gadget drivers may need changes to support the two new OTG protocols,
+  exposed in new gadget attributes such as the ``b_hnp_enable`` flag. HNP
+  support should be reported through a user interface (two LEDs could
+  suffice), and is triggered in some cases when the host suspends the
+  peripheral. SRP support can be user-initiated just like remote
+  wakeup, probably by pressing the same button.
+
+- On the host side, USB device drivers need to be taught to trigger HNP
+  at appropriate moments, using ``usb_suspend_device()``. That also
+  conserves battery power, which is useful even for non-OTG
+  configurations.
+
+- Also on the host side, a driver must support the OTG "Targeted
+  Peripheral List". That's just a whitelist, used to reject peripherals
+  not supported with a given Linux OTG host. *This whitelist is
+  product-specific; each product must modify* ``otg_whitelist.h`` *to
+  match its interoperability specification.*
+
+  Non-OTG Linux hosts, like PCs and workstations, normally have some
+  solution for adding drivers, so that peripherals that aren't
+  recognized can eventually be supported.
That approach is unreasonable
+  for consumer products that may never have their firmware upgraded, and
+  where it's usually unrealistic to expect the traditional
+  PC/workstation/server kind of support model to work. For example,
+  it's often impractical to change device firmware once the product has
+  been distributed, so driver bugs can't normally be fixed if they're
+  found after shipment.
+
+Additional changes are needed below those hardware-neutral :c:type:`usb_bus`
+and :c:type:`usb_gadget` driver interfaces; those aren't discussed here in any
+detail. Those affect the hardware-specific code for each USB Host or
+Peripheral controller, and how the HCD initializes (since OTG can be
+active only on a single port). They also involve what may be called an
+*OTG Controller Driver*, managing the OTG transceiver and the OTG state
+machine logic as well as much of the root hub behavior for the OTG port.
+The OTG controller driver needs to activate and deactivate USB
+controllers depending on the relevant device role. Some related changes
+were needed inside usbcore, so that it can identify OTG-capable devices
+and respond appropriately to HNP or SRP protocols.
diff --git a/Documentation/driver-api/usb/hotplug.rst b/Documentation/driver-api/usb/hotplug.rst
new file mode 100644
index 0000000000000000000000000000000000000000..79663e653ca1cca947eadfd8c5349a62576ab3fb
--- /dev/null
+++ b/Documentation/driver-api/usb/hotplug.rst
@@ -0,0 +1,154 @@
+USB hotplugging
+~~~~~~~~~~~~~~~
+
+Linux Hotplugging
+=================
+
+
+In hotpluggable busses like USB (and Cardbus PCI), end-users plug devices
+into the bus with power on. In most cases, users expect the devices to become
+immediately usable. That means the system must do many things, including:
+
+    - Find a driver that can handle the device. That may involve
+      loading a kernel module; newer drivers can use module-init-tools
+      to publish their device (and class) support to user utilities.
+
+    - Bind a driver to that device. Bus frameworks do that using a
+      device driver's probe() routine.
+
+    - Tell other subsystems to configure the new device. Print
+      queues may need to be enabled, networks brought up, disk
+      partitions mounted, and so on. In some cases these will
+      be driver-specific actions.
+
+This involves a mix of kernel mode and user mode actions. Making devices
+immediately usable means that any user mode actions can't wait for an
+administrator to do them: the kernel must trigger them, either passively
+(triggering some monitoring daemon to invoke a helper program) or
+actively (calling such a user mode helper program directly).
+
+Those triggered actions must support a system's administrative policies;
+such programs are called "policy agents" here. Typically they involve
+shell scripts that dispatch to more familiar administration tools.
+
+Because some of those actions rely on information about drivers (metadata)
+that is currently available only when the drivers are dynamically linked,
+you get the best hotplugging when you configure a highly modular system.
+
+Kernel Hotplug Helper (``/sbin/hotplug``)
+=========================================
+
+There is a kernel parameter: ``/proc/sys/kernel/hotplug``, which normally
+holds the pathname ``/sbin/hotplug``. That parameter names a program
+which the kernel may invoke at various times.
+
+The /sbin/hotplug program can be invoked by any subsystem as part of its
+reaction to a configuration change, from a thread in that subsystem.
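+
+As a rough, hedged sketch of what such an invocation looks like from
+kernel code (the ``demo_*`` helper is hypothetical, and real callers
+read the helper path from the sysctl and pass a richer environment, as
+described below)::
+
+    #include <linux/kmod.h>
+
+    /* Hypothetical example: tell the helper that a "usb" event
+     * occurred. Real subsystems build ACTION/PRODUCT/etc. dynamically. */
+    static int demo_notify_hotplug(void)
+    {
+        char *argv[] = { "/sbin/hotplug", "usb", NULL };
+        char *envp[] = { "HOME=/",
+                         "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
+                         "ACTION=add",
+                         NULL };
+
+        return call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+    }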
+
+Only one parameter is required: the name of a subsystem being notified of
+some kernel event. That name is used as the first key for further event
+dispatch; any other argument and environment parameters are specified by
+the subsystem making that invocation.
+
+Hotplug software and other resources are available at:
+
+    http://linux-hotplug.sourceforge.net
+
+Mailing list information is also available at that site.
+
+
+USB Policy Agent
+================
+
+The USB subsystem currently invokes ``/sbin/hotplug`` when USB devices
+are added or removed from the system. The invocation is done by the kernel
+hub workqueue [hub_wq], or else as part of root hub initialization
+(done by init, modprobe, kapmd, etc). Its single command line parameter
+is the string "usb", and it passes these environment variables:
+
+========== ============================================
+ACTION     ``add``, ``remove``
+PRODUCT    USB vendor, product, and version codes (hex)
+TYPE       device class codes (decimal)
+INTERFACE  interface 0 class codes (decimal)
+========== ============================================
+
+If "usbdevfs" is configured, DEVICE and DEVFS are also passed. DEVICE is
+the pathname of the device, and is useful for devices with multiple and/or
+alternate interfaces that complicate driver selection. By design, USB
+hotplugging is independent of ``usbdevfs``: you can do most essential parts
+of USB device setup without using that filesystem, and without running a
+user mode daemon to detect changes in system configuration.
+
+Currently available policy agent implementations can load driver modules,
+and can invoke driver-specific setup scripts. The newest ones
+leverage USB module-init-tools support. Later agents might unload drivers.
+
+
+USB Modutils Support
+====================
+
+Current versions of module-init-tools will create a ``modules.usbmap`` file
+which contains the entries from each driver's ``MODULE_DEVICE_TABLE``. Such
+files can be used by various user mode policy agents to make sure all the
+right driver modules get loaded, either at boot time or later.
+
+See ``linux/usb.h`` for full information about such table entries; or look
+at existing drivers. Each table entry describes one or more criteria to
+be used when matching a driver to a device or class of devices. The
+specific criteria are identified by bits set in "match_flags", paired
+with field values. You can construct the criteria directly, or with
+macros such as these, and use driver_info to store more information::
+
+    USB_DEVICE (vendorId, productId)
+        ... matching devices with specified vendor and product ids
+    USB_DEVICE_VER (vendorId, productId, lo, hi)
+        ... like USB_DEVICE with lo <= product version <= hi
+    USB_INTERFACE_INFO (class, subclass, protocol)
+        ... matching specified interface class info
+    USB_DEVICE_INFO (class, subclass, protocol)
+        ... matching specified device class info
+
+A short example, for a driver that supports several specific USB devices
+and their quirks, might have a MODULE_DEVICE_TABLE like this::
+
+    static const struct usb_device_id mydriver_id_table[] = {
+        { USB_DEVICE (0x9999, 0xaaaa), .driver_info = QUIRK_X },
+        { USB_DEVICE (0xbbbb, 0x8888), .driver_info = QUIRK_Y|QUIRK_Z },
+        ...
+        { } /* end with an all-zeroes entry */
+    };
+    MODULE_DEVICE_TABLE(usb, mydriver_id_table);
+
+Most USB device drivers should pass these tables to the USB subsystem as
+well as to the module management subsystem.
Not all, though: some driver +frameworks connect using interfaces layered over USB, and so they won't +need such a struct :c:type:`usb_driver`. + +Drivers that connect directly to the USB subsystem should be declared +something like this:: + + static struct usb_driver mydriver = { + .name = "mydriver", + .id_table = mydriver_id_table, + .probe = my_probe, + .disconnect = my_disconnect, + + /* + if using the usb chardev framework: + .minor = MY_USB_MINOR_START, + .fops = my_file_ops, + if exposing any operations through usbdevfs: + .ioctl = my_ioctl, + */ + }; + +When the USB subsystem knows about a driver's device ID table, it's used when +choosing drivers to probe(). The thread doing new device processing checks +drivers' device ID entries from the ``MODULE_DEVICE_TABLE`` against interface +and device descriptors for the device. It will only call ``probe()`` if there +is a match, and the third argument to ``probe()`` will be the entry that +matched. + +If you don't provide an ``id_table`` for your driver, then your driver may get +probed for each new device; the third parameter to ``probe()`` will be +``NULL``. diff --git a/Documentation/driver-api/usb/index.rst b/Documentation/driver-api/usb/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..1bf64edc8c8a5c9bff3655d154eaa32584843530 --- /dev/null +++ b/Documentation/driver-api/usb/index.rst @@ -0,0 +1,26 @@ +============= +Linux USB API +============= + +.. toctree:: + + usb + gadget + anchors + bulk-streams + callbacks + dma + URB + power-management + hotplug + persist + error-codes + writing_usb_driver + writing_musb_glue_layer + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/driver-api/usb/persist.rst b/Documentation/driver-api/usb/persist.rst new file mode 100644 index 0000000000000000000000000000000000000000..08cafc6292c1d279da4f65b1f3061748d463602f --- /dev/null +++ b/Documentation/driver-api/usb/persist.rst @@ -0,0 +1,171 @@ +.. _usb-persist: + +USB device persistence during system suspend +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:Author: Alan Stern +:Date: September 2, 2006 (Updated February 25, 2008) + + +What is the problem? +==================== + +According to the USB specification, when a USB bus is suspended the +bus must continue to supply suspend current (around 1-5 mA). This +is so that devices can maintain their internal state and hubs can +detect connect-change events (devices being plugged in or unplugged). +The technical term is "power session". + +If a USB device's power session is interrupted then the system is +required to behave as though the device has been unplugged. It's a +conservative approach; in the absence of suspend current the computer +has no way to know what has actually happened. Perhaps the same +device is still attached or perhaps it was removed and a different +device plugged into the port. The system must assume the worst. + +By default, Linux behaves according to the spec. If a USB host +controller loses power during a system suspend, then when the system +wakes up all the devices attached to that controller are treated as +though they had disconnected. This is always safe and it is the +"officially correct" thing to do. + +For many sorts of devices this behavior doesn't matter in the least. +If the kernel wants to believe that your USB keyboard was unplugged +while the system was asleep and a new keyboard was plugged in when the +system woke up, who cares? It'll still work the same when you type on +it. 
+ +Unfortunately problems _can_ arise, particularly with mass-storage +devices. The effect is exactly the same as if the device really had +been unplugged while the system was suspended. If you had a mounted +filesystem on the device, you're out of luck -- everything in that +filesystem is now inaccessible. This is especially annoying if your +root filesystem was located on the device, since your system will +instantly crash. + +Loss of power isn't the only mechanism to worry about. Anything that +interrupts a power session will have the same effect. For example, +even though suspend current may have been maintained while the system +was asleep, on many systems during the initial stages of wakeup the +firmware (i.e., the BIOS) resets the motherboard's USB host +controllers. Result: all the power sessions are destroyed and again +it's as though you had unplugged all the USB devices. Yes, it's +entirely the BIOS's fault, but that doesn't do _you_ any good unless +you can convince the BIOS supplier to fix the problem (lots of luck!). + +On many systems the USB host controllers will get reset after a +suspend-to-RAM. On almost all systems, no suspend current is +available during hibernation (also known as swsusp or suspend-to-disk). +You can check the kernel log after resuming to see if either of these +has happened; look for lines saying "root hub lost power or was reset". + +In practice, people are forced to unmount any filesystems on a USB +device before suspending. If the root filesystem is on a USB device, +the system can't be suspended at all. (All right, it _can_ be +suspended -- but it will crash as soon as it wakes up, which isn't +much better.) + + +What is the solution? +===================== + +The kernel includes a feature called USB-persist. It tries to work +around these issues by allowing the core USB device data structures to +persist across a power-session disruption. + +It works like this. If the kernel sees that a USB host controller is +not in the expected state during resume (i.e., if the controller was +reset or otherwise had lost power) then it applies a persistence check +to each of the USB devices below that controller for which the +"persist" attribute is set. It doesn't try to resume the device; that +can't work once the power session is gone. Instead it issues a USB +port reset and then re-enumerates the device. (This is exactly the +same thing that happens whenever a USB device is reset.) If the +re-enumeration shows that the device now attached to that port has the +same descriptors as before, including the Vendor and Product IDs, then +the kernel continues to use the same device structure. In effect, the +kernel treats the device as though it had merely been reset instead of +unplugged. + +The same thing happens if the host controller is in the expected state +but a USB device was unplugged and then replugged, or if a USB device +fails to carry out a normal resume. + +If no device is now attached to the port, or if the descriptors are +different from what the kernel remembers, then the treatment is what +you would expect. The kernel destroys the old device structure and +behaves as though the old device had been unplugged and a new device +plugged in. + +The end result is that the USB device remains available and usable. +Filesystem mounts and memory mappings are unaffected, and the world is +now a good and happy place. + +Note that the "USB-persist" feature will be applied only to those +devices for which it is enabled. 
You can enable the feature by doing
+(as root)::
+
+    echo 1 >/sys/bus/usb/devices/.../power/persist
+
+where the "..." should be filled in with the device's ID. Disable
+the feature by writing 0 instead of 1. For hubs the feature is
+automatically and permanently enabled and the power/persist file
+doesn't even exist, so you only have to worry about setting it for
+devices where it really matters.
+
+
+Is this the best solution?
+==========================
+
+Perhaps not. Arguably, keeping track of mounted filesystems and
+memory mappings across device disconnects should be handled by a
+centralized Logical Volume Manager. Such a solution would allow you
+to plug in a USB flash device, create a persistent volume associated
+with it, unplug the flash device, plug it back in later, and still
+have the same persistent volume associated with the device. As such
+it would be more far-reaching than USB-persist.
+
+On the other hand, writing a persistent volume manager would be a big
+job and using it would require significant input from the user. This
+solution is much quicker and easier -- and it exists now, a giant
+point in its favor!
+
+Furthermore, the USB-persist feature applies to _all_ USB devices, not
+just mass-storage devices. It might turn out to be equally useful for
+other device types, such as network interfaces.
+
+
+WARNING: USB-persist can be dangerous!!
+=======================================
+
+When recovering an interrupted power session the kernel does its best
+to make sure the USB device hasn't been changed; that is, the same
+device is still plugged into the port as before. But the checks
+aren't guaranteed to be 100% accurate.
+
+If you replace one USB device with another of the same type (same
+manufacturer, same IDs, and so on) there's an excellent chance the
+kernel won't detect the change. The serial number string and other
+descriptors are compared with the kernel's stored values, but this
+might not help since manufacturers frequently omit serial numbers
+entirely in their devices.
+
+Furthermore it's quite possible to leave a USB device exactly the same
+while changing its media. If you replace the flash memory card in a
+USB card reader while the system is asleep, the kernel will have no
+way to know you did it. The kernel will assume that nothing has
+happened and will continue to use the partition tables, inodes, and
+memory mappings for the old card.
+
+If the kernel gets fooled in this way, it's almost certain to cause
+data corruption and to crash your system. You'll have no one to blame
+but yourself.
+
+For devices with the avoid_reset_quirk attribute set, persist may
+fail because the device may morph after reset.
+
+YOU HAVE BEEN WARNED! USE AT YOUR OWN RISK!
+
+That having been said, most of the time there shouldn't be any trouble
+at all. The USB-persist feature can be extremely useful. Make the
+most of it.
diff --git a/Documentation/driver-api/usb/power-management.rst b/Documentation/driver-api/usb/power-management.rst
new file mode 100644
index 0000000000000000000000000000000000000000..79beb807996b7a3a17e08b5f1d6e31d4176d3fc2
--- /dev/null
+++ b/Documentation/driver-api/usb/power-management.rst
@@ -0,0 +1,794 @@
+.. _usb-power-management:
+
+Power Management for USB
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+:Author: Alan Stern
+:Date: Last-updated: February 2014
+
+..
+    Contents:
+    ---------
+    * What is Power Management?
+    * What is Remote Wakeup?
+    * When is a USB device idle?
+    * Forms of dynamic PM
+    * The user interface for dynamic PM
+    * Changing the default idle-delay time
+    * Warnings
+    * The driver interface for Power Management
+    * The driver interface for autosuspend and autoresume
+    * Other parts of the driver interface
+    * Mutual exclusion
+    * Interaction between dynamic PM and system PM
+    * xHCI hardware link PM
+    * USB Port Power Control
+    * User Interface for Port Power Control
+    * Suggested Userspace Port Power Policy
+
+
+What is Power Management?
+-------------------------
+
+Power Management (PM) is the practice of saving energy by suspending
+parts of a computer system when they aren't being used. While a
+component is ``suspended`` it is in a nonfunctional low-power state; it
+might even be turned off completely. A suspended component can be
+``resumed`` (returned to a functional full-power state) when the kernel
+needs to use it. (There also are forms of PM in which components are
+placed in a less functional but still usable state instead of being
+suspended; an example would be reducing the CPU's clock rate. This
+document will not discuss those other forms.)
+
+When the parts being suspended include the CPU and most of the rest of
+the system, we speak of it as a "system suspend". When a particular
+device is turned off while the system as a whole remains running, we
+call it a "dynamic suspend" (also known as a "runtime suspend" or
+"selective suspend"). This document concentrates mostly on how
+dynamic PM is implemented in the USB subsystem, although system PM is
+covered to some extent (see ``Documentation/power/*.txt`` for more
+information about system PM).
+
+System PM support is present only if the kernel was built with
+``CONFIG_SUSPEND`` or ``CONFIG_HIBERNATION`` enabled. Dynamic PM support
+for USB is present whenever the kernel was built with ``CONFIG_PM``
+enabled.
+
+[Historically, dynamic PM support for USB was present only if the
+kernel had been built with ``CONFIG_USB_SUSPEND`` enabled (which depended on
+``CONFIG_PM_RUNTIME``). Starting with the 3.10 kernel release, dynamic PM
+support for USB was present whenever the kernel was built with
+``CONFIG_PM_RUNTIME`` enabled. The ``CONFIG_USB_SUSPEND`` option had been
+eliminated.]
+
+
+What is Remote Wakeup?
+----------------------
+
+When a device has been suspended, it generally doesn't resume until
+the computer tells it to. Likewise, if the entire computer has been
+suspended, it generally doesn't resume until the user tells it to, say
+by pressing a power button or opening the cover.
+
+However some devices have the capability of resuming by themselves, or
+asking the kernel to resume them, or even telling the entire computer
+to resume. This capability goes by several names such as "Wake On
+LAN"; we will refer to it generically as "remote wakeup". When a
+device is enabled for remote wakeup and it is suspended, it may resume
+itself (or send a request to be resumed) in response to some external
+event. Examples include a suspended keyboard resuming when a key is
+pressed, or a suspended USB hub resuming when a device is plugged in.
+
+
+When is a USB device idle?
+--------------------------
+
+A device is idle whenever the kernel thinks it's not busy doing
+anything important and thus is a candidate for being suspended. The
+exact definition depends on the device's driver; drivers are allowed
+to declare that a device isn't idle even when there's no actual
+communication taking place.
(For example, a hub isn't considered idle +unless all the devices plugged into that hub are already suspended.) +In addition, a device isn't considered idle so long as a program keeps +its usbfs file open, whether or not any I/O is going on. + +If a USB device has no driver, its usbfs file isn't open, and it isn't +being accessed through sysfs, then it definitely is idle. + + +Forms of dynamic PM +------------------- + +Dynamic suspends occur when the kernel decides to suspend an idle +device. This is called ``autosuspend`` for short. In general, a device +won't be autosuspended unless it has been idle for some minimum period +of time, the so-called idle-delay time. + +Of course, nothing the kernel does on its own initiative should +prevent the computer or its devices from working properly. If a +device has been autosuspended and a program tries to use it, the +kernel will automatically resume the device (autoresume). For the +same reason, an autosuspended device will usually have remote wakeup +enabled, if the device supports remote wakeup. + +It is worth mentioning that many USB drivers don't support +autosuspend. In fact, at the time of this writing (Linux 2.6.23) the +only drivers which do support it are the hub driver, kaweth, asix, +usblp, usblcd, and usb-skeleton (which doesn't count). If a +non-supporting driver is bound to a device, the device won't be +autosuspended. In effect, the kernel pretends the device is never +idle. + +We can categorize power management events in two broad classes: +external and internal. External events are those triggered by some +agent outside the USB stack: system suspend/resume (triggered by +userspace), manual dynamic resume (also triggered by userspace), and +remote wakeup (triggered by the device). Internal events are those +triggered within the USB stack: autosuspend and autoresume. Note that +all dynamic suspend events are internal; external agents are not +allowed to issue dynamic suspends. + + +The user interface for dynamic PM +--------------------------------- + +The user interface for controlling dynamic PM is located in the ``power/`` +subdirectory of each USB device's sysfs directory, that is, in +``/sys/bus/usb/devices/.../power/`` where "..." is the device's ID. The +relevant attribute files are: wakeup, control, and +``autosuspend_delay_ms``. (There may also be a file named ``level``; this +file was deprecated as of the 2.6.35 kernel and replaced by the +``control`` file. In 2.6.38 the ``autosuspend`` file will be deprecated +and replaced by the ``autosuspend_delay_ms`` file. The only difference +is that the newer file expresses the delay in milliseconds whereas the +older file uses seconds. Confusingly, both files are present in 2.6.37 +but only ``autosuspend`` works.) + + ``power/wakeup`` + + This file is empty if the device does not support + remote wakeup. Otherwise the file contains either the + word ``enabled`` or the word ``disabled``, and you can + write those words to the file. The setting determines + whether or not remote wakeup will be enabled when the + device is next suspended. (If the setting is changed + while the device is suspended, the change won't take + effect until the following suspend.) + + ``power/control`` + + This file contains one of two words: ``on`` or ``auto``. + You can write those words to the file to change the + device's setting. + + - ``on`` means that the device should be resumed and + autosuspend is not allowed. (Of course, system + suspends are still allowed.) 
+ + - ``auto`` is the normal state in which the kernel is + allowed to autosuspend and autoresume the device. + + (In kernels up to 2.6.32, you could also specify + ``suspend``, meaning that the device should remain + suspended and autoresume was not allowed. This + setting is no longer supported.) + + ``power/autosuspend_delay_ms`` + + This file contains an integer value, which is the + number of milliseconds the device should remain idle + before the kernel will autosuspend it (the idle-delay + time). The default is 2000. 0 means to autosuspend + as soon as the device becomes idle, and negative + values mean never to autosuspend. You can write a + number to the file to change the autosuspend + idle-delay time. + +Writing ``-1`` to ``power/autosuspend_delay_ms`` and writing ``on`` to +``power/control`` do essentially the same thing -- they both prevent the +device from being autosuspended. Yes, this is a redundancy in the +API. + +(In 2.6.21 writing ``0`` to ``power/autosuspend`` would prevent the device +from being autosuspended; the behavior was changed in 2.6.22. The +``power/autosuspend`` attribute did not exist prior to 2.6.21, and the +``power/level`` attribute did not exist prior to 2.6.22. ``power/control`` +was added in 2.6.34, and ``power/autosuspend_delay_ms`` was added in +2.6.37 but did not become functional until 2.6.38.) + + +Changing the default idle-delay time +------------------------------------ + +The default autosuspend idle-delay time (in seconds) is controlled by +a module parameter in usbcore. You can specify the value when usbcore +is loaded. For example, to set it to 5 seconds instead of 2 you would +do:: + + modprobe usbcore autosuspend=5 + +Equivalently, you could add to a configuration file in /etc/modprobe.d +a line saying:: + + options usbcore autosuspend=5 + +Some distributions load the usbcore module very early during the boot +process, by means of a program or script running from an initramfs +image. To alter the parameter value you would have to rebuild that +image. + +If usbcore is compiled into the kernel rather than built as a loadable +module, you can add:: + + usbcore.autosuspend=5 + +to the kernel's boot command line. + +Finally, the parameter value can be changed while the system is +running. If you do:: + + echo 5 >/sys/module/usbcore/parameters/autosuspend + +then each new USB device will have its autosuspend idle-delay +initialized to 5. (The idle-delay values for already existing devices +will not be affected.) + +Setting the initial default idle-delay to -1 will prevent any +autosuspend of any USB device. This has the benefit of allowing you +then to enable autosuspend for selected devices. + + +Warnings +-------- + +The USB specification states that all USB devices must support power +management. Nevertheless, the sad fact is that many devices do not +support it very well. You can suspend them all right, but when you +try to resume them they disconnect themselves from the USB bus or +they stop working entirely. This seems to be especially prevalent +among printers and scanners, but plenty of other types of device have +the same deficiency. + +For this reason, by default the kernel disables autosuspend (the +``power/control`` attribute is initialized to ``on``) for all devices other +than hubs. Hubs, at least, appear to be reasonably well-behaved in +this regard. + +(In 2.6.21 and 2.6.22 this wasn't the case. Autosuspend was enabled +by default for almost all USB devices. A number of people experienced +problems as a result.) 
+ +This means that non-hub devices won't be autosuspended unless the user +or a program explicitly enables it. As of this writing there aren't +any widespread programs which will do this; we hope that in the near +future device managers such as HAL will take on this added +responsibility. In the meantime you can always carry out the +necessary operations by hand or add them to a udev script. You can +also change the idle-delay time; 2 seconds is not the best choice for +every device. + +If a driver knows that its device has proper suspend/resume support, +it can enable autosuspend all by itself. For example, the video +driver for a laptop's webcam might do this (in recent kernels they +do), since these devices are rarely used and so should normally be +autosuspended. + +Sometimes it turns out that even when a device does work okay with +autosuspend there are still problems. For example, the usbhid driver, +which manages keyboards and mice, has autosuspend support. Tests with +a number of keyboards show that typing on a suspended keyboard, while +causing the keyboard to do a remote wakeup all right, will nonetheless +frequently result in lost keystrokes. Tests with mice show that some +of them will issue a remote-wakeup request in response to button +presses but not to motion, and some in response to neither. + +The kernel will not prevent you from enabling autosuspend on devices +that can't handle it. It is even possible in theory to damage a +device by suspending it at the wrong time. (Highly unlikely, but +possible.) Take care. + + +The driver interface for Power Management +----------------------------------------- + +The requirements for a USB driver to support external power management +are pretty modest; the driver need only define:: + + .suspend + .resume + .reset_resume + +methods in its :c:type:`usb_driver` structure, and the ``reset_resume`` method +is optional. The methods' jobs are quite simple: + + - The ``suspend`` method is called to warn the driver that the + device is going to be suspended. If the driver returns a + negative error code, the suspend will be aborted. Normally + the driver will return 0, in which case it must cancel all + outstanding URBs (:c:func:`usb_kill_urb`) and not submit any more. + + - The ``resume`` method is called to tell the driver that the + device has been resumed and the driver can return to normal + operation. URBs may once more be submitted. + + - The ``reset_resume`` method is called to tell the driver that + the device has been resumed and it also has been reset. + The driver should redo any necessary device initialization, + since the device has probably lost most or all of its state + (although the interfaces will be in the same altsettings as + before the suspend). + +If the device is disconnected or powered down while it is suspended, +the ``disconnect`` method will be called instead of the ``resume`` or +``reset_resume`` method. This is also quite likely to happen when +waking up from hibernation, as many systems do not maintain suspend +current to the USB host controllers during hibernation. (It's +possible to work around the hibernation-forces-disconnect problem by +using the USB Persist facility.) + +The ``reset_resume`` method is used by the USB Persist facility (see +:ref:`usb-persist`) and it can also be used under certain +circumstances when ``CONFIG_USB_PERSIST`` is not enabled. 
Currently, if a +device is reset during a resume and the driver does not have a +``reset_resume`` method, the driver won't receive any notification about +the resume. Later kernels will call the driver's ``disconnect`` method; +2.6.23 doesn't do this. + +USB drivers are bound to interfaces, so their ``suspend`` and ``resume`` +methods get called when the interfaces are suspended or resumed. In +principle one might want to suspend some interfaces on a device (i.e., +force the drivers for those interface to stop all activity) without +suspending the other interfaces. The USB core doesn't allow this; all +interfaces are suspended when the device itself is suspended and all +interfaces are resumed when the device is resumed. It isn't possible +to suspend or resume some but not all of a device's interfaces. The +closest you can come is to unbind the interfaces' drivers. + + +The driver interface for autosuspend and autoresume +--------------------------------------------------- + +To support autosuspend and autoresume, a driver should implement all +three of the methods listed above. In addition, a driver indicates +that it supports autosuspend by setting the ``.supports_autosuspend`` flag +in its usb_driver structure. It is then responsible for informing the +USB core whenever one of its interfaces becomes busy or idle. The +driver does so by calling these six functions:: + + int usb_autopm_get_interface(struct usb_interface *intf); + void usb_autopm_put_interface(struct usb_interface *intf); + int usb_autopm_get_interface_async(struct usb_interface *intf); + void usb_autopm_put_interface_async(struct usb_interface *intf); + void usb_autopm_get_interface_no_resume(struct usb_interface *intf); + void usb_autopm_put_interface_no_suspend(struct usb_interface *intf); + +The functions work by maintaining a usage counter in the +usb_interface's embedded device structure. When the counter is > 0 +then the interface is deemed to be busy, and the kernel will not +autosuspend the interface's device. When the usage counter is = 0 +then the interface is considered to be idle, and the kernel may +autosuspend the device. + +Drivers need not be concerned about balancing changes to the usage +counter; the USB core will undo any remaining "get"s when a driver +is unbound from its interface. As a corollary, drivers must not call +any of the ``usb_autopm_*`` functions after their ``disconnect`` +routine has returned. + +Drivers using the async routines are responsible for their own +synchronization and mutual exclusion. + + :c:func:`usb_autopm_get_interface` increments the usage counter and + does an autoresume if the device is suspended. If the + autoresume fails, the counter is decremented back. + + :c:func:`usb_autopm_put_interface` decrements the usage counter and + attempts an autosuspend if the new value is = 0. + + :c:func:`usb_autopm_get_interface_async` and + :c:func:`usb_autopm_put_interface_async` do almost the same things as + their non-async counterparts. The big difference is that they + use a workqueue to do the resume or suspend part of their + jobs. As a result they can be called in an atomic context, + such as an URB's completion handler, but when they return the + device will generally not yet be in the desired state. + + :c:func:`usb_autopm_get_interface_no_resume` and + :c:func:`usb_autopm_put_interface_no_suspend` merely increment or + decrement the usage counter; they do not attempt to carry out + an autoresume or an autosuspend. Hence they can be called in + an atomic context. 
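+
+As a hedged illustration (hypothetical ``demo_*`` names; a real driver
+would save the interface pointer at probe time and add proper locking),
+the busy/idle accounting described next looks roughly like this::
+
+    static struct usb_interface *demo_intf;   /* saved by probe() */
+
+    static int demo_open(struct inode *inode, struct file *file)
+    {
+        /* mark the interface busy; autoresumes the device if needed */
+        int retval = usb_autopm_get_interface(demo_intf);
+
+        if (retval)
+            return retval;   /* counter was already decremented back */
+        return 0;
+    }
+
+    static int demo_release(struct inode *inode, struct file *file)
+    {
+        /* mark the interface idle; an autosuspend attempt may follow */
+        usb_autopm_put_interface(demo_intf);
+        return 0;
+    }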
+ +The simplest usage pattern is that a driver calls +:c:func:`usb_autopm_get_interface` in its open routine and +:c:func:`usb_autopm_put_interface` in its close or release routine. But other +patterns are possible. + +The autosuspend attempts mentioned above will often fail for one +reason or another. For example, the ``power/control`` attribute might be +set to ``on``, or another interface in the same device might not be +idle. This is perfectly normal. If the reason for failure was that +the device hasn't been idle for long enough, a timer is scheduled to +carry out the operation automatically when the autosuspend idle-delay +has expired. + +Autoresume attempts also can fail, although failure would mean that +the device is no longer present or operating properly. Unlike +autosuspend, there's no idle-delay for an autoresume. + + +Other parts of the driver interface +----------------------------------- + +Drivers can enable autosuspend for their devices by calling:: + + usb_enable_autosuspend(struct usb_device *udev); + +in their :c:func:`probe` routine, if they know that the device is capable of +suspending and resuming correctly. This is exactly equivalent to +writing ``auto`` to the device's ``power/control`` attribute. Likewise, +drivers can disable autosuspend by calling:: + + usb_disable_autosuspend(struct usb_device *udev); + +This is exactly the same as writing ``on`` to the ``power/control`` attribute. + +Sometimes a driver needs to make sure that remote wakeup is enabled +during autosuspend. For example, there's not much point +autosuspending a keyboard if the user can't cause the keyboard to do a +remote wakeup by typing on it. If the driver sets +``intf->needs_remote_wakeup`` to 1, the kernel won't autosuspend the +device if remote wakeup isn't available. (If the device is already +autosuspended, though, setting this flag won't cause the kernel to +autoresume it. Normally a driver would set this flag in its ``probe`` +method, at which time the device is guaranteed not to be +autosuspended.) + +If a driver does its I/O asynchronously in interrupt context, it +should call :c:func:`usb_autopm_get_interface_async` before starting output and +:c:func:`usb_autopm_put_interface_async` when the output queue drains. When +it receives an input event, it should call:: + + usb_mark_last_busy(struct usb_device *udev); + +in the event handler. This tells the PM core that the device was just +busy and therefore the next autosuspend idle-delay expiration should +be pushed back. Many of the usb_autopm_* routines also make this call, +so drivers need to worry only when interrupt-driven input arrives. + +Asynchronous operation is always subject to races. For example, a +driver may call the :c:func:`usb_autopm_get_interface_async` routine at a time +when the core has just finished deciding the device has been idle for +long enough but not yet gotten around to calling the driver's ``suspend`` +method. The ``suspend`` method must be responsible for synchronizing with +the I/O request routine and the URB completion handler; it should +cause autosuspends to fail with -EBUSY if the driver needs to use the +device. + +External suspend calls should never be allowed to fail in this way, +only autosuspend calls. The driver can tell them apart by applying +the :c:func:`PMSG_IS_AUTO` macro to the message argument to the ``suspend`` +method; it will return True for internal PM events (autosuspend) and +False for external PM events. 
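+
+For concreteness, here is a minimal, hedged sketch of a ``suspend``
+method that applies this distinction (``demo_dev`` and its fields are
+hypothetical; synchronization with the I/O paths is omitted)::
+
+    static int demo_suspend(struct usb_interface *intf, pm_message_t message)
+    {
+        struct demo_dev *dev = usb_get_intfdata(intf);
+
+        /* refuse autosuspend while I/O is pending; never refuse an
+         * external (system) suspend for that reason */
+        if (PMSG_IS_AUTO(message) && dev->io_pending)
+            return -EBUSY;
+
+        usb_kill_urb(dev->urb);   /* cancel outstanding I/O */
+        return 0;
+    }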
+
+
+Mutual exclusion
+----------------
+
+For external events -- but not necessarily for autosuspend or
+autoresume -- the device semaphore (udev->dev.sem) will be held when a
+``suspend`` or ``resume`` method is called. This implies that external
+suspend/resume events are mutually exclusive with calls to ``probe``,
+``disconnect``, ``pre_reset``, and ``post_reset``; the USB core guarantees that
+this is true of autosuspend/autoresume events as well.
+
+If a driver wants to block all suspend/resume calls during some
+critical section, the best way is to lock the device and call
+:c:func:`usb_autopm_get_interface` (and do the reverse at the end of the
+critical section). Holding the device semaphore will block all
+external PM calls, and the :c:func:`usb_autopm_get_interface` will prevent any
+internal PM calls, even if it fails. (Exercise: Why?)
+
+
+Interaction between dynamic PM and system PM
+--------------------------------------------
+
+Dynamic power management and system power management can interact in
+a couple of ways.
+
+Firstly, a device may already be autosuspended when a system suspend
+occurs. Since system suspends are supposed to be as transparent as
+possible, the device should remain suspended following the system
+resume. But this theory may not work out well in practice; over time
+the kernel's behavior in this regard has changed. As of 2.6.37 the
+policy is to resume all devices during a system resume and let them
+handle their own runtime suspends afterward.
+
+Secondly, a dynamic power-management event may occur as a system
+suspend is underway. The window for this is short, since system
+suspends don't take long (a few seconds usually), but it can happen.
+For example, a suspended device may send a remote-wakeup signal while
+the system is suspending. The remote wakeup may succeed, which would
+cause the system suspend to abort. If the remote wakeup doesn't
+succeed, it may still remain active and thus cause the system to
+resume as soon as the system suspend is complete. Or the remote
+wakeup may fail and get lost. Which outcome occurs depends on timing
+and on the hardware and firmware design.
+
+
+xHCI hardware link PM
+---------------------
+
+The xHCI host controller provides hardware link power management for
+USB 2.0 devices (an xHCI 1.0 feature) and for USB 3.0 devices that
+support link PM. By enabling hardware LPM, the host can automatically
+put the device into a lower power state (L1 for USB 2.0 devices, or
+U1/U2 for USB 3.0 devices), which the device can enter and resume from
+very quickly.
+
+The user interface for controlling hardware LPM is located in the
+``power/`` subdirectory of each USB device's sysfs directory, that is, in
+``/sys/bus/usb/devices/.../power/`` where "..." is the device's ID. The
+relevant attribute files are ``usb2_hardware_lpm`` and ``usb3_hardware_lpm``.
+
+    ``power/usb2_hardware_lpm``
+
+        When a USB2 device which supports LPM is plugged into an
+        xHCI host root hub which supports software LPM, the
+        host will run a software LPM test for it; if the device
+        enters the L1 state and resumes successfully and the host
+        supports USB2 hardware LPM, this file will show up and
+        the driver will enable hardware LPM for the device. You
+        can write y/Y/1 or n/N/0 to the file to enable/disable
+        USB2 hardware LPM manually. This is mainly for testing
+        purposes.
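+
+        For example, assuming a device at bus 2, port 1 (the path
+        below is illustrative), USB2 hardware LPM could be toggled
+        from the shell with::
+
+            echo n > /sys/bus/usb/devices/2-1/power/usb2_hardware_lpm
+            echo y > /sys/bus/usb/devices/2-1/power/usb2_hardware_lpm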
+
+    ``power/usb3_hardware_lpm_u1``
+    ``power/usb3_hardware_lpm_u2``
+
+        When a USB 3.0 lpm-capable device is plugged into an
+        xHCI host which supports link PM, it will check if U1
+        and U2 exit latencies have been set in the BOS
+        descriptor; if the check passes and the host
+        supports USB3 hardware LPM, USB3 hardware LPM will be
+        enabled for the device and these files will be created.
+        The files hold a string value (enable or disable)
+        indicating whether or not USB3 hardware LPM U1 or U2
+        is enabled for the device.
+
+USB Port Power Control
+----------------------
+
+In addition to suspending endpoint devices and enabling hardware
+controlled link power management, the USB subsystem also has the
+capability to disable power to ports under some conditions. Power is
+controlled through ``Set/ClearPortFeature(PORT_POWER)`` requests to a hub.
+In the case of a root or platform-internal hub the host controller
+driver translates ``PORT_POWER`` requests into platform firmware (ACPI)
+method calls to set the port power state. For more background see the
+Linux Plumbers Conference 2012 slides [#f1]_ and video [#f2]_.
+
+Upon receiving a ``ClearPortFeature(PORT_POWER)`` request a USB port is
+logically off, and may trigger the actual loss of VBUS to the port [#f3]_.
+VBUS may be maintained in the case where a hub gangs multiple ports into
+a shared power well causing power to remain until all ports in the gang
+are turned off. VBUS may also be maintained by hub ports configured for
+a charging application. In any event a logically off port will lose
+connection with its device, not respond to hotplug events, and not
+respond to remote wakeup events.
+
+.. warning::
+
+   Turning off a port may result in the inability to hot add a device.
+   Please see "User Interface for Port Power Control" for details.
+
+As far as the device itself is concerned, the effect is similar to what
+it goes through during system suspend, i.e. the power session is lost. Any
+USB device or driver that misbehaves with system suspend will be
+similarly affected by a port power cycle event. For this reason the
+implementation shares the same device recovery path (and honors the same
+quirks) as the system resume path for the hub.
+
+.. [#f1]
+
+    http://dl.dropbox.com/u/96820575/sarah-sharp-lpt-port-power-off2-mini.pdf
+
+.. [#f2]
+
+    http://linuxplumbers.ubicast.tv/videos/usb-port-power-off-kerneluserspace-api/
+
+.. [#f3]
+
+    USB 3.1 Section 10.12
+
+    wakeup note: if a device is configured to send wakeup events the port
+    power control implementation will block poweroff attempts on that
+    port.
+
+
+User Interface for Port Power Control
+-------------------------------------
+
+The port power control mechanism uses the PM runtime system. Poweroff is
+requested by clearing the ``power/pm_qos_no_power_off`` flag of the port device
+(defaults to 1). If the port is disconnected it will immediately receive a
+``ClearPortFeature(PORT_POWER)`` request. Otherwise, it will honor the pm
+runtime rules and require the attached child device and all descendants to be
+suspended. This mechanism is dependent on the hub advertising port power
+switching in its hub descriptor (wHubCharacteristics logical power switching
+mode field).
+
+Note, some interface devices/drivers do not support autosuspend. Userspace may
+need to unbind the interface drivers before the :c:type:`usb_device` will
+suspend. An unbound interface device is suspended by default.
When unbinding,
+be careful to unbind interface drivers, not the driver of the parent usb
+device. Also, leave hub interface drivers bound. If the driver for the usb
+device (not interface) is unbound the kernel is no longer able to resume the
+device. If a hub interface driver is unbound, control of its child ports is
+lost and all attached child-devices will disconnect. A good rule of thumb is
+that if the 'driver/module' link for a device points to
+``/sys/module/usbcore`` then unbinding it will interfere with port power
+control.
+
+Example of the relevant files for port power control. Note, in this example
+these files are relative to a usb hub device (prefix)::
+
+    prefix=/sys/devices/pci0000:00/0000:00:14.0/usb3/3-1
+
+    $prefix/3-1:1.0/3-1-port1/device
+            ^       ^         ^
+            |       |         `-- attached child device
+            |       `------------ hub port device
+            `-------------------- hub interface device
+
+    $prefix/3-1:1.0/3-1-port1/power/pm_qos_no_power_off
+    $prefix/3-1:1.0/3-1-port1/device/power/control
+    $prefix/3-1:1.0/3-1-port1/device/3-1.1:<intf0>/driver/unbind
+    $prefix/3-1:1.0/3-1-port1/device/3-1.1:<intf1>/driver/unbind
+    ...
+    $prefix/3-1:1.0/3-1-port1/device/3-1.1:<intfN>/driver/unbind
+
+In addition to these files some ports may have a 'peer' link to a port on
+another hub. The expectation is that all superspeed ports have a
+hi-speed peer::
+
+    $prefix/3-1:1.0/3-1-port1/peer -> ../../../../usb2/2-1/2-1:1.0/2-1-port1
+    ../../../../usb2/2-1/2-1:1.0/2-1-port1/peer -> ../../../../usb3/3-1/3-1:1.0/3-1-port1
+
+Distinct from 'companion ports', or 'ehci/xhci shared switchover ports',
+peer ports are simply the hi-speed and superspeed interface pins that
+are combined into a single usb3 connector. Peer ports share the same
+ancestor XHCI device.
+
+While a superspeed port is powered off a device may downgrade its
+connection and attempt to connect to the hi-speed pins. The
+implementation takes steps to prevent this:
+
+1. Port suspend is sequenced to guarantee that hi-speed ports are powered-off
+   before their superspeed peer is permitted to power-off. The implication is
+   that setting ``pm_qos_no_power_off`` to zero on a superspeed port may
+   not cause the port to power-off until its highspeed peer has gone to its
+   runtime suspend state. Userspace must take care to order the suspensions
+   if it wants to guarantee that a superspeed port will power-off.
+
+2. Port resume is sequenced to force a superspeed port to power-on prior to its
+   highspeed peer.
+
+3. Port resume always triggers an attached child device to resume. After a
+   power session is lost the device may have been removed, or need reset.
+   Resuming the child device when the parent port regains power resolves those
+   states and clamps the maximum port power cycle frequency at the rate the
+   child device can suspend (autosuspend-delay) and resume (reset-resume
+   latency).
+
+Sysfs files relevant for port power control:
+
+    ``<hubdev-portX>/power/pm_qos_no_power_off``:
+        This writable flag controls the state of an idle port.
+        Once all children and descendants have suspended the
+        port may suspend/poweroff provided that
+        pm_qos_no_power_off is '0'. If pm_qos_no_power_off is
+        '1' the port will remain active/powered regardless of
+        the status of descendants. Defaults to 1.
+
+    ``<hubdev-portX>/power/runtime_status``:
+        This file reflects whether the port is 'active' (power is on)
+        or 'suspended' (logically off). There is no indication to
+        userspace whether VBUS is still supplied.
+
+    ``<hubdev-portX>/connect_type``:
+        An advisory read-only flag to userspace indicating the
+        location and connection type of the port.
It returns
+        one of four values 'hotplug', 'hardwired', 'not used',
+        and 'unknown'. All values, besides unknown, are set by
+        platform firmware.
+
+        ``hotplug`` indicates an externally connectable/visible
+        port on the platform. Typically userspace would choose
+        to keep such a port powered to handle new device
+        connection events.
+
+        ``hardwired`` refers to a port that is not visible but
+        connectable. Examples are internal ports for USB
+        bluetooth that can be disconnected via an external
+        switch or a port with a hardwired USB camera. It is
+        expected to be safe to allow these ports to suspend
+        provided pm_qos_no_power_off is coordinated with any
+        switch that gates connections. Userspace must arrange
+        for the device to be connected prior to the port
+        powering off, or to activate the port prior to enabling
+        connection via a switch.
+
+        ``not used`` refers to an internal port that is expected
+        to never have a device connected to it. These may be
+        empty internal ports, or ports that are not physically
+        exposed on a platform. Considered safe to be
+        powered-off at all times.
+
+        ``unknown`` means platform firmware does not provide
+        information for this port. Most commonly refers to
+        external hub ports which should be considered 'hotplug'
+        for policy decisions.
+
+        .. note::
+
+            - since we are relying on the BIOS to get this ACPI
+              information correct, the USB port descriptions may
+              be missing or wrong.
+
+            - Take care in clearing ``pm_qos_no_power_off``. Once
+              power is off this port will not respond to new
+              connect events.
+
+        Once a child device is attached, additional constraints are
+        applied before the port is allowed to poweroff.
+
+    ``<child>/power/control``:
+        Must be ``auto``, and the port will not
+        power down until ``<child>/power/runtime_status``
+        reflects the 'suspended' state. The default
+        value is controlled by the child device driver.
+
+    ``<child>/power/persist``:
+        This defaults to ``1`` for most devices and indicates whether
+        the kernel can persist the device's configuration across a
+        power session loss (suspend / port-power event). When
+        this value is ``0`` (quirky devices), port poweroff is
+        disabled.
+
+    ``<child>/driver/unbind``:
+        Wakeup capable devices will block port poweroff. At
+        this time the only mechanism to clear the usb-internal
+        wakeup-capability for an interface device is to unbind
+        its driver.
+
+Summary of poweroff pre-requisite settings relative to a port device::
+
+    echo 0 > power/pm_qos_no_power_off
+    echo 0 > peer/power/pm_qos_no_power_off # if it exists
+    echo auto > power/control # this is the default value
+    echo auto > <child>/power/control
+    echo 1 > <child>/power/persist # this is the default value
+
+Suggested Userspace Port Power Policy
+-------------------------------------
+
+As noted above, userspace needs to be careful and deliberate about what
+ports are enabled for poweroff.
+
+The default configuration is that all ports start with
+``power/pm_qos_no_power_off`` set to ``1``, causing ports to always remain
+active.
+
+Given confidence in the platform firmware's description of the ports
+(ACPI _PLD record for a port populates 'connect_type') userspace can
+clear pm_qos_no_power_off for all 'not used' ports. The same can be
+done for 'hardwired' ports provided poweroff is coordinated with any
+connection switch for the port.
+
+A more aggressive userspace policy is to enable USB port power off for
+all ports (set ``<hubdev-portX>/power/pm_qos_no_power_off`` to ``0``) when
+some external factor indicates the user has stopped interacting with the
+system.
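+
+A rough sketch of such an agent's action, reusing the hub ``prefix``
+from the example above (the paths and the port glob are illustrative)::
+
+    # request poweroff for every port of this hub, and its peer if any
+    for port in $prefix/3-1:1.0/3-1-port*; do
+        echo 0 > $port/power/pm_qos_no_power_off
+        if [ -e $port/peer ]; then
+            echo 0 > $port/peer/power/pm_qos_no_power_off
+        fi
+    done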
For example, a distro may want to power off all USB
+ports when the screen blanks, and re-power them when the screen becomes
+active. Smart phones and tablets may want to power off USB ports when
+the user pushes the power button.
diff --git a/Documentation/driver-api/usb/usb.rst b/Documentation/driver-api/usb/usb.rst
new file mode 100644
index 0000000000000000000000000000000000000000..dba0f876b36f4dc1f52c412f82896972b36d208a
--- /dev/null
+++ b/Documentation/driver-api/usb/usb.rst
@@ -0,0 +1,1047 @@
+.. _usb-hostside-api:
+
+===========================
+The Linux-USB Host Side API
+===========================
+
+Introduction to USB on Linux
+============================
+
+A Universal Serial Bus (USB) is used to connect a host, such as a PC or
+workstation, to a number of peripheral devices. USB uses a tree
+structure, with the host as the root (the system's master), hubs as
+interior nodes, and peripherals as leaves (and slaves). Modern PCs
+support several such trees of USB devices, usually
+a few USB 3.0 (5 GBit/s) or USB 3.1 (10 GBit/s) and some legacy
+USB 2.0 (480 MBit/s) busses just in case.
+
+That master/slave asymmetry was designed-in for a number of reasons, one
+being ease of use. It is not physically possible to mistake upstream and
+downstream, or it does not matter with a Type-C plug (or the cable is
+built into the peripheral). Also, the host software doesn't need to deal
+with distributed auto-configuration since the pre-designated master node
+manages all that.
+
+Kernel developers added USB support to Linux early in the 2.2 kernel
+series and have been developing it further since then. Besides support
+for each new generation of USB, various host controllers gained support,
+new drivers for peripherals have been added, and advanced features for
+latency measurement and improved power management have been introduced.
+
+Linux can run inside USB devices as well as on the hosts that control
+the devices. But USB device drivers running inside those peripherals
+don't do the same things as the ones running inside hosts, so they've
+been given a different name: *gadget drivers*. This document does not
+cover gadget drivers.
+
+USB Host-Side API Model
+=======================
+
+Host-side drivers for USB devices talk to the "usbcore" APIs. There are
+two. One is intended for *general-purpose* drivers (exposed through
+driver frameworks), and the other is for drivers that are *part of the
+core*. Such core drivers include the *hub* driver (which manages trees
+of USB devices) and several different kinds of *host controller
+drivers*, which control individual busses.
+
+The device model seen by USB drivers is relatively complex.
+
+- USB supports four kinds of data transfers (control, bulk, interrupt,
+  and isochronous). Two of them (control and bulk) use bandwidth as
+  it's available, while the other two (interrupt and isochronous) are
+  scheduled to provide guaranteed bandwidth.
+
+- The device description model includes one or more "configurations"
+  per device, only one of which is active at a time. Devices are supposed
+  to be capable of operating at lower than their top
+  speeds and may provide a BOS descriptor showing the lowest speed they
+  remain fully operational at.
+
+- From USB 3.0 on, configurations have one or more "functions", which
+  provide a common functionality and are grouped together for purposes
+  of power management.
+
+- Configurations or functions have one or more "interfaces", each of which may have
+  "alternate settings".
Interfaces may be standardized by USB "Class"
+  specifications, or may be specific to a vendor or device.
+
+  USB device drivers actually bind to interfaces, not devices. Think of
+  them as "interface drivers", though you may not see many devices
+  where the distinction is important. *Most USB devices are simple,
+  with only one function, one configuration, one interface, and one alternate
+  setting.*
+
+- Interfaces have one or more "endpoints", each of which supports one
+  type and direction of data transfer such as "bulk out" or "interrupt
+  in". The entire configuration may have up to sixteen endpoints in
+  each direction, allocated as needed among all the interfaces.
+
+- Data transfer on USB is packetized; each endpoint has a maximum
+  packet size. Drivers must often be aware of conventions such as
+  flagging the end of bulk transfers using "short" (including zero
+  length) packets.
+
+- The Linux USB API supports synchronous calls for control and bulk
+  messages. It also supports asynchronous calls for all kinds of data
+  transfer, using request structures called "URBs" (USB Request
+  Blocks).
+
+Accordingly, the USB Core API exposed to device drivers covers quite a
+lot of territory. You'll probably need to consult the USB 3.0
+specification, available online from www.usb.org at no cost, as well as
+class or device specifications.
+
+The only host-side drivers that actually touch hardware (reading/writing
+registers, handling IRQs, and so on) are the HCDs. In theory, all HCDs
+provide the same functionality through the same API. In practice, that's
+becoming more true, but there are still differences
+that crop up especially with fault handling on the less common controllers.
+Different controllers don't
+necessarily report the same aspects of failures, and recovery from
+faults (including software-induced ones like unlinking an URB) isn't yet
+fully consistent. Device driver authors should make a point of doing
+disconnect testing (while the device is active) with each different host
+controller driver, to make sure drivers don't have bugs of their own as
+well as to make sure they aren't relying on some HCD-specific behavior.
+
+.. _usb_chapter9:
+
+USB-Standard Types
+==================
+
+In ``<linux/usb/ch9.h>`` you will find the USB data types defined in
+chapter 9 of the USB specification. These data types are used throughout
+USB, and in APIs including this host side API, gadget APIs, usb character
+devices and debugfs interfaces.
+
+.. kernel-doc:: include/linux/usb/ch9.h
+   :internal:
+
+.. _usb_header:
+
+Host-Side Data Types and Macros
+===============================
+
+The host side API exposes several layers to drivers, some of which are
+more necessary than others. These support lifecycle models for host side
+drivers and devices, and support passing buffers through usbcore to some
+HCD that performs the I/O for the device driver.
+
+.. kernel-doc:: include/linux/usb.h
+   :internal:
+
+USB Core APIs
+=============
+
+There are two basic I/O models in the USB API. The most elemental one is
+asynchronous: drivers submit requests in the form of an URB, and the
+URB's completion callback handles the next step. All USB transfer types
+support that model, although there are special cases for control URBs
+(which always have setup and status stages, but may not have a data
+stage) and isochronous URBs (which allow large packets and include
+per-packet fault reports).
Built on top of that is synchronous API
+support, where a driver calls a routine that allocates one or more URBs,
+submits them, and waits until they complete. There are synchronous
+wrappers for single-buffer control and bulk transfers (which are awkward
+to use in some driver disconnect scenarios), and for scatterlist based
+streaming i/o (bulk or interrupt).
+
+USB drivers need to provide buffers that can be used for DMA, although
+they don't necessarily need to provide the DMA mapping themselves. There
+are APIs to use when allocating DMA buffers, which can prevent use of
+bounce buffers on some systems. In some cases, drivers may be able to
+rely on 64bit DMA to eliminate another kind of bounce buffer.
+
+.. kernel-doc:: drivers/usb/core/urb.c
+   :export:
+
+.. kernel-doc:: drivers/usb/core/message.c
+   :export:
+
+.. kernel-doc:: drivers/usb/core/file.c
+   :export:
+
+.. kernel-doc:: drivers/usb/core/driver.c
+   :export:
+
+.. kernel-doc:: drivers/usb/core/usb.c
+   :export:
+
+.. kernel-doc:: drivers/usb/core/hub.c
+   :export:
+
+Host Controller APIs
+====================
+
+These APIs are only for use by host controller drivers, most of which
+implement standard register interfaces such as XHCI, EHCI, OHCI, or UHCI. UHCI
+was one of the first interfaces, designed by Intel and also used by VIA;
+it doesn't do much in hardware. OHCI was designed later, to have the
+hardware do more work (bigger transfers, tracking protocol state, and so
+on). EHCI was designed with USB 2.0; its design has features that
+resemble OHCI (hardware does much more work) as well as UHCI (some parts
+of ISO support, TD list processing). XHCI was designed with USB 3.0. It
+continues to shift support for functionality into hardware.
+
+There are host controllers other than the "big three", although most PCI
+based controllers (and a few non-PCI based ones) use one of those
+interfaces. Not all host controllers use DMA; some use PIO, and there is
+also a simulator and a virtual host controller to pipe USB over the network.
+
+The same basic APIs are available to drivers for all those controllers.
+For historical reasons they are in two layers: :c:type:`struct
+usb_bus <usb_bus>` is a rather thin layer that became available
+in the 2.2 kernels, while :c:type:`struct usb_hcd <usb_hcd>`
+is a more featureful layer
+that lets HCDs share common code, to shrink driver size and
+significantly reduce hcd-specific behaviors.
+
+.. kernel-doc:: drivers/usb/core/hcd.c
+   :export:
+
+.. kernel-doc:: drivers/usb/core/hcd-pci.c
+   :export:
+
+.. kernel-doc:: drivers/usb/core/buffer.c
+   :internal:
+
+The USB character device nodes
+==============================
+
+This chapter presents the Linux character device nodes. You may prefer
+to avoid writing new kernel code for your USB driver. User mode device
+drivers are usually packaged as applications or libraries, and may use
+character devices through some programming library that wraps it.
+Such libraries include:
+
+ - `libusb <http://libusb.sourceforge.net>`__ for C/C++, and
+ - `jUSB <http://jusb.sourceforge.net>`__ for Java.
+
+Some old information about it can be seen at the "USB Device Filesystem"
+section of the USB Guide. The latest copy of the USB Guide can be found
+at http://www.linux-usb.org/
+
+.. note::
+
+  - They used to be implemented via *usbfs*, but this is not part of
+    the sysfs debug interface.
+
+  - This particular documentation is incomplete, especially with respect
+    to the asynchronous mode. As of kernel 2.5.66 the code and this
+    (new) documentation need to be cross-reviewed.
+
+What files are in "devtmpfs"?
+-----------------------------
+
+Conventionally mounted at ``/dev/bus/usb/``, usbfs features include:
+
+- ``/dev/bus/usb/BBB/DDD`` ... magic files exposing each device's
+  configuration descriptors, and supporting a series of ioctls for
+  making device requests, including I/O to devices. (Purely for access
+  by programs.)
+
+Each bus is given a number (``BBB``) based on when it was enumerated; within
+each bus, each device is given a similar number (``DDD``). Those ``BBB/DDD``
+paths are not "stable" identifiers; expect them to change even if you
+always leave the devices plugged in to the same hub port. *Don't even
+think of saving these in application configuration files.* Stable
+identifiers are available, for user mode applications that want to use
+them. HID and networking devices expose these stable IDs, so that for
+example you can be sure that you told the right UPS to power down its
+second server. Please note that it doesn't (yet) expose those IDs.
+
+/dev/bus/usb/BBB/DDD
+--------------------
+
+Use these files in one of these basic ways:
+
+- *They can be read,* producing first the device descriptor (18 bytes) and
+  then the descriptors for the current configuration. See the USB 2.0 spec
+  for details about those binary data formats. You'll need to convert most
+  multibyte values from little endian format to your native host byte
+  order, although a few of the fields in the device descriptor (both of
+  the BCD-encoded fields, and the vendor and product IDs) will be
+  byteswapped for you. Note that configuration descriptors include
+  descriptors for interfaces, altsettings, endpoints, and maybe additional
+  class descriptors.
+
+- *Perform USB operations* using *ioctl()* requests to make endpoint I/O
+  requests (synchronously or asynchronously) or manage the device. These
+  requests need the ``CAP_SYS_RAWIO`` capability, as well as filesystem
+  access permissions. Only one ioctl request can be made on one of these
+  device files at a time. This means that if you are synchronously reading
+  an endpoint from one thread, you won't be able to write to a different
+  endpoint from another thread until the read completes. This works for
+  *half duplex* protocols, but otherwise you'd use asynchronous i/o
+  requests.
+
+Each connected USB device has one file. The ``BBB`` indicates the bus
+number. The ``DDD`` indicates the device address on that bus. Both
+of these numbers are assigned sequentially, and can be reused, so
+you can't rely on them for stable access to devices. For example,
+it's relatively common for devices to re-enumerate while they are
+still connected (perhaps someone jostled their power supply, hub,
+or USB cable), so a device might be ``002/027`` when you first connect
+it and ``002/048`` sometime later.
+
+These files can be read as binary data. The binary data consists
+of first the device descriptor, then the descriptors for each
+configuration of the device. Multi-byte fields in the device descriptor
+are converted to host endianness by the kernel. The configuration
+descriptors are in bus endian format! The configuration descriptors
+are wTotalLength bytes apart. If a device returns less configuration
+descriptor data than indicated by wTotalLength there will be a hole in
+the file for the missing bytes. This information is also shown
+in text form by the ``/sys/kernel/debug/usb/devices`` file, described later.
+
+These files may also be used to write user-level drivers for the USB
+devices.
You would open the ``/dev/bus/usb/BBB/DDD`` file read/write,
+read its descriptors to make sure it's the device you expect, and then
+bind to an interface (or perhaps several) using an ioctl call. You
+would issue more ioctls to the device to communicate to it using
+control, bulk, or other kinds of USB transfers. The IOCTLs are
+listed in the ``<linux/usbdevice_fs.h>`` file, and at this writing the
+source code (``linux/drivers/usb/core/devio.c``) is the primary reference
+for how to access devices through those files.
+
+Note that since by default these ``BBB/DDD`` files are writable only by
+root, only root can write such user mode drivers. You can selectively
+grant read/write permissions to other users by using ``chmod``. Also,
+usbfs mount options such as ``devmode=0666`` may be helpful.
+
+
+Life Cycle of User Mode Drivers
+-------------------------------
+
+Such a driver first needs to find a device file for a device it knows
+how to handle. Maybe it was told about it because a ``/sbin/hotplug``
+event handling agent chose that driver to handle the new device. Or
+maybe it's an application that scans all the ``/dev/bus/usb`` device files,
+and ignores most devices. In either case, it should :c:func:`read()`
+all the descriptors from the device file, and check them against what it
+knows how to handle. It might just reject everything except a particular
+vendor and product ID, or need a more complex policy.
+
+Never assume there will only be one such device on the system at a time!
+If your code can't handle more than one device at a time, at least
+detect when there's more than one, and have your users choose which
+device to use.
+
+Once your user mode driver knows what device to use, it interacts with
+it in either of two styles. The simple style is to make only control
+requests; some devices don't need more complex interactions than those.
+(An example might be software using vendor-specific control requests for
+some initialization or configuration tasks, with a kernel driver for the
+rest.)
+
+More likely, you need a more complex style driver: one using non-control
+endpoints, reading or writing data and claiming exclusive use of an
+interface. *Bulk* transfers are easiest to use, but only their sibling
+*interrupt* transfers work with low speed devices. Both interrupt and
+*isochronous* transfers offer service guarantees because their bandwidth
+is reserved. Such "periodic" transfers are awkward to use through usbfs,
+unless you're using the asynchronous calls. However, interrupt transfers
+can also be used in a synchronous "one shot" style.
+
+Your user-mode driver should never need to worry about cleaning up
+request state when the device is disconnected, although it should close
+its open file descriptors as soon as it starts seeing the ENODEV errors.
+
+The ioctl() Requests
+--------------------
+
+To use these ioctls, you need to include the following headers in your
+userspace program::
+
+    #include <linux/usbdevice_fs.h>
+    #include <asm/byteorder.h>
+    #include <sys/ioctl.h>
+
+The standard USB device model requests, from "Chapter 9" of the USB 2.0
+specification, are automatically included from the ``<linux/usb/ch9.h>``
+header.
+
+Unless noted otherwise, the ioctl requests described here will update
+the modification time on the usbfs file to which they are applied
+(unless they fail). A return of zero indicates success; otherwise, a
+standard USB error code is returned (these are documented in
+:ref:`usb-error-codes`).
+
+Each of these files multiplexes access to several I/O streams, one per
+endpoint.
Each device has one control endpoint (endpoint zero) which
+supports a limited RPC-style access. Devices are configured by
+hub_wq (in the kernel) setting a device-wide *configuration* that
+affects things like power consumption and basic functionality. The
+endpoints are part of USB *interfaces*, which may have *altsettings*
+affecting things like which endpoints are available. Many devices only
+have a single configuration and interface, so drivers for them will
+ignore configurations and altsettings.
+
+Management/Status Requests
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A number of usbfs requests don't deal very directly with device I/O.
+They mostly relate to device management and status. These are all
+synchronous requests.
+
+USBDEVFS_CLAIMINTERFACE
+    This is used to force usbfs to claim a specific interface, which has
+    not previously been claimed by usbfs or any other kernel driver. The
+    ioctl parameter is an integer holding the number of the interface
+    (bInterfaceNumber from descriptor).
+
+    Note that if your driver doesn't claim an interface before trying to
+    use one of its endpoints, and no other driver has bound to it, then
+    the interface is automatically claimed by usbfs.
+
+    This claim will be released by a RELEASEINTERFACE ioctl, or by
+    closing the file descriptor. File modification time is not updated
+    by this request.
+
+USBDEVFS_CONNECTINFO
+    Says whether the device is lowspeed. The ioctl parameter points to a
+    structure like this::
+
+        struct usbdevfs_connectinfo {
+            unsigned int devnum;
+            unsigned char slow;
+        };
+
+    File modification time is not updated by this request.
+
+    *You can't tell whether a "not slow" device is connected at high
+    speed (480 MBit/sec) or just full speed (12 MBit/sec).* You should
+    know the devnum value already, it's the DDD value of the device file
+    name.
+
+USBDEVFS_GETDRIVER
+    Returns the name of the kernel driver bound to a given interface (a
+    string). Parameter is a pointer to this structure, which is
+    modified::
+
+        struct usbdevfs_getdriver {
+            unsigned int interface;
+            char driver[USBDEVFS_MAXDRIVERNAME + 1];
+        };
+
+    File modification time is not updated by this request.
+
+USBDEVFS_IOCTL
+    Passes a request from userspace through to a kernel driver that has
+    an ioctl entry in the *struct usb_driver* it registered::
+
+        struct usbdevfs_ioctl {
+            int ifno;
+            int ioctl_code;
+            void *data;
+        };
+
+        /* user mode call looks like this.
+         * 'request' becomes the driver->ioctl() 'code' parameter.
+         * the size of 'param' is encoded in 'request', and that data
+         * is copied to or from the driver->ioctl() 'buf' parameter.
+         */
+        static int
+        usbdev_ioctl (int fd, int ifno, unsigned request, void *param)
+        {
+            struct usbdevfs_ioctl wrapper;
+
+            wrapper.ifno = ifno;
+            wrapper.ioctl_code = request;
+            wrapper.data = param;
+
+            return ioctl (fd, USBDEVFS_IOCTL, &wrapper);
+        }
+
+    File modification time is not updated by this request.
+
+    This request lets kernel drivers talk to user mode code through
+    filesystem operations even when they don't create a character or
+    block special device. It's also been used to do things like ask
+    devices what device special file should be used. Two pre-defined
+    ioctls are used to disconnect and reconnect kernel drivers, so that
+    user mode code can completely manage binding and configuration of
+    devices.
+
+USBDEVFS_RELEASEINTERFACE
+    This is used to release the claim usbfs made on interface, either
+    implicitly or because of a USBDEVFS_CLAIMINTERFACE call, before the
+    file descriptor is closed.
The ioctl parameter is an integer holding
+    the number of the interface (bInterfaceNumber from descriptor); File
+    modification time is not updated by this request.
+
+    .. warning::
+
+        *No security check is made to ensure that the task which made
+        the claim is the one which is releasing it. This means that one
+        user mode driver may interfere with others.*
+
+USBDEVFS_RESETEP
+    Resets the data toggle value for an endpoint (bulk or interrupt) to
+    DATA0. The ioctl parameter is an integer endpoint number (1 to 15,
+    as identified in the endpoint descriptor), with USB_DIR_IN added
+    if the device's endpoint sends data to the host.
+
+    .. Warning::
+
+        *Avoid using this request. It should probably be removed.* Using
+        it typically means the device and driver will lose toggle
+        synchronization. If you really lost synchronization, you likely
+        need to completely handshake with the device, using a request
+        like CLEAR_HALT or SET_INTERFACE.
+
+USBDEVFS_DROP_PRIVILEGES
+    This is used to relinquish the ability to do certain operations
+    which are considered to be privileged on a usbfs file descriptor.
+    This includes claiming arbitrary interfaces, resetting a device on
+    which there are currently claimed interfaces from other users, and
+    issuing USBDEVFS_IOCTL calls. The ioctl parameter is a 32 bit mask
+    of interfaces the user is allowed to claim on this file descriptor.
+    You may issue this ioctl more than one time to narrow said mask.
+
+Synchronous I/O Support
+~~~~~~~~~~~~~~~~~~~~~~~
+
+Synchronous requests involve the kernel blocking until the user mode
+request completes, either by finishing successfully or by reporting an
+error. In most cases this is the simplest way to use usbfs, although as
+noted above it does prevent performing I/O to more than one endpoint at
+a time.
+
+USBDEVFS_BULK
+    Issues a bulk read or write request to the device. The ioctl
+    parameter is a pointer to this structure::
+
+        struct usbdevfs_bulktransfer {
+            unsigned int ep;
+            unsigned int len;
+            unsigned int timeout; /* in milliseconds */
+            void *data;
+        };
+
+    The ``ep`` value identifies a bulk endpoint number (1 to 15, as
+    identified in an endpoint descriptor), masked with USB_DIR_IN when
+    referring to an endpoint which sends data to the host from the
+    device. The length of the data buffer is identified by ``len``; recent
+    kernels support requests up to about 128KBytes. *FIXME: say how read
+    length is returned, and how short reads are handled.*
+
+USBDEVFS_CLEAR_HALT
+    Clears endpoint halt (stall) and resets the endpoint toggle. This is
+    only meaningful for bulk or interrupt endpoints. The ioctl parameter
+    is an integer endpoint number (1 to 15, as identified in an endpoint
+    descriptor), masked with USB_DIR_IN when referring to an endpoint
+    which sends data to the host from the device.
+
+    Use this on bulk or interrupt endpoints which have stalled,
+    returning ``-EPIPE`` status to a data transfer request. Do not issue
+    the control request directly, since that could invalidate the host's
+    record of the data toggle.
+
+USBDEVFS_CONTROL
+    Issues a control request to the device. The ioctl parameter points
+    to a structure like this::
+
+        struct usbdevfs_ctrltransfer {
+            __u8 bRequestType;
+            __u8 bRequest;
+            __u16 wValue;
+            __u16 wIndex;
+            __u16 wLength;
+            __u32 timeout; /* in milliseconds */
+            void *data;
+        };
+
+    The first eight bytes of this structure are the contents of the
+    SETUP packet to be sent to the device; see the USB 2.0 specification
+    for details.
The bRequestType value is composed by combining a + ``USB_TYPE_*`` value, a ``USB_DIR_*`` value, and a ``USB_RECIP_*`` + value (from ``linux/usb.h``). If wLength is nonzero, it describes + the length of the data buffer, which is either written to the device + (USB_DIR_OUT) or read from the device (USB_DIR_IN). + + At this writing, you can't transfer more than 4 KBytes of data to or + from a device; usbfs has a limit, and some host controller drivers + have a limit. (That's not usually a problem.) *Also* there's no way + to say it's not OK to get a short read back from the device. + +USBDEVFS_RESET + Does a USB level device reset. The ioctl parameter is ignored. After + the reset, this rebinds all device interfaces. File modification + time is not updated by this request. + +.. warning:: + + *Avoid using this call* until some usbcore bugs get fixed, since + it does not fully synchronize device, interface, and driver (not + just usbfs) state. + +USBDEVFS_SETINTERFACE + Sets the alternate setting for an interface. The ioctl parameter is + a pointer to a structure like this:: + + struct usbdevfs_setinterface { + unsigned int interface; + unsigned int altsetting; + }; + + File modification time is not updated by this request. + + Those struct members are from some interface descriptor applying to + the current configuration. The interface number is the + bInterfaceNumber value, and the altsetting number is the + bAlternateSetting value. (This resets each endpoint in the + interface.) + +USBDEVFS_SETCONFIGURATION + Issues the :c:func:`usb_set_configuration()` call for the + device. The parameter is an integer holding the number of a + configuration (bConfigurationValue from descriptor). File + modification time is not updated by this request. + +.. warning:: + + *Avoid using this call* until some usbcore bugs get fixed, since + it does not fully synchronize device, interface, and driver (not + just usbfs) state. + +Asynchronous I/O Support +~~~~~~~~~~~~~~~~~~~~~~~~ + +As mentioned above, there are situations where it may be important to +initiate concurrent operations from user mode code. This is particularly +important for periodic transfers (interrupt and isochronous), but it can +be used for other kinds of USB requests too. In such cases, the +asynchronous requests described here are essential. Rather than +submitting one request and having the kernel block until it completes, +the blocking is separate. + +These requests are packaged into a structure that resembles the URB used +by kernel device drivers. (No POSIX Async I/O support here, sorry.) It +identifies the endpoint type (``USBDEVFS_URB_TYPE_*``), endpoint +(number, masked with USB_DIR_IN as appropriate), buffer and length, +and a user "context" value serving to uniquely identify each request. +(It's usually a pointer to per-request data.) Flags can modify requests +(not as many as supported for kernel drivers). + +Each request can specify a realtime signal number (between SIGRTMIN and +SIGRTMAX, inclusive) to request a signal be sent when the request +completes. + +When usbfs returns these urbs, the status value is updated, and the +buffer may have been modified. 
Except for isochronous transfers, the
+actual_length is updated to say how many bytes were transferred; if the
+USBDEVFS_URB_DISABLE_SPD flag is set ("short packets are not OK") and
+fewer bytes were read than were requested, you get an error report::
+
+    struct usbdevfs_iso_packet_desc {
+        unsigned int length;
+        unsigned int actual_length;
+        unsigned int status;
+    };
+
+    struct usbdevfs_urb {
+        unsigned char type;
+        unsigned char endpoint;
+        int status;
+        unsigned int flags;
+        void *buffer;
+        int buffer_length;
+        int actual_length;
+        int start_frame;
+        int number_of_packets;
+        int error_count;
+        unsigned int signr;
+        void *usercontext;
+        struct usbdevfs_iso_packet_desc iso_frame_desc[];
+    };
+
+For these asynchronous requests, the file modification time reflects
+when the request was initiated. This contrasts with their use with the
+synchronous requests, where it reflects when requests complete.
+
+USBDEVFS_DISCARDURB
+    *TBS* File modification time is not updated by this request.
+
+USBDEVFS_DISCSIGNAL
+    *TBS* File modification time is not updated by this request.
+
+USBDEVFS_REAPURB
+    *TBS* File modification time is not updated by this request.
+
+USBDEVFS_REAPURBNDELAY
+    *TBS* File modification time is not updated by this request.
+
+USBDEVFS_SUBMITURB
+    *TBS*
+
+The USB devices
+===============
+
+The USB devices are now exported via debugfs:
+
+- ``/sys/kernel/debug/usb/devices`` ... a text file showing each of the USB
+  devices known to the kernel, and their configuration descriptors.
+  You can also poll() this to learn about new devices.
+
+/sys/kernel/debug/usb/devices
+-----------------------------
+
+This file is handy for status viewing tools in user mode, which can scan
+the text format and ignore most of it. More detailed device status
+(including class and vendor status) is available from device-specific
+files. For information about the current format of this file, see the
+``Documentation/usb/proc_usb_info.txt`` file in your Linux kernel
+sources.
+
+This file, in combination with the poll() system call, can also be used
+to detect when devices are added or removed::
+
+    int fd;
+    struct pollfd pfd;
+
+    fd = open("/sys/kernel/debug/usb/devices", O_RDONLY);
+    pfd.fd = fd;
+    pfd.events = POLLIN;
+    pfd.revents = 0;
+    for (;;) {
+        /* The first time through, this call will return immediately. */
+        poll(&pfd, 1, -1);
+
+        /* To see what's changed, compare the file's previous and current
+           contents or scan the filesystem. (Scanning is more precise.) */
+    }
+
+Note that this behavior is intended to be used for informational and
+debug purposes. It would be more appropriate to use programs such as
+udev or HAL to initialize a device or start a user-mode helper program,
+for instance.
+
+In this file, each device's output has multiple lines of ASCII output.
+
+I made it ASCII instead of binary on purpose, so that someone
+can obtain some useful data from it without the use of an
+auxiliary program. However, with an auxiliary program, the numbers
+in the first 4 columns of each ``T:`` line (topology info:
+Lev, Prnt, Port, Cnt) can be used to build a USB topology diagram.
+
+Each line is tagged with a one-character ID for that line::
+
+    T = Topology (etc.)
+    B = Bandwidth (applies only to USB host controllers, which are
+        virtualized as root hubs)
+    D = Device descriptor info.
+    P = Product ID info. (from Device descriptor, but they won't fit
+        together on one line)
+    S = String descriptors.
+    C = Configuration descriptor info. (* = active configuration)
+    I = Interface descriptor info.
+    E = Endpoint descriptor info.
+
+/sys/kernel/debug/usb/devices output format
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Legend::
+
+    d = decimal number (may have leading spaces or 0's)
+    x = hexadecimal number (may have leading spaces or 0's)
+    s = string
+
+
+Topology info
+^^^^^^^^^^^^^
+
+::
+
+    T: Bus=dd Lev=dd Prnt=dd Port=dd Cnt=dd Dev#=ddd Spd=dddd MxCh=dd
+    |  |      |      |       |       |      |        |        |__MaxChildren
+    |  |      |      |       |       |      |        |__Device Speed in Mbps
+    |  |      |      |       |       |      |__DeviceNumber
+    |  |      |      |       |       |__Count of devices at this level
+    |  |      |      |       |__Connector/Port on Parent for this device
+    |  |      |      |__Parent DeviceNumber
+    |  |      |__Level in topology for this bus
+    |  |__Bus number
+    |__Topology info tag
+
+Speed may be:
+
+    ======= ======================================================
+    1.5     Mbit/s for low speed USB
+    12      Mbit/s for full speed USB
+    480     Mbit/s for high speed USB (added for USB 2.0);
+            also used for Wireless USB, which has no fixed speed
+    5000    Mbit/s for SuperSpeed USB (added for USB 3.0)
+    ======= ======================================================
+
+For reasons lost in the mists of time, the Port number is always
+too low by 1. For example, a device plugged into port 4 will
+show up with ``Port=03``.
+
+Bandwidth info
+^^^^^^^^^^^^^^
+
+::
+
+    B: Alloc=ddd/ddd us (xx%), #Int=ddd, #Iso=ddd
+    |  |                       |         |__Number of isochronous requests
+    |  |                       |__Number of interrupt requests
+    |  |__Total Bandwidth allocated to this bus
+    |__Bandwidth info tag
+
+Bandwidth allocation is an approximation of how much of one frame
+(millisecond) is in use. It reflects only periodic transfers, which
+are the only transfers that reserve bandwidth. Control and bulk
+transfers use all other bandwidth, including reserved bandwidth that
+is not used for transfers (such as for short packets).
+
+The percentage is how much of the "reserved" bandwidth is scheduled by
+those transfers. For a low or full speed bus (loosely, "USB 1.1"),
+90% of the bus bandwidth is reserved. For a high speed bus (loosely,
+"USB 2.0") 80% is reserved.
+
+
+Device descriptor info & Product ID info
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+    D: Ver=x.xx Cls=xx(s) Sub=xx Prot=xx MxPS=dd #Cfgs=dd
+    P: Vendor=xxxx ProdID=xxxx Rev=xx.xx
+
+where::
+
+    D: Ver=x.xx Cls=xx(sssss) Sub=xx Prot=xx MxPS=dd #Cfgs=dd
+    |  |        |             |      |       |       |__NumberConfigurations
+    |  |        |             |      |       |__MaxPacketSize of Default Endpoint
+    |  |        |             |      |__DeviceProtocol
+    |  |        |             |__DeviceSubClass
+    |  |        |__DeviceClass
+    |  |__Device USB version
+    |__Device info tag #1
+
+where::
+
+    P: Vendor=xxxx ProdID=xxxx Rev=xx.xx
+    |  |           |           |__Product revision number
+    |  |           |__Product ID code
+    |  |__Vendor ID code
+    |__Device info tag #2
+
+
+String descriptor info
+^^^^^^^^^^^^^^^^^^^^^^
+::
+
+    S: Manufacturer=ssss
+    |  |__Manufacturer of this device as read from the device.
+    |     For USB host controller drivers (virtual root hubs) this may
+    |     be omitted, or (for newer drivers) will identify the kernel
+    |     version and the driver which provides this hub emulation.
+    |__String info tag
+
+    S: Product=ssss
+    |  |__Product description of this device as read from the device.
+    |     For older USB host controller drivers (virtual root hubs) this
+    |     indicates the driver; for newer ones, it's a product (and vendor)
+    |     description that often comes from the kernel's PCI ID database.
+    |__String info tag
+
+    S: SerialNumber=ssss
+    |  |__Serial Number of this device as read from the device.
+    |     For USB host controller drivers (virtual root hubs) this is
+    |     some unique ID, normally a bus ID (address or slot name) that
+    |     can't be shared with any other device.
+    |__String info tag
+
+
+
+Configuration descriptor info
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+::
+
+    C:* #Ifs=dd Cfg#=dd Atr=xx MxPwr=dddmA
+    | | |       |       |      |__MaxPower in mA
+    | | |       |       |__Attributes
+    | | |       |__ConfigurationNumber
+    | | |__NumberOfInterfaces
+    | |__ "*" indicates the active configuration (others are " ")
+    |__Config info tag
+
+USB devices may have multiple configurations, each of which acts
+rather differently. For example, a bus-powered configuration
+might be much less capable than one that is self-powered. Only
+one device configuration can be active at a time; most devices
+have only one configuration.
+
+Each configuration consists of one or more interfaces. Each
+interface serves a distinct "function", which is typically bound
+to a different USB device driver. One common example is a USB
+speaker with an audio interface for playback, and a HID interface
+for use with software volume control.
+
+Interface descriptor info (can be multiple per Config)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+::
+
+    I:* If#=dd Alt=dd #EPs=dd Cls=xx(sssss) Sub=xx Prot=xx Driver=ssss
+    | | |      |      |       |             |      |       |__Driver name
+    | | |      |      |       |             |      |          or "(none)"
+    | | |      |      |       |             |      |__InterfaceProtocol
+    | | |      |      |       |             |__InterfaceSubClass
+    | | |      |      |       |__InterfaceClass
+    | | |      |      |__NumberOfEndpoints
+    | | |      |__AlternateSettingNumber
+    | | |__InterfaceNumber
+    | |__ "*" indicates the active altsetting (others are " ")
+    |__Interface info tag
+
+A given interface may have one or more "alternate" settings.
+For example, default settings may not use more than a small
+amount of periodic bandwidth. To use significant fractions
+of bus bandwidth, drivers must select a non-default altsetting.
+
+Only one setting for an interface may be active at a time, and
+only one driver may bind to an interface at a time. Most devices
+have only one alternate setting per interface.
+
+
+Endpoint descriptor info (can be multiple per Interface)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+::
+
+    E: Ad=xx(s) Atr=xx(ssss) MxPS=dddd Ivl=dddss
+    |  |        |            |         |__Interval (max) between transfers
+    |  |        |            |__EndpointMaxPacketSize
+    |  |        |__Attributes(EndpointType)
+    |  |__EndpointAddress(I=In,O=Out)
+    |__Endpoint info tag
+
+The interval is nonzero for all periodic (interrupt or isochronous)
+endpoints. For high speed endpoints the transfer interval may be
+measured in microseconds rather than milliseconds.
+
+For high speed periodic endpoints, the ``EndpointMaxPacketSize`` reflects
+the per-microframe data transfer size. For "high bandwidth"
+endpoints, that can reflect two or three packets (for up to
+3KBytes every 125 usec) per endpoint.
+
+With the Linux-USB stack, periodic bandwidth reservations use the
+transfer intervals and sizes provided by URBs, which can be less
+than those found in the endpoint descriptor.
+
+Usage examples
+~~~~~~~~~~~~~~
+
+If a user or script is interested only in Topology info, for
+example, use something like ``grep ^T: /sys/kernel/debug/usb/devices``
+for only the Topology lines. A command like
+``grep -i ^[tdp]: /sys/kernel/debug/usb/devices`` can be used to list
+only the lines that begin with the characters in square brackets,
+where the valid characters are TDPCIE. With a slightly more able
+script, it can display any selected lines (for example, only T, D,
+and P lines) and change their output format.
(The ``procusb``
+Perl script is the beginning of this idea. It will list only
+selected lines [selected from TBDPSCIE] or "All" lines from
+``/sys/kernel/debug/usb/devices``.)
+
+The Topology lines can be used to generate a graphic/pictorial
+of the USB devices on a system's root hub. (See more below
+on how to do this.)
+
+The Interface lines can be used to determine what driver is
+being used for each device, and which altsetting it activated.
+
+The Configuration lines could be used to list the maximum power
+(in milliamps) that a system's USB devices are using.
+For example, ``grep ^C: /sys/kernel/debug/usb/devices``.
+
+
+Here's an example, from a system which has a UHCI root hub,
+an external hub connected to the root hub, and a mouse and
+a serial converter connected to the external hub.
+
+::
+
+    T: Bus=00 Lev=00 Prnt=00 Port=00 Cnt=00 Dev#= 1 Spd=12 MxCh= 2
+    B: Alloc= 28/900 us ( 3%), #Int= 2, #Iso= 0
+    D: Ver= 1.00 Cls=09(hub ) Sub=00 Prot=00 MxPS= 8 #Cfgs= 1
+    P: Vendor=0000 ProdID=0000 Rev= 0.00
+    S: Product=USB UHCI Root Hub
+    S: SerialNumber=dce0
+    C:* #Ifs= 1 Cfg#= 1 Atr=40 MxPwr= 0mA
+    I: If#= 0 Alt= 0 #EPs= 1 Cls=09(hub ) Sub=00 Prot=00 Driver=hub
+    E: Ad=81(I) Atr=03(Int.) MxPS= 8 Ivl=255ms
+
+    T: Bus=00 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=12 MxCh= 4
+    D: Ver= 1.00 Cls=09(hub ) Sub=00 Prot=00 MxPS= 8 #Cfgs= 1
+    P: Vendor=0451 ProdID=1446 Rev= 1.00
+    C:* #Ifs= 1 Cfg#= 1 Atr=e0 MxPwr=100mA
+    I: If#= 0 Alt= 0 #EPs= 1 Cls=09(hub ) Sub=00 Prot=00 Driver=hub
+    E: Ad=81(I) Atr=03(Int.) MxPS= 1 Ivl=255ms
+
+    T: Bus=00 Lev=02 Prnt=02 Port=00 Cnt=01 Dev#= 3 Spd=1.5 MxCh= 0
+    D: Ver= 1.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 8 #Cfgs= 1
+    P: Vendor=04b4 ProdID=0001 Rev= 0.00
+    C:* #Ifs= 1 Cfg#= 1 Atr=80 MxPwr=100mA
+    I: If#= 0 Alt= 0 #EPs= 1 Cls=03(HID ) Sub=01 Prot=02 Driver=mouse
+    E: Ad=81(I) Atr=03(Int.) MxPS= 3 Ivl= 10ms
+
+    T: Bus=00 Lev=02 Prnt=02 Port=02 Cnt=02 Dev#= 4 Spd=12 MxCh= 0
+    D: Ver= 1.00 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 8 #Cfgs= 1
+    P: Vendor=0565 ProdID=0001 Rev= 1.08
+    S: Manufacturer=Peracom Networks, Inc.
+    S: Product=Peracom USB to Serial Converter
+    C:* #Ifs= 1 Cfg#= 1 Atr=a0 MxPwr=100mA
+    I: If#= 0 Alt= 0 #EPs= 3 Cls=00(>ifc ) Sub=00 Prot=00 Driver=serial
+    E: Ad=81(I) Atr=02(Bulk) MxPS= 64 Ivl= 16ms
+    E: Ad=01(O) Atr=02(Bulk) MxPS= 16 Ivl= 16ms
+    E: Ad=82(I) Atr=03(Int.) MxPS= 8 Ivl= 8ms
+
+
+Selecting only the ``T:`` and ``I:`` lines from this (for example, by using
+``procusb ti``), we have
+
+::
+
+    T: Bus=00 Lev=00 Prnt=00 Port=00 Cnt=00 Dev#= 1 Spd=12 MxCh= 2
+    T: Bus=00 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=12 MxCh= 4
+    I: If#= 0 Alt= 0 #EPs= 1 Cls=09(hub ) Sub=00 Prot=00 Driver=hub
+    T: Bus=00 Lev=02 Prnt=02 Port=00 Cnt=01 Dev#= 3 Spd=1.5 MxCh= 0
+    I: If#= 0 Alt= 0 #EPs= 1 Cls=03(HID ) Sub=01 Prot=02 Driver=mouse
+    T: Bus=00 Lev=02 Prnt=02 Port=02 Cnt=02 Dev#= 4 Spd=12 MxCh= 0
+    I: If#= 0 Alt= 0 #EPs= 3 Cls=00(>ifc ) Sub=00 Prot=00 Driver=serial
+
+
+Physically this looks like (or could be converted to)::
+
+                    +------------------+
+                    |  PC/root_hub (12)|   Dev# = 1
+                    +------------------+   (nn) is Mbps.
+    Level 0         |  CN.0   |  CN.1  |   [CN = connector/port #]
+                    +------------------+
+                        /
+                       /
+            +-----------------------+
+    Level 1 | Dev#2: 4-port hub (12)|
+            +-----------------------+
+            |CN.0 |CN.1 |CN.2 |CN.3 |
+            +-----------------------+
+               \          \___________________
+                \_____                        \
+                      \                        \
+            +--------------------+    +--------------------+
+    Level 2 | Dev# 3: mouse (1.5)|    | Dev# 4: serial (12)|
+            +--------------------+    +--------------------+
+
+
+Or, in a more tree-like structure (ports [Connectors] without
+connections could be omitted)::
+
+    PC:  Dev# 1, root hub, 2 ports, 12 Mbps
+    |_ CN.0:  Dev# 2, hub, 4 ports, 12 Mbps
+       |_ CN.0:  Dev #3, mouse, 1.5 Mbps
+       |_ CN.1:
+       |_ CN.2:  Dev #4, serial, 12 Mbps
+       |_ CN.3:
+    |_ CN.1:
diff --git a/Documentation/driver-api/usb/writing_musb_glue_layer.rst b/Documentation/driver-api/usb/writing_musb_glue_layer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e90e8fa95600fe0eb2a2260fb0c6ac1723762652
--- /dev/null
+++ b/Documentation/driver-api/usb/writing_musb_glue_layer.rst
@@ -0,0 +1,723 @@
+=========================
+Writing a MUSB Glue Layer
+=========================
+
+:Author: Apelete Seketeli
+
+Introduction
+============
+
+The Linux MUSB subsystem is part of the larger Linux USB subsystem. It
+provides support for embedded USB Device Controllers (UDC) that do not
+use Universal Host Controller Interface (UHCI) or Open Host Controller
+Interface (OHCI).
+
+Instead, these embedded UDCs rely on the USB On-the-Go (OTG)
+specification which they implement at least partially. The silicon
+reference design used in most cases is the Multipoint USB Highspeed
+Dual-Role Controller (MUSB HDRC) found in the Mentor Graphics Inventra™
+design.
+
+As a self-taught exercise I have written an MUSB glue layer for the
+Ingenic JZ4740 SoC, modelled after the many MUSB glue layers in the
+kernel source tree. This layer can be found at
+``drivers/usb/musb/jz4740.c``. In this documentation I will walk through the
+basics of the ``jz4740.c`` glue layer, explaining the different pieces and
+what needs to be done in order to write your own device glue layer.
+
+.. _musb-basics:
+
+Linux MUSB Basics
+=================
+
+To get started on the topic, please read USB On-the-Go Basics (see
+Resources) which provides an introduction of USB OTG operation at the
+hardware level. A couple of wiki pages by Texas Instruments and Analog
+Devices also provide an overview of the Linux kernel MUSB configuration,
+albeit focused on some specific devices provided by these companies.
+Finally, getting acquainted with the USB specification at the USB home
+page may come in handy, with a practical instance provided through the
+Writing USB Device Drivers documentation (again, see Resources).
+
+The Linux USB stack is a layered architecture in which the MUSB
+controller hardware sits at the lowest level.
The MUSB controller driver abstracts the +MUSB controller hardware to the Linux USB stack:: + + ------------------------ + | | <------- drivers/usb/gadget + | Linux USB Core Stack | <------- drivers/usb/host + | | <------- drivers/usb/core + ------------------------ + ⬍ + -------------------------- + | | <------ drivers/usb/musb/musb_gadget.c + | MUSB Controller driver | <------ drivers/usb/musb/musb_host.c + | | <------ drivers/usb/musb/musb_core.c + -------------------------- + ⬍ + --------------------------------- + | MUSB Platform Specific Driver | + | | <-- drivers/usb/musb/jz4740.c + | aka "Glue Layer" | + --------------------------------- + ⬍ + --------------------------------- + | MUSB Controller Hardware | + --------------------------------- + +As outlined above, the glue layer is actually the platform specific code +sitting in between the controller driver and the controller hardware. + +Just like a Linux USB driver needs to register itself with the Linux USB +subsystem, the MUSB glue layer first needs to register itself with the +MUSB controller driver. This will allow the controller driver to know +which device the glue layer supports and which functions to call +when a supported device is detected or released; remember we are talking +about an embedded controller chip here, so no insertion or removal at +run-time. + +All of this information is passed to the MUSB controller driver through +a :c:type:`platform_driver` structure defined in the glue layer as:: + + static struct platform_driver jz4740_driver = { + .probe = jz4740_probe, + .remove = jz4740_remove, + .driver = { + .name = "musb-jz4740", + }, + }; + +The probe and remove function pointers are called when a matching device +is detected and, respectively, released. The name string describes the +device supported by this glue layer. In the current case it matches a +platform_device structure declared in ``arch/mips/jz4740/platform.c``. Note +that we are not using device tree bindings here. + +In order to register itself with the controller driver, the glue layer +goes through a few steps, basically allocating the controller hardware +resources and initialising a couple of circuits. To do so, it needs to +keep track of the information used throughout these steps. This is done +by defining a private ``jz4740_glue`` structure:: + + struct jz4740_glue { + struct device *dev; + struct platform_device *musb; + struct clk *clk; + }; + + +The dev and musb members are both device structure variables. The former +holds generic information about the device, since it's the basic +device structure, and the latter holds information more closely related +to the subsystem the device is registered to. The clk variable keeps +information related to the device clock operation. + +Let's go through the steps of the probe function that lead the glue +layer to register itself with the controller driver. + +.. note:: + + For the sake of readability each function will be split into logical + parts, each part being shown as if it were independent of the others. + +.. 
code-block:: c + :emphasize-lines: 8,12,18 + + static int jz4740_probe(struct platform_device *pdev) + { + struct platform_device *musb; + struct jz4740_glue *glue; + struct clk *clk; + int ret; + + glue = devm_kzalloc(&pdev->dev, sizeof(*glue), GFP_KERNEL); + if (!glue) + return -ENOMEM; + + musb = platform_device_alloc("musb-hdrc", PLATFORM_DEVID_AUTO); + if (!musb) { + dev_err(&pdev->dev, "failed to allocate musb device\n"); + return -ENOMEM; + } + + clk = devm_clk_get(&pdev->dev, "udc"); + if (IS_ERR(clk)) { + dev_err(&pdev->dev, "failed to get clock\n"); + ret = PTR_ERR(clk); + goto err_platform_device_put; + } + + ret = clk_prepare_enable(clk); + if (ret) { + dev_err(&pdev->dev, "failed to enable clock\n"); + goto err_platform_device_put; + } + + musb->dev.parent = &pdev->dev; + + glue->dev = &pdev->dev; + glue->musb = musb; + glue->clk = clk; + + return 0; + + err_platform_device_put: + platform_device_put(musb); + return ret; + } + +The first few lines of the probe function allocate and assign the glue, +musb and clk variables. The ``GFP_KERNEL`` flag (line 8) allows the +allocation process to sleep and wait for memory, so it must not be used +in atomic context (while holding a spinlock, for instance). The ``PLATFORM_DEVID_AUTO`` flag (line 12) allows +automatic allocation and management of device IDs in order to avoid +device namespace collisions with explicit IDs. With :c:func:`devm_clk_get` +(line 18) the glue layer allocates the clock -- the ``devm_`` prefix +indicates that :c:func:`clk_get` is managed: it automatically frees the +allocated clock resource data when the device is released -- and enables +it. + + + +Then come the registration steps: + +.. code-block:: c + :emphasize-lines: 3,5,7,9,16 + + static int jz4740_probe(struct platform_device *pdev) + { + struct musb_hdrc_platform_data *pdata = &jz4740_musb_platform_data; + + pdata->platform_ops = &jz4740_musb_ops; + + platform_set_drvdata(pdev, glue); + + ret = platform_device_add_resources(musb, pdev->resource, + pdev->num_resources); + if (ret) { + dev_err(&pdev->dev, "failed to add resources\n"); + goto err_clk_disable; + } + + ret = platform_device_add_data(musb, pdata, sizeof(*pdata)); + if (ret) { + dev_err(&pdev->dev, "failed to add platform_data\n"); + goto err_clk_disable; + } + + return 0; + + err_clk_disable: + clk_disable_unprepare(clk); + err_platform_device_put: + platform_device_put(musb); + return ret; + } + +The first step is to pass the device data privately held by the glue +layer on to the controller driver through :c:func:`platform_set_drvdata` +(line 7). Next is passing on the device resource information, also privately +held at that point, through :c:func:`platform_device_add_resources` (line 9). + +Finally comes passing on the platform specific data to the controller +driver (line 16). Platform data will be discussed in +:ref:`musb-dev-platform-data`, but here we are looking at the +``platform_ops`` function pointer (line 5) in the ``musb_hdrc_platform_data`` +structure (line 3). This function pointer allows the MUSB controller +driver to know which function to call for device operation:: + + static const struct musb_platform_ops jz4740_musb_ops = { + .init = jz4740_musb_init, + .exit = jz4740_musb_exit, + }; + +Here we have the minimal case where only init and exit functions are +called by the controller driver when needed. 
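+ +For more featureful controller hardware, additional operations can be +provided, as explained below. Purely as an illustrative sketch -- the +``.set_mode``, ``.enable`` and ``.disable`` members do exist in the ops +structure, but the ``foo_musb_*`` callbacks are hypothetical:: + + static const struct musb_platform_ops foo_musb_ops = { + .init = foo_musb_init, + .exit = foo_musb_exit, + /* hypothetical extras for richer hardware */ + .set_mode = foo_musb_set_mode, /* OTG/non-OTG mode switching */ + .enable = foo_musb_enable, /* power management hooks */ + .disable = foo_musb_disable, + }; + 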
+In fact, the JZ4740 MUSB +controller is a basic controller, lacking some features found in other +controllers; otherwise we might also have pointers to a few other +functions, such as a power management function or a function to switch +between OTG and non-OTG modes. + +At this point in the registration process, the controller driver +actually calls the init function: + + .. code-block:: c + :emphasize-lines: 12,14 + + static int jz4740_musb_init(struct musb *musb) + { + musb->xceiv = usb_get_phy(USB_PHY_TYPE_USB2); + if (!musb->xceiv) { + pr_err("HS UDC: no transceiver configured\n"); + return -ENODEV; + } + + /* Silicon does not implement ConfigData register. + * Set dyn_fifo to avoid reading EP config from hardware. + */ + musb->dyn_fifo = true; + + musb->isr = jz4740_musb_interrupt; + + return 0; + } + +The goal of ``jz4740_musb_init()`` is to get hold of the transceiver +driver data of the MUSB controller hardware and pass it on to the MUSB +controller driver, as usual. The transceiver is the circuitry inside the +controller hardware responsible for sending/receiving the USB data. +Since it is an implementation of the physical layer of the OSI model, +the transceiver is also referred to as PHY. + +Getting hold of the ``MUSB PHY`` driver data is done with ``usb_get_phy()`` +which returns a pointer to the structure containing the driver instance +data. The next couple of instructions (lines 12 and 14) are used as a +quirk and to set up IRQ handling respectively. Quirks and IRQ handling +will be discussed later in :ref:`musb-dev-quirks` and +:ref:`musb-handling-irqs`\ :: + + static int jz4740_musb_exit(struct musb *musb) + { + usb_put_phy(musb->xceiv); + + return 0; + } + +Acting as the counterpart of init, the exit function releases the MUSB +PHY driver when the controller hardware itself is about to be released. + +Again, note that init and exit are fairly simple in this case due to the +basic set of features of the JZ4740 controller hardware. When writing an +MUSB glue layer for more complex controller hardware, you might need +to take care of more processing in those two functions. + +Returning from the init function, the MUSB controller driver jumps back +into the probe function:: + + static int jz4740_probe(struct platform_device *pdev) + { + ret = platform_device_add(musb); + if (ret) { + dev_err(&pdev->dev, "failed to register musb device\n"); + goto err_clk_disable; + } + + return 0; + + err_clk_disable: + clk_disable_unprepare(clk); + err_platform_device_put: + platform_device_put(musb); + return ret; + } + +This is the last part of the device registration process where the glue +layer adds the controller hardware device to the Linux kernel device +hierarchy: at this stage, all known information about the device is +passed on to the Linux USB core stack. Conversely, the remove function +tears the device down again: + + .. code-block:: c + :emphasize-lines: 5,6 + + static int jz4740_remove(struct platform_device *pdev) + { + struct jz4740_glue *glue = platform_get_drvdata(pdev); + + platform_device_unregister(glue->musb); + clk_disable_unprepare(glue->clk); + + return 0; + } + +Acting as the counterpart of probe, the remove function unregisters the +MUSB controller hardware (line 5) and disables the clock (line 6), +allowing it to be gated. + +.. _musb-handling-irqs: + +Handling IRQs +============= + +In addition to the basic setup and registration of the MUSB controller +hardware, the glue layer is also responsible for handling the IRQs: + + .. 
code-block:: c + :emphasize-lines: 7,9-11,14,24 + + static irqreturn_t jz4740_musb_interrupt(int irq, void *__hci) + { + unsigned long flags; + irqreturn_t retval = IRQ_NONE; + struct musb *musb = __hci; + + spin_lock_irqsave(&musb->lock, flags); + + musb->int_usb = musb_readb(musb->mregs, MUSB_INTRUSB); + musb->int_tx = musb_readw(musb->mregs, MUSB_INTRTX); + musb->int_rx = musb_readw(musb->mregs, MUSB_INTRRX); + + /* + * The controller is gadget only, the state of the host mode IRQ bits is + * undefined. Mask them to make sure that the musb driver core will + * never see them set + */ + musb->int_usb &= MUSB_INTR_SUSPEND | MUSB_INTR_RESUME | + MUSB_INTR_RESET | MUSB_INTR_SOF; + + if (musb->int_usb || musb->int_tx || musb->int_rx) + retval = musb_interrupt(musb); + + spin_unlock_irqrestore(&musb->lock, flags); + + return retval; + } + +Here the glue layer mostly has to read the relevant hardware registers +and pass their values on to the controller driver which will handle the +actual event that triggered the IRQ. + +The interrupt handler critical section is protected by the +:c:func:`spin_lock_irqsave` and counterpart :c:func:`spin_unlock_irqrestore` +functions (lines 7 and 24 respectively), which prevent the interrupt +handler code from being run by two different threads at the same time. + +Then the relevant interrupt registers are read (lines 9 to 11): + +- ``MUSB_INTRUSB``: indicates which USB interrupts are currently active, + +- ``MUSB_INTRTX``: indicates which of the interrupts for TX endpoints are + currently active, + +- ``MUSB_INTRRX``: indicates which of the interrupts for RX endpoints are + currently active. + +Note that :c:func:`musb_readb` is used to read 8-bit registers, while +:c:func:`musb_readw` is used to read 16-bit registers. There are +other functions that can be used depending on the size of your device +registers. See ``musb_io.h`` for more information. + +The instruction on line 18 is another quirk specific to the JZ4740 USB +device controller, which will be discussed later in :ref:`musb-dev-quirks`. + +The glue layer still needs to register the IRQ handler though. Remember +the instruction on line 14 of the init function:: + + static int jz4740_musb_init(struct musb *musb) + { + musb->isr = jz4740_musb_interrupt; + + return 0; + } + +This instruction sets a pointer to the glue layer IRQ handler function, +in order for the controller driver to call the handler back when an +IRQ comes from the controller hardware. The interrupt handler is now +implemented and registered. + +.. _musb-dev-platform-data: + +Device Platform Data +==================== + +In order to write an MUSB glue layer, you need to have some data +describing the hardware capabilities of your controller hardware, which +is called the platform data. + +Platform data is specific to your hardware, though it may cover a broad +range of devices, and is generally found somewhere in the ``arch/`` +directory, depending on your device architecture. + +For instance, platform data for the JZ4740 SoC is found in +``arch/mips/jz4740/platform.c``. In the ``platform.c`` file each device of the +JZ4740 SoC is described through a set of structures. + +Here is the part of ``arch/mips/jz4740/platform.c`` that covers the USB +Device Controller (UDC): + + .. 
code-block:: c + :emphasize-lines: 2,7,14-17,21,22,25,26,28,29 + + /* USB Device Controller */ + struct platform_device jz4740_udc_xceiv_device = { + .name = "usb_phy_gen_xceiv", + .id = 0, + }; + + static struct resource jz4740_udc_resources[] = { + [0] = { + .start = JZ4740_UDC_BASE_ADDR, + .end = JZ4740_UDC_BASE_ADDR + 0x10000 - 1, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = JZ4740_IRQ_UDC, + .end = JZ4740_IRQ_UDC, + .flags = IORESOURCE_IRQ, + .name = "mc", + }, + }; + + struct platform_device jz4740_udc_device = { + .name = "musb-jz4740", + .id = -1, + .dev = { + .dma_mask = &jz4740_udc_device.dev.coherent_dma_mask, + .coherent_dma_mask = DMA_BIT_MASK(32), + }, + .num_resources = ARRAY_SIZE(jz4740_udc_resources), + .resource = jz4740_udc_resources, + }; + +The ``jz4740_udc_xceiv_device`` platform device structure (line 2) +describes the UDC transceiver with a name and id number. + +At the time of this writing, note that ``usb_phy_gen_xceiv`` is the +specific name to be used for all transceivers that are either built-in +with reference USB IP or autonomous, and do not require any PHY +programming. You will need to set ``CONFIG_NOP_USB_XCEIV=y`` in the +kernel configuration to make use of the corresponding transceiver +driver. The id field could be set to -1 (equivalent to +``PLATFORM_DEVID_NONE``), -2 (equivalent to ``PLATFORM_DEVID_AUTO``) or, +if we want a specific id number, start at 0 for the first device of this +kind. + +The ``jz4740_udc_resources`` resource structure (line 7) defines the UDC +register base addresses. + +The first array entry (lines 9 to 11) defines the UDC register base memory +addresses: start points to the first register memory address, end points +to the last register memory address and the flags member defines the +type of resource we are dealing with. So ``IORESOURCE_MEM`` is used to +define the registers memory addresses. The second array entry (lines 14 to 17) +defines the UDC IRQ registers addresses. Since there is only one IRQ +register available for the JZ4740 UDC, start and end point at the same +address. The ``IORESOURCE_IRQ`` flag indicates that we are dealing with IRQ +resources, and the name ``mc`` is in fact hard-coded in the MUSB core in +order for the controller driver to retrieve this IRQ resource by +querying it by its name. + +Finally, the ``jz4740_udc_device`` platform device structure (line 21) +describes the UDC itself. + +The ``musb-jz4740`` name (line 22) defines the MUSB driver that is used +for this device; remember this is in fact the name that we used in the +``jz4740_driver`` platform driver structure in :ref:`musb-basics`. +The id field (line 23) is set to -1 (equivalent to ``PLATFORM_DEVID_NONE``) +since we do not need an id for the device: the MUSB controller driver was +already set to allocate an automatic id in :ref:`musb-basics`. The dev field +carries the DMA-related information here. The ``dma_mask`` field (line 25) +defines the width of the DMA mask that is going to be used, and +``coherent_dma_mask`` (line 26) has the same purpose but for the +``alloc_coherent`` DMA mappings: in both cases we are using a 32-bit mask. +Then the resource field (line 29) is simply a pointer to the resource +structure defined before, while the ``num_resources`` field (line 28) keeps +track of the number of entries in the resource array (in this case, the +two resource entries defined before). 
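+ +Since the ``mc`` name is what the MUSB core queries for, here is a +minimal sketch of how a named IRQ resource is looked up at probe time -- +this mirrors what the MUSB core does internally, though the call site is +simplified for illustration (``pdev`` is the platform device carrying +these resources):: + + int irq; + + /* retrieve the IRQ resource declared with .name = "mc" */ + irq = platform_get_irq_byname(pdev, "mc"); + if (irq < 0) + return irq; + 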
+ +With this quick overview of the UDC platform data at the ``arch/`` level now +done, let's get back to the MUSB glue layer specific platform data in +``drivers/usb/musb/jz4740.c``: + + .. code-block:: c + :emphasize-lines: 3,5,7-9,11 + + static struct musb_hdrc_config jz4740_musb_config = { + /* Silicon does not implement USB OTG. */ + .multipoint = 0, + /* Max EPs scanned, driver will decide which EP can be used. */ + .num_eps = 4, + /* RAMbits needed to configure EPs from table */ + .ram_bits = 9, + .fifo_cfg = jz4740_musb_fifo_cfg, + .fifo_cfg_size = ARRAY_SIZE(jz4740_musb_fifo_cfg), + }; + + static struct musb_hdrc_platform_data jz4740_musb_platform_data = { + .mode = MUSB_PERIPHERAL, + .config = &jz4740_musb_config, + }; + +First the glue layer configures some aspects of the controller driver +operation related to the controller hardware specifics. This is done +through the ``jz4740_musb_config`` :c:type:`musb_hdrc_config` structure. + +The multipoint member (line 3), which defines the OTG capability of the +controller hardware, is set to 0 (equivalent to false) since the JZ4740 UDC +is not OTG compatible. Then ``num_eps`` (line 5) defines the number of USB +endpoints of the controller hardware, including endpoint 0: here we have +3 endpoints + endpoint 0. Next is ``ram_bits`` (line 7) which is the width +of the RAM address bus for the MUSB controller hardware. This +information is needed when the controller driver cannot automatically +configure endpoints by reading the relevant controller hardware +registers. This issue will be discussed when we get to device quirks in +:ref:`musb-dev-quirks`. The last two fields (lines 8 and 9) are also +about device quirks: ``fifo_cfg`` points to the USB endpoints configuration +table and ``fifo_cfg_size`` keeps track of the number of +entries in that configuration table. More on that later in +:ref:`musb-dev-quirks`. + +Then this configuration is embedded inside the ``jz4740_musb_platform_data`` +:c:type:`musb_hdrc_platform_data` structure (line 11): config is a pointer to +the configuration structure itself, and mode tells the controller driver +whether the controller hardware may be used as ``MUSB_HOST`` only, +``MUSB_PERIPHERAL`` only or ``MUSB_OTG``, which is a dual mode. + +Remember that ``jz4740_musb_platform_data`` is then used to convey +platform data information as we have seen in the probe function in +:ref:`musb-basics`. + +.. _musb-dev-quirks: + +Device Quirks +============= + +In addition to completing the platform data specific to your device, you +may also need to write some code in the glue layer to work around some +device specific limitations. These quirks may be due to some hardware +bugs, or simply be the result of an incomplete implementation of the USB +On-the-Go specification. + +The JZ4740 UDC exhibits such quirks, some of which we will discuss here +for the sake of insight even though these might not be found in the +controller hardware you are working on. + +Let's get back to the init function first: + + .. code-block:: c + :emphasize-lines: 12 + + static int jz4740_musb_init(struct musb *musb) + { + musb->xceiv = usb_get_phy(USB_PHY_TYPE_USB2); + if (!musb->xceiv) { + pr_err("HS UDC: no transceiver configured\n"); + return -ENODEV; + } + + /* Silicon does not implement ConfigData register. + * Set dyn_fifo to avoid reading EP config from hardware. 
+ */ + musb->dyn_fifo = true; + + musb->isr = jz4740_musb_interrupt; + + return 0; + } + +The instruction on line 12 helps the MUSB controller driver work around +the fact that the controller hardware is missing registers that are used +for USB endpoints configuration. + +Without these registers, the controller driver is unable to read the +endpoints configuration from the hardware, so we use the instruction on +line 12 to bypass reading the configuration from silicon, and rely on a +hard-coded table that describes the endpoints configuration instead:: + + static struct musb_fifo_cfg jz4740_musb_fifo_cfg[] = { + { .hw_ep_num = 1, .style = FIFO_TX, .maxpacket = 512, }, + { .hw_ep_num = 1, .style = FIFO_RX, .maxpacket = 512, }, + { .hw_ep_num = 2, .style = FIFO_TX, .maxpacket = 64, }, + }; + +Looking at the configuration table above, we see that each endpoint is +described by three fields: ``hw_ep_num`` is the endpoint number, style is +its direction (either ``FIFO_TX`` for the controller driver to send packets +to the controller hardware, or ``FIFO_RX`` to receive packets from +hardware), and maxpacket defines the maximum size of each data packet +that can be transmitted over that endpoint. Reading from the table, the +controller driver knows that endpoint 1 can be used to send and receive +USB data packets of 512 bytes at once (this is in fact a bulk in/out +endpoint), and endpoint 2 can be used to send data packets of 64 bytes +at once (this is in fact an interrupt endpoint). + +Note that there is no information about endpoint 0 here: that one is +implemented by default in every silicon design, with a predefined +configuration according to the USB specification. For more examples of +endpoint configuration tables, see ``musb_core.c``. + +Let's now get back to the interrupt handler function: + + .. code-block:: c + :emphasize-lines: 18-19 + + static irqreturn_t jz4740_musb_interrupt(int irq, void *__hci) + { + unsigned long flags; + irqreturn_t retval = IRQ_NONE; + struct musb *musb = __hci; + + spin_lock_irqsave(&musb->lock, flags); + + musb->int_usb = musb_readb(musb->mregs, MUSB_INTRUSB); + musb->int_tx = musb_readw(musb->mregs, MUSB_INTRTX); + musb->int_rx = musb_readw(musb->mregs, MUSB_INTRRX); + + /* + * The controller is gadget only, the state of the host mode IRQ bits is + * undefined. Mask them to make sure that the musb driver core will + * never see them set + */ + musb->int_usb &= MUSB_INTR_SUSPEND | MUSB_INTR_RESUME | + MUSB_INTR_RESET | MUSB_INTR_SOF; + + if (musb->int_usb || musb->int_tx || musb->int_rx) + retval = musb_interrupt(musb); + + spin_unlock_irqrestore(&musb->lock, flags); + + return retval; + } + +The instruction on line 18 above is a way for the controller driver to work +around the fact that some interrupt bits used for USB host mode +operation are missing in the ``MUSB_INTRUSB`` register, thus left in an +undefined hardware state, since this MUSB controller hardware is used in +peripheral mode only. As a consequence, the glue layer masks these +missing bits out to avoid spurious interrupts by doing a logical AND +operation between the value read from ``MUSB_INTRUSB`` and the bits that +are actually implemented in the register. + +These are only a couple of the quirks found in the JZ4740 USB device +controller. Some others were directly addressed in the MUSB core since +the fixes were generic enough to eventually provide better handling of +the issues for other controller hardware. 
+ +Conclusion +========== + +Writing a Linux MUSB glue layer should now be a more accessible task, as +this documentation tries to show the ins and outs of this exercise. + +The JZ4740 USB device controller being fairly simple, I hope its glue +layer serves as a good example for the curious mind. Used with the +current MUSB glue layers, this documentation should provide enough +guidance to get started; should anything get out of hand, the linux-usb +mailing list archive is another helpful resource to browse through. + +Acknowledgements +================ + +Many thanks to Lars-Peter Clausen and Maarten ter Huurne for answering +my questions while I was writing the JZ4740 glue layer and for helping +me out getting the code in good shape. + +I would also like to thank the Qi-Hardware community at large for its +cheerful guidance and support. + +Resources +========= + +USB Home Page: http://www.usb.org + +linux-usb Mailing List Archives: http://marc.info/?l=linux-usb + +USB On-the-Go Basics: +http://www.maximintegrated.com/app-notes/index.mvp/id/1822 + +:ref:`Writing USB Device Drivers <writing-usb-driver>` + +Texas Instruments USB Configuration Wiki Page: +http://processors.wiki.ti.com/index.php/Usbgeneralpage + +Analog Devices Blackfin MUSB Configuration: +http://docs.blackfin.uclinux.org/doku.php?id=linux-kernel:drivers:musb diff --git a/Documentation/driver-api/usb/writing_usb_driver.rst b/Documentation/driver-api/usb/writing_usb_driver.rst new file mode 100644 index 0000000000000000000000000000000000000000..69f077dcdb785c7265064010eafeb9887feca2b4 --- /dev/null +++ b/Documentation/driver-api/usb/writing_usb_driver.rst @@ -0,0 +1,326 @@ +.. _writing-usb-driver: + +========================== +Writing USB Device Drivers +========================== + +:Author: Greg Kroah-Hartman + +Introduction +============ + +The Linux USB subsystem has grown from supporting only two different +types of devices in the 2.2.7 kernel (mice and keyboards), to over 20 +different types of devices in the 2.4 kernel. Linux currently supports +almost all USB class devices (standard types of devices like keyboards, +mice, modems, printers and speakers) and an ever-growing number of +vendor-specific devices (such as USB to serial converters, digital +cameras, Ethernet devices and MP3 players). For a full list of the +different USB devices currently supported, see Resources. + +The remaining kinds of USB devices that do not have support on Linux are +almost all vendor-specific devices. Each vendor decides to implement a +custom protocol to talk to their device, so a custom driver usually +needs to be created. Some vendors are open with their USB protocols and +help with the creation of Linux drivers, while others do not publish +them, and developers are forced to reverse-engineer. See Resources for +some links to handy reverse-engineering tools. + +Because each different protocol causes a new driver to be created, I +have written a generic USB driver skeleton, modelled after the +pci-skeleton.c file in the kernel source tree upon which many PCI +network drivers have been based. This USB skeleton can be found at +drivers/usb/usb-skeleton.c in the kernel source tree. In this article I +will walk through the basics of the skeleton driver, explaining the +different pieces and what needs to be done to customize it to your +specific device. + +Linux USB Basics +================ + +If you are going to write a Linux USB driver, please become familiar +with the USB protocol specification. 
It can be found, along with many +other useful documents, at the USB home page (see Resources). An +excellent introduction to the Linux USB subsystem can be found at the +USB Working Devices List (see Resources). It explains how the Linux USB +subsystem is structured and introduces the reader to the concept of USB +urbs (USB Request Blocks), which are essential to USB drivers. + +The first thing a Linux USB driver needs to do is register itself with +the Linux USB subsystem, giving it some information about which devices +the driver supports and which functions to call when a device supported +by the driver is inserted or removed from the system. All of this +information is passed to the USB subsystem in the :c:type:`usb_driver` +structure. The skeleton driver declares a :c:type:`usb_driver` as:: + + static struct usb_driver skel_driver = { + .name = "skeleton", + .probe = skel_probe, + .disconnect = skel_disconnect, + .fops = &skel_fops, + .minor = USB_SKEL_MINOR_BASE, + .id_table = skel_table, + }; + + +The variable name is a string that describes the driver. It is used in +informational messages printed to the system log. The probe and +disconnect function pointers are called when a device that matches the +information provided in the ``id_table`` variable is either seen or +removed. + +The fops and minor variables are optional. Most USB drivers hook into +another kernel subsystem, such as the SCSI, network or TTY subsystem. +These types of drivers register themselves with the other kernel +subsystem, and any user-space interactions are provided through that +interface. But for drivers that do not have a matching kernel subsystem, +such as MP3 players or scanners, a method of interacting with user space +is needed. The USB subsystem provides a way to register a minor device +number and a set of :c:type:`file_operations` function pointers that enable +this user-space interaction. The skeleton driver needs this kind of +interface, so it provides a minor starting number and a pointer to its +:c:type:`file_operations` functions. + +The USB driver is then registered with a call to :c:func:`usb_register`, +usually in the driver's init function, as shown here:: + + static int __init usb_skel_init(void) + { + int result; + + /* register this driver with the USB subsystem */ + result = usb_register(&skel_driver); + if (result < 0) { + err("usb_register failed for the " __FILE__ " driver. " + "Error number %d", result); + return -1; + } + + return 0; + } + module_init(usb_skel_init); + + +When the driver is unloaded from the system, it needs to deregister +itself with the USB subsystem. This is done with the :c:func:`usb_deregister` +function:: + + static void __exit usb_skel_exit(void) + { + /* deregister this driver with the USB subsystem */ + usb_deregister(&skel_driver); + } + module_exit(usb_skel_exit); + + +To enable the linux-hotplug system to load the driver automatically when +the device is plugged in, you need to create a ``MODULE_DEVICE_TABLE``. +The following code tells the hotplug scripts that this module supports a +single device with a specific vendor and product ID:: + + /* table of devices that work with this driver */ + static struct usb_device_id skel_table [] = { + { USB_DEVICE(USB_SKEL_VENDOR_ID, USB_SKEL_PRODUCT_ID) }, + { } /* Terminating entry */ + }; + MODULE_DEVICE_TABLE (usb, skel_table); + + +There are other macros that can be used in describing a struct +:c:type:`usb_device_id` for drivers that support a whole class of USB +drivers. 
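+ +For instance, ``USB_INTERFACE_INFO`` matches on the interface +class/subclass/protocol triple instead of a fixed vendor and product ID. +A sketch (the macro is real, while the table name and the HID boot +keyboard values 3/1/1 are shown purely for illustration):: + + /* match any HID boot keyboard, regardless of vendor/product */ + static struct usb_device_id skel_class_table [] = { + { USB_INTERFACE_INFO(3, 1, 1) }, + { } /* Terminating entry */ + }; + MODULE_DEVICE_TABLE (usb, skel_class_table); + 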
+See :ref:`usb.h ` for more information on this. + +Device operation +================ + +When a device is plugged into the USB bus that matches the device ID +pattern that your driver registered with the USB core, the probe +function is called. A pointer to the :c:type:`usb_interface` structure +and the matching device ID entry are passed to the function:: + + static int skel_probe(struct usb_interface *interface, + const struct usb_device_id *id) + + +The driver now needs to verify that this device is actually one that it +can accept. If so, it returns 0. If not, or if any error occurs during +initialization, an error code (such as ``-ENOMEM`` or ``-ENODEV``) is +returned from the probe function. + +In the skeleton driver, we determine what endpoints are marked as +bulk-in and bulk-out. We create buffers to hold the data that will be +sent and received from the device, and a USB urb to write data to the +device is initialized. + +Conversely, when the device is removed from the USB bus, the disconnect +function is called with the device pointer. The driver needs to clean +any private data that has been allocated at this time and to shut down +any pending urbs that are in the USB system. + +Now that the device is plugged into the system and the driver is bound +to the device, any of the functions in the :c:type:`file_operations` structure +that were passed to the USB subsystem will be called from a user program +trying to talk to the device. The first function called will be open, as +the program tries to open the device for I/O. We increment our private +usage count and save a pointer to our internal structure in the file +structure. This is done so that future calls to file operations will +enable the driver to determine which device the user is addressing. All +of this is done with the following code:: + + /* increment our usage count for the module */ + ++skel->open_count; + + /* save our object in the file's private structure */ + file->private_data = dev; + + +After the open function is called, the read and write functions are +called to receive and send data to the device. In the ``skel_write`` +function, we receive a pointer to some data that the user wants to send +to the device and the size of the data. The function determines how much +data it can send to the device based on the size of the write urb it has +created (this size depends on the size of the bulk out endpoint that +the device has). Then it copies the data from user space to kernel +space, points the urb to the data and submits the urb to the USB +subsystem. This can be seen in the following code:: + + /* we can only write as much as 1 urb will hold */ + bytes_written = (count > skel->bulk_out_size) ? skel->bulk_out_size : count; + + /* copy the data from user space into our urb */ + copy_from_user(skel->write_urb->transfer_buffer, buffer, bytes_written); + + /* set up our urb */ + usb_fill_bulk_urb(skel->write_urb, + skel->dev, + usb_sndbulkpipe(skel->dev, skel->bulk_out_endpointAddr), + skel->write_urb->transfer_buffer, + bytes_written, + skel_write_bulk_callback, + skel); + + /* send the data out the bulk port */ + result = usb_submit_urb(skel->write_urb); + if (result) { + err("Failed submitting write urb, error %d", result); + } + + +When the write urb is filled up with the proper information using the +:c:func:`usb_fill_bulk_urb` function, we point the urb's completion callback +to call our own ``skel_write_bulk_callback`` function. This function is +called when the urb is finished by the USB subsystem. 
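+ +A minimal sketch of such a completion callback -- the check mirrors what +the skeleton driver does, but the body here is illustrative only:: + + static void skel_write_bulk_callback(struct urb *urb) + { + /* sync/async unlink faults are not errors, just report status */ + if (urb->status) + dev_dbg(&urb->dev->dev, + "write bulk status received: %d\n", urb->status); + } + 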
+ +The callback +function is called in interrupt context, so caution must be taken not to +do very much processing at that time. Our implementation of +``skel_write_bulk_callback`` merely reports if the urb was completed +successfully or not and then returns. + +The read function works a bit differently from the write function in +that we do not use an urb to transfer data from the device to the +driver. Instead we call the :c:func:`usb_bulk_msg` function, which can be used +to send or receive data from a device without having to create urbs and +handle urb completion callback functions. We call the :c:func:`usb_bulk_msg` +function, giving it a buffer into which to place any data received from +the device and a timeout value. If the timeout period expires without +receiving any data from the device, the function will fail and return an +error code. This can be shown with the following code:: + + /* do an immediate bulk read to get data from the device */ + retval = usb_bulk_msg (skel->dev, + usb_rcvbulkpipe (skel->dev, + skel->bulk_in_endpointAddr), + skel->bulk_in_buffer, + skel->bulk_in_size, + &count, HZ*10); + /* if the read was successful, copy the data to user space */ + if (!retval) { + if (copy_to_user (buffer, skel->bulk_in_buffer, count)) + retval = -EFAULT; + else + retval = count; + } + + +The :c:func:`usb_bulk_msg` function can be very useful for doing single reads +or writes to a device; however, if you need to read or write constantly to +a device, it is recommended to set up your own urbs and submit them to +the USB subsystem. + +When the user program releases the file handle that it has been using to +talk to the device, the release function in the driver is called. In +this function we decrement our private usage count and wait for possible +pending writes:: + + /* decrement our usage count for the device */ + --skel->open_count; + + +One of the more difficult problems that USB drivers must be able to +handle smoothly is the fact that the USB device may be removed from the +system at any point in time, even if a program is currently talking to +it. It needs to be able to shut down any current reads and writes and +notify the user-space programs that the device is no longer there. The +following code (function ``skel_delete``) is an example of how to do +this:: + + static inline void skel_delete (struct usb_skel *dev) + { + kfree (dev->bulk_in_buffer); + if (dev->bulk_out_buffer != NULL) + usb_free_coherent (dev->udev, dev->bulk_out_size, + dev->bulk_out_buffer, + dev->write_urb->transfer_dma); + usb_free_urb (dev->write_urb); + kfree (dev); + } + + +If a program currently has an open handle to the device, we reset the +flag ``device_present``. For every read, write, release and other +functions that expect a device to be present, the driver first checks +this flag to see if the device is still present. If not, it realizes +that the device has disappeared, and a ``-ENODEV`` error is returned to the +user-space program. When the release function is eventually called, it +checks whether the device is still present; if it is not, it does the +cleanup that the ``skel_disconnect`` function normally does when there are +no open files left on the device. + +Isochronous Data +================ + +This usb-skeleton driver does not have any examples of interrupt or +isochronous data being sent to or from the device. Interrupt data is +sent almost exactly as bulk data is, with a few minor exceptions. 
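+ +For instance, an interrupt urb is filled in much the same way as a bulk +urb, with the endpoint's polling interval as an extra argument. A sketch +(:c:func:`usb_fill_int_urb` is real, while the ``int_*`` names are +hypothetical additions to the skeleton's private structure and +``endpoint`` refers to the interrupt endpoint's descriptor):: + + /* like usb_fill_bulk_urb(), plus the endpoint's polling interval */ + usb_fill_int_urb (skel->int_urb, + skel->dev, + usb_rcvintpipe (skel->dev, skel->int_in_endpointAddr), + skel->int_in_buffer, + skel->int_in_size, + skel_int_callback, + skel, + endpoint->bInterval); + 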
+Isochronous data works differently with continuous streams of data being +sent to or from the device. The audio and video camera drivers are very +good examples of drivers that handle isochronous data and will be useful +if you also need to do this. + +Conclusion +========== + +Writing Linux USB device drivers is not a difficult task, as the +usb-skeleton driver shows. This driver, combined with the other current +USB drivers, should provide enough examples to help a beginning author +create a working driver in a minimal amount of time. The linux-usb-devel +mailing list archives also contain a lot of helpful information. + +Resources +========= + +The Linux USB Project: +http://www.linux-usb.org/ + +Linux Hotplug Project: +http://linux-hotplug.sourceforge.net/ + +Linux USB Working Devices List: +http://www.qbik.ch/usb/devices/ + +linux-usb-devel Mailing List Archives: +http://marc.theaimsgroup.com/?l=linux-usb-devel + +Programming Guide for Linux USB Device Drivers: +http://usb.cs.tum.edu/usbdoc + +USB Home Page: http://www.usb.org diff --git a/Documentation/driver-api/vme.rst b/Documentation/driver-api/vme.rst index 89776fb3c8bd04c8fec2e922b4ff29e5f137b068..def139c13410863ccb556af9b3425fadb6acc9c5 100644 --- a/Documentation/driver-api/vme.rst +++ b/Documentation/driver-api/vme.rst @@ -6,36 +6,15 @@ Driver registration As with other subsystems within the Linux kernel, VME device drivers register with the VME subsystem, typically called from the devices init routine. This is -achieved via a call to the following function: +achieved via a call to :c:func:`vme_register_driver`. -.. code-block:: c - - int vme_register_driver (struct vme_driver *driver, unsigned int ndevs); +A pointer to a structure of type :c:type:`struct vme_driver <vme_driver>` must +be provided to the registration function, along with the maximum number of +devices your driver is able to support. -If driver registration is successful this function returns zero, if an error -occurred a negative error code will be returned. - -A pointer to a structure of type 'vme_driver' must be provided to the -registration function. Along with ndevs, which is the number of devices your -driver is able to support. The structure is as follows: - -.. code-block:: c - - struct vme_driver { - struct list_head node; - const char *name; - int (*match)(struct vme_dev *); - int (*probe)(struct vme_dev *); - int (*remove)(struct vme_dev *); - void (*shutdown)(void); - struct device_driver driver; - struct list_head devices; - unsigned int ndev; - }; - -At the minimum, the '.name', '.match' and '.probe' elements of this structure -should be correctly set. The '.name' element is a pointer to a string holding -the device driver's name. +At the minimum, the '.name', '.match' and '.probe' elements of +:c:type:`struct vme_driver <vme_driver>` should be correctly set. The '.name' +element is a pointer to a string holding the device driver's name. The '.match' function allows control over which VME devices should be registered with the driver. The match function should return 1 if a device should be @@ -54,29 +33,16 @@ the number of devices probed to one: } The '.probe' element should contain a pointer to the probe routine. The -probe routine is passed a 'struct vme_dev' pointer as an argument. The -'struct vme_dev' structure looks like the following: - -.. 
code-block:: c - - struct vme_dev { - int num; - struct vme_bridge *bridge; - struct device dev; - struct list_head drv_list; - struct list_head bridge_list; - }; +probe routine is passed a :c:type:`struct vme_dev <vme_dev>` pointer as an +argument. Here, the 'num' field refers to the sequential device ID for this specific driver. The bridge number (or bus number) can be accessed using dev->bridge->num. -A function is also provided to unregister the driver from the VME core and is -usually called from the device driver's exit routine: - -.. code-block:: c - - void vme_unregister_driver (struct vme_driver *driver); +A function, :c:func:`vme_unregister_driver`, is also provided to unregister +the driver from the VME core; it should usually be called from the device +driver's exit routine. Resource management @@ -90,47 +56,29 @@ driver is called. The probe routine is passed a pointer to the devices device structure. This pointer should be saved, it will be required for requesting VME resources. -The driver can request ownership of one or more master windows, slave windows -and/or dma channels. Rather than allowing the device driver to request a -specific window or DMA channel (which may be used by a different driver) this -driver allows a resource to be assigned based on the required attributes of the -driver in question: - -.. code-block:: c - - struct vme_resource * vme_master_request(struct vme_dev *dev, - u32 aspace, u32 cycle, u32 width); - - struct vme_resource * vme_slave_request(struct vme_dev *dev, u32 aspace, - u32 cycle); - - struct vme_resource *vme_dma_request(struct vme_dev *dev, u32 route); - -For slave windows these attributes are split into the VME address spaces that -need to be accessed in 'aspace' and VME bus cycle types required in 'cycle'. -Master windows add a further set of attributes in 'width' specifying the -required data transfer widths. These attributes are defined as bitmasks and as -such any combination of the attributes can be requested for a single window, -the core will assign a window that meets the requirements, returning a pointer -of type vme_resource that should be used to identify the allocated resource -when it is used. For DMA controllers, the request function requires the -potential direction of any transfers to be provided in the route attributes. -This is typically VME-to-MEM and/or MEM-to-VME, though some hardware can -support VME-to-VME and MEM-to-MEM transfers as well as test pattern generation. -If an unallocated window fitting the requirements can not be found a NULL -pointer will be returned. +The driver can request ownership of one or more master windows +(:c:func:`vme_master_request`), slave windows (:c:func:`vme_slave_request`) +and/or dma channels (:c:func:`vme_dma_request`). Rather than allowing the device +driver to request a specific window or DMA channel (which may be used by a +different driver) the API allows a resource to be assigned based on the required +attributes of the driver in question. For slave windows these attributes are +split into the VME address spaces that need to be accessed in 'aspace' and VME +bus cycle types required in 'cycle'. Master windows add a further set of +attributes in 'width' specifying the required data transfer widths. 
These +attributes are defined as bitmasks and as such any combination of the +attributes can be requested for a single window; the core will assign a window +that meets the requirements, returning a pointer of type vme_resource that +should be used to identify the allocated resource when it is used. For DMA +controllers, the request function requires the potential direction of any +transfers to be provided in the route attributes. This is typically VME-to-MEM +and/or MEM-to-VME, though some hardware can support VME-to-VME and MEM-to-MEM +transfers as well as test pattern generation. If an unallocated window fitting +the requirements cannot be found, a NULL pointer will be returned. Functions are also provided to free window allocations once they are no longer -required. These functions should be passed the pointer to the resource provided -during resource allocation: - -.. code-block:: c - - void vme_master_free(struct vme_resource *res); - - void vme_slave_free(struct vme_resource *res); - - void vme_dma_free(struct vme_resource *res); +required. These functions (:c:func:`vme_master_free`, :c:func:`vme_slave_free` +and :c:func:`vme_dma_free`) should be passed the pointer to the resource +provided during resource allocation. Master windows @@ -144,61 +92,22 @@ the underlying chipset. A window must be configured before it can be used. Master window configuration ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Once a master window has been assigned the following functions can be used to -configure it and retrieve the current settings: - -.. code-block:: c - - int vme_master_set (struct vme_resource *res, int enabled, - unsigned long long base, unsigned long long size, u32 aspace, - u32 cycle, u32 width); - - int vme_master_get (struct vme_resource *res, int *enabled, - unsigned long long *base, unsigned long long *size, u32 *aspace, - u32 *cycle, u32 *width); - -The address spaces, transfer widths and cycle types are the same as described +Once a master window has been assigned, :c:func:`vme_master_set` can be used to +configure it and :c:func:`vme_master_get` to retrieve the current settings. The +address spaces, transfer widths and cycle types are the same as described under resource management, however some of the options are mutually exclusive. For example, only one address space may be specified. -These functions return 0 on success or an error code should the call fail. - Master window access ~~~~~~~~~~~~~~~~~~~~ -The following functions can be used to read from and write to configured master -windows. These functions return the number of bytes copied: - -.. code-block:: c - - ssize_t vme_master_read(struct vme_resource *res, void *buf, - size_t count, loff_t offset); - - ssize_t vme_master_write(struct vme_resource *res, void *buf, - size_t count, loff_t offset); - -In addition to simple reads and writes, a function is provided to do a -read-modify-write transaction. This function returns the original value of the -VME bus location : - -.. code-block:: c - - unsigned int vme_master_rmw (struct vme_resource *res, - unsigned int mask, unsigned int compare, unsigned int swap, - loff_t offset); - -This functions by reading the offset, applying the mask. If the bits selected in -the mask match with the values of the corresponding bits in the compare field, -the value of swap is written the specified offset. 
- -Parts of a VME window can be mapped into user space memory using the following -function: +The function :c:func:`vme_master_read` can be used to read from and +:c:func:`vme_master_write` used to write to configured master windows. -.. code-block:: c - - int vme_master_mmap(struct vme_resource *resource, - struct vm_area_struct *vma) +In addition to simple reads and writes, :c:func:`vme_master_rmw` is provided to +do a read-modify-write transaction. Parts of a VME window can also be mapped +into user space memory using :c:func:`vme_master_mmap`. Slave windows @@ -213,41 +122,23 @@ it can be used. Slave window configuration ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Once a slave window has been assigned the following functions can be used to -configure it and retrieve the current settings: - -.. code-block:: c - - int vme_slave_set (struct vme_resource *res, int enabled, - unsigned long long base, unsigned long long size, - dma_addr_t mem, u32 aspace, u32 cycle); - - int vme_slave_get (struct vme_resource *res, int *enabled, - unsigned long long *base, unsigned long long *size, - dma_addr_t *mem, u32 *aspace, u32 *cycle); +Once a slave window has been assigned :c:func:`vme_slave_set` can be used to +configure it and :c:func:`vme_slave_get` to retrieve the current settings. The address spaces, transfer widths and cycle types are the same as described under resource management, however some of the options are mutually exclusive. For example, only one address space may be specified. -These functions return 0 on success or an error code should the call fail. - Slave window buffer allocation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Functions are provided to allow the user to allocate and free a contiguous -buffers which will be accessible by the VME bridge. These functions do not have -to be used, other methods can be used to allocate a buffer, though care must be -taken to ensure that they are contiguous and accessible by the VME bridge: - -.. code-block:: c - - void * vme_alloc_consistent(struct vme_resource *res, size_t size, - dma_addr_t *mem); - - void vme_free_consistent(struct vme_resource *res, size_t size, - void *virt, dma_addr_t mem); +Functions are provided to allow the user to allocate +(:c:func:`vme_alloc_consistent`) and free (:c:func:`vme_free_consistent`) +contiguous buffers which will be accessible by the VME bridge. These functions +do not have to be used, other methods can be used to allocate a buffer, though +care must be taken to ensure that they are contiguous and accessible by the VME +bridge. Slave window access @@ -269,29 +160,18 @@ executed, reused and destroyed. List Management ~~~~~~~~~~~~~~~ -The following functions are provided to create and destroy DMA lists. Execution -of a list will not automatically destroy the list, thus enabling a list to be -reused for repetitive tasks: - -.. code-block:: c - - struct vme_dma_list *vme_new_dma_list(struct vme_resource *res); - - int vme_dma_list_free(struct vme_dma_list *list); +The function :c:func:`vme_new_dma_list` is provided to create and +:c:func:`vme_dma_list_free` to destroy DMA lists. Execution of a list will not +automatically destroy the list, thus enabling a list to be reused for repetitive +tasks. List Population ~~~~~~~~~~~~~~~ -An item can be added to a list using the following function ( the source and +An item can be added to a list using :c:func:`vme_dma_list_add` (the source and destination attributes need to be created before calling this function, this is -covered under "Transfer Attributes"): - -.. 
code-block:: c - - int vme_dma_list_add(struct vme_dma_list *list, - struct vme_dma_attr *src, struct vme_dma_attr *dest, - size_t count); +covered under "Transfer Attributes"). .. note:: @@ -310,41 +190,19 @@ an item to a list. This is due to the diverse attributes required for each type of source and destination. There are functions to create attributes for PCI, VME and pattern sources and destinations (where appropriate): -Pattern source: - -.. code-block:: c - - struct vme_dma_attr *vme_dma_pattern_attribute(u32 pattern, u32 type); - -PCI source or destination: - -.. code-block:: c - - struct vme_dma_attr *vme_dma_pci_attribute(dma_addr_t mem); - -VME source or destination: + - PCI source or destination: :c:func:`vme_dma_pci_attribute` + - VME source or destination: :c:func:`vme_dma_vme_attribute` + - Pattern source: :c:func:`vme_dma_pattern_attribute` -.. code-block:: c - - struct vme_dma_attr *vme_dma_vme_attribute(unsigned long long base, - u32 aspace, u32 cycle, u32 width); - -The following function should be used to free an attribute: - -.. code-block:: c - - void vme_dma_free_attribute(struct vme_dma_attr *attr); +The function :c:func:`vme_dma_free_attribute` should be used to free an +attribute. List Execution ~~~~~~~~~~~~~~ -The following function queues a list for execution. The function will return -once the list has been executed: - -.. code-block:: c - - int vme_dma_list_exec(struct vme_dma_list *list); +The function :c:func:`vme_dma_list_exec` queues a list for execution and will +return once the list has been executed. Interrupts @@ -358,20 +216,13 @@ specific VME level and status IDs. Attaching Interrupt Handlers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The following functions can be used to attach and free a specific VME level and -status ID combination. Any given combination can only be assigned a single -callback function. A void pointer parameter is provided, the value of which is -passed to the callback function, the use of this pointer is user undefined: - -.. code-block:: c - - int vme_irq_request(struct vme_dev *dev, int level, int statid, - void (*callback)(int, int, void *), void *priv); - - void vme_irq_free(struct vme_dev *dev, int level, int statid); - -The callback parameters are as follows. Care must be taken in writing a callback -function, callback functions run in interrupt context: +The function :c:func:`vme_irq_request` can be used to attach and +:c:func:`vme_irq_free` to free a specific VME level and status ID combination. +Any given combination can only be assigned a single callback function. A void +pointer parameter is provided, the value of which is passed to the callback +function, the use of this pointer is user undefined. The callback parameters are +as follows. Care must be taken in writing a callback function, callback +functions run in interrupt context: .. code-block:: c @@ -381,12 +232,8 @@ function, callback functions run in interrupt context: Interrupt Generation ~~~~~~~~~~~~~~~~~~~~ -The following function can be used to generate a VME interrupt at a given VME -level and VME status ID: - -.. code-block:: c - - int vme_irq_generate(struct vme_dev *dev, int level, int statid); +The function :c:func:`vme_irq_generate` can be used to generate a VME interrupt +at a given VME level and VME status ID. Location monitors @@ -399,54 +246,29 @@ monitor. Location Monitor Management ~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The following functions are provided to request the use of a block of location -monitors and to free them after they are no longer required: - -.. 
code-block:: c - - struct vme_resource * vme_lm_request(struct vme_dev *dev); - - void vme_lm_free(struct vme_resource * res); - -Each block may provide a number of location monitors, monitoring adjacent -locations. The following function can be used to determine how many locations -are provided: - -.. code-block:: c - - int vme_lm_count(struct vme_resource * res); +The function :c:func:`vme_lm_request` is provided to request the use of a block +of location monitors and :c:func:`vme_lm_free` to free them after they are no +longer required. Each block may provide a number of location monitors, +monitoring adjacent locations. The function :c:func:`vme_lm_count` can be used +to determine how many locations are provided. Location Monitor Configuration ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Once a bank of location monitors has been allocated, the following functions -are provided to configure the location and mode of the location monitor: - -.. code-block:: c - - int vme_lm_set(struct vme_resource *res, unsigned long long base, - u32 aspace, u32 cycle); - - int vme_lm_get(struct vme_resource *res, unsigned long long *base, - u32 *aspace, u32 *cycle); +Once a bank of location monitors has been allocated, the function +:c:func:`vme_lm_set` is provided to configure the location and mode of the +location monitor. The function :c:func:`vme_lm_get` can be used to retrieve +existing settings. Location Monitor Use ~~~~~~~~~~~~~~~~~~~~ -The following functions allow a callback to be attached and detached from each -location monitor location. Each location monitor can monitor a number of -adjacent locations: - -.. code-block:: c - - int vme_lm_attach(struct vme_resource *res, int num, - void (*callback)(void *)); - - int vme_lm_detach(struct vme_resource *res, int num); - -The callback function is declared as follows. +The function :c:func:`vme_lm_attach` enables a callback to be attached and +:c:func:`vme_lm_detach` allows one to be detached from each location monitor +location. Each location monitor can monitor a number of adjacent locations. The +callback function is declared as follows. .. code-block:: c @@ -456,19 +278,20 @@ The callback function is declared as follows. Slot Detection -------------- -This function returns the slot ID of the provided bridge. - -.. code-block:: c - - int vme_slot_num(struct vme_dev *dev); +The function :c:func:`vme_slot_num` returns the slot ID of the provided bridge. Bus Detection ------------- -This function returns the bus ID of the provided bridge. +The function :c:func:`vme_bus_num` returns the bus ID of the provided bridge. -.. code-block:: c - int vme_bus_num(struct vme_dev *dev); +VME API +------- + +.. kernel-doc:: include/linux/vme.h + :internal: +.. 
kernel-doc:: drivers/vme/vme.c + :export: diff --git a/Documentation/driver-model/devres.txt b/Documentation/driver-model/devres.txt index bf34d5b3a7330e5c26f9c6eebe977ed853804959..e72587fe477d5f1ad958e9810f1e5c46cbccc786 100644 --- a/Documentation/driver-model/devres.txt +++ b/Documentation/driver-model/devres.txt @@ -342,8 +342,10 @@ PER-CPU MEM devm_free_percpu() PCI - pcim_enable_device() : after success, all PCI ops become managed - pcim_pin_device() : keep PCI device enabled after release + devm_pci_remap_cfgspace() : ioremap PCI configuration space + devm_pci_remap_cfg_resource() : ioremap PCI configuration space resource + pcim_enable_device() : after success, all PCI ops become managed + pcim_pin_device() : keep PCI device enabled after release PHY devm_usb_get_phy() diff --git a/Documentation/early-userspace/README b/Documentation/early-userspace/README index 93e63a9af30b7b0ec63f1cdf3904d92e5731596c..2c00b072a4c8ca382f84af09ebea29150645380d 100644 --- a/Documentation/early-userspace/README +++ b/Documentation/early-userspace/README @@ -86,7 +86,7 @@ early userspace useful. The klibc distribution is currently maintained separately from the kernel. You can obtain somewhat infrequent snapshots of klibc from -ftp://ftp.kernel.org/pub/linux/libs/klibc/ +https://www.kernel.org/pub/linux/libs/klibc/ For active users, you are better off using the klibc git repository, at http://git.kernel.org/?p=libs/klibc/klibc.git diff --git a/Documentation/extcon/porting-android-switch-class b/Documentation/extcon/porting-android-switch-class deleted file mode 100644 index 49c81caef84dc421fd8194cda8d092d5a4de7302..0000000000000000000000000000000000000000 --- a/Documentation/extcon/porting-android-switch-class +++ /dev/null @@ -1,123 +0,0 @@ - - Staging/Android Switch Class Porting Guide - (linux/drivers/staging/android/switch) - (c) Copyright 2012 Samsung Electronics - -AUTHORS -MyungJoo Ham - -/***************************************************************** - * CHAPTER 1. * - * PORTING SWITCH CLASS DEVICE DRIVERS * - *****************************************************************/ - -****** STEP 1. Basic Functionality - No extcon extended feature, but switch features only. - -- struct switch_dev (fed to switch_dev_register/unregister) - @name: no change - @dev: no change - @index: drop (not used in switch device driver side anyway) - @state: no change - If you have used @state with magic numbers, keep it - at this step. - @print_name: no change but type change (switch_dev->extcon_dev) - @print_state: no change but type change (switch_dev->extcon_dev) - -- switch_dev_register(sdev, dev) - => extcon_dev_register(edev) - : type change (sdev->edev) - : remove second param('dev'). if edev has parent device, should store - 'dev' to 'edev.dev.parent' before registering extcon device -- switch_dev_unregister(sdev) - => extcon_dev_unregister(edev) - : no change but type change (sdev->edev) -- switch_get_state(sdev) - => extcon_get_state(edev) - : no change but type change (sdev->edev) and (return: int->u32) -- switch_set_state(sdev, state) - => extcon_set_state(edev, state) - : no change but type change (sdev->edev) and (state: int->u32) - -With this changes, the ex-switch extcon class device works as it once -worked as switch class device. However, it will now have additional -interfaces (both ABI and in-kernel API) and different ABI locations. 
-However, if CONFIG_ANDROID is enabled without CONFIG_ANDROID_SWITCH, -/sys/class/switch/* will be symbolically linked to /sys/class/extcon/ -so that they are still compatible with legacy userspace processes. - -****** STEP 2. Multistate (no more magic numbers in state value) - Extcon's extended features for switch device drivers with - complex features usually required magic numbers in state - value of switch_dev. With extcon, such magic numbers that - support multiple cables are no more required or supported. - - 1. Define cable names at edev->supported_cable. - 2. (Recommended) remove print_state callback. - 3. Use extcon_get_cable_state_(edev, index) or - extcon_get_cable_state(edev, cable_name) instead of - extcon_get_state(edev) if you intend to get a state of a specific - cable. Same for set_state. This way, you can remove the usage of - magic numbers in state value. - 4. Use extcon_update_state() if you are updating specific bits of - the state value. - -Example: a switch device driver w/ magic numbers for two cables. - "0x00": no cables connected. - "0x01": cable 1 connected - "0x02": cable 2 connected - "0x03": cable 1 and 2 connected - 1. edev->supported_cable = {"1", "2", NULL}; - 2. edev->print_state = NULL; - 3. extcon_get_cable_state_(edev, 0) shows cable 1's state. - extcon_get_cable_state(edev, "1") shows cable 1's state. - extcon_set_cable_state_(edev, 1) sets cable 2's state. - extcon_set_cable_state(edev, "2") sets cable 2's state - 4. extcon_update_state(edev, 0x01, 0) sets the least bit's 0. - -****** STEP 3. Notify other device drivers - - You can notify others of the cable attach/detach events with -notifier chains. - - At the side of other device drivers (the extcon device itself -does not need to get notified of its own events), there are two -methods to register notifier_block for cable events: -(a) for a specific cable or (b) for every cable. - - (a) extcon_register_interest(obj, extcon_name, cable_name, nb) - Example: want to get news of "MAX8997_MUIC"'s "USB" cable - - obj = kzalloc(sizeof(struct extcon_specific_cable_nb), - GFP_KERNEL); - nb->notifier_call = the_callback_to_handle_usb; - - extcon_register_intereset(obj, "MAX8997_MUIC", "USB", nb); - - (b) extcon_register_notifier(edev, nb) - Call nb for any changes in edev. - - Please note that in order to properly behave with method (a), -the extcon device driver should support multistate feature (STEP 2). - -****** STEP 4. Inter-cable relation (mutually exclusive) - - You can provide inter-cable mutually exclusiveness information -for an extcon device. When cables A and B are declared to be mutually -exclusive, the two cables cannot be in ATTACHED state simulteneously. - - -/***************************************************************** - * CHAPTER 2. * - * PORTING USERSPACE w/ SWITCH CLASS DEVICE SUPPORT * - *****************************************************************/ - -****** ABI Location - - If "CONFIG_ANDROID" is enabled, /sys/class/switch/* are created -as symbolic links to /sys/class/extcon/*. - - The two files of switch class, name and state, are provided with -extcon, too. When the multistate support (STEP 2 of CHAPTER 1.) is -not enabled or print_state callback is supplied, the output of -state ABI is same with switch class. 
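For readers who still need the conversion the removed porting guide describes, a minimal before/after sketch of its STEP 1 may help. This uses the legacy extcon interface exactly as the removed guide states it (the embedded dev member and the single-argument extcon_dev_register() reflect that era's API, not current kernels; the driver names and cable name are hypothetical):

.. code-block:: c

    #include <linux/extcon.h>

    /* Was: static struct switch_dev sdev = { .name = "usb-cable" }; */
    static struct extcon_dev edev = {
            .name = "usb-cable",
    };

    static int example_register(struct device *dev)
    {
            /* Was: switch_dev_register(&sdev, dev); per the guide, the
             * parent is now stored in the extcon_dev before registering. */
            edev.dev.parent = dev;
            return extcon_dev_register(&edev);
    }

    static void example_report(bool connected)
    {
            /* Was: switch_set_state(&sdev, connected); */
            extcon_set_state(&edev, connected);
    }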
diff --git a/Documentation/features/core/BPF-JIT/arch-support.txt b/Documentation/features/core/BPF-JIT/arch-support.txt index c1b4f917238f7124aaf78778f4df8c1c3ebdf2e5..5575d2d09625ac7ce97029099ee5ee938f179bd4 100644 --- a/Documentation/features/core/BPF-JIT/arch-support.txt +++ b/Documentation/features/core/BPF-JIT/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/core/generic-idle-thread/arch-support.txt b/Documentation/features/core/generic-idle-thread/arch-support.txt index 6d930fcbe519105831326012a6a383a2cd1d51a0..abb5f271a792e96566a36496055afffd5df078c8 100644 --- a/Documentation/features/core/generic-idle-thread/arch-support.txt +++ b/Documentation/features/core/generic-idle-thread/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | ok | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/core/jump-labels/arch-support.txt b/Documentation/features/core/jump-labels/arch-support.txt index 136868b636e678b6ec6d8e4ed308f729a85a0d4a..dbdaffcc51107c0c0cd513774fcfb41b626a66a4 100644 --- a/Documentation/features/core/jump-labels/arch-support.txt +++ b/Documentation/features/core/jump-labels/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/core/tracehook/arch-support.txt b/Documentation/features/core/tracehook/arch-support.txt index 728061d763b1150f09222848cec80ff1012d96c1..5e97a89420ef09ceaa6c8fb7d89c7ca771d955ef 100644 --- a/Documentation/features/core/tracehook/arch-support.txt +++ b/Documentation/features/core/tracehook/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | ok | | c6x: | ok | | cris: | TODO | diff --git a/Documentation/features/debug/KASAN/arch-support.txt b/Documentation/features/debug/KASAN/arch-support.txt index 703f5784bc90d9647d015c086330b59a48b4631d..76bbd7fe27b35bec5839e6bcac0337c550fc76b2 100644 --- a/Documentation/features/debug/KASAN/arch-support.txt +++ b/Documentation/features/debug/KASAN/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/debug/gcov-profile-all/arch-support.txt b/Documentation/features/debug/gcov-profile-all/arch-support.txt index 38dea8eeba0a3db202c2def713eacc3164f2cee7..830dbe801aafbfce012fb433b631fcd60a819b78 100644 --- a/Documentation/features/debug/gcov-profile-all/arch-support.txt +++ b/Documentation/features/debug/gcov-profile-all/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/debug/kgdb/arch-support.txt b/Documentation/features/debug/kgdb/arch-support.txt index 862e15d6f79ec75b4c99c8b7c0d70f1f9a7136eb..0217bf6e942d2b83ddb24f14d2ce6aa079ada9ac 100644 --- a/Documentation/features/debug/kgdb/arch-support.txt +++ b/Documentation/features/debug/kgdb/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | ok | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt index 
40f44d041fb4764386fb6f90f12397bf805c7a7d..1e84be3c142e01ea0c23a092924c950793a278a7 100644 --- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt +++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | @@ -27,7 +26,7 @@ | nios2: | TODO | | openrisc: | TODO | | parisc: | TODO | - | powerpc: | TODO | + | powerpc: | ok | | s390: | TODO | | score: | TODO | | sh: | TODO | diff --git a/Documentation/features/debug/kprobes/arch-support.txt b/Documentation/features/debug/kprobes/arch-support.txt index a44bfff6940b9a8a2b3d6a6abef7ec262562c658..529f66eda679aaa4f16bd3dc1d090c2a1d1338e5 100644 --- a/Documentation/features/debug/kprobes/arch-support.txt +++ b/Documentation/features/debug/kprobes/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | TODO | - | avr32: | ok | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/debug/kretprobes/arch-support.txt b/Documentation/features/debug/kretprobes/arch-support.txt index d87c1ce24204fafbdf9cd42dfdd7eb04bd4d784e..43353242e43968ac3622cd2de1eb82a72c8fb558 100644 --- a/Documentation/features/debug/kretprobes/arch-support.txt +++ b/Documentation/features/debug/kretprobes/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | TODO | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/debug/optprobes/arch-support.txt b/Documentation/features/debug/optprobes/arch-support.txt index b8999d8544cac4bbaa6bd4c7480a3073017f65e5..f559f1ba5416da682fed3c947bdb08bd8090f0d8 100644 --- a/Documentation/features/debug/optprobes/arch-support.txt +++ b/Documentation/features/debug/optprobes/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | TODO | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/debug/stackprotector/arch-support.txt b/Documentation/features/debug/stackprotector/arch-support.txt index 0fa423313409c9bbbc505283f9f114ef41df47bd..d7acd7bd36197025afc332efb7e53065dd138f13 100644 --- a/Documentation/features/debug/stackprotector/arch-support.txt +++ b/Documentation/features/debug/stackprotector/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/debug/uprobes/arch-support.txt b/Documentation/features/debug/uprobes/arch-support.txt index d605c3fc38fd5cd920230b8057e16a2f180eec1b..53ed42b0e7e59ba14ca7b52ee7023357e006a9be 100644 --- a/Documentation/features/debug/uprobes/arch-support.txt +++ b/Documentation/features/debug/uprobes/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | TODO | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/debug/user-ret-profiler/arch-support.txt b/Documentation/features/debug/user-ret-profiler/arch-support.txt index 44cc1ff3f603a957a6b552bb911e08803c882caa..149443936de9fda2607aca200796c64719a2d81e 100644 --- a/Documentation/features/debug/user-ret-profiler/arch-support.txt +++ b/Documentation/features/debug/user-ret-profiler/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git 
a/Documentation/features/io/dma-api-debug/arch-support.txt b/Documentation/features/io/dma-api-debug/arch-support.txt index ffa522a9bdfdcc34b017bc9a3e70be8fe2d87082..6be920643be667f312b62e5335ec8f849ddf01d9 100644 --- a/Documentation/features/io/dma-api-debug/arch-support.txt +++ b/Documentation/features/io/dma-api-debug/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | ok | | cris: | TODO | diff --git a/Documentation/features/io/dma-contiguous/arch-support.txt b/Documentation/features/io/dma-contiguous/arch-support.txt index 83d2cf989ea3494dc825fc9e4da3b1ed09491646..0eb08e1e32b808c0f81a27df0470b2fdd9e1a14c 100644 --- a/Documentation/features/io/dma-contiguous/arch-support.txt +++ b/Documentation/features/io/dma-contiguous/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/io/sg-chain/arch-support.txt b/Documentation/features/io/sg-chain/arch-support.txt index 6ca98f9911bbb31693ff342039d5c84772e37d91..514ad3468aa597b36a030b46941f68dc7d161a68 100644 --- a/Documentation/features/io/sg-chain/arch-support.txt +++ b/Documentation/features/io/sg-chain/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/lib/strncasecmp/arch-support.txt b/Documentation/features/lib/strncasecmp/arch-support.txt index 12b1c9358e57150827bf1d8f7e7a4b86ef373ed0..532c6f0fc15c0c2291765ae9bf6da0c921c82140 100644 --- a/Documentation/features/lib/strncasecmp/arch-support.txt +++ b/Documentation/features/lib/strncasecmp/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/locking/cmpxchg-local/arch-support.txt b/Documentation/features/locking/cmpxchg-local/arch-support.txt index d9c310889bc1a529f66313921568ce42e57ff4b1..f3eec26c8cf80c589e768fdc078fa365f1839cba 100644 --- a/Documentation/features/locking/cmpxchg-local/arch-support.txt +++ b/Documentation/features/locking/cmpxchg-local/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/locking/lockdep/arch-support.txt b/Documentation/features/locking/lockdep/arch-support.txt index cf90635bdcbb23124b8c199103d71c11bcadec65..9756abc680a7e5de130945a5bf4ccac5c271ef7a 100644 --- a/Documentation/features/locking/lockdep/arch-support.txt +++ b/Documentation/features/locking/lockdep/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | avr32: | ok | | blackfin: | ok | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/locking/queued-rwlocks/arch-support.txt b/Documentation/features/locking/queued-rwlocks/arch-support.txt index 68c3a5ddd9b9cf9ad4f17e3e88c530186edf9333..62f4ee5c156c41dffd9306c4e6dd5ccece93029e 100644 --- a/Documentation/features/locking/queued-rwlocks/arch-support.txt +++ b/Documentation/features/locking/queued-rwlocks/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/locking/queued-spinlocks/arch-support.txt 
b/Documentation/features/locking/queued-spinlocks/arch-support.txt index e973b1a9572f86da2cce2e0c586bd376ebf61668..321b32f6e63c5070c69c6b4d9e5e372b2cd544a3 100644 --- a/Documentation/features/locking/queued-spinlocks/arch-support.txt +++ b/Documentation/features/locking/queued-spinlocks/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/locking/rwsem-optimized/arch-support.txt b/Documentation/features/locking/rwsem-optimized/arch-support.txt index ac93d7ab66c4f3134c13da5b04224dfe99141162..79bfa4d6e41ffa18f361f9d14455a5a8dab93e18 100644 --- a/Documentation/features/locking/rwsem-optimized/arch-support.txt +++ b/Documentation/features/locking/rwsem-optimized/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/perf/kprobes-event/arch-support.txt b/Documentation/features/perf/kprobes-event/arch-support.txt index 4660bf222db17099e221fc37f2d002ab7b45fd8b..00f1606bbf45709355356fbc326ddebb6324f811 100644 --- a/Documentation/features/perf/kprobes-event/arch-support.txt +++ b/Documentation/features/perf/kprobes-event/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | TODO | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/perf/perf-regs/arch-support.txt b/Documentation/features/perf/perf-regs/arch-support.txt index f179b1fb26ef49734ff84bf1deee48bfe8915fdd..7d516eacf7b929cbb375c69293ba08e93c686c8a 100644 --- a/Documentation/features/perf/perf-regs/arch-support.txt +++ b/Documentation/features/perf/perf-regs/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/perf/perf-stackdump/arch-support.txt b/Documentation/features/perf/perf-stackdump/arch-support.txt index 85777c5c6353712c51b6ae58ad322af7ebcd9d74..f974b8df5d825621997e604bac797fbef5254333 100644 --- a/Documentation/features/perf/perf-stackdump/arch-support.txt +++ b/Documentation/features/perf/perf-stackdump/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/sched/numa-balancing/arch-support.txt b/Documentation/features/sched/numa-balancing/arch-support.txt index ac7cd6b1502b5a029f7aa3f31d5ad17217b69fe6..1d3c0f66915252f939fef5c2ba23f89a123c175a 100644 --- a/Documentation/features/sched/numa-balancing/arch-support.txt +++ b/Documentation/features/sched/numa-balancing/arch-support.txt @@ -10,7 +10,6 @@ | arc: | .. | | arm: | .. | | arm64: | .. | - | avr32: | .. | | blackfin: | .. | | c6x: | .. | | cris: | .. 
| diff --git a/Documentation/features/seccomp/seccomp-filter/arch-support.txt b/Documentation/features/seccomp/seccomp-filter/arch-support.txt index 4f66ec13395112ae9be41a7250f0152963995a1c..a32d5b20767916f4df047feee74a01af0ab26e17 100644 --- a/Documentation/features/seccomp/seccomp-filter/arch-support.txt +++ b/Documentation/features/seccomp/seccomp-filter/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/time/arch-tick-broadcast/arch-support.txt b/Documentation/features/time/arch-tick-broadcast/arch-support.txt index 8acb439a4a1767e0eab512a4fec5e786f5200255..caee8f64d1bca0e4bf26d8971d955deca45d03ee 100644 --- a/Documentation/features/time/arch-tick-broadcast/arch-support.txt +++ b/Documentation/features/time/arch-tick-broadcast/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/time/clockevents/arch-support.txt b/Documentation/features/time/clockevents/arch-support.txt index ff670b2207f15051202d9808ff5464dc31e0db2f..1cd87f6cd07da9a64b908ff62a4d4ad82a49159c 100644 --- a/Documentation/features/time/clockevents/arch-support.txt +++ b/Documentation/features/time/clockevents/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | avr32: | ok | | blackfin: | ok | | c6x: | ok | | cris: | ok | diff --git a/Documentation/features/time/context-tracking/arch-support.txt b/Documentation/features/time/context-tracking/arch-support.txt index a1e3eea7003f785e561626615c2a11d468297f6d..e6d7c7b2253c8b4980b5a2bbb9f7363ca13f96d1 100644 --- a/Documentation/features/time/context-tracking/arch-support.txt +++ b/Documentation/features/time/context-tracking/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt index 4199ffecc0ff06bde4b26fa5f09d1fd0a9a08601..15c6071788aec5ba0a7e5db6b5d682db9199f891 100644 --- a/Documentation/features/time/irq-time-acct/arch-support.txt +++ b/Documentation/features/time/irq-time-acct/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/time/modern-timekeeping/arch-support.txt b/Documentation/features/time/modern-timekeeping/arch-support.txt index 17f68a02e84db5f1d441359f7b17efd97ae0f2d1..baee7611ba3d6a33cba69f73d107ed7d67008f94 100644 --- a/Documentation/features/time/modern-timekeeping/arch-support.txt +++ b/Documentation/features/time/modern-timekeeping/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | TODO | | arm64: | ok | - | avr32: | ok | | blackfin: | TODO | | c6x: | ok | | cris: | TODO | diff --git a/Documentation/features/time/virt-cpuacct/arch-support.txt b/Documentation/features/time/virt-cpuacct/arch-support.txt index cf3c3e383d156d90e85dfcf2b7a23da3465b879f..9129530cb73c76b6ac1a2aa94c2987c1eb007205 100644 --- a/Documentation/features/time/virt-cpuacct/arch-support.txt +++ b/Documentation/features/time/virt-cpuacct/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git 
a/Documentation/features/vm/ELF-ASLR/arch-support.txt b/Documentation/features/vm/ELF-ASLR/arch-support.txt index ec4dd28e12979fedb5cfe26c9f2780936650505a..f6829af3255f3545e0f111b280bfc7b5797e99b3 100644 --- a/Documentation/features/vm/ELF-ASLR/arch-support.txt +++ b/Documentation/features/vm/ELF-ASLR/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/vm/PG_uncached/arch-support.txt b/Documentation/features/vm/PG_uncached/arch-support.txt index 991974275a3ef789975502dd248d026133db138c..1a09ea99d486c7cb0f1ebcdbb232f6c46afbb279 100644 --- a/Documentation/features/vm/PG_uncached/arch-support.txt +++ b/Documentation/features/vm/PG_uncached/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/vm/THP/arch-support.txt b/Documentation/features/vm/THP/arch-support.txt index 523f8307b9cd574ee22961d4fd27e618f5fe5b05..d170e6236503afe80577e5ebc73fe5e975d72ad0 100644 --- a/Documentation/features/vm/THP/arch-support.txt +++ b/Documentation/features/vm/THP/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | avr32: | .. | | blackfin: | .. | | c6x: | .. | | cris: | .. | diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt index 261b92e2fb1ae1e1f41c7ee481f87a6b9cb48198..abfab4080a9136d24a477e022b887cd8500a39e8 100644 --- a/Documentation/features/vm/TLB/arch-support.txt +++ b/Documentation/features/vm/TLB/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | avr32: | .. | | blackfin: | TODO | | c6x: | .. | | cris: | .. | diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt index df1d1f3c9af290aa6ffaa1584f71ba6025e54c4e..f81f09b22b08cf8f8c69477752b4a5f93d062af2 100644 --- a/Documentation/features/vm/huge-vmap/arch-support.txt +++ b/Documentation/features/vm/huge-vmap/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/vm/ioremap_prot/arch-support.txt b/Documentation/features/vm/ioremap_prot/arch-support.txt index 90c53749fde78107d7d2f66bc6ec7434a40341a9..0cc3e11c42e2e5f5ab31e3256c36a1ef740ed81f 100644 --- a/Documentation/features/vm/ioremap_prot/arch-support.txt +++ b/Documentation/features/vm/ioremap_prot/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | TODO | | arm64: | TODO | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/features/vm/numa-memblock/arch-support.txt b/Documentation/features/vm/numa-memblock/arch-support.txt index e7c252a0c53123bb22fbf7d8bd217bf7e3b762b0..9a3fdac42ce1258ff2c604634cfecea9069a210c 100644 --- a/Documentation/features/vm/numa-memblock/arch-support.txt +++ b/Documentation/features/vm/numa-memblock/arch-support.txt @@ -10,7 +10,6 @@ | arc: | .. | | arm: | .. | | arm64: | .. | - | avr32: | .. | | blackfin: | .. | | c6x: | .. | | cris: | .. 
| diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt index 3de5434c857c8909deac9f8c1770c56723c31ff8..dfaa39e664fffe944f2c348c361e45bf4e62e22f 100644 --- a/Documentation/features/vm/pte_special/arch-support.txt +++ b/Documentation/features/vm/pte_special/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | avr32: | TODO | | blackfin: | TODO | | c6x: | TODO | | cris: | TODO | diff --git a/Documentation/filesystems/bfs.txt b/Documentation/filesystems/bfs.txt index 78043d5a8fc35f01c343ced6f956aadee03e1026..843ce91a2e40504beaedf9c159c36d271700071b 100644 --- a/Documentation/filesystems/bfs.txt +++ b/Documentation/filesystems/bfs.txt @@ -54,4 +54,4 @@ The first 4 bytes should be 0x1badface. If you have any patches, questions or suggestions regarding this BFS implementation please contact the author: -Tigran Aivazian +Tigran Aivazian diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt index 3698ed3146e3011b11ed4211a06a7badc2f254cb..5a8f7f4d2bca227516b0e21fae0945b4554d4485 100644 --- a/Documentation/filesystems/ext4.txt +++ b/Documentation/filesystems/ext4.txt @@ -25,7 +25,7 @@ Note: More extensive information for getting started with ext4 can be or - ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ + https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/ or grab the latest git repository from: diff --git a/Documentation/filesystems/nfs/nfs-rdma.txt b/Documentation/filesystems/nfs/nfs-rdma.txt index 1e6564545edf067a1d9e2be42d78c149bead0730..22dc0dd6889cbfbaff1ad2a5af43f6f4f6303fdb 100644 --- a/Documentation/filesystems/nfs/nfs-rdma.txt +++ b/Documentation/filesystems/nfs/nfs-rdma.txt @@ -110,10 +110,10 @@ Installation - Install a Linux kernel with NFS/RDMA The NFS/RDMA client and server are both included in the mainline Linux - kernel version 2.6.25 and later. This and other versions of the 2.6 Linux + kernel version 2.6.25 and later. This and other versions of the Linux kernel can be found at: - ftp://ftp.kernel.org/pub/linux/kernel/v2.6/ + https://www.kernel.org/pub/linux/kernel/ Download the sources and place them in an appropriate location. diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt index 8de578a98222784ebab5e930586a019dd6cd67aa..80dc0bdc302a6ae86db68657efddc7aba8fec498 100644 --- a/Documentation/filesystems/nfs/pnfs.txt +++ b/Documentation/filesystems/nfs/pnfs.txt @@ -64,46 +64,9 @@ table which are called by the nfs-client pnfs-core to implement the different layout types. Files-layout-driver code is in: fs/nfs/filelayout/.. directory -Objects-layout-driver code is in: fs/nfs/objlayout/.. directory Blocks-layout-driver code is in: fs/nfs/blocklayout/.. directory Flexfiles-layout-driver code is in: fs/nfs/flexfilelayout/.. directory -objects-layout setup --------------------- - -As part of the full STD implementation the objlayoutdriver.ko needs, at times, -to automatically login to yet undiscovered iscsi/osd devices. For this the -driver makes up-calles to a user-mode script called *osd_login* - -The path_name of the script to use is by default: - /sbin/osd_login. -This name can be overridden by the Kernel module parameter: - objlayoutdriver.osd_login_prog - -If Kernel does not find the osd_login_prog path it will zero it out -and will not attempt farther logins. An admin can then write new value -to the objlayoutdriver.osd_login_prog Kernel parameter to re-enable it. 
- -The /sbin/osd_login is part of the nfs-utils package, and should usually -be installed on distributions that support this Kernel version. - -The API to the login script is as follows: - Usage: $0 -u -o -s - Options: - -u target uri e.g. iscsi://: - (always exists) - (More protocols can be defined in the future. - The client does not interpret this string it is - passed unchanged as received from the Server) - -o osdname of the requested target OSD - (Might be empty) - (A string which denotes the OSD name, there is a - limit of 64 chars on this string) - -s systemid of the requested target OSD - (Might be empty) - (This string, if not empty is always an hex - representation of the 20 bytes osd_system_id) - blocks-layout setup ------------------- diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt index 634d03e20c2d9154c926b7db51061c69a6d32cd9..c9e884b52698020f88fdab12588687c10f16004e 100644 --- a/Documentation/filesystems/overlayfs.txt +++ b/Documentation/filesystems/overlayfs.txt @@ -21,12 +21,19 @@ from accessing the corresponding object from the original filesystem. This is most obvious from the 'st_dev' field returned by stat(2). While directories will report an st_dev from the overlay-filesystem, -all non-directory objects will report an st_dev from the lower or +non-directory objects may report an st_dev from the lower filesystem or upper filesystem that is providing the object. Similarly st_ino will only be unique when combined with st_dev, and both of these can change over the lifetime of a non-directory object. Many applications and tools ignore these values and will not be affected. +In the special case of all overlay layers on the same underlying +filesystem, all objects will report an st_dev from the overlay +filesystem and st_ino from the underlying filesystem. This will +make the overlay mount more compliant with filesystem scanners, and +overlay objects will be distinguishable from the corresponding +objects in the original filesystem. + Upper and Lower --------------- diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index c94b4675d021ffd374de22d7d83df61dbb6c34dd..4cddbce85ac9ad175743f5f4384a32937bf87888 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -44,6 +44,7 @@ Table of Contents 3.8 /proc/<pid>/fdinfo/<fd> - Information about opened file 3.9 /proc/<pid>/map_files - Information about memory mapped files 3.10 /proc/<pid>/timerslack_ns - Task timerslack value + 3.11 /proc/<pid>/patch_state - Livepatch patch operation state 4 Configuring procfs 4.1 Mount options @@ -412,6 +413,7 @@ Private_Clean: 0 kB Private_Dirty: 0 kB Referenced: 892 kB Anonymous: 0 kB +LazyFree: 0 kB AnonHugePages: 0 kB ShmemPmdMapped: 0 kB Shared_Hugetlb: 0 kB @@ -441,6 +443,11 @@ accessed. "Anonymous" shows the amount of memory that does not belong to any file. Even a mapping associated with a file may contain anonymous pages: when MAP_PRIVATE and a page is modified, the file page is replaced by a private anonymous copy. +"LazyFree" shows the amount of memory which is marked by madvise(MADV_FREE). +The memory isn't freed immediately with madvise(). It's freed under memory +pressure if the memory is clean. Please note that the printed value might +be lower than the real value due to optimizations used in the current +implementation. If this is not desirable, please file a bug report. "AnonHugePages" shows the amount of memory backed by transparent hugepage. 
"ShmemPmdMapped" shows the ammount of shared (shmem/tmpfs) memory backed by huge pages. @@ -1887,6 +1894,23 @@ Valid values are from 0 - ULLONG_MAX An application setting the value must have PTRACE_MODE_ATTACH_FSCREDS level permissions on the task specified to change its timerslack_ns value. +3.11 /proc//patch_state - Livepatch patch operation state +----------------------------------------------------------------- +When CONFIG_LIVEPATCH is enabled, this file displays the value of the +patch state for the task. + +A value of '-1' indicates that no patch is in transition. + +A value of '0' indicates that a patch is in transition and the task is +unpatched. If the patch is being enabled, then the task hasn't been +patched yet. If the patch is being disabled, then the task has already +been unpatched. + +A value of '1' indicates that a patch is in transition and the task is +patched. If the patch is being enabled, then the task has already been +patched. If the patch is being disabled, then the task hasn't been +unpatched yet. + ------------------------------------------------------------------------------ Configuring procfs diff --git a/Documentation/filesystems/sysfs-pci.txt b/Documentation/filesystems/sysfs-pci.txt index 6ea1ceda6f5235ea18458b7aa70f1fe51866fcf5..06f1d64c6f702fa2507b748b6feb5a6bddf20975 100644 --- a/Documentation/filesystems/sysfs-pci.txt +++ b/Documentation/filesystems/sysfs-pci.txt @@ -113,9 +113,18 @@ Supporting PCI access on new platforms -------------------------------------- In order to support PCI resource mapping as described above, Linux platform -code must define HAVE_PCI_MMAP and provide a pci_mmap_page_range function. -Platforms are free to only support subsets of the mmap functionality, but -useful return codes should be provided. +code should ideally define ARCH_GENERIC_PCI_MMAP_RESOURCE and use the generic +implementation of that functionality. To support the historical interface of +mmap() through files in /proc/bus/pci, platforms may also set HAVE_PCI_MMAP. + +Alternatively, platforms which set HAVE_PCI_MMAP may provide their own +implementation of pci_mmap_page_range() instead of defining +ARCH_GENERIC_PCI_MMAP_RESOURCE. + +Platforms which support write-combining maps of PCI resources must define +arch_can_pci_mmap_wc() which shall evaluate to non-zero at runtime when +write-combining is permitted. Platforms which support maps of I/O resources +define arch_can_pci_mmap_io() similarly. Legacy resources are protected by the HAVE_PCI_LEGACY define. Platforms wishing to support legacy functionality should define it and provide diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 94dd27ef4a766ca5414146fab1b76ea5aacb67de..f42b90687d406b73b4a24b2d92c9116540cc8ca1 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -694,8 +694,7 @@ struct address_space_operations { write_end: After a successful write_begin, and data copy, write_end must be called. len is the original len passed to write_begin, and copied - is the amount that was able to be copied (copied == len is always true - if write_begin was called with the AOP_FLAG_UNINTERRUPTIBLE flag). + is the amount that was able to be copied. The filesystem must take care of unlocking the page and releasing it refcount, and updating i_size. 
diff --git a/Documentation/gpio/consumer.txt b/Documentation/gpio/consumer.txt index 05676fdacfe3c56f1a9afdaf54795e9167a9c287..912568baabb9629083151fc4ed30578c066684cd 100644 --- a/Documentation/gpio/consumer.txt +++ b/Documentation/gpio/consumer.txt @@ -70,6 +70,12 @@ instead of -ENOENT if no GPIO has been assigned to the requested function: unsigned int index, enum gpiod_flags flags) +Note that the gpiod_get*_optional() functions (and their managed variants), unlike +the rest of the gpiolib API, also return NULL when gpiolib support is disabled. +This is helpful to driver authors, since they do not need to special-case +-ENOSYS return codes. System integrators should however be careful to enable +gpiolib on systems that need it. + For a function using multiple GPIOs all of those can be obtained with one call: struct gpio_descs *gpiod_get_array(struct device *dev, diff --git a/Documentation/gpu/introduction.rst b/Documentation/gpu/introduction.rst index 05a82bdfbca4d195ca6b5cd675554d13804ce8fc..fccbe375244d8c10767f83b4d6da4ac09366022f 100644 --- a/Documentation/gpu/introduction.rst +++ b/Documentation/gpu/introduction.rst @@ -85,3 +85,14 @@ This means that there's a blackout-period of about one month where feature work can't be merged. The recommended way to deal with that is having a -next tree that's always open, but making sure to not feed it into linux-next during the blackout period. As an example, drm-misc works like that. + +Code of Conduct +--------------- + +As a freedesktop.org project, dri-devel and the DRM community follow the +Contributor Covenant, found at: https://www.freedesktop.org/wiki/CodeOfConduct + +Please conduct yourself in a respectful and civilised manner when +interacting with community members on mailing lists, IRC, or bug +trackers. The community represents the project as a whole, and abusive +or bullying behaviour is not tolerated by the project. diff --git a/Documentation/hid/hidraw.txt b/Documentation/hid/hidraw.txt index 029e6cb9a7e8ea78326c36b3a5fe42f8302c6e31..c8436e354f44b3ac7220f6766e1eef85fb8304b3 100644 --- a/Documentation/hid/hidraw.txt +++ b/Documentation/hid/hidraw.txt @@ -81,7 +81,7 @@ of: BUS_HIL BUS_BLUETOOTH BUS_VIRTUAL -which are defined in linux/input.h. +which are defined in uapi/linux/input.h. HIDIOCGRAWNAME(len): Get Raw Name This ioctl returns a string containing the vendor and product strings of diff --git a/Documentation/hwmon/aspeed-pwm-tacho b/Documentation/hwmon/aspeed-pwm-tacho new file mode 100644 index 0000000000000000000000000000000000000000..7cfb3497746069c4a90cba474e9aca4d27c5ebba --- /dev/null +++ b/Documentation/hwmon/aspeed-pwm-tacho @@ -0,0 +1,22 @@ +Kernel driver aspeed-pwm-tacho +============================== + +Supported chips: + ASPEED AST2400/2500 + +Authors: + + +Description: +------------ +This driver implements support for the ASPEED AST2400/2500 PWM and Fan Tacho +controller. The PWM controller supports up to 8 PWM outputs. The Fan tacho +controller supports up to 16 tachometer inputs. + +The driver provides the following sensor accesses in sysfs: + +fanX_input ro provides current fan rotation value in RPM as reported + by the fan to the device. + +pwmX rw get or set PWM fan control value. This is an integer + value between 0 (off) and 255 (full speed). 
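As a usage sketch, the fanX_input and pwmX attributes above can be driven from userspace like any other hwmon device. Something like the following reads a fan speed and sets a PWM duty cycle (the hwmon0 index and attribute paths are assumptions; real code should match the device by its name attribute):

.. code-block:: c

    #include <stdio.h>

    int main(void)
    {
            /* hwmon0 is an assumption; the index varies per board. */
            FILE *f = fopen("/sys/class/hwmon/hwmon0/fan1_input", "r");
            int rpm;

            if (!f || fscanf(f, "%d", &rpm) != 1)
                    return 1;
            fclose(f);
            printf("fan1: %d RPM\n", rpm);

            f = fopen("/sys/class/hwmon/hwmon0/pwm1", "w");
            if (!f)
                    return 1;
            fprintf(f, "128\n");    /* half speed: 0 is off, 255 is full */
            fclose(f);
            return 0;
    }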
diff --git a/Documentation/hwmon/tc654 b/Documentation/hwmon/tc654 index 91a2843f5f98811347e434519e4771502dd95c16..47636a8077b4cbf5bb80dda17a3bbfc6bfae722b 100644 --- a/Documentation/hwmon/tc654 +++ b/Documentation/hwmon/tc654 @@ -2,7 +2,7 @@ Kernel driver tc654 =================== Supported chips: - * Microship TC654 and TC655 + * Microchip TC654 and TC655 Prefix: 'tc654' Datasheet: http://ww1.microchip.com/downloads/en/DeviceDoc/20001734C.pdf diff --git a/Documentation/index.rst b/Documentation/index.rst index f6e641a54bbcab5c6dd8e4d8f724c580deaff19e..bc67dbf76eb041c5dbd66813d873ce74420a0da5 100644 --- a/Documentation/index.rst +++ b/Documentation/index.rst @@ -24,6 +24,18 @@ trying to get it to work optimally on a given system. admin-guide/index +Application-developer documentation +----------------------------------- + +The user-space API manual gathers together documents describing aspects of +the kernel interface as seen by application developers. + +.. toctree:: + :maxdepth: 2 + + userspace-api/index + + Introduction to kernel development ---------------------------------- @@ -55,6 +67,7 @@ needed). driver-api/index core-api/index media/index + input/index gpu/index security/index sound/index @@ -76,6 +89,14 @@ Chinese translations translations/zh_CN/index +Japanese translations +--------------------- + +.. toctree:: + :maxdepth: 1 + + translations/ja_JP/index + Indices and tables ================== diff --git a/Documentation/infiniband/opa_vnic.txt b/Documentation/infiniband/opa_vnic.txt new file mode 100644 index 0000000000000000000000000000000000000000..282e17be798a9cb4985b37f01e65252af0dc0362 --- /dev/null +++ b/Documentation/infiniband/opa_vnic.txt @@ -0,0 +1,153 @@ +The Intel Omni-Path (OPA) Virtual Network Interface Controller (VNIC) feature +supports Ethernet functionality over Omni-Path fabric by encapsulating +the Ethernet packets between HFI nodes. +Architecture +============= +The patterns of exchanges of Omni-Path encapsulated Ethernet packets +involve one or more virtual Ethernet switches overlaid on the Omni-Path +fabric topology. A subset of HFI nodes on the Omni-Path fabric are +permitted to exchange encapsulated Ethernet packets across a particular +virtual Ethernet switch. The virtual Ethernet switches are logical +abstractions achieved by configuring the HFI nodes on the fabric for +header generation and processing. In the simplest configuration all HFI +nodes across the fabric exchange encapsulated Ethernet packets over a +single virtual Ethernet switch. A virtual Ethernet switch is effectively +an independent Ethernet network. The configuration is performed by an +Ethernet Manager (EM) which is part of the trusted Fabric Manager (FM) +application. HFI nodes can have multiple VNICs, each connected to a +different virtual Ethernet switch. The below diagram presents a case +of two virtual Ethernet switches with two HFI nodes. 
+ + +-------------------+ + | Subnet/ | + | Ethernet | + | Manager | + +-------------------+ + / / + / / + / / + / / ++-----------------------------+ +------------------------------+ +| Virtual Ethernet Switch | | Virtual Ethernet Switch | +| +---------+ +---------+ | | +---------+ +---------+ | +| | VPORT | | VPORT | | | | VPORT | | VPORT | | ++--+---------+----+---------+-+ +-+---------+----+---------+---+ + | \ / | + | \ / | + | \/ | + | / \ | + | / \ | + +-----------+------------+ +-----------+------------+ + | VNIC | VNIC | | VNIC | VNIC | + +-----------+------------+ +-----------+------------+ + | HFI | | HFI | + +------------------------+ +------------------------+ + + +The Omni-Path encapsulated Ethernet packet format is as described below. + +Bits Field +------------------------------------ +Quad Word 0: +0-19 SLID (lower 20 bits) +20-30 Length (in Quad Words) +31 BECN bit +32-51 DLID (lower 20 bits) +52-56 SC (Service Class) +57-59 RC (Routing Control) +60 FECN bit +61-62 L2 (=10, 16B format) +63 LT (=1, Link Transfer Head Flit) + +Quad Word 1: +0-7 L4 type (=0x78 ETHERNET) +8-11 SLID[23:20] +12-15 DLID[23:20] +16-31 PKEY +32-47 Entropy +48-63 Reserved + +Quad Word 2: +0-15 Reserved +16-31 L4 header +32-63 Ethernet Packet + +Quad Words 3 to N-1: +0-63 Ethernet packet (pad extended) + +Quad Word N (last): +0-23 Ethernet packet (pad extended) +24-55 ICRC +56-61 Tail +62-63 LT (=01, Link Transfer Tail Flit) + +Ethernet packet is padded on the transmit side to ensure that the VNIC OPA +packet is quad word aligned. The 'Tail' field contains the number of bytes +padded. On the receive side the 'Tail' field is read and the padding is +removed (along with ICRC, Tail and OPA header) before passing packet up +the network stack. + +The L4 header field contains the virtual Ethernet switch id the VNIC port +belongs to. On the receive side, this field is used to de-multiplex the +received VNIC packets to different VNIC ports. + +Driver Design +============== +Intel OPA VNIC software design is presented in the below diagram. +OPA VNIC functionality has a HW dependent component and a HW +independent component. + +The support has been added for IB device to allocate and free the RDMA +netdev devices. The RDMA netdev supports interfacing with the network +stack thus creating standard network interfaces. OPA_VNIC is an RDMA +netdev device type. + +The HW dependent VNIC functionality is part of the HFI1 driver. It +implements the verbs to allocate and free the OPA_VNIC RDMA netdev. +It involves HW resource allocation/management for VNIC functionality. +It interfaces with the network stack and implements the required +net_device_ops functions. It expects Omni-Path encapsulated Ethernet +packets in the transmit path and provides HW access to them. It strips +the Omni-Path header from the received packets before passing them up +the network stack. It also implements the RDMA netdev control operations. + +The OPA VNIC module implements the HW independent VNIC functionality. +It consists of two parts. The VNIC Ethernet Management Agent (VEMA) +registers itself with IB core as an IB client and interfaces with the +IB MAD stack. It exchanges the management information with the Ethernet +Manager (EM) and the VNIC netdev. The VNIC netdev part allocates and frees +the OPA_VNIC RDMA netdev devices. It overrides the net_device_ops functions +set by HW dependent VNIC driver where required to accommodate any control +operation. 
It also handles the encapsulation of Ethernet packets with an +Omni-Path header in the transmit path. For each VNIC interface, the +information required for encapsulation is configured by the EM via VEMA MAD +interface. It also passes any control information to the HW dependent driver +by invoking the RDMA netdev control operations. + + +-------------------+ +----------------------+ + | | | Linux | + | IB MAD | | Network | + | | | Stack | + +-------------------+ +----------------------+ + | | | + | | | + +----------------------------+ | + | | | + | OPA VNIC Module | | + | (OPA VNIC RDMA Netdev | | + | & EMA functions) | | + | | | + +----------------------------+ | + | | + | | + +------------------+ | + | IB core | | + +------------------+ | + | | + | | + +--------------------------------------------+ + | | + | HFI1 Driver with VNIC support | + | | + +--------------------------------------------+ diff --git a/Documentation/input/alps.txt b/Documentation/input/alps.txt deleted file mode 100644 index 8d1341ccde6498bebc534bcd7c1ef4560560a96f..0000000000000000000000000000000000000000 --- a/Documentation/input/alps.txt +++ /dev/null @@ -1,378 +0,0 @@ -ALPS Touchpad Protocol ----------------------- - -Introduction ------------- -Currently the ALPS touchpad driver supports seven protocol versions in use by -ALPS touchpads, called versions 1, 2, 3, 4, 5, 6 and 7. - -Since roughly mid-2010 several new ALPS touchpads have been released and -integrated into a variety of laptops and netbooks. These new touchpads -have enough behavior differences that the alps_model_data definition -table, describing the properties of the different versions, is no longer -adequate. The design choices were to re-define the alps_model_data -table, with the risk of regression testing existing devices, or isolate -the new devices outside of the alps_model_data table. The latter design -choice was made. The new touchpad signatures are named: "Rushmore", -"Pinnacle", and "Dolphin", which you will see in the alps.c code. -For the purposes of this document, this group of ALPS touchpads will -generically be called "new ALPS touchpads". - -We experimented with probing the ACPI interface _HID (Hardware ID)/_CID -(Compatibility ID) definition as a way to uniquely identify the -different ALPS variants but there did not appear to be a 1:1 mapping. -In fact, it appeared to be an m:n mapping between the _HID and actual -hardware type. - -Detection ---------- - -All ALPS touchpads should respond to the "E6 report" command sequence: -E8-E6-E6-E6-E9. An ALPS touchpad should respond with either 00-00-0A or -00-00-64 if no buttons are pressed. The bits 0-2 of the first byte will be 1s -if some buttons are pressed. - -If the E6 report is successful, the touchpad model is identified using the "E7 -report" sequence: E8-E7-E7-E7-E9. The response is the model signature and is -matched against known models in the alps_model_data_array. - -For older touchpads supporting protocol versions 3 and 4, the E7 report -model signature is always 73-02-64. To differentiate between these -versions, the response from the "Enter Command Mode" sequence must be -inspected as described below. - -The new ALPS touchpads have an E7 signature of 73-03-50 or 73-03-0A but -seem to be better differentiated by the EC Command Mode response. - -Command Mode ------------- - -Protocol versions 3 and 4 have a command mode that is used to read and write -one-byte device registers in a 16-bit address space. 
The command sequence -EC-EC-EC-E9 places the device in command mode, and the device will respond -with 88-07 followed by a third byte. This third byte can be used to determine -whether the devices uses the version 3 or 4 protocol. - -To exit command mode, PSMOUSE_CMD_SETSTREAM (EA) is sent to the touchpad. - -While in command mode, register addresses can be set by first sending a -specific command, either EC for v3 devices or F5 for v4 devices. Then the -address is sent one nibble at a time, where each nibble is encoded as a -command with optional data. This encoding differs slightly between the v3 and -v4 protocols. - -Once an address has been set, the addressed register can be read by sending -PSMOUSE_CMD_GETINFO (E9). The first two bytes of the response contains the -address of the register being read, and the third contains the value of the -register. Registers are written by writing the value one nibble at a time -using the same encoding used for addresses. - -For the new ALPS touchpads, the EC command is used to enter command -mode. The response in the new ALPS touchpads is significantly different, -and more important in determining the behavior. This code has been -separated from the original alps_model_data table and put in the -alps_identify function. For example, there seem to be two hardware init -sequences for the "Dolphin" touchpads as determined by the second byte -of the EC response. - -Packet Format -------------- - -In the following tables, the following notation is used. - - CAPITALS = stick, miniscules = touchpad - -?'s can have different meanings on different models, such as wheel rotation, -extra buttons, stick buttons on a dualpoint, etc. - -PS/2 packet format ------------------- - - byte 0: 0 0 YSGN XSGN 1 M R L - byte 1: X7 X6 X5 X4 X3 X2 X1 X0 - byte 2: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 - -Note that the device never signals overflow condition. - -For protocol version 2 devices when the trackpoint is used, and no fingers -are on the touchpad, the M R L bits signal the combined status of both the -pointingstick and touchpad buttons. - -ALPS Absolute Mode - Protocol Version 1 --------------------------------------- - - byte 0: 1 0 0 0 1 x9 x8 x7 - byte 1: 0 x6 x5 x4 x3 x2 x1 x0 - byte 2: 0 ? ? l r ? fin ges - byte 3: 0 ? ? ? ? y9 y8 y7 - byte 4: 0 y6 y5 y4 y3 y2 y1 y0 - byte 5: 0 z6 z5 z4 z3 z2 z1 z0 - -ALPS Absolute Mode - Protocol Version 2 ---------------------------------------- - - byte 0: 1 ? ? ? 1 PSM PSR PSL - byte 1: 0 x6 x5 x4 x3 x2 x1 x0 - byte 2: 0 x10 x9 x8 x7 ? fin ges - byte 3: 0 y9 y8 y7 1 M R L - byte 4: 0 y6 y5 y4 y3 y2 y1 y0 - byte 5: 0 z6 z5 z4 z3 z2 z1 z0 - -Protocol Version 2 DualPoint devices send standard PS/2 mouse packets for -the DualPoint Stick. The M, R and L bits signal the combined status of both -the pointingstick and touchpad buttons, except for Dell dualpoint devices -where the pointingstick buttons get reported separately in the PSM, PSR -and PSL bits. 
- -Dualpoint device -- interleaved packet format ---------------------------------------------- - - byte 0: 1 1 0 0 1 1 1 1 - byte 1: 0 x6 x5 x4 x3 x2 x1 x0 - byte 2: 0 x10 x9 x8 x7 0 fin ges - byte 3: 0 0 YSGN XSGN 1 1 1 1 - byte 4: X7 X6 X5 X4 X3 X2 X1 X0 - byte 5: Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 - byte 6: 0 y9 y8 y7 1 m r l - byte 7: 0 y6 y5 y4 y3 y2 y1 y0 - byte 8: 0 z6 z5 z4 z3 z2 z1 z0 - -Devices which use the interleaving format normally send standard PS/2 mouse -packets for the DualPoint Stick + ALPS Absolute Mode packets for the -touchpad, switching to the interleaved packet format when both the stick and -the touchpad are used at the same time. - -ALPS Absolute Mode - Protocol Version 3 ---------------------------------------- - -ALPS protocol version 3 has three different packet formats. The first two are -associated with touchpad events, and the third is associated with trackstick -events. - -The first type is the touchpad position packet. - - byte 0: 1 ? x1 x0 1 1 1 1 - byte 1: 0 x10 x9 x8 x7 x6 x5 x4 - byte 2: 0 y10 y9 y8 y7 y6 y5 y4 - byte 3: 0 M R L 1 m r l - byte 4: 0 mt x3 x2 y3 y2 y1 y0 - byte 5: 0 z6 z5 z4 z3 z2 z1 z0 - -Note that for some devices the trackstick buttons are reported in this packet, -and on others it is reported in the trackstick packets. - -The second packet type contains bitmaps representing the x and y axes. In the -bitmaps a given bit is set if there is a finger covering that position on the -given axis. Thus the bitmap packet can be used for low-resolution multi-touch -data, although finger tracking is not possible. This packet also encodes the -number of contacts (f1 and f0 in the table below). - - byte 0: 1 1 x1 x0 1 1 1 1 - byte 1: 0 x8 x7 x6 x5 x4 x3 x2 - byte 2: 0 y7 y6 y5 y4 y3 y2 y1 - byte 3: 0 y10 y9 y8 1 1 1 1 - byte 4: 0 x14 x13 x12 x11 x10 x9 y0 - byte 5: 0 1 ? ? ? ? f1 f0 - -This packet only appears after a position packet with the mt bit set, and -usually only appears when there are two or more contacts (although -occasionally it's seen with only a single contact). - -The final v3 packet type is the trackstick packet. - - byte 0: 1 1 x7 y7 1 1 1 1 - byte 1: 0 x6 x5 x4 x3 x2 x1 x0 - byte 2: 0 y6 y5 y4 y3 y2 y1 y0 - byte 3: 0 1 0 0 1 0 0 0 - byte 4: 0 z4 z3 z2 z1 z0 ? ? - byte 5: 0 0 1 1 1 1 1 1 - -ALPS Absolute Mode - Protocol Version 4 ---------------------------------------- - -Protocol version 4 has an 8-byte packet format. - - byte 0: 1 ? x1 x0 1 1 1 1 - byte 1: 0 x10 x9 x8 x7 x6 x5 x4 - byte 2: 0 y10 y9 y8 y7 y6 y5 y4 - byte 3: 0 1 x3 x2 y3 y2 y1 y0 - byte 4: 0 ? ? ? 1 ? r l - byte 5: 0 z6 z5 z4 z3 z2 z1 z0 - byte 6: bitmap data (described below) - byte 7: bitmap data (described below) - -The last two bytes represent a partial bitmap packet, with 3 full packets -required to construct a complete bitmap packet. Once assembled, the 6-byte -bitmap packet has the following format: - - byte 0: 0 1 x7 x6 x5 x4 x3 x2 - byte 1: 0 x1 x0 y4 y3 y2 y1 y0 - byte 2: 0 0 ? x14 x13 x12 x11 x10 - byte 3: 0 x9 x8 y9 y8 y7 y6 y5 - byte 4: 0 0 0 0 0 0 0 0 - byte 5: 0 0 0 0 0 0 0 y10 - -There are several things worth noting here. - - 1) In the bitmap data, bit 6 of byte 0 serves as a sync byte to - identify the first fragment of a bitmap packet. - - 2) The bitmaps represent the same data as in the v3 bitmap packets, although - the packet layout is different. - - 3) There doesn't seem to be a count of the contact points anywhere in the v4 - protocol packets. Deriving a count of contact points must be done by - analyzing the bitmaps. 
- - 4) There is a 3 to 1 ratio of position packets to bitmap packets. Therefore - MT position can only be updated for every third ST position update, and - the count of contact points can only be updated every third packet as - well. - -So far no v4 devices with tracksticks have been encountered. - -ALPS Absolute Mode - Protocol Version 5 ---------------------------------------- -This is basically Protocol Version 3 but with different logic for packet -decode. It uses the same alps_process_touchpad_packet_v3 call with a -specialized decode_fields function pointer to correctly interpret the -packets. This appears to only be used by the Dolphin devices. - -For single-touch, the 6-byte packet format is: - - byte 0: 1 1 0 0 1 0 0 0 - byte 1: 0 x6 x5 x4 x3 x2 x1 x0 - byte 2: 0 y6 y5 y4 y3 y2 y1 y0 - byte 3: 0 M R L 1 m r l - byte 4: y10 y9 y8 y7 x10 x9 x8 x7 - byte 5: 0 z6 z5 z4 z3 z2 z1 z0 - -For mt, the format is: - - byte 0: 1 1 1 n3 1 n2 n1 x24 - byte 1: 1 y7 y6 y5 y4 y3 y2 y1 - byte 2: ? x2 x1 y12 y11 y10 y9 y8 - byte 3: 0 x23 x22 x21 x20 x19 x18 x17 - byte 4: 0 x9 x8 x7 x6 x5 x4 x3 - byte 5: 0 x16 x15 x14 x13 x12 x11 x10 - -ALPS Absolute Mode - Protocol Version 6 ---------------------------------------- - -For trackstick packet, the format is: - - byte 0: 1 1 1 1 1 1 1 1 - byte 1: 0 X6 X5 X4 X3 X2 X1 X0 - byte 2: 0 Y6 Y5 Y4 Y3 Y2 Y1 Y0 - byte 3: ? Y7 X7 ? ? M R L - byte 4: Z7 Z6 Z5 Z4 Z3 Z2 Z1 Z0 - byte 5: 0 1 1 1 1 1 1 1 - -For touchpad packet, the format is: - - byte 0: 1 1 1 1 1 1 1 1 - byte 1: 0 0 0 0 x3 x2 x1 x0 - byte 2: 0 0 0 0 y3 y2 y1 y0 - byte 3: ? x7 x6 x5 x4 ? r l - byte 4: ? y7 y6 y5 y4 ? ? ? - byte 5: z7 z6 z5 z4 z3 z2 z1 z0 - -(v6 touchpad does not have middle button) - -ALPS Absolute Mode - Protocol Version 7 ---------------------------------------- - -For trackstick packet, the format is: - - byte 0: 0 1 0 0 1 0 0 0 - byte 1: 1 1 * * 1 M R L - byte 2: X7 1 X5 X4 X3 X2 X1 X0 - byte 3: Z6 1 Y6 X6 1 Y2 Y1 Y0 - byte 4: Y7 0 Y5 Y4 Y3 1 1 0 - byte 5: T&P 0 Z5 Z4 Z3 Z2 Z1 Z0 - -For touchpad packet, the format is: - - packet-fmt b7 b6 b5 b4 b3 b2 b1 b0 - byte 0: TWO & MULTI L 1 R M 1 Y0-2 Y0-1 Y0-0 - byte 0: NEW L 1 X1-5 1 1 Y0-2 Y0-1 Y0-0 - byte 1: Y0-10 Y0-9 Y0-8 Y0-7 Y0-6 Y0-5 Y0-4 Y0-3 - byte 2: X0-11 1 X0-10 X0-9 X0-8 X0-7 X0-6 X0-5 - byte 3: X1-11 1 X0-4 X0-3 1 X0-2 X0-1 X0-0 - byte 4: TWO X1-10 TWO X1-9 X1-8 X1-7 X1-6 X1-5 X1-4 - byte 4: MULTI X1-10 TWO X1-9 X1-8 X1-7 X1-6 Y1-5 1 - byte 4: NEW X1-10 TWO X1-9 X1-8 X1-7 X1-6 0 0 - byte 5: TWO & NEW Y1-10 0 Y1-9 Y1-8 Y1-7 Y1-6 Y1-5 Y1-4 - byte 5: MULTI Y1-10 0 Y1-9 Y1-8 Y1-7 Y1-6 F-1 F-0 - - L: Left button - R / M: Non-clickpads: Right / Middle button - Clickpads: When > 2 fingers are down, and some fingers - are in the button area, then the 2 coordinates reported - are for fingers outside the button area and these report - extra fingers being present in the right / left button - area. Note these fingers are not added to the F field! - so if a TWO packet is received and R = 1 then there are - 3 fingers down, etc. - TWO: 1: Two touches present, byte 0/4/5 are in TWO fmt - 0: If byte 4 bit 0 is 1, then byte 0/4/5 are in MULTI fmt - otherwise byte 0 bit 4 must be set and byte 0/4/5 are - in NEW fmt - F: Number of fingers - 3, 0 means 3 fingers, 1 means 4 ... - - -ALPS Absolute Mode - Protocol Version 8 ---------------------------------------- - -Spoken by SS4 (73 03 14) and SS5 (73 03 28) hardware. - -The packet type is given by the APD field, bits 4-5 of byte 3. 
- -Touchpad packet (APD = 0x2): - - b7 b6 b5 b4 b3 b2 b1 b0 - byte 0: SWM SWR SWL 1 1 0 0 X7 - byte 1: 0 X6 X5 X4 X3 X2 X1 X0 - byte 2: 0 Y6 Y5 Y4 Y3 Y2 Y1 Y0 - byte 3: 0 T&P 1 0 1 0 0 Y7 - byte 4: 0 Z6 Z5 Z4 Z3 Z2 Z1 Z0 - byte 5: 0 0 0 0 0 0 0 0 - -SWM, SWR, SWL: Middle, Right, and Left button states - -Touchpad 1 Finger packet (APD = 0x0): - - b7 b6 b5 b4 b3 b2 b1 b0 - byte 0: SWM SWR SWL 1 1 X2 X1 X0 - byte 1: X9 X8 X7 1 X6 X5 X4 X3 - byte 2: 0 X11 X10 LFB Y3 Y2 Y1 Y0 - byte 3: Y5 Y4 0 0 1 TAPF2 TAPF1 TAPF0 - byte 4: Zv7 Y11 Y10 1 Y9 Y8 Y7 Y6 - byte 5: Zv6 Zv5 Zv4 0 Zv3 Zv2 Zv1 Zv0 - -TAPF: ??? -LFB: ??? - -Touchpad 2 Finger packet (APD = 0x1): - - b7 b6 b5 b4 b3 b2 b1 b0 - byte 0: SWM SWR SWL 1 1 AX6 AX5 AX4 - byte 1: AX11 AX10 AX9 AX8 AX7 AZ1 AY4 AZ0 - byte 2: AY11 AY10 AY9 CONT AY8 AY7 AY6 AY5 - byte 3: 0 0 0 1 1 BX6 BX5 BX4 - byte 4: BX11 BX10 BX9 BX8 BX7 BZ1 BY4 BZ0 - byte 5: BY11 BY10 BY9 0 BY8 BY7 BY5 BY5 - -CONT: A 3-or-4 Finger packet is to follow - -Touchpad 3-or-4 Finger packet (APD = 0x3): - - b7 b6 b5 b4 b3 b2 b1 b0 - byte 0: SWM SWR SWL 1 1 AX6 AX5 AX4 - byte 1: AX11 AX10 AX9 AX8 AX7 AZ1 AY4 AZ0 - byte 2: AY11 AY10 AY9 OVF AY8 AY7 AY6 AY5 - byte 3: 0 0 1 1 1 BX6 BX5 BX4 - byte 4: BX11 BX10 BX9 BX8 BX7 BZ1 BY4 BZ0 - byte 5: BY11 BY10 BY9 0 BY8 BY7 BY5 BY5 - -OVF: 5th finger detected diff --git a/Documentation/input/amijoy.txt b/Documentation/input/amijoy.txt deleted file mode 100644 index 7dc4f175943cff2854dd1d54ff93c03be98b2b56..0000000000000000000000000000000000000000 --- a/Documentation/input/amijoy.txt +++ /dev/null @@ -1,184 +0,0 @@ -Amiga 4-joystick parport extension -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Parallel port pins: - - (2) - Up1 (6) - Up2 - (3) - Down1 (7) - Down2 - (4) - Left1 (8) - Left2 - (5) - Right1 (9) - Right2 -(13) - Fire1 (11) - Fire2 -(18) - Gnd1 (18) - Gnd2 - -Amiga digital joystick pinout -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -(1) - Up -(2) - Down -(3) - Left -(4) - Right -(5) - n/c -(6) - Fire button -(7) - +5V (50mA) -(8) - Gnd -(9) - Thumb button - -Amiga mouse pinout -~~~~~~~~~~~~~~~~~~ -(1) - V-pulse -(2) - H-pulse -(3) - VQ-pulse -(4) - HQ-pulse -(5) - Middle button -(6) - Left button -(7) - +5V (50mA) -(8) - Gnd -(9) - Right button - -Amiga analog joystick pinout -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -(1) - Top button -(2) - Top2 button -(3) - Trigger button -(4) - Thumb button -(5) - Analog X -(6) - n/c -(7) - +5V (50mA) -(8) - Gnd -(9) - Analog Y - -Amiga lightpen pinout -~~~~~~~~~~~~~~~~~~~~~ -(1) - n/c -(2) - n/c -(3) - n/c -(4) - n/c -(5) - Touch button -(6) - /Beamtrigger -(7) - +5V (50mA) -(8) - Gnd -(9) - Stylus button - -------------------------------------------------------------------------------- - -NAME rev ADDR type chip Description -JOY0DAT 00A R Denise Joystick-mouse 0 data (left vert, horiz) -JOY1DAT 00C R Denise Joystick-mouse 1 data (right vert,horiz) - - These addresses each read a 16 bit register. These in turn - are loaded from the MDAT serial stream and are clocked in on - the rising edge of SCLK. MLD output is used to parallel load - the external parallel-to-serial converter.This in turn is - loaded with the 4 quadrature inputs from each of two game - controller ports (8 total) plus 8 miscellaneous control bits - which are new for LISA and can be read in upper 8 bits of - LISAID. 
- Register bits are as follows: - Mouse counter usage (pins 1,3 =Yclock, pins 2,4 =Xclock) - - BIT# 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 -JOY0DAT Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 X7 X6 X5 X4 X3 X2 X1 X0 -JOY1DAT Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 X7 X6 X5 X4 X3 X2 X1 X0 - - 0=LEFT CONTROLLER PAIR, 1=RIGHT CONTROLLER PAIR. - (4 counters total). The bit usage for both left and right - addresses is shown below. Each 6 bit counter (Y7-Y2,X7-X2) is - clocked by 2 of the signals input from the mouse serial - stream. Starting with first bit received: - - +-------------------+-----------------------------------------+ - | Serial | Bit Name | Description | - +--------+----------+-----------------------------------------+ - | 0 | M0H | JOY0DAT Horizontal Clock | - | 1 | M0HQ | JOY0DAT Horizontal Clock (quadrature) | - | 2 | M0V | JOY0DAT Vertical Clock | - | 3 | M0VQ | JOY0DAT Vertical Clock (quadrature) | - | 4 | M1V | JOY1DAT Horizontal Clock | - | 5 | M1VQ | JOY1DAT Horizontal Clock (quadrature) | - | 6 | M1V | JOY1DAT Vertical Clock | - | 7 | M1VQ | JOY1DAT Vertical Clock (quadrature) | - +--------+----------+-----------------------------------------+ - - Bits 1 and 0 of each counter (Y1-Y0,X1-X0) may be - read to determine the state of the related input signal pair. - This allows these pins to double as joystick switch inputs. - Joystick switch closures can be deciphered as follows: - - +------------+------+---------------------------------+ - | Directions | Pin# | Counter bits | - +------------+------+---------------------------------+ - | Forward | 1 | Y1 xor Y0 (BIT#09 xor BIT#08) | - | Left | 3 | Y1 | - | Back | 2 | X1 xor X0 (BIT#01 xor BIT#00) | - | Right | 4 | X1 | - +------------+------+---------------------------------+ - -------------------------------------------------------------------------------- - -NAME rev ADDR type chip Description -JOYTEST 036 W Denise Write to all 4 joystick-mouse counters at once. - - Mouse counter write test data: - BIT# 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 - JOYxDAT Y7 Y6 Y5 Y4 Y3 Y2 xx xx X7 X6 X5 X4 X3 X2 xx xx - JOYxDAT Y7 Y6 Y5 Y4 Y3 Y2 xx xx X7 X6 X5 X4 X3 X2 xx xx - -------------------------------------------------------------------------------- - -NAME rev ADDR type chip Description -POT0DAT h 012 R Paula Pot counter data left pair (vert, horiz) -POT1DAT h 014 R Paula Pot counter data right pair (vert,horiz) - - These addresses each read a pair of 8 bit pot counters. - (4 counters total). The bit assignment for both - addresses is shown below. The counters are stopped by signals - from 2 controller connectors (left-right) with 2 pins each. - - BIT# 15 14 13 12 11 10 09 08 07 06 05 04 03 02 01 00 - RIGHT Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 X7 X6 X5 X4 X3 X2 X1 X0 - LEFT Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y0 X7 X6 X5 X4 X3 X2 X1 X0 - - +--------------------------+-------+ - | CONNECTORS | PAULA | - +-------+------+-----+-----+-------+ - | Loc. | Dir. | Sym | pin | pin | - +-------+------+-----+-----+-------+ - | RIGHT | Y | RX | 9 | 33 | - | RIGHT | X | RX | 5 | 32 | - | LEFT | Y | LY | 9 | 36 | - | LEFT | X | LX | 5 | 35 | - +-------+------+-----+-----+-------+ - - With normal (NTSC or PAL) horiz. line rate, the pots will - give a full scale (FF) reading with about 500kohms in one - frame time. With proportionally faster horiz line times, - the counters will count proportionally faster. - This should be noted when doing variable beam displays. 
- -------------------------------------------------------------------------------- - -NAME rev ADDR type chip Description -POTGO 034 W Paula Pot port (4 bit) bi-direction and data, and pot counter start. - -------------------------------------------------------------------------------- - -NAME rev ADDR type chip Description -POTINP 016 R Paula Pot pin data read - - This register controls a 4 bit bi-direction I/O port - that shares the same 4 pins as the 4 pot counters above. - - +-------+----------+---------------------------------------------+ - | BIT# | FUNCTION | DESCRIPTION | - +-------+----------+---------------------------------------------+ - | 15 | OUTRY | Output enable for Paula pin 33 | - | 14 | DATRY | I/O data Paula pin 33 | - | 13 | OUTRX | Output enable for Paula pin 32 | - | 12 | DATRX | I/O data Paula pin 32 | - | 11 | OUTLY | Out put enable for Paula pin 36 | - | 10 | DATLY | I/O data Paula pin 36 | - | 09 | OUTLX | Output enable for Paula pin 35 | - | 08 | DATLX | I/O data Paula pin 35 | - | 07-01 | X | Not used | - | 00 | START | Start pots (dump capacitors,start counters) | - +-------+----------+---------------------------------------------+ - -------------------------------------------------------------------------------- diff --git a/Documentation/input/appletouch.txt b/Documentation/input/appletouch.txt deleted file mode 100644 index b13de3f89108df1e7b9cd585c97aa81b458261d9..0000000000000000000000000000000000000000 --- a/Documentation/input/appletouch.txt +++ /dev/null @@ -1,85 +0,0 @@ -Apple Touchpad Driver (appletouch) ----------------------------------- - Copyright (C) 2005 Stelian Pop - -appletouch is a Linux kernel driver for the USB touchpad found on post -February 2005 and October 2005 Apple Aluminium Powerbooks. - -This driver is derived from Johannes Berg's appletrackpad driver[1], but it has -been improved in some areas: - * appletouch is a full kernel driver, no userspace program is necessary - * appletouch can be interfaced with the synaptics X11 driver, in order - to have touchpad acceleration, scrolling, etc. - -Credits go to Johannes Berg for reverse-engineering the touchpad protocol, -Frank Arnold for further improvements, and Alex Harper for some additional -information about the inner workings of the touchpad sensors. Michael -Hanselmann added support for the October 2005 models. - -Usage: ------- - -In order to use the touchpad in the basic mode, compile the driver and load -the module. A new input device will be detected and you will be able to read -the mouse data from /dev/input/mice (using gpm, or X11). - -In X11, you can configure the touchpad to use the synaptics X11 driver, which -will give additional functionalities, like acceleration, scrolling, 2 finger -tap for middle button mouse emulation, 3 finger tap for right button mouse -emulation, etc. In order to do this, make sure you're using a recent version of -the synaptics driver (tested with 0.14.2, available from [2]), and configure a -new input device in your X11 configuration file (take a look below for an -example). For additional configuration, see the synaptics driver documentation. 
- - Section "InputDevice" - Identifier "Synaptics Touchpad" - Driver "synaptics" - Option "SendCoreEvents" "true" - Option "Device" "/dev/input/mice" - Option "Protocol" "auto-dev" - Option "LeftEdge" "0" - Option "RightEdge" "850" - Option "TopEdge" "0" - Option "BottomEdge" "645" - Option "MinSpeed" "0.4" - Option "MaxSpeed" "1" - Option "AccelFactor" "0.02" - Option "FingerLow" "0" - Option "FingerHigh" "30" - Option "MaxTapMove" "20" - Option "MaxTapTime" "100" - Option "HorizScrollDelta" "0" - Option "VertScrollDelta" "30" - Option "SHMConfig" "on" - EndSection - - Section "ServerLayout" - ... - InputDevice "Mouse" - InputDevice "Synaptics Touchpad" - ... - EndSection - -Fuzz problems: --------------- - -The touchpad sensors are very sensitive to heat, and will generate a lot of -noise when the temperature changes. This is especially true when you power-on -the laptop for the first time. - -The appletouch driver tries to handle this noise and auto adapt itself, but it -is not perfect. If finger movements are not recognized anymore, try reloading -the driver. - -You can activate debugging using the 'debug' module parameter. A value of 0 -deactivates any debugging, 1 activates tracing of invalid samples, 2 activates -full tracing (each sample is being traced): - modprobe appletouch debug=1 - or - echo "1" > /sys/module/appletouch/parameters/debug - -Links: ------- - -[1]: http://johannes.sipsolutions.net/PowerBook/touchpad/ -[2]: http://web.archive.org/web/*/http://web.telia.com/~u89404340/touchpad/index.html diff --git a/Documentation/input/atarikbd.txt b/Documentation/input/atarikbd.txt deleted file mode 100644 index f3a3ba8847ba7749c0d881220f50a56c217d8c31..0000000000000000000000000000000000000000 --- a/Documentation/input/atarikbd.txt +++ /dev/null @@ -1,709 +0,0 @@ -Intelligent Keyboard (ikbd) Protocol - - -1. Introduction - -The Atari Corp. Intelligent Keyboard (ikbd) is a general purpose keyboard -controller that is flexible enough that it can be used in a variety of -products without modification. The keyboard, with its microcontroller, -provides a convenient connection point for a mouse and switch-type joysticks. -The ikbd processor also maintains a time-of-day clock with one second -resolution. -The ikbd has been designed to be general enough that it can be used with a -variety of new computer products. Product variations in a number of -keyswitches, mouse resolution, etc. can be accommodated. -The ikbd communicates with the main processor over a high speed bi-directional -serial interface. It can function in a variety of modes to facilitate -different applications of the keyboard, joysticks, or mouse. Limited use of -the controller is possible in applications in which only a unidirectional -communications medium is available by carefully designing the default modes. - -3. Keyboard - -The keyboard always returns key make/break scan codes. The ikbd generates -keyboard scan codes for each key press and release. The key scan make (key -closure) codes start at 1, and are defined in Appendix A. For example, the -ISO key position in the scan code table should exist even if no keyswitch -exists in that position on a particular keyboard. The break code for each key -is obtained by ORing 0x80 with the make code. 
- -The special codes 0xF6 through 0xFF are reserved for use as follows: - 0xF6 status report - 0xF7 absolute mouse position record - 0xF8-0xFB relative mouse position records (lsbs determined by - mouse button states) - 0xFC time-of-day - 0xFD joystick report (both sticks) - 0xFE joystick 0 event - 0xFF joystick 1 event - -The two shift keys return different scan codes in this mode. The ENTER key -and the RETurn key are also distinct. - -4. Mouse - -The mouse port should be capable of supporting a mouse with resolution of -approximately 200 counts (phase changes or 'clicks') per inch of travel. The -mouse should be scanned at a rate that will permit accurate tracking at -velocities up to 10 inches per second. -The ikbd can report mouse motion in three distinctly different ways. It can -report relative motion, absolute motion in a coordinate system maintained -within the ikbd, or by converting mouse motion into keyboard cursor control -key equivalents. -The mouse buttons can be treated as part of the mouse or as additional -keyboard keys. - -4.1 Relative Position Reporting - -In relative position mode, the ikbd will return relative mouse position -records whenever a mouse event occurs. A mouse event consists of a mouse -button being pressed or released, or motion in either axis exceeding a -settable threshold of motion. Regardless of the threshold, all bits of -resolution are returned to the host computer. -Note that the ikbd may return mouse relative position reports with -significantly more than the threshold delta x or y. This may happen since no -relative mouse motion events will be generated: (a) while the keyboard has -been 'paused' ( the event will be stored until keyboard communications is -resumed) (b) while any event is being transmitted. - -The relative mouse position record is a three byte record of the form -(regardless of keyboard mode): - %111110xy ; mouse position record flag - ; where y is the right button state - ; and x is the left button state - X ; delta x as twos complement integer - Y ; delta y as twos complement integer - -Note that the value of the button state bits should be valid even if the -MOUSE BUTTON ACTION has set the buttons to act like part of the keyboard. -If the accumulated motion before the report packet is generated exceeds the -+127...-128 range, the motion is broken into multiple packets. -Note that the sign of the delta y reported is a function of the Y origin -selected. - -4.2 Absolute Position reporting - -The ikbd can also maintain absolute mouse position. Commands exist for -resetting the mouse position, setting X/Y scaling, and interrogating the -current mouse position. - -4.3 Mouse Cursor Key Mode - -The ikbd can translate mouse motion into the equivalent cursor keystrokes. -The number of mouse clicks per keystroke is independently programmable in -each axis. The ikbd internally maintains mouse motion information to the -highest resolution available, and merely generates a pair of cursor key events -for each multiple of the scale factor. -Mouse motion produces the cursor key make code immediately followed by the -break code for the appropriate cursor key. The mouse buttons produce scan -codes above those normally assigned for the largest envisioned keyboard (i.e. -LEFT=0x74 & RIGHT=0x75). - -5. Joystick - -5.1 Joystick Event Reporting - -In this mode, the ikbd generates a record whenever the joystick position is -changed (i.e. for each opening or closing of a joystick switch or trigger). 
- -The joystick event record is two bytes of the form: - %1111111x ; Joystick event marker - ; where x is Joystick 0 or 1 - %x000yyyy ; where yyyy is the stick position - ; and x is the trigger - -5.2 Joystick Interrogation - -The current state of the joystick ports may be interrogated at any time in -this mode by sending an 'Interrogate Joystick' command to the ikbd. - -The ikbd response to joystick interrogation is a three byte report of the form - 0xFD ; joystick report header - %x000yyyy ; Joystick 0 - %x000yyyy ; Joystick 1 - ; where x is the trigger - ; and yyy is the stick position - -5.3 Joystick Monitoring - -A mode is available that devotes nearly all of the keyboard communications -time to reporting the state of the joystick ports at a user specifiable rate. -It remains in this mode until reset or commanded into another mode. The PAUSE -command in this mode not only stop the output but also temporarily stops -scanning the joysticks (samples are not queued). - -5.4 Fire Button Monitoring - -A mode is provided to permit monitoring a single input bit at a high rate. In -this mode the ikbd monitors the state of the Joystick 1 fire button at the -maximum rate permitted by the serial communication channel. The data is packed -8 bits per byte for transmission to the host. The ikbd remains in this mode -until reset or commanded into another mode. The PAUSE command in this mode not -only stops the output but also temporarily stops scanning the button (samples -are not queued). - -5.5 Joystick Key Code Mode - -The ikbd may be commanded to translate the use of either joystick into the -equivalent cursor control keystroke(s). The ikbd provides a single breakpoint -velocity joystick cursor. -Joystick events produce the make code, immediately followed by the break code -for the appropriate cursor motion keys. The trigger or fire buttons of the -joysticks produce pseudo key scan codes above those used by the largest key -matrix envisioned (i.e. JOYSTICK0=0x74, JOYSTICK1=0x75). - -6. Time-of-Day Clock - -The ikbd also maintains a time-of-day clock for the system. Commands are -available to set and interrogate the timer-of-day clock. Time-keeping is -maintained down to a resolution of one second. - -7. Status Inquiries - -The current state of ikbd modes and parameters may be found by sending status -inquiry commands that correspond to the ikbd set commands. - -8. Power-Up Mode - -The keyboard controller will perform a simple self-test on power-up to detect -major controller faults (ROM checksum and RAM test) and such things as stuck -keys. Any keys down at power-up are presumed to be stuck, and their BREAK -(sic) code is returned (which without the preceding MAKE code is a flag for a -keyboard error). If the controller self-test completes without error, the code -0xF0 is returned. (This code will be used to indicate the version/release of -the ikbd controller. The first release of the ikbd is version 0xF0, should -there be a second release it will be 0xF1, and so on.) -The ikbd defaults to a mouse position reporting with threshold of 1 unit in -either axis and the Y=0 origin at the top of the screen, and joystick event -reporting mode for joystick 1, with both buttons being logically assigned to -the mouse. After any joystick command, the ikbd assumes that joysticks are -connected to both Joystick0 and Joystick1. Any mouse command (except MOUSE -DISABLE) then causes port 0 to again be scanned as if it were a mouse, and -both buttons are logically connected to it. 
If a mouse disable command is -received while port 0 is presumed to be a mouse, the button is logically -assigned to Joystick1 (until the mouse is reenabled by another mouse command). - -9. ikbd Command Set - -This section contains a list of commands that can be sent to the ikbd. Command -codes (such as 0x00) which are not specified should perform no operation -(NOPs). - -9.1 RESET - - 0x80 - 0x01 - -N.B. The RESET command is the only two byte command understood by the ikbd. -Any byte following an 0x80 command byte other than 0x01 is ignored (and causes -the 0x80 to be ignored). -A reset may also be caused by sending a break lasting at least 200mS to the -ikbd. -Executing the RESET command returns the keyboard to its default (power-up) -mode and parameter settings. It does not affect the time-of-day clock. -The RESET command or function causes the ikbd to perform a simple self-test. -If the test is successful, the ikbd will send the code of 0xF0 within 300mS -of receipt of the RESET command (or the end of the break, or power-up). The -ikbd will then scan the key matrix for any stuck (closed) keys. Any keys found -closed will cause the break scan code to be generated (the break code arriving -without being preceded by the make code is a flag for a key matrix error). - -9.2. SET MOUSE BUTTON ACTION - - 0x07 - %00000mss ; mouse button action - ; (m is presumed = 1 when in MOUSE KEYCODE mode) - ; mss=0xy, mouse button press or release causes mouse - ; position report - ; where y=1, mouse key press causes absolute report - ; and x=1, mouse key release causes absolute report - ; mss=100, mouse buttons act like keys - -This command sets how the ikbd should treat the buttons on the mouse. The -default mouse button action mode is %00000000, the buttons are treated as part -of the mouse logically. -When buttons act like keys, LEFT=0x74 & RIGHT=0x75. - -9.3 SET RELATIVE MOUSE POSITION REPORTING - - 0x08 - -Set relative mouse position reporting. (DEFAULT) Mouse position packets are -generated asynchronously by the ikbd whenever motion exceeds the setable -threshold in either axis (see SET MOUSE THRESHOLD). Depending upon the mouse -key mode, mouse position reports may also be generated when either mouse -button is pressed or released. Otherwise the mouse buttons behave as if they -were keyboard keys. - -9.4 SET ABSOLUTE MOUSE POSITIONING - - 0x09 - XMSB ; X maximum (in scaled mouse clicks) - XLSB - YMSB ; Y maximum (in scaled mouse clicks) - YLSB - -Set absolute mouse position maintenance. Resets the ikbd maintained X and Y -coordinates. -In this mode, the value of the internally maintained coordinates does NOT wrap -between 0 and large positive numbers. Excess motion below 0 is ignored. The -command sets the maximum positive value that can be attained in the scaled -coordinate system. Motion beyond that value is also ignored. - -9.5 SET MOUSE KEYCODE MOSE - - 0x0A - deltax ; distance in X clicks to return (LEFT) or (RIGHT) - deltay ; distance in Y clicks to return (UP) or (DOWN) - -Set mouse monitoring routines to return cursor motion keycodes instead of -either RELATIVE or ABSOLUTE motion records. The ikbd returns the appropriate -cursor keycode after mouse travel exceeding the user specified deltas in -either axis. When the keyboard is in key scan code mode, mouse motion will -cause the make code immediately followed by the break code. Note that this -command is not affected by the mouse motion origin. 
- -9..6 SET MOUSE THRESHOLD - - 0x0B - X ; x threshold in mouse ticks (positive integers) - Y ; y threshold in mouse ticks (positive integers) - -This command sets the threshold before a mouse event is generated. Note that -it does NOT affect the resolution of the data returned to the host. This -command is valid only in RELATIVE MOUSE POSITIONING mode. The thresholds -default to 1 at RESET (or power-up). - -9.7 SET MOUSE SCALE - - 0x0C - X ; horizontal mouse ticks per internal X - Y ; vertical mouse ticks per internal Y - -This command sets the scale factor for the ABSOLUTE MOUSE POSITIONING mode. -In this mode, the specified number of mouse phase changes ('clicks') must -occur before the internally maintained coordinate is changed by one -(independently scaled for each axis). Remember that the mouse position -information is available only by interrogating the ikbd in the ABSOLUTE MOUSE -POSITIONING mode unless the ikbd has been commanded to report on button press -or release (see SET MOSE BUTTON ACTION). - -9.8 INTERROGATE MOUSE POSITION - - 0x0D - Returns: - 0xF7 ; absolute mouse position header - BUTTONS - 0000dcba ; where a is right button down since last interrogation - ; b is right button up since last - ; c is left button down since last - ; d is left button up since last - XMSB ; X coordinate - XLSB - YMSB ; Y coordinate - YLSB - -The INTERROGATE MOUSE POSITION command is valid when in the ABSOLUTE MOUSE -POSITIONING mode, regardless of the setting of the MOUSE BUTTON ACTION. - -9.9 LOAD MOUSE POSITION - - 0x0E - 0x00 ; filler - XMSB ; X coordinate - XLSB ; (in scaled coordinate system) - YMSB ; Y coordinate - YLSB - -This command allows the user to preset the internally maintained absolute -mouse position. - -9.10 SET Y=0 AT BOTTOM - - 0x0F - -This command makes the origin of the Y axis to be at the bottom of the -logical coordinate system internal to the ikbd for all relative or absolute -mouse motion. This causes mouse motion toward the user to be negative in sign -and away from the user to be positive. - -9.11 SET Y=0 AT TOP - - 0x10 - -Makes the origin of the Y axis to be at the top of the logical coordinate -system within the ikbd for all relative or absolute mouse motion. (DEFAULT) -This causes mouse motion toward the user to be positive in sign and away from -the user to be negative. - -9.12 RESUME - - 0x11 - -Resume sending data to the host. Since any command received by the ikbd after -its output has been paused also causes an implicit RESUME this command can be -thought of as a NO OPERATION command. If this command is received by the ikbd -and it is not PAUSED, it is simply ignored. - -9.13 DISABLE MOUSE - - 0x12 - -All mouse event reporting is disabled (and scanning may be internally -disabled). Any valid mouse mode command resumes mouse motion monitoring. (The -valid mouse mode commands are SET RELATIVE MOUSE POSITION REPORTING, SET -ABSOLUTE MOUSE POSITIONING, and SET MOUSE KEYCODE MODE. ) -N.B. If the mouse buttons have been commanded to act like keyboard keys, this -command DOES affect their actions. - -9.14 PAUSE OUTPUT - - 0x13 - -Stop sending data to the host until another valid command is received. Key -matrix activity is still monitored and scan codes or ASCII characters enqueued -(up to the maximum supported by the microcontroller) to be sent when the host -allows the output to be resumed. If in the JOYSTICK EVENT REPORTING mode, -joystick events are also queued. -Mouse motion should be accumulated while the output is paused. 
If the ikbd is -in RELATIVE MOUSE POSITIONING REPORTING mode, motion is accumulated beyond the -normal threshold limits to produce the minimum number of packets necessary for -transmission when output is resumed. Pressing or releasing either mouse button -causes any accumulated motion to be immediately queued as packets, if the -mouse is in RELATIVE MOUSE POSITION REPORTING mode. -Because of the limitations of the microcontroller memory this command should -be used sparingly, and the output should not be shut of for more than -milliseconds at a time. -The output is stopped only at the end of the current 'even'. If the PAUSE -OUTPUT command is received in the middle of a multiple byte report, the packet -will still be transmitted to conclusion and then the PAUSE will take effect. -When the ikbd is in either the JOYSTICK MONITORING mode or the FIRE BUTTON -MONITORING mode, the PAUSE OUTPUT command also temporarily stops the -monitoring process (i.e. the samples are not enqueued for transmission). - -0.15 SET JOYSTICK EVENT REPORTING - - 0x14 - -Enter JOYSTICK EVENT REPORTING mode (DEFAULT). Each opening or closure of a -joystick switch or trigger causes a joystick event record to be generated. - -9.16 SET JOYSTICK INTERROGATION MODE - - 0x15 - -Disables JOYSTICK EVENT REPORTING. Host must send individual JOYSTICK -INTERROGATE commands to sense joystick state. - -9.17 JOYSTICK INTERROGATE - - 0x16 - -Return a record indicating the current state of the joysticks. This command -is valid in either the JOYSTICK EVENT REPORTING mode or the JOYSTICK -INTERROGATION MODE. - -9.18 SET JOYSTICK MONITORING - - 0x17 - rate ; time between samples in hundredths of a second - Returns: (in packets of two as long as in mode) - %000000xy ; where y is JOYSTICK1 Fire button - ; and x is JOYSTICK0 Fire button - %nnnnmmmm ; where m is JOYSTICK1 state - ; and n is JOYSTICK0 state - -Sets the ikbd to do nothing but monitor the serial command line, maintain the -time-of-day clock, and monitor the joystick. The rate sets the interval -between joystick samples. -N.B. The user should not set the rate higher than the serial communications -channel will allow the 2 bytes packets to be transmitted. - -9.19 SET FIRE BUTTON MONITORING - - 0x18 - Returns: (as long as in mode) - %bbbbbbbb ; state of the JOYSTICK1 fire button packed - ; 8 bits per byte, the first sample if the MSB - -Set the ikbd to do nothing but monitor the serial command line, maintain the -time-of-day clock, and monitor the fire button on Joystick 1. The fire button -is scanned at a rate that causes 8 samples to be made in the time it takes for -the previous byte to be sent to the host (i.e. scan rate = 8/10 * baud rate). -The sample interval should be as constant as possible. 
- -9.20 SET JOYSTICK KEYCODE MODE - - 0x19 - RX ; length of time (in tenths of seconds) until - ; horizontal velocity breakpoint is reached - RY ; length of time (in tenths of seconds) until - ; vertical velocity breakpoint is reached - TX ; length (in tenths of seconds) of joystick closure - ; until horizontal cursor key is generated before RX - ; has elapsed - TY ; length (in tenths of seconds) of joystick closure - ; until vertical cursor key is generated before RY - ; has elapsed - VX ; length (in tenths of seconds) of joystick closure - ; until horizontal cursor keystrokes are generated - ; after RX has elapsed - VY ; length (in tenths of seconds) of joystick closure - ; until vertical cursor keystrokes are generated - ; after RY has elapsed - -In this mode, joystick 0 is scanned in a way that simulates cursor keystrokes. -On initial closure, a keystroke pair (make/break) is generated. Then up to Rn -tenths of seconds later, keystroke pairs are generated every Tn tenths of -seconds. After the Rn breakpoint is reached, keystroke pairs are generated -every Vn tenths of seconds. This provides a velocity (auto-repeat) breakpoint -feature. -Note that by setting RX and/or Ry to zero, the velocity feature can be -disabled. The values of TX and TY then become meaningless, and the generation -of cursor 'keystrokes' is set by VX and VY. - -9.21 DISABLE JOYSTICKS - - 0x1A - -Disable the generation of any joystick events (and scanning may be internally -disabled). Any valid joystick mode command resumes joystick monitoring. (The -joystick mode commands are SET JOYSTICK EVENT REPORTING, SET JOYSTICK -INTERROGATION MODE, SET JOYSTICK MONITORING, SET FIRE BUTTON MONITORING, and -SET JOYSTICK KEYCODE MODE.) - -9.22 TIME-OF-DAY CLOCK SET - - 0x1B - YY ; year (2 least significant digits) - MM ; month - DD ; day - hh ; hour - mm ; minute - ss ; second - -All time-of-day data should be sent to the ikbd in packed BCD format. -Any digit that is not a valid BCD digit should be treated as a 'don't care' -and not alter that particular field of the date or time. This permits setting -only some subfields of the time-of-day clock. - -9.23 INTERROGATE TIME-OF-DAT CLOCK - - 0x1C - Returns: - 0xFC ; time-of-day event header - YY ; year (2 least significant digits) - MM ; month - DD ; day - hh ; hour - mm ; minute - ss ; second - - All time-of-day is sent in packed BCD format. - -9.24 MEMORY LOAD - - 0x20 - ADRMSB ; address in controller - ADRLSB ; memory to be loaded - NUM ; number of bytes (0-128) - { data } - -This command permits the host to load arbitrary values into the ikbd -controller memory. The time between data bytes must be less than 20ms. - -9.25 MEMORY READ - - 0x21 - ADRMSB ; address in controller - ADRLSB ; memory to be read - Returns: - 0xF6 ; status header - 0x20 ; memory access - { data } ; 6 data bytes starting at ADR - -This command permits the host to read from the ikbd controller memory. - -9.26 CONTROLLER EXECUTE - - 0x22 - ADRMSB ; address of subroutine in - ADRLSB ; controller memory to be called - -This command allows the host to command the execution of a subroutine in the -ikbd controller memory. - -9.27 STATUS INQUIRIES - - Status commands are formed by inclusively ORing 0x80 with the - relevant SET command. 
- - Example: - 0x88 (or 0x89 or 0x8A) ; request mouse mode - Returns: - 0xF6 ; status response header - mode ; 0x08 is RELATIVE - ; 0x09 is ABSOLUTE - ; 0x0A is KEYCODE - param1 ; 0 is RELATIVE - ; XMSB maximum if ABSOLUTE - ; DELTA X is KEYCODE - param2 ; 0 is RELATIVE - ; YMSB maximum if ABSOLUTE - ; DELTA Y is KEYCODE - param3 ; 0 if RELATIVE - ; or KEYCODE - ; YMSB is ABSOLUTE - param4 ; 0 if RELATIVE - ; or KEYCODE - ; YLSB is ABSOLUTE - 0 ; pad - 0 - -The STATUS INQUIRY commands request the ikbd to return either the current mode -or the parameters associated with a given command. All status reports are -padded to form 8 byte long return packets. The responses to the status -requests are designed so that the host may store them away (after stripping -off the status report header byte) and later send them back as commands to -ikbd to restore its state. The 0 pad bytes will be treated as NOPs by the -ikbd. - - Valid STATUS INQUIRY commands are: - - 0x87 mouse button action - 0x88 mouse mode - 0x89 - 0x8A - 0x8B mnouse threshold - 0x8C mouse scale - 0x8F mouse vertical coordinates - 0x90 ( returns 0x0F Y=0 at bottom - 0x10 Y=0 at top ) - 0x92 mouse enable/disable - ( returns 0x00 enabled) - 0x12 disabled ) - 0x94 joystick mode - 0x95 - 0x96 - 0x9A joystick enable/disable - ( returns 0x00 enabled - 0x1A disabled ) - -It is the (host) programmer's responsibility to have only one unanswered -inquiry in process at a time. -STATUS INQUIRY commands are not valid if the ikbd is in JOYSTICK MONITORING -mode or FIRE BUTTON MONITORING mode. - - -10. SCAN CODES - -The key scan codes returned by the ikbd are chosen to simplify the -implementation of GSX. - -GSX Standard Keyboard Mapping. - -Hex Keytop -01 Esc -02 1 -03 2 -04 3 -05 4 -06 5 -07 6 -08 7 -09 8 -0A 9 -0B 0 -0C - -0D == -0E BS -0F TAB -10 Q -11 W -12 E -13 R -14 T -15 Y -16 U -17 I -18 O -19 P -1A [ -1B ] -1C RET -1D CTRL -1E A -1F S -20 D -21 F -22 G -23 H -24 J -25 K -26 L -27 ; -28 ' -29 ` -2A (LEFT) SHIFT -2B \ -2C Z -2D X -2E C -2F V -30 B -31 N -32 M -33 , -34 . -35 / -36 (RIGHT) SHIFT -37 { NOT USED } -38 ALT -39 SPACE BAR -3A CAPS LOCK -3B F1 -3C F2 -3D F3 -3E F4 -3F F5 -40 F6 -41 F7 -42 F8 -43 F9 -44 F10 -45 { NOT USED } -46 { NOT USED } -47 HOME -48 UP ARROW -49 { NOT USED } -4A KEYPAD - -4B LEFT ARROW -4C { NOT USED } -4D RIGHT ARROW -4E KEYPAD + -4F { NOT USED } -50 DOWN ARROW -51 { NOT USED } -52 INSERT -53 DEL -54 { NOT USED } -5F { NOT USED } -60 ISO KEY -61 UNDO -62 HELP -63 KEYPAD ( -64 KEYPAD / -65 KEYPAD * -66 KEYPAD * -67 KEYPAD 7 -68 KEYPAD 8 -69 KEYPAD 9 -6A KEYPAD 4 -6B KEYPAD 5 -6C KEYPAD 6 -6D KEYPAD 1 -6E KEYPAD 2 -6F KEYPAD 3 -70 KEYPAD 0 -71 KEYPAD . 
-72 KEYPAD ENTER diff --git a/Documentation/input/bcm5974.txt b/Documentation/input/bcm5974.txt deleted file mode 100644 index 74d3876d6f348d5f05eb39d8d78026ce9ffb5894..0000000000000000000000000000000000000000 --- a/Documentation/input/bcm5974.txt +++ /dev/null @@ -1,65 +0,0 @@ -BCM5974 Driver (bcm5974) ------------------------- - Copyright (C) 2008-2009 Henrik Rydberg - -The USB initialization and package decoding was made by Scott Shawcroft as -part of the touchd user-space driver project: - Copyright (C) 2008 Scott Shawcroft (scott.shawcroft@gmail.com) - -The BCM5974 driver is based on the appletouch driver: - Copyright (C) 2001-2004 Greg Kroah-Hartman (greg@kroah.com) - Copyright (C) 2005 Johannes Berg (johannes@sipsolutions.net) - Copyright (C) 2005 Stelian Pop (stelian@popies.net) - Copyright (C) 2005 Frank Arnold (frank@scirocco-5v-turbo.de) - Copyright (C) 2005 Peter Osterlund (petero2@telia.com) - Copyright (C) 2005 Michael Hanselmann (linux-kernel@hansmi.ch) - Copyright (C) 2006 Nicolas Boichat (nicolas@boichat.ch) - -This driver adds support for the multi-touch trackpad on the new Apple -Macbook Air and Macbook Pro laptops. It replaces the appletouch driver on -those computers, and integrates well with the synaptics driver of the Xorg -system. - -Known to work on Macbook Air, Macbook Pro Penryn and the new unibody -Macbook 5 and Macbook Pro 5. - -Usage ------ - -The driver loads automatically for the supported usb device ids, and -becomes available both as an event device (/dev/input/event*) and as a -mouse via the mousedev driver (/dev/input/mice). - -USB Race --------- - -The Apple multi-touch trackpads report both mouse and keyboard events via -different interfaces of the same usb device. This creates a race condition -with the HID driver, which, if not told otherwise, will find the standard -HID mouse and keyboard, and claim the whole device. To remedy, the usb -product id must be listed in the mouse_ignore list of the hid driver. - -Debug output ------------- - -To ease the development for new hardware version, verbose packet output can -be switched on with the debug kernel module parameter. The range [1-9] -yields different levels of verbosity. Example (as root): - -echo -n 9 > /sys/module/bcm5974/parameters/debug - -tail -f /var/log/debug - -echo -n 0 > /sys/module/bcm5974/parameters/debug - -Trivia ------- - -The driver was developed at the ubuntu forums in June 2008 [1], and now has -a more permanent home at bitmath.org [2]. - -Links ------ - -[1] http://ubuntuforums.org/showthread.php?t=840040 -[2] http://bitmath.org/code/ diff --git a/Documentation/input/cd32.txt b/Documentation/input/cd32.txt deleted file mode 100644 index a003d9b41eca058eeaa040e423267e3d441899ee..0000000000000000000000000000000000000000 --- a/Documentation/input/cd32.txt +++ /dev/null @@ -1,19 +0,0 @@ -I have written a small patch that let's me use my Amiga CD32 -joypad connected to the parallel port. Thought I'd share it with you so -you can add it to the list of supported joysticks (hopefully someone will -find it useful). 
- -It needs the following wiring: - -CD32 pad | Parallel port ----------------------------- -1 (Up) | 2 (D0) -2 (Down) | 3 (D1) -3 (Left) | 4 (D2) -4 (Right) | 5 (D3) -5 (Fire3) | 14 (AUTOFD) -6 (Fire1) | 17 (SELIN) -7 (+5V) | 1 (STROBE) -8 (Gnd) | 18 (Gnd) -9 (Fire2) | 7 (D5) - diff --git a/Documentation/input/cma3000_d0x.txt b/Documentation/input/cma3000_d0x.txt deleted file mode 100644 index 29d088db4afd6f804d2061b4d16e54c1ea769ed7..0000000000000000000000000000000000000000 --- a/Documentation/input/cma3000_d0x.txt +++ /dev/null @@ -1,115 +0,0 @@ -Kernel driver for CMA3000-D0x -============================ - -Supported chips: -* VTI CMA3000-D0x -Datasheet: - CMA3000-D0X Product Family Specification 8281000A.02.pdf - - -Author: Hemanth V - - -Description ------------ -CMA3000 Tri-axis accelerometer supports Motion detect, Measurement and -Free fall modes. - -Motion Detect Mode: Its the low power mode where interrupts are generated only -when motion exceeds the defined thresholds. - -Measurement Mode: This mode is used to read the acceleration data on X,Y,Z -axis and supports 400, 100, 40 Hz sample frequency. - -Free fall Mode: This mode is intended to save system resources. - -Threshold values: Chip supports defining threshold values for above modes -which includes time and g value. Refer product specifications for more details. - -CMA3000 chip supports mutually exclusive I2C and SPI interfaces for -communication, currently the driver supports I2C based communication only. -Initial configuration for bus mode is set in non volatile memory and can later -be modified through bus interface command. - -Driver reports acceleration data through input subsystem. It generates ABS_MISC -event with value 1 when free fall is detected. - -Platform data need to be configured for initial default values. 
- -Platform Data -------------- -fuzz_x: Noise on X Axis - -fuzz_y: Noise on Y Axis - -fuzz_z: Noise on Z Axis - -g_range: G range in milli g i.e 2000 or 8000 - -mode: Default Operating mode - -mdthr: Motion detect g range threshold value - -mdfftmr: Motion detect and free fall time threshold value - -ffthr: Free fall g range threshold value - -Input Interface --------------- -Input driver version is 1.0.0 -Input device ID: bus 0x18 vendor 0x0 product 0x0 version 0x0 -Input device name: "cma3000-accelerometer" -Supported events: - Event type 0 (Sync) - Event type 3 (Absolute) - Event code 0 (X) - Value 47 - Min -8000 - Max 8000 - Fuzz 200 - Event code 1 (Y) - Value -28 - Min -8000 - Max 8000 - Fuzz 200 - Event code 2 (Z) - Value 905 - Min -8000 - Max 8000 - Fuzz 200 - Event code 40 (Misc) - Value 0 - Min 0 - Max 1 - Event type 4 (Misc) - - -Register/Platform parameters Description ----------------------------------------- - -mode: - 0: power down mode - 1: 100 Hz Measurement mode - 2: 400 Hz Measurement mode - 3: 40 Hz Measurement mode - 4: Motion Detect mode (default) - 5: 100 Hz Free fall mode - 6: 40 Hz Free fall mode - 7: Power off mode - -grange: - 2000: 2000 mg or 2G Range - 8000: 8000 mg or 8G Range - -mdthr: - X: X * 71mg (8G Range) - X: X * 18mg (2G Range) - -mdfftmr: - X: (X & 0x70) * 100 ms (MDTMR) - (X & 0x0F) * 2.5 ms (FFTMR 400 Hz) - (X & 0x0F) * 10 ms (FFTMR 100 Hz) - -ffthr: - X: (X >> 2) * 18mg (2G Range) - X: (X & 0x0F) * 71 mg (8G Range) diff --git a/Documentation/input/conf.py b/Documentation/input/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..d2352fdc92ede64ce47afccd0bac6ee152d836b9 --- /dev/null +++ b/Documentation/input/conf.py @@ -0,0 +1,10 @@ +# -*- coding: utf-8; mode: python -*- + +project = "The Linux input driver subsystem" + +tags.add("subproject") + +latex_documents = [ + ('index', 'linux-input.tex', project, + 'The kernel development community', 'manual'), +] diff --git a/Documentation/input/cs461x.txt b/Documentation/input/cs461x.txt deleted file mode 100644 index 202e9dbacec39a96990c5722c4b1ab832b939b61..0000000000000000000000000000000000000000 --- a/Documentation/input/cs461x.txt +++ /dev/null @@ -1,45 +0,0 @@ -Preface. - -This is a new low-level driver to support analog joystick attached to -Crystal SoundFusion CS4610/CS4612/CS4615. This code is based upon -Vortex/Solo drivers as an example of decoration style, and ALSA -0.5.8a kernel drivers as an chipset documentation and samples. - -This version does not have cooked mode support; the basic code -is present here, but have not tested completely. The button analysis -is completed in this mode, but the axis movement is not. - -Raw mode works fine with analog joystick front-end driver and cs461x -driver as a backend. I've tested this driver with CS4610, 4-axis and -4-button joystick; I mean the jstest utility. Also I've tried to -play in xracer game using joystick, and the result is better than -keyboard only mode. - -The sensitivity and calibrate quality have not been tested; the two -reasons are performed: the same hardware cannot work under Win95 (blue -screen in VJOYD); I have no documentation on my chip; and the existing -behavior in my case was not raised the requirement of joystick calibration. -So the driver have no code to perform hardware related calibration. - -The patch contains minor changes of Config.in and Makefile files. 
All
-needed code have been moved to one separate file cs461x.c like ns558.c
-This driver have the basic support for PCI devices only; there is no
-ISA or PnP ISA cards supported. AFAIK the ns558 have support for Crystal
-ISA and PnP ISA series.
-
-The driver works with ALSA drivers simultaneously. For example, the xracer
-uses joystick as input device and PCM device as sound output in one time.
-There are no sound or input collisions detected. The source code have
-comments about them; but I've found the joystick can be initialized
-separately of ALSA modules. So, you can use only one joystick driver
-without ALSA drivers. The ALSA drivers are not needed to compile or
-run this driver.
-
-There are no debug information print have been placed in source, and no
-specific options required to work this driver. The found chipset parameters
-are printed via printk(KERN_INFO "..."), see the /var/log/messages to
-inspect cs461x: prefixed messages to determine possible card detection
-errors.
-
-Regards,
-Viktor
diff --git a/Documentation/input/devices/alps.rst b/Documentation/input/devices/alps.rst
new file mode 100644
index 0000000000000000000000000000000000000000..6779148e428cb830ed1a40ddd5c8d5ca1689e0d3
--- /dev/null
+++ b/Documentation/input/devices/alps.rst
@@ -0,0 +1,387 @@
+----------------------
+ALPS Touchpad Protocol
+----------------------
+
+Introduction
+------------
+Currently the ALPS touchpad driver supports eight protocol versions in use by
+ALPS touchpads, called versions 1, 2, 3, 4, 5, 6, 7 and 8.
+
+Since roughly mid-2010 several new ALPS touchpads have been released and
+integrated into a variety of laptops and netbooks. These new touchpads
+have enough behavior differences that the alps_model_data definition
+table, describing the properties of the different versions, is no longer
+adequate. The design choice was either to re-define the alps_model_data
+table, at the risk of regressing existing devices, or to isolate the new
+devices outside of the alps_model_data table. The latter design choice was
+made. The new touchpad signatures are named: "Rushmore", "Pinnacle", and
+"Dolphin", which you will see in the alps.c code.
+For the purposes of this document, this group of ALPS touchpads will
+generically be called "new ALPS touchpads".
+
+We experimented with probing the ACPI interface _HID (Hardware ID)/_CID
+(Compatibility ID) definition as a way to uniquely identify the
+different ALPS variants but there did not appear to be a 1:1 mapping.
+In fact, it appeared to be an m:n mapping between the _HID and actual
+hardware type.
+
+Detection
+---------
+
+All ALPS touchpads should respond to the "E6 report" command sequence:
+E8-E6-E6-E6-E9. An ALPS touchpad should respond with either 00-00-0A or
+00-00-64 if no buttons are pressed. Bits 0-2 of the first byte will be set
+if any buttons are pressed.
+
+If the E6 report is successful, the touchpad model is identified using the "E7
+report" sequence: E8-E7-E7-E7-E9. The response is the model signature and is
+matched against known models in the alps_model_data_array.
+
+For older touchpads supporting protocol versions 3 and 4, the E7 report
+model signature is always 73-02-64. To differentiate between these
+versions, the response from the "Enter Command Mode" sequence must be
+inspected as described below.
+
+The new ALPS touchpads have an E7 signature of 73-03-50 or 73-03-0A but
+seem to be better differentiated by the EC Command Mode response.
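+
+As an illustration, the E7 handshake might look like the C sketch below when
+driven from the host side. The ps2_send()/ps2_recv() transport helpers are
+hypothetical stand-ins (the in-kernel driver goes through libps2 instead)::
+
+ /* Hypothetical one-byte transport helpers -- assumptions, not a kernel API. */
+ int ps2_send(unsigned char cmd);   /* returns 0 on success */
+ int ps2_recv(unsigned char *byte); /* returns 0 on success */
+
+ /* "E7 report": send E8-E7-E7-E7-E9, then read the 3-byte model signature. */
+ static int alps_e7_report(unsigned char sig[3])
+ {
+     static const unsigned char seq[] = { 0xE8, 0xE7, 0xE7, 0xE7, 0xE9 };
+     unsigned int i;
+
+     for (i = 0; i < sizeof(seq); i++)
+         if (ps2_send(seq[i]))
+             return -1;
+     if (ps2_recv(&sig[0]) || ps2_recv(&sig[1]) || ps2_recv(&sig[2]))
+         return -1;
+     return 0; /* e.g. sig = 73-02-64 identifies v3/v4 hardware */
+ }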
+
+Command Mode
+------------
+
+Protocol versions 3 and 4 have a command mode that is used to read and write
+one-byte device registers in a 16-bit address space. The command sequence
+EC-EC-EC-E9 places the device in command mode, and the device will respond
+with 88-07 followed by a third byte. This third byte can be used to determine
+whether the device uses the version 3 or 4 protocol.
+
+To exit command mode, PSMOUSE_CMD_SETSTREAM (EA) is sent to the touchpad.
+
+While in command mode, register addresses can be set by first sending a
+specific command, either EC for v3 devices or F5 for v4 devices. Then the
+address is sent one nibble at a time, where each nibble is encoded as a
+command with optional data. This encoding differs slightly between the v3 and
+v4 protocols.
+
+Once an address has been set, the addressed register can be read by sending
+PSMOUSE_CMD_GETINFO (E9). The first two bytes of the response contain the
+address of the register being read, and the third contains the value of the
+register. Registers are written by writing the value one nibble at a time
+using the same encoding used for addresses.
+
+For the new ALPS touchpads, the EC command is used to enter command
+mode. The response in the new ALPS touchpads is significantly different,
+and more important in determining the behavior. This code has been
+separated from the original alps_model_data table and put in the
+alps_identify function. For example, there seem to be two hardware init
+sequences for the "Dolphin" touchpads as determined by the second byte
+of the EC response.
+
+Packet Format
+-------------
+
+In the tables below, the following notation is used::
+
+ CAPITALS = stick, minuscules = touchpad
+
+?'s can have different meanings on different models, such as wheel rotation,
+extra buttons, stick buttons on a dualpoint, etc.
+
+PS/2 packet format
+------------------
+
+::
+
+ byte 0:    0    0 YSGN XSGN    1    M    R    L
+ byte 1:   X7   X6   X5   X4   X3   X2   X1   X0
+ byte 2:   Y7   Y6   Y5   Y4   Y3   Y2   Y1   Y0
+
+Note that the device never signals an overflow condition.
+
+For protocol version 2 devices, when the trackpoint is used and no fingers
+are on the touchpad, the M R L bits signal the combined status of both the
+pointingstick and touchpad buttons.
+
+ALPS Absolute Mode - Protocol Version 1
+---------------------------------------
+
+::
+
+ byte 0:    1    0    0    0    1   x9   x8   x7
+ byte 1:    0   x6   x5   x4   x3   x2   x1   x0
+ byte 2:    0    ?    ?    l    r    ?  fin  ges
+ byte 3:    0    ?    ?    ?    ?   y9   y8   y7
+ byte 4:    0   y6   y5   y4   y3   y2   y1   y0
+ byte 5:    0   z6   z5   z4   z3   z2   z1   z0
+
+ALPS Absolute Mode - Protocol Version 2
+---------------------------------------
+
+::
+
+ byte 0:    1    ?    ?    ?    1  PSM  PSR  PSL
+ byte 1:    0   x6   x5   x4   x3   x2   x1   x0
+ byte 2:    0  x10   x9   x8   x7    ?  fin  ges
+ byte 3:    0   y9   y8   y7    1    M    R    L
+ byte 4:    0   y6   y5   y4   y3   y2   y1   y0
+ byte 5:    0   z6   z5   z4   z3   z2   z1   z0
+
+Protocol Version 2 DualPoint devices send standard PS/2 mouse packets for
+the DualPoint Stick. The M, R and L bits signal the combined status of both
+the pointingstick and touchpad buttons, except for Dell dualpoint devices
+where the pointingstick buttons get reported separately in the PSM, PSR
+and PSL bits.
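+
+As a worked example, the v2 absolute packet above can be unpacked with the
+following C sketch. The struct and function names are local to this example,
+not the actual alps.c identifiers::
+
+ /* Decode a 6-byte ALPS v2 absolute mode packet per the table above. */
+ struct alps_v2_report {
+     unsigned int x, y, z;    /* absolute position and pressure */
+     int left, right, middle; /* combined stick + pad button bits */
+     int fin, ges;            /* finger-on-pad and gesture bits */
+ };
+
+ static void alps_v2_decode(const unsigned char p[6], struct alps_v2_report *r)
+ {
+     r->x      = (p[1] & 0x7f) | ((p[2] & 0x78) << 4); /* x10-x7 from byte 2 */
+     r->y      = (p[4] & 0x7f) | ((p[3] & 0x70) << 3); /* y9-y7 from byte 3 */
+     r->z      = p[5] & 0x7f;
+     r->left   = p[3] & 0x01;        /* L */
+     r->right  = (p[3] >> 1) & 0x01; /* R */
+     r->middle = (p[3] >> 2) & 0x01; /* M */
+     r->fin    = (p[2] >> 1) & 0x01;
+     r->ges    = p[2] & 0x01;
+ }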
+
+Dualpoint device -- interleaved packet format
+---------------------------------------------
+
+::
+
+ byte 0:    1    1    0    0    1    1    1    1
+ byte 1:    0   x6   x5   x4   x3   x2   x1   x0
+ byte 2:    0  x10   x9   x8   x7    0  fin  ges
+ byte 3:    0    0 YSGN XSGN    1    1    1    1
+ byte 4:   X7   X6   X5   X4   X3   X2   X1   X0
+ byte 5:   Y7   Y6   Y5   Y4   Y3   Y2   Y1   Y0
+ byte 6:    0   y9   y8   y7    1    m    r    l
+ byte 7:    0   y6   y5   y4   y3   y2   y1   y0
+ byte 8:    0   z6   z5   z4   z3   z2   z1   z0
+
+Devices which use the interleaving format normally send standard PS/2 mouse
+packets for the DualPoint Stick + ALPS Absolute Mode packets for the
+touchpad, switching to the interleaved packet format when both the stick and
+the touchpad are used at the same time.
+
+ALPS Absolute Mode - Protocol Version 3
+---------------------------------------
+
+ALPS protocol version 3 has three different packet formats. The first two are
+associated with touchpad events, and the third is associated with trackstick
+events.
+
+The first type is the touchpad position packet::
+
+ byte 0:    1    ?   x1   x0    1    1    1    1
+ byte 1:    0  x10   x9   x8   x7   x6   x5   x4
+ byte 2:    0  y10   y9   y8   y7   y6   y5   y4
+ byte 3:    0    M    R    L    1    m    r    l
+ byte 4:    0   mt   x3   x2   y3   y2   y1   y0
+ byte 5:    0   z6   z5   z4   z3   z2   z1   z0
+
+Note that for some devices the trackstick buttons are reported in this packet,
+and on others they are reported in the trackstick packets.
+
+The second packet type contains bitmaps representing the x and y axes. In the
+bitmaps a given bit is set if there is a finger covering that position on the
+given axis. Thus the bitmap packet can be used for low-resolution multi-touch
+data, although finger tracking is not possible. This packet also encodes the
+number of contacts (f1 and f0 in the table below)::
+
+ byte 0:    1    1   x1   x0    1    1    1    1
+ byte 1:    0   x8   x7   x6   x5   x4   x3   x2
+ byte 2:    0   y7   y6   y5   y4   y3   y2   y1
+ byte 3:    0  y10   y9   y8    1    1    1    1
+ byte 4:    0  x14  x13  x12  x11  x10   x9   y0
+ byte 5:    0    1    ?    ?    ?    ?   f1   f0
+
+This packet only appears after a position packet with the mt bit set, and
+usually only appears when there are two or more contacts (although
+occasionally it's seen with only a single contact).
+
+The final v3 packet type is the trackstick packet::
+
+ byte 0:    1    1   x7   y7    1    1    1    1
+ byte 1:    0   x6   x5   x4   x3   x2   x1   x0
+ byte 2:    0   y6   y5   y4   y3   y2   y1   y0
+ byte 3:    0    1    0    0    1    0    0    0
+ byte 4:    0   z4   z3   z2   z1   z0    ?    ?
+ byte 5:    0    0    1    1    1    1    1    1
+
+ALPS Absolute Mode - Protocol Version 4
+---------------------------------------
+
+Protocol version 4 has an 8-byte packet format::
+
+ byte 0:    1    ?   x1   x0    1    1    1    1
+ byte 1:    0  x10   x9   x8   x7   x6   x5   x4
+ byte 2:    0  y10   y9   y8   y7   y6   y5   y4
+ byte 3:    0    1   x3   x2   y3   y2   y1   y0
+ byte 4:    0    ?    ?    ?    1    ?    r    l
+ byte 5:    0   z6   z5   z4   z3   z2   z1   z0
+ byte 6: bitmap data (described below)
+ byte 7: bitmap data (described below)
+
+The last two bytes represent a partial bitmap packet, with 3 full packets
+required to construct a complete bitmap packet. Once assembled, the 6-byte
+bitmap packet has the following format::
+
+ byte 0:    0    1   x7   x6   x5   x4   x3   x2
+ byte 1:    0   x1   x0   y4   y3   y2   y1   y0
+ byte 2:    0    0    ?  x14  x13  x12  x11  x10
+ byte 3:    0   x9   x8   y9   y8   y7   y6   y5
+ byte 4:    0    0    0    0    0    0    0    0
+ byte 5:    0    0    0    0    0    0    0  y10
+
+There are several things worth noting here.
+
+ 1) In the bitmap data, bit 6 of byte 0 serves as a sync bit to
+    identify the first fragment of a bitmap packet.
+
+ 2) The bitmaps represent the same data as in the v3 bitmap packets, although
+    the packet layout is different.
+
+ 3) There doesn't seem to be a count of the contact points anywhere in the v4
+    protocol packets. Deriving a count of contact points must be done by
+    analyzing the bitmaps.
+ + 4) There is a 3 to 1 ratio of position packets to bitmap packets. Therefore + MT position can only be updated for every third ST position update, and + the count of contact points can only be updated every third packet as + well. + +So far no v4 devices with tracksticks have been encountered. + +ALPS Absolute Mode - Protocol Version 5 +--------------------------------------- +This is basically Protocol Version 3 but with different logic for packet +decode. It uses the same alps_process_touchpad_packet_v3 call with a +specialized decode_fields function pointer to correctly interpret the +packets. This appears to only be used by the Dolphin devices. + +For single-touch, the 6-byte packet format is:: + + byte 0: 1 1 0 0 1 0 0 0 + byte 1: 0 x6 x5 x4 x3 x2 x1 x0 + byte 2: 0 y6 y5 y4 y3 y2 y1 y0 + byte 3: 0 M R L 1 m r l + byte 4: y10 y9 y8 y7 x10 x9 x8 x7 + byte 5: 0 z6 z5 z4 z3 z2 z1 z0 + +For mt, the format is:: + + byte 0: 1 1 1 n3 1 n2 n1 x24 + byte 1: 1 y7 y6 y5 y4 y3 y2 y1 + byte 2: ? x2 x1 y12 y11 y10 y9 y8 + byte 3: 0 x23 x22 x21 x20 x19 x18 x17 + byte 4: 0 x9 x8 x7 x6 x5 x4 x3 + byte 5: 0 x16 x15 x14 x13 x12 x11 x10 + +ALPS Absolute Mode - Protocol Version 6 +--------------------------------------- + +For trackstick packet, the format is:: + + byte 0: 1 1 1 1 1 1 1 1 + byte 1: 0 X6 X5 X4 X3 X2 X1 X0 + byte 2: 0 Y6 Y5 Y4 Y3 Y2 Y1 Y0 + byte 3: ? Y7 X7 ? ? M R L + byte 4: Z7 Z6 Z5 Z4 Z3 Z2 Z1 Z0 + byte 5: 0 1 1 1 1 1 1 1 + +For touchpad packet, the format is:: + + byte 0: 1 1 1 1 1 1 1 1 + byte 1: 0 0 0 0 x3 x2 x1 x0 + byte 2: 0 0 0 0 y3 y2 y1 y0 + byte 3: ? x7 x6 x5 x4 ? r l + byte 4: ? y7 y6 y5 y4 ? ? ? + byte 5: z7 z6 z5 z4 z3 z2 z1 z0 + +(v6 touchpad does not have middle button) + +ALPS Absolute Mode - Protocol Version 7 +--------------------------------------- + +For trackstick packet, the format is:: + + byte 0: 0 1 0 0 1 0 0 0 + byte 1: 1 1 * * 1 M R L + byte 2: X7 1 X5 X4 X3 X2 X1 X0 + byte 3: Z6 1 Y6 X6 1 Y2 Y1 Y0 + byte 4: Y7 0 Y5 Y4 Y3 1 1 0 + byte 5: T&P 0 Z5 Z4 Z3 Z2 Z1 Z0 + +For touchpad packet, the format is:: + + packet-fmt b7 b6 b5 b4 b3 b2 b1 b0 + byte 0: TWO & MULTI L 1 R M 1 Y0-2 Y0-1 Y0-0 + byte 0: NEW L 1 X1-5 1 1 Y0-2 Y0-1 Y0-0 + byte 1: Y0-10 Y0-9 Y0-8 Y0-7 Y0-6 Y0-5 Y0-4 Y0-3 + byte 2: X0-11 1 X0-10 X0-9 X0-8 X0-7 X0-6 X0-5 + byte 3: X1-11 1 X0-4 X0-3 1 X0-2 X0-1 X0-0 + byte 4: TWO X1-10 TWO X1-9 X1-8 X1-7 X1-6 X1-5 X1-4 + byte 4: MULTI X1-10 TWO X1-9 X1-8 X1-7 X1-6 Y1-5 1 + byte 4: NEW X1-10 TWO X1-9 X1-8 X1-7 X1-6 0 0 + byte 5: TWO & NEW Y1-10 0 Y1-9 Y1-8 Y1-7 Y1-6 Y1-5 Y1-4 + byte 5: MULTI Y1-10 0 Y1-9 Y1-8 Y1-7 Y1-6 F-1 F-0 + + L: Left button + R / M: Non-clickpads: Right / Middle button + Clickpads: When > 2 fingers are down, and some fingers + are in the button area, then the 2 coordinates reported + are for fingers outside the button area and these report + extra fingers being present in the right / left button + area. Note these fingers are not added to the F field! + so if a TWO packet is received and R = 1 then there are + 3 fingers down, etc. + TWO: 1: Two touches present, byte 0/4/5 are in TWO fmt + 0: If byte 4 bit 0 is 1, then byte 0/4/5 are in MULTI fmt + otherwise byte 0 bit 4 must be set and byte 0/4/5 are + in NEW fmt + F: Number of fingers - 3, 0 means 3 fingers, 1 means 4 ... + + +ALPS Absolute Mode - Protocol Version 8 +--------------------------------------- + +Spoken by SS4 (73 03 14) and SS5 (73 03 28) hardware. + +The packet type is given by the APD field, bits 4-5 of byte 3. 
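+
+As a one-line sketch in C, extracting that field could look like this (the
+macro name is illustrative, not taken from the driver)::
+
+ /* packet type = bits 4-5 of byte 3 */
+ #define ALPS_V8_PACKET_TYPE(p) (((p)[3] >> 4) & 0x3)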
+ +Touchpad packet (APD = 0x2):: + + b7 b6 b5 b4 b3 b2 b1 b0 + byte 0: SWM SWR SWL 1 1 0 0 X7 + byte 1: 0 X6 X5 X4 X3 X2 X1 X0 + byte 2: 0 Y6 Y5 Y4 Y3 Y2 Y1 Y0 + byte 3: 0 T&P 1 0 1 0 0 Y7 + byte 4: 0 Z6 Z5 Z4 Z3 Z2 Z1 Z0 + byte 5: 0 0 0 0 0 0 0 0 + +SWM, SWR, SWL: Middle, Right, and Left button states + +Touchpad 1 Finger packet (APD = 0x0):: + + b7 b6 b5 b4 b3 b2 b1 b0 + byte 0: SWM SWR SWL 1 1 X2 X1 X0 + byte 1: X9 X8 X7 1 X6 X5 X4 X3 + byte 2: 0 X11 X10 LFB Y3 Y2 Y1 Y0 + byte 3: Y5 Y4 0 0 1 TAPF2 TAPF1 TAPF0 + byte 4: Zv7 Y11 Y10 1 Y9 Y8 Y7 Y6 + byte 5: Zv6 Zv5 Zv4 0 Zv3 Zv2 Zv1 Zv0 + +TAPF: ??? +LFB: ??? + +Touchpad 2 Finger packet (APD = 0x1):: + + b7 b6 b5 b4 b3 b2 b1 b0 + byte 0: SWM SWR SWL 1 1 AX6 AX5 AX4 + byte 1: AX11 AX10 AX9 AX8 AX7 AZ1 AY4 AZ0 + byte 2: AY11 AY10 AY9 CONT AY8 AY7 AY6 AY5 + byte 3: 0 0 0 1 1 BX6 BX5 BX4 + byte 4: BX11 BX10 BX9 BX8 BX7 BZ1 BY4 BZ0 + byte 5: BY11 BY10 BY9 0 BY8 BY7 BY5 BY5 + +CONT: A 3-or-4 Finger packet is to follow + +Touchpad 3-or-4 Finger packet (APD = 0x3):: + + b7 b6 b5 b4 b3 b2 b1 b0 + byte 0: SWM SWR SWL 1 1 AX6 AX5 AX4 + byte 1: AX11 AX10 AX9 AX8 AX7 AZ1 AY4 AZ0 + byte 2: AY11 AY10 AY9 OVF AY8 AY7 AY6 AY5 + byte 3: 0 0 1 1 1 BX6 BX5 BX4 + byte 4: BX11 BX10 BX9 BX8 BX7 BZ1 BY4 BZ0 + byte 5: BY11 BY10 BY9 0 BY8 BY7 BY5 BY5 + +OVF: 5th finger detected diff --git a/Documentation/input/devices/amijoy.rst b/Documentation/input/devices/amijoy.rst new file mode 100644 index 0000000000000000000000000000000000000000..8df7b11cd98d76e3b3d5828fc4275ddc7aa9ffbe --- /dev/null +++ b/Documentation/input/devices/amijoy.rst @@ -0,0 +1,263 @@ +~~~~~~~~~~~~~~~~~~~~~~~~~ +Amiga joystick extensions +~~~~~~~~~~~~~~~~~~~~~~~~~ + + +Amiga 4-joystick parport extension +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Parallel port pins: + + +===== ======== ==== ========== +Pin Meaning Pin Meaning +===== ======== ==== ========== + 2 Up1 6 Up2 + 3 Down1 7 Down2 + 4 Left1 8 Left2 + 5 Right1 9 Right2 +13 Fire1 11 Fire2 +18 Gnd1 18 Gnd2 +===== ======== ==== ========== + +Amiga digital joystick pinout +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +=== ============ +Pin Meaning +=== ============ +1 Up +2 Down +3 Left +4 Right +5 n/c +6 Fire button +7 +5V (50mA) +8 Gnd +9 Thumb button +=== ============ + +Amiga mouse pinout +~~~~~~~~~~~~~~~~~~ + +=== ============ +Pin Meaning +=== ============ +1 V-pulse +2 H-pulse +3 VQ-pulse +4 HQ-pulse +5 Middle button +6 Left button +7 +5V (50mA) +8 Gnd +9 Right button +=== ============ + +Amiga analog joystick pinout +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +=== ============== +Pin Meaning +=== ============== +1 Top button +2 Top2 button +3 Trigger button +4 Thumb button +5 Analog X +6 n/c +7 +5V (50mA) +8 Gnd +9 Analog Y +=== ============== + +Amiga lightpen pinout +~~~~~~~~~~~~~~~~~~~~~ + +=== ============= +Pin Meaning +=== ============= +1 n/c +2 n/c +3 n/c +4 n/c +5 Touch button +6 /Beamtrigger +7 +5V (50mA) +8 Gnd +9 Stylus button +=== ============= + +------------------------------------------------------------------------------- + +======== === ==== ==== ====== ======================================== +NAME rev ADDR type chip Description +======== === ==== ==== ====== ======================================== +JOY0DAT 00A R Denise Joystick-mouse 0 data (left vert, horiz) +JOY1DAT 00C R Denise Joystick-mouse 1 data (right vert,horiz) +======== === ==== ==== ====== ======================================== + + These addresses each read a 16 bit register. These in turn + are loaded from the MDAT serial stream and are clocked in on + the rising edge of SCLK. 
MLD output is used to parallel load
+ the external parallel-to-serial converter. This in turn is
+ loaded with the 4 quadrature inputs from each of two game
+ controller ports (8 total) plus 8 miscellaneous control bits
+ which are new for LISA and can be read in the upper 8 bits of
+ LISAID.
+
+ Register bits are as follows:
+
+ - Mouse counter usage (pins 1,3 = Yclock, pins 2,4 = Xclock)
+
+======== === === === === === === === === ====== === === === === === === ===
+ BIT#     15  14  13  12  11  10  09  08    07  06  05  04  03  02  01  00
+======== === === === === === === === === ====== === === === === === === ===
+JOY0DAT   Y7  Y6  Y5  Y4  Y3  Y2  Y1  Y0    X7  X6  X5  X4  X3  X2  X1  X0
+JOY1DAT   Y7  Y6  Y5  Y4  Y3  Y2  Y1  Y0    X7  X6  X5  X4  X3  X2  X1  X0
+======== === === === === === === === === ====== === === === === === === ===
+
+ 0=LEFT CONTROLLER PAIR, 1=RIGHT CONTROLLER PAIR.
+ (4 counters total). The bit usage for both left and right
+ addresses is shown below. Each 6 bit counter (Y7-Y2,X7-X2) is
+ clocked by 2 of the signals input from the mouse serial
+ stream. Starting with first bit received:
+
+ +--------+----------+-----------------------------------------+
+ | Serial | Bit Name | Description                             |
+ +========+==========+=========================================+
+ |   0    | M0H      | JOY0DAT Horizontal Clock                |
+ +--------+----------+-----------------------------------------+
+ |   1    | M0HQ     | JOY0DAT Horizontal Clock (quadrature)   |
+ +--------+----------+-----------------------------------------+
+ |   2    | M0V      | JOY0DAT Vertical Clock                  |
+ +--------+----------+-----------------------------------------+
+ |   3    | M0VQ     | JOY0DAT Vertical Clock (quadrature)     |
+ +--------+----------+-----------------------------------------+
+ |   4    | M1H      | JOY1DAT Horizontal Clock                |
+ +--------+----------+-----------------------------------------+
+ |   5    | M1HQ     | JOY1DAT Horizontal Clock (quadrature)   |
+ +--------+----------+-----------------------------------------+
+ |   6    | M1V      | JOY1DAT Vertical Clock                  |
+ +--------+----------+-----------------------------------------+
+ |   7    | M1VQ     | JOY1DAT Vertical Clock (quadrature)     |
+ +--------+----------+-----------------------------------------+
+
+ Bits 1 and 0 of each counter (Y1-Y0,X1-X0) may be
+ read to determine the state of the related input signal pair.
+ This allows these pins to double as joystick switch inputs.
+ Joystick switch closures can be deciphered as follows:
+
+ +------------+------+---------------------------------+
+ | Directions | Pin# | Counter bits                    |
+ +============+======+=================================+
+ | Forward    |  1   | Y1 xor Y0 (BIT#09 xor BIT#08)   |
+ +------------+------+---------------------------------+
+ | Left       |  3   | Y1                              |
+ +------------+------+---------------------------------+
+ | Back       |  2   | X1 xor X0 (BIT#01 xor BIT#00)   |
+ +------------+------+---------------------------------+
+ | Right      |  4   | X1                              |
+ +------------+------+---------------------------------+
+
+-------------------------------------------------------------------------------
+
+======== === ==== ==== ====== =================================================
+NAME     rev ADDR type chip   Description
+======== === ==== ==== ====== =================================================
+JOYTEST      036  W    Denise Write to all 4 joystick-mouse counters at once.
+======== === ==== ==== ====== =================================================
+
+ Mouse counter write test data:
+
+========= === === === === === === === === ====== === === === === === === ===
+  BIT#     15  14  13  12  11  10  09  08    07  06  05  04  03  02  01  00
+========= === === === === === === === === ====== === === === === === === ===
+ JOYxDAT   Y7  Y6  Y5  Y4  Y3  Y2  xx  xx    X7  X6  X5  X4  X3  X2  xx  xx
+ JOYxDAT   Y7  Y6  Y5  Y4  Y3  Y2  xx  xx    X7  X6  X5  X4  X3  X2  xx  xx
+========= === === === === === === === === ====== === === === === === === ===
+
+-------------------------------------------------------------------------------
+
+======= === ==== ==== ====== ========================================
+NAME    rev ADDR type chip   Description
+======= === ==== ==== ====== ========================================
+POT0DAT h   012  R    Paula  Pot counter data left pair (vert, horiz)
+POT1DAT h   014  R    Paula  Pot counter data right pair (vert, horiz)
+======= === ==== ==== ====== ========================================
+
+ These addresses each read a pair of 8 bit pot counters.
+ (4 counters total). The bit assignment for both
+ addresses is shown below. The counters are stopped by signals
+ from 2 controller connectors (left-right) with 2 pins each.
+
+====== === === === === === === === === ====== === === === === === === ===
+ BIT#   15  14  13  12  11  10  09  08    07  06  05  04  03  02  01  00
+====== === === === === === === === === ====== === === === === === === ===
+ RIGHT  Y7  Y6  Y5  Y4  Y3  Y2  Y1  Y0    X7  X6  X5  X4  X3  X2  X1  X0
+ LEFT   Y7  Y6  Y5  Y4  Y3  Y2  Y1  Y0    X7  X6  X5  X4  X3  X2  X1  X0
+====== === === === === === === === === ====== === === === === === === ===
+
+ +--------------------------+-------+
+ |        CONNECTORS        | PAULA |
+ +-------+------+-----+-----+-------+
+ | Loc.  | Dir. | Sym | pin |  pin  |
+ +=======+======+=====+=====+=======+
+ | RIGHT |  Y   | RY  |  9  |  33   |
+ +-------+------+-----+-----+-------+
+ | RIGHT |  X   | RX  |  5  |  32   |
+ +-------+------+-----+-----+-------+
+ | LEFT  |  Y   | LY  |  9  |  36   |
+ +-------+------+-----+-----+-------+
+ | LEFT  |  X   | LX  |  5  |  35   |
+ +-------+------+-----+-----+-------+
+
+ With normal (NTSC or PAL) horiz. line rate, the pots will
+ give a full scale (FF) reading with about 500 kohms in one
+ frame time. With proportionally faster horiz. line times,
+ the counters will count proportionally faster.
+ This should be noted when doing variable beam displays.
+
+-------------------------------------------------------------------------------
+
+====== === ==== ==== ====== ================================================
+NAME   rev ADDR type chip   Description
+====== === ==== ==== ====== ================================================
+POTGO      034  W    Paula  Pot port (4 bit) bi-direction and data, and pot
+                            counter start.
+====== === ==== ==== ====== ================================================
+
+-------------------------------------------------------------------------------
+
+====== === ==== ==== ====== ================================================
+NAME   rev ADDR type chip   Description
+====== === ==== ==== ====== ================================================
+POTINP     016  R    Paula  Pot pin data read
+====== === ==== ==== ====== ================================================
+
+ This register controls a 4 bit bi-directional I/O port
+ that shares the same 4 pins as the 4 pot counters above.
+
+ +-------+----------+---------------------------------------------+
+ | BIT#  | FUNCTION | DESCRIPTION                                 |
+ +=======+==========+=============================================+
+ | 15    | OUTRY    | Output enable for Paula pin 33              |
+ +-------+----------+---------------------------------------------+
+ | 14    | DATRY    | I/O data Paula pin 33                       |
+ +-------+----------+---------------------------------------------+
+ | 13    | OUTRX    | Output enable for Paula pin 32              |
+ +-------+----------+---------------------------------------------+
+ | 12    | DATRX    | I/O data Paula pin 32                       |
+ +-------+----------+---------------------------------------------+
+ | 11    | OUTLY    | Output enable for Paula pin 36              |
+ +-------+----------+---------------------------------------------+
+ | 10    | DATLY    | I/O data Paula pin 36                       |
+ +-------+----------+---------------------------------------------+
+ | 09    | OUTLX    | Output enable for Paula pin 35              |
+ +-------+----------+---------------------------------------------+
+ | 08    | DATLX    | I/O data Paula pin 35                       |
+ +-------+----------+---------------------------------------------+
+ | 07-01 | X        | Not used                                    |
+ +-------+----------+---------------------------------------------+
+ | 00    | START    | Start pots (dump capacitors, start counters)|
+ +-------+----------+---------------------------------------------+
diff --git a/Documentation/input/devices/appletouch.rst b/Documentation/input/devices/appletouch.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c94470e66533e91ce8de0b52c2dfe28fe04aac58
--- /dev/null
+++ b/Documentation/input/devices/appletouch.rst
@@ -0,0 +1,94 @@
+.. include:: <isonum.txt>
+
+----------------------------------
+Apple Touchpad Driver (appletouch)
+----------------------------------
+
+:Copyright: |copy| 2005 Stelian Pop
+
+appletouch is a Linux kernel driver for the USB touchpad found on post
+February 2005 and October 2005 Apple Aluminium Powerbooks.
+
+This driver is derived from Johannes Berg's appletrackpad driver [#f1]_,
+but it has been improved in some areas:
+
+ * appletouch is a full kernel driver, no userspace program is necessary
+ * appletouch can be interfaced with the synaptics X11 driver, in order
+ to have touchpad acceleration, scrolling, etc.
+
+Credits go to Johannes Berg for reverse-engineering the touchpad protocol,
+Frank Arnold for further improvements, and Alex Harper for some additional
+information about the inner workings of the touchpad sensors. Michael
+Hanselmann added support for the October 2005 models.
+
+Usage
+-----
+
+In order to use the touchpad in the basic mode, compile the driver and load
+the module. A new input device will be detected and you will be able to read
+the mouse data from /dev/input/mice (using gpm, or X11).
+
+In X11, you can configure the touchpad to use the synaptics X11 driver, which
+will give additional functionality, such as acceleration, scrolling, 2 finger
+tap for middle button mouse emulation, 3 finger tap for right button mouse
+emulation, etc. In order to do this, make sure you're using a recent version of
+the synaptics driver (tested with 0.14.2, available from [#f2]_), and configure
+a new input device in your X11 configuration file (take a look below for an
+example).
For additional configuration, see the synaptics driver documentation:: + + Section "InputDevice" + Identifier "Synaptics Touchpad" + Driver "synaptics" + Option "SendCoreEvents" "true" + Option "Device" "/dev/input/mice" + Option "Protocol" "auto-dev" + Option "LeftEdge" "0" + Option "RightEdge" "850" + Option "TopEdge" "0" + Option "BottomEdge" "645" + Option "MinSpeed" "0.4" + Option "MaxSpeed" "1" + Option "AccelFactor" "0.02" + Option "FingerLow" "0" + Option "FingerHigh" "30" + Option "MaxTapMove" "20" + Option "MaxTapTime" "100" + Option "HorizScrollDelta" "0" + Option "VertScrollDelta" "30" + Option "SHMConfig" "on" + EndSection + + Section "ServerLayout" + ... + InputDevice "Mouse" + InputDevice "Synaptics Touchpad" + ... + EndSection + +Fuzz problems +------------- + +The touchpad sensors are very sensitive to heat, and will generate a lot of +noise when the temperature changes. This is especially true when you power-on +the laptop for the first time. + +The appletouch driver tries to handle this noise and auto adapt itself, but it +is not perfect. If finger movements are not recognized anymore, try reloading +the driver. + +You can activate debugging using the 'debug' module parameter. A value of 0 +deactivates any debugging, 1 activates tracing of invalid samples, 2 activates +full tracing (each sample is being traced):: + + modprobe appletouch debug=1 + +or:: + + echo "1" > /sys/module/appletouch/parameters/debug + + +.. Links: + +.. [#f1] http://johannes.sipsolutions.net/PowerBook/touchpad/ + +.. [#f2] ``_ diff --git a/Documentation/input/devices/atarikbd.rst b/Documentation/input/devices/atarikbd.rst new file mode 100644 index 0000000000000000000000000000000000000000..745e7a1ff122b3d8dec0e39b3fc1eb235fba20dd --- /dev/null +++ b/Documentation/input/devices/atarikbd.rst @@ -0,0 +1,820 @@ +==================================== +Intelligent Keyboard (ikbd) Protocol +==================================== + + +Introduction +============ + +The Atari Corp. Intelligent Keyboard (ikbd) is a general purpose keyboard +controller that is flexible enough that it can be used in a variety of +products without modification. The keyboard, with its microcontroller, +provides a convenient connection point for a mouse and switch-type joysticks. +The ikbd processor also maintains a time-of-day clock with one second +resolution. +The ikbd has been designed to be general enough that it can be used with a +variety of new computer products. Product variations in a number of +keyswitches, mouse resolution, etc. can be accommodated. +The ikbd communicates with the main processor over a high speed bi-directional +serial interface. It can function in a variety of modes to facilitate +different applications of the keyboard, joysticks, or mouse. Limited use of +the controller is possible in applications in which only a unidirectional +communications medium is available by carefully designing the default modes. + +Keyboard +======== + +The keyboard always returns key make/break scan codes. The ikbd generates +keyboard scan codes for each key press and release. The key scan make (key +closure) codes start at 1, and are defined in Appendix A. For example, the +ISO key position in the scan code table should exist even if no keyswitch +exists in that position on a particular keyboard. The break code for each key +is obtained by ORing 0x80 with the make code. 
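+
+As a one-line illustration in C (the helper macro is made up for this
+document)::
+
+ #define IKBD_BREAK_CODE(make) ((make) | 0x80) /* make 0x2A -> break 0xAA */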
+ +The special codes 0xF6 through 0xFF are reserved for use as follows: + +=================== ==================================================== + Code Command +=================== ==================================================== + 0xF6 status report + 0xF7 absolute mouse position record + 0xF8-0xFB relative mouse position records (lsbs determined by + mouse button states) + 0xFC time-of-day + 0xFD joystick report (both sticks) + 0xFE joystick 0 event + 0xFF joystick 1 event +=================== ==================================================== + +The two shift keys return different scan codes in this mode. The ENTER key +and the RETurn key are also distinct. + +Mouse +===== + +The mouse port should be capable of supporting a mouse with resolution of +approximately 200 counts (phase changes or 'clicks') per inch of travel. The +mouse should be scanned at a rate that will permit accurate tracking at +velocities up to 10 inches per second. +The ikbd can report mouse motion in three distinctly different ways. It can +report relative motion, absolute motion in a coordinate system maintained +within the ikbd, or by converting mouse motion into keyboard cursor control +key equivalents. +The mouse buttons can be treated as part of the mouse or as additional +keyboard keys. + +Relative Position Reporting +--------------------------- + +In relative position mode, the ikbd will return relative mouse position +records whenever a mouse event occurs. A mouse event consists of a mouse +button being pressed or released, or motion in either axis exceeding a +settable threshold of motion. Regardless of the threshold, all bits of +resolution are returned to the host computer. +Note that the ikbd may return mouse relative position reports with +significantly more than the threshold delta x or y. This may happen since no +relative mouse motion events will be generated: (a) while the keyboard has +been 'paused' ( the event will be stored until keyboard communications is +resumed) (b) while any event is being transmitted. + +The relative mouse position record is a three byte record of the form +(regardless of keyboard mode):: + + %111110xy ; mouse position record flag + ; where y is the right button state + ; and x is the left button state + X ; delta x as twos complement integer + Y ; delta y as twos complement integer + +Note that the value of the button state bits should be valid even if the +MOUSE BUTTON ACTION has set the buttons to act like part of the keyboard. +If the accumulated motion before the report packet is generated exceeds the ++127...-128 range, the motion is broken into multiple packets. +Note that the sign of the delta y reported is a function of the Y origin +selected. + +Absolute Position reporting +--------------------------- + +The ikbd can also maintain absolute mouse position. Commands exist for +resetting the mouse position, setting X/Y scaling, and interrogating the +current mouse position. + +Mouse Cursor Key Mode +--------------------- + +The ikbd can translate mouse motion into the equivalent cursor keystrokes. +The number of mouse clicks per keystroke is independently programmable in +each axis. The ikbd internally maintains mouse motion information to the +highest resolution available, and merely generates a pair of cursor key events +for each multiple of the scale factor. +Mouse motion produces the cursor key make code immediately followed by the +break code for the appropriate cursor key. 
The mouse buttons produce scan
+codes above those normally assigned for the largest envisioned keyboard (i.e.
+LEFT=0x74 & RIGHT=0x75).
+
+Joystick
+========
+
+Joystick Event Reporting
+------------------------
+
+In this mode, the ikbd generates a record whenever the joystick position is
+changed (i.e. for each opening or closing of a joystick switch or trigger).
+
+The joystick event record is two bytes of the form::
+
+ %1111111x ; Joystick event marker
+ ; where x is Joystick 0 or 1
+ %x000yyyy ; where yyyy is the stick position
+ ; and x is the trigger
+
+Joystick Interrogation
+----------------------
+
+The current state of the joystick ports may be interrogated at any time in
+this mode by sending an 'Interrogate Joystick' command to the ikbd.
+
+The ikbd response to joystick interrogation is a three byte report of the form::
+
+ 0xFD ; joystick report header
+ %x000yyyy ; Joystick 0
+ %x000yyyy ; Joystick 1
+ ; where x is the trigger
+ ; and yyyy is the stick position
+
+Joystick Monitoring
+-------------------
+
+A mode is available that devotes nearly all of the keyboard communications
+time to reporting the state of the joystick ports at a user-specifiable rate.
+It remains in this mode until reset or commanded into another mode. The PAUSE
+command in this mode not only stops the output but also temporarily stops
+scanning the joysticks (samples are not queued).
+
+Fire Button Monitoring
+----------------------
+
+A mode is provided to permit monitoring a single input bit at a high rate. In
+this mode the ikbd monitors the state of the Joystick 1 fire button at the
+maximum rate permitted by the serial communication channel. The data is packed
+8 bits per byte for transmission to the host. The ikbd remains in this mode
+until reset or commanded into another mode. The PAUSE command in this mode not
+only stops the output but also temporarily stops scanning the button (samples
+are not queued).
+
+Joystick Key Code Mode
+----------------------
+
+The ikbd may be commanded to translate the use of either joystick into the
+equivalent cursor control keystroke(s). The ikbd provides a single breakpoint
+velocity joystick cursor.
+Joystick events produce the make code, immediately followed by the break code
+for the appropriate cursor motion keys. The trigger or fire buttons of the
+joysticks produce pseudo key scan codes above those used by the largest key
+matrix envisioned (i.e. JOYSTICK0=0x74, JOYSTICK1=0x75).
+
+Time-of-Day Clock
+=================
+
+The ikbd also maintains a time-of-day clock for the system. Commands are
+available to set and interrogate the time-of-day clock. Time-keeping is
+maintained down to a resolution of one second.
+
+Status Inquiries
+================
+
+The current state of ikbd modes and parameters may be found by sending status
+inquiry commands that correspond to the ikbd set commands.
+
+Power-Up Mode
+=============
+
+The keyboard controller will perform a simple self-test on power-up to detect
+major controller faults (ROM checksum and RAM test) and such things as stuck
+keys. Any keys down at power-up are presumed to be stuck, and their BREAK
+(sic) code is returned (which without the preceding MAKE code is a flag for a
+keyboard error). If the controller self-test completes without error, the code
+0xF0 is returned. (This code will be used to indicate the version/release of
+the ikbd controller. The first release of the ikbd is version 0xF0, should
+there be a second release it will be 0xF1, and so on.)
+
+The ikbd defaults to relative mouse position reporting with a threshold of 1
+unit in either axis and the Y=0 origin at the top of the screen, and joystick
+event reporting mode for joystick 1, with both buttons being logically
+assigned to the mouse. After any joystick command, the ikbd assumes that
+joysticks are connected to both Joystick0 and Joystick1. Any mouse command
+(except MOUSE DISABLE) then causes port 0 to again be scanned as if it were a
+mouse, and both buttons are logically connected to it. If a mouse disable
+command is received while port 0 is presumed to be a mouse, the button is
+logically assigned to Joystick1 (until the mouse is reenabled by another
+mouse command).
+
+ikbd Command Set
+================
+
+This section contains a list of commands that can be sent to the ikbd. Command
+codes (such as 0x00) which are not specified should perform no operation
+(NOPs).
+
+RESET
+-----
+
+::
+
+ 0x80
+ 0x01
+
+N.B. The RESET command is the only two byte command understood by the ikbd.
+Any byte following an 0x80 command byte other than 0x01 is ignored (and causes
+the 0x80 to be ignored).
+A reset may also be caused by sending a break lasting at least 200 ms to the
+ikbd.
+Executing the RESET command returns the keyboard to its default (power-up)
+mode and parameter settings. It does not affect the time-of-day clock.
+The RESET command or function causes the ikbd to perform a simple self-test.
+If the test is successful, the ikbd will send the code 0xF0 within 300 ms
+of receipt of the RESET command (or the end of the break, or power-up). The
+ikbd will then scan the key matrix for any stuck (closed) keys. Any keys found
+closed will cause the break scan code to be generated (the break code arriving
+without being preceded by the make code is a flag for a key matrix error).
+
+SET MOUSE BUTTON ACTION
+-----------------------
+
+::
+
+ 0x07
+ %00000mss ; mouse button action
+ ; (m is presumed = 1 when in MOUSE KEYCODE mode)
+ ; mss=0xy, mouse button press or release causes mouse
+ ; position report
+ ; where y=1, mouse key press causes absolute report
+ ; and x=1, mouse key release causes absolute report
+ ; mss=100, mouse buttons act like keys
+
+This command sets how the ikbd should treat the buttons on the mouse. The
+default mouse button action mode is %00000000; the buttons are logically
+treated as part of the mouse.
+When buttons act like keys, LEFT=0x74 & RIGHT=0x75.
+
+SET RELATIVE MOUSE POSITION REPORTING
+-------------------------------------
+
+::
+
+ 0x08
+
+Set relative mouse position reporting. (DEFAULT) Mouse position packets are
+generated asynchronously by the ikbd whenever motion exceeds the settable
+threshold in either axis (see SET MOUSE THRESHOLD). Depending upon the mouse
+key mode, mouse position reports may also be generated when either mouse
+button is pressed or released. Otherwise the mouse buttons behave as if they
+were keyboard keys.
+
+SET ABSOLUTE MOUSE POSITIONING
+------------------------------
+
+::
+
+ 0x09
+ XMSB ; X maximum (in scaled mouse clicks)
+ XLSB
+ YMSB ; Y maximum (in scaled mouse clicks)
+ YLSB
+
+Set absolute mouse position maintenance. Resets the ikbd-maintained X and Y
+coordinates.
+In this mode, the value of the internally maintained coordinates does NOT wrap
+between 0 and large positive numbers. Excess motion below 0 is ignored. The
+command sets the maximum positive value that can be attained in the scaled
+coordinate system. Motion beyond that value is also ignored.
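+
+A sketch of this clamping behaviour in C (the helper is hypothetical, not
+taken from any driver)::
+
+ /* absolute mode: coordinates saturate at 0 and at the programmed maximum */
+ static int ikbd_clamp_axis(int pos, int delta, int max)
+ {
+         pos += delta;
+         if (pos < 0)
+                 return 0;   /* excess motion below 0 is ignored */
+         if (pos > max)
+                 return max; /* motion beyond the maximum is ignored */
+         return pos;
+ }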
+
+SET MOUSE KEYCODE MODE
+----------------------
+
+::
+
+ 0x0A
+ deltax ; distance in X clicks to return (LEFT) or (RIGHT)
+ deltay ; distance in Y clicks to return (UP) or (DOWN)
+
+Set mouse monitoring routines to return cursor motion keycodes instead of
+either RELATIVE or ABSOLUTE motion records. The ikbd returns the appropriate
+cursor keycode after mouse travel exceeding the user-specified deltas in
+either axis. When the keyboard is in key scan code mode, mouse motion will
+cause the make code immediately followed by the break code. Note that this
+command is not affected by the mouse motion origin.
+
+SET MOUSE THRESHOLD
+-------------------
+
+::
+
+ 0x0B
+ X ; x threshold in mouse ticks (positive integers)
+ Y ; y threshold in mouse ticks (positive integers)
+
+This command sets the threshold before a mouse event is generated. Note that
+it does NOT affect the resolution of the data returned to the host. This
+command is valid only in RELATIVE MOUSE POSITIONING mode. The thresholds
+default to 1 at RESET (or power-up).
+
+SET MOUSE SCALE
+---------------
+
+::
+
+ 0x0C
+ X ; horizontal mouse ticks per internal X
+ Y ; vertical mouse ticks per internal Y
+
+This command sets the scale factor for the ABSOLUTE MOUSE POSITIONING mode.
+In this mode, the specified number of mouse phase changes ('clicks') must
+occur before the internally maintained coordinate is changed by one
+(independently scaled for each axis). Remember that the mouse position
+information is available only by interrogating the ikbd in the ABSOLUTE MOUSE
+POSITIONING mode unless the ikbd has been commanded to report on button press
+or release (see SET MOUSE BUTTON ACTION).
+
+INTERROGATE MOUSE POSITION
+--------------------------
+
+::
+
+ 0x0D
+ Returns:
+ 0xF7 ; absolute mouse position header
+ BUTTONS
+ 0000dcba ; where a is right button down since last interrogation
+ ; b is right button up since last
+ ; c is left button down since last
+ ; d is left button up since last
+ XMSB ; X coordinate
+ XLSB
+ YMSB ; Y coordinate
+ YLSB
+
+The INTERROGATE MOUSE POSITION command is valid when in the ABSOLUTE MOUSE
+POSITIONING mode, regardless of the setting of the MOUSE BUTTON ACTION.
+
+LOAD MOUSE POSITION
+-------------------
+
+::
+
+ 0x0E
+ 0x00 ; filler
+ XMSB ; X coordinate
+ XLSB ; (in scaled coordinate system)
+ YMSB ; Y coordinate
+ YLSB
+
+This command allows the user to preset the internally maintained absolute
+mouse position.
+
+SET Y=0 AT BOTTOM
+-----------------
+
+::
+
+ 0x0F
+
+This command sets the origin of the Y axis to the bottom of the logical
+coordinate system internal to the ikbd for all relative or absolute mouse
+motion. This causes mouse motion toward the user to be negative in sign and
+away from the user to be positive.
+
+SET Y=0 AT TOP
+--------------
+
+::
+
+ 0x10
+
+Sets the origin of the Y axis to the top of the logical coordinate system
+within the ikbd for all relative or absolute mouse motion. (DEFAULT)
+This causes mouse motion toward the user to be positive in sign and away from
+the user to be negative.
+
+RESUME
+------
+
+::
+
+ 0x11
+
+Resume sending data to the host. Since any command received by the ikbd after
+its output has been paused also causes an implicit RESUME, this command can be
+thought of as a NO OPERATION command. If this command is received by the ikbd
+and it is not PAUSED, it is simply ignored.
+
+DISABLE MOUSE
+-------------
+
+::
+
+ 0x12
+
+All mouse event reporting is disabled (and scanning may be internally
+disabled).
Any valid mouse mode command resumes mouse motion monitoring. (The
+valid mouse mode commands are SET RELATIVE MOUSE POSITION REPORTING, SET
+ABSOLUTE MOUSE POSITIONING, and SET MOUSE KEYCODE MODE.)
+N.B. If the mouse buttons have been commanded to act like keyboard keys, this
+command DOES affect their actions.
+
+PAUSE OUTPUT
+------------
+
+::
+
+ 0x13
+
+Stop sending data to the host until another valid command is received. Key
+matrix activity is still monitored and scan codes or ASCII characters are
+enqueued (up to the maximum supported by the microcontroller) to be sent when
+the host allows the output to be resumed. If in the JOYSTICK EVENT REPORTING
+mode, joystick events are also queued.
+Mouse motion should be accumulated while the output is paused. If the ikbd is
+in RELATIVE MOUSE POSITION REPORTING mode, motion is accumulated beyond the
+normal threshold limits to produce the minimum number of packets necessary for
+transmission when output is resumed. Pressing or releasing either mouse button
+causes any accumulated motion to be immediately queued as packets, if the
+mouse is in RELATIVE MOUSE POSITION REPORTING mode.
+Because of the limitations of the microcontroller memory this command should
+be used sparingly, and the output should not be shut off for more than
+milliseconds at a time.
+The output is stopped only at the end of the current 'event'. If the PAUSE
+OUTPUT command is received in the middle of a multiple byte report, the packet
+will still be transmitted to conclusion and then the PAUSE will take effect.
+When the ikbd is in either the JOYSTICK MONITORING mode or the FIRE BUTTON
+MONITORING mode, the PAUSE OUTPUT command also temporarily stops the
+monitoring process (i.e. the samples are not enqueued for transmission).
+
+SET JOYSTICK EVENT REPORTING
+----------------------------
+
+::
+
+ 0x14
+
+Enter JOYSTICK EVENT REPORTING mode (DEFAULT). Each opening or closure of a
+joystick switch or trigger causes a joystick event record to be generated.
+
+SET JOYSTICK INTERROGATION MODE
+-------------------------------
+
+::
+
+ 0x15
+
+Disables JOYSTICK EVENT REPORTING. Host must send individual JOYSTICK
+INTERROGATE commands to sense joystick state.
+
+JOYSTICK INTERROGATE
+--------------------
+
+::
+
+ 0x16
+
+Return a record indicating the current state of the joysticks. This command
+is valid in either the JOYSTICK EVENT REPORTING mode or the JOYSTICK
+INTERROGATION MODE.
+
+SET JOYSTICK MONITORING
+-----------------------
+
+::
+
+ 0x17
+ rate ; time between samples in hundredths of a second
+ Returns: (in packets of two as long as in mode)
+ %000000xy ; where y is JOYSTICK1 Fire button
+ ; and x is JOYSTICK0 Fire button
+ %nnnnmmmm ; where m is JOYSTICK1 state
+ ; and n is JOYSTICK0 state
+
+Sets the ikbd to do nothing but monitor the serial command line, maintain the
+time-of-day clock, and monitor the joystick. The rate sets the interval
+between joystick samples.
+N.B. The user should not set the rate faster than the serial communications
+channel will allow the 2-byte packets to be transmitted.
+
+SET FIRE BUTTON MONITORING
+--------------------------
+
+::
+
+ 0x18
+ Returns: (as long as in mode)
+ %bbbbbbbb ; state of the JOYSTICK1 fire button packed
+ ; 8 bits per byte, the first sample is the MSB
+
+Set the ikbd to do nothing but monitor the serial command line, maintain the
+time-of-day clock, and monitor the fire button on Joystick 1.
The fire button
+is scanned at a rate that causes 8 samples to be made in the time it takes for
+the previous byte to be sent to the host (i.e. scan rate = 8/10 * baud rate).
+The sample interval should be as constant as possible.
+
+SET JOYSTICK KEYCODE MODE
+-------------------------
+
+::
+
+ 0x19
+ RX ; length of time (in tenths of seconds) until
+ ; horizontal velocity breakpoint is reached
+ RY ; length of time (in tenths of seconds) until
+ ; vertical velocity breakpoint is reached
+ TX ; length (in tenths of seconds) of joystick closure
+ ; until horizontal cursor key is generated before RX
+ ; has elapsed
+ TY ; length (in tenths of seconds) of joystick closure
+ ; until vertical cursor key is generated before RY
+ ; has elapsed
+ VX ; length (in tenths of seconds) of joystick closure
+ ; until horizontal cursor keystrokes are generated
+ ; after RX has elapsed
+ VY ; length (in tenths of seconds) of joystick closure
+ ; until vertical cursor keystrokes are generated
+ ; after RY has elapsed
+
+In this mode, joystick 0 is scanned in a way that simulates cursor keystrokes.
+On initial closure, a keystroke pair (make/break) is generated. Then up to Rn
+tenths of seconds later, keystroke pairs are generated every Tn tenths of
+seconds. After the Rn breakpoint is reached, keystroke pairs are generated
+every Vn tenths of seconds. This provides a velocity (auto-repeat) breakpoint
+feature.
+Note that by setting RX and/or RY to zero, the velocity feature can be
+disabled. The values of TX and TY then become meaningless, and the generation
+of cursor 'keystrokes' is set by VX and VY.
+
+DISABLE JOYSTICKS
+-----------------
+
+::
+
+ 0x1A
+
+Disable the generation of any joystick events (and scanning may be internally
+disabled). Any valid joystick mode command resumes joystick monitoring. (The
+joystick mode commands are SET JOYSTICK EVENT REPORTING, SET JOYSTICK
+INTERROGATION MODE, SET JOYSTICK MONITORING, SET FIRE BUTTON MONITORING, and
+SET JOYSTICK KEYCODE MODE.)
+
+TIME-OF-DAY CLOCK SET
+---------------------
+
+::
+
+ 0x1B
+ YY ; year (2 least significant digits)
+ MM ; month
+ DD ; day
+ hh ; hour
+ mm ; minute
+ ss ; second
+
+All time-of-day data should be sent to the ikbd in packed BCD format.
+Any digit that is not a valid BCD digit should be treated as a 'don't care'
+and not alter that particular field of the date or time. This permits setting
+only some subfields of the time-of-day clock.
+
+INTERROGATE TIME-OF-DAY CLOCK
+-----------------------------
+
+::
+
+ 0x1C
+ Returns:
+ 0xFC ; time-of-day event header
+ YY ; year (2 least significant digits)
+ MM ; month
+ DD ; day
+ hh ; hour
+ mm ; minute
+ ss ; second
+
+ All time-of-day is sent in packed BCD format.
+
+MEMORY LOAD
+-----------
+
+::
+
+ 0x20
+ ADRMSB ; address in controller
+ ADRLSB ; memory to be loaded
+ NUM ; number of bytes (0-128)
+ { data }
+
+This command permits the host to load arbitrary values into the ikbd
+controller memory. The time between data bytes must be less than 20 ms.
+
+MEMORY READ
+-----------
+
+::
+
+ 0x21
+ ADRMSB ; address in controller
+ ADRLSB ; memory to be read
+ Returns:
+ 0xF6 ; status header
+ 0x20 ; memory access
+ { data } ; 6 data bytes starting at ADR
+
+This command permits the host to read from the ikbd controller memory.
+
+CONTROLLER EXECUTE
+------------------
+
+::
+
+ 0x22
+ ADRMSB ; address of subroutine in
+ ADRLSB ; controller memory to be called
+
+This command allows the host to command the execution of a subroutine in the
+ikbd controller memory.
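+
+As an illustration, a host could assemble the MEMORY LOAD byte stream like
+this in C (ikbd_send_byte() is an assumed helper that writes one byte to the
+ikbd, keeping less than 20 ms between data bytes)::
+
+ extern void ikbd_send_byte(unsigned char b); /* assumed transport helper */
+
+ void ikbd_memory_load(unsigned short adr, const unsigned char *data, int n)
+ {
+         ikbd_send_byte(0x20);       /* MEMORY LOAD */
+         ikbd_send_byte(adr >> 8);   /* ADRMSB */
+         ikbd_send_byte(adr & 0xff); /* ADRLSB */
+         ikbd_send_byte(n);          /* NUM, 0-128 */
+         while (n--)
+                 ikbd_send_byte(*data++);
+ }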
+
+STATUS INQUIRIES
+----------------
+
+::
+
+ Status commands are formed by inclusively ORing 0x80 with the
+ relevant SET command.
+
+ Example:
+ 0x88 (or 0x89 or 0x8A) ; request mouse mode
+ Returns:
+ 0xF6 ; status response header
+ mode ; 0x08 is RELATIVE
+ ; 0x09 is ABSOLUTE
+ ; 0x0A is KEYCODE
+ param1 ; 0 is RELATIVE
+ ; XMSB maximum if ABSOLUTE
+ ; DELTA X is KEYCODE
+ param2 ; 0 is RELATIVE
+ ; YMSB maximum if ABSOLUTE
+ ; DELTA Y is KEYCODE
+ param3 ; 0 if RELATIVE
+ ; or KEYCODE
+ ; YMSB is ABSOLUTE
+ param4 ; 0 if RELATIVE
+ ; or KEYCODE
+ ; YLSB is ABSOLUTE
+ 0 ; pad
+ 0
+
+The STATUS INQUIRY commands request the ikbd to return either the current mode
+or the parameters associated with a given command. All status reports are
+padded to form 8 byte long return packets. The responses to the status
+requests are designed so that the host may store them away (after stripping
+off the status report header byte) and later send them back as commands to the
+ikbd to restore its state. The 0 pad bytes will be treated as NOPs by the
+ikbd.
+
+ Valid STATUS INQUIRY commands are::
+
+ 0x87 mouse button action
+ 0x88 mouse mode
+ 0x89
+ 0x8A
+ 0x8B mouse threshold
+ 0x8C mouse scale
+ 0x8F mouse vertical coordinates
+ 0x90 ( returns 0x0F Y=0 at bottom
+ 0x10 Y=0 at top )
+ 0x92 mouse enable/disable
+ ( returns 0x00 enabled
+ 0x12 disabled )
+ 0x94 joystick mode
+ 0x95
+ 0x96
+ 0x9A joystick enable/disable
+ ( returns 0x00 enabled
+ 0x1A disabled )
+
+It is the (host) programmer's responsibility to have only one unanswered
+inquiry in process at a time.
+STATUS INQUIRY commands are not valid if the ikbd is in JOYSTICK MONITORING
+mode or FIRE BUTTON MONITORING mode.
+
+
+SCAN CODES
+==========
+
+The key scan codes returned by the ikbd are chosen to simplify the
+implementation of GSX.
+
+GSX Standard Keyboard Mapping
+
+======= ============
+Hex     Keytop
+======= ============
+01      Esc
+02      1
+03      2
+04      3
+05      4
+06      5
+07      6
+08      7
+09      8
+0A      9
+0B      0
+0C      \-
+0D      \=
+0E      BS
+0F      TAB
+10      Q
+11      W
+12      E
+13      R
+14      T
+15      Y
+16      U
+17      I
+18      O
+19      P
+1A      [
+1B      ]
+1C      RET
+1D      CTRL
+1E      A
+1F      S
+20      D
+21      F
+22      G
+23      H
+24      J
+25      K
+26      L
+27      ;
+28      '
+29      \`
+2A      (LEFT) SHIFT
+2B      \\
+2C      Z
+2D      X
+2E      C
+2F      V
+30      B
+31      N
+32      M
+33      ,
+34      .
+35      /
+36      (RIGHT) SHIFT
+37      { NOT USED }
+38      ALT
+39      SPACE BAR
+3A      CAPS LOCK
+3B      F1
+3C      F2
+3D      F3
+3E      F4
+3F      F5
+40      F6
+41      F7
+42      F8
+43      F9
+44      F10
+45      { NOT USED }
+46      { NOT USED }
+47      HOME
+48      UP ARROW
+49      { NOT USED }
+4A      KEYPAD -
+4B      LEFT ARROW
+4C      { NOT USED }
+4D      RIGHT ARROW
+4E      KEYPAD +
+4F      { NOT USED }
+50      DOWN ARROW
+51      { NOT USED }
+52      INSERT
+53      DEL
+54      { NOT USED }
+5F      { NOT USED }
+60      ISO KEY
+61      UNDO
+62      HELP
+63      KEYPAD (
+64      KEYPAD )
+65      KEYPAD /
+66      KEYPAD *
+67      KEYPAD 7
+68      KEYPAD 8
+69      KEYPAD 9
+6A      KEYPAD 4
+6B      KEYPAD 5
+6C      KEYPAD 6
+6D      KEYPAD 1
+6E      KEYPAD 2
+6F      KEYPAD 3
+70      KEYPAD 0
+71      KEYPAD .
+72      KEYPAD ENTER
+======= ============
include:: + +------------------------ +BCM5974 Driver (bcm5974) +------------------------ + +:Copyright: |copy| 2008-2009 Henrik Rydberg + +The USB initialization and package decoding was made by Scott Shawcroft as +part of the touchd user-space driver project: + +:Copyright: |copy| 2008 Scott Shawcroft (scott.shawcroft@gmail.com) + +The BCM5974 driver is based on the appletouch driver: + +:Copyright: |copy| 2001-2004 Greg Kroah-Hartman (greg@kroah.com) +:Copyright: |copy| 2005 Johannes Berg (johannes@sipsolutions.net) +:Copyright: |copy| 2005 Stelian Pop (stelian@popies.net) +:Copyright: |copy| 2005 Frank Arnold (frank@scirocco-5v-turbo.de) +:Copyright: |copy| 2005 Peter Osterlund (petero2@telia.com) +:Copyright: |copy| 2005 Michael Hanselmann (linux-kernel@hansmi.ch) +:Copyright: |copy| 2006 Nicolas Boichat (nicolas@boichat.ch) + +This driver adds support for the multi-touch trackpad on the new Apple +Macbook Air and Macbook Pro laptops. It replaces the appletouch driver on +those computers, and integrates well with the synaptics driver of the Xorg +system. + +Known to work on Macbook Air, Macbook Pro Penryn and the new unibody +Macbook 5 and Macbook Pro 5. + +Usage +----- + +The driver loads automatically for the supported usb device ids, and +becomes available both as an event device (/dev/input/event*) and as a +mouse via the mousedev driver (/dev/input/mice). + +USB Race +-------- + +The Apple multi-touch trackpads report both mouse and keyboard events via +different interfaces of the same usb device. This creates a race condition +with the HID driver, which, if not told otherwise, will find the standard +HID mouse and keyboard, and claim the whole device. To remedy, the usb +product id must be listed in the mouse_ignore list of the hid driver. + +Debug output +------------ + +To ease the development for new hardware version, verbose packet output can +be switched on with the debug kernel module parameter. The range [1-9] +yields different levels of verbosity. Example (as root):: + + echo -n 9 > /sys/module/bcm5974/parameters/debug + + tail -f /var/log/debug + + echo -n 0 > /sys/module/bcm5974/parameters/debug + +Trivia +------ + +The driver was developed at the ubuntu forums in June 2008 [#f1]_, and now has +a more permanent home at bitmath.org [#f2]_. + +.. Links + +.. [#f1] http://ubuntuforums.org/showthread.php?t=840040 +.. [#f2] http://bitmath.org/code/ diff --git a/Documentation/input/devices/cma3000_d0x.rst b/Documentation/input/devices/cma3000_d0x.rst new file mode 100644 index 0000000000000000000000000000000000000000..8bc8e61487b04aa98c502b3153a0c5c1cce81e30 --- /dev/null +++ b/Documentation/input/devices/cma3000_d0x.rst @@ -0,0 +1,139 @@ +CMA3000-D0x Accelerometer +========================= + +Supported chips: +* VTI CMA3000-D0x + +Datasheet: + CMA3000-D0X Product Family Specification 8281000A.02.pdf + + +:Author: Hemanth V + + +Description +----------- + +CMA3000 Tri-axis accelerometer supports Motion detect, Measurement and +Free fall modes. + +Motion Detect Mode: + Its the low power mode where interrupts are generated only + when motion exceeds the defined thresholds. + +Measurement Mode: + This mode is used to read the acceleration data on X,Y,Z + axis and supports 400, 100, 40 Hz sample frequency. + +Free fall Mode: + This mode is intended to save system resources. + +Threshold values: + Chip supports defining threshold values for above modes + which includes time and g value. Refer product specifications for + more details. 
+
+The CMA3000 chip supports mutually exclusive I2C and SPI interfaces for
+communication; currently the driver supports I2C-based communication only.
+The initial bus mode configuration is set in non-volatile memory and can
+later be modified through a bus interface command.
+
+The driver reports acceleration data through the input subsystem. It
+generates an ABS_MISC event with value 1 when free fall is detected.
+
+Platform data needs to be configured with the initial default values.
+
+Platform Data
+-------------
+
+fuzz_x:
+ Noise on X Axis
+
+fuzz_y:
+ Noise on Y Axis
+
+fuzz_z:
+ Noise on Z Axis
+
+g_range:
+ G range in milli-g, i.e. 2000 or 8000
+
+mode:
+ Default Operating mode
+
+mdthr:
+ Motion detect g range threshold value
+
+mdfftmr:
+ Motion detect and free fall time threshold value
+
+ffthr:
+ Free fall g range threshold value
+
+Input Interface
+---------------
+
+Input driver version is 1.0.0
+Input device ID: bus 0x18 vendor 0x0 product 0x0 version 0x0
+Input device name: "cma3000-accelerometer"
+
+Supported events::
+
+ Event type 0 (Sync)
+ Event type 3 (Absolute)
+ Event code 0 (X)
+ Value 47
+ Min -8000
+ Max 8000
+ Fuzz 200
+ Event code 1 (Y)
+ Value -28
+ Min -8000
+ Max 8000
+ Fuzz 200
+ Event code 2 (Z)
+ Value 905
+ Min -8000
+ Max 8000
+ Fuzz 200
+ Event code 40 (Misc)
+ Value 0
+ Min 0
+ Max 1
+ Event type 4 (Misc)
+
+
+Register/Platform parameters Description
+----------------------------------------
+
+mode::
+
+ 0: power down mode
+ 1: 100 Hz Measurement mode
+ 2: 400 Hz Measurement mode
+ 3: 40 Hz Measurement mode
+ 4: Motion Detect mode (default)
+ 5: 100 Hz Free fall mode
+ 6: 40 Hz Free fall mode
+ 7: Power off mode
+
+grange::
+
+ 2000: 2000 mg or 2G Range
+ 8000: 8000 mg or 8G Range
+
+mdthr::
+
+ X: X * 71 mg (8G Range)
+ X: X * 18 mg (2G Range)
+
+mdfftmr::
+
+ X: (X & 0x70) * 100 ms (MDTMR)
+ (X & 0x0F) * 2.5 ms (FFTMR 400 Hz)
+ (X & 0x0F) * 10 ms (FFTMR 100 Hz)
+
+ffthr::
+
+ X: (X >> 2) * 18 mg (2G Range)
+ X: (X & 0x0F) * 71 mg (8G Range)
diff --git a/Documentation/input/devices/cs461x.rst b/Documentation/input/devices/cs461x.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b1e6d508ad26ec54dd519032650fb3b8bd7cd3f6
--- /dev/null
+++ b/Documentation/input/devices/cs461x.rst
@@ -0,0 +1,43 @@
+Crystal SoundFusion CS4610/CS4612/CS4615 joystick
+=================================================
+
+This is a low-level driver to support an analog joystick attached to a
+Crystal SoundFusion CS4610/CS4612/CS4615. This code is based upon the
+Vortex/Solo drivers as an example of coding style, and on the ALSA
+0.5.8a kernel drivers as a source of chipset documentation and samples.
+
+This version does not have cooked mode support; the basic code
+is present here, but has not been tested completely. The button analysis
+is completed in this mode, but the axis movement is not.
+
+Raw mode works fine with the analog joystick front-end driver and the cs461x
+driver as a backend. I've tested this driver with a CS4610 and a 4-axis,
+4-button joystick, using the jstest utility. I've also tried playing the
+xracer game using the joystick, and the result is better than
+keyboard-only mode.
+
+The sensitivity and calibration quality have not been tested, for several
+reasons: the same hardware cannot work under Win95 (blue
+screen in VJOYD); I have no documentation on my chip; and the existing
+behavior in my case did not raise the need for joystick calibration.
+So the driver has no code to perform hardware-related calibration.
+
+This driver has basic support for PCI devices only; ISA and PnP ISA
+cards are not supported.
+
+The driver works with ALSA drivers simultaneously. For example, xracer
+can use the joystick as an input device and a PCM device as sound output at
+the same time. No sound or input collisions were detected. The source code
+has comments about this; I've found that the joystick can be initialized
+separately from the ALSA modules. So you can use just the joystick driver
+without the ALSA drivers. The ALSA drivers are not needed to compile or
+run this driver.
+
+No debug output has been placed in the source, and no specific options are
+required for this driver to work. The detected chipset parameters
+are printed via printk(KERN_INFO "..."); see /var/log/messages and
+inspect the cs461x: prefixed messages to determine possible card detection
+errors.
+
+Regards,
+Viktor
diff --git a/Documentation/input/edt-ft5x06.txt b/Documentation/input/devices/edt-ft5x06.rst
similarity index 100%
rename from Documentation/input/edt-ft5x06.txt
rename to Documentation/input/devices/edt-ft5x06.rst
diff --git a/Documentation/input/devices/elantech.rst b/Documentation/input/devices/elantech.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c3374a7ce7af84b3b9adbd1c9f5ce20dbbd665b1
--- /dev/null
+++ b/Documentation/input/devices/elantech.rst
@@ -0,0 +1,841 @@
+Elantech Touchpad Driver
+========================
+
+ Copyright (C) 2007-2008 Arjan Opmeer
+
+ Extra information for hardware version 1 found and
+ provided by Steve Havelka
+
+ Version 2 (EeePC) hardware support based on patches
+ received from Woody at Xandros and forwarded to me
+ by user StewieGriffin at the eeeuser.com forum
+
+.. Contents
+
+ 1. Introduction
+ 2. Extra knobs
+ 3. Differentiating hardware versions
+ 4. Hardware version 1
+ 4.1 Registers
+ 4.2 Native relative mode 4 byte packet format
+ 4.3 Native absolute mode 4 byte packet format
+ 5. Hardware version 2
+ 5.1 Registers
+ 5.2 Native absolute mode 6 byte packet format
+ 5.2.1 Parity checking and packet re-synchronization
+ 5.2.2 One/Three finger touch
+ 5.2.3 Two finger touch
+ 6. Hardware version 3
+ 6.1 Registers
+ 6.2 Native absolute mode 6 byte packet format
+ 6.2.1 One/Three finger touch
+ 6.2.2 Two finger touch
+ 7. Hardware version 4
+ 7.1 Registers
+ 7.2 Native absolute mode 6 byte packet format
+ 7.2.1 Status packet
+ 7.2.2 Head packet
+ 7.2.3 Motion packet
+ 8. Trackpoint (for Hardware version 3 and 4)
+ 8.1 Registers
+ 8.2 Native relative mode 6 byte packet format
+ 8.2.1 Status Packet
+
+
+
+Introduction
+~~~~~~~~~~~~
+
+Currently the Linux Elantech touchpad driver is aware of four different
+hardware versions, unimaginatively called version 1, version 2, version 3
+and version 4. Version 1 is found in "older" laptops and uses 4 bytes per
+packet. Version 2 seems to have been introduced with the EeePC; it uses 6
+bytes per packet, and provides additional features such as the position of
+two fingers and the width of the touch. Hardware version 3 uses 6 bytes per
+packet (and for 2 fingers the concatenation of two 6-byte packets) and allows
+tracking of up to 3 fingers. Hardware version 4 uses 6 bytes per packet, and
+can combine a status packet with multiple head or motion packets. Hardware
+version 4 allows tracking up to 5 fingers.
+
+Some hardware version 3 and version 4 devices also have a trackpoint, which
+uses a separate packet format. It is also 6 bytes per packet.
+
+The driver tries to support both hardware versions and should be compatible
+with the Xorg Synaptics touchpad driver and its graphical configuration
+utilities.
+
+Note that a mouse button is also associated with either the touchpad or the
+trackpoint when a trackpoint is available. Disabling the Touchpad in xorg
+(TouchPadOff=0) will also disable the buttons associated with the touchpad.
+
+Additionally, the operation of the touchpad can be altered by adjusting the
+contents of some of its internal registers. These registers are represented
+by the driver as sysfs entries under /sys/bus/serio/drivers/psmouse/serio?
+that can be read from and written to.
+
+Currently only the registers for hardware version 1 are somewhat understood.
+Hardware version 2 seems to use some of the same registers, but it is not
+known whether the bits in the registers represent the same things or might
+have changed their meaning.
+
+On top of that, some register settings have effect only when the touchpad is
+in relative mode and not in absolute mode. As the Linux Elantech touchpad
+driver always puts the hardware into absolute mode, not all information
+mentioned below can be used immediately. But because there is no freely
+available Elantech documentation, the information is provided here anyway for
+completeness' sake.
+
+
+Extra knobs
+~~~~~~~~~~~
+
+Currently the Linux Elantech touchpad driver provides three extra knobs under
+/sys/bus/serio/drivers/psmouse/serio? for the user.
+
+* debug
+
+ Turn different levels of debugging ON or OFF.
+
+ By echoing "0" to this file all debugging will be turned OFF.
+
+ Currently a value of "1" will turn on some basic debugging and a value of
+ "2" will turn on packet debugging. For hardware version 1 the default is
+ OFF. For version 2 the default is "1".
+
+ Turning packet debugging on will make the driver dump every packet
+ received to the syslog before processing it. Be warned that this can
+ generate quite a lot of data!
+
+* paritycheck
+
+ Turns parity checking ON or OFF.
+
+ By echoing "0" to this file parity checking will be turned OFF. Any
+ non-zero value will turn it ON. For hardware version 1 the default is ON.
+ For version 2 the default is OFF.
+
+ Hardware version 1 provides basic data integrity verification by
+ calculating a parity bit for the last 3 bytes of each packet. The driver
+ can check these bits and reject any packet that appears corrupted. Using
+ this knob you can bypass that check.
+
+ Hardware version 2 does not provide the same parity bits. Only some basic
+ data consistency checking can be done. For now checking is disabled by
+ default. Currently even turning it on will do nothing.
+
+* crc_enabled
+
+ Sets crc_enabled to 0/1. The name "crc_enabled" is the official name of
+ this integrity check, even though it is not an actual cyclic redundancy
+ check.
+
+ Depending on the state of crc_enabled, certain basic data integrity
+ verification is done by the driver on hardware version 3 and 4. The
+ driver will reject any packet that appears corrupted. The state of
+ crc_enabled can be altered with this knob.
+
+ Reading the crc_enabled value will show the active value. Echoing
+ "0" or "1" to this file will set the state to "0" or "1".
+ +Differentiating hardware versions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To detect the hardware version, read the version number as param[0].param[1].param[2]:: + + 4 bytes version: (after the arrow is the name given in the Dell-provided driver) + 02.00.22 => EF013 + 02.06.00 => EF019 + +In the wild, there appear to be more versions, such as 00.01.64, 01.00.21, +02.00.00, 02.00.04, 02.00.06:: + + 6 bytes: + 02.00.30 => EF113 + 02.08.00 => EF023 + 02.08.XX => EF123 + 02.0B.00 => EF215 + 04.01.XX => Scroll_EF051 + 04.02.XX => EF051 + +In the wild, there appear to be more versions, such as 04.03.01, 04.04.11. There +appears to be almost no difference, except for EF113, which does not report +pressure/width and has different data consistency checks. + +Probably all the versions with param[0] <= 01 can be considered as +4 bytes/firmware 1. The versions < 02.08.00, with the exception of 02.00.30, as +4 bytes/firmware 2. Everything >= 02.08.00 can be considered as 6 bytes. + + +Hardware version 1 +~~~~~~~~~~~~~~~~~~ + +Registers +--------- + +By echoing a hexadecimal value to a register it contents can be altered. + +For example:: + + echo -n 0x16 > reg_10 + +* reg_10:: + + bit 7 6 5 4 3 2 1 0 + B C T D L A S E + + E: 1 = enable smart edges unconditionally + S: 1 = enable smart edges only when dragging + A: 1 = absolute mode (needs 4 byte packets, see reg_11) + L: 1 = enable drag lock (see reg_22) + D: 1 = disable dynamic resolution + T: 1 = disable tapping + C: 1 = enable corner tap + B: 1 = swap left and right button + +* reg_11:: + + bit 7 6 5 4 3 2 1 0 + 1 0 0 H V 1 F P + + P: 1 = enable parity checking for relative mode + F: 1 = enable native 4 byte packet mode + V: 1 = enable vertical scroll area + H: 1 = enable horizontal scroll area + +* reg_20:: + + single finger width? + +* reg_21:: + + scroll area width (small: 0x40 ... wide: 0xff) + +* reg_22:: + + drag lock time out (short: 0x14 ... long: 0xfe; + 0xff = tap again to release) + +* reg_23:: + + tap make timeout? + +* reg_24:: + + tap release timeout? + +* reg_25:: + + smart edge cursor speed (0x02 = slow, 0x03 = medium, 0x04 = fast) + +* reg_26:: + + smart edge activation area width? + + +Native relative mode 4 byte packet format +----------------------------------------- + +byte 0:: + + bit 7 6 5 4 3 2 1 0 + c c p2 p1 1 M R L + + L, R, M = 1 when Left, Right, Middle mouse button pressed + some models have M as byte 3 odd parity bit + when parity checking is enabled (reg_11, P = 1): + p1..p2 = byte 1 and 2 odd parity bit + c = 1 when corner tap detected + +byte 1:: + + bit 7 6 5 4 3 2 1 0 + dx7 dx6 dx5 dx4 dx3 dx2 dx1 dx0 + + dx7..dx0 = x movement; positive = right, negative = left + byte 1 = 0xf0 when corner tap detected + +byte 2:: + + bit 7 6 5 4 3 2 1 0 + dy7 dy6 dy5 dy4 dy3 dy2 dy1 dy0 + + dy7..dy0 = y movement; positive = up, negative = down + +byte 3:: + + parity checking enabled (reg_11, P = 1): + + bit 7 6 5 4 3 2 1 0 + w h n1 n0 ds3 ds2 ds1 ds0 + + normally: + ds3..ds0 = scroll wheel amount and direction + positive = down or left + negative = up or right + when corner tap detected: + ds0 = 1 when top right corner tapped + ds1 = 1 when bottom right corner tapped + ds2 = 1 when bottom left corner tapped + ds3 = 1 when top left corner tapped + n1..n0 = number of fingers on touchpad + only models with firmware 2.x report this, models with + firmware 1.x seem to map one, two and three finger taps + directly to L, M and R mouse buttons + h = 1 when horizontal scroll action + w = 1 when wide finger touch? 
+ + otherwise (reg_11, P = 0): + + bit 7 6 5 4 3 2 1 0 + ds7 ds6 ds5 ds4 ds3 ds2 ds1 ds0 + + ds7..ds0 = vertical scroll amount and direction + negative = up + positive = down + + +Native absolute mode 4 byte packet format +----------------------------------------- + +EF013 and EF019 have a special behaviour (due to a bug in the firmware?), and +when 1 finger is touching, the first 2 position reports must be discarded. +This counting is reset whenever a different number of fingers is reported. + +byte 0:: + + firmware version 1.x: + + bit 7 6 5 4 3 2 1 0 + D U p1 p2 1 p3 R L + + L, R = 1 when Left, Right mouse button pressed + p1..p3 = byte 1..3 odd parity bit + D, U = 1 when rocker switch pressed Up, Down + + firmware version 2.x: + + bit 7 6 5 4 3 2 1 0 + n1 n0 p2 p1 1 p3 R L + + L, R = 1 when Left, Right mouse button pressed + p1..p3 = byte 1..3 odd parity bit + n1..n0 = number of fingers on touchpad + +byte 1:: + + firmware version 1.x: + + bit 7 6 5 4 3 2 1 0 + f 0 th tw x9 x8 y9 y8 + + tw = 1 when two finger touch + th = 1 when three finger touch + f = 1 when finger touch + + firmware version 2.x: + + bit 7 6 5 4 3 2 1 0 + . . . . x9 x8 y9 y8 + +byte 2:: + + bit 7 6 5 4 3 2 1 0 + x7 x6 x5 x4 x3 x2 x1 x0 + + x9..x0 = absolute x value (horizontal) + +byte 3:: + + bit 7 6 5 4 3 2 1 0 + y7 y6 y5 y4 y3 y2 y1 y0 + + y9..y0 = absolute y value (vertical) + + +Hardware version 2 +~~~~~~~~~~~~~~~~~~ + + +Registers +--------- + +By echoing a hexadecimal value to a register it contents can be altered. + +For example:: + + echo -n 0x56 > reg_10 + +* reg_10:: + + bit 7 6 5 4 3 2 1 0 + 0 1 0 1 0 1 D 0 + + D: 1 = enable drag and drop + +* reg_11:: + + bit 7 6 5 4 3 2 1 0 + 1 0 0 0 S 0 1 0 + + S: 1 = enable vertical scroll + +* reg_21:: + + unknown (0x00) + +* reg_22:: + + drag and drop release time out (short: 0x70 ... long 0x7e; + 0x7f = never i.e. tap again to release) + + +Native absolute mode 6 byte packet format +----------------------------------------- + +Parity checking and packet re-synchronization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +There is no parity checking, however some consistency checks can be performed. + +For instance for EF113:: + + SA1= packet[0]; + A1 = packet[1]; + B1 = packet[2]; + SB1= packet[3]; + C1 = packet[4]; + D1 = packet[5]; + if( (((SA1 & 0x3C) != 0x3C) && ((SA1 & 0xC0) != 0x80)) || // check Byte 1 + (((SA1 & 0x0C) != 0x0C) && ((SA1 & 0xC0) == 0x80)) || // check Byte 1 (one finger pressed) + (((SA1 & 0xC0) != 0x80) && (( A1 & 0xF0) != 0x00)) || // check Byte 2 + (((SB1 & 0x3E) != 0x38) && ((SA1 & 0xC0) != 0x80)) || // check Byte 4 + (((SB1 & 0x0E) != 0x08) && ((SA1 & 0xC0) == 0x80)) || // check Byte 4 (one finger pressed) + (((SA1 & 0xC0) != 0x80) && (( C1 & 0xF0) != 0x00)) ) // check Byte 5 + // error detected + +For all the other ones, there are just a few constant bits:: + + if( ((packet[0] & 0x0C) != 0x04) || + ((packet[3] & 0x0f) != 0x02) ) + // error detected + + +In case an error is detected, all the packets are shifted by one (and packet[0] is discarded). + +One/Three finger touch +^^^^^^^^^^^^^^^^^^^^^^ + +byte 0:: + + bit 7 6 5 4 3 2 1 0 + n1 n0 w3 w2 . . R L + + L, R = 1 when Left, Right mouse button pressed + n1..n0 = number of fingers on touchpad + +byte 1:: + + bit 7 6 5 4 3 2 1 0 + p7 p6 p5 p4 x11 x10 x9 x8 + +byte 2:: + + bit 7 6 5 4 3 2 1 0 + x7 x6 x5 x4 x3 x2 x1 x0 + + x11..x0 = absolute x value (horizontal) + +byte 3:: + + bit 7 6 5 4 3 2 1 0 + n4 vf w1 w0 . . . 
b2 + + n4 = set if more than 3 fingers (only in 3 fingers mode) + vf = a kind of flag ? (only on EF123, 0 when finger is over one + of the buttons, 1 otherwise) + w3..w0 = width of the finger touch (not EF113) + b2 (on EF113 only, 0 otherwise), b2.R.L indicates one button pressed: + 0 = none + 1 = Left + 2 = Right + 3 = Middle (Left and Right) + 4 = Forward + 5 = Back + 6 = Another one + 7 = Another one + +byte 4:: + + bit 7 6 5 4 3 2 1 0 + p3 p1 p2 p0 y11 y10 y9 y8 + + p7..p0 = pressure (not EF113) + +byte 5:: + + bit 7 6 5 4 3 2 1 0 + y7 y6 y5 y4 y3 y2 y1 y0 + + y11..y0 = absolute y value (vertical) + + +Two finger touch +^^^^^^^^^^^^^^^^ + +Note that the two pairs of coordinates are not exactly the coordinates of the +two fingers, but only the pair of the lower-left and upper-right coordinates. +So the actual fingers might be situated on the other diagonal of the square +defined by these two points. + +byte 0:: + + bit 7 6 5 4 3 2 1 0 + n1 n0 ay8 ax8 . . R L + + L, R = 1 when Left, Right mouse button pressed + n1..n0 = number of fingers on touchpad + +byte 1:: + + bit 7 6 5 4 3 2 1 0 + ax7 ax6 ax5 ax4 ax3 ax2 ax1 ax0 + + ax8..ax0 = lower-left finger absolute x value + +byte 2:: + + bit 7 6 5 4 3 2 1 0 + ay7 ay6 ay5 ay4 ay3 ay2 ay1 ay0 + + ay8..ay0 = lower-left finger absolute y value + +byte 3:: + + bit 7 6 5 4 3 2 1 0 + . . by8 bx8 . . . . + +byte 4:: + + bit 7 6 5 4 3 2 1 0 + bx7 bx6 bx5 bx4 bx3 bx2 bx1 bx0 + + bx8..bx0 = upper-right finger absolute x value + +byte 5:: + + bit 7 6 5 4 3 2 1 0 + by7 by8 by5 by4 by3 by2 by1 by0 + + by8..by0 = upper-right finger absolute y value + +Hardware version 3 +~~~~~~~~~~~~~~~~~~ + +Registers +--------- + +* reg_10:: + + bit 7 6 5 4 3 2 1 0 + 0 0 0 0 R F T A + + A: 1 = enable absolute tracking + T: 1 = enable two finger mode auto correct + F: 1 = disable ABS Position Filter + R: 1 = enable real hardware resolution + +Native absolute mode 6 byte packet format +----------------------------------------- + +1 and 3 finger touch shares the same 6-byte packet format, except that +3 finger touch only reports the position of the center of all three fingers. + +Firmware would send 12 bytes of data for 2 finger touch. + +Note on debounce: +In case the box has unstable power supply or other electricity issues, or +when number of finger changes, F/W would send "debounce packet" to inform +driver that the hardware is in debounce status. +The debouce packet has the following signature:: + + byte 0: 0xc4 + byte 1: 0xff + byte 2: 0xff + byte 3: 0x02 + byte 4: 0xff + byte 5: 0xff + +When we encounter this kind of packet, we just ignore it. + +One/Three finger touch +^^^^^^^^^^^^^^^^^^^^^^ + +byte 0:: + + bit 7 6 5 4 3 2 1 0 + n1 n0 w3 w2 0 1 R L + + L, R = 1 when Left, Right mouse button pressed + n1..n0 = number of fingers on touchpad + +byte 1:: + + bit 7 6 5 4 3 2 1 0 + p7 p6 p5 p4 x11 x10 x9 x8 + +byte 2:: + + bit 7 6 5 4 3 2 1 0 + x7 x6 x5 x4 x3 x2 x1 x0 + + x11..x0 = absolute x value (horizontal) + +byte 3:: + + bit 7 6 5 4 3 2 1 0 + 0 0 w1 w0 0 0 1 0 + + w3..w0 = width of the finger touch + +byte 4:: + + bit 7 6 5 4 3 2 1 0 + p3 p1 p2 p0 y11 y10 y9 y8 + + p7..p0 = pressure + +byte 5:: + + bit 7 6 5 4 3 2 1 0 + y7 y6 y5 y4 y3 y2 y1 y0 + + y11..y0 = absolute y value (vertical) + +Two finger touch +^^^^^^^^^^^^^^^^ + +The packet format is exactly the same for two finger touch, except the hardware +sends two 6 byte packets. The first packet contains data for the first finger, +the second packet has data for the second finger. 
So for two finger touch a +total of 12 bytes are sent. + +Hardware version 4 +~~~~~~~~~~~~~~~~~~ + +Registers +--------- + +* reg_07:: + + bit 7 6 5 4 3 2 1 0 + 0 0 0 0 0 0 0 A + + A: 1 = enable absolute tracking + +Native absolute mode 6 byte packet format +----------------------------------------- + +v4 hardware is a true multitouch touchpad, capable of tracking up to 5 fingers. +Unfortunately, due to PS/2's limited bandwidth, its packet format is rather +complex. + +Whenever the numbers or identities of the fingers changes, the hardware sends a +status packet to indicate how many and which fingers is on touchpad, followed by +head packets or motion packets. A head packet contains data of finger id, finger +position (absolute x, y values), width, and pressure. A motion packet contains +two fingers' position delta. + +For example, when status packet tells there are 2 fingers on touchpad, then we +can expect two following head packets. If the finger status doesn't change, +the following packets would be motion packets, only sending delta of finger +position, until we receive a status packet. + +One exception is one finger touch. when a status packet tells us there is only +one finger, the hardware would just send head packets afterwards. + +Status packet +^^^^^^^^^^^^^ + +byte 0:: + + bit 7 6 5 4 3 2 1 0 + . . . . 0 1 R L + + L, R = 1 when Left, Right mouse button pressed + +byte 1:: + + bit 7 6 5 4 3 2 1 0 + . . . ft4 ft3 ft2 ft1 ft0 + + ft4 ft3 ft2 ft1 ft0 ftn = 1 when finger n is on touchpad + +byte 2:: + + not used + +byte 3:: + + bit 7 6 5 4 3 2 1 0 + . . . 1 0 0 0 0 + + constant bits + +byte 4:: + + bit 7 6 5 4 3 2 1 0 + p . . . . . . . + + p = 1 for palm + +byte 5:: + + not used + +Head packet +^^^^^^^^^^^ + +byte 0:: + + bit 7 6 5 4 3 2 1 0 + w3 w2 w1 w0 0 1 R L + + L, R = 1 when Left, Right mouse button pressed + w3..w0 = finger width (spans how many trace lines) + +byte 1:: + + bit 7 6 5 4 3 2 1 0 + p7 p6 p5 p4 x11 x10 x9 x8 + +byte 2:: + + bit 7 6 5 4 3 2 1 0 + x7 x6 x5 x4 x3 x2 x1 x0 + + x11..x0 = absolute x value (horizontal) + +byte 3:: + + bit 7 6 5 4 3 2 1 0 + id2 id1 id0 1 0 0 0 1 + + id2..id0 = finger id + +byte 4:: + + bit 7 6 5 4 3 2 1 0 + p3 p1 p2 p0 y11 y10 y9 y8 + + p7..p0 = pressure + +byte 5:: + + bit 7 6 5 4 3 2 1 0 + y7 y6 y5 y4 y3 y2 y1 y0 + + y11..y0 = absolute y value (vertical) + +Motion packet +^^^^^^^^^^^^^ + +byte 0:: + + bit 7 6 5 4 3 2 1 0 + id2 id1 id0 w 0 1 R L + + L, R = 1 when Left, Right mouse button pressed + id2..id0 = finger id + w = 1 when delta overflows (> 127 or < -128), in this case + firmware sends us (delta x / 5) and (delta y / 5) + +byte 1:: + + bit 7 6 5 4 3 2 1 0 + x7 x6 x5 x4 x3 x2 x1 x0 + + x7..x0 = delta x (two's complement) + +byte 2:: + + bit 7 6 5 4 3 2 1 0 + y7 y6 y5 y4 y3 y2 y1 y0 + + y7..y0 = delta y (two's complement) + +byte 3:: + + bit 7 6 5 4 3 2 1 0 + id2 id1 id0 1 0 0 1 0 + + id2..id0 = finger id + +byte 4:: + + bit 7 6 5 4 3 2 1 0 + x7 x6 x5 x4 x3 x2 x1 x0 + + x7..x0 = delta x (two's complement) + +byte 5:: + + bit 7 6 5 4 3 2 1 0 + y7 y6 y5 y4 y3 y2 y1 y0 + + y7..y0 = delta y (two's complement) + + byte 0 ~ 2 for one finger + byte 3 ~ 5 for another + + +Trackpoint (for Hardware version 3 and 4) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Registers +--------- + +No special registers have been identified. 
+ +Native relative mode 6 byte packet format +----------------------------------------- + +Status Packet +^^^^^^^^^^^^^ + +byte 0:: + + bit 7 6 5 4 3 2 1 0 + 0 0 sx sy 0 M R L + +byte 1:: + + bit 7 6 5 4 3 2 1 0 + ~sx 0 0 0 0 0 0 0 + +byte 2:: + + bit 7 6 5 4 3 2 1 0 + ~sy 0 0 0 0 0 0 0 + +byte 3:: + + bit 7 6 5 4 3 2 1 0 + 0 0 ~sy ~sx 0 1 1 0 + +byte 4:: + + bit 7 6 5 4 3 2 1 0 + x7 x6 x5 x4 x3 x2 x1 x0 + +byte 5:: + + bit 7 6 5 4 3 2 1 0 + y7 y6 y5 y4 y3 y2 y1 y0 + + + x and y are written in two's complement spread + over 9 bits with sx/sy the relative top bit and + x7..x0 and y7..y0 the lower bits. + ~sx is the inverse of sx, ~sy is the inverse of sy. + The sign of y is opposite to what the input driver + expects for a relative movement diff --git a/Documentation/input/devices/gpio-tilt.rst b/Documentation/input/devices/gpio-tilt.rst new file mode 100644 index 0000000000000000000000000000000000000000..fa6e64570aa72f622b5484ab4b8fa90e6ac8aaa5 --- /dev/null +++ b/Documentation/input/devices/gpio-tilt.rst @@ -0,0 +1,103 @@ +Driver for tilt-switches connected via GPIOs +============================================ + +Generic driver to read data from tilt switches connected via gpios. +Orientation can be provided by one or more than one tilt switches, +i.e. each tilt switch providing one axis, and the number of axes +is also not limited. + + +Data structures +--------------- + +The array of struct gpio in the gpios field is used to list the gpios +that represent the current tilt state. + +The array of struct gpio_tilt_axis describes the axes that are reported +to the input system. The values set therein are used for the +input_set_abs_params calls needed to init the axes. + +The array of struct gpio_tilt_state maps gpio states to the corresponding +values to report. The gpio state is represented as a bitfield where the +bit-index corresponds to the index of the gpio in the struct gpio array. +In the same manner the values stored in the axes array correspond to +the elements of the gpio_tilt_axis-array. 
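+
+To make the bitfield mapping concrete, the following sketch shows how the
+current GPIO levels could be folded into a state value and matched against
+the states array. This is a simplified illustration only, not the actual
+gpio-tilt-polled implementation; the full platform configuration example
+follows below::
+
+    /* Sketch: fold the GPIO levels into the state bitfield described
+     * above and find the matching gpio_tilt_state. Illustration only. */
+    #include <linux/gpio.h>
+    #include <linux/input/gpio_tilt.h>
+
+    static const struct gpio_tilt_state *
+    tilt_lookup(const struct gpio_tilt_platform_data *pdata)
+    {
+            unsigned int state = 0;
+            int i;
+
+            for (i = 0; i < pdata->nr_gpios; i++)
+                    state |= !!gpio_get_value(pdata->gpios[i].gpio) << i;
+
+            for (i = 0; i < pdata->nr_states; i++)
+                    if (pdata->states[i].gpios == state)
+                            return &pdata->states[i];
+
+            return NULL;    /* no configured state matches */
+    }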
+ + +Example +------- + +Example configuration for a single TS1003 tilt switch that rotates around +one axis in 4 steps and emits the current tilt via two GPIOs:: + + static int sg060_tilt_enable(struct device *dev) { + /* code to enable the sensors */ + }; + + static void sg060_tilt_disable(struct device *dev) { + /* code to disable the sensors */ + }; + + static struct gpio sg060_tilt_gpios[] = { + { SG060_TILT_GPIO_SENSOR1, GPIOF_IN, "tilt_sensor1" }, + { SG060_TILT_GPIO_SENSOR2, GPIOF_IN, "tilt_sensor2" }, + }; + + static struct gpio_tilt_state sg060_tilt_states[] = { + { + .gpios = (0 << 1) | (0 << 0), + .axes = (int[]) { + 0, + }, + }, { + .gpios = (0 << 1) | (1 << 0), + .axes = (int[]) { + 1, /* 90 degrees */ + }, + }, { + .gpios = (1 << 1) | (1 << 0), + .axes = (int[]) { + 2, /* 180 degrees */ + }, + }, { + .gpios = (1 << 1) | (0 << 0), + .axes = (int[]) { + 3, /* 270 degrees */ + }, + }, + }; + + static struct gpio_tilt_axis sg060_tilt_axes[] = { + { + .axis = ABS_RY, + .min = 0, + .max = 3, + .fuzz = 0, + .flat = 0, + }, + }; + + static struct gpio_tilt_platform_data sg060_tilt_pdata= { + .gpios = sg060_tilt_gpios, + .nr_gpios = ARRAY_SIZE(sg060_tilt_gpios), + + .axes = sg060_tilt_axes, + .nr_axes = ARRAY_SIZE(sg060_tilt_axes), + + .states = sg060_tilt_states, + .nr_states = ARRAY_SIZE(sg060_tilt_states), + + .debounce_interval = 100, + + .poll_interval = 1000, + .enable = sg060_tilt_enable, + .disable = sg060_tilt_disable, + }; + + static struct platform_device sg060_device_tilt = { + .name = "gpio-tilt-polled", + .id = -1, + .dev = { + .platform_data = &sg060_tilt_pdata, + }, + }; diff --git a/Documentation/input/devices/iforce-protocol.rst b/Documentation/input/devices/iforce-protocol.rst new file mode 100644 index 0000000000000000000000000000000000000000..8634beac3fdb67b72d137012664f824eaeb2008c --- /dev/null +++ b/Documentation/input/devices/iforce-protocol.rst @@ -0,0 +1,381 @@ +=============== +Iforce Protocol +=============== + +:Author: Johann Deneux + +Home page at ``_ + +:Additions: by Vojtech Pavlik. + + +Introduction +============ + +This document describes what I managed to discover about the protocol used to +specify force effects to I-Force 2.0 devices. None of this information comes +from Immerse. That's why you should not trust what is written in this +document. This document is intended to help understanding the protocol. +This is not a reference. Comments and corrections are welcome. To contact me, +send an email to: johann.deneux@gmail.com + +.. warning:: + + I shall not be held responsible for any damage or harm caused if you try to + send data to your I-Force device based on what you read in this document. + +Preliminary Notes +================= + +All values are hexadecimal with big-endian encoding (msb on the left). Beware, +values inside packets are encoded using little-endian. Bytes whose roles are +unknown are marked ??? Information that needs deeper inspection is marked (?) + +General form of a packet +------------------------ + +This is how packets look when the device uses the rs232 to communicate. + +== == === ==== == +2B OP LEN DATA CS +== == === ==== == + +CS is the checksum. It is equal to the exclusive or of all bytes. + +When using USB: + +== ==== +OP DATA +== ==== + +The 2B, LEN and CS fields have disappeared, probably because USB handles +frames and data corruption is handled or unsignificant. 
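+
+To illustrate the checksum rule for the RS232 form, here is a small sketch.
+My reading of "the exclusive or of all bytes" is that CS covers every byte
+that precedes it, including the leading 2B; that detail has not been
+verified against hardware::
+
+    /* Sketch: CS as the XOR of all bytes preceding it in an RS232
+     * packet (2B OP LEN DATA). Whether the leading 0x2B is included
+     * is an assumption based on the wording above. */
+    #include <stddef.h>
+    #include <stdint.h>
+
+    static uint8_t iforce_cs(const uint8_t *pkt, size_t n)
+    {
+            uint8_t cs = 0;
+            size_t i;
+
+            for (i = 0; i < n; i++)
+                    cs ^= pkt[i];
+            return cs;      /* append as the final byte of the packet */
+    }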
+ +First, I describe effects that are sent by the device to the computer + +Device input state +================== + +This packet is used to indicate the state of each button and the value of each +axis:: + + OP= 01 for a joystick, 03 for a wheel + LEN= Varies from device to device + 00 X-Axis lsb + 01 X-Axis msb + 02 Y-Axis lsb, or gas pedal for a wheel + 03 Y-Axis msb, or brake pedal for a wheel + 04 Throttle + 05 Buttons + 06 Lower 4 bits: Buttons + Upper 4 bits: Hat + 07 Rudder + +Device effects states +===================== + +:: + + OP= 02 + LEN= Varies + 00 ? Bit 1 (Value 2) is the value of the deadman switch + 01 Bit 8 is set if the effect is playing. Bits 0 to 7 are the effect id. + 02 ?? + 03 Address of parameter block changed (lsb) + 04 Address of parameter block changed (msb) + 05 Address of second parameter block changed (lsb) + ... depending on the number of parameter blocks updated + +Force effect +------------ + +:: + + OP= 01 + LEN= 0e + 00 Channel (when playing several effects at the same time, each must + be assigned a channel) + 01 Wave form + Val 00 Constant + Val 20 Square + Val 21 Triangle + Val 22 Sine + Val 23 Sawtooth up + Val 24 Sawtooth down + Val 40 Spring (Force = f(pos)) + Val 41 Friction (Force = f(velocity)) and Inertia + (Force = f(acceleration)) + + + 02 Axes affected and trigger + Bits 4-7: Val 2 = effect along one axis. Byte 05 indicates direction + Val 4 = X axis only. Byte 05 must contain 5a + Val 8 = Y axis only. Byte 05 must contain b4 + Val c = X and Y axes. Bytes 05 must contain 60 + Bits 0-3: Val 0 = No trigger + Val x+1 = Button x triggers the effect + When the whole byte is 0, cancel the previously set trigger + + 03-04 Duration of effect (little endian encoding, in ms) + + 05 Direction of effect, if applicable. Else, see 02 for value to assign. + + 06-07 Minimum time between triggering. + + 08-09 Address of periodicity or magnitude parameters + 0a-0b Address of attack and fade parameters, or ffff if none. + *or* + 08-09 Address of interactive parameters for X-axis, + or ffff if not applicable + 0a-0b Address of interactive parameters for Y-axis, + or ffff if not applicable + + 0c-0d Delay before execution of effect (little endian encoding, in ms) + + +Time based parameters +--------------------- + +Attack and fade +^^^^^^^^^^^^^^^ + +:: + + OP= 02 + LEN= 08 + 00-01 Address where to store the parameters + 02-03 Duration of attack (little endian encoding, in ms) + 04 Level at end of attack. Signed byte. + 05-06 Duration of fade. + 07 Level at end of fade. + +Magnitude +^^^^^^^^^ + +:: + + OP= 03 + LEN= 03 + 00-01 Address + 02 Level. Signed byte. + +Periodicity +^^^^^^^^^^^ + +:: + + OP= 04 + LEN= 07 + 00-01 Address + 02 Magnitude. Signed byte. + 03 Offset. Signed byte. + 04 Phase. Val 00 = 0 deg, Val 40 = 90 degs. + 05-06 Period (little endian encoding, in ms) + +Interactive parameters +---------------------- + +:: + + OP= 05 + LEN= 0a + 00-01 Address + 02 Positive Coeff + 03 Negative Coeff + 04+05 Offset (center) + 06+07 Dead band (Val 01F4 = 5000 (decimal)) + 08 Positive saturation (Val 0a = 1000 (decimal) Val 64 = 10000 (decimal)) + 09 Negative saturation + +The encoding is a bit funny here: For coeffs, these are signed values. The +maximum value is 64 (100 decimal), the min is 9c. +For the offset, the minimum value is FE0C, the maximum value is 01F4. +For the deadband, the minimum value is 0, the max is 03E8. + +Controls +-------- + +:: + + OP= 41 + LEN= 03 + 00 Channel + 01 Start/Stop + Val 00: Stop + Val 01: Start and play once. 
+ Val 41: Start and play n times (See byte 02 below) + 02 Number of iterations n. + +Init +---- + + +Querying features +^^^^^^^^^^^^^^^^^ +:: + + OP= ff + Query command. Length varies according to the query type. + The general format of this packet is: + ff 01 QUERY [INDEX] CHECKSUM + responses are of the same form: + FF LEN QUERY VALUE_QUERIED CHECKSUM2 + where LEN = 1 + length(VALUE_QUERIED) + +Query ram size +~~~~~~~~~~~~~~ + +:: + + QUERY = 42 ('B'uffer size) + +The device should reply with the same packet plus two additional bytes +containing the size of the memory: +ff 03 42 03 e8 CS would mean that the device has 1000 bytes of ram available. + +Query number of effects +~~~~~~~~~~~~~~~~~~~~~~~ + +:: + + QUERY = 4e ('N'umber of effects) + +The device should respond by sending the number of effects that can be played +at the same time (one byte) +ff 02 4e 14 CS would stand for 20 effects. + +Vendor's id +~~~~~~~~~~~ + +:: + + QUERY = 4d ('M'anufacturer) + +Query the vendors'id (2 bytes) + +Product id +~~~~~~~~~~ + +:: + + QUERY = 50 ('P'roduct) + +Query the product id (2 bytes) + +Open device +~~~~~~~~~~~ + +:: + + QUERY = 4f ('O'pen) + +No data returned. + +Close device +~~~~~~~~~~~~ + +:: + + QUERY = 43 ('C')lose + +No data returned. + +Query effect +~~~~~~~~~~~~ + +:: + + QUERY = 45 ('E') + +Send effect type. +Returns nonzero if supported (2 bytes) + +Firmware Version +~~~~~~~~~~~~~~~~ + +:: + + QUERY = 56 ('V'ersion) + +Sends back 3 bytes - major, minor, subminor + +Initialisation of the device +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Set Control +~~~~~~~~~~~ + +.. note:: + Device dependent, can be different on different models! + +:: + + OP= 40 [] + LEN= 2 or 3 + 00 Idx + Idx 00 Set dead zone (0..2048) + Idx 01 Ignore Deadman sensor (0..1) + Idx 02 Enable comm watchdog (0..1) + Idx 03 Set the strength of the spring (0..100) + Idx 04 Enable or disable the spring (0/1) + Idx 05 Set axis saturation threshold (0..2048) + +Set Effect State +~~~~~~~~~~~~~~~~ + +:: + + OP= 42 + LEN= 1 + 00 State + Bit 3 Pause force feedback + Bit 2 Enable force feedback + Bit 0 Stop all effects + +Set overall +~~~~~~~~~~~ + +:: + + OP= 43 + LEN= 1 + 00 Gain + Val 00 = 0% + Val 40 = 50% + Val 80 = 100% + +Parameter memory +---------------- + +Each device has a certain amount of memory to store parameters of effects. +The amount of RAM may vary, I encountered values from 200 to 1000 bytes. Below +is the amount of memory apparently needed for every set of parameters: + + - period : 0c + - magnitude : 02 + - attack and fade : 0e + - interactive : 08 + +Appendix: How to study the protocol? +==================================== + +1. Generate effects using the force editor provided with the DirectX SDK, or +use Immersion Studio (freely available at their web site in the developer section: +www.immersion.com) +2. Start a soft spying RS232 or USB (depending on where you connected your +joystick/wheel). I used ComPortSpy from fCoder (alpha version!) +3. Play the effect, and watch what happens on the spy screen. + +A few words about ComPortSpy: +At first glance, this software seems, hum, well... buggy. In fact, data appear with a +few seconds latency. Personally, I restart it every time I play an effect. +Remember it's free (as in free beer) and alpha! + +URLS +==== + +Check http://www.immerse.com for Immersion Studio, +and http://www.fcoder.com for ComPortSpy. + + +I-Force is trademark of Immersion Corp. 
diff --git a/Documentation/input/devices/index.rst b/Documentation/input/devices/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..95a453782bad377d234e101700d031f69d9f26bd --- /dev/null +++ b/Documentation/input/devices/index.rst @@ -0,0 +1,19 @@ +Driver-specific documentation +============================= + +This section provides information about various devices supported by the +Linux kernel, their protocols, and driver details. + +.. toctree:: + :maxdepth: 2 + :numbered: + :glob: + + * + +.. only:: subproject and html + + Indices + ======= + + * :ref:`genindex` diff --git a/Documentation/input/devices/joystick-parport.rst b/Documentation/input/devices/joystick-parport.rst new file mode 100644 index 0000000000000000000000000000000000000000..e8ce16ee799a2a089312e842594e1248b1350ef6 --- /dev/null +++ b/Documentation/input/devices/joystick-parport.rst @@ -0,0 +1,611 @@ +.. include:: + +.. _joystick-parport: + +============================== +Parallel Port Joystick Drivers +============================== + +:Copyright: |copy| 1998-2000 Vojtech Pavlik +:Copyright: |copy| 1998 Andree Borrmann + + +Sponsored by SuSE + +Disclaimer +========== + +Any information in this file is provided as-is, without any guarantee that +it will be true. So, use it at your own risk. The possible damages that can +happen include burning your parallel port, and/or the sticks and joystick +and maybe even more. Like when a lightning kills you it is not our problem. + +Introduction +============ + +The joystick parport drivers are used for joysticks and gamepads not +originally designed for PCs and other computers Linux runs on. Because of +that, PCs usually lack the right ports to connect these devices to. Parallel +port, because of its ability to change single bits at will, and providing +both output and input bits is the most suitable port on the PC for +connecting such devices. + +Devices supported +================= + +Many console and 8-bit computer gamepads and joysticks are supported. The +following subsections discuss usage of each. + +NES and SNES +------------ + +The Nintendo Entertainment System and Super Nintendo Entertainment System +gamepads are widely available, and easy to get. Also, they are quite easy to +connect to a PC, and don't need much processing speed (108 us for NES and +165 us for SNES, compared to about 1000 us for PC gamepads) to communicate +with them. + +All NES and SNES use the same synchronous serial protocol, clocked from +the computer's side (and thus timing insensitive). To allow up to 5 NES +and/or SNES gamepads and/or SNES mice connected to the parallel port at once, +the output lines of the parallel port are shared, while one of 5 available +input lines is assigned to each gamepad. + +This protocol is handled by the gamecon.c driver, so that's the one +you'll use for NES, SNES gamepads and SNES mice. + +The main problem with PC parallel ports is that they don't have +5V power +source on any of their pins. So, if you want a reliable source of power +for your pads, use either keyboard or joystick port, and make a pass-through +cable. You can also pull the power directly from the power supply (the red +wire is +5V). + +If you want to use the parallel port only, you can take the power is from +some data pin. For most gamepad and parport implementations only one pin is +needed, and I'd recommend pin 9 for that, the highest data bit. 
On the other +hand, if you are not planning to use anything else than NES / SNES on the +port, anything between and including pin 4 and pin 9 will work:: + + (pin 9) -----> Power + +Unfortunately, there are pads that need a lot more of power, and parallel +ports that can't give much current through the data pins. If this is your +case, you'll need to use diodes (as a prevention of destroying your parallel +port), and combine the currents of two or more data bits together:: + + Diodes + (pin 9) ----|>|-------+------> Power + | + (pin 8) ----|>|-------+ + | + (pin 7) ----|>|-------+ + | + : + | + (pin 4) ----|>|-------+ + +Ground is quite easy. On PC's parallel port the ground is on any of the +pins from pin 18 to pin 25. So use any pin of these you like for the ground:: + + (pin 18) -----> Ground + +NES and SNES pads have two input bits, Clock and Latch, which drive the +serial transfer. These are connected to pins 2 and 3 of the parallel port, +respectively:: + + (pin 2) -----> Clock + (pin 3) -----> Latch + +And the last thing is the NES / SNES data wire. Only that isn't shared and +each pad needs its own data pin. The parallel port pins are:: + + (pin 10) -----> Pad 1 data + (pin 11) -----> Pad 2 data + (pin 12) -----> Pad 3 data + (pin 13) -----> Pad 4 data + (pin 15) -----> Pad 5 data + +Note that pin 14 is not used, since it is not an input pin on the parallel +port. + +This is everything you need on the PC's side of the connection, now on to +the gamepads side. The NES and SNES have different connectors. Also, there +are quite a lot of NES clones, and because Nintendo used proprietary +connectors for their machines, the cloners couldn't and used standard D-Cannon +connectors. Anyway, if you've got a gamepad, and it has buttons A, B, Turbo +A, Turbo B, Select and Start, and is connected through 5 wires, then it is +either a NES or NES clone and will work with this connection. SNES gamepads +also use 5 wires, but have more buttons. They will work as well, of course:: + + Pinout for NES gamepads Pinout for SNES gamepads and mice + + +----> Power +-----------------------\ + | 7 | o o o o | x x o | 1 + 5 +---------+ 7 +-----------------------/ + | x x o \ | | | | | + | o o o o | | | | | +-> Ground + 4 +------------+ 1 | | | +------------> Data + | | | | | | +---------------> Latch + | | | +-> Ground | +------------------> Clock + | | +----> Clock +---------------------> Power + | +-------> Latch + +----------> Data + + Pinout for NES clone (db9) gamepads Pinout for NES clone (db15) gamepads + + +---------> Clock +-----------------> Data + | +-------> Latch | +---> Ground + | | +-----> Data | | + | | | ___________________ + _____________ 8 \ o x x x x x x o / 1 + 5 \ x o o o x / 1 \ o x x o x x o / + \ x o x o / 15 `~~~~~~~~~~~~~' 9 + 9 `~~~~~~~' 6 | | | + | | | | +----> Clock + | +----> Power | +----------> Latch + +--------> Ground +----------------> Power + +Multisystem joysticks +--------------------- + +In the era of 8-bit machines, there was something like de-facto standard +for joystick ports. They were all digital, and all used D-Cannon 9 pin +connectors (db9). Because of that, a single joystick could be used without +hassle on Atari (130, 800XE, 800XL, 2600, 7200), Amiga, Commodore C64, +Amstrad CPC, Sinclair ZX Spectrum and many other machines. That's why these +joysticks are called "Multisystem". 
+ +Now their pinout:: + + +---------> Right + | +-------> Left + | | +-----> Down + | | | +---> Up + | | | | + _____________ + 5 \ x o o o o / 1 + \ x o x o / + 9 `~~~~~~~' 6 + | | + | +----> Button + +--------> Ground + +However, as time passed, extensions to this standard developed, and these +were not compatible with each other:: + + + Atari 130, 800/XL/XE MSX + + +-----------> Power + +---------> Right | +---------> Right + | +-------> Left | | +-------> Left + | | +-----> Down | | | +-----> Down + | | | +---> Up | | | | +---> Up + | | | | | | | | | + _____________ _____________ + 5 \ x o o o o / 1 5 \ o o o o o / 1 + \ x o o o / \ o o o o / + 9 `~~~~~~~' 6 9 `~~~~~~~' 6 + | | | | | | | + | | +----> Button | | | +----> Button 1 + | +------> Power | | +------> Button 2 + +--------> Ground | +--------> Output 3 + +----------> Ground + + Amstrad CPC Commodore C64 + + +-----------> Analog Y + +---------> Right | +---------> Right + | +-------> Left | | +-------> Left + | | +-----> Down | | | +-----> Down + | | | +---> Up | | | | +---> Up + | | | | | | | | | + _____________ _____________ + 5 \ x o o o o / 1 5 \ o o o o o / 1 + \ x o o o / \ o o o o / + 9 `~~~~~~~' 6 9 `~~~~~~~' 6 + | | | | | | | + | | +----> Button 1 | | | +----> Button + | +------> Button 2 | | +------> Power + +--------> Ground | +--------> Ground + +----------> Analog X + + Sinclair Spectrum +2A/+3 Amiga 1200 + + +-----------> Up +-----------> Button 3 + | +---------> Fire | +---------> Right + | | | | +-------> Left + | | +-----> Ground | | | +-----> Down + | | | | | | | +---> Up + | | | | | | | | + _____________ _____________ + 5 \ o o x o x / 1 5 \ o o o o o / 1 + \ o o o o / \ o o o o / + 9 `~~~~~~~' 6 9 `~~~~~~~' 6 + | | | | | | | | + | | | +----> Right | | | +----> Button 1 + | | +------> Left | | +------> Power + | +--------> Ground | +--------> Ground + +----------> Down +----------> Button 2 + + And there were many others. + +Multisystem joysticks using db9.c +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For the Multisystem joysticks, and their derivatives, the db9.c driver +was written. It allows only one joystick / gamepad per parallel port, but +the interface is easy to build and works with almost anything. + +For the basic 1-button Multisystem joystick you connect its wires to the +parallel port like this:: + + (pin 1) -----> Power + (pin 18) -----> Ground + + (pin 2) -----> Up + (pin 3) -----> Down + (pin 4) -----> Left + (pin 5) -----> Right + (pin 6) -----> Button 1 + +However, if the joystick is switch based (eg. clicks when you move it), +you might or might not, depending on your parallel port, need 10 kOhm pullup +resistors on each of the direction and button signals, like this:: + + (pin 2) ------------+------> Up + Resistor | + (pin 1) --[10kOhm]--+ + +Try without, and if it doesn't work, add them. For TTL based joysticks / +gamepads the pullups are not needed. + +For joysticks with two buttons you connect the second button to pin 7 on +the parallel port:: + + (pin 7) -----> Button 2 + +And that's it. + +On a side note, if you have already built a different adapter for use with +the digital joystick driver 0.8.0.2, this is also supported by the db9.c +driver, as device type 8. (See section 3.2) + +Multisystem joysticks using gamecon.c +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For some people just one joystick per parallel port is not enough, and/or +want to use them on one parallel port together with NES/SNES/PSX pads. This is +possible using the gamecon.c. 
It supports up to 5 devices of the above types, +including 1 and 2 buttons Multisystem joysticks. + +However, there is nothing for free. To allow more sticks to be used at +once, you need the sticks to be purely switch based (that is non-TTL), and +not to need power. Just a plain simple six switches inside. If your +joystick can do more (eg. turbofire) you'll need to disable it totally first +if you want to use gamecon.c. + +Also, the connection is a bit more complex. You'll need a bunch of diodes, +and one pullup resistor. First, you connect the Directions and the button +the same as for db9, however with the diodes between:: + + Diodes + (pin 2) -----|<|----> Up + (pin 3) -----|<|----> Down + (pin 4) -----|<|----> Left + (pin 5) -----|<|----> Right + (pin 6) -----|<|----> Button 1 + +For two button sticks you also connect the other button:: + + (pin 7) -----|<|----> Button 2 + +And finally, you connect the Ground wire of the joystick, like done in +this little schematic to Power and Data on the parallel port, as described +for the NES / SNES pads in section 2.1 of this file - that is, one data pin +for each joystick. The power source is shared:: + + Data ------------+-----> Ground + Resistor | + Power --[10kOhm]--+ + +And that's all, here we go! + +Multisystem joysticks using turbografx.c +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The TurboGraFX interface, designed by + + Steffen Schwenke + +allows up to 7 Multisystem joysticks connected to the parallel port. In +Steffen's version, there is support for up to 5 buttons per joystick. However, +since this doesn't work reliably on all parallel ports, the turbografx.c driver +supports only one button per joystick. For more information on how to build the +interface, see: + + http://www2.burg-halle.de/~schwenke/parport.html + +Sony Playstation +---------------- + +The PSX controller is supported by the gamecon.c. Pinout of the PSX +controller (compatible with DirectPadPro):: + + +---------+---------+---------+ + 9 | o o o | o o o | o o o | 1 parallel + \________|_________|________/ port pins + | | | | | | + | | | | | +--------> Clock --- (4) + | | | | +------------> Select --- (3) + | | | +---------------> Power --- (5-9) + | | +------------------> Ground --- (18-25) + | +-------------------------> Command --- (2) + +----------------------------> Data --- (one of 10,11,12,13,15) + +The driver supports these controllers: + + * Standard PSX Pad + * NegCon PSX Pad + * Analog PSX Pad (red mode) + * Analog PSX Pad (green mode) + * PSX Rumble Pad + * PSX DDR Pad + +Sega +---- + +All the Sega controllers are more or less based on the standard 2-button +Multisystem joystick. However, since they don't use switches and use TTL +logic, the only driver usable with them is the db9.c driver. + +Sega Master System +~~~~~~~~~~~~~~~~~~ + +The SMS gamepads are almost exactly the same as normal 2-button +Multisystem joysticks. Set the driver to Multi2 mode, use the corresponding +parallel port pins, and the following schematic:: + + +-----------> Power + | +---------> Right + | | +-------> Left + | | | +-----> Down + | | | | +---> Up + | | | | | + _____________ + 5 \ o o o o o / 1 + \ o o x o / + 9 `~~~~~~~' 6 + | | | + | | +----> Button 1 + | +--------> Ground + +----------> Button 2 + +Sega Genesis aka MegaDrive +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The Sega Genesis (in Europe sold as Sega MegaDrive) pads are an extension +to the Sega Master System pads. They use more buttons (3+1, 5+1, 6+1). 
Use +the following schematic:: + + +-----------> Power + | +---------> Right + | | +-------> Left + | | | +-----> Down + | | | | +---> Up + | | | | | + _____________ + 5 \ o o o o o / 1 + \ o o o o / + 9 `~~~~~~~' 6 + | | | | + | | | +----> Button 1 + | | +------> Select + | +--------> Ground + +----------> Button 2 + +The Select pin goes to pin 14 on the parallel port:: + + (pin 14) -----> Select + +The rest is the same as for Multi2 joysticks using db9.c + +Sega Saturn +~~~~~~~~~~~ + +Sega Saturn has eight buttons, and to transfer that, without hacks like +Genesis 6 pads use, it needs one more select pin. Anyway, it is still +handled by the db9.c driver. Its pinout is very different from anything +else. Use this schematic:: + + +-----------> Select 1 + | +---------> Power + | | +-------> Up + | | | +-----> Down + | | | | +---> Ground + | | | | | + _____________ + 5 \ o o o o o / 1 + \ o o o o / + 9 `~~~~~~~' 6 + | | | | + | | | +----> Select 2 + | | +------> Right + | +--------> Left + +----------> Power + +Select 1 is pin 14 on the parallel port, Select 2 is pin 16 on the +parallel port:: + + (pin 14) -----> Select 1 + (pin 16) -----> Select 2 + +The other pins (Up, Down, Right, Left, Power, Ground) are the same as for +Multi joysticks using db9.c + +Amiga CD32 +---------- + +Amiga CD32 joypad uses the following pinout:: + + +-----------> Button 3 + | +---------> Right + | | +-------> Left + | | | +-----> Down + | | | | +---> Up + | | | | | + _____________ + 5 \ o o o o o / 1 + \ o o o o / + 9 `~~~~~~~' 6 + | | | | + | | | +----> Button 1 + | | +------> Power + | +--------> Ground + +----------> Button 2 + +It can be connected to the parallel port and driven by db9.c driver. It needs the following wiring: + + ============ ============= + CD32 pad Parallel port + ============ ============= + 1 (Up) 2 (D0) + 2 (Down) 3 (D1) + 3 (Left) 4 (D2) + 4 (Right) 5 (D3) + 5 (Button 3) 14 (AUTOFD) + 6 (Button 1) 17 (SELIN) + 7 (+5V) 1 (STROBE) + 8 (Gnd) 18 (Gnd) + 9 (Button 2) 7 (D5) + ============ ============= + +The drivers +=========== + +There are three drivers for the parallel port interfaces. Each, as +described above, allows to connect a different group of joysticks and pads. +Here are described their command lines: + +gamecon.c +--------- + +Using gamecon.c you can connect up to five devices to one parallel port. It +uses the following kernel/module command line:: + + gamecon.map=port,pad1,pad2,pad3,pad4,pad5 + +Where ``port`` the number of the parport interface (eg. 0 for parport0). + +And ``pad1`` to ``pad5`` are pad types connected to different data input pins +(10,11,12,13,15), as described in section 2.1 of this file. + +The types are: + + ===== ============================= + Type Joystick/Pad + ===== ============================= + 0 None + 1 SNES pad + 2 NES pad + 4 Multisystem 1-button joystick + 5 Multisystem 2-button joystick + 6 N64 pad + 7 Sony PSX controller + 8 Sony PSX DDR controller + 9 SNES mouse + ===== ============================= + +The exact type of the PSX controller type is autoprobed when used, so +hot swapping should work (but is not recommended). + +Should you want to use more than one of parallel ports at once, you can use +gamecon.map2 and gamecon.map3 as additional command line parameters for two +more parallel ports. + +There are two options specific to PSX driver portion. gamecon.psx_delay sets +the command delay when talking to the controllers. The default of 25 should +work but you can try lowering it for better performance. 
If your pads don't +respond try raising it until they work. Setting the type to 8 allows the +driver to be used with Dance Dance Revolution or similar games. Arrow keys are +registered as key presses instead of X and Y axes. + +db9.c +----- + +Apart from making an interface, there is nothing difficult on using the +db9.c driver. It uses the following kernel/module command line:: + + db9.dev=port,type + +Where ``port`` is the number of the parport interface (eg. 0 for parport0). + +Caveat here: This driver only works on bidirectional parallel ports. If +your parallel port is recent enough, you should have no trouble with this. +Old parallel ports may not have this feature. + +``Type`` is the type of joystick or pad attached: + + ===== ====================================================== + Type Joystick/Pad + ===== ====================================================== + 0 None + 1 Multisystem 1-button joystick + 2 Multisystem 2-button joystick + 3 Genesis pad (3+1 buttons) + 5 Genesis pad (5+1 buttons) + 6 Genesis pad (6+2 buttons) + 7 Saturn pad (8 buttons) + 8 Multisystem 1-button joystick (v0.8.0.2 pin-out) + 9 Two Multisystem 1-button joysticks (v0.8.0.2 pin-out) + 10 Amiga CD32 pad + ===== ====================================================== + +Should you want to use more than one of these joysticks/pads at once, you +can use db9.dev2 and db9.dev3 as additional command line parameters for two +more joysticks/pads. + +turbografx.c +------------ + +The turbografx.c driver uses a very simple kernel/module command line:: + + turbografx.map=port,js1,js2,js3,js4,js5,js6,js7 + +Where ``port`` is the number of the parport interface (eg. 0 for parport0). + +``jsX`` is the number of buttons the Multisystem joysticks connected to the +interface ports 1-7 have. For a standard multisystem joystick, this is 1. + +Should you want to use more than one of these interfaces at once, you can +use turbografx.map2 and turbografx.map3 as additional command line parameters +for two more interfaces. + +PC parallel port pinout +======================= + +:: + + .----------------------------------------. + At the PC: \ 13 12 11 10 9 8 7 6 5 4 3 2 1 / + \ 25 24 23 22 21 20 19 18 17 16 15 14 / + ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +====== ======= ============= + Pin Name Description +====== ======= ============= + 1 /STROBE Strobe + 2-9 D0-D7 Data Bit 0-7 + 10 /ACK Acknowledge + 11 BUSY Busy + 12 PE Paper End + 13 SELIN Select In + 14 /AUTOFD Autofeed + 15 /ERROR Error + 16 /INIT Initialize + 17 /SEL Select + 18-25 GND Signal Ground +====== ======= ============= + + +That's all, folks! Have fun! diff --git a/Documentation/input/devices/ntrig.rst b/Documentation/input/devices/ntrig.rst new file mode 100644 index 0000000000000000000000000000000000000000..a6b22ce6c61cdda2d8c48a53da866374dd52d07d --- /dev/null +++ b/Documentation/input/devices/ntrig.rst @@ -0,0 +1,137 @@ +.. include:: + +========================= +N-Trig touchscreen Driver +========================= + +:Copyright: |copy| 2008-2010 Rafi Rubin +:Copyright: |copy| 2009-2010 Stephane Chatty + +This driver provides support for N-Trig pen and multi-touch sensors. Single +and multi-touch events are translated to the appropriate protocols for +the hid and input systems. Pen events are sufficiently hid compliant and +are left to the hid core. The driver also provides additional filtering +and utility functions accessible with sysfs and module parameters. + +This driver has been reported to work properly with multiple N-Trig devices +attached. 
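+
+As a quick illustration before the parameter descriptions below, the global
+defaults can also be adjusted at run time through sysfs. A minimal sketch,
+assuming the module exposes activate_slack under
+/sys/module/hid_ntrig/parameters (the path is an assumption here)::
+
+    /* Sketch: raise the (assumed) global activate_slack parameter at
+     * run time. The sysfs path is illustrative; per-device attributes
+     * live under the individual device's sysfs directory instead. */
+    #include <stdio.h>
+
+    int main(void)
+    {
+            FILE *f = fopen(
+                "/sys/module/hid_ntrig/parameters/activate_slack", "w");
+
+            if (!f) {
+                    perror("activate_slack");
+                    return 1;
+            }
+            fprintf(f, "%u\n", 20U);  /* frames ignored before activating */
+            fclose(f);
+            return 0;
+    }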
+
+
+Parameters
+----------
+
+Note: values set at load time are global and will apply to all applicable
+devices. Adjusting parameters with sysfs will override the load time values,
+but only for that one device.
+
+The following parameters are used to configure filters to reduce noise:
+
++-----------------------+-----------------------------------------------------+
+|activate_slack         |number of fingers to ignore before processing events |
++-----------------------+-----------------------------------------------------+
+|activation_height,     |size threshold to activate immediately               |
+|activation_width       |                                                     |
++-----------------------+-----------------------------------------------------+
+|min_height,            |size threshold below which fingers are ignored,      |
+|min_width              |both to decide activation and during activity        |
++-----------------------+-----------------------------------------------------+
+|deactivate_slack       |the number of "no contact" frames to ignore before   |
+|                       |propagating the end of activity events               |
++-----------------------+-----------------------------------------------------+
+
+When the last finger is removed from the device, it sends a number of empty
+frames. By holding off on deactivation for a few frames we can tolerate
+spurious disconnects, where the sensor mistakenly fails to detect a finger
+that is still present. Thus deactivate_slack addresses problems where a user
+might see breaks in lines during drawing, or drop an object during a long
+drag.
+
+
+Additional sysfs items
+----------------------
+
+These nodes just provide easy access to the ranges reported by the device.
+
++-----------------------+-----------------------------------------------------+
+|sensor_logical_height, |the range for positions reported during activity     |
+|sensor_logical_width   |                                                     |
++-----------------------+-----------------------------------------------------+
+|sensor_physical_height,|internal ranges not used for normal events but       |
+|sensor_physical_width  |useful for tuning                                    |
++-----------------------+-----------------------------------------------------+
+
+All N-Trig devices with a product id of 1 report events in the ranges of
+
+* X: 0-9600
+* Y: 0-7200
+
+However, not all of these devices have the same physical dimensions. Most
+seem to be 12" sensors (Dell Latitude XT and XT2 and the HP TX2), and
+at least one model (Dell Studio 17) has a 17" sensor. The ratio of physical
+to logical sizes is used to adjust the size based filter parameters.
+
+
+Filtering
+---------
+
+With the release of the early multi-touch firmwares it became increasingly
+obvious that these sensors were prone to erroneous events. Users reported
+seeing both inappropriately dropped contacts and ghosts: contacts reported
+where no finger was actually touching the screen.
+
+Deactivation slack helps prevent dropped contacts for single touch use, but
+does not address the problem of dropping one or more contacts while other
+contacts are still active. Drops in the multi-touch context require
+additional processing and should be handled in tandem with tracking.
+
+As observed, ghost contacts are similar to actual use of the sensor, but they
+seem to have different profiles. Ghost activity typically shows up as small,
+short-lived touches. As such, I assume that the longer the continuous stream
+of events the more likely those events are from a real contact, and that the
+larger the size of each contact the more likely it is real.
Balancing the +goals of preventing ghosts and accepting real events quickly (to minimize +user observable latency), the filter accumulates confidence for incoming +events until it hits thresholds and begins propagating. In the interest in +minimizing stored state as well as the cost of operations to make a decision, +I've kept that decision simple. + +Time is measured in terms of the number of fingers reported, not frames since +the probability of multiple simultaneous ghosts is expected to drop off +dramatically with increasing numbers. Rather than accumulate weight as a +function of size, I just use it as a binary threshold. A sufficiently large +contact immediately overrides the waiting period and leads to activation. + +Setting the activation size thresholds to large values will result in deciding +primarily on activation slack. If you see longer lived ghosts, turning up the +activation slack while reducing the size thresholds may suffice to eliminate +the ghosts while keeping the screen quite responsive to firm taps. + +Contacts continue to be filtered with min_height and min_width even after +the initial activation filter is satisfied. The intent is to provide +a mechanism for filtering out ghosts in the form of an extra finger while +you actually are using the screen. In practice this sort of ghost has +been far less problematic or relatively rare and I've left the defaults +set to 0 for both parameters, effectively turning off that filter. + +I don't know what the optimal values are for these filters. If the defaults +don't work for you, please play with the parameters. If you do find other +values more comfortable, I would appreciate feedback. + +The calibration of these devices does drift over time. If ghosts or contact +dropping worsen and interfere with the normal usage of your device, try +recalibrating it. + + +Calibration +----------- + +The N-Trig windows tools provide calibration and testing routines. Also an +unofficial unsupported set of user space tools including a calibrator is +available at: +http://code.launchpad.net/~rafi-seas/+junk/ntrig_calib + + +Tracking +-------- + +As of yet, all tested N-Trig firmwares do not track fingers. When multiple +contacts are active they seem to be sorted primarily by Y position. diff --git a/Documentation/input/devices/rotary-encoder.rst b/Documentation/input/devices/rotary-encoder.rst new file mode 100644 index 0000000000000000000000000000000000000000..b07b20a295ac5c220a55f0ba6def574d6c9ea5af --- /dev/null +++ b/Documentation/input/devices/rotary-encoder.rst @@ -0,0 +1,131 @@ +============================================================ +rotary-encoder - a generic driver for GPIO connected devices +============================================================ + +:Author: Daniel Mack , Feb 2009 + +Function +-------- + +Rotary encoders are devices which are connected to the CPU or other +peripherals with two wires. The outputs are phase-shifted by 90 degrees +and by triggering on falling and rising edges, the turn direction can +be determined. + +Some encoders have both outputs low in stable states, others also have +a stable state with both outputs high (half-period mode) and some have +a stable state in all steps (quarter-period mode). 
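+
+For a full-period encoder, the turn direction can be recovered with a simple
+transition table over the two channel levels. The following sketch
+illustrates the principle only; the driver itself implements the edge-driven
+state machine described below::
+
+    /* Sketch: full-period quadrature decode. Index the previous and
+     * current 2-bit states (B << 1 | A) into a table of step deltas;
+     * illegal double transitions yield 0. Illustration only, not the
+     * rotary-encoder driver's code. */
+    #include <stdint.h>
+
+    static const int8_t qdec_delta[16] = {
+             0, +1, -1,  0,
+            -1,  0,  0, +1,
+            +1,  0,  0, -1,
+             0, -1, +1,  0,
+    };
+
+    static int qdec_step(unsigned int prev, unsigned int curr)
+    {
+            return qdec_delta[((prev & 3) << 2) | (curr & 3)];
+    }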
+ +The phase diagram of these two outputs look like this:: + + _____ _____ _____ + | | | | | | + Channel A ____| |_____| |_____| |____ + + : : : : : : : : : : : : + __ _____ _____ _____ + | | | | | | | + Channel B |_____| |_____| |_____| |__ + + : : : : : : : : : : : : + Event a b c d a b c d a b c d + + |<-------->| + one step + + |<-->| + one step (half-period mode) + + |<>| + one step (quarter-period mode) + +For more information, please see + https://en.wikipedia.org/wiki/Rotary_encoder + + +Events / state machine +---------------------- + +In half-period mode, state a) and c) above are used to determine the +rotational direction based on the last stable state. Events are reported in +states b) and d) given that the new stable state is different from the last +(i.e. the rotation was not reversed half-way). + +Otherwise, the following apply: + +a) Rising edge on channel A, channel B in low state + This state is used to recognize a clockwise turn + +b) Rising edge on channel B, channel A in high state + When entering this state, the encoder is put into 'armed' state, + meaning that there it has seen half the way of a one-step transition. + +c) Falling edge on channel A, channel B in high state + This state is used to recognize a counter-clockwise turn + +d) Falling edge on channel B, channel A in low state + Parking position. If the encoder enters this state, a full transition + should have happened, unless it flipped back on half the way. The + 'armed' state tells us about that. + +Platform requirements +--------------------- + +As there is no hardware dependent call in this driver, the platform it is +used with must support gpiolib. Another requirement is that IRQs must be +able to fire on both edges. + + +Board integration +----------------- + +To use this driver in your system, register a platform_device with the +name 'rotary-encoder' and associate the IRQs and some specific platform +data with it. Because the driver uses generic device properties, this can +be done either via device tree, ACPI, or using static board files, like in +example below: + +:: + + /* board support file example */ + + #include + #include + #include + + #define GPIO_ROTARY_A 1 + #define GPIO_ROTARY_B 2 + + static struct gpiod_lookup_table rotary_encoder_gpios = { + .dev_id = "rotary-encoder.0", + .table = { + GPIO_LOOKUP_IDX("gpio-0", + GPIO_ROTARY_A, NULL, 0, GPIO_ACTIVE_LOW), + GPIO_LOOKUP_IDX("gpio-0", + GPIO_ROTARY_B, NULL, 1, GPIO_ACTIVE_HIGH), + { }, + }, + }; + + static const struct property_entry rotary_encoder_properties[] __initconst = { + PROPERTY_ENTRY_INTEGER("rotary-encoder,steps-per-period", u32, 24), + PROPERTY_ENTRY_INTEGER("linux,axis", u32, ABS_X), + PROPERTY_ENTRY_INTEGER("rotary-encoder,relative_axis", u32, 0), + { }, + }; + + static struct platform_device rotary_encoder_device = { + .name = "rotary-encoder", + .id = 0, + }; + + ... + + gpiod_add_lookup_table(&rotary_encoder_gpios); + device_add_properties(&rotary_encoder_device, rotary_encoder_properties); + platform_device_register(&rotary_encoder_device); + + ... + +Please consult device tree binding documentation to see all properties +supported by the driver. diff --git a/Documentation/input/devices/sentelic.rst b/Documentation/input/devices/sentelic.rst new file mode 100644 index 0000000000000000000000000000000000000000..d7ad603dd77e26ad78ee0bff5d2261c97e3bcc42 --- /dev/null +++ b/Documentation/input/devices/sentelic.rst @@ -0,0 +1,901 @@ +.. 
include:: + +================= +Sentelic Touchpad +================= + + +:Copyright: |copy| 2002-2011 Sentelic Corporation. + +:Last update: Dec-07-2011 + +Finger Sensing Pad Intellimouse Mode (scrolling wheel, 4th and 5th buttons) +============================================================================ + +A) MSID 4: Scrolling wheel mode plus Forward page(4th button) and Backward + page (5th button) + +1. Set sample rate to 200; +2. Set sample rate to 200; +3. Set sample rate to 80; +4. Issuing the "Get device ID" command (0xF2) and waits for the response; +5. FSP will respond 0x04. + +:: + + Packet 1 + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|W|W|W|W| + |---------------| |---------------| |---------------| |---------------| + + Byte 1: Bit7 => Y overflow + Bit6 => X overflow + Bit5 => Y sign bit + Bit4 => X sign bit + Bit3 => 1 + Bit2 => Middle Button, 1 is pressed, 0 is not pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. + Byte 2: X Movement(9-bit 2's complement integers) + Byte 3: Y Movement(9-bit 2's complement integers) + Byte 4: Bit3~Bit0 => the scrolling wheel's movement since the last data report. + valid values, -8 ~ +7 + Bit4 => 1 = 4th mouse button is pressed, Forward one page. + 0 = 4th mouse button is not pressed. + Bit5 => 1 = 5th mouse button is pressed, Backward one page. + 0 = 5th mouse button is not pressed. + +B) MSID 6: Horizontal and Vertical scrolling + +- Set bit 1 in register 0x40 to 1 + +FSP replaces scrolling wheel's movement as 4 bits to show horizontal and +vertical scrolling. + +:: + + Packet 1 + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|r|l|u|d| + |---------------| |---------------| |---------------| |---------------| + + Byte 1: Bit7 => Y overflow + Bit6 => X overflow + Bit5 => Y sign bit + Bit4 => X sign bit + Bit3 => 1 + Bit2 => Middle Button, 1 is pressed, 0 is not pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. + Byte 2: X Movement(9-bit 2's complement integers) + Byte 3: Y Movement(9-bit 2's complement integers) + Byte 4: Bit0 => the Vertical scrolling movement downward. + Bit1 => the Vertical scrolling movement upward. + Bit2 => the Horizontal scrolling movement leftward. + Bit3 => the Horizontal scrolling movement rightward. + Bit4 => 1 = 4th mouse button is pressed, Forward one page. + 0 = 4th mouse button is not pressed. + Bit5 => 1 = 5th mouse button is pressed, Backward one page. + 0 = 5th mouse button is not pressed. + +C) MSID 7 + +FSP uses 2 packets (8 Bytes) to represent Absolute Position. +so we have PACKET NUMBER to identify packets. + + If PACKET NUMBER is 0, the packet is Packet 1. + If PACKET NUMBER is 1, the packet is Packet 2. + Please count this number in program. + +MSID6 special packet will be enable at the same time when enable MSID 7. + +Absolute position for STL3886-G0 +================================ + +1. Set bit 2 or 3 in register 0x40 to 1 +2. 
Set bit 6 in register 0x40 to 1 + +:: + + Packet 1 (ABSOLUTE POSITION) + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |0|1|V|1|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|d|u|X|X|Y|Y| + |---------------| |---------------| |---------------| |---------------| + + Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordination packet + => 10, Notify packet + Bit5 => valid bit + Bit4 => 1 + Bit3 => 1 + Bit2 => Middle Button, 1 is pressed, 0 is not pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. + Byte 2: X coordinate (xpos[9:2]) + Byte 3: Y coordinate (ypos[9:2]) + Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) + Bit3~Bit2 => X coordinate (ypos[1:0]) + Bit4 => scroll up + Bit5 => scroll down + Bit6 => scroll left + Bit7 => scroll right + + Notify Packet for G0 + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |1|0|0|1|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |M|M|M|M|M|M|M|M| 4 |0|0|0|0|0|0|0|0| + |---------------| |---------------| |---------------| |---------------| + + Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordination packet + => 10, Notify packet + Bit5 => 0 + Bit4 => 1 + Bit3 => 1 + Bit2 => Middle Button, 1 is pressed, 0 is not pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. + Byte 2: Message Type => 0x5A (Enable/Disable status packet) + Mode Type => 0xA5 (Normal/Icon mode status) + Byte 3: Message Type => 0x00 (Disabled) + => 0x01 (Enabled) + Mode Type => 0x00 (Normal) + => 0x01 (Icon) + Byte 4: Bit7~Bit0 => Don't Care + +Absolute position for STL3888-Ax +================================ + +:: + + Packet 1 (ABSOLUTE POSITION) + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |0|1|V|A|1|L|0|1| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |x|x|y|y|X|X|Y|Y| + |---------------| |---------------| |---------------| |---------------| + + Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordination packet + => 10, Notify packet + => 11, Normal data packet with on-pad click + Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up. + When both fingers are up, the last two reports have zero valid + bit. + Bit4 => arc + Bit3 => 1 + Bit2 => Left Button, 1 is pressed, 0 is released. + Bit1 => 0 + Bit0 => 1 + Byte 2: X coordinate (xpos[9:2]) + Byte 3: Y coordinate (ypos[9:2]) + Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) + Bit3~Bit2 => X coordinate (ypos[1:0]) + Bit5~Bit4 => y1_g + Bit7~Bit6 => x1_g + + Packet 2 (ABSOLUTE POSITION) + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |0|1|V|A|1|R|1|0| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |x|x|y|y|X|X|Y|Y| + |---------------| |---------------| |---------------| |---------------| + + Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordinates packet + => 10, Notify packet + => 11, Normal data packet with on-pad click + Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up. + When both fingers are up, the last two reports have zero valid + bit. 
+ Bit4 => arc + Bit3 => 1 + Bit2 => Right Button, 1 is pressed, 0 is released. + Bit1 => 1 + Bit0 => 0 + Byte 2: X coordinate (xpos[9:2]) + Byte 3: Y coordinate (ypos[9:2]) + Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) + Bit3~Bit2 => X coordinate (ypos[1:0]) + Bit5~Bit4 => y2_g + Bit7~Bit6 => x2_g + + Notify Packet for STL3888-Ax + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|d|u|0|0|0|0| + |---------------| |---------------| |---------------| |---------------| + + Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordinates packet + => 10, Notify packet + => 11, Normal data packet with on-pad click + Bit5 => 1 + Bit4 => when in absolute coordinates mode (valid when EN_PKT_GO is 1): + 0: left button is generated by the on-pad command + 1: left button is generated by the external button + Bit3 => 1 + Bit2 => Middle Button, 1 is pressed, 0 is not pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. + Byte 2: Message Type => 0xB7 (Multi Finger, Multi Coordinate mode) + Byte 3: Bit7~Bit6 => Don't care + Bit5~Bit4 => Number of fingers + Bit3~Bit1 => Reserved + Bit0 => 1: enter gesture mode; 0: leaving gesture mode + Byte 4: Bit7 => scroll right button + Bit6 => scroll left button + Bit5 => scroll down button + Bit4 => scroll up button + * Note that if gesture and additional button (Bit4~Bit7) + happen at the same time, the button information will not + be sent. + Bit3~Bit0 => Reserved + +Sample sequence of Multi-finger, Multi-coordinate mode: + + notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1, + abs pkt 2, ..., notify packet (valid bit == 0) + +Absolute position for STL3888-B0 +================================ + +:: + + Packet 1(ABSOLUTE POSITION) + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |0|1|V|F|1|0|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|u|d|X|X|Y|Y| + |---------------| |---------------| |---------------| |---------------| + + Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordinates packet + => 10, Notify packet + => 11, Normal data packet with on-pad click + Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up. + When both fingers are up, the last two reports have zero valid + bit. + Bit4 => finger up/down information. 1: finger down, 0: finger up. + Bit3 => 1 + Bit2 => finger index, 0 is the first finger, 1 is the second finger. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. 
+ Byte 2: X coordinate (xpos[9:2]) + Byte 3: Y coordinate (ypos[9:2]) + Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) + Bit3~Bit2 => X coordinate (ypos[1:0]) + Bit4 => scroll down button + Bit5 => scroll up button + Bit6 => scroll left button + Bit7 => scroll right button + + Packet 2 (ABSOLUTE POSITION) + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |0|1|V|F|1|1|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|u|d|X|X|Y|Y| + |---------------| |---------------| |---------------| |---------------| + + Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordination packet + => 10, Notify packet + => 11, Normal data packet with on-pad click + Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up. + When both fingers are up, the last two reports have zero valid + bit. + Bit4 => finger up/down information. 1: finger down, 0: finger up. + Bit3 => 1 + Bit2 => finger index, 0 is the first finger, 1 is the second finger. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. + Byte 2: X coordinate (xpos[9:2]) + Byte 3: Y coordinate (ypos[9:2]) + Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) + Bit3~Bit2 => X coordinate (ypos[1:0]) + Bit4 => scroll down button + Bit5 => scroll up button + Bit6 => scroll left button + Bit7 => scroll right button + +Notify Packet for STL3888-B0:: + + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|u|d|0|0|0|0| + |---------------| |---------------| |---------------| |---------------| + + Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordination packet + => 10, Notify packet + => 11, Normal data packet with on-pad click + Bit5 => 1 + Bit4 => when in absolute coordinates mode (valid when EN_PKT_GO is 1): + 0: left button is generated by the on-pad command + 1: left button is generated by the external button + Bit3 => 1 + Bit2 => Middle Button, 1 is pressed, 0 is not pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. + Byte 2: Message Type => 0xB7 (Multi Finger, Multi Coordinate mode) + Byte 3: Bit7~Bit6 => Don't care + Bit5~Bit4 => Number of fingers + Bit3~Bit1 => Reserved + Bit0 => 1: enter gesture mode; 0: leaving gesture mode + Byte 4: Bit7 => scroll right button + Bit6 => scroll left button + Bit5 => scroll up button + Bit4 => scroll down button + * Note that if gesture and additional button(Bit4~Bit7) + happen at the same time, the button information will not + be sent. 
+ Bit3~Bit0 => Reserved + +Sample sequence of Multi-finger, Multi-coordinate mode: + + notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1, + abs pkt 2, ..., notify packet (valid bit == 0) + +Absolute position for STL3888-Cx and STL3888-Dx +=============================================== + +:: + + Single Finger, Absolute Coordinate Mode (SFAC) + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |0|1|0|P|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|B|F|X|X|Y|Y| + |---------------| |---------------| |---------------| |---------------| + + Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordinates packet + => 10, Notify packet + Bit5 => Coordinate mode(always 0 in SFAC mode): + 0: single-finger absolute coordinates (SFAC) mode + 1: multi-finger, multiple coordinates (MFMC) mode + Bit4 => 0: The LEFT button is generated by on-pad command (OPC) + 1: The LEFT button is generated by external button + Default is 1 even if the LEFT button is not pressed. + Bit3 => Always 1, as specified by PS/2 protocol. + Bit2 => Middle Button, 1 is pressed, 0 is not pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. + Byte 2: X coordinate (xpos[9:2]) + Byte 3: Y coordinate (ypos[9:2]) + Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) + Bit3~Bit2 => X coordinate (ypos[1:0]) + Bit4 => 4th mouse button(forward one page) + Bit5 => 5th mouse button(backward one page) + Bit6 => scroll left button + Bit7 => scroll right button + + Multi Finger, Multiple Coordinates Mode (MFMC): + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |0|1|1|P|1|F|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|B|F|X|X|Y|Y| + |---------------| |---------------| |---------------| |---------------| + + Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordination packet + => 10, Notify packet + Bit5 => Coordinate mode (always 1 in MFMC mode): + 0: single-finger absolute coordinates (SFAC) mode + 1: multi-finger, multiple coordinates (MFMC) mode + Bit4 => 0: The LEFT button is generated by on-pad command (OPC) + 1: The LEFT button is generated by external button + Default is 1 even if the LEFT button is not pressed. + Bit3 => Always 1, as specified by PS/2 protocol. + Bit2 => Finger index, 0 is the first finger, 1 is the second finger. + If bit 1 and 0 are all 1 and bit 4 is 0, the middle external + button is pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. + Byte 2: X coordinate (xpos[9:2]) + Byte 3: Y coordinate (ypos[9:2]) + Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) + Bit3~Bit2 => X coordinate (ypos[1:0]) + Bit4 => 4th mouse button(forward one page) + Bit5 => 5th mouse button(backward one page) + Bit6 => scroll left button + Bit7 => scroll right button + +When one of the two fingers is up, the device will output four consecutive +MFMC#0 report packets with zero X and Y to represent 1st finger is up or +four consecutive MFMC#1 report packets with zero X and Y to represent that +the 2nd finger is up. On the other hand, if both fingers are up, the device +will output four consecutive single-finger, absolute coordinate(SFAC) packets +with zero X and Y. 
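+
+As a host-side illustration of the finger-up rule just described, here is a
+hedged sketch (not code from the driver; the helper name, the per-finger
+counters and the SFAC-to-index-0 simplification are assumptions based on the
+packet tables above):
+
+::
+
+    #include <stdbool.h>
+    #include <stdint.h>
+
+    #define FINGER_UP_REPEATS 4 /* four consecutive zero-coordinate packets */
+
+    static unsigned int zero_streak[2];	/* one counter per finger index */
+
+    /* Returns true once a finger-up event should be reported for *finger. */
+    static bool fsp_check_finger_up(const uint8_t pkt[4], int *finger)
+    {
+            bool mfmc = pkt[0] & 0x20;              /* byte 1 bit 5: MFMC */
+            int idx = mfmc ? !!(pkt[0] & 0x04) : 0; /* byte 1 bit 2: index */
+            /* zero X and Y: bytes 2 and 3 plus the low bits of byte 4 */
+            bool zero_xy = !pkt[1] && !pkt[2] && !(pkt[3] & 0x0f);
+
+            if (!zero_xy) {
+                    zero_streak[idx] = 0;
+                    return false;
+            }
+            if (++zero_streak[idx] < FINGER_UP_REPEATS)
+                    return false;
+            zero_streak[idx] = 0;
+            *finger = idx;
+            return true;
+    }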
+ +Notify Packet for STL3888-Cx/Dx:: + + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |1|0|0|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|u|d|0|0|0|0| + |---------------| |---------------| |---------------| |---------------| + + Byte 1: Bit7~Bit6 => 00, Normal data packet + => 01, Absolute coordinates packet + => 10, Notify packet + Bit5 => Always 0 + Bit4 => 0: The LEFT button is generated by on-pad command(OPC) + 1: The LEFT button is generated by external button + Default is 1 even if the LEFT button is not pressed. + Bit3 => 1 + Bit2 => Middle Button, 1 is pressed, 0 is not pressed. + Bit1 => Right Button, 1 is pressed, 0 is not pressed. + Bit0 => Left Button, 1 is pressed, 0 is not pressed. + Byte 2: Message type: + 0xba => gesture information + 0xc0 => one finger hold-rotating gesture + Byte 3: The first parameter for the received message: + 0xba => gesture ID (refer to the 'Gesture ID' section) + 0xc0 => region ID + Byte 4: The second parameter for the received message: + 0xba => N/A + 0xc0 => finger up/down information + +Sample sequence of Multi-finger, Multi-coordinates mode: + + notify packet (valid bit == 1), MFMC packet 1 (byte 1, bit 2 == 0), + MFMC packet 2 (byte 1, bit 2 == 1), MFMC packet 1, MFMC packet 2, + ..., notify packet (valid bit == 0) + + That is, when the device is in MFMC mode, the host will receive + interleaved absolute coordinate packets for each finger. + +FSP Enable/Disable packet +========================= + +:: + + Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 + BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| + 1 |Y|X|0|0|1|M|R|L| 2 |0|1|0|1|1|0|1|E| 3 | | | | | | | | | 4 | | | | | | | | | + |---------------| |---------------| |---------------| |---------------| + + FSP will send out enable/disable packet when FSP receive PS/2 enable/disable + command. Host will receive the packet which Middle, Right, Left button will + be set. The packet only use byte 0 and byte 1 as a pattern of original packet. + Ignore the other bytes of the packet. + + Byte 1: Bit7 => 0, Y overflow + Bit6 => 0, X overflow + Bit5 => 0, Y sign bit + Bit4 => 0, X sign bit + Bit3 => 1 + Bit2 => 1, Middle Button + Bit1 => 1, Right Button + Bit0 => 1, Left Button + Byte 2: Bit7~1 => (0101101b) + Bit0 => 1 = Enable + 0 = Disable + Byte 3: Don't care + Byte 4: Don't care (MOUSE ID 3, 4) + Byte 5~8: Don't care (Absolute packet) + +PS/2 Command Set +================ + +FSP supports basic PS/2 commanding set and modes, refer to following URL for +details about PS/2 commands: + +http://www.computer-engineering.org/ps2mouse/ + +Programming Sequence for Determining Packet Parsing Flow +======================================================== + +1. Identify FSP by reading device ID(0x00) and version(0x01) register + +2. For FSP version < STL3888 Cx, determine number of buttons by reading + the 'test mode status' (0x20) register:: + + buttons = reg[0x20] & 0x30 + + if buttons == 0x30 or buttons == 0x20: + # two/four buttons + Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse' + section A for packet parsing detail(ignore byte 4, bit ~ 7) + elif buttons == 0x10: + # 6 buttons + Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse' + section B for packet parsing detail + elif buttons == 0x00: + # 6 buttons + Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse' + section A for packet parsing detail + +3. 
For FSP version >= STL3888 Cx: + Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse' + section A for packet parsing detail (ignore byte 4, bit ~ 7) + +Programming Sequence for Register Reading/Writing +================================================= + +Register inversion requirement: + +Following values needed to be inverted(the '~' operator in C) before being +sent to FSP:: + + 0xe8, 0xe9, 0xee, 0xf2, 0xf3 and 0xff. + +Register swapping requirement: + +Following values needed to have their higher 4 bits and lower 4 bits being +swapped before being sent to FSP:: + + 10, 20, 40, 60, 80, 100 and 200. + +Register reading sequence: + + 1. send 0xf3 PS/2 command to FSP; + + 2. send 0x66 PS/2 command to FSP; + + 3. send 0x88 PS/2 command to FSP; + + 4. send 0xf3 PS/2 command to FSP; + + 5. if the register address being to read is not required to be + inverted(refer to the 'Register inversion requirement' section), + goto step 6 + + a. send 0x68 PS/2 command to FSP; + + b. send the inverted register address to FSP and goto step 8; + + 6. if the register address being to read is not required to be + swapped(refer to the 'Register swapping requirement' section), + goto step 7 + + a. send 0xcc PS/2 command to FSP; + + b. send the swapped register address to FSP and goto step 8; + + 7. send 0x66 PS/2 command to FSP; + + a. send the original register address to FSP and goto step 8; + + 8. send 0xe9(status request) PS/2 command to FSP; + + 9. the 4th byte of the response read from FSP should be the + requested register value(?? indicates don't care byte):: + + host: 0xe9 + 3888: 0xfa (??) (??) (val) + + * Note that since the Cx release, the hardware will return 1's + complement of the register value at the 3rd byte of status request + result:: + + host: 0xe9 + 3888: 0xfa (??) (~val) (val) + +Register writing sequence: + + 1. send 0xf3 PS/2 command to FSP; + + 2. if the register address being to write is not required to be + inverted(refer to the 'Register inversion requirement' section), + goto step 3 + + a. send 0x74 PS/2 command to FSP; + + b. send the inverted register address to FSP and goto step 5; + + 3. if the register address being to write is not required to be + swapped(refer to the 'Register swapping requirement' section), + goto step 4 + + a. send 0x77 PS/2 command to FSP; + + b. send the swapped register address to FSP and goto step 5; + + 4. send 0x55 PS/2 command to FSP; + + a. send the register address to FSP and goto step 5; + + 5. send 0xf3 PS/2 command to FSP; + + 6. if the register value being to write is not required to be + inverted(refer to the 'Register inversion requirement' section), + goto step 7 + + a. send 0x47 PS/2 command to FSP; + + b. send the inverted register value to FSP and goto step 9; + + 7. if the register value being to write is not required to be + swapped(refer to the 'Register swapping requirement' section), + goto step 8 + + a. send 0x44 PS/2 command to FSP; + + b. send the swapped register value to FSP and goto step 9; + + 8. send 0x33 PS/2 command to FSP; + + a. send the register value to FSP; + + 9. the register writing sequence is completed. + + * Since the Cx release, the hardware will return 1's + complement of the register value at the 3rd byte of status request + result. Host can optionally send another 0xe9 (status request) PS/2 + command to FSP at the end of register writing to verify that the + register writing operation is successful (?? indicates don't care + byte):: + + host: 0xe9 + 3888: 0xfa (??) 
(~val) (val) + +Programming Sequence for Page Register Reading/Writing +====================================================== + +In order to overcome the limitation of maximum number of registers +supported, the hardware separates register into different groups called +'pages.' Each page is able to include up to 255 registers. + +The default page after power up is 0x82; therefore, if one has to get +access to register 0x8301, one has to use following sequence to switch +to page 0x83, then start reading/writing from/to offset 0x01 by using +the register read/write sequence described in previous section. + +Page register reading sequence: + + 1. send 0xf3 PS/2 command to FSP; + + 2. send 0x66 PS/2 command to FSP; + + 3. send 0x88 PS/2 command to FSP; + + 4. send 0xf3 PS/2 command to FSP; + + 5. send 0x83 PS/2 command to FSP; + + 6. send 0x88 PS/2 command to FSP; + + 7. send 0xe9(status request) PS/2 command to FSP; + + 8. the response read from FSP should be the requested page value. + + +Page register writing sequence: + + 1. send 0xf3 PS/2 command to FSP; + + 2. send 0x38 PS/2 command to FSP; + + 3. send 0x88 PS/2 command to FSP; + + 4. send 0xf3 PS/2 command to FSP; + + 5. if the page address being written is not required to be + inverted(refer to the 'Register inversion requirement' section), + goto step 6 + + a. send 0x47 PS/2 command to FSP; + + b. send the inverted page address to FSP and goto step 9; + + 6. if the page address being written is not required to be + swapped(refer to the 'Register swapping requirement' section), + goto step 7 + + a. send 0x44 PS/2 command to FSP; + + b. send the swapped page address to FSP and goto step 9; + + 7. send 0x33 PS/2 command to FSP; + + 8. send the page address to FSP; + + 9. the page register writing sequence is completed. + +Gesture ID +========== + +Unlike other devices which sends multiple fingers' coordinates to host, +FSP processes multiple fingers' coordinates internally and convert them +into a 8 bits integer, namely 'Gesture ID.' Following is a list of +supported gesture IDs: + + ======= ================================== + ID Description + ======= ================================== + 0x86 2 finger straight up + 0x82 2 finger straight down + 0x80 2 finger straight right + 0x84 2 finger straight left + 0x8f 2 finger zoom in + 0x8b 2 finger zoom out + 0xc0 2 finger curve, counter clockwise + 0xc4 2 finger curve, clockwise + 0x2e 3 finger straight up + 0x2a 3 finger straight down + 0x28 3 finger straight right + 0x2c 3 finger straight left + 0x38 palm + ======= ================================== + +Register Listing +================ + +Registers are represented in 16 bits values. The higher 8 bits represent +the page address and the lower 8 bits represent the relative offset within +that particular page. 
Refer to the 'Programming Sequence for Page Register +Reading/Writing' section for instructions on how to change current page +address:: + + offset width default r/w name + 0x8200 bit7~bit0 0x01 RO device ID + + 0x8201 bit7~bit0 RW version ID + 0xc1: STL3888 Ax + 0xd0 ~ 0xd2: STL3888 Bx + 0xe0 ~ 0xe1: STL3888 Cx + 0xe2 ~ 0xe3: STL3888 Dx + + 0x8202 bit7~bit0 0x01 RO vendor ID + + 0x8203 bit7~bit0 0x01 RO product ID + + 0x8204 bit3~bit0 0x01 RW revision ID + + 0x820b test mode status 1 + bit3 1 RO 0: rotate 180 degree + 1: no rotation + *only supported by H/W prior to Cx + + 0x820f register file page control + bit2 0 RW 1: rotate 180 degree + 0: no rotation + *supported since Cx + + bit0 0 RW 1 to enable page 1 register files + *only supported by H/W prior to Cx + + 0x8210 RW system control 1 + bit0 1 RW Reserved, must be 1 + bit1 0 RW Reserved, must be 0 + bit4 0 RW Reserved, must be 0 + bit5 1 RW register clock gating enable + 0: read only, 1: read/write enable + (Note that following registers does not require clock gating being + enabled prior to write: 05 06 07 08 09 0c 0f 10 11 12 16 17 18 23 2e + 40 41 42 43. In addition to that, this bit must be 1 when gesture + mode is enabled) + + 0x8220 test mode status + bit5~bit4 RO number of buttons + 11 => 2, lbtn/rbtn + 10 => 4, lbtn/rbtn/scru/scrd + 01 => 6, lbtn/rbtn/scru/scrd/scrl/scrr + 00 => 6, lbtn/rbtn/scru/scrd/fbtn/bbtn + *only supported by H/W prior to Cx + + 0x8231 RW on-pad command detection + bit7 0 RW on-pad command left button down tag + enable + 0: disable, 1: enable + *only supported by H/W prior to Cx + + 0x8234 RW on-pad command control 5 + bit4~bit0 0x05 RW XLO in 0s/4/1, so 03h = 0010.1b = 2.5 + (Note that position unit is in 0.5 scanline) + *only supported by H/W prior to Cx + + bit7 0 RW on-pad tap zone enable + 0: disable, 1: enable + *only supported by H/W prior to Cx + + 0x8235 RW on-pad command control 6 + bit4~bit0 0x1d RW XHI in 0s/4/1, so 19h = 1100.1b = 12.5 + (Note that position unit is in 0.5 scanline) + *only supported by H/W prior to Cx + + 0x8236 RW on-pad command control 7 + bit4~bit0 0x04 RW YLO in 0s/4/1, so 03h = 0010.1b = 2.5 + (Note that position unit is in 0.5 scanline) + *only supported by H/W prior to Cx + + 0x8237 RW on-pad command control 8 + bit4~bit0 0x13 RW YHI in 0s/4/1, so 11h = 1000.1b = 8.5 + (Note that position unit is in 0.5 scanline) + *only supported by H/W prior to Cx + + 0x8240 RW system control 5 + bit1 0 RW FSP Intellimouse mode enable + 0: disable, 1: enable + *only supported by H/W prior to Cx + + bit2 0 RW movement + abs. coordinate mode enable + 0: disable, 1: enable + (Note that this function has the functionality of bit 1 even when + bit 1 is not set. However, the format is different from that of bit 1. + In addition, when bit 1 and bit 2 are set at the same time, bit 2 will + override bit 1.) + *only supported by H/W prior to Cx + + bit3 0 RW abs. coordinate only mode enable + 0: disable, 1: enable + (Note that this function has the functionality of bit 1 even when + bit 1 is not set. However, the format is different from that of bit 1. + In addition, when bit 1, bit 2 and bit 3 are set at the same time, + bit 3 will override bit 1 and 2.) + *only supported by H/W prior to Cx + + bit5 0 RW auto switch enable + 0: disable, 1: enable + *only supported by H/W prior to Cx + + bit6 0 RW G0 abs. + notify packet format enable + 0: disable, 1: enable + (Note that the absolute/relative coordinate output still depends on + bit 2 and 3. 
That is, if any of those bits is 1, the host will receive
+		absolute coordinates; otherwise, the host only receives
+		packets with relative coordinates.)
+		*only supported by H/W prior to Cx
+
+	bit7   0   RW  EN_PS2_F2: PS/2 gesture mode 2nd
+		       finger packet enable
+		       0: disable, 1: enable
+	*only supported by H/W prior to Cx
+
+  0x8243     RW      on-pad control
+	bit0   0   RW  on-pad control enable
+		       0: disable, 1: enable
+	(Note that if this bit is cleared, bit 3/5 will be ineffective)
+	*only supported by H/W prior to Cx
+
+	bit3   0   RW  on-pad fix vertical scrolling enable
+		       0: disable, 1: enable
+	*only supported by H/W prior to Cx
+
+	bit5   0   RW  on-pad fix horizontal scrolling enable
+		       0: disable, 1: enable
+	*only supported by H/W prior to Cx
+
+  0x8290     RW      software control register 1
+	bit0   0   RW  absolute coordination mode
+		       0: disable, 1: enable
+	*supported since Cx
+
+	bit1   0   RW  gesture ID output
+		       0: disable, 1: enable
+	*supported since Cx
+
+	bit2   0   RW  two fingers' coordinates output
+		       0: disable, 1: enable
+	*supported since Cx
+
+	bit3   0   RW  finger up one packet output
+		       0: disable, 1: enable
+	*supported since Cx
+
+	bit4   0   RW  absolute coordination continuous mode
+		       0: disable, 1: enable
+	*supported since Cx
+
+	bit6~bit5  00  RW  gesture group selection
+		       00: basic
+		       01: suite
+		       10: suite pro
+		       11: advanced
+	*supported since Cx
+
+	bit7   0   RW  Bx packet output compatible mode
+		       0: disable, 1: enable
+	*supported since Cx
+
+
+  0x833d     RW      on-pad command control 1
+	bit7   1   RW  on-pad command detection enable
+		       0: disable, 1: enable
+	*supported since Cx
+
+  0x833e     RW      on-pad command detection
+	bit7   0   RW  on-pad command left button down tag
+		       enable. Works only in H/W based PS/2
+		       data packet mode.
+		       0: disable, 1: enable
+		       *supported since Cx
diff --git a/Documentation/input/devices/walkera0701.rst b/Documentation/input/devices/walkera0701.rst
new file mode 100644
index 0000000000000000000000000000000000000000..2adda99ca7171f0c3e2633f4ca164ba0cc86b0ca
--- /dev/null
+++ b/Documentation/input/devices/walkera0701.rst
@@ -0,0 +1,128 @@
+===========================
+Walkera WK-0701 transmitter
+===========================
+
+The Walkera WK-0701 transmitter is supplied with ready-to-fly Walkera
+helicopters such as the HM36, HM37 and HM60. The walkera0701 module makes
+it possible to use this transmitter as a joystick.
+
+Devel homepage and download:
+http://zub.fei.tuke.sk/walkera-wk0701/
+
+or use cogito:
+cg-clone http://zub.fei.tuke.sk/GIT/walkera0701-joystick
+
+
+Connecting to PC
+================
+
+An S-video connector can be found at the back side of the transmitter.
+Modulation pulses from the processor to the HF part are available at pin 2
+of this connector; pin 3 is GND. A 5k6 resistor sits between pin 3 and the
+CPU. To get the modulation pulses to the PC, the signal pulses must be
+amplified.
+
+Cable: (walkera TX to parport)
+
+Walkera WK-0701 TX S-VIDEO connector::
+
+   (back side of TX)
+       __   __                 S-video:                canon25
+      /  |_|  \                pin 2 (signal)   NPN    parport
+     / O 4 3 O \               pin 3 (GND)      LED    ________________ 10 ACK
+    ( O 2   1 O )                                |                    C
+     \   ___   /               2 ________________________|\|_____|/
+      | [___] |                                  |/|   B           |\
+       -------                 3 __________________________________|________________ 25 GND
+                                                                   E
+
+I used a green LED and a BC109 NPN transistor.
+
+Software
+========
+
+Build the kernel with the walkera0701 module. The walkera0701 module needs
+exclusive access to the parport; modules like lp must be unloaded before
+loading the walkera0701 module (check dmesg for error messages). Connect
+the TX to the PC by the cable and run jstest /dev/input/js0 to see values
+from the TX.
If no value can be changed by the TX "joystick", check the output of
+/proc/interrupts: the interrupt count for the parport (usually irq7) must
+increase while the TX is on.
+
+
+
+Technical details
+=================
+
+The driver uses the interrupt from the parport ACK input to measure pulse
+lengths using hrtimers.
+
+Frame format:
+Based on the Walkera WK-0701 PCM format description by Shaul Eizikovich.
+(downloaded from http://www.smartpropoplus.com/Docs/Walkera_Wk-0701_PCM.pdf)
+
+Signal pulses
+-------------
+
+::
+
+       (ANALOG)
+       SYNC      BIN   OCT
+     +---------+       +------+
+     |         |       |      |
+   --+         +-------+      +---
+
+Frame
+-----
+
+::
+
+  SYNC , BIN1, OCT1, BIN2, OCT2 ... BIN24, OCT24, BIN25, next frame SYNC ..
+
+Pulse length
+------------
+
+::
+
+   Binary values:          Analog octal values:
+
+    288 uS Binary 0         318 uS       000
+    438 uS Binary 1         398 uS       001
+                            478 uS       010
+                            558 uS       011
+                            638 uS       100
+   1306 uS SYNC             718 uS       101
+                            798 uS       110
+                            878 uS       111
+
+24 bin+oct values + 1 bin value = 24*4+1 bits = 97 bits
+
+(Warning: pulses on ACK are inverted by the transistor; the irq is raised
+on a sync-to-bin change or an octal-value-to-bin change.)
+
+Binary data representations
+---------------------------
+
+One binary and one octal value can be grouped into a nibble. 24 nibbles plus
+one binary value can be sampled between sync pulses.
+
+Values for the first four channels (the analog joystick values) can be found
+in the first 10 nibbles. An analog value is represented by one sign bit and
+a 9-bit absolute binary value (10 bits per channel). The next nibble is a
+checksum for the first ten nibbles.
+
+Nibbles 12 .. 21 represent four further channels (not all channels can be
+directly controlled from the TX). The binary representations are the same
+as for the first four channels. Nibbles 22 and 23 hold a special magic
+number, and nibble 24 is the checksum for nibbles 12..23.
+
+After the last octal value for nibble 24 and the next sync pulse, one
+additional binary value can be sampled. This bit and the magic number are
+not used by the software driver. Some details about these magic numbers can
+be found in Walkera_Wk-0701_PCM.pdf.
+
+Checksum calculation
+--------------------
+
+The sum of the octal values in the checked nibbles must equal the octal
+value in the checksum nibble (only the first 3 bits are used). The binary
+value of the checksum nibble is calculated from the sum of the binary
+values in the checked nibbles plus the sum of the octal values in the
+checked nibbles divided by 8; only bit 0 of this sum is used.
diff --git a/Documentation/input/devices/xpad.rst b/Documentation/input/devices/xpad.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5a709ab77c8dbe42f8d7da8669cb6090a3f8323f
--- /dev/null
+++ b/Documentation/input/devices/xpad.rst
@@ -0,0 +1,233 @@
+=======================================================
+xpad - Linux USB driver for Xbox compatible controllers
+=======================================================
+
+This driver exposes all first-party and third-party Xbox compatible
+controllers. It has a long history and has enjoyed considerable usage
+as Windows' XInput library caused most PC games to focus on Xbox
+controller compatibility.
+
+Due to backwards compatibility all buttons are reported as digital.
+This only affects Original Xbox controllers. All later controller models
+have only digital face buttons.
+
+Rumble is supported on some models of Xbox 360 controllers but not on
+Original Xbox controllers or Xbox One controllers. As of this writing,
+the Xbox One's rumble protocol has not been reverse-engineered, but it
+could be supported in the future.
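+
+Where rumble is supported, it is driven through the generic Linux
+force-feedback interface rather than anything xpad-specific. The following
+is a hedged sketch of uploading and playing a rumble effect; the event
+device path is an assumption, so find the right node for your pad under
+/proc/bus/input/devices:
+
+::
+
+    #include <fcntl.h>
+    #include <string.h>
+    #include <unistd.h>
+    #include <sys/ioctl.h>
+    #include <linux/input.h>
+
+    int main(void)
+    {
+            int fd = open("/dev/input/event0", O_RDWR); /* assumed node */
+            struct ff_effect fx;
+            struct input_event play;
+
+            if (fd < 0)
+                    return 1;
+
+            memset(&fx, 0, sizeof(fx));
+            fx.type = FF_RUMBLE;
+            fx.id = -1;                             /* kernel assigns an id */
+            fx.u.rumble.strong_magnitude = 0xc000;  /* big (left) motor */
+            fx.u.rumble.weak_magnitude = 0x4000;    /* small (right) motor */
+            fx.replay.length = 1000;                /* ms */
+
+            if (ioctl(fd, EVIOCSFF, &fx) < 0)       /* upload the effect */
+                    return 1;
+
+            memset(&play, 0, sizeof(play));
+            play.type = EV_FF;
+            play.code = fx.id;
+            play.value = 1;                         /* start playback */
+            if (write(fd, &play, sizeof(play)) != sizeof(play))
+                    return 1;
+
+            sleep(2);
+            close(fd);
+            return 0;
+    }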
+
+
+Notes
+=====
+
+The number of buttons/axes reported varies based on 3 things:
+
+- if you are using a known controller
+- if you are using a known dance pad
+- if using an unknown device (one not listed below), what you set in the
+  module configuration for "Map D-PAD to buttons rather than axes for unknown
+  pads" (module option dpad_to_buttons)
+
+If you set dpad_to_buttons to N and you are using an unknown device,
+the driver will map the directional pad to axes (X/Y).
+If you set it to Y it will map the d-pad to buttons, which is needed for
+dance style games to function correctly. The default is Y.
+
+dpad_to_buttons has no effect for known pads. An erroneous commit message
+claimed dpad_to_buttons could be used to force behavior on known devices.
+This is not true. Both dpad_to_buttons and triggers_to_buttons only affect
+unknown controllers.
+
+
+Normal Controllers
+------------------
+
+With a normal controller, the directional pad is mapped to its own X/Y axes.
+The jstest program from joystick-1.2.15 (jstest version 2.1.0) will report 8
+axes and 10 buttons.
+
+All 8 axes work, though they all have the same range (-32768..32767)
+and the zero setting is not correct for the triggers (I don't know if that
+is some limitation of jstest, since the input device setup should be fine;
+I haven't had a look at jstest itself yet).
+
+All of the 10 buttons work (in digital mode). The six buttons on the
+right side (A, B, X, Y, black, white) are said to be "analog" and
+report their values as 8-bit unsigned; I am not sure what this is good for.
+
+I tested the controller with quake3, and configuration and
+in-game functionality were OK. However, I find it rather difficult to
+play first person shooters with a pad. Your mileage may vary.
+
+
+Xbox Dance Pads
+---------------
+
+When using a known dance pad, jstest will report 6 axes and 14 buttons.
+
+For dance style pads (like the redoctane pad) several changes
+have been made. The old driver would map the d-pad to axes, resulting
+in the driver being unable to report when the user was pressing both
+left+right or up+down, making DDR style games unplayable.
+
+Known dance pads automatically map the d-pad to buttons and will work
+correctly out of the box.
+
+If your dance pad is recognized by the driver but is using axes instead
+of buttons, see the section "Unknown Controllers" below.
+
+I've tested this with Stepmania, and it works quite well.
+
+
+Unknown Controllers
+-------------------
+
+If you have an unknown Xbox controller, it should work just fine with
+the default settings.
+
+HOWEVER if you have an unknown dance pad not listed below, it will not
+work UNLESS you set "dpad_to_buttons" to 1 in the module configuration.
+
+
+USB adapters
+============
+
+All generations of Xbox controllers speak USB over the wire.
+
+- Original Xbox controllers use a proprietary connector and require adapters.
+- Wireless Xbox 360 controllers require an 'Xbox 360 Wireless Gaming Receiver
+  for Windows'.
+- Wired Xbox 360 controllers use standard USB connectors.
+- Xbox One controllers can be wireless but speak Wi-Fi Direct and are not
+  yet supported.
+- Xbox One controllers can be wired and use standard Micro-USB connectors.
+
+
+
+Original Xbox USB adapters
+--------------------------
+
+Using this driver with an Original Xbox controller requires an
+adapter cable to break out the proprietary connector's pins to USB.
+You can buy these online fairly cheap, or build your own.
+
+Such a cable is pretty easy to build.
The Controller itself is a USB +compound device (a hub with three ports for two expansion slots and +the controller device) with the only difference in a nonstandard connector +(5 pins vs. 4 on standard USB 1.0 connectors). + +You just need to solder a USB connector onto the cable and keep the +yellow wire unconnected. The other pins have the same order on both +connectors so there is no magic to it. Detailed info on these matters +can be found on the net ([1]_, [2]_, [3]_). + +Thanks to the trip splitter found on the cable you don't even need to cut the +original one. You can buy an extension cable and cut that instead. That way, +you can still use the controller with your X-Box, if you have one ;) + + + +Driver Installation +=================== + +Once you have the adapter cable, if needed, and the controller connected +the xpad module should be auto loaded. To confirm you can cat +/sys/kernel/debug/usb/devices. There should be an entry like those: + +.. code-block:: none + :caption: dump from InterAct PowerPad Pro (Germany) + + T: Bus=01 Lev=03 Prnt=04 Port=00 Cnt=01 Dev#= 5 Spd=12 MxCh= 0 + D: Ver= 1.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=32 #Cfgs= 1 + P: Vendor=05fd ProdID=107a Rev= 1.00 + C:* #Ifs= 1 Cfg#= 1 Atr=80 MxPwr=100mA + I: If#= 0 Alt= 0 #EPs= 2 Cls=58(unk. ) Sub=42 Prot=00 Driver=(none) + E: Ad=81(I) Atr=03(Int.) MxPS= 32 Ivl= 10ms + E: Ad=02(O) Atr=03(Int.) MxPS= 32 Ivl= 10ms + +.. code-block:: none + :caption: dump from Redoctane Xbox Dance Pad (US) + + T: Bus=01 Lev=02 Prnt=09 Port=00 Cnt=01 Dev#= 10 Spd=12 MxCh= 0 + D: Ver= 1.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 8 #Cfgs= 1 + P: Vendor=0c12 ProdID=8809 Rev= 0.01 + S: Product=XBOX DDR + C:* #Ifs= 1 Cfg#= 1 Atr=80 MxPwr=100mA + I: If#= 0 Alt= 0 #EPs= 2 Cls=58(unk. ) Sub=42 Prot=00 Driver=xpad + E: Ad=82(I) Atr=03(Int.) MxPS= 32 Ivl=4ms + E: Ad=02(O) Atr=03(Int.) MxPS= 32 Ivl=4ms + + +Supported Controllers +===================== + +For a full list of supported controllers and associated vendor and product +IDs see the xpad_device[] array\ [4]_. + +As of the historic version 0.0.6 (2006-10-10) the following devices +were supported:: + + original Microsoft XBOX controller (US), vendor=0x045e, product=0x0202 + smaller Microsoft XBOX controller (US), vendor=0x045e, product=0x0289 + original Microsoft XBOX controller (Japan), vendor=0x045e, product=0x0285 + InterAct PowerPad Pro (Germany), vendor=0x05fd, product=0x107a + RedOctane Xbox Dance Pad (US), vendor=0x0c12, product=0x8809 + +Unrecognized models of Xbox controllers should function as Generic +Xbox controllers. Unrecognized Dance Pad controllers require setting +the module option 'dpad_to_buttons'. + +If you have an unrecognized controller please see 0.3 - Unknown Controllers + + +Manual Testing +============== + +To test this driver's functionality you may use 'jstest'. + +For example:: + + > modprobe xpad + > modprobe joydev + > jstest /dev/js0 + +If you're using a normal controller, there should be a single line showing +18 inputs (8 axes, 10 buttons), and its values should change if you move +the sticks and push the buttons. If you're using a dance pad, it should +show 20 inputs (6 axes, 14 buttons). + +It works? Voila, you're done ;) + + + +Thanks +====== + +I have to thank ITO Takayuki for the detailed info on his site + http://euc.jp/periphs/xbox-controller.ja.html. + +His useful info and both the usb-skeleton as well as the iforce input driver +(Greg Kroah-Hartmann; Vojtech Pavlik) helped a lot in rapid prototyping +the basic functionality. 
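+
+Supplementing the jstest run shown under Manual Testing above, events can
+also be read directly through the joystick API. This is a hedged sketch
+only; the device node is an assumption, so adjust it to match your system:
+
+::
+
+    #include <fcntl.h>
+    #include <stdio.h>
+    #include <unistd.h>
+    #include <linux/joystick.h>
+
+    int main(void)
+    {
+            struct js_event e;
+            int fd = open("/dev/input/js0", O_RDONLY); /* assumed node */
+
+            if (fd < 0)
+                    return 1;
+
+            while (read(fd, &e, sizeof(e)) == sizeof(e)) {
+                    if (e.type & JS_EVENT_INIT)
+                            continue; /* skip synthetic initial state */
+                    printf("%s %d -> %d\n",
+                           (e.type & JS_EVENT_BUTTON) ? "button" : "axis",
+                           e.number, e.value);
+            }
+            close(fd);
+            return 0;
+    }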
+ + + +References +========== + +.. [1] http://euc.jp/periphs/xbox-controller.ja.html (ITO Takayuki) +.. [2] http://xpad.xbox-scene.com/ +.. [3] http://www.markosweb.com/www/xboxhackz.com/ +.. [4] http://lxr.free-electrons.com/ident?i=xpad_device + + +Historic Edits +============== + +2002-07-16 - Marko Friedemann + - original doc + +2005-03-19 - Dominic Cerquetti + - added stuff for dance pads, new d-pad->axes mappings + +Later changes may be viewed with 'git log Documentation/input/xpad.txt' diff --git a/Documentation/input/devices/yealink.rst b/Documentation/input/devices/yealink.rst new file mode 100644 index 0000000000000000000000000000000000000000..bb5a1aafeca2a03a1fb353502b039d47bb64a1de --- /dev/null +++ b/Documentation/input/devices/yealink.rst @@ -0,0 +1,225 @@ +=============================================== +Driver documentation for yealink usb-p1k phones +=============================================== + +Status +====== + +The p1k is a relatively cheap usb 1.1 phone with: + + - keyboard full support, yealink.ko / input event API + - LCD full support, yealink.ko / sysfs API + - LED full support, yealink.ko / sysfs API + - dialtone full support, yealink.ko / sysfs API + - ringtone full support, yealink.ko / sysfs API + - audio playback full support, snd_usb_audio.ko / alsa API + - audio record full support, snd_usb_audio.ko / alsa API + +For vendor documentation see http://www.yealink.com + + +keyboard features +================= + +The current mapping in the kernel is provided by the map_p1k_to_key +function:: + + Physical USB-P1K button layout input events + + + up up + IN OUT left, right + down down + + pickup C hangup enter, backspace, escape + 1 2 3 1, 2, 3 + 4 5 6 4, 5, 6, + 7 8 9 7, 8, 9, + * 0 # *, 0, #, + +The "up" and "down" keys, are symbolised by arrows on the button. +The "pickup" and "hangup" keys are symbolised by a green and red phone +on the button. + + +LCD features +============ + +The LCD is divided and organised as a 3 line display:: + + |[] [][] [][] [][] in |[][] + |[] M [][] D [][] : [][] out |[][] + store + + NEW REP SU MO TU WE TH FR SA + + [] [] [] [] [] [] [] [] [] [] [] [] + [] [] [] [] [] [] [] [] [] [] [] [] + + + Line 1 Format (see below) : 18.e8.M8.88...188 + Icon names : M D : IN OUT STORE + Line 2 Format : ......... + Icon name : NEW REP SU MO TU WE TH FR SA + Line 3 Format : 888888888888 + + +Format description: + From a userspace perspective the world is separated into "digits" and "icons". + A digit can have a character set, an icon can only be ON or OFF. + + Format specifier:: + + '8' : Generic 7 segment digit with individual addressable segments + + Reduced capability 7 segment digit, when segments are hard wired together. + '1' : 2 segments digit only able to produce a 1. + 'e' : Most significant day of the month digit, + able to produce at least 1 2 3. + 'M' : Most significant minute digit, + able to produce at least 0 1 2 3 4 5. + + Icons or pictograms: + '.' : For example like AM, PM, SU, a 'dot' .. or other single segment + elements. + + +Driver usage +============ + +For userland the following interfaces are available using the sysfs interface:: + + /sys/.../ + line1 Read/Write, lcd line1 + line2 Read/Write, lcd line2 + line3 Read/Write, lcd line3 + + get_icons Read, returns a set of available icons. + hide_icon Write, hide the element by writing the icon name. + show_icon Write, display the element by writing the icon name. + + map_seg7 Read/Write, the 7 segments char set, common for all + yealink phones. 
(see map_to_7segment.h) + + ringtone Write, upload binary representation of a ringtone, + see yealink.c. status EXPERIMENTAL due to potential + races between async. and sync usb calls. + + +lineX +~~~~~ + +Reading /sys/../lineX will return the format string with its current value. + + Example:: + + cat ./line3 + 888888888888 + Linux Rocks! + +Writing to /sys/../lineX will set the corresponding LCD line. + + - Excess characters are ignored. + - If less characters are written than allowed, the remaining digits are + unchanged. + - The tab '\t'and '\n' char does not overwrite the original content. + - Writing a space to an icon will always hide its content. + + Example:: + + date +"%m.%e.%k:%M" | sed 's/^0/ /' > ./line1 + + Will update the LCD with the current date & time. + + +get_icons +~~~~~~~~~ + +Reading will return all available icon names and its current settings:: + + cat ./get_icons + on M + on D + on : + IN + OUT + STORE + NEW + REP + SU + MO + TU + WE + TH + FR + SA + LED + DIALTONE + RINGTONE + + +show/hide icons +~~~~~~~~~~~~~~~ + +Writing to these files will update the state of the icon. +Only one icon at a time can be updated. + +If an icon is also on a ./lineX the corresponding value is +updated with the first letter of the icon. + + Example - light up the store icon:: + + echo -n "STORE" > ./show_icon + + cat ./line1 + 18.e8.M8.88...188 + S + + Example - sound the ringtone for 10 seconds:: + + echo -n RINGTONE > /sys/..../show_icon + sleep 10 + echo -n RINGTONE > /sys/..../hide_icon + + +Sound features +============== + +Sound is supported by the ALSA driver: snd_usb_audio + +One 16-bit channel with sample and playback rates of 8000 Hz is the practical +limit of the device. + + Example - recording test:: + + arecord -v -d 10 -r 8000 -f S16_LE -t wav foobar.wav + + Example - playback test:: + + aplay foobar.wav + + +Troubleshooting +=============== + +:Q: Module yealink compiled and installed without any problem but phone + is not initialized and does not react to any actions. +:A: If you see something like: + hiddev0: USB HID v1.00 Device [Yealink Network Technology Ltd. VOIP USB Phone + in dmesg, it means that the hid driver has grabbed the device first. Try to + load module yealink before any other usb hid driver. Please see the + instructions provided by your distribution on module configuration. + +:Q: Phone is working now (displays version and accepts keypad input) but I can't + find the sysfs files. +:A: The sysfs files are located on the particular usb endpoint. On most + distributions you can do: "find /sys/ -name get_icons" for a hint. + + +Credits & Acknowledgments +========================= + + - Olivier Vandorpe, for starting the usbb2k-api project doing much of + the reverse engineering. + - Martin Diehl, for pointing out how to handle USB memory allocation. + - Dmitry Torokhov, for the numerous code reviews and suggestions. diff --git a/Documentation/input/elantech.txt b/Documentation/input/elantech.txt deleted file mode 100644 index 1ec0db7879d310dd3525eee439bc291a3e595c02..0000000000000000000000000000000000000000 --- a/Documentation/input/elantech.txt +++ /dev/null @@ -1,817 +0,0 @@ -Elantech Touchpad Driver -======================== - - Copyright (C) 2007-2008 Arjan Opmeer - - Extra information for hardware version 1 found and - provided by Steve Havelka - - Version 2 (EeePC) hardware support based on patches - received from Woody at Xandros and forwarded to me - by user StewieGriffin at the eeeuser.com forum - - -Contents -~~~~~~~~ - - 1. Introduction - 2. 
Extra knobs - 3. Differentiating hardware versions - 4. Hardware version 1 - 4.1 Registers - 4.2 Native relative mode 4 byte packet format - 4.3 Native absolute mode 4 byte packet format - 5. Hardware version 2 - 5.1 Registers - 5.2 Native absolute mode 6 byte packet format - 5.2.1 Parity checking and packet re-synchronization - 5.2.2 One/Three finger touch - 5.2.3 Two finger touch - 6. Hardware version 3 - 6.1 Registers - 6.2 Native absolute mode 6 byte packet format - 6.2.1 One/Three finger touch - 6.2.2 Two finger touch - 7. Hardware version 4 - 7.1 Registers - 7.2 Native absolute mode 6 byte packet format - 7.2.1 Status packet - 7.2.2 Head packet - 7.2.3 Motion packet - 8. Trackpoint (for Hardware version 3 and 4) - 8.1 Registers - 8.2 Native relative mode 6 byte packet format - 8.2.1 Status Packet - - - -1. Introduction - ~~~~~~~~~~~~ - -Currently the Linux Elantech touchpad driver is aware of four different -hardware versions unimaginatively called version 1,version 2, version 3 -and version 4. Version 1 is found in "older" laptops and uses 4 bytes per -packet. Version 2 seems to be introduced with the EeePC and uses 6 bytes -per packet, and provides additional features such as position of two fingers, -and width of the touch. Hardware version 3 uses 6 bytes per packet (and -for 2 fingers the concatenation of two 6 bytes packets) and allows tracking -of up to 3 fingers. Hardware version 4 uses 6 bytes per packet, and can -combine a status packet with multiple head or motion packets. Hardware version -4 allows tracking up to 5 fingers. - -Some Hardware version 3 and version 4 also have a trackpoint which uses a -separate packet format. It is also 6 bytes per packet. - -The driver tries to support both hardware versions and should be compatible -with the Xorg Synaptics touchpad driver and its graphical configuration -utilities. - -Note that a mouse button is also associated with either the touchpad or the -trackpoint when a trackpoint is available. Disabling the Touchpad in xorg -(TouchPadOff=0) will also disable the buttons associated with the touchpad. - -Additionally the operation of the touchpad can be altered by adjusting the -contents of some of its internal registers. These registers are represented -by the driver as sysfs entries under /sys/bus/serio/drivers/psmouse/serio? -that can be read from and written to. - -Currently only the registers for hardware version 1 are somewhat understood. -Hardware version 2 seems to use some of the same registers but it is not -known whether the bits in the registers represent the same thing or might -have changed their meaning. - -On top of that, some register settings have effect only when the touchpad is -in relative mode and not in absolute mode. As the Linux Elantech touchpad -driver always puts the hardware into absolute mode not all information -mentioned below can be used immediately. But because there is no freely -available Elantech documentation the information is provided here anyway for -completeness sake. - - -///////////////////////////////////////////////////////////////////////////// - - -2. Extra knobs - ~~~~~~~~~~~ - -Currently the Linux Elantech touchpad driver provides three extra knobs under -/sys/bus/serio/drivers/psmouse/serio? for the user. - -* debug - - Turn different levels of debugging ON or OFF. - - By echoing "0" to this file all debugging will be turned OFF. - - Currently a value of "1" will turn on some basic debugging and a value of - "2" will turn on packet debugging. 
For hardware version 1 the default is - OFF. For version 2 the default is "1". - - Turning packet debugging on will make the driver dump every packet - received to the syslog before processing it. Be warned that this can - generate quite a lot of data! - -* paritycheck - - Turns parity checking ON or OFF. - - By echoing "0" to this file parity checking will be turned OFF. Any - non-zero value will turn it ON. For hardware version 1 the default is ON. - For version 2 the default it is OFF. - - Hardware version 1 provides basic data integrity verification by - calculating a parity bit for the last 3 bytes of each packet. The driver - can check these bits and reject any packet that appears corrupted. Using - this knob you can bypass that check. - - Hardware version 2 does not provide the same parity bits. Only some basic - data consistency checking can be done. For now checking is disabled by - default. Currently even turning it on will do nothing. - -* crc_enabled - - Sets crc_enabled to 0/1. The name "crc_enabled" is the official name of - this integrity check, even though it is not an actual cyclic redundancy - check. - - Depending on the state of crc_enabled, certain basic data integrity - verification is done by the driver on hardware version 3 and 4. The - driver will reject any packet that appears corrupted. Using this knob, - The state of crc_enabled can be altered with this knob. - - Reading the crc_enabled value will show the active value. Echoing - "0" or "1" to this file will set the state to "0" or "1". - -///////////////////////////////////////////////////////////////////////////// - -3. Differentiating hardware versions - ================================= - -To detect the hardware version, read the version number as param[0].param[1].param[2] - - 4 bytes version: (after the arrow is the name given in the Dell-provided driver) - 02.00.22 => EF013 - 02.06.00 => EF019 -In the wild, there appear to be more versions, such as 00.01.64, 01.00.21, -02.00.00, 02.00.04, 02.00.06. - - 6 bytes: - 02.00.30 => EF113 - 02.08.00 => EF023 - 02.08.XX => EF123 - 02.0B.00 => EF215 - 04.01.XX => Scroll_EF051 - 04.02.XX => EF051 -In the wild, there appear to be more versions, such as 04.03.01, 04.04.11. There -appears to be almost no difference, except for EF113, which does not report -pressure/width and has different data consistency checks. - -Probably all the versions with param[0] <= 01 can be considered as -4 bytes/firmware 1. The versions < 02.08.00, with the exception of 02.00.30, as -4 bytes/firmware 2. Everything >= 02.08.00 can be considered as 6 bytes. - -///////////////////////////////////////////////////////////////////////////// - -4. Hardware version 1 - ================== - -4.1 Registers - ~~~~~~~~~ - -By echoing a hexadecimal value to a register it contents can be altered. - -For example: - - echo -n 0x16 > reg_10 - -* reg_10 - - bit 7 6 5 4 3 2 1 0 - B C T D L A S E - - E: 1 = enable smart edges unconditionally - S: 1 = enable smart edges only when dragging - A: 1 = absolute mode (needs 4 byte packets, see reg_11) - L: 1 = enable drag lock (see reg_22) - D: 1 = disable dynamic resolution - T: 1 = disable tapping - C: 1 = enable corner tap - B: 1 = swap left and right button - -* reg_11 - - bit 7 6 5 4 3 2 1 0 - 1 0 0 H V 1 F P - - P: 1 = enable parity checking for relative mode - F: 1 = enable native 4 byte packet mode - V: 1 = enable vertical scroll area - H: 1 = enable horizontal scroll area - -* reg_20 - - single finger width? - -* reg_21 - - scroll area width (small: 0x40 ... 
wide: 0xff) - -* reg_22 - - drag lock time out (short: 0x14 ... long: 0xfe; - 0xff = tap again to release) - -* reg_23 - - tap make timeout? - -* reg_24 - - tap release timeout? - -* reg_25 - - smart edge cursor speed (0x02 = slow, 0x03 = medium, 0x04 = fast) - -* reg_26 - - smart edge activation area width? - - -4.2 Native relative mode 4 byte packet format - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -byte 0: - bit 7 6 5 4 3 2 1 0 - c c p2 p1 1 M R L - - L, R, M = 1 when Left, Right, Middle mouse button pressed - some models have M as byte 3 odd parity bit - when parity checking is enabled (reg_11, P = 1): - p1..p2 = byte 1 and 2 odd parity bit - c = 1 when corner tap detected - -byte 1: - bit 7 6 5 4 3 2 1 0 - dx7 dx6 dx5 dx4 dx3 dx2 dx1 dx0 - - dx7..dx0 = x movement; positive = right, negative = left - byte 1 = 0xf0 when corner tap detected - -byte 2: - bit 7 6 5 4 3 2 1 0 - dy7 dy6 dy5 dy4 dy3 dy2 dy1 dy0 - - dy7..dy0 = y movement; positive = up, negative = down - -byte 3: - parity checking enabled (reg_11, P = 1): - - bit 7 6 5 4 3 2 1 0 - w h n1 n0 ds3 ds2 ds1 ds0 - - normally: - ds3..ds0 = scroll wheel amount and direction - positive = down or left - negative = up or right - when corner tap detected: - ds0 = 1 when top right corner tapped - ds1 = 1 when bottom right corner tapped - ds2 = 1 when bottom left corner tapped - ds3 = 1 when top left corner tapped - n1..n0 = number of fingers on touchpad - only models with firmware 2.x report this, models with - firmware 1.x seem to map one, two and three finger taps - directly to L, M and R mouse buttons - h = 1 when horizontal scroll action - w = 1 when wide finger touch? - - otherwise (reg_11, P = 0): - - bit 7 6 5 4 3 2 1 0 - ds7 ds6 ds5 ds4 ds3 ds2 ds1 ds0 - - ds7..ds0 = vertical scroll amount and direction - negative = up - positive = down - - -4.3 Native absolute mode 4 byte packet format - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -EF013 and EF019 have a special behaviour (due to a bug in the firmware?), and -when 1 finger is touching, the first 2 position reports must be discarded. -This counting is reset whenever a different number of fingers is reported. - -byte 0: - firmware version 1.x: - - bit 7 6 5 4 3 2 1 0 - D U p1 p2 1 p3 R L - - L, R = 1 when Left, Right mouse button pressed - p1..p3 = byte 1..3 odd parity bit - D, U = 1 when rocker switch pressed Up, Down - - firmware version 2.x: - - bit 7 6 5 4 3 2 1 0 - n1 n0 p2 p1 1 p3 R L - - L, R = 1 when Left, Right mouse button pressed - p1..p3 = byte 1..3 odd parity bit - n1..n0 = number of fingers on touchpad - -byte 1: - firmware version 1.x: - - bit 7 6 5 4 3 2 1 0 - f 0 th tw x9 x8 y9 y8 - - tw = 1 when two finger touch - th = 1 when three finger touch - f = 1 when finger touch - - firmware version 2.x: - - bit 7 6 5 4 3 2 1 0 - . . . . x9 x8 y9 y8 - -byte 2: - bit 7 6 5 4 3 2 1 0 - x7 x6 x5 x4 x3 x2 x1 x0 - - x9..x0 = absolute x value (horizontal) - -byte 3: - bit 7 6 5 4 3 2 1 0 - y7 y6 y5 y4 y3 y2 y1 y0 - - y9..y0 = absolute y value (vertical) - - -///////////////////////////////////////////////////////////////////////////// - - -5. Hardware version 2 - ================== - - -5.1 Registers - ~~~~~~~~~ - -By echoing a hexadecimal value to a register it contents can be altered. 
- -For example: - - echo -n 0x56 > reg_10 - -* reg_10 - - bit 7 6 5 4 3 2 1 0 - 0 1 0 1 0 1 D 0 - - D: 1 = enable drag and drop - -* reg_11 - - bit 7 6 5 4 3 2 1 0 - 1 0 0 0 S 0 1 0 - - S: 1 = enable vertical scroll - -* reg_21 - - unknown (0x00) - -* reg_22 - - drag and drop release time out (short: 0x70 ... long 0x7e; - 0x7f = never i.e. tap again to release) - - -5.2 Native absolute mode 6 byte packet format - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -5.2.1 Parity checking and packet re-synchronization -There is no parity checking, however some consistency checks can be performed. - -For instance for EF113: - SA1= packet[0]; - A1 = packet[1]; - B1 = packet[2]; - SB1= packet[3]; - C1 = packet[4]; - D1 = packet[5]; - if( (((SA1 & 0x3C) != 0x3C) && ((SA1 & 0xC0) != 0x80)) || // check Byte 1 - (((SA1 & 0x0C) != 0x0C) && ((SA1 & 0xC0) == 0x80)) || // check Byte 1 (one finger pressed) - (((SA1 & 0xC0) != 0x80) && (( A1 & 0xF0) != 0x00)) || // check Byte 2 - (((SB1 & 0x3E) != 0x38) && ((SA1 & 0xC0) != 0x80)) || // check Byte 4 - (((SB1 & 0x0E) != 0x08) && ((SA1 & 0xC0) == 0x80)) || // check Byte 4 (one finger pressed) - (((SA1 & 0xC0) != 0x80) && (( C1 & 0xF0) != 0x00)) ) // check Byte 5 - // error detected - -For all the other ones, there are just a few constant bits: - if( ((packet[0] & 0x0C) != 0x04) || - ((packet[3] & 0x0f) != 0x02) ) - // error detected - - -In case an error is detected, all the packets are shifted by one (and packet[0] is discarded). - -5.2.2 One/Three finger touch - ~~~~~~~~~~~~~~~~ - -byte 0: - - bit 7 6 5 4 3 2 1 0 - n1 n0 w3 w2 . . R L - - L, R = 1 when Left, Right mouse button pressed - n1..n0 = number of fingers on touchpad - -byte 1: - - bit 7 6 5 4 3 2 1 0 - p7 p6 p5 p4 x11 x10 x9 x8 - -byte 2: - - bit 7 6 5 4 3 2 1 0 - x7 x6 x5 x4 x3 x2 x1 x0 - - x11..x0 = absolute x value (horizontal) - -byte 3: - - bit 7 6 5 4 3 2 1 0 - n4 vf w1 w0 . . . b2 - - n4 = set if more than 3 fingers (only in 3 fingers mode) - vf = a kind of flag ? (only on EF123, 0 when finger is over one - of the buttons, 1 otherwise) - w3..w0 = width of the finger touch (not EF113) - b2 (on EF113 only, 0 otherwise), b2.R.L indicates one button pressed: - 0 = none - 1 = Left - 2 = Right - 3 = Middle (Left and Right) - 4 = Forward - 5 = Back - 6 = Another one - 7 = Another one - -byte 4: - - bit 7 6 5 4 3 2 1 0 - p3 p1 p2 p0 y11 y10 y9 y8 - - p7..p0 = pressure (not EF113) - -byte 5: - - bit 7 6 5 4 3 2 1 0 - y7 y6 y5 y4 y3 y2 y1 y0 - - y11..y0 = absolute y value (vertical) - - -5.2.3 Two finger touch - ~~~~~~~~~~~~~~~~ - -Note that the two pairs of coordinates are not exactly the coordinates of the -two fingers, but only the pair of the lower-left and upper-right coordinates. -So the actual fingers might be situated on the other diagonal of the square -defined by these two points. - -byte 0: - - bit 7 6 5 4 3 2 1 0 - n1 n0 ay8 ax8 . . R L - - L, R = 1 when Left, Right mouse button pressed - n1..n0 = number of fingers on touchpad - -byte 1: - - bit 7 6 5 4 3 2 1 0 - ax7 ax6 ax5 ax4 ax3 ax2 ax1 ax0 - - ax8..ax0 = lower-left finger absolute x value - -byte 2: - - bit 7 6 5 4 3 2 1 0 - ay7 ay6 ay5 ay4 ay3 ay2 ay1 ay0 - - ay8..ay0 = lower-left finger absolute y value - -byte 3: - - bit 7 6 5 4 3 2 1 0 - . . by8 bx8 . . . . 
- -byte 4: - - bit 7 6 5 4 3 2 1 0 - bx7 bx6 bx5 bx4 bx3 bx2 bx1 bx0 - - bx8..bx0 = upper-right finger absolute x value - -byte 5: - - bit 7 6 5 4 3 2 1 0 - by7 by8 by5 by4 by3 by2 by1 by0 - - by8..by0 = upper-right finger absolute y value - -///////////////////////////////////////////////////////////////////////////// - -6. Hardware version 3 - ================== - -6.1 Registers - ~~~~~~~~~ -* reg_10 - - bit 7 6 5 4 3 2 1 0 - 0 0 0 0 R F T A - - A: 1 = enable absolute tracking - T: 1 = enable two finger mode auto correct - F: 1 = disable ABS Position Filter - R: 1 = enable real hardware resolution - -6.2 Native absolute mode 6 byte packet format - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -1 and 3 finger touch shares the same 6-byte packet format, except that -3 finger touch only reports the position of the center of all three fingers. - -Firmware would send 12 bytes of data for 2 finger touch. - -Note on debounce: -In case the box has unstable power supply or other electricity issues, or -when number of finger changes, F/W would send "debounce packet" to inform -driver that the hardware is in debounce status. -The debouce packet has the following signature: - byte 0: 0xc4 - byte 1: 0xff - byte 2: 0xff - byte 3: 0x02 - byte 4: 0xff - byte 5: 0xff -When we encounter this kind of packet, we just ignore it. - -6.2.1 One/Three finger touch - ~~~~~~~~~~~~~~~~~~~~~~ - -byte 0: - - bit 7 6 5 4 3 2 1 0 - n1 n0 w3 w2 0 1 R L - - L, R = 1 when Left, Right mouse button pressed - n1..n0 = number of fingers on touchpad - -byte 1: - - bit 7 6 5 4 3 2 1 0 - p7 p6 p5 p4 x11 x10 x9 x8 - -byte 2: - - bit 7 6 5 4 3 2 1 0 - x7 x6 x5 x4 x3 x2 x1 x0 - - x11..x0 = absolute x value (horizontal) - -byte 3: - - bit 7 6 5 4 3 2 1 0 - 0 0 w1 w0 0 0 1 0 - - w3..w0 = width of the finger touch - -byte 4: - - bit 7 6 5 4 3 2 1 0 - p3 p1 p2 p0 y11 y10 y9 y8 - - p7..p0 = pressure - -byte 5: - - bit 7 6 5 4 3 2 1 0 - y7 y6 y5 y4 y3 y2 y1 y0 - - y11..y0 = absolute y value (vertical) - -6.2.2 Two finger touch - ~~~~~~~~~~~~~~~~ - -The packet format is exactly the same for two finger touch, except the hardware -sends two 6 byte packets. The first packet contains data for the first finger, -the second packet has data for the second finger. So for two finger touch a -total of 12 bytes are sent. - -///////////////////////////////////////////////////////////////////////////// - -7. Hardware version 4 - ================== - -7.1 Registers - ~~~~~~~~~ -* reg_07 - - bit 7 6 5 4 3 2 1 0 - 0 0 0 0 0 0 0 A - - A: 1 = enable absolute tracking - -7.2 Native absolute mode 6 byte packet format - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -v4 hardware is a true multitouch touchpad, capable of tracking up to 5 fingers. -Unfortunately, due to PS/2's limited bandwidth, its packet format is rather -complex. - -Whenever the numbers or identities of the fingers changes, the hardware sends a -status packet to indicate how many and which fingers is on touchpad, followed by -head packets or motion packets. A head packet contains data of finger id, finger -position (absolute x, y values), width, and pressure. A motion packet contains -two fingers' position delta. - -For example, when status packet tells there are 2 fingers on touchpad, then we -can expect two following head packets. If the finger status doesn't change, -the following packets would be motion packets, only sending delta of finger -position, until we receive a status packet. - -One exception is one finger touch. 
when a status packet tells us there is only -one finger, the hardware would just send head packets afterwards. - -7.2.1 Status packet - ~~~~~~~~~~~~~ - -byte 0: - - bit 7 6 5 4 3 2 1 0 - . . . . 0 1 R L - - L, R = 1 when Left, Right mouse button pressed - -byte 1: - - bit 7 6 5 4 3 2 1 0 - . . . ft4 ft3 ft2 ft1 ft0 - - ft4 ft3 ft2 ft1 ft0 ftn = 1 when finger n is on touchpad - -byte 2: not used - -byte 3: - - bit 7 6 5 4 3 2 1 0 - . . . 1 0 0 0 0 - - constant bits - -byte 4: - - bit 7 6 5 4 3 2 1 0 - p . . . . . . . - - p = 1 for palm - -byte 5: not used - -7.2.2 Head packet - ~~~~~~~~~~~ - -byte 0: - - bit 7 6 5 4 3 2 1 0 - w3 w2 w1 w0 0 1 R L - - L, R = 1 when Left, Right mouse button pressed - w3..w0 = finger width (spans how many trace lines) - -byte 1: - - bit 7 6 5 4 3 2 1 0 - p7 p6 p5 p4 x11 x10 x9 x8 - -byte 2: - - bit 7 6 5 4 3 2 1 0 - x7 x6 x5 x4 x3 x2 x1 x0 - - x11..x0 = absolute x value (horizontal) - -byte 3: - - bit 7 6 5 4 3 2 1 0 - id2 id1 id0 1 0 0 0 1 - - id2..id0 = finger id - -byte 4: - - bit 7 6 5 4 3 2 1 0 - p3 p1 p2 p0 y11 y10 y9 y8 - - p7..p0 = pressure - -byte 5: - - bit 7 6 5 4 3 2 1 0 - y7 y6 y5 y4 y3 y2 y1 y0 - - y11..y0 = absolute y value (vertical) - -7.2.3 Motion packet - ~~~~~~~~~~~~~ - -byte 0: - - bit 7 6 5 4 3 2 1 0 - id2 id1 id0 w 0 1 R L - - L, R = 1 when Left, Right mouse button pressed - id2..id0 = finger id - w = 1 when delta overflows (> 127 or < -128), in this case - firmware sends us (delta x / 5) and (delta y / 5) - -byte 1: - - bit 7 6 5 4 3 2 1 0 - x7 x6 x5 x4 x3 x2 x1 x0 - - x7..x0 = delta x (two's complement) - -byte 2: - - bit 7 6 5 4 3 2 1 0 - y7 y6 y5 y4 y3 y2 y1 y0 - - y7..y0 = delta y (two's complement) - -byte 3: - - bit 7 6 5 4 3 2 1 0 - id2 id1 id0 1 0 0 1 0 - - id2..id0 = finger id - -byte 4: - - bit 7 6 5 4 3 2 1 0 - x7 x6 x5 x4 x3 x2 x1 x0 - - x7..x0 = delta x (two's complement) - -byte 5: - - bit 7 6 5 4 3 2 1 0 - y7 y6 y5 y4 y3 y2 y1 y0 - - y7..y0 = delta y (two's complement) - - byte 0 ~ 2 for one finger - byte 3 ~ 5 for another - - -8. Trackpoint (for Hardware version 3 and 4) - ========================================= -8.1 Registers - ~~~~~~~~~ -No special registers have been identified. - -8.2 Native relative mode 6 byte packet format - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -8.2.1 Status Packet - ~~~~~~~~~~~~~ - -byte 0: - bit 7 6 5 4 3 2 1 0 - 0 0 sx sy 0 M R L -byte 1: - bit 7 6 5 4 3 2 1 0 - ~sx 0 0 0 0 0 0 0 -byte 2: - bit 7 6 5 4 3 2 1 0 - ~sy 0 0 0 0 0 0 0 -byte 3: - bit 7 6 5 4 3 2 1 0 - 0 0 ~sy ~sx 0 1 1 0 -byte 4: - bit 7 6 5 4 3 2 1 0 - x7 x6 x5 x4 x3 x2 x1 x0 -byte 5: - bit 7 6 5 4 3 2 1 0 - y7 y6 y5 y4 y3 y2 y1 y0 - - - x and y are written in two's complement spread - over 9 bits with sx/sy the relative top bit and - x7..x0 and y7..y0 the lower bits. - ~sx is the inverse of sx, ~sy is the inverse of sy. - The sign of y is opposite to what the input driver - expects for a relative movement diff --git a/Documentation/input/event-codes.rst b/Documentation/input/event-codes.rst new file mode 100644 index 0000000000000000000000000000000000000000..a8c0873beb952e620db9bb2f2df823624ac90650 --- /dev/null +++ b/Documentation/input/event-codes.rst @@ -0,0 +1,409 @@ +.. _input-event-codes: + +================= +Input event codes +================= + + +The input protocol uses a map of types and codes to express input device values +to userspace. This document describes the types and codes and how and when they +may be used. + +A single hardware event generates multiple input events. 
Each input event
+contains the new value of a single data item. A special event type, EV_SYN, is
+used to separate input events into packets of input data changes occurring at
+the same moment in time. In the following, the term "event" refers to a single
+input event encompassing a type, code, and value.
+
+The input protocol is a stateful protocol. Events are emitted only when values
+of event codes have changed. However, the state is maintained within the Linux
+input subsystem; drivers do not need to maintain the state and may attempt to
+emit unchanged values without harm. Userspace may obtain the current state of
+event code values using the EVIOCG* ioctls defined in linux/input.h. The event
+reports supported by a device are also provided by sysfs in
+class/input/event*/device/capabilities/, and the properties of a device are
+provided in class/input/event*/device/properties.
+
+Event types
+===========
+
+Event types are groupings of codes under a logical input construct. Each
+type has a set of applicable codes to be used in generating events. See the
+Codes section for details on valid codes for each type.
+
+* EV_SYN:
+
+  - Used as markers to separate events. Events may be separated in time or in
+    space, such as with the multitouch protocol.
+
+* EV_KEY:
+
+  - Used to describe state changes of keyboards, buttons, or other key-like
+    devices.
+
+* EV_REL:
+
+  - Used to describe relative axis value changes, e.g. moving the mouse 5 units
+    to the left.
+
+* EV_ABS:
+
+  - Used to describe absolute axis value changes, e.g. describing the
+    coordinates of a touch on a touchscreen.
+
+* EV_MSC:
+
+  - Used to describe miscellaneous input data that do not fit into other types.
+
+* EV_SW:
+
+  - Used to describe binary state input switches.
+
+* EV_LED:
+
+  - Used to turn LEDs on devices on and off.
+
+* EV_SND:
+
+  - Used to output sound to devices.
+
+* EV_REP:
+
+  - Used for autorepeating devices.
+
+* EV_FF:
+
+  - Used to send force feedback commands to an input device.
+
+* EV_PWR:
+
+  - A special type for power button and switch input.
+
+* EV_FF_STATUS:
+
+  - Used to receive force feedback device status.
+
+Event codes
+===========
+
+Event codes define the precise type of event.
+
+EV_SYN
+------
+
+EV_SYN event values are undefined. Their usage is defined only by when they are
+sent in the evdev event stream.
+
+* SYN_REPORT:
+
+  - Used to synchronize and separate events into packets of input data changes
+    occurring at the same moment in time. For example, motion of a mouse may set
+    the REL_X and REL_Y values for one motion, then emit a SYN_REPORT. The next
+    motion will emit more REL_X and REL_Y values and send another SYN_REPORT.
+
+* SYN_CONFIG:
+
+  - TBD
+
+* SYN_MT_REPORT:
+
+  - Used to synchronize and separate touch events. See the
+    multi-touch-protocol.txt document for more information.
+
+* SYN_DROPPED:
+
+  - Used to indicate buffer overrun in the evdev client's event queue.
+    Client should ignore all events up to and including next SYN_REPORT
+    event and query the device (using EVIOCG* ioctls) to obtain its
+    current state.
+
+EV_KEY
+------
+
+EV_KEY events take the form KEY_<name> or BTN_<name>. For example, KEY_A is used
+to represent the 'A' key on a keyboard. When a key is depressed, an event with
+the key's code is emitted with value 1. When the key is released, an event is
+emitted with value 0. Some hardware sends events when a key is repeated. These
+events have a value of 2. In general, KEY_<name> is used for keyboard keys, and
+BTN_<name> is used for other types of momentary switch events.
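+
+As an illustration (a sketch added for clarity, not part of the protocol
+definition), the set of keys currently held down can be read from userspace
+with the EVIOCGKEY ioctl, which fills a bitmask indexed by key code::
+
+    #include <string.h>
+    #include <sys/ioctl.h>
+    #include <linux/input.h>
+
+    /* Returns nonzero if the key with the given code is currently down. */
+    static int key_is_down(int fd, int code)
+    {
+        unsigned char keys[KEY_CNT / 8 + 1];
+
+        memset(keys, 0, sizeof(keys));
+        if (ioctl(fd, EVIOCGKEY(sizeof(keys)), keys) < 0)
+            return 0;
+        return keys[code / 8] & (1 << (code % 8));
+    }
+
+    /* e.g. key_is_down(fd, KEY_A) for a key, key_is_down(fd, BTN_LEFT) for a button */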
+
+A few EV_KEY codes have special meanings:
+
+* BTN_TOOL_<name>:
+
+  - These codes are used in conjunction with input trackpads, tablets, and
+    touchscreens. These devices may be used with fingers, pens, or other tools.
+    When an event occurs and a tool is used, the corresponding BTN_TOOL_<name>
+    code should be set to a value of 1. When the tool is no longer interacting
+    with the input device, the BTN_TOOL_<name> code should be reset to 0. All
+    trackpads, tablets, and touchscreens should use at least one BTN_TOOL_<name>
+    code when events are generated.
+
+* BTN_TOUCH:
+
+  BTN_TOUCH is used for touch contact. While an input tool is determined to be
+  within meaningful physical contact, the value of this property must be set
+  to 1. Meaningful physical contact may mean any contact, or it may mean
+  contact conditioned by an implementation defined property. For example, a
+  touchpad may set the value to 1 only when the touch pressure rises above a
+  certain value. BTN_TOUCH may be combined with BTN_TOOL_<name> codes. For
+  example, a pen tablet may set BTN_TOOL_PEN to 1 and BTN_TOUCH to 0 while the
+  pen is hovering over but not touching the tablet surface.
+
+Note: For appropriate function of the legacy mousedev emulation driver,
+BTN_TOUCH must be the first evdev code emitted in a synchronization frame.
+
+Note: Historically a touch device with BTN_TOOL_FINGER and BTN_TOUCH was
+interpreted as a touchpad by userspace, while a similar device without
+BTN_TOOL_FINGER was interpreted as a touchscreen. For backwards compatibility
+with current userspace it is recommended to follow this distinction. In the
+future, this distinction will be deprecated and the device properties ioctl
+EVIOCGPROP, defined in linux/input.h, will be used to convey the device type.
+
+* BTN_TOOL_FINGER, BTN_TOOL_DOUBLETAP, BTN_TOOL_TRIPLETAP, BTN_TOOL_QUADTAP:
+
+  - These codes denote one, two, three, and four finger interaction on a
+    trackpad or touchscreen. For example, if the user uses two fingers and moves
+    them on the touchpad in an effort to scroll content on screen,
+    BTN_TOOL_DOUBLETAP should be set to value 1 for the duration of the motion.
+    Note that all BTN_TOOL_<name> codes and the BTN_TOUCH code are orthogonal in
+    purpose. A trackpad event generated by finger touches should generate events
+    for one code from each group. At most only one of these BTN_TOOL_<name>
+    codes should have a value of 1 during any synchronization frame.
+
+Note: Historically some drivers emitted multiple of the finger count codes with
+a value of 1 in the same synchronization frame. This usage is deprecated.
+
+Note: In multitouch drivers, the input_mt_report_finger_count() function should
+be used to emit these codes. Please see multi-touch-protocol.txt for details.
+
+EV_REL
+------
+
+EV_REL events describe relative changes in a property. For example, a mouse may
+move to the left by a certain number of units, but its absolute position in
+space is unknown. If the absolute position is known, EV_ABS codes should be used
+instead of EV_REL codes.
+
+A few EV_REL codes have special meanings:
+
+* REL_WHEEL, REL_HWHEEL:
+
+  - These codes are used for vertical and horizontal scroll wheels,
+    respectively.
+
+EV_ABS
+------
+
+EV_ABS events describe absolute changes in a property. For example, a touchpad
+may emit coordinates for a touch location.
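+
+For illustration (a sketch added for clarity; the device node path is
+hypothetical), the range and current value of an absolute axis can be queried
+with the EVIOCGABS ioctl, which fills in a struct input_absinfo::
+
+    #include <fcntl.h>
+    #include <stdio.h>
+    #include <sys/ioctl.h>
+    #include <linux/input.h>
+
+    int main(void)
+    {
+        struct input_absinfo abs;
+        int fd = open("/dev/input/event0", O_RDONLY); /* adjust to your device */
+
+        if (fd < 0)
+            return 1;
+        /* Query the X axis: current value plus minimum/maximum/fuzz/flat. */
+        if (ioctl(fd, EVIOCGABS(ABS_X), &abs) == 0)
+            printf("ABS_X value %d, range [%d..%d]\n",
+                   abs.value, abs.minimum, abs.maximum);
+        return 0;
+    }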
+
+A few EV_ABS codes have special meanings:
+
+* ABS_DISTANCE:
+
+  - Used to describe the distance of a tool from an interaction surface. This
+    event should only be emitted while the tool is hovering, meaning in close
+    proximity of the device and while the value of the BTN_TOUCH code is 0. If
+    the input device may be used freely in three dimensions, consider ABS_Z
+    instead.
+  - BTN_TOOL_<name> should be set to 1 when the tool comes into detectable
+    proximity and set to 0 when the tool leaves detectable proximity.
+    BTN_TOOL_<name> signals the type of tool that is currently detected by the
+    hardware and is otherwise independent of ABS_DISTANCE and/or BTN_TOUCH.
+
+* ABS_MT_<name>:
+
+  - Used to describe multitouch input events. Please see
+    multi-touch-protocol.txt for details.
+
+EV_SW
+-----
+
+EV_SW events describe stateful binary switches. For example, the SW_LID code is
+used to denote when a laptop lid is closed.
+
+Upon binding to a device or resuming from suspend, a driver must report
+the current switch state. This ensures that the device, kernel, and userspace
+state is in sync.
+
+Upon resume, if the switch state is the same as before suspend, then the input
+subsystem will filter out the duplicate switch state reports. The driver does
+not need to keep the state of the switch at any time.
+
+EV_MSC
+------
+
+EV_MSC events are used for input and output events that do not fall under other
+categories.
+
+A few EV_MSC codes have special meaning:
+
+* MSC_TIMESTAMP:
+
+  - Used to report the number of microseconds since the last reset. This event
+    should be coded as a uint32 value, which is allowed to wrap around with
+    no special consequence. It is assumed that the time difference between two
+    consecutive events is reliable on a reasonable time scale (hours).
+    A reset to zero can happen, in which case the time since the last event is
+    unknown. If the device does not provide this information, the driver must
+    not provide it to user space.
+
+EV_LED
+------
+
+EV_LED events are used for input and output to set and query the state of
+various LEDs on devices.
+
+EV_REP
+------
+
+EV_REP events are used for specifying autorepeating events.
+
+EV_SND
+------
+
+EV_SND events are used for sending sound commands to simple sound output
+devices.
+
+EV_FF
+-----
+
+EV_FF events are used to initialize a force feedback capable device and to
+cause such a device to play back force effects.
+
+EV_PWR
+------
+
+EV_PWR events are a special type of event used specifically for power
+management. Their usage is not well defined. To be addressed later.
+
+Device properties
+=================
+
+Normally, userspace sets up an input device based on the data it emits,
+i.e., the event types. In the case of two devices emitting the same event
+types, additional information can be provided in the form of device
+properties.
+
+INPUT_PROP_DIRECT + INPUT_PROP_POINTER
+--------------------------------------
+
+The INPUT_PROP_DIRECT property indicates that device coordinates should be
+directly mapped to screen coordinates (not taking into account trivial
+transformations, such as scaling, flipping and rotating). Non-direct input
+devices require non-trivial transformation, such as absolute to relative
+transformation for touchpads. Typical direct input devices: touchscreens,
+drawing tablets; non-direct devices: touchpads, mice.
+
+The INPUT_PROP_POINTER property indicates that the device is not transposed
+on the screen and thus requires use of an on-screen pointer to trace user's
+movements. Typical pointer devices: touchpads, tablets, mice; non-pointer
+device: touchscreen.
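+
+The property bitmap is available to userspace through the EVIOCGPROP ioctl. A
+short sketch (added for illustration; assumes an already-opened evdev file
+descriptor fd)::
+
+    #include <string.h>
+    #include <sys/ioctl.h>
+    #include <linux/input.h>
+
+    /* Returns nonzero if the device advertises the given INPUT_PROP_* bit. */
+    static int has_prop(int fd, int prop)
+    {
+        unsigned char bits[INPUT_PROP_CNT / 8 + 1];
+
+        memset(bits, 0, sizeof(bits));
+        if (ioctl(fd, EVIOCGPROP(sizeof(bits)), bits) < 0)
+            return 0;
+        return bits[prop / 8] & (1 << (prop % 8));
+    }
+
+    /* e.g. has_prop(fd, INPUT_PROP_DIRECT) suggests touchscreen-like mapping */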
+
+If neither INPUT_PROP_DIRECT nor INPUT_PROP_POINTER is set, the property is
+considered undefined and the device type should be deduced in the
+traditional way, using emitted event types.
+
+INPUT_PROP_BUTTONPAD
+--------------------
+
+For touchpads where the button is placed beneath the surface, such that
+pressing down on the pad causes a button click, this property should be
+set. Common in clickpad notebooks and macbooks from 2009 and onwards.
+
+Originally, the buttonpad property was coded into the bcm5974 driver
+version field under the name integrated button. For backwards
+compatibility, both methods need to be checked in userspace.
+
+INPUT_PROP_SEMI_MT
+------------------
+
+Some touchpads, most common between 2008 and 2011, can detect the presence
+of multiple contacts without resolving the individual positions; only the
+number of contacts and a rectangular shape is known. For such
+touchpads, the semi-mt property should be set.
+
+Depending on the device, the rectangle may enclose all touches, like a
+bounding box, or just some of them, for instance the two most recent
+touches. The diversity makes the rectangle of limited use, but some
+gestures can normally be extracted from it.
+
+If INPUT_PROP_SEMI_MT is not set, the device is assumed to be a true MT
+device.
+
+INPUT_PROP_TOPBUTTONPAD
+-----------------------
+
+Some laptops, most notably the Lenovo 40 series, provide a trackstick
+device but do not have physical buttons associated with the trackstick
+device. Instead, the top area of the touchpad is marked to show
+visual/haptic areas for left, middle, right buttons intended to be used
+with the trackstick.
+
+If INPUT_PROP_TOPBUTTONPAD is set, userspace should emulate buttons
+accordingly. This property does not affect kernel behavior.
+The kernel does not provide button emulation for such devices but treats
+them as any other INPUT_PROP_BUTTONPAD device.
+
+INPUT_PROP_ACCELEROMETER
+------------------------
+
+Directional axes on this device (absolute and/or relative x, y, z) represent
+accelerometer data. Some devices also report gyroscope data, which is
+reported through the rotational axes (absolute and/or relative rx, ry, rz).
+
+All other axes retain their meaning. A device must not mix
+regular directional axes and accelerometer axes on the same event node.
+
+Guidelines
+==========
+
+The guidelines below ensure proper single-touch and multi-finger functionality.
+For multi-touch functionality, see the multi-touch-protocol.txt document for
+more information.
+
+Mice
+----
+
+REL_{X,Y} must be reported when the mouse moves. BTN_LEFT must be used to report
+the primary button press. BTN_{MIDDLE,RIGHT,4,5,etc.} should be used to report
+further buttons of the device. REL_WHEEL and REL_HWHEEL should be used to report
+scroll wheel events where available.
+
+Touchscreens
+------------
+
+ABS_{X,Y} must be reported with the location of the touch. BTN_TOUCH must be
+used to report when a touch is active on the screen.
+BTN_{MOUSE,LEFT,MIDDLE,RIGHT} must not be reported as the result of touch
+contact. BTN_TOOL_<name> events should be reported where possible.
+
+For new hardware, INPUT_PROP_DIRECT should be set.
+
+Trackpads
+---------
+
+Legacy trackpads that only provide relative position information must report
+events like mice described above.
+
+Trackpads that provide absolute touch position must report ABS_{X,Y} for the
+location of the touch.
BTN_TOUCH should be used to report when a touch is active
+on the trackpad. Where multi-finger support is available, BTN_TOOL_<name> should
+be used to report the number of touches active on the trackpad.
+
+For new hardware, INPUT_PROP_POINTER should be set.
+
+Tablets
+-------
+
+BTN_TOOL_<name> events must be reported when a stylus or other tool is active on
+the tablet. ABS_{X,Y} must be reported with the location of the tool. BTN_TOUCH
+should be used to report when the tool is in contact with the tablet.
+BTN_{STYLUS,STYLUS2} should be used to report buttons on the tool itself. Any
+button may be used for buttons on the tablet except BTN_{MOUSE,LEFT}.
+BTN_{0,1,2,etc} are good generic codes for unlabeled buttons. Do not use
+meaningful buttons, like BTN_FORWARD, unless the button is labeled for that
+purpose on the device.
+
+For new hardware, both INPUT_PROP_DIRECT and INPUT_PROP_POINTER should be set.
diff --git a/Documentation/input/event-codes.txt b/Documentation/input/event-codes.txt
deleted file mode 100644
index 36ea940e5bb91fe9711544e5d0bbe2e2293c8ca8..0000000000000000000000000000000000000000
--- a/Documentation/input/event-codes.txt
+++ /dev/null
@@ -1,352 +0,0 @@
-The input protocol uses a map of types and codes to express input device values
-to userspace. This document describes the types and codes and how and when they
-may be used.
-
-A single hardware event generates multiple input events. Each input event
-contains the new value of a single data item. A special event type, EV_SYN, is
-used to separate input events into packets of input data changes occurring at
-the same moment in time. In the following, the term "event" refers to a single
-input event encompassing a type, code, and value.
-
-The input protocol is a stateful protocol. Events are emitted only when values
-of event codes have changed. However, the state is maintained within the Linux
-input subsystem; drivers do not need to maintain the state and may attempt to
-emit unchanged values without harm. Userspace may obtain the current state of
-event code values using the EVIOCG* ioctls defined in linux/input.h. The event
-reports supported by a device are also provided by sysfs in
-class/input/event*/device/capabilities/, and the properties of a device are
-provided in class/input/event*/device/properties.
-
-Event types:
-===========
-Event types are groupings of codes under a logical input construct. Each
-type has a set of applicable codes to be used in generating events. See the
-Codes section for details on valid codes for each type.
-
-* EV_SYN:
-    - Used as markers to separate events. Events may be separated in time or in
-      space, such as with the multitouch protocol.
-
-* EV_KEY:
-    - Used to describe state changes of keyboards, buttons, or other key-like
-      devices.
-
-* EV_REL:
-    - Used to describe relative axis value changes, e.g. moving the mouse 5 units
-      to the left.
-
-* EV_ABS:
-    - Used to describe absolute axis value changes, e.g. describing the
-      coordinates of a touch on a touchscreen.
-
-* EV_MSC:
-    - Used to describe miscellaneous input data that do not fit into other types.
-
-* EV_SW:
-    - Used to describe binary state input switches.
-
-* EV_LED:
-    - Used to turn LEDs on devices on and off.
-
-* EV_SND:
-    - Used to output sound to devices.
-
-* EV_REP:
-    - Used for autorepeating devices.
-
-* EV_FF:
-    - Used to send force feedback commands to an input device.
-
-* EV_PWR:
-    - A special type for power button and switch input.
-
-* EV_FF_STATUS:
-    - Used to receive force feedback device status.
- -Event codes: -=========== -Event codes define the precise type of event. - -EV_SYN: ----------- -EV_SYN event values are undefined. Their usage is defined only by when they are -sent in the evdev event stream. - -* SYN_REPORT: - - Used to synchronize and separate events into packets of input data changes - occurring at the same moment in time. For example, motion of a mouse may set - the REL_X and REL_Y values for one motion, then emit a SYN_REPORT. The next - motion will emit more REL_X and REL_Y values and send another SYN_REPORT. - -* SYN_CONFIG: - - TBD - -* SYN_MT_REPORT: - - Used to synchronize and separate touch events. See the - multi-touch-protocol.txt document for more information. - -* SYN_DROPPED: - - Used to indicate buffer overrun in the evdev client's event queue. - Client should ignore all events up to and including next SYN_REPORT - event and query the device (using EVIOCG* ioctls) to obtain its - current state. - -EV_KEY: ----------- -EV_KEY events take the form KEY_ or BTN_. For example, KEY_A is used -to represent the 'A' key on a keyboard. When a key is depressed, an event with -the key's code is emitted with value 1. When the key is released, an event is -emitted with value 0. Some hardware send events when a key is repeated. These -events have a value of 2. In general, KEY_ is used for keyboard keys, and -BTN_ is used for other types of momentary switch events. - -A few EV_KEY codes have special meanings: - -* BTN_TOOL_: - - These codes are used in conjunction with input trackpads, tablets, and - touchscreens. These devices may be used with fingers, pens, or other tools. - When an event occurs and a tool is used, the corresponding BTN_TOOL_ - code should be set to a value of 1. When the tool is no longer interacting - with the input device, the BTN_TOOL_ code should be reset to 0. All - trackpads, tablets, and touchscreens should use at least one BTN_TOOL_ - code when events are generated. - -* BTN_TOUCH: - BTN_TOUCH is used for touch contact. While an input tool is determined to be - within meaningful physical contact, the value of this property must be set - to 1. Meaningful physical contact may mean any contact, or it may mean - contact conditioned by an implementation defined property. For example, a - touchpad may set the value to 1 only when the touch pressure rises above a - certain value. BTN_TOUCH may be combined with BTN_TOOL_ codes. For - example, a pen tablet may set BTN_TOOL_PEN to 1 and BTN_TOUCH to 0 while the - pen is hovering over but not touching the tablet surface. - -Note: For appropriate function of the legacy mousedev emulation driver, -BTN_TOUCH must be the first evdev code emitted in a synchronization frame. - -Note: Historically a touch device with BTN_TOOL_FINGER and BTN_TOUCH was -interpreted as a touchpad by userspace, while a similar device without -BTN_TOOL_FINGER was interpreted as a touchscreen. For backwards compatibility -with current userspace it is recommended to follow this distinction. In the -future, this distinction will be deprecated and the device properties ioctl -EVIOCGPROP, defined in linux/input.h, will be used to convey the device type. - -* BTN_TOOL_FINGER, BTN_TOOL_DOUBLETAP, BTN_TOOL_TRIPLETAP, BTN_TOOL_QUADTAP: - - These codes denote one, two, three, and four finger interaction on a - trackpad or touchscreen. For example, if the user uses two fingers and moves - them on the touchpad in an effort to scroll content on screen, - BTN_TOOL_DOUBLETAP should be set to value 1 for the duration of the motion. 
- Note that all BTN_TOOL_ codes and the BTN_TOUCH code are orthogonal in - purpose. A trackpad event generated by finger touches should generate events - for one code from each group. At most only one of these BTN_TOOL_ - codes should have a value of 1 during any synchronization frame. - -Note: Historically some drivers emitted multiple of the finger count codes with -a value of 1 in the same synchronization frame. This usage is deprecated. - -Note: In multitouch drivers, the input_mt_report_finger_count() function should -be used to emit these codes. Please see multi-touch-protocol.txt for details. - -EV_REL: ----------- -EV_REL events describe relative changes in a property. For example, a mouse may -move to the left by a certain number of units, but its absolute position in -space is unknown. If the absolute position is known, EV_ABS codes should be used -instead of EV_REL codes. - -A few EV_REL codes have special meanings: - -* REL_WHEEL, REL_HWHEEL: - - These codes are used for vertical and horizontal scroll wheels, - respectively. - -EV_ABS: ----------- -EV_ABS events describe absolute changes in a property. For example, a touchpad -may emit coordinates for a touch location. - -A few EV_ABS codes have special meanings: - -* ABS_DISTANCE: - - Used to describe the distance of a tool from an interaction surface. This - event should only be emitted while the tool is hovering, meaning in close - proximity of the device and while the value of the BTN_TOUCH code is 0. If - the input device may be used freely in three dimensions, consider ABS_Z - instead. - - BTN_TOOL_ should be set to 1 when the tool comes into detectable - proximity and set to 0 when the tool leaves detectable proximity. - BTN_TOOL_ signals the type of tool that is currently detected by the - hardware and is otherwise independent of ABS_DISTANCE and/or BTN_TOUCH. - -* ABS_MT_: - - Used to describe multitouch input events. Please see - multi-touch-protocol.txt for details. - -EV_SW: ----------- -EV_SW events describe stateful binary switches. For example, the SW_LID code is -used to denote when a laptop lid is closed. - -Upon binding to a device or resuming from suspend, a driver must report -the current switch state. This ensures that the device, kernel, and userspace -state is in sync. - -Upon resume, if the switch state is the same as before suspend, then the input -subsystem will filter out the duplicate switch state reports. The driver does -not need to keep the state of the switch at any time. - -EV_MSC: ----------- -EV_MSC events are used for input and output events that do not fall under other -categories. - -A few EV_MSC codes have special meaning: - -* MSC_TIMESTAMP: - - Used to report the number of microseconds since the last reset. This event - should be coded as an uint32 value, which is allowed to wrap around with - no special consequence. It is assumed that the time difference between two - consecutive events is reliable on a reasonable time scale (hours). - A reset to zero can happen, in which case the time since the last event is - unknown. If the device does not provide this information, the driver must - not provide it to user space. - -EV_LED: ----------- -EV_LED events are used for input and output to set and query the state of -various LEDs on devices. - -EV_REP: ----------- -EV_REP events are used for specifying autorepeating events. - -EV_SND: ----------- -EV_SND events are used for sending sound commands to simple sound output -devices. 
- -EV_FF: ----------- -EV_FF events are used to initialize a force feedback capable device and to cause -such device to feedback. - -EV_PWR: ----------- -EV_PWR events are a special type of event used specifically for power -management. Its usage is not well defined. To be addressed later. - -Device properties: -================= -Normally, userspace sets up an input device based on the data it emits, -i.e., the event types. In the case of two devices emitting the same event -types, additional information can be provided in the form of device -properties. - -INPUT_PROP_DIRECT + INPUT_PROP_POINTER: --------------------------------------- -The INPUT_PROP_DIRECT property indicates that device coordinates should be -directly mapped to screen coordinates (not taking into account trivial -transformations, such as scaling, flipping and rotating). Non-direct input -devices require non-trivial transformation, such as absolute to relative -transformation for touchpads. Typical direct input devices: touchscreens, -drawing tablets; non-direct devices: touchpads, mice. - -The INPUT_PROP_POINTER property indicates that the device is not transposed -on the screen and thus requires use of an on-screen pointer to trace user's -movements. Typical pointer devices: touchpads, tablets, mice; non-pointer -device: touchscreen. - -If neither INPUT_PROP_DIRECT or INPUT_PROP_POINTER are set, the property is -considered undefined and the device type should be deduced in the -traditional way, using emitted event types. - -INPUT_PROP_BUTTONPAD: --------------------- -For touchpads where the button is placed beneath the surface, such that -pressing down on the pad causes a button click, this property should be -set. Common in clickpad notebooks and macbooks from 2009 and onwards. - -Originally, the buttonpad property was coded into the bcm5974 driver -version field under the name integrated button. For backwards -compatibility, both methods need to be checked in userspace. - -INPUT_PROP_SEMI_MT: ------------------- -Some touchpads, most common between 2008 and 2011, can detect the presence -of multiple contacts without resolving the individual positions; only the -number of contacts and a rectangular shape is known. For such -touchpads, the semi-mt property should be set. - -Depending on the device, the rectangle may enclose all touches, like a -bounding box, or just some of them, for instance the two most recent -touches. The diversity makes the rectangle of limited use, but some -gestures can normally be extracted from it. - -If INPUT_PROP_SEMI_MT is not set, the device is assumed to be a true MT -device. - -INPUT_PROP_TOPBUTTONPAD: ------------------------ -Some laptops, most notably the Lenovo *40 series provide a trackstick -device but do not have physical buttons associated with the trackstick -device. Instead, the top area of the touchpad is marked to show -visual/haptic areas for left, middle, right buttons intended to be used -with the trackstick. - -If INPUT_PROP_TOPBUTTONPAD is set, userspace should emulate buttons -accordingly. This property does not affect kernel behavior. -The kernel does not provide button emulation for such devices but treats -them as any other INPUT_PROP_BUTTONPAD device. - -INPUT_PROP_ACCELEROMETER -------------------------- -Directional axes on this device (absolute and/or relative x, y, z) represent -accelerometer data. All other axes retain their meaning. A device must not mix -regular directional axes and accelerometer axes on the same event node. 
-
-Guidelines:
-==========
-The guidelines below ensure proper single-touch and multi-finger functionality.
-For multi-touch functionality, see the multi-touch-protocol.txt document for
-more information.
-
-Mice:
-----------
-REL_{X,Y} must be reported when the mouse moves. BTN_LEFT must be used to report
-the primary button press. BTN_{MIDDLE,RIGHT,4,5,etc.} should be used to report
-further buttons of the device. REL_WHEEL and REL_HWHEEL should be used to report
-scroll wheel events where available.
-
-Touchscreens:
-----------
-ABS_{X,Y} must be reported with the location of the touch. BTN_TOUCH must be
-used to report when a touch is active on the screen.
-BTN_{MOUSE,LEFT,MIDDLE,RIGHT} must not be reported as the result of touch
-contact. BTN_TOOL_ events should be reported where possible.
-
-For new hardware, INPUT_PROP_DIRECT should be set.
-
-Trackpads:
-----------
-Legacy trackpads that only provide relative position information must report
-events like mice described above.
-
-Trackpads that provide absolute touch position must report ABS_{X,Y} for the
-location of the touch. BTN_TOUCH should be used to report when a touch is active
-on the trackpad. Where multi-finger support is available, BTN_TOOL_ should
-be used to report the number of touches active on the trackpad.
-
-For new hardware, INPUT_PROP_POINTER should be set.
-
-Tablets:
-----------
-BTN_TOOL_ events must be reported when a stylus or other tool is active on
-the tablet. ABS_{X,Y} must be reported with the location of the tool. BTN_TOUCH
-should be used to report when the tool is in contact with the tablet.
-BTN_{STYLUS,STYLUS2} should be used to report buttons on the tool itself. Any
-button may be used for buttons on the tablet except BTN_{MOUSE,LEFT}.
-BTN_{0,1,2,etc} are good generic codes for unlabeled buttons. Do not use
-meaningful buttons, like BTN_FORWARD, unless the button is labeled for that
-purpose on the device.
-
-For new hardware, both INPUT_PROP_DIRECT and INPUT_PROP_POINTER should be set.
diff --git a/Documentation/input/ff.rst b/Documentation/input/ff.rst
new file mode 100644
index 0000000000000000000000000000000000000000..26d461998e08974f1e47e54f33cfc630c0a9879d
--- /dev/null
+++ b/Documentation/input/ff.rst
@@ -0,0 +1,265 @@
+========================
+Force feedback for Linux
+========================
+
+:Author: Johann Deneux on 2001/04/22.
+:Updated: Anssi Hannula on 2006/04/09.
+
+You may redistribute this file. Please remember to include shape.svg and
+interactive.svg as well.
+
+Introduction
+~~~~~~~~~~~~
+
+This document describes how to use force feedback devices under Linux. The
+goal is not to support these devices as if they were simple input-only devices
+(as is already the case), but to really enable the rendering of force
+effects.
+This document only describes the force feedback part of the Linux input
+interface. Please read joystick.txt and input.txt before reading this
+document further.
+
+Instructions to the user
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+To enable force feedback, you have to:
+
+1. have your kernel configured with evdev and a driver that supports your
+   device.
+2. make sure the evdev module is loaded and the /dev/input/event* device
+   files are created.
+
+Before you start, let me WARN you that some devices shake violently during the
+initialisation phase. This happens for example with my "AVB Top Shot Pegasus".
+To stop this annoying behaviour, move your joystick to its limits. Anyway, you
+should keep a hand on your device, so that it does not break if
+something goes wrong.
+
+If you have a serial iforce device, you need to start inputattach. See
+joystick.txt for details.
+
+Does it work?
+--------------
+
+There is a utility called fftest that will allow you to test the driver::
+
+    % fftest /dev/input/eventXX
+
+Instructions to the developer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+All interactions are done using the event API. That is, you can use ioctl()
+and write() on /dev/input/eventXX.
+This information is subject to change.
+
+Querying device capabilities
+----------------------------
+
+::
+
+    #include <linux/input.h>
+    #include <sys/ioctl.h>
+
+    #define BITS_TO_LONGS(x) \
+        (((x) + 8 * sizeof (unsigned long) - 1) / (8 * sizeof (unsigned long)))
+    unsigned long features[BITS_TO_LONGS(FF_CNT)];
+    int ioctl(int file_descriptor, int request, unsigned long *features);
+
+"request" must be EVIOCGBIT(EV_FF, size of features array in bytes)
+
+Returns the features supported by the device. features is a bitfield with the
+following bits:
+
+- FF_CONSTANT can render constant force effects
+- FF_PERIODIC can render periodic effects with the following waveforms:
+
+  - FF_SQUARE square waveform
+  - FF_TRIANGLE triangle waveform
+  - FF_SINE sine waveform
+  - FF_SAW_UP sawtooth up waveform
+  - FF_SAW_DOWN sawtooth down waveform
+  - FF_CUSTOM custom waveform
+
+- FF_RAMP can render ramp effects
+- FF_SPRING can simulate the presence of a spring
+- FF_FRICTION can simulate friction
+- FF_DAMPER can simulate damper effects
+- FF_RUMBLE rumble effects
+- FF_INERTIA can simulate inertia
+- FF_GAIN gain is adjustable
+- FF_AUTOCENTER autocenter is adjustable
+
+.. note::
+
+    - In most cases you should use FF_PERIODIC instead of FF_RUMBLE. All
+      devices that support FF_RUMBLE support FF_PERIODIC (square, triangle,
+      sine) and the other way around.
+
+    - The exact syntax of FF_CUSTOM is undefined for the time being as no
+      driver supports it yet.
+
+::
+
+    int ioctl(int fd, EVIOCGEFFECTS, int *n);
+
+Returns the number of effects the device can keep in its memory.
+
+Uploading effects to the device
+-------------------------------
+
+::
+
+    #include <linux/input.h>
+    #include <sys/ioctl.h>
+
+    int ioctl(int file_descriptor, int request, struct ff_effect *effect);
+
+"request" must be EVIOCSFF.
+
+"effect" points to a structure describing the effect to upload. The effect is
+uploaded, but not played.
+The content of effect may be modified. In particular, its field "id" is set
+to the unique id assigned by the driver. This data is required for performing
+some operations (removing an effect, controlling the playback).
+The "id" field must be set to -1 by the user in order to tell the driver to
+allocate a new effect.
+
+Effects are file descriptor specific.
+
+See <uapi/linux/input.h> for a description of the ff_effect struct. You
+should also find help in a few sketches, contained in files shape.svg
+and interactive.svg:
+
+.. kernel-figure:: shape.svg
+
+    Shape
+
+.. kernel-figure:: interactive.svg
+
+    Interactive
+
+
+Removing an effect from the device
+----------------------------------
+
+::
+
+    int ioctl(int fd, EVIOCRMFF, effect.id);
+
+This makes room for new effects in the device's memory. Note that this also
+stops the effect if it was playing.
+
+Controlling the playback of effects
+-----------------------------------
+
+Control of playing is done with write(). Below is an example:
+
+::
+
+    #include <linux/input.h>
+    #include <unistd.h>
+
+    struct input_event play;
+    struct input_event stop;
+    struct ff_effect effect;
+    int fd;
+    ...
+    fd = open("/dev/input/eventXX", O_RDWR);
+    ...
+    /* Play three times */
+    play.type = EV_FF;
+    play.code = effect.id;
+    play.value = 3;
+
+    write(fd, (const void*) &play, sizeof(play));
+    ...
+    /* Stop an effect */
+    stop.type = EV_FF;
+    stop.code = effect.id;
+    stop.value = 0;
+
+    write(fd, (const void*) &stop, sizeof(stop));
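+
+For completeness, here is a self-contained sketch of the full cycle (upload,
+play, remove). It is an illustration only; the device node path is
+hypothetical and the device is assumed to support FF_RUMBLE::
+
+    #include <fcntl.h>
+    #include <string.h>
+    #include <unistd.h>
+    #include <sys/ioctl.h>
+    #include <linux/input.h>
+
+    int main(void)
+    {
+        struct ff_effect effect;
+        struct input_event play;
+        int fd = open("/dev/input/event0", O_RDWR); /* hypothetical node */
+
+        if (fd < 0)
+            return 1;
+
+        memset(&effect, 0, sizeof(effect));
+        effect.type = FF_RUMBLE;
+        effect.id = -1;                          /* ask the driver for a new id */
+        effect.u.rumble.strong_magnitude = 0x8000;
+        effect.u.rumble.weak_magnitude = 0x4000;
+        effect.replay.length = 1000;             /* effect length in ms */
+
+        if (ioctl(fd, EVIOCSFF, &effect) < 0)    /* upload the effect */
+            return 1;
+
+        memset(&play, 0, sizeof(play));
+        play.type = EV_FF;
+        play.code = effect.id;                   /* id assigned by the driver */
+        play.value = 1;                          /* play once */
+        write(fd, &play, sizeof(play));
+
+        sleep(2);                                /* let the effect finish */
+        ioctl(fd, EVIOCRMFF, effect.id);         /* free the effect slot */
+        close(fd);
+        return 0;
+    }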
+
+Setting the gain
+----------------
+
+Not all devices have the same strength. Therefore, users should set a gain
+factor depending on how strong they want effects to be. This setting is
+persistent across accesses to the driver.
+
+::
+
+    /* Set the gain of the device */
+    int gain;              /* between 0 and 100 */
+    struct input_event ie; /* structure used to communicate with the driver */
+
+    ie.type = EV_FF;
+    ie.code = FF_GAIN;
+    ie.value = 0xFFFFUL * gain / 100;
+
+    if (write(fd, &ie, sizeof(ie)) == -1)
+        perror("set gain");
+
+Enabling/Disabling autocenter
+-----------------------------
+
+The autocenter feature quite disturbs the rendering of effects in my opinion,
+and I think it should be an effect, whose computation depends on the game
+type. But you can enable it if you want.
+
+::
+
+    int autocenter;        /* between 0 and 100 */
+    struct input_event ie;
+
+    ie.type = EV_FF;
+    ie.code = FF_AUTOCENTER;
+    ie.value = 0xFFFFUL * autocenter / 100;
+
+    if (write(fd, &ie, sizeof(ie)) == -1)
+        perror("set auto-center");
+
+A value of 0 means "no auto-center".
+
+Dynamic update of an effect
+---------------------------
+
+Proceed as if you wanted to upload a new effect, except that instead of
+setting the id field to -1, you set it to the wanted effect id.
+Normally, the effect is not stopped and restarted. However, depending on the
+type of device, not all parameters can be dynamically updated. For example,
+the direction of an effect cannot be updated with iforce devices. In this
+case, the driver stops the effect, uploads it again, and restarts it.
+
+Therefore it is recommended to dynamically change direction while the effect
+is playing only when it is ok to restart the effect with a replay count of 1.
+
+Information about the status of effects
+---------------------------------------
+
+Every time the status of an effect is changed, an event is sent. The values
+and meanings of the fields of the event are as follows::
+
+    struct input_event {
+        /* When the status of the effect changed */
+        struct timeval time;
+
+        /* Set to EV_FF_STATUS */
+        unsigned short type;
+
+        /* Contains the id of the effect */
+        unsigned short code;
+
+        /* Indicates the status */
+        unsigned int value;
+    };
+
+    FF_STATUS_STOPPED    The effect stopped playing
+    FF_STATUS_PLAYING    The effect started to play
+
+.. note::
+
+    - Status feedback is only supported by the iforce driver. If you have
+      a really good reason to use this, please contact
+      linux-joystick@atrey.karlin.mff.cuni.cz or anssi.hannula@gmail.com
+      so that support for it can be added to the rest of the drivers.
diff --git a/Documentation/input/ff.txt b/Documentation/input/ff.txt
deleted file mode 100644
index b3867bf49f8f9ec0743e3cc9384d8ed45449a972..0000000000000000000000000000000000000000
--- a/Documentation/input/ff.txt
+++ /dev/null
@@ -1,221 +0,0 @@
-Force feedback for Linux.
-By Johann Deneux on 2001/04/22.
-Updated by Anssi Hannula on 2006/04/09.
-You may redistribute this file. Please remember to include shape.fig and
-interactive.fig as well.
-----------------------------------------------------------------------------
-
-1. Introduction
-~~~~~~~~~~~~~~~
-This document describes how to use force feedback devices under Linux.
The -goal is not to support these devices as if they were simple input-only devices -(as it is already the case), but to really enable the rendering of force -effects. -This document only describes the force feedback part of the Linux input -interface. Please read joystick.txt and input.txt before reading further this -document. - -2. Instructions to the user -~~~~~~~~~~~~~~~~~~~~~~~~~~~ -To enable force feedback, you have to: - -1. have your kernel configured with evdev and a driver that supports your - device. -2. make sure evdev module is loaded and /dev/input/event* device files are - created. - -Before you start, let me WARN you that some devices shake violently during the -initialisation phase. This happens for example with my "AVB Top Shot Pegasus". -To stop this annoying behaviour, move you joystick to its limits. Anyway, you -should keep a hand on your device, in order to avoid it to break down if -something goes wrong. - -If you have a serial iforce device, you need to start inputattach. See -joystick.txt for details. - -2.1 Does it work ? -~~~~~~~~~~~~~~~~~~ -There is an utility called fftest that will allow you to test the driver. -% fftest /dev/input/eventXX - -3. Instructions to the developer -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -All interactions are done using the event API. That is, you can use ioctl() -and write() on /dev/input/eventXX. -This information is subject to change. - -3.1 Querying device capabilities -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -#include -#include - -#define BITS_TO_LONGS(x) \ - (((x) + 8 * sizeof (unsigned long) - 1) / (8 * sizeof (unsigned long))) -unsigned long features[BITS_TO_LONGS(FF_CNT)]; -int ioctl(int file_descriptor, int request, unsigned long *features); - -"request" must be EVIOCGBIT(EV_FF, size of features array in bytes ) - -Returns the features supported by the device. features is a bitfield with the -following bits: -- FF_CONSTANT can render constant force effects -- FF_PERIODIC can render periodic effects with the following waveforms: - - FF_SQUARE square waveform - - FF_TRIANGLE triangle waveform - - FF_SINE sine waveform - - FF_SAW_UP sawtooth up waveform - - FF_SAW_DOWN sawtooth down waveform - - FF_CUSTOM custom waveform -- FF_RAMP can render ramp effects -- FF_SPRING can simulate the presence of a spring -- FF_FRICTION can simulate friction -- FF_DAMPER can simulate damper effects -- FF_RUMBLE rumble effects -- FF_INERTIA can simulate inertia -- FF_GAIN gain is adjustable -- FF_AUTOCENTER autocenter is adjustable - -Note: In most cases you should use FF_PERIODIC instead of FF_RUMBLE. All - devices that support FF_RUMBLE support FF_PERIODIC (square, triangle, - sine) and the other way around. - -Note: The exact syntax FF_CUSTOM is undefined for the time being as no driver - supports it yet. - - -int ioctl(int fd, EVIOCGEFFECTS, int *n); - -Returns the number of effects the device can keep in its memory. - -3.2 Uploading effects to the device -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -#include -#include - -int ioctl(int file_descriptor, int request, struct ff_effect *effect); - -"request" must be EVIOCSFF. - -"effect" points to a structure describing the effect to upload. The effect is -uploaded, but not played. -The content of effect may be modified. In particular, its field "id" is set -to the unique id assigned by the driver. This data is required for performing -some operations (removing an effect, controlling the playback). -This if field must be set to -1 by the user in order to tell the driver to -allocate a new effect. 
- -Effects are file descriptor specific. - -See for a description of the ff_effect struct. You should also -find help in a few sketches, contained in files shape.fig and interactive.fig. -You need xfig to visualize these files. - -3.3 Removing an effect from the device -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -int ioctl(int fd, EVIOCRMFF, effect.id); - -This makes room for new effects in the device's memory. Note that this also -stops the effect if it was playing. - -3.4 Controlling the playback of effects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Control of playing is done with write(). Below is an example: - -#include -#include - - struct input_event play; - struct input_event stop; - struct ff_effect effect; - int fd; -... - fd = open("/dev/input/eventXX", O_RDWR); -... - /* Play three times */ - play.type = EV_FF; - play.code = effect.id; - play.value = 3; - - write(fd, (const void*) &play, sizeof(play)); -... - /* Stop an effect */ - stop.type = EV_FF; - stop.code = effect.id; - stop.value = 0; - - write(fd, (const void*) &play, sizeof(stop)); - -3.5 Setting the gain -~~~~~~~~~~~~~~~~~~~~ -Not all devices have the same strength. Therefore, users should set a gain -factor depending on how strong they want effects to be. This setting is -persistent across access to the driver. - -/* Set the gain of the device -int gain; /* between 0 and 100 */ -struct input_event ie; /* structure used to communicate with the driver */ - -ie.type = EV_FF; -ie.code = FF_GAIN; -ie.value = 0xFFFFUL * gain / 100; - -if (write(fd, &ie, sizeof(ie)) == -1) - perror("set gain"); - -3.6 Enabling/Disabling autocenter -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -The autocenter feature quite disturbs the rendering of effects in my opinion, -and I think it should be an effect, which computation depends on the game -type. But you can enable it if you want. - -int autocenter; /* between 0 and 100 */ -struct input_event ie; - -ie.type = EV_FF; -ie.code = FF_AUTOCENTER; -ie.value = 0xFFFFUL * autocenter / 100; - -if (write(fd, &ie, sizeof(ie)) == -1) - perror("set auto-center"); - -A value of 0 means "no auto-center". - -3.7 Dynamic update of an effect -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Proceed as if you wanted to upload a new effect, except that instead of -setting the id field to -1, you set it to the wanted effect id. -Normally, the effect is not stopped and restarted. However, depending on the -type of device, not all parameters can be dynamically updated. For example, -the direction of an effect cannot be updated with iforce devices. In this -case, the driver stops the effect, up-load it, and restart it. - -Therefore it is recommended to dynamically change direction while the effect -is playing only when it is ok to restart the effect with a replay count of 1. - -3.8 Information about the status of effects -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Every time the status of an effect is changed, an event is sent. The values -and meanings of the fields of the event are as follows: - -struct input_event { -/* When the status of the effect changed */ - struct timeval time; - -/* Set to EV_FF_STATUS */ - unsigned short type; - -/* Contains the id of the effect */ - unsigned short code; - -/* Indicates the status */ - unsigned int value; -}; - -FF_STATUS_STOPPED The effect stopped playing -FF_STATUS_PLAYING The effect started to play - -NOTE: Status feedback is only supported by iforce driver. 
If you have - a really good reason to use this, please contact - linux-joystick@atrey.karlin.mff.cuni.cz or anssi.hannula@gmail.com - so that support for it can be added to the rest of the drivers. - diff --git a/Documentation/input/gamepad.rst b/Documentation/input/gamepad.rst new file mode 100644 index 0000000000000000000000000000000000000000..4d5e7fb80a8456599608817dc8ca00478e2ee1be --- /dev/null +++ b/Documentation/input/gamepad.rst @@ -0,0 +1,191 @@ +--------------------------- +Linux Gamepad Specification +--------------------------- + +:Author: 2013 by David Herrmann + + +Introduction +~~~~~~~~~~~~ +Linux provides many different input drivers for gamepad hardware. To avoid +having user-space deal with different button-mappings for each gamepad, this +document defines how gamepads are supposed to report their data. + +Geometry +~~~~~~~~ +As "gamepad" we define devices which roughly look like this:: + + ____________________________ __ + / [__ZL__] [__ZR__] \ | + / [__ TL __] [__ TR __] \ | Front Triggers + __/________________________________\__ __| + / _ \ | + / /\ __ (N) \ | + / || __ |MO| __ _ _ \ | Main Pad + | <===DP===> |SE| |ST| (W) -|- (E) | | + \ || ___ ___ _ / | + /\ \/ / \ / \ (S) /\ __| + / \________ | LS | ____ | RS | ________/ \ | + | / \ \___/ / \ \___/ / \ | | Control Sticks + | / \_____/ \_____/ \ | __| + | / \ | + \_____/ \_____/ + + |________|______| |______|___________| + D-Pad Left Right Action Pad + Stick Stick + + |_____________| + Menu Pad + +Most gamepads have the following features: + + - Action-Pad + 4 buttons in diamonds-shape (on the right side). The buttons are + differently labeled on most devices so we define them as NORTH, + SOUTH, WEST and EAST. + - D-Pad (Direction-pad) + 4 buttons (on the left side) that point up, down, left and right. + - Menu-Pad + Different constellations, but most-times 2 buttons: SELECT - START + Furthermore, many gamepads have a fancy branded button that is used as + special system-button. It often looks different to the other buttons and + is used to pop up system-menus or system-settings. + - Analog-Sticks + Analog-sticks provide freely moveable sticks to control directions. Not + all devices have both or any, but they are present at most times. + Analog-sticks may also provide a digital button if you press them. + - Triggers + Triggers are located on the upper-side of the pad in vertical direction. + Not all devices provide them, but the upper buttons are normally named + Left- and Right-Triggers, the lower buttons Z-Left and Z-Right. + - Rumble + Many devices provide force-feedback features. But are mostly just + simple rumble motors. + +Detection +~~~~~~~~~ + +All gamepads that follow the protocol described here map BTN_GAMEPAD. This is +an alias for BTN_SOUTH/BTN_A. It can be used to identify a gamepad as such. +However, not all gamepads provide all features, so you need to test for all +features that you need, first. How each feature is mapped is described below. + +Legacy drivers often don't comply to these rules. As we cannot change them +for backwards-compatibility reasons, you need to provide fixup mappings in +user-space yourself. Some of them might also provide module-options that +change the mappings so you can advise users to set these. + +All new gamepads are supposed to comply with this mapping. Please report any +bugs, if they don't. + +There are a lot of less-featured/less-powerful devices out there, which re-use +the buttons from this protocol. However, they try to do this in a compatible +fashion. 
For example, the "Nintendo Wii Nunchuk" provides two trigger buttons +and one analog stick. It reports them as if it were a gamepad with only one +analog stick and two trigger buttons on the right side. +But that means, that if you only support "real" gamepads, you must test +devices for _all_ reported events that you need. Otherwise, you will also get +devices that report a small subset of the events. + +No other devices, that do not look/feel like a gamepad, shall report these +events. + +Events +~~~~~~ + +Gamepads report the following events: + +- Action-Pad: + + Every gamepad device has at least 2 action buttons. This means, that every + device reports BTN_SOUTH (which BTN_GAMEPAD is an alias for). Regardless + of the labels on the buttons, the codes are sent according to the + physical position of the buttons. + + Please note that 2- and 3-button pads are fairly rare and old. You might + want to filter gamepads that do not report all four. + + - 2-Button Pad: + + If only 2 action-buttons are present, they are reported as BTN_SOUTH and + BTN_EAST. For vertical layouts, the upper button is BTN_EAST. For + horizontal layouts, the button more on the right is BTN_EAST. + + - 3-Button Pad: + + If only 3 action-buttons are present, they are reported as (from left + to right): BTN_WEST, BTN_SOUTH, BTN_EAST + If the buttons are aligned perfectly vertically, they are reported as + (from top down): BTN_WEST, BTN_SOUTH, BTN_EAST + + - 4-Button Pad: + + If all 4 action-buttons are present, they can be aligned in two + different formations. If diamond-shaped, they are reported as BTN_NORTH, + BTN_WEST, BTN_SOUTH, BTN_EAST according to their physical location. + If rectangular-shaped, the upper-left button is BTN_NORTH, lower-left + is BTN_WEST, lower-right is BTN_SOUTH and upper-right is BTN_EAST. + +- D-Pad: + + Every gamepad provides a D-Pad with four directions: Up, Down, Left, Right + Some of these are available as digital buttons, some as analog buttons. Some + may even report both. The kernel does not convert between these so + applications should support both and choose what is more appropriate if + both are reported. + + - Digital buttons are reported as: + + BTN_DPAD_* + + - Analog buttons are reported as: + + ABS_HAT0X and ABS_HAT0Y + + (for ABS values negative is left/up, positive is right/down) + +- Analog-Sticks: + + The left analog-stick is reported as ABS_X, ABS_Y. The right analog stick is + reported as ABS_RX, ABS_RY. Zero, one or two sticks may be present. + If analog-sticks provide digital buttons, they are mapped accordingly as + BTN_THUMBL (first/left) and BTN_THUMBR (second/right). + + (for ABS values negative is left/up, positive is right/down) + +- Triggers: + + Trigger buttons can be available as digital or analog buttons or both. User- + space must correctly deal with any situation and choose the most appropriate + mode. + + Upper trigger buttons are reported as BTN_TR or ABS_HAT1X (right) and BTN_TL + or ABS_HAT1Y (left). Lower trigger buttons are reported as BTN_TR2 or + ABS_HAT2X (right/ZR) and BTN_TL2 or ABS_HAT2Y (left/ZL). + + If only one trigger-button combination is present (upper+lower), they are + reported as "right" triggers (BTN_TR/ABS_HAT1X). + + (ABS trigger values start at 0, pressure is reported as positive values) + +- Menu-Pad: + + Menu buttons are always digital and are mapped according to their location + instead of their labels. 
That is: + + - 1-button Pad: + + Mapped as BTN_START + + - 2-button Pad: + + Left button mapped as BTN_SELECT, right button mapped as BTN_START + + Many pads also have a third button which is branded or has a special symbol + and meaning. Such buttons are mapped as BTN_MODE. Examples are the Nintendo + "HOME" button, the XBox "X"-button or Sony "PS" button. + +- Rumble: + + Rumble is advertised as FF_RUMBLE. diff --git a/Documentation/input/gamepad.txt b/Documentation/input/gamepad.txt deleted file mode 100644 index 3f6d8a5e9cdc38f5ba0c2ec9a10ff68bb5949405..0000000000000000000000000000000000000000 --- a/Documentation/input/gamepad.txt +++ /dev/null @@ -1,159 +0,0 @@ - Linux Gamepad API ----------------------------------------------------------------------------- - -1. Intro -~~~~~~~~ -Linux provides many different input drivers for gamepad hardware. To avoid -having user-space deal with different button-mappings for each gamepad, this -document defines how gamepads are supposed to report their data. - -2. Geometry -~~~~~~~~~~~ -As "gamepad" we define devices which roughly look like this: - - ____________________________ __ - / [__ZL__] [__ZR__] \ | - / [__ TL __] [__ TR __] \ | Front Triggers - __/________________________________\__ __| - / _ \ | - / /\ __ (N) \ | - / || __ |MO| __ _ _ \ | Main Pad - | <===DP===> |SE| |ST| (W) -|- (E) | | - \ || ___ ___ _ / | - /\ \/ / \ / \ (S) /\ __| - / \________ | LS | ____ | RS | ________/ \ | - | / \ \___/ / \ \___/ / \ | | Control Sticks - | / \_____/ \_____/ \ | __| - | / \ | - \_____/ \_____/ - - |________|______| |______|___________| - D-Pad Left Right Action Pad - Stick Stick - - |_____________| - Menu Pad - -Most gamepads have the following features: - - Action-Pad - 4 buttons in diamonds-shape (on the right side). The buttons are - differently labeled on most devices so we define them as NORTH, - SOUTH, WEST and EAST. - - D-Pad (Direction-pad) - 4 buttons (on the left side) that point up, down, left and right. - - Menu-Pad - Different constellations, but most-times 2 buttons: SELECT - START - Furthermore, many gamepads have a fancy branded button that is used as - special system-button. It often looks different to the other buttons and - is used to pop up system-menus or system-settings. - - Analog-Sticks - Analog-sticks provide freely moveable sticks to control directions. Not - all devices have both or any, but they are present at most times. - Analog-sticks may also provide a digital button if you press them. - - Triggers - Triggers are located on the upper-side of the pad in vertical direction. - Not all devices provide them, but the upper buttons are normally named - Left- and Right-Triggers, the lower buttons Z-Left and Z-Right. - - Rumble - Many devices provide force-feedback features. But are mostly just - simple rumble motors. - -3. Detection -~~~~~~~~~~~~ -All gamepads that follow the protocol described here map BTN_GAMEPAD. This is -an alias for BTN_SOUTH/BTN_A. It can be used to identify a gamepad as such. -However, not all gamepads provide all features, so you need to test for all -features that you need, first. How each feature is mapped is described below. - -Legacy drivers often don't comply to these rules. As we cannot change them -for backwards-compatibility reasons, you need to provide fixup mappings in -user-space yourself. Some of them might also provide module-options that -change the mappings so you can advise users to set these. - -All new gamepads are supposed to comply with this mapping. 
Please report any -bugs, if they don't. - -There are a lot of less-featured/less-powerful devices out there, which re-use -the buttons from this protocol. However, they try to do this in a compatible -fashion. For example, the "Nintendo Wii Nunchuk" provides two trigger buttons -and one analog stick. It reports them as if it were a gamepad with only one -analog stick and two trigger buttons on the right side. -But that means, that if you only support "real" gamepads, you must test -devices for _all_ reported events that you need. Otherwise, you will also get -devices that report a small subset of the events. - -No other devices, that do not look/feel like a gamepad, shall report these -events. - -4. Events -~~~~~~~~~ -Gamepads report the following events: - -Action-Pad: - Every gamepad device has at least 2 action buttons. This means, that every - device reports BTN_SOUTH (which BTN_GAMEPAD is an alias for). Regardless - of the labels on the buttons, the codes are sent according to the - physical position of the buttons. - Please note that 2- and 3-button pads are fairly rare and old. You might - want to filter gamepads that do not report all four. - 2-Button Pad: - If only 2 action-buttons are present, they are reported as BTN_SOUTH and - BTN_EAST. For vertical layouts, the upper button is BTN_EAST. For - horizontal layouts, the button more on the right is BTN_EAST. - 3-Button Pad: - If only 3 action-buttons are present, they are reported as (from left - to right): BTN_WEST, BTN_SOUTH, BTN_EAST - If the buttons are aligned perfectly vertically, they are reported as - (from top down): BTN_WEST, BTN_SOUTH, BTN_EAST - 4-Button Pad: - If all 4 action-buttons are present, they can be aligned in two - different formations. If diamond-shaped, they are reported as BTN_NORTH, - BTN_WEST, BTN_SOUTH, BTN_EAST according to their physical location. - If rectangular-shaped, the upper-left button is BTN_NORTH, lower-left - is BTN_WEST, lower-right is BTN_SOUTH and upper-right is BTN_EAST. - -D-Pad: - Every gamepad provides a D-Pad with four directions: Up, Down, Left, Right - Some of these are available as digital buttons, some as analog buttons. Some - may even report both. The kernel does not convert between these so - applications should support both and choose what is more appropriate if - both are reported. - Digital buttons are reported as: - BTN_DPAD_* - Analog buttons are reported as: - ABS_HAT0X and ABS_HAT0Y - (for ABS values negative is left/up, positive is right/down) - -Analog-Sticks: - The left analog-stick is reported as ABS_X, ABS_Y. The right analog stick is - reported as ABS_RX, ABS_RY. Zero, one or two sticks may be present. - If analog-sticks provide digital buttons, they are mapped accordingly as - BTN_THUMBL (first/left) and BTN_THUMBR (second/right). - (for ABS values negative is left/up, positive is right/down) - -Triggers: - Trigger buttons can be available as digital or analog buttons or both. User- - space must correctly deal with any situation and choose the most appropriate - mode. - Upper trigger buttons are reported as BTN_TR or ABS_HAT1X (right) and BTN_TL - or ABS_HAT1Y (left). Lower trigger buttons are reported as BTN_TR2 or - ABS_HAT2X (right/ZR) and BTN_TL2 or ABS_HAT2Y (left/ZL). - If only one trigger-button combination is present (upper+lower), they are - reported as "right" triggers (BTN_TR/ABS_HAT1X). 
- (ABS trigger values start at 0, pressure is reported as positive values)
-
-Menu-Pad:
-  Menu buttons are always digital and are mapped according to their location
-  instead of their labels. That is:
-  1-button Pad: Mapped as BTN_START
-  2-button Pad: Left button mapped as BTN_SELECT, right button mapped as
-                BTN_START
-  Many pads also have a third button which is branded or has a special symbol
-  and meaning. Such buttons are mapped as BTN_MODE. Examples are the Nintendo
-  "HOME" button, the XBox "X"-button or Sony "PS" button.
-
-Rumble:
-  Rumble is advertised as FF_RUMBLE.
-
-----------------------------------------------------------------------------
-  Written 2013 by David Herrmann
diff --git a/Documentation/input/gameport-programming.rst b/Documentation/input/gameport-programming.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c96911df1c54afbdc97780becb7be50f4c752c1b
--- /dev/null
+++ b/Documentation/input/gameport-programming.rst
@@ -0,0 +1,222 @@
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Programming gameport drivers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A basic classic gameport
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+If the gameport doesn't provide more than the inb()/outb() functionality,
+the code needed to register it with the joystick drivers is simple::
+
+    struct gameport gameport;
+
+    gameport.io = MY_IO_ADDRESS;
+    gameport_register_port(&gameport);
+
+Make sure struct gameport is initialized to 0 in all other fields. The
+gameport generic code will take care of the rest.
+
+If your hardware supports more than one io address, and your driver can
+choose which one to program the hardware to, starting from the more exotic
+addresses is preferred, because the likelihood of clashing with the standard
+0x201 address is smaller.
+
+E.g. if your driver supports addresses 0x200, 0x208, 0x210 and 0x218, then
+0x218 would be the address of first choice.
+
+If your hardware supports a gameport address that is not mapped to ISA io
+space (is above 0x1000), use that one, and don't map the ISA mirror.
+
+Also, always request_region() on the whole io space occupied by the
+gameport. Although only one ioport is really used, the gameport usually
+occupies from one to sixteen addresses in the io space.
+
+Please also consider enabling the gameport on the card in the ->open()
+callback if the io is mapped to ISA space - this way it'll occupy the io
+space only when something really is using it. Disable it again in the
+->close() callback. You also can select the io address in the ->open()
+callback, so that it doesn't fail if some of the possible addresses are
+already occupied by other gameports.
+
+Memory mapped gameport
+~~~~~~~~~~~~~~~~~~~~~~
+
+When a gameport can be accessed through MMIO, this way is preferred, because
+it is faster, allowing more reads per second. Registering such a gameport
+isn't as easy as a basic IO one, but not much more complex::
+
+    struct gameport gameport;
+
+    void my_trigger(struct gameport *gameport)
+    {
+        my_mmio = 0xff;
+    }
+
+    unsigned char my_read(struct gameport *gameport)
+    {
+        return my_mmio;
+    }
+
+    gameport.read = my_read;
+    gameport.trigger = my_trigger;
+    gameport_register_port(&gameport);
+
+.. _gameport_pgm_cooked_mode:
+
+Cooked mode gameport
+~~~~~~~~~~~~~~~~~~~~
+
+There are gameports that can report the axis values as numbers, that means
+the driver doesn't have to measure them the old way - an ADC is built into
+the gameport.
To register a cooked gameport::
+
+    struct gameport gameport;
+
+    int my_cooked_read(struct gameport *gameport, int *axes, int *buttons)
+    {
+        int i;
+
+        for (i = 0; i < 4; i++)
+            axes[i] = my_mmio[i];
+        buttons[0] = my_mmio[4];
+
+        return 0;
+    }
+
+    int my_open(struct gameport *gameport, int mode)
+    {
+        return -(mode != GAMEPORT_MODE_COOKED);
+    }
+
+    gameport.cooked_read = my_cooked_read;
+    gameport.open = my_open;
+    gameport.fuzz = 8;
+    gameport_register_port(&gameport);
+
+The only confusing thing here is the fuzz value. Best determined by
+experimentation, it is the amount of noise in the ADC data. Perfect
+gameports can set this to zero; most common ones have fuzz between 8 and 32.
+See analog.c and input.c for handling of fuzz - the fuzz value determines
+the size of a Gaussian filter window that is used to eliminate the noise
+in the data.
+
+More complex gameports
+~~~~~~~~~~~~~~~~~~~~~~
+
+Gameports can support both raw and cooked modes. In that case, combine the
+basic example with the memory mapped one, or with the cooked mode one.
+Gameports can support internal calibration - see below, and also lightning.c
+and analog.c on how that works. If your driver supports more than one
+gameport instance simultaneously, use the ->private member of the gameport
+struct to point to your data.
+
+Unregistering a gameport
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Simple::
+
+    gameport_unregister_port(&gameport);
+
+The gameport structure
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. note::
+
+   This section is outdated. There are several fields here that don't
+   match what is in include/linux/gameport.h.
+
+::
+
+    struct gameport {
+
+        void *private;
+
+A private pointer for free use in the gameport driver. (Not the joystick
+driver!)
+
+::
+
+        int number;
+
+Number assigned to the gameport when registered. Informational purpose only.
+
+::
+
+        int io;
+
+I/O address for use with raw mode. You have to set either this or ->read()
+to some value if your gameport supports raw mode.
+
+::
+
+        int speed;
+
+Raw mode speed of the gameport reads in thousands of reads per second.
+
+::
+
+        int fuzz;
+
+If the gameport supports cooked mode, this should be set to a value that
+represents the amount of noise in the data. See
+:ref:`gameport_pgm_cooked_mode`.
+
+::
+
+        void (*trigger)(struct gameport *);
+
+Trigger. This function should trigger the ns558 oneshots. If set to NULL,
+outb(0xff, io) will be used.
+
+::
+
+        unsigned char (*read)(struct gameport *);
+
+Read the buttons and ns558 oneshot bits. If set to NULL, inb(io) will be
+used instead.
+
+::
+
+        int (*cooked_read)(struct gameport *, int *axes, int *buttons);
+
+If the gameport supports cooked mode, it should point this to its cooked
+read function. It should fill axes[0..3] with four values of the joystick axes
+and buttons[0] with four bits representing the buttons.
+
+::
+
+        int (*calibrate)(struct gameport *, int *axes, int *max);
+
+Function for calibrating the ADC hardware. When called, axes[0..3] should be
+pre-filled by cooked data by the caller, max[0..3] should be pre-filled with
+expected maximums for each axis. The calibrate() function should set the
+sensitivity of the ADC hardware so that the maximums fit in its range and
+recompute the axes[] values to match the new sensitivity or re-read them from
+the hardware so that they give valid values.
+
+::
+
+        int (*open)(struct gameport *, int mode);
+
+Open() serves two purposes. First, a driver opens the port in either raw or
+cooked mode; the open() callback can decide which modes are supported.
+Second, resource allocation can happen here.
The port can also be enabled +here. Prior to this call, other fields of the gameport struct (namely the io +member) need not to be valid. + +:: + + void (*close)(struct gameport *); + +Close() should free the resources allocated by open, possibly disabling the +gameport. + +:: + + struct gameport_dev *dev; + struct gameport *next; + +For internal use by the gameport layer. + +:: + + }; + +Enjoy! diff --git a/Documentation/input/gameport-programming.txt b/Documentation/input/gameport-programming.txt deleted file mode 100644 index 03a74fc3b49679c0326a59faa43b2e423751e0c5..0000000000000000000000000000000000000000 --- a/Documentation/input/gameport-programming.txt +++ /dev/null @@ -1,187 +0,0 @@ -Programming gameport drivers -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -1. A basic classic gameport -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -If the gameport doesn't provide more than the inb()/outb() functionality, -the code needed to register it with the joystick drivers is simple: - - struct gameport gameport; - - gameport.io = MY_IO_ADDRESS; - gameport_register_port(&gameport); - -Make sure struct gameport is initialized to 0 in all other fields. The -gameport generic code will take care of the rest. - -If your hardware supports more than one io address, and your driver can -choose which one to program the hardware to, starting from the more exotic -addresses is preferred, because the likelihood of clashing with the standard -0x201 address is smaller. - -Eg. if your driver supports addresses 0x200, 0x208, 0x210 and 0x218, then -0x218 would be the address of first choice. - -If your hardware supports a gameport address that is not mapped to ISA io -space (is above 0x1000), use that one, and don't map the ISA mirror. - -Also, always request_region() on the whole io space occupied by the -gameport. Although only one ioport is really used, the gameport usually -occupies from one to sixteen addresses in the io space. - -Please also consider enabling the gameport on the card in the ->open() -callback if the io is mapped to ISA space - this way it'll occupy the io -space only when something really is using it. Disable it again in the -->close() callback. You also can select the io address in the ->open() -callback, so that it doesn't fail if some of the possible addresses are -already occupied by other gameports. - -2. Memory mapped gameport -~~~~~~~~~~~~~~~~~~~~~~~~~ - -When a gameport can be accessed through MMIO, this way is preferred, because -it is faster, allowing more reads per second. Registering such a gameport -isn't as easy as a basic IO one, but not so much complex: - - struct gameport gameport; - - void my_trigger(struct gameport *gameport) - { - my_mmio = 0xff; - } - - unsigned char my_read(struct gameport *gameport) - { - return my_mmio; - } - - gameport.read = my_read; - gameport.trigger = my_trigger; - gameport_register_port(&gameport); - -3. Cooked mode gameport -~~~~~~~~~~~~~~~~~~~~~~~ - -There are gameports that can report the axis values as numbers, that means -the driver doesn't have to measure them the old way - an ADC is built into -the gameport. 
To register a cooked gameport: - - struct gameport gameport; - - int my_cooked_read(struct gameport *gameport, int *axes, int *buttons) - { - int i; - - for (i = 0; i < 4; i++) - axes[i] = my_mmio[i]; - buttons[i] = my_mmio[4]; - } - - int my_open(struct gameport *gameport, int mode) - { - return -(mode != GAMEPORT_MODE_COOKED); - } - - gameport.cooked_read = my_cooked_read; - gameport.open = my_open; - gameport.fuzz = 8; - gameport_register_port(&gameport); - -The only confusing thing here is the fuzz value. Best determined by -experimentation, it is the amount of noise in the ADC data. Perfect -gameports can set this to zero, most common have fuzz between 8 and 32. -See analog.c and input.c for handling of fuzz - the fuzz value determines -the size of a gaussian filter window that is used to eliminate the noise -in the data. - -4. More complex gameports -~~~~~~~~~~~~~~~~~~~~~~~~~ - -Gameports can support both raw and cooked modes. In that case combine either -examples 1+2 or 1+3. Gameports can support internal calibration - see below, -and also lightning.c and analog.c on how that works. If your driver supports -more than one gameport instance simultaneously, use the ->private member of -the gameport struct to point to your data. - -5. Unregistering a gameport -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -Simple: - -gameport_unregister_port(&gameport); - -6. The gameport structure -~~~~~~~~~~~~~~~~~~~~~~~~~ - -struct gameport { - - void *private; - -A private pointer for free use in the gameport driver. (Not the joystick -driver!) - - int number; - -Number assigned to the gameport when registered. Informational purpose only. - - int io; - -I/O address for use with raw mode. You have to either set this, or ->read() -to some value if your gameport supports raw mode. - - int speed; - -Raw mode speed of the gameport reads in thousands of reads per second. - - int fuzz; - -If the gameport supports cooked mode, this should be set to a value that -represents the amount of noise in the data. See section 3. - - void (*trigger)(struct gameport *); - -Trigger. This function should trigger the ns558 oneshots. If set to NULL, -outb(0xff, io) will be used. - - unsigned char (*read)(struct gameport *); - -Read the buttons and ns558 oneshot bits. If set to NULL, inb(io) will be -used instead. - - int (*cooked_read)(struct gameport *, int *axes, int *buttons); - -If the gameport supports cooked mode, it should point this to its cooked -read function. It should fill axes[0..3] with four values of the joystick axes -and buttons[0] with four bits representing the buttons. - - int (*calibrate)(struct gameport *, int *axes, int *max); - -Function for calibrating the ADC hardware. When called, axes[0..3] should be -pre-filled by cooked data by the caller, max[0..3] should be pre-filled with -expected maximums for each axis. The calibrate() function should set the -sensitivity of the ADC hardware so that the maximums fit in its range and -recompute the axes[] values to match the new sensitivity or re-read them from -the hardware so that they give valid values. - - int (*open)(struct gameport *, int mode); - -Open() serves two purposes. First a driver either opens the port in raw or -in cooked mode, the open() callback can decide which modes are supported. -Second, resource allocation can happen here. The port can also be enabled -here. Prior to this call, other fields of the gameport struct (namely the io -member) need not to be valid. 
- - void (*close)(struct gameport *); - -Close() should free the resources allocated by open, possibly disabling the -gameport. - - struct gameport_dev *dev; - struct gameport *next; - -For internal use by the gameport layer. - -}; - -Enjoy! diff --git a/Documentation/input/gpio-tilt.txt b/Documentation/input/gpio-tilt.txt deleted file mode 100644 index 2cdfd9bcb1afc1015b4f98201f6b38cdab1691b3..0000000000000000000000000000000000000000 --- a/Documentation/input/gpio-tilt.txt +++ /dev/null @@ -1,103 +0,0 @@ -Driver for tilt-switches connected via GPIOs -============================================ - -Generic driver to read data from tilt switches connected via gpios. -Orientation can be provided by one or more than one tilt switches, -i.e. each tilt switch providing one axis, and the number of axes -is also not limited. - - -Data structures: ----------------- - -The array of struct gpio in the gpios field is used to list the gpios -that represent the current tilt state. - -The array of struct gpio_tilt_axis describes the axes that are reported -to the input system. The values set therein are used for the -input_set_abs_params calls needed to init the axes. - -The array of struct gpio_tilt_state maps gpio states to the corresponding -values to report. The gpio state is represented as a bitfield where the -bit-index corresponds to the index of the gpio in the struct gpio array. -In the same manner the values stored in the axes array correspond to -the elements of the gpio_tilt_axis-array. - - -Example: --------- - -Example configuration for a single TS1003 tilt switch that rotates around -one axis in 4 steps and emits the current tilt via two GPIOs. - -static int sg060_tilt_enable(struct device *dev) { - /* code to enable the sensors */ -}; - -static void sg060_tilt_disable(struct device *dev) { - /* code to disable the sensors */ -}; - -static struct gpio sg060_tilt_gpios[] = { - { SG060_TILT_GPIO_SENSOR1, GPIOF_IN, "tilt_sensor1" }, - { SG060_TILT_GPIO_SENSOR2, GPIOF_IN, "tilt_sensor2" }, -}; - -static struct gpio_tilt_state sg060_tilt_states[] = { - { - .gpios = (0 << 1) | (0 << 0), - .axes = (int[]) { - 0, - }, - }, { - .gpios = (0 << 1) | (1 << 0), - .axes = (int[]) { - 1, /* 90 degrees */ - }, - }, { - .gpios = (1 << 1) | (1 << 0), - .axes = (int[]) { - 2, /* 180 degrees */ - }, - }, { - .gpios = (1 << 1) | (0 << 0), - .axes = (int[]) { - 3, /* 270 degrees */ - }, - }, -}; - -static struct gpio_tilt_axis sg060_tilt_axes[] = { - { - .axis = ABS_RY, - .min = 0, - .max = 3, - .fuzz = 0, - .flat = 0, - }, -}; - -static struct gpio_tilt_platform_data sg060_tilt_pdata= { - .gpios = sg060_tilt_gpios, - .nr_gpios = ARRAY_SIZE(sg060_tilt_gpios), - - .axes = sg060_tilt_axes, - .nr_axes = ARRAY_SIZE(sg060_tilt_axes), - - .states = sg060_tilt_states, - .nr_states = ARRAY_SIZE(sg060_tilt_states), - - .debounce_interval = 100, - - .poll_interval = 1000, - .enable = sg060_tilt_enable, - .disable = sg060_tilt_disable, -}; - -static struct platform_device sg060_device_tilt = { - .name = "gpio-tilt-polled", - .id = -1, - .dev = { - .platform_data = &sg060_tilt_pdata, - }, -}; diff --git a/Documentation/input/iforce-protocol.txt b/Documentation/input/iforce-protocol.txt deleted file mode 100644 index 66287151c54a99fd76cab71888ac2b720f4c489b..0000000000000000000000000000000000000000 --- a/Documentation/input/iforce-protocol.txt +++ /dev/null @@ -1,258 +0,0 @@ -** Introduction -This document describes what I managed to discover about the protocol used to -specify force effects to I-Force 2.0 devices. 
None of this information comes -from Immerse. That's why you should not trust what is written in this -document. This document is intended to help understanding the protocol. -This is not a reference. Comments and corrections are welcome. To contact me, -send an email to: johann.deneux@gmail.com - -** WARNING ** -I shall not be held responsible for any damage or harm caused if you try to -send data to your I-Force device based on what you read in this document. - -** Preliminary Notes: -All values are hexadecimal with big-endian encoding (msb on the left). Beware, -values inside packets are encoded using little-endian. Bytes whose roles are -unknown are marked ??? Information that needs deeper inspection is marked (?) - -** General form of a packet ** -This is how packets look when the device uses the rs232 to communicate. -2B OP LEN DATA CS -CS is the checksum. It is equal to the exclusive or of all bytes. - -When using USB: -OP DATA -The 2B, LEN and CS fields have disappeared, probably because USB handles frames and -data corruption is handled or unsignificant. - -First, I describe effects that are sent by the device to the computer - -** Device input state -This packet is used to indicate the state of each button and the value of each -axis -OP= 01 for a joystick, 03 for a wheel -LEN= Varies from device to device -00 X-Axis lsb -01 X-Axis msb -02 Y-Axis lsb, or gas pedal for a wheel -03 Y-Axis msb, or brake pedal for a wheel -04 Throttle -05 Buttons -06 Lower 4 bits: Buttons - Upper 4 bits: Hat -07 Rudder - -** Device effects states -OP= 02 -LEN= Varies -00 ? Bit 1 (Value 2) is the value of the deadman switch -01 Bit 8 is set if the effect is playing. Bits 0 to 7 are the effect id. -02 ?? -03 Address of parameter block changed (lsb) -04 Address of parameter block changed (msb) -05 Address of second parameter block changed (lsb) -... depending on the number of parameter blocks updated - -** Force effect ** -OP= 01 -LEN= 0e -00 Channel (when playing several effects at the same time, each must be assigned a channel) -01 Wave form - Val 00 Constant - Val 20 Square - Val 21 Triangle - Val 22 Sine - Val 23 Sawtooth up - Val 24 Sawtooth down - Val 40 Spring (Force = f(pos)) - Val 41 Friction (Force = f(velocity)) and Inertia (Force = f(acceleration)) - - -02 Axes affected and trigger - Bits 4-7: Val 2 = effect along one axis. Byte 05 indicates direction - Val 4 = X axis only. Byte 05 must contain 5a - Val 8 = Y axis only. Byte 05 must contain b4 - Val c = X and Y axes. Bytes 05 must contain 60 - Bits 0-3: Val 0 = No trigger - Val x+1 = Button x triggers the effect - When the whole byte is 0, cancel the previously set trigger - -03-04 Duration of effect (little endian encoding, in ms) - -05 Direction of effect, if applicable. Else, see 02 for value to assign. - -06-07 Minimum time between triggering. - -08-09 Address of periodicity or magnitude parameters -0a-0b Address of attack and fade parameters, or ffff if none. -*or* -08-09 Address of interactive parameters for X-axis, or ffff if not applicable -0a-0b Address of interactive parameters for Y-axis, or ffff if not applicable - -0c-0d Delay before execution of effect (little endian encoding, in ms) - - -** Time based parameters ** - -*** Attack and fade *** -OP= 02 -LEN= 08 -00-01 Address where to store the parameters -02-03 Duration of attack (little endian encoding, in ms) -04 Level at end of attack. Signed byte. -05-06 Duration of fade. -07 Level at end of fade. - -*** Magnitude *** -OP= 03 -LEN= 03 -00-01 Address -02 Level. Signed byte. 
- -*** Periodicity *** -OP= 04 -LEN= 07 -00-01 Address -02 Magnitude. Signed byte. -03 Offset. Signed byte. -04 Phase. Val 00 = 0 deg, Val 40 = 90 degs. -05-06 Period (little endian encoding, in ms) - -** Interactive parameters ** -OP= 05 -LEN= 0a -00-01 Address -02 Positive Coeff -03 Negative Coeff -04+05 Offset (center) -06+07 Dead band (Val 01F4 = 5000 (decimal)) -08 Positive saturation (Val 0a = 1000 (decimal) Val 64 = 10000 (decimal)) -09 Negative saturation - -The encoding is a bit funny here: For coeffs, these are signed values. The -maximum value is 64 (100 decimal), the min is 9c. -For the offset, the minimum value is FE0C, the maximum value is 01F4. -For the deadband, the minimum value is 0, the max is 03E8. - -** Controls ** -OP= 41 -LEN= 03 -00 Channel -01 Start/Stop - Val 00: Stop - Val 01: Start and play once. - Val 41: Start and play n times (See byte 02 below) -02 Number of iterations n. - -** Init ** - -*** Querying features *** -OP= ff -Query command. Length varies according to the query type. -The general format of this packet is: -ff 01 QUERY [INDEX] CHECKSUM -responses are of the same form: -FF LEN QUERY VALUE_QUERIED CHECKSUM2 -where LEN = 1 + length(VALUE_QUERIED) - -**** Query ram size **** -QUERY = 42 ('B'uffer size) -The device should reply with the same packet plus two additional bytes -containing the size of the memory: -ff 03 42 03 e8 CS would mean that the device has 1000 bytes of ram available. - -**** Query number of effects **** -QUERY = 4e ('N'umber of effects) -The device should respond by sending the number of effects that can be played -at the same time (one byte) -ff 02 4e 14 CS would stand for 20 effects. - -**** Vendor's id **** -QUERY = 4d ('M'anufacturer) -Query the vendors'id (2 bytes) - -**** Product id ***** -QUERY = 50 ('P'roduct) -Query the product id (2 bytes) - -**** Open device **** -QUERY = 4f ('O'pen) -No data returned. - -**** Close device ***** -QUERY = 43 ('C')lose -No data returned. - -**** Query effect **** -QUERY = 45 ('E') -Send effect type. -Returns nonzero if supported (2 bytes) - -**** Firmware Version **** -QUERY = 56 ('V'ersion) -Sends back 3 bytes - major, minor, subminor - -*** Initialisation of the device *** - -**** Set Control **** -!!! Device dependent, can be different on different models !!! -OP= 40 [] -LEN= 2 or 3 -00 Idx - Idx 00 Set dead zone (0..2048) - Idx 01 Ignore Deadman sensor (0..1) - Idx 02 Enable comm watchdog (0..1) - Idx 03 Set the strength of the spring (0..100) - Idx 04 Enable or disable the spring (0/1) - Idx 05 Set axis saturation threshold (0..2048) - -**** Set Effect State **** -OP= 42 -LEN= 1 -00 State - Bit 3 Pause force feedback - Bit 2 Enable force feedback - Bit 0 Stop all effects - -**** Set overall gain **** -OP= 43 -LEN= 1 -00 Gain - Val 00 = 0% - Val 40 = 50% - Val 80 = 100% - -** Parameter memory ** - -Each device has a certain amount of memory to store parameters of effects. -The amount of RAM may vary, I encountered values from 200 to 1000 bytes. Below -is the amount of memory apparently needed for every set of parameters: - - period : 0c - - magnitude : 02 - - attack and fade : 0e - - interactive : 08 - -** Appendix: How to study the protocol ? ** - -1. Generate effects using the force editor provided with the DirectX SDK, or -use Immersion Studio (freely available at their web site in the developer section: -www.immersion.com) -2. Start a soft spying RS232 or USB (depending on where you connected your -joystick/wheel). I used ComPortSpy from fCoder (alpha version!) -3. 
Play the effect, and watch what happens on the spy screen.
-
-A few words about ComPortSpy:
-At first glance, this software seems, hum, well... buggy. In fact, data appear with a
-few seconds latency. Personally, I restart it every time I play an effect.
-Remember it's free (as in free beer) and alpha!
-
-** URLS **
-Check www.immerse.com for Immersion Studio, and www.fcoder.com for ComPortSpy.
-
-** Author of this document **
-Johann Deneux
-Home page at http://web.archive.org/web/*/http://www.esil.univ-mrs.fr
-
-Additions by Vojtech Pavlik.
-
-I-Force is trademark of Immersion Corp.
diff --git a/Documentation/input/index.rst b/Documentation/input/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7a3e71c2bd008c864f34d681a4e3ad68b9eaaaef
--- /dev/null
+++ b/Documentation/input/index.rst
@@ -0,0 +1,20 @@
+=============================
+The Linux Input Documentation
+=============================
+
+Contents:
+
+.. toctree::
+   :maxdepth: 2
+   :numbered:
+
+   input_uapi
+   input_kapi
+   devices/index
+
+.. only::  subproject and html
+
+   Indices
+   =======
+
+   * :ref:`genindex`
diff --git a/Documentation/input/input-programming.rst b/Documentation/input/input-programming.rst
new file mode 100644
index 0000000000000000000000000000000000000000..45a4c6e05e39bd753622e77db62c2aa74bfffbfb
--- /dev/null
+++ b/Documentation/input/input-programming.rst
@@ -0,0 +1,302 @@
+===============================
+Creating an input device driver
+===============================
+
+The simplest example
+~~~~~~~~~~~~~~~~~~~~
+
+Here comes a very simple example of an input device driver. The device has
+just one button and the button is accessible at i/o port BUTTON_PORT. When
+pressed or released, a BUTTON_IRQ happens. The driver could look like::
+
+    #include <linux/input.h>
+    #include <linux/module.h>
+    #include <linux/init.h>
+
+    #include <asm/irq.h>
+    #include <asm/io.h>
+
+    static struct input_dev *button_dev;
+
+    static irqreturn_t button_interrupt(int irq, void *dummy)
+    {
+        input_report_key(button_dev, BTN_0, inb(BUTTON_PORT) & 1);
+        input_sync(button_dev);
+        return IRQ_HANDLED;
+    }
+
+    static int __init button_init(void)
+    {
+        int error;
+
+        if (request_irq(BUTTON_IRQ, button_interrupt, 0, "button", NULL)) {
+            printk(KERN_ERR "button.c: Can't allocate irq %d\n", BUTTON_IRQ);
+            return -EBUSY;
+        }
+
+        button_dev = input_allocate_device();
+        if (!button_dev) {
+            printk(KERN_ERR "button.c: Not enough memory\n");
+            error = -ENOMEM;
+            goto err_free_irq;
+        }
+
+        button_dev->evbit[0] = BIT_MASK(EV_KEY);
+        button_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0);
+
+        error = input_register_device(button_dev);
+        if (error) {
+            printk(KERN_ERR "button.c: Failed to register device\n");
+            goto err_free_dev;
+        }
+
+        return 0;
+
+     err_free_dev:
+        input_free_device(button_dev);
+     err_free_irq:
+        free_irq(BUTTON_IRQ, button_interrupt);
+        return error;
+    }
+
+    static void __exit button_exit(void)
+    {
+        input_unregister_device(button_dev);
+        free_irq(BUTTON_IRQ, button_interrupt);
+    }
+
+    module_init(button_init);
+    module_exit(button_exit);
+
+What the example does
+~~~~~~~~~~~~~~~~~~~~~
+
+First it has to include the <linux/input.h> file, which interfaces to the
+input subsystem. This provides all the definitions needed.
+
+In the _init function, which is called either upon module load or when
+booting the kernel, it grabs the required resources (it should also check
+for the presence of the device).
+
+Then it allocates a new input device structure with input_allocate_device()
+and sets up input bitfields.
This way the device driver tells the other
+parts of the input system what it is - what events can be generated or
+accepted by this input device. Our example device can only generate EV_KEY
+type events, and from those only the BTN_0 event code. Thus we only set these
+two bits. We could have used::
+
+    set_bit(EV_KEY, button_dev->evbit);
+    set_bit(BTN_0, button_dev->keybit);
+
+as well, but with more than single bits the first approach tends to be
+shorter.
+
+Then the example driver registers the input device structure by calling::
+
+    input_register_device(button_dev);
+
+This adds the button_dev structure to linked lists of the input driver and
+calls the device handler modules' _connect functions to tell them a new input
+device has appeared. input_register_device() may sleep and therefore must
+not be called from an interrupt or with a spinlock held.
+
+While in use, the only function of the driver that does any work is::
+
+    button_interrupt()
+
+which upon every interrupt from the button checks its state and reports it
+via the::
+
+    input_report_key()
+
+call to the input system. There is no need to check whether the interrupt
+routine is reporting the same value twice (press, press for example) to
+the input system, because the input_report_* functions check that
+themselves.
+
+Then there is the::
+
+    input_sync()
+
+call to tell those who receive the events that we've sent a complete report.
+This doesn't seem important in the one button case, but is quite important
+for, for example, mouse movement, where you don't want the X and Y values
+to be interpreted separately, because that'd result in a different movement.
+
+dev->open() and dev->close()
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In case the driver has to repeatedly poll the device, because it doesn't
+have an interrupt coming from it and the polling is too expensive to be done
+all the time, or if the device uses a valuable resource (e.g. an interrupt), it
+can use the open and close callback to know when it can stop polling or
+release the interrupt and when it must resume polling or grab the interrupt
+again. To do that, we would add this to our example driver::
+
+    static int button_open(struct input_dev *dev)
+    {
+        if (request_irq(BUTTON_IRQ, button_interrupt, 0, "button", NULL)) {
+            printk(KERN_ERR "button.c: Can't allocate irq %d\n", BUTTON_IRQ);
+            return -EBUSY;
+        }
+
+        return 0;
+    }
+
+    static void button_close(struct input_dev *dev)
+    {
+        free_irq(BUTTON_IRQ, button_interrupt);
+    }
+
+    static int __init button_init(void)
+    {
+        ...
+        button_dev->open = button_open;
+        button_dev->close = button_close;
+        ...
+    }
+
+Note that the input core keeps track of the number of users for the device and
+makes sure that dev->open() is called only when the first user connects
+to the device and that dev->close() is called when the very last user
+disconnects. Calls to both callbacks are serialized.
+
+The open() callback should return 0 in case of success or any nonzero value
+in case of failure. The close() callback (which is void) must always succeed.
+
+Basic event types
+~~~~~~~~~~~~~~~~~
+
+The simplest event type is EV_KEY, which is used for keys and buttons.
+It's reported to the input system via::
+
+    input_report_key(struct input_dev *dev, int code, int value)
+
+See uapi/linux/input-event-codes.h for the allowable values of code (from 0 to
+KEY_MAX). Value is interpreted as a truth value, i.e. any nonzero value means key
+pressed, zero value means key released. The input code generates events only
+in case the value is different from before.
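+
+As a minimal sketch of this property (read_button_state() is a made-up
+helper standing in for the actual hardware access), a polling routine may
+simply report the current state on every pass and let the input core
+suppress the duplicates::
+
+    static void button_poll(struct input_dev *dev)
+    {
+        /* Reporting an unchanged value is harmless: the input core
+           only generates an event when the state actually changes. */
+        input_report_key(dev, BTN_0, read_button_state());
+        input_sync(dev);
+    }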
+
+In addition to EV_KEY, there are two more basic event types: EV_REL and
+EV_ABS. They are used for relative and absolute values supplied by the
+device. A relative value may be for example a mouse movement in the X axis.
+The mouse reports it as a relative difference from the last position,
+because it doesn't have any absolute coordinate system to work in. Absolute
+events are mainly for joysticks and digitizers - devices that do work in
+absolute coordinate systems.
+
+Having the device report EV_REL events is as simple as with EV_KEY; simply
+set the corresponding bits and call the::
+
+    input_report_rel(struct input_dev *dev, int code, int value)
+
+function. Events are generated only for nonzero values.
+
+However EV_ABS requires a little special care. Before calling
+input_register_device, you have to fill additional fields in the input_dev
+struct for each absolute axis your device has. If our button device also had
+the ABS_X axis::
+
+    button_dev->absmin[ABS_X] = 0;
+    button_dev->absmax[ABS_X] = 255;
+    button_dev->absfuzz[ABS_X] = 4;
+    button_dev->absflat[ABS_X] = 8;
+
+Or, you can just say::
+
+    input_set_abs_params(button_dev, ABS_X, 0, 255, 4, 8);
+
+This setting would be appropriate for a joystick X axis, with the minimum of
+0, maximum of 255 (which the joystick *must* be able to reach, no problem if
+it sometimes reports more, but it must be able to always reach the min and
+max values), with noise in the data up to +- 4, and with a center flat
+position of size 8.
+
+If you don't need absfuzz and absflat, you can set them to zero, which means
+that the thing is precise and always returns to exactly the center position
+(if it has any).
+
+BITS_TO_LONGS(), BIT_WORD(), BIT_MASK()
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These three macros from bitops.h help with some bitfield computations::
+
+    BITS_TO_LONGS(x) - returns the length of a bitfield array in longs for
+                       x bits
+    BIT_WORD(x)      - returns the index in the array in longs for bit x
+    BIT_MASK(x)      - returns the mask for bit x within a long
+
+The id* and name fields
+~~~~~~~~~~~~~~~~~~~~~~~
+
+The dev->name should be set before registering the input device by the input
+device driver. It's a string like 'Generic button device' containing a
+user-friendly name of the device.
+
+The id* fields contain the bus ID (PCI, USB, ...), vendor ID and device ID
+of the device. The bus IDs are defined in input.h. The vendor and device ids
+are defined in pci_ids.h, usb_ids.h and similar include files. These fields
+should be set by the input device driver before registering it.
+
+The idtype field can be used for specific information for the input device
+driver.
+
+The id and name fields can be passed to userland via the evdev interface.
+
+The keycode, keycodemax, keycodesize fields
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These three fields should be used by input devices that have dense keymaps.
+The keycode is an array used to map from scancodes to input system keycodes.
+The keycodemax should contain the size of the array and keycodesize the
+size of each entry in it (in bytes).
+
+Userspace can query and alter current scancode to keycode mappings using
+EVIOCGKEYCODE and EVIOCSKEYCODE ioctls on the corresponding evdev interface.
+When a device has all 3 aforementioned fields filled in, the driver may
+rely on the kernel's default implementation of setting and querying keycode
+mappings.
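+
+As an illustrative sketch (the scancode-to-keycode table below is invented
+for the example), a driver with a dense keymap could set these fields up
+before registering the device::
+
+    static unsigned short example_keymap[] = {
+        KEY_A, KEY_B, KEY_ENTER, KEY_ESC,    /* scancodes 0-3 */
+    };
+    int i;
+
+    button_dev->keycode = example_keymap;
+    button_dev->keycodesize = sizeof(example_keymap[0]);
+    button_dev->keycodemax = ARRAY_SIZE(example_keymap);
+
+    /* the mapped keycodes still have to be advertised in keybit */
+    for (i = 0; i < ARRAY_SIZE(example_keymap); i++)
+        __set_bit(example_keymap[i], button_dev->keybit);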
+ +dev->getkeycode() and dev->setkeycode() +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +getkeycode() and setkeycode() callbacks allow drivers to override default +keycode/keycodesize/keycodemax mapping mechanism provided by input core +and implement sparse keycode maps. + +Key autorepeat +~~~~~~~~~~~~~~ + +... is simple. It is handled by the input.c module. Hardware autorepeat is +not used, because it's not present in many devices and even where it is +present, it is broken sometimes (at keyboards: Toshiba notebooks). To enable +autorepeat for your device, just set EV_REP in dev->evbit. All will be +handled by the input system. + +Other event types, handling output events +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The other event types up to now are: + +- EV_LED - used for the keyboard LEDs. +- EV_SND - used for keyboard beeps. + +They are very similar to for example key events, but they go in the other +direction - from the system to the input device driver. If your input device +driver can handle these events, it has to set the respective bits in evbit, +*and* also the callback routine:: + + button_dev->event = button_event; + + int button_event(struct input_dev *dev, unsigned int type, + unsigned int code, int value) + { + if (type == EV_SND && code == SND_BELL) { + outb(value, BUTTON_BELL); + return 0; + } + return -1; + } + +This callback routine can be called from an interrupt or a BH (although that +isn't a rule), and thus must not sleep, and must not take too long to finish. diff --git a/Documentation/input/input-programming.txt b/Documentation/input/input-programming.txt deleted file mode 100644 index 7f8b9d97bc47d32359150822eaa16b23acfc13be..0000000000000000000000000000000000000000 --- a/Documentation/input/input-programming.txt +++ /dev/null @@ -1,302 +0,0 @@ -Programming input drivers -~~~~~~~~~~~~~~~~~~~~~~~~~ - -1. Creating an input device driver -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -1.0 The simplest example -~~~~~~~~~~~~~~~~~~~~~~~~ - -Here comes a very simple example of an input device driver. The device has -just one button and the button is accessible at i/o port BUTTON_PORT. When -pressed or released a BUTTON_IRQ happens. 
The driver could look like: - -#include -#include -#include - -#include -#include - -static struct input_dev *button_dev; - -static irqreturn_t button_interrupt(int irq, void *dummy) -{ - input_report_key(button_dev, BTN_0, inb(BUTTON_PORT) & 1); - input_sync(button_dev); - return IRQ_HANDLED; -} - -static int __init button_init(void) -{ - int error; - - if (request_irq(BUTTON_IRQ, button_interrupt, 0, "button", NULL)) { - printk(KERN_ERR "button.c: Can't allocate irq %d\n", button_irq); - return -EBUSY; - } - - button_dev = input_allocate_device(); - if (!button_dev) { - printk(KERN_ERR "button.c: Not enough memory\n"); - error = -ENOMEM; - goto err_free_irq; - } - - button_dev->evbit[0] = BIT_MASK(EV_KEY); - button_dev->keybit[BIT_WORD(BTN_0)] = BIT_MASK(BTN_0); - - error = input_register_device(button_dev); - if (error) { - printk(KERN_ERR "button.c: Failed to register device\n"); - goto err_free_dev; - } - - return 0; - - err_free_dev: - input_free_device(button_dev); - err_free_irq: - free_irq(BUTTON_IRQ, button_interrupt); - return error; -} - -static void __exit button_exit(void) -{ - input_unregister_device(button_dev); - free_irq(BUTTON_IRQ, button_interrupt); -} - -module_init(button_init); -module_exit(button_exit); - -1.1 What the example does -~~~~~~~~~~~~~~~~~~~~~~~~~ - -First it has to include the file, which interfaces to the -input subsystem. This provides all the definitions needed. - -In the _init function, which is called either upon module load or when -booting the kernel, it grabs the required resources (it should also check -for the presence of the device). - -Then it allocates a new input device structure with input_allocate_device() -and sets up input bitfields. This way the device driver tells the other -parts of the input systems what it is - what events can be generated or -accepted by this input device. Our example device can only generate EV_KEY -type events, and from those only BTN_0 event code. Thus we only set these -two bits. We could have used - - set_bit(EV_KEY, button_dev.evbit); - set_bit(BTN_0, button_dev.keybit); - -as well, but with more than single bits the first approach tends to be -shorter. - -Then the example driver registers the input device structure by calling - - input_register_device(&button_dev); - -This adds the button_dev structure to linked lists of the input driver and -calls device handler modules _connect functions to tell them a new input -device has appeared. input_register_device() may sleep and therefore must -not be called from an interrupt or with a spinlock held. - -While in use, the only used function of the driver is - - button_interrupt() - -which upon every interrupt from the button checks its state and reports it -via the - - input_report_key() - -call to the input system. There is no need to check whether the interrupt -routine isn't reporting two same value events (press, press for example) to -the input system, because the input_report_* functions check that -themselves. - -Then there is the - - input_sync() - -call to tell those who receive the events that we've sent a complete report. -This doesn't seem important in the one button case, but is quite important -for for example mouse movement, where you don't want the X and Y values -to be interpreted separately, because that'd result in a different movement. 
- -1.2 dev->open() and dev->close() -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -In case the driver has to repeatedly poll the device, because it doesn't -have an interrupt coming from it and the polling is too expensive to be done -all the time, or if the device uses a valuable resource (eg. interrupt), it -can use the open and close callback to know when it can stop polling or -release the interrupt and when it must resume polling or grab the interrupt -again. To do that, we would add this to our example driver: - -static int button_open(struct input_dev *dev) -{ - if (request_irq(BUTTON_IRQ, button_interrupt, 0, "button", NULL)) { - printk(KERN_ERR "button.c: Can't allocate irq %d\n", button_irq); - return -EBUSY; - } - - return 0; -} - -static void button_close(struct input_dev *dev) -{ - free_irq(IRQ_AMIGA_VERTB, button_interrupt); -} - -static int __init button_init(void) -{ - ... - button_dev->open = button_open; - button_dev->close = button_close; - ... -} - -Note that input core keeps track of number of users for the device and -makes sure that dev->open() is called only when the first user connects -to the device and that dev->close() is called when the very last user -disconnects. Calls to both callbacks are serialized. - -The open() callback should return a 0 in case of success or any nonzero value -in case of failure. The close() callback (which is void) must always succeed. - -1.3 Basic event types -~~~~~~~~~~~~~~~~~~~~~ - -The most simple event type is EV_KEY, which is used for keys and buttons. -It's reported to the input system via: - - input_report_key(struct input_dev *dev, int code, int value) - -See linux/input.h for the allowable values of code (from 0 to KEY_MAX). -Value is interpreted as a truth value, ie any nonzero value means key -pressed, zero value means key released. The input code generates events only -in case the value is different from before. - -In addition to EV_KEY, there are two more basic event types: EV_REL and -EV_ABS. They are used for relative and absolute values supplied by the -device. A relative value may be for example a mouse movement in the X axis. -The mouse reports it as a relative difference from the last position, -because it doesn't have any absolute coordinate system to work in. Absolute -events are namely for joysticks and digitizers - devices that do work in an -absolute coordinate systems. - -Having the device report EV_REL buttons is as simple as with EV_KEY, simply -set the corresponding bits and call the - - input_report_rel(struct input_dev *dev, int code, int value) - -function. Events are generated only for nonzero value. - -However EV_ABS requires a little special care. Before calling -input_register_device, you have to fill additional fields in the input_dev -struct for each absolute axis your device has. If our button device had also -the ABS_X axis: - - button_dev.absmin[ABS_X] = 0; - button_dev.absmax[ABS_X] = 255; - button_dev.absfuzz[ABS_X] = 4; - button_dev.absflat[ABS_X] = 8; - -Or, you can just say: - - input_set_abs_params(button_dev, ABS_X, 0, 255, 4, 8); - -This setting would be appropriate for a joystick X axis, with the minimum of -0, maximum of 255 (which the joystick *must* be able to reach, no problem if -it sometimes reports more, but it must be able to always reach the min and -max values), with noise in the data up to +- 4, and with a center flat -position of size 8. 
-
-If you don't need absfuzz and absflat, you can set them to zero, which means
-that the thing is precise and always returns to exactly the center position
-(if it has any).
-
-1.4 BITS_TO_LONGS(), BIT_WORD(), BIT_MASK()
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-These three macros from bitops.h help with some bitfield computations:
-
-	BITS_TO_LONGS(x) - returns the length of a bitfield array in longs
-			   for x bits
-	BIT_WORD(x)	 - returns the index in the array in longs for bit x
-	BIT_MASK(x)	 - returns the mask for bit x within its long
-
-1.5 The id* and name fields
-~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The dev->name should be set before registering the input device by the input
-device driver. It's a string like 'Generic button device' containing a
-user-friendly name of the device.
-
-The id* fields contain the bus ID (PCI, USB, ...), vendor ID and device ID
-of the device. The bus IDs are defined in input.h. The vendor and device IDs
-are defined in pci_ids.h, usb_ids.h and similar include files. These fields
-should be set by the input device driver before registering it.
-
-The idtype field can be used for information specific to the input device
-driver.
-
-The id and name fields can be passed to userland via the evdev interface.
-
-1.6 The keycode, keycodemax, keycodesize fields
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-These three fields should be used by input devices that have dense keymaps.
-The keycode is an array used to map from scancodes to input system keycodes.
-The keycodemax should contain the size of the array and keycodesize the
-size of each entry in it (in bytes).
-
-Userspace can query and alter current scancode to keycode mappings using
-EVIOCGKEYCODE and EVIOCSKEYCODE ioctls on the corresponding evdev interface.
-When a device has all 3 aforementioned fields filled in, the driver may
-rely on the kernel's default implementation of setting and querying keycode
-mappings.
-
-1.7 dev->getkeycode() and dev->setkeycode()
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-getkeycode() and setkeycode() callbacks allow drivers to override the
-default keycode/keycodesize/keycodemax mapping mechanism provided by the
-input core and implement sparse keycode maps.
-
-1.8 Key autorepeat
-~~~~~~~~~~~~~~~~~~
-
-... is simple. It is handled by the input.c module. Hardware autorepeat is
-not used, because it's not present in many devices and even where it is
-present, it is sometimes broken (keyboards of some Toshiba notebooks, for
-example). To enable autorepeat for your device, just set EV_REP in
-dev->evbit. All will be handled by the input system.
-
-1.9 Other event types, handling output events
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-The other event types defined so far are:
-
-EV_LED - used for the keyboard LEDs.
-EV_SND - used for keyboard beeps.
-
-They are very similar to key events, but they go in the other direction -
-from the system to the input device driver. If your input device driver can
-handle these events, it has to set the respective bits in evbit, *and* also
-the callback routine:
-
-	button_dev->event = button_event;
-
-int button_event(struct input_dev *dev, unsigned int type,
-		 unsigned int code, int value)
-{
-	if (type == EV_SND && code == SND_BELL) {
-		outb(value, BUTTON_BELL);
-		return 0;
-	}
-	return -1;
-}
-
-This callback routine can be called from an interrupt or a BH (although that
-isn't a rule), and thus must not sleep, and must not take too long to finish.
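As a sketch of the dense keymap fields from section 1.6 (my_dev and
my_keymap are hypothetical names; the scancode here is simply the index
into the array):

	static unsigned short my_keymap[] = { KEY_A, KEY_B, KEY_ENTER };

	my_dev->keycode = my_keymap;
	my_dev->keycodesize = sizeof(my_keymap[0]);
	my_dev->keycodemax = ARRAY_SIZE(my_keymap);

	/* advertise every key the keymap can produce */
	for (i = 0; i < ARRAY_SIZE(my_keymap); i++)
		set_bit(my_keymap[i], my_dev->keybit);
	set_bit(EV_KEY, my_dev->evbit);

With all three fields filled in, the EVIOCGKEYCODE and EVIOCSKEYCODE ioctls
can be serviced by the input core's default implementation, with no
driver-specific getkeycode()/setkeycode() callbacks.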
diff --git a/Documentation/input/input.rst b/Documentation/input/input.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3b3a22975106987781eda18172728a3884c59da5
--- /dev/null
+++ b/Documentation/input/input.rst
@@ -0,0 +1,281 @@
+.. include:: <isonum.txt>
+
+============
+Introduction
+============
+
+:Copyright: |copy| 1999-2001 Vojtech Pavlik - Sponsored by SuSE
+
+Architecture
+============
+
+The input subsystem is a collection of drivers that is designed to support
+all input devices under Linux. Most of the drivers reside in
+drivers/input, although quite a few live in drivers/hid and
+drivers/platform.
+
+The core of the input subsystem is the input module, which must be
+loaded before any other of the input modules - it serves as a way of
+communication between two groups of modules:
+
+Device drivers
+--------------
+
+These modules talk to the hardware (for example via USB), and provide
+events (keystrokes, mouse movements) to the input module.
+
+Event handlers
+--------------
+
+These modules get events from the input core and pass them where needed
+via various interfaces - keystrokes to the kernel, mouse movements via
+a simulated PS/2 interface to GPM and X, and so on.
+
+Simple Usage
+============
+
+For the most usual configuration, with one USB mouse and one USB keyboard,
+you'll have to load the following modules (or have them built into the
+kernel)::
+
+	input
+	mousedev
+	usbcore
+	uhci_hcd or ohci_hcd or ehci_hcd
+	usbhid
+	hid_generic
+
+After this, the USB keyboard will work straight away, and the USB mouse
+will be available as a character device on major 13, minor 63::
+
+	crw-r--r--  1 root  root  13, 63 Mar 28 22:45 mice
+
+This device is usually created automatically by the system. The commands
+to create it by hand are::
+
+	cd /dev
+	mkdir input
+	mknod input/mice c 13 63
+
+After that you have to point GPM (the textmode mouse cut&paste tool) and
+XFree to this device to use it - GPM should be called like::
+
+	gpm -t ps2 -m /dev/input/mice
+
+And in X::
+
+	Section "Pointer"
+	    Protocol    "ImPS/2"
+	    Device      "/dev/input/mice"
+	    ZAxisMapping 4 5
+	EndSection
+
+When you do all of the above, you can use your USB mouse and keyboard.
+
+Detailed Description
+====================
+
+Event handlers
+--------------
+
+Event handlers distribute the events from the devices to userspace and
+in-kernel consumers, as needed.
+
+evdev
+~~~~~
+
+``evdev`` is the generic input event interface. It passes the events
+generated in the kernel straight to the program, with timestamps. The
+event codes are the same on all architectures and are hardware
+independent.
+
+This is the preferred interface for userspace to consume user
+input, and all clients are encouraged to use it.
+
+See :ref:`event-interface` for notes on the API.
+
+The devices are in /dev/input::
+
+	crw-r--r--  1 root  root  13, 64 Apr  1 10:49 event0
+	crw-r--r--  1 root  root  13, 65 Apr  1 10:50 event1
+	crw-r--r--  1 root  root  13, 66 Apr  1 10:50 event2
+	crw-r--r--  1 root  root  13, 67 Apr  1 10:50 event3
+	...
+
+There are two ranges of minors: 64 through 95 is the static legacy
+range. If there are more than 32 input devices in a system, additional
+evdev nodes are created with minors starting with 256.
+
+keyboard
+~~~~~~~~
+
+``keyboard`` is an in-kernel input handler and is a part of the VT code. It
+consumes keyboard keystrokes and handles user input for VT consoles.
+
+mousedev
+~~~~~~~~
+
+``mousedev`` is a hack to make legacy programs that use mouse input
+work.
It takes events from either mice or digitizers/tablets and makes
+a PS/2-style (a la /dev/psaux) mouse device available to the
+userland.
+
+Mousedev devices in /dev/input (as shown above) are::
+
+	crw-r--r--  1 root  root  13, 32 Mar 28 22:45 mouse0
+	crw-r--r--  1 root  root  13, 33 Mar 29 00:41 mouse1
+	crw-r--r--  1 root  root  13, 34 Mar 29 00:41 mouse2
+	crw-r--r--  1 root  root  13, 35 Apr  1 10:50 mouse3
+	...
+	...
+	crw-r--r--  1 root  root  13, 62 Apr  1 10:50 mouse30
+	crw-r--r--  1 root  root  13, 63 Apr  1 10:50 mice
+
+Each ``mouse`` device is assigned to a single mouse or digitizer, except
+the last one - ``mice``. This single character device is shared by all
+mice and digitizers, and even if none are connected, the device is
+present. This is useful for hotplugging USB mice, so that older programs
+that do not handle hotplug can open the device even when no mice are
+present.
+
+CONFIG_INPUT_MOUSEDEV_SCREEN_[XY] in the kernel configuration specify
+the size of your screen (in pixels) in XFree86. This is needed if you
+want to use your digitizer in X, because its movement is sent to X
+via a virtual PS/2 mouse and thus needs to be scaled
+accordingly. These values won't be used if you use a mouse only.
+
+Mousedev will generate either PS/2, ImPS/2 (Microsoft IntelliMouse) or
+ExplorerPS/2 (IntelliMouse Explorer) protocols, depending on what the
+program reading the data wishes. You can set GPM and X to any of
+these. You'll need ImPS/2 if you want to make use of a wheel on a USB
+mouse and ExplorerPS/2 if you want to use extra (up to 5) buttons.
+
+joydev
+~~~~~~
+
+``joydev`` implements the v0.x and v1.x Linux joystick API. See
+:ref:`joystick-api` for details.
+
+As soon as any joystick is connected, it can be accessed in /dev/input on::
+
+	crw-r--r--  1 root  root  13,  0 Apr  1 10:50 js0
+	crw-r--r--  1 root  root  13,  1 Apr  1 10:50 js1
+	crw-r--r--  1 root  root  13,  2 Apr  1 10:50 js2
+	crw-r--r--  1 root  root  13,  3 Apr  1 10:50 js3
+	...
+
+And so on up to js31 in the legacy range, and additional nodes with minors
+above 256 if there are more joystick devices.
+
+Device drivers
+--------------
+
+Device drivers are the modules that generate events.
+
+hid-generic
+~~~~~~~~~~~
+
+``hid-generic`` is one of the largest and most complex drivers of the
+whole suite. It handles all HID devices, and because there is a very
+wide variety of them, and because the USB HID specification isn't
+simple, it needs to be this big.
+
+Currently, it handles USB mice, joysticks, gamepads, steering wheels,
+keyboards, trackballs and digitizers.
+
+However, USB uses HID also for monitor controls, speaker controls, UPSs,
+LCDs and many other purposes.
+
+The monitor and speaker controls should be easy to add to the hid/input
+interface, but for the UPSs and LCDs it doesn't make much sense. For this,
+the hiddev interface was designed. See Documentation/hid/hiddev.txt
+for more information about it.
+
+The usage of the usbhid module is very simple: it takes no parameters,
+detects everything automatically, and when a HID device is inserted, it
+handles it appropriately.
+
+However, because the devices vary wildly, you might happen to have a
+device that doesn't work well. In that case #define DEBUG at the beginning
+of hid-core.c and send me the syslog traces.
+
+usbmouse
+~~~~~~~~
+
+For embedded systems, for mice with broken HID descriptors and for any
+other case where the big usbhid wouldn't be a good choice, there is the
+usbmouse driver. It handles USB mice only. It uses a simpler HIDBP
+protocol.
This also means the mice must support this simpler protocol. Not
+all do. If you don't have any strong reason to use this module, use usbhid
+instead.
+
+usbkbd
+~~~~~~
+
+Much like usbmouse, this module talks to keyboards with a simplified
+HIDBP protocol. It's smaller, but doesn't support any extra special keys.
+Use usbhid instead if there isn't any special reason to use this.
+
+psmouse
+~~~~~~~
+
+This is a driver for all flavors of pointing devices using the PS/2
+protocol, including Synaptics and ALPS touchpads, Intellimouse
+Explorer devices, Logitech PS/2 mice and so on.
+
+atkbd
+~~~~~
+
+This is a driver for PS/2 (AT) keyboards.
+
+iforce
+~~~~~~
+
+A driver for I-Force joysticks and wheels, both over USB and RS232.
+It includes Force Feedback support now, even though Immersion
+Corp. considers the protocol a trade secret and won't disclose a word
+about it.
+
+Verifying if it works
+=====================
+
+Typing a couple of keys on the keyboard should be enough to check that
+a keyboard works and is correctly connected to the kernel keyboard
+driver.
+
+Doing a ``cat /dev/input/mouse0`` (c, 13, 32) will verify that a mouse
+is also emulated; characters should appear if you move it.
+
+You can test the joystick emulation with the ``jstest`` utility,
+available in the joystick package (see :ref:`joystick-doc`).
+
+You can test the event devices with the ``evtest`` utility.
+
+.. _event-interface:
+
+Event interface
+===============
+
+You can use blocking and nonblocking reads, and also select() on the
+/dev/input/eventX devices, and you'll always get a whole number of input
+events on a read. Their layout is::
+
+	struct input_event {
+		struct timeval time;
+		unsigned short type;
+		unsigned short code;
+		unsigned int value;
+	};
+
+``time`` is the timestamp; it holds the time at which the event happened.
+Type is for example EV_REL for relative movement, EV_KEY for a keypress or
+release. More types are defined in include/uapi/linux/input-event-codes.h.
+
+``code`` is the event code, for example REL_X or KEY_BACKSPACE; again, a
+complete list is in include/uapi/linux/input-event-codes.h.
+
+``value`` is the value the event carries. Either a relative change for
+EV_REL, an absolute new value for EV_ABS (joysticks ...), or 0 for EV_KEY
+for release, 1 for keypress and 2 for autorepeat.
+
+See :ref:`input-event-codes` for more information about the various event
+codes.
diff --git a/Documentation/input/input.txt b/Documentation/input/input.txt
deleted file mode 100644
index 7ebce100fe905316590096f9e9aa2a8e5c604f97..0000000000000000000000000000000000000000
--- a/Documentation/input/input.txt
+++ /dev/null
@@ -1,290 +0,0 @@
-	Linux Input drivers v1.0
-	(c) 1999-2001 Vojtech Pavlik - Sponsored by SuSE
-----------------------------------------------------------------------------
-
-0. Disclaimer
-~~~~~~~~~~~~~
-  This program is free software; you can redistribute it and/or modify it
-under the terms of the GNU General Public License as published by the Free
-Software Foundation; either version 2 of the License, or (at your option)
-any later version.
-
-  This program is distributed in the hope that it will be useful, but
-WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
-or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
-more details.
- - You should have received a copy of the GNU General Public License along -with this program; if not, write to the Free Software Foundation, Inc., 59 -Temple Place, Suite 330, Boston, MA 02111-1307 USA - - Should you need to contact me, the author, you can do so either by e-mail -- mail your message to , or by paper mail: Vojtech Pavlik, -Simunkova 1594, Prague 8, 182 00 Czech Republic - - For your convenience, the GNU General Public License version 2 is included -in the package: See the file COPYING. - -1. Introduction -~~~~~~~~~~~~~~~ - This is a collection of drivers that is designed to support all input -devices under Linux. While it is currently used only on for USB input -devices, future use (say 2.5/2.6) is expected to expand to replace -most of the existing input system, which is why it lives in -drivers/input/ instead of drivers/usb/. - - The centre of the input drivers is the input module, which must be -loaded before any other of the input modules - it serves as a way of -communication between two groups of modules: - -1.1 Device drivers -~~~~~~~~~~~~~~~~~~ - These modules talk to the hardware (for example via USB), and provide -events (keystrokes, mouse movements) to the input module. - -1.2 Event handlers -~~~~~~~~~~~~~~~~~~ - These modules get events from input and pass them where needed via -various interfaces - keystrokes to the kernel, mouse movements via a -simulated PS/2 interface to GPM and X and so on. - -2. Simple Usage -~~~~~~~~~~~~~~~ - For the most usual configuration, with one USB mouse and one USB keyboard, -you'll have to load the following modules (or have them built in to the -kernel): - - input - mousedev - keybdev - usbcore - uhci_hcd or ohci_hcd or ehci_hcd - usbhid - - After this, the USB keyboard will work straight away, and the USB mouse -will be available as a character device on major 13, minor 63: - - crw-r--r-- 1 root root 13, 63 Mar 28 22:45 mice - - This device has to be created. - The commands to create it by hand are: - - cd /dev - mkdir input - mknod input/mice c 13 63 - - After that you have to point GPM (the textmode mouse cut&paste tool) and -XFree to this device to use it - GPM should be called like: - - gpm -t ps2 -m /dev/input/mice - - And in X: - - Section "Pointer" - Protocol "ImPS/2" - Device "/dev/input/mice" - ZAxisMapping 4 5 - EndSection - - When you do all of the above, you can use your USB mouse and keyboard. - -3. Detailed Description -~~~~~~~~~~~~~~~~~~~~~~~ -3.1 Device drivers -~~~~~~~~~~~~~~~~~~ - Device drivers are the modules that generate events. The events are -however not useful without being handled, so you also will need to use some -of the modules from section 3.2. - -3.1.1 usbhid -~~~~~~~~~~~~ - usbhid is the largest and most complex driver of the whole suite. It -handles all HID devices, and because there is a very wide variety of them, -and because the USB HID specification isn't simple, it needs to be this big. - - Currently, it handles USB mice, joysticks, gamepads, steering wheels -keyboards, trackballs and digitizers. - - However, USB uses HID also for monitor controls, speaker controls, UPSs, -LCDs and many other purposes. - - The monitor and speaker controls should be easy to add to the hid/input -interface, but for the UPSs and LCDs it doesn't make much sense. For this, -the hiddev interface was designed. See Documentation/hid/hiddev.txt -for more information about it. 
- - The usage of the usbhid module is very simple, it takes no parameters, -detects everything automatically and when a HID device is inserted, it -detects it appropriately. - - However, because the devices vary wildly, you might happen to have a -device that doesn't work well. In that case #define DEBUG at the beginning -of hid-core.c and send me the syslog traces. - -3.1.2 usbmouse -~~~~~~~~~~~~~~ - For embedded systems, for mice with broken HID descriptors and just any -other use when the big usbhid wouldn't be a good choice, there is the -usbmouse driver. It handles USB mice only. It uses a simpler HIDBP -protocol. This also means the mice must support this simpler protocol. Not -all do. If you don't have any strong reason to use this module, use usbhid -instead. - -3.1.3 usbkbd -~~~~~~~~~~~~ - Much like usbmouse, this module talks to keyboards with a simplified -HIDBP protocol. It's smaller, but doesn't support any extra special keys. -Use usbhid instead if there isn't any special reason to use this. - -3.1.4 wacom -~~~~~~~~~~~ - This is a driver for Wacom Graphire and Intuos tablets. Not for Wacom -PenPartner, that one is handled by the HID driver. Although the Intuos and -Graphire tablets claim that they are HID tablets as well, they are not and -thus need this specific driver. - -3.1.5 iforce -~~~~~~~~~~~~ - A driver for I-Force joysticks and wheels, both over USB and RS232. -It includes ForceFeedback support now, even though Immersion -Corp. considers the protocol a trade secret and won't disclose a word -about it. - -3.2 Event handlers -~~~~~~~~~~~~~~~~~~ - Event handlers distribute the events from the devices to userland and -kernel, as needed. - -3.2.1 keybdev -~~~~~~~~~~~~~ - keybdev is currently a rather ugly hack that translates the input -events into architecture-specific keyboard raw mode (Xlated AT Set2 on -x86), and passes them into the handle_scancode function of the -keyboard.c module. This works well enough on all architectures that -keybdev can generate rawmode on, other architectures can be added to -it. - - The right way would be to pass the events to keyboard.c directly, -best if keyboard.c would itself be an event handler. This is done in -the input patch, available on the webpage mentioned below. - -3.2.2 mousedev -~~~~~~~~~~~~~~ - mousedev is also a hack to make programs that use mouse input -work. It takes events from either mice or digitizers/tablets and makes -a PS/2-style (a la /dev/psaux) mouse device available to the -userland. Ideally, the programs could use a more reasonable interface, -for example evdev - - Mousedev devices in /dev/input (as shown above) are: - - crw-r--r-- 1 root root 13, 32 Mar 28 22:45 mouse0 - crw-r--r-- 1 root root 13, 33 Mar 29 00:41 mouse1 - crw-r--r-- 1 root root 13, 34 Mar 29 00:41 mouse2 - crw-r--r-- 1 root root 13, 35 Apr 1 10:50 mouse3 - ... - ... - crw-r--r-- 1 root root 13, 62 Apr 1 10:50 mouse30 - crw-r--r-- 1 root root 13, 63 Apr 1 10:50 mice - -Each 'mouse' device is assigned to a single mouse or digitizer, except -the last one - 'mice'. This single character device is shared by all -mice and digitizers, and even if none are connected, the device is -present. This is useful for hotplugging USB mice, so that programs -can open the device even when no mice are present. - - CONFIG_INPUT_MOUSEDEV_SCREEN_[XY] in the kernel configuration are -the size of your screen (in pixels) in XFree86. 
This is needed if you -want to use your digitizer in X, because its movement is sent to X -via a virtual PS/2 mouse and thus needs to be scaled -accordingly. These values won't be used if you use a mouse only. - - Mousedev will generate either PS/2, ImPS/2 (Microsoft IntelliMouse) or -ExplorerPS/2 (IntelliMouse Explorer) protocols, depending on what the -program reading the data wishes. You can set GPM and X to any of -these. You'll need ImPS/2 if you want to make use of a wheel on a USB -mouse and ExplorerPS/2 if you want to use extra (up to 5) buttons. - -3.2.3 joydev -~~~~~~~~~~~~ - Joydev implements v0.x and v1.x Linux joystick api, much like -drivers/char/joystick/joystick.c used to in earlier versions. See -joystick-api.txt in the Documentation subdirectory for details. As -soon as any joystick is connected, it can be accessed in /dev/input -on: - - crw-r--r-- 1 root root 13, 0 Apr 1 10:50 js0 - crw-r--r-- 1 root root 13, 1 Apr 1 10:50 js1 - crw-r--r-- 1 root root 13, 2 Apr 1 10:50 js2 - crw-r--r-- 1 root root 13, 3 Apr 1 10:50 js3 - ... - -And so on up to js31. - -3.2.4 evdev -~~~~~~~~~~~ - evdev is the generic input event interface. It passes the events -generated in the kernel straight to the program, with timestamps. The -API is still evolving, but should be usable now. It's described in -section 5. - - This should be the way for GPM and X to get keyboard and mouse -events. It allows for multihead in X without any specific multihead -kernel support. The event codes are the same on all architectures and -are hardware independent. - - The devices are in /dev/input: - - crw-r--r-- 1 root root 13, 64 Apr 1 10:49 event0 - crw-r--r-- 1 root root 13, 65 Apr 1 10:50 event1 - crw-r--r-- 1 root root 13, 66 Apr 1 10:50 event2 - crw-r--r-- 1 root root 13, 67 Apr 1 10:50 event3 - ... - -And so on up to event31. - -4. Verifying if it works -~~~~~~~~~~~~~~~~~~~~~~~~ - Typing a couple keys on the keyboard should be enough to check that -a USB keyboard works and is correctly connected to the kernel keyboard -driver. - - Doing a "cat /dev/input/mouse0" (c, 13, 32) will verify that a mouse -is also emulated; characters should appear if you move it. - - You can test the joystick emulation with the 'jstest' utility, -available in the joystick package (see Documentation/input/joystick.txt). - - You can test the event devices with the 'evtest' utility available -in the LinuxConsole project CVS archive (see the URL below). - -5. Event interface -~~~~~~~~~~~~~~~~~~ - Should you want to add event device support into any application (X, gpm, -svgalib ...) I will be happy to provide you any help I -can. Here goes a description of the current state of things, which is going -to be extended, but not changed incompatibly as time goes: - - You can use blocking and nonblocking reads, also select() on the -/dev/input/eventX devices, and you'll always get a whole number of input -events on a read. Their layout is: - -struct input_event { - struct timeval time; - unsigned short type; - unsigned short code; - unsigned int value; -}; - - 'time' is the timestamp, it returns the time at which the event happened. -Type is for example EV_REL for relative moment, EV_KEY for a keypress or -release. More types are defined in include/uapi/linux/input-event-codes.h. - - 'code' is event code, for example REL_X or KEY_BACKSPACE, again a complete -list is in include/uapi/linux/input-event-codes.h. - - 'value' is the value the event carries. 
Either a relative change for -EV_REL, absolute new value for EV_ABS (joysticks ...), or 0 for EV_KEY for -release, 1 for keypress and 2 for autorepeat. - diff --git a/Documentation/input/input_kapi.rst b/Documentation/input/input_kapi.rst new file mode 100644 index 0000000000000000000000000000000000000000..41f1b7e6b78ea56097300339f5c39894a9403fc4 --- /dev/null +++ b/Documentation/input/input_kapi.rst @@ -0,0 +1,17 @@ +.. include:: + +################################ +Linux Input Subsystem kernel API +################################ + +.. class:: toc-title + + Table of Contents + +.. toctree:: + :maxdepth: 2 + :numbered: + + input-programming + gameport-programming + notifier diff --git a/Documentation/input/input_uapi.rst b/Documentation/input/input_uapi.rst new file mode 100644 index 0000000000000000000000000000000000000000..4a0391609327a1d4931382d438e532ce75cf3305 --- /dev/null +++ b/Documentation/input/input_uapi.rst @@ -0,0 +1,22 @@ +.. include:: + +################################### +Linux Input Subsystem userspace API +################################### + +.. class:: toc-title + + Table of Contents + +.. toctree:: + :maxdepth: 2 + :numbered: + + input + event-codes + multi-touch-protocol + gamepad + ff + joydev/index + uinput + userio diff --git a/Documentation/input/interactive.fig b/Documentation/input/interactive.fig deleted file mode 100644 index 1e7de387723a972aff917cbb4d26a29aca79c198..0000000000000000000000000000000000000000 --- a/Documentation/input/interactive.fig +++ /dev/null @@ -1,42 +0,0 @@ -#FIG 3.2 -Landscape -Center -Inches -Letter -100.00 -Single --2 -1200 2 -2 1 0 2 0 7 50 0 -1 6.000 0 0 -1 0 0 6 - 1200 3600 1800 3600 2400 4800 3000 4800 4200 5700 4800 5700 -2 2 0 1 0 7 50 0 -1 4.000 0 0 -1 0 0 5 - 1200 3150 4800 3150 4800 6300 1200 6300 1200 3150 -2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2 - 1200 4800 4800 4800 -2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 4 - 2400 4800 2400 6525 1950 7125 1950 7800 -2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 4 - 3000 4800 3000 6525 3600 7125 3600 7800 -2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 0 1 3 - 0 0 1.00 60.00 120.00 - 3825 5400 4125 5100 5400 5100 -2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 0 1 3 - 0 0 1.00 60.00 120.00 - 2100 4200 2400 3900 5400 3900 -2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2 - 4800 5700 5400 5700 -2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2 - 1800 3600 5400 3600 -2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 0 1 3 - 0 0 1.00 60.00 120.00 - 2700 4800 2700 4425 5400 4425 -2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2 - 0 0 1.00 60.00 120.00 - 0 0 1.00 60.00 120.00 - 1950 7800 3600 7800 -4 1 0 50 0 0 12 0.0000 4 135 810 2775 7725 Dead band\001 -4 0 0 50 0 0 12 0.0000 4 180 1155 5400 5700 right saturation\001 -4 0 0 50 0 0 12 0.0000 4 135 1065 5400 3600 left saturation\001 -4 0 0 50 0 0 12 0.0000 4 180 2505 5400 3900 left coeff ( positive in that case )\001 -4 0 0 50 0 0 12 0.0000 4 180 2640 5475 5100 right coeff ( negative in that case )\001 -4 0 0 50 0 0 12 0.0000 4 105 480 5400 4425 center\001 diff --git a/Documentation/input/interactive.svg b/Documentation/input/interactive.svg new file mode 100644 index 0000000000000000000000000000000000000000..a3513709a09bb83208e00fea7ea4266b9e998562 --- /dev/null +++ b/Documentation/input/interactive.svg @@ -0,0 +1,24 @@ + + + + + + + + + + + + + + + + + + Dead band + right saturation + left saturation + left coeff ( positive in that case ) + right coeff ( negative in that case ) + center + diff --git a/Documentation/input/joydev/index.rst b/Documentation/input/joydev/index.rst new file mode 100644 index 
0000000000000000000000000000000000000000..8d9666c7561c8bafdfaaa0cb07db36fd85a55daa --- /dev/null +++ b/Documentation/input/joydev/index.rst @@ -0,0 +1,18 @@ +.. include:: + +====================== +Linux Joystick support +====================== + +:Copyright: |copy| 1996-2000 Vojtech Pavlik - Sponsored by SuSE + +.. class:: toc-title + + Table of Contents + +.. toctree:: + :maxdepth: 3 + :numbered: + + joystick + joystick-api diff --git a/Documentation/input/joydev/joystick-api.rst b/Documentation/input/joydev/joystick-api.rst new file mode 100644 index 0000000000000000000000000000000000000000..95803e2e8cd00e978c308e7008ebc2f2532fca99 --- /dev/null +++ b/Documentation/input/joydev/joystick-api.rst @@ -0,0 +1,348 @@ +.. _joystick-api: + +===================== +Programming Interface +===================== + +:Author: Ragnar Hojland Espinosa - 7 Aug 1998 + +Introduction +============ + +.. important:: + This document describes legacy ``js`` interface. Newer clients are + encouraged to switch to the generic event (``evdev``) interface. + +The 1.0 driver uses a new, event based approach to the joystick driver. +Instead of the user program polling for the joystick values, the joystick +driver now reports only any changes of its state. See joystick-api.txt, +joystick.h and jstest.c included in the joystick package for more +information. The joystick device can be used in either blocking or +nonblocking mode, and supports select() calls. + +For backward compatibility the old (v0.x) interface is still included. +Any call to the joystick driver using the old interface will return values +that are compatible to the old interface. This interface is still limited +to 2 axes, and applications using it usually decode only 2 buttons, although +the driver provides up to 32. + +Initialization +============== + +Open the joystick device following the usual semantics (that is, with open). +Since the driver now reports events instead of polling for changes, +immediately after the open it will issue a series of synthetic events +(JS_EVENT_INIT) that you can read to obtain the initial state of the +joystick. + +By default, the device is opened in blocking mode:: + + int fd = open ("/dev/input/js0", O_RDONLY); + + +Event Reading +============= + +:: + + struct js_event e; + read (fd, &e, sizeof(e)); + +where js_event is defined as:: + + struct js_event { + __u32 time; /* event timestamp in milliseconds */ + __s16 value; /* value */ + __u8 type; /* event type */ + __u8 number; /* axis/button number */ + }; + +If the read is successful, it will return sizeof(e), unless you wanted to read +more than one event per read as described in section 3.1. + + +js_event.type +------------- + +The possible values of ``type`` are:: + + #define JS_EVENT_BUTTON 0x01 /* button pressed/released */ + #define JS_EVENT_AXIS 0x02 /* joystick moved */ + #define JS_EVENT_INIT 0x80 /* initial state of device */ + +As mentioned above, the driver will issue synthetic JS_EVENT_INIT ORed +events on open. That is, if it's issuing a INIT BUTTON event, the +current type value will be:: + + int type = JS_EVENT_BUTTON | JS_EVENT_INIT; /* 0x81 */ + +If you choose not to differentiate between synthetic or real events +you can turn off the JS_EVENT_INIT bits:: + + type &= ~JS_EVENT_INIT; /* 0x01 */ + + +js_event.number +--------------- + +The values of ``number`` correspond to the axis or button that +generated the event. Note that they carry separate numeration (that +is, you have both an axis 0 and a button 0). 
Generally,
+
+	=============== =======
+	Axis		number
+	=============== =======
+	1st Axis X	0
+	1st Axis Y	1
+	2nd Axis X	2
+	2nd Axis Y	3
+	...and so on
+	=============== =======
+
+Hats vary from one joystick type to another. Some can be moved in 8
+directions, some only in 4. The driver, however, always reports a hat as
+two independent axes, even if the hardware doesn't allow independent
+movement.
+
+
+js_event.value
+--------------
+
+For an axis, ``value`` is a signed integer between -32767 and +32767
+representing the position of the joystick along that axis. If you
+don't read a 0 when the joystick is ``dead``, or if it doesn't span the
+full range, you should recalibrate it (with, for example, jscal).
+
+For a button, ``value`` for a press button event is 1 and for a release
+button event is 0.
+
+Though this::
+
+	if (js_event.type == JS_EVENT_BUTTON) {
+		buttons_state ^= (1 << js_event.number);
+	}
+
+may work well if you handle JS_EVENT_INIT events separately,
+
+::
+
+	if ((js_event.type & ~JS_EVENT_INIT) == JS_EVENT_BUTTON) {
+		if (js_event.value)
+			buttons_state |= (1 << js_event.number);
+		else
+			buttons_state &= ~(1 << js_event.number);
+	}
+
+is much safer since it can't lose sync with the driver. As you would
+have to write a separate handler for JS_EVENT_INIT events in the first
+snippet, this ends up being shorter.
+
+
+js_event.time
+-------------
+
+The time an event was generated is stored in ``js_event.time``. It's a time
+in milliseconds since ... well, since sometime in the past. This eases the
+task of detecting double clicks, figuring out if movement of axis and button
+presses happened at the same time, and similar.
+
+
+Reading
+=======
+
+If you open the device in blocking mode, a read will block (that is,
+wait) forever until an event is generated and effectively read. There
+are two alternatives if you can't afford to wait forever (which is,
+admittedly, a long time;)
+
+ a) use select to wait until there's data to be read on fd, or
+    until it times out. There's a good example on the select(2)
+    man page.
+
+ b) open the device in non-blocking mode (O_NONBLOCK)
+
+
+O_NONBLOCK
+----------
+
+If read returns -1 when reading in O_NONBLOCK mode, this isn't
+necessarily a "real" error (check errno(3)); it can just mean there
+are no events pending to be read on the driver queue. You should read
+all events on the queue (that is, until you get a -1).
+
+For example,
+
+::
+
+	while (1) {
+		while (read (fd, &e, sizeof(e)) > 0) {
+			process_event (e);
+		}
+		/* EAGAIN is returned when the queue is empty */
+		if (errno != EAGAIN) {
+			/* error */
+		}
+		/* do something interesting with processed events */
+	}
+
+One reason for emptying the queue is that if it gets full you'll start
+missing events since the queue is finite, and older events will get
+overwritten.
+
+The other reason is that you want to know everything that happened, and
+not delay the processing till later.
+
+Why can the queue get full? Because you don't empty the queue as
+mentioned, or because too much time elapses from one read to another
+and too many events to store in the queue get generated. Note that
+high system load may contribute to spacing those reads out even more.
+
+If the time between reads is long enough to fill the queue and lose an
+event, the driver will switch to startup mode and next time you read it,
+synthetic events (JS_EVENT_INIT) will be generated to inform you of
+the actual state of the joystick.
+
+
+.. note::
+
+   As of version 1.2.8, the queue is circular and able to hold 64
+   events.
You can increment this size by bumping up JS_BUFF_SIZE in
+   joystick.h and recompiling the driver.
+
+
+In the above code, you may also want to read more than one event
+at a time using the typical read(2) functionality. For that, you would
+replace the read above with something like::
+
+	struct js_event mybuffer[0xff];
+	int i = read (fd, mybuffer, sizeof(mybuffer));
+
+In this case, read would return -1 if the queue was empty, or some
+other value, in which case the number of events read would be
+i / sizeof(js_event). Again, if the buffer was full, it's a good idea to
+process the events and keep reading it until you empty the driver queue.
+
+
+IOCTLs
+======
+
+The joystick driver defines the following ioctl(2) operations::
+
+	/* function			3rd arg  */
+	#define JSIOCGAXES	/* get number of axes		char	 */
+	#define JSIOCGBUTTONS	/* get number of buttons	char	 */
+	#define JSIOCGVERSION	/* get driver version		int	 */
+	#define JSIOCGNAME(len)	/* get identifier string	char	 */
+	#define JSIOCSCORR	/* set correction values	&js_corr */
+	#define JSIOCGCORR	/* get correction values	&js_corr */
+
+For example, to read the number of axes::
+
+	char number_of_axes;
+	ioctl (fd, JSIOCGAXES, &number_of_axes);
+
+
+JSIOCGVERSION
+-------------
+
+JSIOCGVERSION is a good way to check at run time whether the running
+driver is 1.0+ and supports the event interface. If it is not, the
+IOCTL will fail. For a compile-time decision, you can test the
+JS_VERSION symbol::
+
+	#ifdef JS_VERSION
+	#if JS_VERSION > 0xsomething
+
+
+JSIOCGNAME
+----------
+
+JSIOCGNAME(len) allows you to get the name string of the joystick - the same
+as is being printed at boot time. The 'len' argument is the length of the
+buffer provided by the application asking for the name. It is used to avoid
+possible overrun should the name be too long::
+
+	char name[128];
+	if (ioctl(fd, JSIOCGNAME(sizeof(name)), name) < 0)
+		strncpy(name, "Unknown", sizeof(name));
+	printf("Name: %s\n", name);
+
+
+JSIOC[SG]CORR
+-------------
+
+For usage of JSIOC[SG]CORR I suggest you look at jscal.c. They are
+not needed in a normal program, only in joystick calibration software
+such as jscal or kcmjoy. These IOCTLs and data types aren't considered
+to be in the stable part of the API, and therefore may change without
+warning in following releases of the driver.
+
+Both JSIOCSCORR and JSIOCGCORR expect &js_corr to be able to hold
+information for all axes. That is, struct js_corr corr[MAX_AXIS];
+
+struct js_corr is defined as::
+
+	struct js_corr {
+		__s32 coef[8];
+		__u16 prec;
+		__u16 type;
+	};
+
+and ``type``::
+
+	#define JS_CORR_NONE	0x00	/* returns raw values */
+	#define JS_CORR_BROKEN	0x01	/* broken line */
+
+
+Backward compatibility
+======================
+
+The 0.x joystick driver API is quite limited and its usage is deprecated.
+The driver offers backward compatibility, though.
Here's a quick summary::
+
+	struct JS_DATA_TYPE js;
+	while (1) {
+		if (read (fd, &js, JS_RETURN) != JS_RETURN) {
+			/* error */
+		}
+		usleep (1000);
+	}
+
+As you can figure out from the example, the read returns immediately,
+with the actual state of the joystick::
+
+	struct JS_DATA_TYPE {
+		int buttons;	/* immediate button state */
+		int x;		/* immediate x axis value */
+		int y;		/* immediate y axis value */
+	};
+
+and JS_RETURN is defined as::
+
+	#define JS_RETURN sizeof(struct JS_DATA_TYPE)
+
+To test the state of the buttons,
+
+::
+
+	first_button_state  = js.buttons & 1;
+	second_button_state = js.buttons & 2;
+
+The axis values do not have a defined range in the original 0.x driver,
+except that the values are non-negative. The 1.2.8+ drivers use a
+fixed range for reporting the values, 1 being the minimum, 128 the
+center, and 255 the maximum value.
+
+The v0.8.0.2 driver also had an interface for 'digital joysticks' (now
+called Multisystem joysticks in this driver), under /dev/djsX. This driver
+doesn't try to be compatible with that interface.
+
+
+Final Notes
+===========
+
+::
+
+	____/|	Comments, additions, and especially corrections are welcome.
+	\ o.O|	Documentation valid for at least version 1.2.8 of the joystick
+	 =(_)=	driver and as usual, the ultimate source for documentation is
+	   U	to "Use The Source Luke" or, at your convenience, Vojtech ;)
diff --git a/Documentation/input/joydev/joystick.rst b/Documentation/input/joydev/joystick.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9746fd76cc581914ba315c2bf81c7f829ac6954d
--- /dev/null
+++ b/Documentation/input/joydev/joystick.rst
@@ -0,0 +1,585 @@
+.. include:: <isonum.txt>
+
+.. _joystick-doc:
+
+Introduction
+============
+
+The joystick driver for Linux provides support for a variety of joysticks
+and similar devices. It is based on a larger project aiming to support all
+input devices in Linux.
+
+The mailing list for the project is:
+
+	linux-input@vger.kernel.org
+
+send "subscribe linux-input" to majordomo@vger.kernel.org to subscribe to it.
+
+Usage
+=====
+
+For basic usage you just choose the right options in kernel config and
+you should be set.
+
+Utilities
+---------
+
+For testing and other purposes (for example serial devices), there is a set
+of utilities, such as ``jstest``, ``jscal``, and ``evtest``,
+usually packaged as ``joystick``, ``input-utils``, ``evtest``, and so on.
+
+The ``inputattach`` utility is required if your joystick is connected to a
+serial port.
+
+Device nodes
+------------
+
+For applications to be able to use the joysticks, device nodes should be
+created in /dev.
Normally it is done automatically by the system, but +it can also be done by hand:: + + cd /dev + rm js* + mkdir input + mknod input/js0 c 13 0 + mknod input/js1 c 13 1 + mknod input/js2 c 13 2 + mknod input/js3 c 13 3 + ln -s input/js0 js0 + ln -s input/js1 js1 + ln -s input/js2 js2 + ln -s input/js3 js3 + +For testing with inpututils it's also convenient to create these:: + + mknod input/event0 c 13 64 + mknod input/event1 c 13 65 + mknod input/event2 c 13 66 + mknod input/event3 c 13 67 + +Modules needed +-------------- + +For all joystick drivers to function, you'll need the userland interface +module in kernel, either loaded or compiled in:: + + modprobe joydev + +For gameport joysticks, you'll have to load the gameport driver as well:: + + modprobe ns558 + +And for serial port joysticks, you'll need the serial input line +discipline module loaded and the inputattach utility started:: + + modprobe serport + inputattach -xxx /dev/tts/X & + +In addition to that, you'll need the joystick driver module itself, most +usually you'll have an analog joystick:: + + modprobe analog + +For automatic module loading, something like this might work - tailor to +your needs:: + + alias tty-ldisc-2 serport + alias char-major-13 input + above input joydev ns558 analog + options analog map=gamepad,none,2btn + +Verifying that it works +----------------------- + +For testing the joystick driver functionality, there is the jstest +program in the utilities package. You run it by typing:: + + jstest /dev/input/js0 + +And it should show a line with the joystick values, which update as you +move the stick, and press its buttons. The axes should all be zero when the +joystick is in the center position. They should not jitter by themselves to +other close values, and they also should be steady in any other position of +the stick. They should have the full range from -32767 to 32767. If all this +is met, then it's all fine, and you can play the games. :) + +If it's not, then there might be a problem. Try to calibrate the joystick, +and if it still doesn't work, read the drivers section of this file, the +troubleshooting section, and the FAQ. + +Calibration +----------- + +For most joysticks you won't need any manual calibration, since the +joystick should be autocalibrated by the driver automagically. However, with +some analog joysticks, that either do not use linear resistors, or if you +want better precision, you can use the jscal program:: + + jscal -c /dev/input/js0 + +included in the joystick package to set better correction coefficients than +what the driver would choose itself. + +After calibrating the joystick you can verify if you like the new +calibration using the jstest command, and if you do, you then can save the +correction coefficients into a file:: + + jscal -p /dev/input/js0 > /etc/joystick.cal + +And add a line to your rc script executing that file:: + + source /etc/joystick.cal + +This way, after the next reboot your joystick will remain calibrated. You +can also add the ``jscal -p`` line to your shutdown script. + +HW specific driver information +============================== + +In this section each of the separate hardware specific drivers is described. + +Analog joysticks +---------------- + +The analog.c uses the standard analog inputs of the gameport, and thus +supports all standard joysticks and gamepads. It uses a very advanced +routine for this, allowing for data precision that can't be found on any +other system. 
+
+It also supports extensions like additional hats and buttons compatible
+with CH Flightstick Pro, ThrustMaster FCS or 6 and 8 button gamepads. Saitek
+Cyborg 'digital' joysticks are also supported by this driver, because
+they're basically souped up CHF sticks.
+
+However, the only types that can be autodetected are:
+
+* 2-axis, 4-button joystick
+* 3-axis, 4-button joystick
+* 4-axis, 4-button joystick
+* Saitek Cyborg 'digital' joysticks
+
+For other joystick types (with more or fewer axes, hats, and buttons) to be
+supported, you'll need to specify the types either on the kernel command
+line or on the module command line when inserting analog into the kernel.
+The parameters are::
+
+	analog.map=<type1>,<type2>,<type3>,....
+
+Each 'type' entry is the type of a joystick from the table below, defining
+the joysticks present on the gameports in the system: the first entry
+describes the joystick on gameport0, the second 'type' entry the joystick
+on gameport1, and so on.
+
+	========= =====================================================
+	Type      Meaning
+	========= =====================================================
+	none      No analog joystick on that port
+	auto      Autodetect joystick
+	2btn      2-button n-axis joystick
+	y-joy     Two 2-button 2-axis joysticks on a Y-cable
+	y-pad     Two 2-button 2-axis gamepads on a Y-cable
+	fcs       Thrustmaster FCS compatible joystick
+	chf       Joystick with a CH Flightstick compatible hat
+	fullchf   CH Flightstick compatible with two hats and 6 buttons
+	gamepad   4/6-button n-axis gamepad
+	gamepad8  8-button 2-axis gamepad
+	========= =====================================================
+
+In case your joystick doesn't fit in any of the above categories, you can
+specify the type as a number by combining the bits in the table below. This
+is not recommended unless you really know what you are doing. It's not
+dangerous, but not simple either. (A worked example follows the SideWinder
+section below.)
+
+	==== =========================
+	Bit  Meaning
+	==== =========================
+	0    Axis X1
+	1    Axis Y1
+	2    Axis X2
+	3    Axis Y2
+	4    Button A
+	5    Button B
+	6    Button C
+	7    Button D
+	8    CHF Buttons X and Y
+	9    CHF Hat 1
+	10   CHF Hat 2
+	11   FCS Hat
+	12   Pad Button X
+	13   Pad Button Y
+	14   Pad Button U
+	15   Pad Button V
+	16   Saitek F1-F4 Buttons
+	17   Saitek Digital Mode
+	19   GamePad
+	20   Joy2 Axis X1
+	21   Joy2 Axis Y1
+	22   Joy2 Axis X2
+	23   Joy2 Axis Y2
+	24   Joy2 Button A
+	25   Joy2 Button B
+	26   Joy2 Button C
+	27   Joy2 Button D
+	31   Joy2 GamePad
+	==== =========================
+
+Microsoft SideWinder joysticks
+------------------------------
+
+The Microsoft 'Digital Overdrive' protocol is supported by the sidewinder.c
+module. All currently supported joysticks:
+
+* Microsoft SideWinder 3D Pro
+* Microsoft SideWinder Force Feedback Pro
+* Microsoft SideWinder Force Feedback Wheel
+* Microsoft SideWinder FreeStyle Pro
+* Microsoft SideWinder GamePad (up to four, chained)
+* Microsoft SideWinder Precision Pro
+* Microsoft SideWinder Precision Pro USB
+
+are autodetected, and thus no module parameters are needed.
+
+There is one caveat with the 3D Pro. There are 9 buttons reported,
+although the joystick has only 8. The 9th button is the mode switch on the
+rear side of the joystick. However, moving it will reset the joystick,
+and make it unresponsive for about one third of a second. Furthermore, the
+joystick will also re-center itself, taking the position it was in during
+this time as a new center position. Use it if you want, but think first.
+
+The SideWinder Standard is not a digital joystick, and thus is supported
+by the analog driver described above.
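As a worked illustration of the bit table above (an example sketch, not
taken from the driver itself): a plain 2-axis, 4-button joystick
corresponds to bits 0 and 1 for axes X1 and Y1, plus bits 4 through 7 for
buttons A through D::

	bits 0+1 (Axis X1, Axis Y1)       = 0x0003
	bits 4-7 (Buttons A, B, C, D)     = 0x00F0
	------------------------------------------
	type                              = 0x00F3  (243 decimal)

This should describe the same hardware as the autodetected "2-axis,
4-button joystick" type; the resulting number is what you would pass as
one 'type' entry in analog.map.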
+
+Logitech ADI devices
+--------------------
+
+The Logitech ADI protocol is supported by the adi.c module. It should
+support any Logitech device using this protocol. This includes, but is not
+limited to:
+
+* Logitech CyberMan 2
+* Logitech ThunderPad Digital
+* Logitech WingMan Extreme Digital
+* Logitech WingMan Formula
+* Logitech WingMan Interceptor
+* Logitech WingMan GamePad
+* Logitech WingMan GamePad USB
+* Logitech WingMan GamePad Extreme
+* Logitech WingMan Extreme Digital 3D
+
+ADI devices are autodetected, and the driver supports up to two (any
+combination of) devices on a single gameport, using a Y-cable or chained
+together.
+
+Logitech WingMan Joystick, Logitech WingMan Attack, Logitech WingMan
+Extreme and Logitech WingMan ThunderPad are not digital joysticks and are
+handled by the analog driver described above. Logitech WingMan Warrior and
+Logitech Magellan are supported by serial drivers described below. Logitech
+WingMan Force and Logitech WingMan Formula Force are supported by the
+I-Force driver described below. Logitech CyberMan is not supported yet.
+
+Gravis GrIP
+-----------
+
+The Gravis GrIP protocol is supported by the grip.c module. It currently
+supports:
+
+* Gravis GamePad Pro
+* Gravis BlackHawk Digital
+* Gravis Xterminator
+* Gravis Xterminator DualControl
+
+All these devices are autodetected, and you can even use any combination
+of up to two of these pads either chained together or using a Y-cable on a
+single gameport.
+
+GrIP MultiPort isn't supported yet. Gravis Stinger is a serial device and is
+supported by the stinger driver. Other Gravis joysticks are supported by the
+analog driver.
+
+FPGaming A3D and MadCatz A3D
+----------------------------
+
+The Assassin 3D protocol, created by FPGaming, is used both by FPGaming
+themselves and licensed to MadCatz. A3D devices are supported by the
+a3d.c module. It currently supports:
+
+* FPGaming Assassin 3D
+* MadCatz Panther
+* MadCatz Panther XL
+
+All these devices are autodetected. Because the Assassin 3D and the Panther
+allow connecting analog joysticks to them, you'll need to load the analog
+driver as well to handle the attached joysticks.
+
+The trackball should work with the USB mousedev module as a normal mouse.
+See the USB documentation for how to set up a USB mouse.
+
+ThrustMaster DirectConnect (BSP)
+--------------------------------
+
+The TM DirectConnect (BSP) protocol is supported by the tmdc.c
+module. This includes, but is not limited to:
+
+* ThrustMaster Millennium 3D Interceptor
+* ThrustMaster 3D Rage Pad
+* ThrustMaster Fusion Digital Game Pad
+
+Devices not directly supported, but hopefully working are:
+
+* ThrustMaster FragMaster
+* ThrustMaster Attack Throttle
+
+If you have one of these, contact me.
+
+TMDC devices are autodetected, and thus no parameters to the module
+are needed. Up to two TMDC devices can be connected to one gameport, using
+a Y-cable.
+
+Creative Labs Blaster
+---------------------
+
+The Blaster protocol is supported by the cobra.c module. It supports only
+the:
+
+* Creative Blaster GamePad Cobra
+
+Up to two of these can be used on a single gameport, using a Y-cable.
+
+Genius Digital joysticks
+------------------------
+
+The Genius digitally communicating joysticks are supported by the gf2k.c
+module. This includes:
+
+* Genius Flight2000 F-23 joystick
+* Genius Flight2000 F-31 joystick
+* Genius G-09D gamepad
+
+Other Genius digital joysticks are not supported yet, but support can be
+added fairly easily.
+
+InterAct Digital joysticks
+--------------------------
+
+The InterAct digitally communicating joysticks are supported by the
+interact.c module. This includes:
+
+* InterAct HammerHead/FX gamepad
+* InterAct ProPad8 gamepad
+
+Other InterAct digital joysticks are not supported yet, but support can be
+added fairly easily.
+
+PDPI Lightning 4 gamecards
+--------------------------
+
+PDPI Lightning 4 gamecards are supported by the lightning.c module.
+Once the module is loaded, the analog driver can be used to handle the
+joysticks. Digitally communicating joysticks will work only on port 0,
+while using Y-cables, you can connect up to 8 analog joysticks to a single
+L4 card, 16 in case you have two in your system.
+
+Trident 4DWave / Aureal Vortex
+------------------------------
+
+Soundcards with Trident 4DWave DX/NX or Aureal Vortex/Vortex2 chipsets
+provide an "Enhanced Game Port" mode where the soundcard handles polling
+the joystick. This mode is supported by the pcigame.c module. Once loaded,
+the analog driver can use the enhanced features of these gameports.
+
+Crystal SoundFusion
+-------------------
+
+Soundcards with Crystal SoundFusion chipsets provide an "Enhanced Game
+Port", much like the 4DWave or Vortex above. This, and also the normal mode
+for the port of the SoundFusion, is supported by the cs461x.c module.
+
+SoundBlaster Live!
+------------------
+
+The Live! has a special PCI gameport, which, although it doesn't provide
+any "Enhanced" stuff like 4DWave and friends, is quite a bit faster than
+its ISA counterparts. It also requires special support, hence the
+emu10k1-gp.c module for it instead of the normal ns558.c one.
+
+SoundBlaster 64 and 128 - ES1370 and ES1371, ESS Solo1 and S3 SonicVibes
+------------------------------------------------------------------------
+
+These PCI soundcards have specific gameports. They are handled by the
+sound drivers themselves. Make sure you select gameport support in the
+joystick menu and sound card support in the sound menu for your appropriate
+card.
+
+Amiga
+-----
+
+Amiga joysticks, connected to an Amiga, are supported by the amijoy.c
+driver. Since they can't be autodetected, the driver has a command line::
+
+	amijoy.map=<a>,<b>
+
+a and b define the joysticks connected to the JOY0DAT and JOY1DAT ports of
+the Amiga.
+
+	====== ===========================
+	Value  Joystick type
+	====== ===========================
+	0      None
+	1      1-button digital joystick
+	====== ===========================
+
+No more joystick types are supported now, but that should change in the
+future if I get an Amiga within the reach of my fingers.
+
+Game console and 8-bit pads and joysticks
+-----------------------------------------
+
+These pads and joysticks are not designed for PCs and other computers
+Linux runs on, and usually require a special connector for attaching
+them through a parallel port.
+
+See :ref:`joystick-parport` for more info.
+
+SpaceTec/LabTec devices
+-----------------------
+
+SpaceTec serial devices communicate using the SpaceWare protocol. It is
+supported by the spaceorb.c and spaceball.c drivers. The devices currently
+supported by spaceorb.c are:
+
+* SpaceTec SpaceBall Avenger
+* SpaceTec SpaceOrb 360
+
+Devices currently supported by spaceball.c are:
+
+* SpaceTec SpaceBall 4000 FLX
+
+In addition to having the spaceorb/spaceball and serport modules in the
+kernel, you also need to attach a serial port to them.
To do that, run the
+inputattach program::
+
+	inputattach --spaceorb /dev/tts/x &
+
+or::
+
+	inputattach --spaceball /dev/tts/x &
+
+where /dev/tts/x is the serial port which the device is connected to. After
+doing this, the device will be reported and will start working.
+
+There is one caveat with the SpaceOrb. Button #6, on the bottom side of
+the orb, although reported as an ordinary button, causes internal
+recentering of the spaceorb, moving the zero point to the position in which
+the ball is at the moment of pressing the button. So, think first before
+you bind it to some other function.
+
+SpaceTec SpaceBall 2003 FLX and 3003 FLX are not supported yet.
+
+Logitech SWIFT devices
+----------------------
+
+The SWIFT serial protocol is supported by the warrior.c module. It
+currently supports only the:
+
+* Logitech WingMan Warrior
+
+but in the future, Logitech CyberMan (the original one, not CM2) could be
+supported as well. To use the module, you need to run inputattach after you
+insert/compile the module into your kernel::
+
+	inputattach --warrior /dev/tts/x &
+
+/dev/tts/x is the serial port your Warrior is attached to.
+
+Magellan / Space Mouse
+----------------------
+
+The Magellan (or Space Mouse), manufactured by LogiCad3d (formerly Space
+Systems) for many other companies (Logitech, HP, ...), is supported by the
+joy-magellan module. It currently supports only the:
+
+* Magellan 3D
+* Space Mouse
+
+models; the additional buttons on the 'Plus' versions are not supported yet.
+
+To use it, you need to attach the serial port to the driver using the::
+
+	inputattach --magellan /dev/tts/x &
+
+command. After that the Magellan will be detected, initialized, will beep,
+and the /dev/input/jsX device should become usable.
+
+I-Force devices
+---------------
+
+All I-Force devices are supported by the iforce module. This includes:
+
+* AVB Mag Turbo Force
+* AVB Top Shot Pegasus
+* AVB Top Shot Force Feedback Racing Wheel
+* Logitech WingMan Force
+* Logitech WingMan Force Wheel
+* Guillemot Race Leader Force Feedback
+* Guillemot Force Feedback Racing Wheel
+* Thrustmaster Motor Sport GT
+
+To use it, you need to attach the serial port to the driver using the::
+
+	inputattach --iforce /dev/tts/x &
+
+command. After that the I-Force device will be detected, and the
+/dev/input/jsX device should become usable.
+
+In case you're using the device via the USB port, the inputattach command
+isn't needed.
+
+The I-Force driver now supports force feedback via the event interface.
+
+Please note that Logitech WingMan 3D devices are *not* supported by this
+module, but rather by hid. Force feedback is not supported for those
+devices. Logitech gamepads are also hid devices.
+
+Gravis Stinger gamepad
+----------------------
+
+The Gravis Stinger serial port gamepad, designed for use with laptop
+computers, is supported by the stinger.c module. To use it, attach the
+serial port to the driver using::
+
+	inputattach --stinger /dev/tty/x &
+
+where x is the number of the serial port.
+
+Troubleshooting
+===============
+
+There is quite a high probability that you will run into some problems. For
+testing whether the driver works, if in doubt, use the jstest utility in
+some of its modes. The most useful modes are "normal" - for the 1.x
+interface, and "old" for the "0.x" interface. You run it by typing::
+
+	jstest --normal /dev/input/js0
+	jstest --old    /dev/input/js0
+
+Additionally you can do a test with the evtest utility::
+
+	evtest /dev/input/event0
+
+Oh, and read the FAQ!
:) + +FAQ +=== + +:Q: Running 'jstest /dev/input/js0' results in "File not found" error. What's the + cause? +:A: The device files don't exist. Create them (see section 2.2). + +:Q: Is it possible to connect my old Atari/Commodore/Amiga/console joystick + or pad that uses a 9-pin D-type cannon connector to the serial port of my + PC? +:A: Yes, it is possible, but it'll burn your serial port or the pad. It + won't work, of course. + +:Q: My joystick doesn't work with Quake / Quake 2. What's the cause? +:A: Quake / Quake 2 don't support joystick. Use joy2key to simulate keypresses + for them. diff --git a/Documentation/input/joystick-api.txt b/Documentation/input/joystick-api.txt deleted file mode 100644 index 943b18eac91827d62b10ab79dceb519bb895effd..0000000000000000000000000000000000000000 --- a/Documentation/input/joystick-api.txt +++ /dev/null @@ -1,314 +0,0 @@ - Joystick API Documentation -*-Text-*- - - Ragnar Hojland Espinosa - - - 7 Aug 1998 - -1. Initialization -~~~~~~~~~~~~~~~~~ - -Open the joystick device following the usual semantics (that is, with open). -Since the driver now reports events instead of polling for changes, -immediately after the open it will issue a series of synthetic events -(JS_EVENT_INIT) that you can read to check the initial state of the -joystick. - -By default, the device is opened in blocking mode. - - int fd = open ("/dev/input/js0", O_RDONLY); - - -2. Event Reading -~~~~~~~~~~~~~~~~ - - struct js_event e; - read (fd, &e, sizeof(e)); - -where js_event is defined as - - struct js_event { - __u32 time; /* event timestamp in milliseconds */ - __s16 value; /* value */ - __u8 type; /* event type */ - __u8 number; /* axis/button number */ - }; - -If the read is successful, it will return sizeof(e), unless you wanted to read -more than one event per read as described in section 3.1. - - -2.1 js_event.type -~~~~~~~~~~~~~~~~~ - -The possible values of ``type'' are - - #define JS_EVENT_BUTTON 0x01 /* button pressed/released */ - #define JS_EVENT_AXIS 0x02 /* joystick moved */ - #define JS_EVENT_INIT 0x80 /* initial state of device */ - -As mentioned above, the driver will issue synthetic JS_EVENT_INIT ORed -events on open. That is, if it's issuing a INIT BUTTON event, the -current type value will be - - int type = JS_EVENT_BUTTON | JS_EVENT_INIT; /* 0x81 */ - -If you choose not to differentiate between synthetic or real events -you can turn off the JS_EVENT_INIT bits - - type &= ~JS_EVENT_INIT; /* 0x01 */ - - -2.2 js_event.number -~~~~~~~~~~~~~~~~~~~ - -The values of ``number'' correspond to the axis or button that -generated the event. Note that they carry separate numeration (that -is, you have both an axis 0 and a button 0). Generally, - - number - 1st Axis X 0 - 1st Axis Y 1 - 2nd Axis X 2 - 2nd Axis Y 3 - ...and so on - -Hats vary from one joystick type to another. Some can be moved in 8 -directions, some only in 4, The driver, however, always reports a hat as two -independent axis, even if the hardware doesn't allow independent movement. - - -2.3 js_event.value -~~~~~~~~~~~~~~~~~~ - -For an axis, ``value'' is a signed integer between -32767 and +32767 -representing the position of the joystick along that axis. If you -don't read a 0 when the joystick is `dead', or if it doesn't span the -full range, you should recalibrate it (with, for example, jscal). - -For a button, ``value'' for a press button event is 1 and for a release -button event is 0. 
- -Though this - - if (js_event.type == JS_EVENT_BUTTON) { - buttons_state ^= (1 << js_event.number); - } - -may work well if you handle JS_EVENT_INIT events separately, - - if ((js_event.type & ~JS_EVENT_INIT) == JS_EVENT_BUTTON) { - if (js_event.value) - buttons_state |= (1 << js_event.number); - else - buttons_state &= ~(1 << js_event.number); - } - -is much safer since it can't lose sync with the driver. As you would -have to write a separate handler for JS_EVENT_INIT events in the first -snippet, this ends up being shorter. - - -2.4 js_event.time -~~~~~~~~~~~~~~~~~ - -The time an event was generated is stored in ``js_event.time''. It's a time -in milliseconds since ... well, since sometime in the past. This eases the -task of detecting double clicks, figuring out if movement of axis and button -presses happened at the same time, and similar. - - -3. Reading -~~~~~~~~~~ - -If you open the device in blocking mode, a read will block (that is, -wait) forever until an event is generated and effectively read. There -are two alternatives if you can't afford to wait forever (which is, -admittedly, a long time;) - - a) use select to wait until there's data to be read on fd, or - until it timeouts. There's a good example on the select(2) - man page. - - b) open the device in non-blocking mode (O_NONBLOCK) - - -3.1 O_NONBLOCK -~~~~~~~~~~~~~~ - -If read returns -1 when reading in O_NONBLOCK mode, this isn't -necessarily a "real" error (check errno(3)); it can just mean there -are no events pending to be read on the driver queue. You should read -all events on the queue (that is, until you get a -1). - -For example, - - while (1) { - while (read (fd, &e, sizeof(e)) > 0) { - process_event (e); - } - /* EAGAIN is returned when the queue is empty */ - if (errno != EAGAIN) { - /* error */ - } - /* do something interesting with processed events */ - } - -One reason for emptying the queue is that if it gets full you'll start -missing events since the queue is finite, and older events will get -overwritten. - -The other reason is that you want to know all what happened, and not -delay the processing till later. - -Why can get the queue full? Because you don't empty the queue as -mentioned, or because too much time elapses from one read to another -and too many events to store in the queue get generated. Note that -high system load may contribute to space those reads even more. - -If time between reads is enough to fill the queue and lose an event, -the driver will switch to startup mode and next time you read it, -synthetic events (JS_EVENT_INIT) will be generated to inform you of -the actual state of the joystick. - -[As for version 1.2.8, the queue is circular and able to hold 64 - events. You can increment this size bumping up JS_BUFF_SIZE in - joystick.h and recompiling the driver.] - - -In the above code, you might as well want to read more than one event -at a time using the typical read(2) functionality. For that, you would -replace the read above with something like - - struct js_event mybuffer[0xff]; - int i = read (fd, mybuffer, sizeof(mybuffer)); - -In this case, read would return -1 if the queue was empty, or some -other value in which the number of events read would be i / -sizeof(js_event) Again, if the buffer was full, it's a good idea to -process the events and keep reading it until you empty the driver queue. - - -4. IOCTLs -~~~~~~~~~ - -The joystick driver defines the following ioctl(2) operations. 
- - /* function 3rd arg */ - #define JSIOCGAXES /* get number of axes char */ - #define JSIOCGBUTTONS /* get number of buttons char */ - #define JSIOCGVERSION /* get driver version int */ - #define JSIOCGNAME(len) /* get identifier string char */ - #define JSIOCSCORR /* set correction values &js_corr */ - #define JSIOCGCORR /* get correction values &js_corr */ - -For example, to read the number of axes - - char number_of_axes; - ioctl (fd, JSIOCGAXES, &number_of_axes); - - -4.1 JSIOGCVERSION -~~~~~~~~~~~~~~~~~ - -JSIOGCVERSION is a good way to check in run-time whether the running -driver is 1.0+ and supports the event interface. If it is not, the -IOCTL will fail. For a compile-time decision, you can test the -JS_VERSION symbol - - #ifdef JS_VERSION - #if JS_VERSION > 0xsomething - - -4.2 JSIOCGNAME -~~~~~~~~~~~~~~ - -JSIOCGNAME(len) allows you to get the name string of the joystick - the same -as is being printed at boot time. The 'len' argument is the length of the -buffer provided by the application asking for the name. It is used to avoid -possible overrun should the name be too long. - - char name[128]; - if (ioctl(fd, JSIOCGNAME(sizeof(name)), name) < 0) - strncpy(name, "Unknown", sizeof(name)); - printf("Name: %s\n", name); - - -4.3 JSIOC[SG]CORR -~~~~~~~~~~~~~~~~~ - -For usage on JSIOC[SG]CORR I suggest you to look into jscal.c They are -not needed in a normal program, only in joystick calibration software -such as jscal or kcmjoy. These IOCTLs and data types aren't considered -to be in the stable part of the API, and therefore may change without -warning in following releases of the driver. - -Both JSIOCSCORR and JSIOCGCORR expect &js_corr to be able to hold -information for all axis. That is, struct js_corr corr[MAX_AXIS]; - -struct js_corr is defined as - - struct js_corr { - __s32 coef[8]; - __u16 prec; - __u16 type; - }; - -and ``type'' - - #define JS_CORR_NONE 0x00 /* returns raw values */ - #define JS_CORR_BROKEN 0x01 /* broken line */ - - -5. Backward compatibility -~~~~~~~~~~~~~~~~~~~~~~~~~ - -The 0.x joystick driver API is quite limited and its usage is deprecated. -The driver offers backward compatibility, though. Here's a quick summary: - - struct JS_DATA_TYPE js; - while (1) { - if (read (fd, &js, JS_RETURN) != JS_RETURN) { - /* error */ - } - usleep (1000); - } - -As you can figure out from the example, the read returns immediately, -with the actual state of the joystick. - - struct JS_DATA_TYPE { - int buttons; /* immediate button state */ - int x; /* immediate x axis value */ - int y; /* immediate y axis value */ - }; - -and JS_RETURN is defined as - - #define JS_RETURN sizeof(struct JS_DATA_TYPE) - -To test the state of the buttons, - - first_button_state = js.buttons & 1; - second_button_state = js.buttons & 2; - -The axis values do not have a defined range in the original 0.x driver, -except for that the values are non-negative. The 1.2.8+ drivers use a -fixed range for reporting the values, 1 being the minimum, 128 the -center, and 255 maximum value. - -The v0.8.0.2 driver also had an interface for 'digital joysticks', (now -called Multisystem joysticks in this driver), under /dev/djsX. This driver -doesn't try to be compatible with that interface. - - -6. Final Notes -~~~~~~~~~~~~~~ - -____/| Comments, additions, and specially corrections are welcome. 
-\ o.O| Documentation valid for at least version 1.2.8 of the joystick - =(_)= driver and as usual, the ultimate source for documentation is - U to "Use The Source Luke" or, at your convenience, Vojtech ;) - - - Ragnar -EOF diff --git a/Documentation/input/joystick-parport.txt b/Documentation/input/joystick-parport.txt deleted file mode 100644 index 56870c70a7965a1efbe2c628e8acd27d3664cc52..0000000000000000000000000000000000000000 --- a/Documentation/input/joystick-parport.txt +++ /dev/null @@ -1,542 +0,0 @@ - Linux Joystick parport drivers v2.0 - (c) 1998-2000 Vojtech Pavlik - (c) 1998 Andree Borrmann - Sponsored by SuSE ----------------------------------------------------------------------------- - -0. Disclaimer -~~~~~~~~~~~~~ - Any information in this file is provided as-is, without any guarantee that -it will be true. So, use it at your own risk. The possible damages that can -happen include burning your parallel port, and/or the sticks and joystick -and maybe even more. Like when a lightning kills you it is not our problem. - -1. Intro -~~~~~~~~ - The joystick parport drivers are used for joysticks and gamepads not -originally designed for PCs and other computers Linux runs on. Because of -that, PCs usually lack the right ports to connect these devices to. Parallel -port, because of its ability to change single bits at will, and providing -both output and input bits is the most suitable port on the PC for -connecting such devices. - -2. Devices supported -~~~~~~~~~~~~~~~~~~~~ - Many console and 8-bit computer gamepads and joysticks are supported. The -following subsections discuss usage of each. - -2.1 NES and SNES -~~~~~~~~~~~~~~~~ - The Nintendo Entertainment System and Super Nintendo Entertainment System -gamepads are widely available, and easy to get. Also, they are quite easy to -connect to a PC, and don't need much processing speed (108 us for NES and -165 us for SNES, compared to about 1000 us for PC gamepads) to communicate -with them. - - All NES and SNES use the same synchronous serial protocol, clocked from -the computer's side (and thus timing insensitive). To allow up to 5 NES -and/or SNES gamepads and/or SNES mice connected to the parallel port at once, -the output lines of the parallel port are shared, while one of 5 available -input lines is assigned to each gamepad. - - This protocol is handled by the gamecon.c driver, so that's the one -you'll use for NES, SNES gamepads and SNES mice. - - The main problem with PC parallel ports is that they don't have +5V power -source on any of their pins. So, if you want a reliable source of power -for your pads, use either keyboard or joystick port, and make a pass-through -cable. You can also pull the power directly from the power supply (the red -wire is +5V). - - If you want to use the parallel port only, you can take the power is from -some data pin. For most gamepad and parport implementations only one pin is -needed, and I'd recommend pin 9 for that, the highest data bit. On the other -hand, if you are not planning to use anything else than NES / SNES on the -port, anything between and including pin 4 and pin 9 will work. - -(pin 9) -----> Power - - Unfortunately, there are pads that need a lot more of power, and parallel -ports that can't give much current through the data pins. If this is your -case, you'll need to use diodes (as a prevention of destroying your parallel -port), and combine the currents of two or more data bits together. 
- - Diodes -(pin 9) ----|>|-------+------> Power - | -(pin 8) ----|>|-------+ - | -(pin 7) ----|>|-------+ - | - : - | -(pin 4) ----|>|-------+ - - Ground is quite easy. On PC's parallel port the ground is on any of the -pins from pin 18 to pin 25. So use any pin of these you like for the ground. - -(pin 18) -----> Ground - - NES and SNES pads have two input bits, Clock and Latch, which drive the -serial transfer. These are connected to pins 2 and 3 of the parallel port, -respectively. - -(pin 2) -----> Clock -(pin 3) -----> Latch - - And the last thing is the NES / SNES data wire. Only that isn't shared and -each pad needs its own data pin. The parallel port pins are: - -(pin 10) -----> Pad 1 data -(pin 11) -----> Pad 2 data -(pin 12) -----> Pad 3 data -(pin 13) -----> Pad 4 data -(pin 15) -----> Pad 5 data - - Note that pin 14 is not used, since it is not an input pin on the parallel -port. - - This is everything you need on the PC's side of the connection, now on to -the gamepads side. The NES and SNES have different connectors. Also, there -are quite a lot of NES clones, and because Nintendo used proprietary -connectors for their machines, the cloners couldn't and used standard D-Cannon -connectors. Anyway, if you've got a gamepad, and it has buttons A, B, Turbo -A, Turbo B, Select and Start, and is connected through 5 wires, then it is -either a NES or NES clone and will work with this connection. SNES gamepads -also use 5 wires, but have more buttons. They will work as well, of course. - -Pinout for NES gamepads Pinout for SNES gamepads and mice - - +----> Power +-----------------------\ - | 7 | o o o o | x x o | 1 - 5 +---------+ 7 +-----------------------/ - | x x o \ | | | | | - | o o o o | | | | | +-> Ground - 4 +------------+ 1 | | | +------------> Data - | | | | | | +---------------> Latch - | | | +-> Ground | +------------------> Clock - | | +----> Clock +---------------------> Power - | +-------> Latch - +----------> Data - -Pinout for NES clone (db9) gamepads Pinout for NES clone (db15) gamepads - - +---------> Clock +-----------------> Data - | +-------> Latch | +---> Ground - | | +-----> Data | | - | | | ___________________ - _____________ 8 \ o x x x x x x o / 1 - 5 \ x o o o x / 1 \ o x x o x x o / - \ x o x o / 15 `~~~~~~~~~~~~~' 9 - 9 `~~~~~~~' 6 | | | - | | | | +----> Clock - | +----> Power | +----------> Latch - +--------> Ground +----------------> Power - -2.2 Multisystem joysticks -~~~~~~~~~~~~~~~~~~~~~~~~~ - In the era of 8-bit machines, there was something like de-facto standard -for joystick ports. They were all digital, and all used D-Cannon 9 pin -connectors (db9). Because of that, a single joystick could be used without -hassle on Atari (130, 800XE, 800XL, 2600, 7200), Amiga, Commodore C64, -Amstrad CPC, Sinclair ZX Spectrum and many other machines. That's why these -joysticks are called "Multisystem". 
- - Now their pinout: - - +---------> Right - | +-------> Left - | | +-----> Down - | | | +---> Up - | | | | - _____________ -5 \ x o o o o / 1 - \ x o x o / - 9 `~~~~~~~' 6 - | | - | +----> Button - +--------> Ground - - However, as time passed, extensions to this standard developed, and these -were not compatible with each other: - - - Atari 130, 800/XL/XE MSX - - +-----------> Power - +---------> Right | +---------> Right - | +-------> Left | | +-------> Left - | | +-----> Down | | | +-----> Down - | | | +---> Up | | | | +---> Up - | | | | | | | | | - _____________ _____________ -5 \ x o o o o / 1 5 \ o o o o o / 1 - \ x o o o / \ o o o o / - 9 `~~~~~~~' 6 9 `~~~~~~~' 6 - | | | | | | | - | | +----> Button | | | +----> Button 1 - | +------> Power | | +------> Button 2 - +--------> Ground | +--------> Output 3 - +----------> Ground - - Amstrad CPC Commodore C64 - - +-----------> Analog Y - +---------> Right | +---------> Right - | +-------> Left | | +-------> Left - | | +-----> Down | | | +-----> Down - | | | +---> Up | | | | +---> Up - | | | | | | | | | - _____________ _____________ -5 \ x o o o o / 1 5 \ o o o o o / 1 - \ x o o o / \ o o o o / - 9 `~~~~~~~' 6 9 `~~~~~~~' 6 - | | | | | | | - | | +----> Button 1 | | | +----> Button - | +------> Button 2 | | +------> Power - +--------> Ground | +--------> Ground - +----------> Analog X - - Sinclair Spectrum +2A/+3 Amiga 1200 - - +-----------> Up +-----------> Button 3 - | +---------> Fire | +---------> Right - | | | | +-------> Left - | | +-----> Ground | | | +-----> Down - | | | | | | | +---> Up - | | | | | | | | - _____________ _____________ -5 \ o o x o x / 1 5 \ o o o o o / 1 - \ o o o o / \ o o o o / - 9 `~~~~~~~' 6 9 `~~~~~~~' 6 - | | | | | | | | - | | | +----> Right | | | +----> Button 1 - | | +------> Left | | +------> Power - | +--------> Ground | +--------> Ground - +----------> Down +----------> Button 2 - - And there were many others. - -2.2.1 Multisystem joysticks using db9.c -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - For the Multisystem joysticks, and their derivatives, the db9.c driver -was written. It allows only one joystick / gamepad per parallel port, but -the interface is easy to build and works with almost anything. - - For the basic 1-button Multisystem joystick you connect its wires to the -parallel port like this: - -(pin 1) -----> Power -(pin 18) -----> Ground - -(pin 2) -----> Up -(pin 3) -----> Down -(pin 4) -----> Left -(pin 5) -----> Right -(pin 6) -----> Button 1 - - However, if the joystick is switch based (eg. clicks when you move it), -you might or might not, depending on your parallel port, need 10 kOhm pullup -resistors on each of the direction and button signals, like this: - -(pin 2) ------------+------> Up - Resistor | -(pin 1) --[10kOhm]--+ - - Try without, and if it doesn't work, add them. For TTL based joysticks / -gamepads the pullups are not needed. - - For joysticks with two buttons you connect the second button to pin 7 on -the parallel port. - -(pin 7) -----> Button 2 - - And that's it. - - On a side note, if you have already built a different adapter for use with -the digital joystick driver 0.8.0.2, this is also supported by the db9.c -driver, as device type 8. (See section 3.2) - -2.2.2 Multisystem joysticks using gamecon.c -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - For some people just one joystick per parallel port is not enough, and/or -want to use them on one parallel port together with NES/SNES/PSX pads. This is -possible using the gamecon.c. 
It supports up to 5 devices of the above types, -including 1 and 2 buttons Multisystem joysticks. - - However, there is nothing for free. To allow more sticks to be used at -once, you need the sticks to be purely switch based (that is non-TTL), and -not to need power. Just a plain simple six switches inside. If your -joystick can do more (eg. turbofire) you'll need to disable it totally first -if you want to use gamecon.c. - - Also, the connection is a bit more complex. You'll need a bunch of diodes, -and one pullup resistor. First, you connect the Directions and the button -the same as for db9, however with the diodes between. - - Diodes -(pin 2) -----|<|----> Up -(pin 3) -----|<|----> Down -(pin 4) -----|<|----> Left -(pin 5) -----|<|----> Right -(pin 6) -----|<|----> Button 1 - - For two button sticks you also connect the other button. - -(pin 7) -----|<|----> Button 2 - - And finally, you connect the Ground wire of the joystick, like done in -this little schematic to Power and Data on the parallel port, as described -for the NES / SNES pads in section 2.1 of this file - that is, one data pin -for each joystick. The power source is shared. - -Data ------------+-----> Ground - Resistor | -Power --[10kOhm]--+ - - And that's all, here we go! - -2.2.3 Multisystem joysticks using turbografx.c -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - The TurboGraFX interface, designed by - - Steffen Schwenke - - allows up to 7 Multisystem joysticks connected to the parallel port. In -Steffen's version, there is support for up to 5 buttons per joystick. However, -since this doesn't work reliably on all parallel ports, the turbografx.c driver -supports only one button per joystick. For more information on how to build the -interface, see - - http://www2.burg-halle.de/~schwenke/parport.html - -2.3 Sony Playstation -~~~~~~~~~~~~~~~~~~~~ - - The PSX controller is supported by the gamecon.c. Pinout of the PSX -controller (compatible with DirectPadPro): - - +---------+---------+---------+ -9 | o o o | o o o | o o o | 1 parallel - \________|_________|________/ port pins - | | | | | | - | | | | | +--------> Clock --- (4) - | | | | +------------> Select --- (3) - | | | +---------------> Power --- (5-9) - | | +------------------> Ground --- (18-25) - | +-------------------------> Command --- (2) - +----------------------------> Data --- (one of 10,11,12,13,15) - - The driver supports these controllers: - - * Standard PSX Pad - * NegCon PSX Pad - * Analog PSX Pad (red mode) - * Analog PSX Pad (green mode) - * PSX Rumble Pad - * PSX DDR Pad - -2.4 Sega -~~~~~~~~ - All the Sega controllers are more or less based on the standard 2-button -Multisystem joystick. However, since they don't use switches and use TTL -logic, the only driver usable with them is the db9.c driver. - -2.4.1 Sega Master System -~~~~~~~~~~~~~~~~~~~~~~~~ - The SMS gamepads are almost exactly the same as normal 2-button -Multisystem joysticks. Set the driver to Multi2 mode, use the corresponding -parallel port pins, and the following schematic: - - +-----------> Power - | +---------> Right - | | +-------> Left - | | | +-----> Down - | | | | +---> Up - | | | | | - _____________ -5 \ o o o o o / 1 - \ o o x o / - 9 `~~~~~~~' 6 - | | | - | | +----> Button 1 - | +--------> Ground - +----------> Button 2 - -2.4.2 Sega Genesis aka MegaDrive -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - The Sega Genesis (in Europe sold as Sega MegaDrive) pads are an extension -to the Sega Master System pads. They use more buttons (3+1, 5+1, 6+1). 
Use -the following schematic: - - +-----------> Power - | +---------> Right - | | +-------> Left - | | | +-----> Down - | | | | +---> Up - | | | | | - _____________ -5 \ o o o o o / 1 - \ o o o o / - 9 `~~~~~~~' 6 - | | | | - | | | +----> Button 1 - | | +------> Select - | +--------> Ground - +----------> Button 2 - - The Select pin goes to pin 14 on the parallel port. - -(pin 14) -----> Select - - The rest is the same as for Multi2 joysticks using db9.c - -2.4.3 Sega Saturn -~~~~~~~~~~~~~~~~~ - Sega Saturn has eight buttons, and to transfer that, without hacks like -Genesis 6 pads use, it needs one more select pin. Anyway, it is still -handled by the db9.c driver. Its pinout is very different from anything -else. Use this schematic: - - +-----------> Select 1 - | +---------> Power - | | +-------> Up - | | | +-----> Down - | | | | +---> Ground - | | | | | - _____________ -5 \ o o o o o / 1 - \ o o o o / - 9 `~~~~~~~' 6 - | | | | - | | | +----> Select 2 - | | +------> Right - | +--------> Left - +----------> Power - - Select 1 is pin 14 on the parallel port, Select 2 is pin 16 on the -parallel port. - -(pin 14) -----> Select 1 -(pin 16) -----> Select 2 - - The other pins (Up, Down, Right, Left, Power, Ground) are the same as for -Multi joysticks using db9.c - -3. The drivers -~~~~~~~~~~~~~~ - There are three drivers for the parallel port interfaces. Each, as -described above, allows to connect a different group of joysticks and pads. -Here are described their command lines: - -3.1 gamecon.c -~~~~~~~~~~~~~ - Using gamecon.c you can connect up to five devices to one parallel port. It -uses the following kernel/module command line: - - gamecon.map=port,pad1,pad2,pad3,pad4,pad5 - - Where 'port' the number of the parport interface (eg. 0 for parport0). - - And 'pad1' to 'pad5' are pad types connected to different data input pins -(10,11,12,13,15), as described in section 2.1 of this file. - - The types are: - - Type | Joystick/Pad - -------------------- - 0 | None - 1 | SNES pad - 2 | NES pad - 4 | Multisystem 1-button joystick - 5 | Multisystem 2-button joystick - 6 | N64 pad - 7 | Sony PSX controller - 8 | Sony PSX DDR controller - 9 | SNES mouse - - The exact type of the PSX controller type is autoprobed when used, so -hot swapping should work (but is not recommended). - - Should you want to use more than one of parallel ports at once, you can use -gamecon.map2 and gamecon.map3 as additional command line parameters for two -more parallel ports. - - There are two options specific to PSX driver portion. gamecon.psx_delay sets -the command delay when talking to the controllers. The default of 25 should -work but you can try lowering it for better performance. If your pads don't -respond try raising it until they work. Setting the type to 8 allows the -driver to be used with Dance Dance Revolution or similar games. Arrow keys are -registered as key presses instead of X and Y axes. - -3.2 db9.c -~~~~~~~~~ - Apart from making an interface, there is nothing difficult on using the -db9.c driver. It uses the following kernel/module command line: - - db9.dev=port,type - - Where 'port' is the number of the parport interface (eg. 0 for parport0). - - Caveat here: This driver only works on bidirectional parallel ports. If -your parallel port is recent enough, you should have no trouble with this. -Old parallel ports may not have this feature. 
- - 'Type' is the type of joystick or pad attached: - - Type | Joystick/Pad - -------------------- - 0 | None - 1 | Multisystem 1-button joystick - 2 | Multisystem 2-button joystick - 3 | Genesis pad (3+1 buttons) - 5 | Genesis pad (5+1 buttons) - 6 | Genesis pad (6+2 buttons) - 7 | Saturn pad (8 buttons) - 8 | Multisystem 1-button joystick (v0.8.0.2 pin-out) - 9 | Two Multisystem 1-button joysticks (v0.8.0.2 pin-out) - 10 | Amiga CD32 pad - - Should you want to use more than one of these joysticks/pads at once, you -can use db9.dev2 and db9.dev3 as additional command line parameters for two -more joysticks/pads. - -3.3 turbografx.c -~~~~~~~~~~~~~~~~ - The turbografx.c driver uses a very simple kernel/module command line: - - turbografx.map=port,js1,js2,js3,js4,js5,js6,js7 - - Where 'port' is the number of the parport interface (eg. 0 for parport0). - - 'jsX' is the number of buttons the Multisystem joysticks connected to the -interface ports 1-7 have. For a standard multisystem joystick, this is 1. - - Should you want to use more than one of these interfaces at once, you can -use turbografx.map2 and turbografx.map3 as additional command line parameters -for two more interfaces. - -3.4 PC parallel port pinout -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - .----------------------------------------. - At the PC: \ 13 12 11 10 9 8 7 6 5 4 3 2 1 / - \ 25 24 23 22 21 20 19 18 17 16 15 14 / - ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - Pin | Name | Description - ~~~~~~|~~~~~~~~~|~~~~~~~~~~ - 1 | /STROBE | Strobe - 2-9 | D0-D7 | Data Bit 0-7 - 10 | /ACK | Acknowledge - 11 | BUSY | Busy - 12 | PE | Paper End - 13 | SELIN | Select In - 14 | /AUTOFD | Autofeed - 15 | /ERROR | Error - 16 | /INIT | Initialize - 17 | /SEL | Select - 18-25 | GND | Signal Ground - -3.5 End -~~~~~~~ - That's all, folks! Have fun! diff --git a/Documentation/input/joystick.txt b/Documentation/input/joystick.txt deleted file mode 100644 index 8d027dc86c1f001efa3576ab4b41363aea031af8..0000000000000000000000000000000000000000 --- a/Documentation/input/joystick.txt +++ /dev/null @@ -1,586 +0,0 @@ - Linux Joystick driver v2.0.0 - (c) 1996-2000 Vojtech Pavlik - Sponsored by SuSE ----------------------------------------------------------------------------- - -0. Disclaimer -~~~~~~~~~~~~~ - This program is free software; you can redistribute it and/or modify it -under the terms of the GNU General Public License as published by the Free -Software Foundation; either version 2 of the License, or (at your option) -any later version. - - This program is distributed in the hope that it will be useful, but -WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY -or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for -more details. - - You should have received a copy of the GNU General Public License along -with this program; if not, write to the Free Software Foundation, Inc., 59 -Temple Place, Suite 330, Boston, MA 02111-1307 USA - - Should you need to contact me, the author, you can do so either by e-mail -- mail your message to , or by paper mail: Vojtech Pavlik, -Simunkova 1594, Prague 8, 182 00 Czech Republic - - For your convenience, the GNU General Public License version 2 is included -in the package: See the file COPYING. - -1. Intro -~~~~~~~~ - The joystick driver for Linux provides support for a variety of joysticks -and similar devices. It is based on a larger project aiming to support all -input devices in Linux. 
- - Should you encounter any problems while using the driver, or joysticks -this driver can't make complete use of, I'm very interested in hearing about -them. Bug reports and success stories are also welcome. - - The input project website is at: - - http://atrey.karlin.mff.cuni.cz/~vojtech/input/ - - There is also a mailing list for the driver at: - - listproc@atrey.karlin.mff.cuni.cz - -send "subscribe linux-joystick Your Name" to subscribe to it. - -2. Usage -~~~~~~~~ - For basic usage you just choose the right options in kernel config and -you should be set. - -2.1 inpututils -~~~~~~~~~~~~~~ -For testing and other purposes (for example serial devices), a set of -utilities is available at the abovementioned website. I suggest you download -and install it before going on. - -2.2 Device nodes -~~~~~~~~~~~~~~~~ -For applications to be able to use the joysticks, -you'll have to manually create these nodes in /dev: - -cd /dev -rm js* -mkdir input -mknod input/js0 c 13 0 -mknod input/js1 c 13 1 -mknod input/js2 c 13 2 -mknod input/js3 c 13 3 -ln -s input/js0 js0 -ln -s input/js1 js1 -ln -s input/js2 js2 -ln -s input/js3 js3 - -For testing with inpututils it's also convenient to create these: - -mknod input/event0 c 13 64 -mknod input/event1 c 13 65 -mknod input/event2 c 13 66 -mknod input/event3 c 13 67 - -2.4 Modules needed -~~~~~~~~~~~~~~~~~~ - For all joystick drivers to function, you'll need the userland interface -module in kernel, either loaded or compiled in: - - modprobe joydev - - For gameport joysticks, you'll have to load the gameport driver as well; - - modprobe ns558 - - And for serial port joysticks, you'll need the serial input line -discipline module loaded and the inputattach utility started: - - modprobe serport - inputattach -xxx /dev/tts/X & - - In addition to that, you'll need the joystick driver module itself, most -usually you'll have an analog joystick: - - modprobe analog - - For automatic module loading, something like this might work - tailor to -your needs: - - alias tty-ldisc-2 serport - alias char-major-13 input - above input joydev ns558 analog - options analog map=gamepad,none,2btn - -2.5 Verifying that it works -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - For testing the joystick driver functionality, there is the jstest -program in the utilities package. You run it by typing: - - jstest /dev/input/js0 - - And it should show a line with the joystick values, which update as you -move the stick, and press its buttons. The axes should all be zero when the -joystick is in the center position. They should not jitter by themselves to -other close values, and they also should be steady in any other position of -the stick. They should have the full range from -32767 to 32767. If all this -is met, then it's all fine, and you can play the games. :) - - If it's not, then there might be a problem. Try to calibrate the joystick, -and if it still doesn't work, read the drivers section of this file, the -troubleshooting section, and the FAQ. - -2.6. Calibration -~~~~~~~~~~~~~~~~ - For most joysticks you won't need any manual calibration, since the -joystick should be autocalibrated by the driver automagically. However, with -some analog joysticks, that either do not use linear resistors, or if you -want better precision, you can use the jscal program - - jscal -c /dev/input/js0 - - included in the joystick package to set better correction coefficients than -what the driver would choose itself. 
- - After calibrating the joystick you can verify if you like the new -calibration using the jstest command, and if you do, you then can save the -correction coefficients into a file - - jscal -p /dev/input/js0 > /etc/joystick.cal - - And add a line to your rc script executing that file - - source /etc/joystick.cal - - This way, after the next reboot your joystick will remain calibrated. You -can also add the jscal -p line to your shutdown script. - - -3. HW specific driver information -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -In this section each of the separate hardware specific drivers is described. - -3.1 Analog joysticks -~~~~~~~~~~~~~~~~~~~~ - The analog.c uses the standard analog inputs of the gameport, and thus -supports all standard joysticks and gamepads. It uses a very advanced -routine for this, allowing for data precision that can't be found on any -other system. - - It also supports extensions like additional hats and buttons compatible -with CH Flightstick Pro, ThrustMaster FCS or 6 and 8 button gamepads. Saitek -Cyborg 'digital' joysticks are also supported by this driver, because -they're basically souped up CHF sticks. - - However the only types that can be autodetected are: - -* 2-axis, 4-button joystick -* 3-axis, 4-button joystick -* 4-axis, 4-button joystick -* Saitek Cyborg 'digital' joysticks - - For other joystick types (more/less axes, hats, and buttons) support -you'll need to specify the types either on the kernel command line or on the -module command line, when inserting analog into the kernel. The -parameters are: - - analog.map=,,,.... - - 'type' is type of the joystick from the table below, defining joysticks -present on gameports in the system, starting with gameport0, second 'type' -entry defining joystick on gameport1 and so on. - - Type | Meaning - ----------------------------------- - none | No analog joystick on that port - auto | Autodetect joystick - 2btn | 2-button n-axis joystick - y-joy | Two 2-button 2-axis joysticks on an Y-cable - y-pad | Two 2-button 2-axis gamepads on an Y-cable - fcs | Thrustmaster FCS compatible joystick - chf | Joystick with a CH Flightstick compatible hat - fullchf | CH Flightstick compatible with two hats and 6 buttons - gamepad | 4/6-button n-axis gamepad - gamepad8 | 8-button 2-axis gamepad - - In case your joystick doesn't fit in any of the above categories, you can -specify the type as a number by combining the bits in the table below. This -is not recommended unless you really know what are you doing. It's not -dangerous, but not simple either. - - Bit | Meaning - -------------------------- - 0 | Axis X1 - 1 | Axis Y1 - 2 | Axis X2 - 3 | Axis Y2 - 4 | Button A - 5 | Button B - 6 | Button C - 7 | Button D - 8 | CHF Buttons X and Y - 9 | CHF Hat 1 - 10 | CHF Hat 2 - 11 | FCS Hat - 12 | Pad Button X - 13 | Pad Button Y - 14 | Pad Button U - 15 | Pad Button V - 16 | Saitek F1-F4 Buttons - 17 | Saitek Digital Mode - 19 | GamePad - 20 | Joy2 Axis X1 - 21 | Joy2 Axis Y1 - 22 | Joy2 Axis X2 - 23 | Joy2 Axis Y2 - 24 | Joy2 Button A - 25 | Joy2 Button B - 26 | Joy2 Button C - 27 | Joy2 Button D - 31 | Joy2 GamePad - -3.2 Microsoft SideWinder joysticks -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Microsoft 'Digital Overdrive' protocol is supported by the sidewinder.c -module. 
All currently supported joysticks: - -* Microsoft SideWinder 3D Pro -* Microsoft SideWinder Force Feedback Pro -* Microsoft SideWinder Force Feedback Wheel -* Microsoft SideWinder FreeStyle Pro -* Microsoft SideWinder GamePad (up to four, chained) -* Microsoft SideWinder Precision Pro -* Microsoft SideWinder Precision Pro USB - - are autodetected, and thus no module parameters are needed. - - There is one caveat with the 3D Pro. There are 9 buttons reported, -although the joystick has only 8. The 9th button is the mode switch on the -rear side of the joystick. However, moving it, you'll reset the joystick, -and make it unresponsive for about a one third of a second. Furthermore, the -joystick will also re-center itself, taking the position it was in during -this time as a new center position. Use it if you want, but think first. - - The SideWinder Standard is not a digital joystick, and thus is supported -by the analog driver described above. - -3.3 Logitech ADI devices -~~~~~~~~~~~~~~~~~~~~~~~~ - Logitech ADI protocol is supported by the adi.c module. It should support -any Logitech device using this protocol. This includes, but is not limited -to: - -* Logitech CyberMan 2 -* Logitech ThunderPad Digital -* Logitech WingMan Extreme Digital -* Logitech WingMan Formula -* Logitech WingMan Interceptor -* Logitech WingMan GamePad -* Logitech WingMan GamePad USB -* Logitech WingMan GamePad Extreme -* Logitech WingMan Extreme Digital 3D - - ADI devices are autodetected, and the driver supports up to two (any -combination of) devices on a single gameport, using an Y-cable or chained -together. - - Logitech WingMan Joystick, Logitech WingMan Attack, Logitech WingMan -Extreme and Logitech WingMan ThunderPad are not digital joysticks and are -handled by the analog driver described above. Logitech WingMan Warrior and -Logitech Magellan are supported by serial drivers described below. Logitech -WingMan Force and Logitech WingMan Formula Force are supported by the -I-Force driver described below. Logitech CyberMan is not supported yet. - -3.4 Gravis GrIP -~~~~~~~~~~~~~~~ - Gravis GrIP protocol is supported by the grip.c module. It currently -supports: - -* Gravis GamePad Pro -* Gravis BlackHawk Digital -* Gravis Xterminator -* Gravis Xterminator DualControl - - All these devices are autodetected, and you can even use any combination -of up to two of these pads either chained together or using an Y-cable on a -single gameport. - -GrIP MultiPort isn't supported yet. Gravis Stinger is a serial device and is -supported by the stinger driver. Other Gravis joysticks are supported by the -analog driver. - -3.5 FPGaming A3D and MadCatz A3D -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - The Assassin 3D protocol created by FPGaming, is used both by FPGaming -themselves and is licensed to MadCatz. A3D devices are supported by the -a3d.c module. It currently supports: - -* FPGaming Assassin 3D -* MadCatz Panther -* MadCatz Panther XL - - All these devices are autodetected. Because the Assassin 3D and the Panther -allow connecting analog joysticks to them, you'll need to load the analog -driver as well to handle the attached joysticks. - - The trackball should work with USB mousedev module as a normal mouse. See -the USB documentation for how to setup an USB mouse. - -3.6 ThrustMaster DirectConnect (BSP) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - The TM DirectConnect (BSP) protocol is supported by the tmdc.c -module. 
This includes, but is not limited to: - -* ThrustMaster Millennium 3D Interceptor -* ThrustMaster 3D Rage Pad -* ThrustMaster Fusion Digital Game Pad - - Devices not directly supported, but hopefully working are: - -* ThrustMaster FragMaster -* ThrustMaster Attack Throttle - - If you have one of these, contact me. - - TMDC devices are autodetected, and thus no parameters to the module -are needed. Up to two TMDC devices can be connected to one gameport, using -an Y-cable. - -3.7 Creative Labs Blaster -~~~~~~~~~~~~~~~~~~~~~~~~~ - The Blaster protocol is supported by the cobra.c module. It supports only -the: - -* Creative Blaster GamePad Cobra - - Up to two of these can be used on a single gameport, using an Y-cable. - -3.8 Genius Digital joysticks -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - The Genius digitally communicating joysticks are supported by the gf2k.c -module. This includes: - -* Genius Flight2000 F-23 joystick -* Genius Flight2000 F-31 joystick -* Genius G-09D gamepad - - Other Genius digital joysticks are not supported yet, but support can be -added fairly easily. - -3.9 InterAct Digital joysticks -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - The InterAct digitally communicating joysticks are supported by the -interact.c module. This includes: - -* InterAct HammerHead/FX gamepad -* InterAct ProPad8 gamepad - - Other InterAct digital joysticks are not supported yet, but support can be -added fairly easily. - -3.10 PDPI Lightning 4 gamecards -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - PDPI Lightning 4 gamecards are supported by the lightning.c module. -Once the module is loaded, the analog driver can be used to handle the -joysticks. Digitally communicating joystick will work only on port 0, while -using Y-cables, you can connect up to 8 analog joysticks to a single L4 -card, 16 in case you have two in your system. - -3.11 Trident 4DWave / Aureal Vortex -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Soundcards with a Trident 4DWave DX/NX or Aureal Vortex/Vortex2 chipsets -provide an "Enhanced Game Port" mode where the soundcard handles polling the -joystick. This mode is supported by the pcigame.c module. Once loaded the -analog driver can use the enhanced features of these gameports.. - -3.13 Crystal SoundFusion -~~~~~~~~~~~~~~~~~~~~~~~~ - Soundcards with Crystal SoundFusion chipsets provide an "Enhanced Game -Port", much like the 4DWave or Vortex above. This, and also the normal mode -for the port of the SoundFusion is supported by the cs461x.c module. - -3.14 SoundBlaster Live! -~~~~~~~~~~~~~~~~~~~~~~~~ - The Live! has a special PCI gameport, which, although it doesn't provide -any "Enhanced" stuff like 4DWave and friends, is quite a bit faster than -its ISA counterparts. It also requires special support, hence the -emu10k1-gp.c module for it instead of the normal ns558.c one. - -3.15 SoundBlaster 64 and 128 - ES1370 and ES1371, ESS Solo1 and S3 SonicVibes -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - These PCI soundcards have specific gameports. They are handled by the -sound drivers themselves. Make sure you select gameport support in the -joystick menu and sound card support in the sound menu for your appropriate -card. - -3.16 Amiga -~~~~~~~~~~ - Amiga joysticks, connected to an Amiga, are supported by the amijoy.c -driver. Since they can't be autodetected, the driver has a command line. - - amijoy.map=, - - a and b define the joysticks connected to the JOY0DAT and JOY1DAT ports of -the Amiga. 
- - Value | Joystick type - --------------------- - 0 | None - 1 | 1-button digital joystick - - No more joystick types are supported now, but that should change in the -future if I get an Amiga in the reach of my fingers. - -3.17 Game console and 8-bit pads and joysticks -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -See joystick-parport.txt for more info. - -3.18 SpaceTec/LabTec devices -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - SpaceTec serial devices communicate using the SpaceWare protocol. It is -supported by the spaceorb.c and spaceball.c drivers. The devices currently -supported by spaceorb.c are: - -* SpaceTec SpaceBall Avenger -* SpaceTec SpaceOrb 360 - -Devices currently supported by spaceball.c are: - -* SpaceTec SpaceBall 4000 FLX - - In addition to having the spaceorb/spaceball and serport modules in the -kernel, you also need to attach a serial port to it. to do that, run the -inputattach program: - - inputattach --spaceorb /dev/tts/x & -or - inputattach --spaceball /dev/tts/x & - -where /dev/tts/x is the serial port which the device is connected to. After -doing this, the device will be reported and will start working. - - There is one caveat with the SpaceOrb. The button #6, the on the bottom -side of the orb, although reported as an ordinary button, causes internal -recentering of the spaceorb, moving the zero point to the position in which -the ball is at the moment of pressing the button. So, think first before -you bind it to some other function. - -SpaceTec SpaceBall 2003 FLX and 3003 FLX are not supported yet. - -3.19 Logitech SWIFT devices -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - The SWIFT serial protocol is supported by the warrior.c module. It -currently supports only the: - -* Logitech WingMan Warrior - -but in the future, Logitech CyberMan (the original one, not CM2) could be -supported as well. To use the module, you need to run inputattach after you -insert/compile the module into your kernel: - - inputattach --warrior /dev/tts/x & - -/dev/tts/x is the serial port your Warrior is attached to. - -3.20 Magellan / Space Mouse -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - The Magellan (or Space Mouse), manufactured by LogiCad3d (formerly Space -Systems), for many other companies (Logitech, HP, ...) is supported by the -joy-magellan module. It currently supports only the: - -* Magellan 3D -* Space Mouse - -models, the additional buttons on the 'Plus' versions are not supported yet. - - To use it, you need to attach the serial port to the driver using the - - inputattach --magellan /dev/tts/x & - -command. After that the Magellan will be detected, initialized, will beep, -and the /dev/input/jsX device should become usable. - -3.21 I-Force devices -~~~~~~~~~~~~~~~~~~~~ - All I-Force devices are supported by the iforce module. This includes: - -* AVB Mag Turbo Force -* AVB Top Shot Pegasus -* AVB Top Shot Force Feedback Racing Wheel -* Logitech WingMan Force -* Logitech WingMan Force Wheel -* Guillemot Race Leader Force Feedback -* Guillemot Force Feedback Racing Wheel -* Thrustmaster Motor Sport GT - - To use it, you need to attach the serial port to the driver using the - - inputattach --iforce /dev/tts/x & - -command. After that the I-Force device will be detected, and the -/dev/input/jsX device should become usable. - - In case you're using the device via the USB port, the inputattach command -isn't needed. - - The I-Force driver now supports force feedback via the event interface. - - Please note that Logitech WingMan *3D devices are _not_ supported by this -module, rather by hid. 
Force feedback is not supported for those devices. -Logitech gamepads are also hid devices. - -3.22 Gravis Stinger gamepad -~~~~~~~~~~~~~~~~~~~~~~~~~~~ - The Gravis Stinger serial port gamepad, designed for use with laptop -computers, is supported by the stinger.c module. To use it, attach the -serial port to the driver using: - - inputattach --stinger /dev/tty/x & - -where x is the number of the serial port. - -4. Troubleshooting -~~~~~~~~~~~~~~~~~~ - There is quite a high probability that you run into some problems. For -testing whether the driver works, if in doubt, use the jstest utility in -some of its modes. The most useful modes are "normal" - for the 1.x -interface, and "old" for the "0.x" interface. You run it by typing: - - jstest --normal /dev/input/js0 - jstest --old /dev/input/js0 - - Additionally you can do a test with the evtest utility: - - evtest /dev/input/event0 - - Oh, and read the FAQ! :) - -5. FAQ -~~~~~~ -Q: Running 'jstest /dev/input/js0' results in "File not found" error. What's the - cause? -A: The device files don't exist. Create them (see section 2.2). - -Q: Is it possible to connect my old Atari/Commodore/Amiga/console joystick - or pad that uses a 9-pin D-type cannon connector to the serial port of my - PC? -A: Yes, it is possible, but it'll burn your serial port or the pad. It - won't work, of course. - -Q: My joystick doesn't work with Quake / Quake 2. What's the cause? -A: Quake / Quake 2 don't support joystick. Use joy2key to simulate keypresses - for them. - -6. Programming Interface -~~~~~~~~~~~~~~~~~~~~~~~~ - The 1.0 driver uses a new, event based approach to the joystick driver. -Instead of the user program polling for the joystick values, the joystick -driver now reports only any changes of its state. See joystick-api.txt, -joystick.h and jstest.c included in the joystick package for more -information. The joystick device can be used in either blocking or -nonblocking mode and supports select() calls. - - For backward compatibility the old (v0.x) interface is still included. -Any call to the joystick driver using the old interface will return values -that are compatible to the old interface. This interface is still limited -to 2 axes, and applications using it usually decode only 2 buttons, although -the driver provides up to 32. diff --git a/Documentation/input/multi-touch-protocol.rst b/Documentation/input/multi-touch-protocol.rst new file mode 100644 index 0000000000000000000000000000000000000000..8035868c56bc651a1d57af01f578a70a211e59d2 --- /dev/null +++ b/Documentation/input/multi-touch-protocol.rst @@ -0,0 +1,410 @@ +.. include:: + +========================= +Multi-touch (MT) Protocol +========================= + +:Copyright: |copy| 2009-2010 Henrik Rydberg + + +Introduction +------------ + +In order to utilize the full power of the new multi-touch and multi-user +devices, a way to report detailed data from multiple contacts, i.e., +objects in direct contact with the device surface, is needed. This +document describes the multi-touch (MT) protocol which allows kernel +drivers to report details for an arbitrary number of contacts. + +The protocol is divided into two types, depending on the capabilities of the +hardware. For devices handling anonymous contacts (type A), the protocol +describes how to send the raw data for all contacts to the receiver. For +devices capable of tracking identifiable contacts (type B), the protocol +describes how to send updates for individual contacts via event slots. + +.. 
note::
+   MT protocol type A is obsolete; all kernel drivers have been
+   converted to use type B.
+
+Protocol Usage
+--------------
+
+Contact details are sent sequentially as separate packets of ABS_MT
+events. Only the ABS_MT events are recognized as part of a contact
+packet. Since these events are ignored by current single-touch (ST)
+applications, the MT protocol can be implemented on top of the ST protocol
+in an existing driver.
+
+Drivers for type A devices separate contact packets by calling
+input_mt_sync() at the end of each packet. This generates a SYN_MT_REPORT
+event, which instructs the receiver to accept the data for the current
+contact and prepare to receive another.
+
+Drivers for type B devices separate contact packets by calling
+input_mt_slot(), with a slot as argument, at the beginning of each packet.
+This generates an ABS_MT_SLOT event, which instructs the receiver to
+prepare for updates of the given slot.
+
+All drivers mark the end of a multi-touch transfer by calling the usual
+input_sync() function. This instructs the receiver to act upon events
+accumulated since the last EV_SYN/SYN_REPORT and prepare to receive a new
+set of events/packets.
+
+The main difference between the stateless type A protocol and the stateful
+type B slot protocol lies in the usage of identifiable contacts to reduce
+the amount of data sent to userspace. The slot protocol requires the use of
+the ABS_MT_TRACKING_ID, either provided by the hardware or computed from
+the raw data [#f5]_.
+
+For type A devices, the kernel driver should generate an arbitrary
+enumeration of the full set of anonymous contacts currently on the
+surface. The order in which the packets appear in the event stream is not
+important. Event filtering and finger tracking are left to user space [#f3]_.
+
+For type B devices, the kernel driver should associate a slot with each
+identified contact, and use that slot to propagate changes for the contact.
+Creation, replacement and destruction of contacts are achieved by modifying
+the ABS_MT_TRACKING_ID of the associated slot. A non-negative tracking id
+is interpreted as a contact, and the value -1 denotes an unused slot. A
+tracking id not previously present is considered new, and a tracking id no
+longer present is considered removed. Since only changes are propagated,
+the full state of each initiated contact has to reside in the receiving
+end. Upon receiving an MT event, one simply updates the appropriate
+attribute of the current slot.
+
+Some devices identify and/or track more contacts than they can report to the
+driver. A driver for such a device should associate one type B slot with each
+contact that is reported by the hardware. Whenever the identity of the
+contact associated with a slot changes, the driver should invalidate that
+slot by changing its ABS_MT_TRACKING_ID. If the hardware signals that it is
+tracking more contacts than it is currently reporting, the driver should use
+a BTN_TOOL_*TAP event to inform userspace of the total number of contacts
+being tracked by the hardware at that moment. The driver should do this by
+explicitly sending the corresponding BTN_TOOL_*TAP event and setting
+use_count to false when calling input_mt_report_pointer_emulation().
+The driver should only advertise as many slots as the hardware can report.
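+
+For example (a sketch, not taken from any particular driver: ``hw_fingers``
+stands in for however the driver keeps the hardware's total contact count,
+while the calls themselves are the regular input core API), such a driver
+might end its report path with::
+
+    /* hardware tracks up to five contacts, not all of which have slots */
+    input_report_key(dev, BTN_TOOL_FINGER,    hw_fingers == 1);
+    input_report_key(dev, BTN_TOOL_DOUBLETAP, hw_fingers == 2);
+    input_report_key(dev, BTN_TOOL_TRIPLETAP, hw_fingers == 3);
+    input_report_key(dev, BTN_TOOL_QUADTAP,   hw_fingers == 4);
+    input_report_key(dev, BTN_TOOL_QUINTTAP,  hw_fingers == 5);
+
+    /* use_count == false keeps the explicit counts reported above */
+    input_mt_report_pointer_emulation(dev, false);
+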
+Userspace can detect that a driver can report more total contacts than slots +by noting that the largest supported BTN_TOOL_*TAP event is larger than the +total number of type B slots reported in the absinfo for the ABS_MT_SLOT axis. + +The minimum value of the ABS_MT_SLOT axis must be 0. + +Protocol Example A +------------------ + +Here is what a minimal event sequence for a two-contact touch would look +like for a type A device:: + + ABS_MT_POSITION_X x[0] + ABS_MT_POSITION_Y y[0] + SYN_MT_REPORT + ABS_MT_POSITION_X x[1] + ABS_MT_POSITION_Y y[1] + SYN_MT_REPORT + SYN_REPORT + +The sequence after moving one of the contacts looks exactly the same; the +raw data for all present contacts are sent between every synchronization +with SYN_REPORT. + +Here is the sequence after lifting the first contact:: + + ABS_MT_POSITION_X x[1] + ABS_MT_POSITION_Y y[1] + SYN_MT_REPORT + SYN_REPORT + +And here is the sequence after lifting the second contact:: + + SYN_MT_REPORT + SYN_REPORT + +If the driver reports one of BTN_TOUCH or ABS_PRESSURE in addition to the +ABS_MT events, the last SYN_MT_REPORT event may be omitted. Otherwise, the +last SYN_REPORT will be dropped by the input core, resulting in no +zero-contact event reaching userland. + + +Protocol Example B +------------------ + +Here is what a minimal event sequence for a two-contact touch would look +like for a type B device:: + + ABS_MT_SLOT 0 + ABS_MT_TRACKING_ID 45 + ABS_MT_POSITION_X x[0] + ABS_MT_POSITION_Y y[0] + ABS_MT_SLOT 1 + ABS_MT_TRACKING_ID 46 + ABS_MT_POSITION_X x[1] + ABS_MT_POSITION_Y y[1] + SYN_REPORT + +Here is the sequence after moving contact 45 in the x direction:: + + ABS_MT_SLOT 0 + ABS_MT_POSITION_X x[0] + SYN_REPORT + +Here is the sequence after lifting the contact in slot 0:: + + ABS_MT_TRACKING_ID -1 + SYN_REPORT + +The slot being modified is already 0, so the ABS_MT_SLOT is omitted. The +message removes the association of slot 0 with contact 45, thereby +destroying contact 45 and freeing slot 0 to be reused for another contact. + +Finally, here is the sequence after lifting the second contact:: + + ABS_MT_SLOT 1 + ABS_MT_TRACKING_ID -1 + SYN_REPORT + + +Event Usage +----------- + +A set of ABS_MT events with the desired properties is defined. The events +are divided into categories, to allow for partial implementation. The +minimum set consists of ABS_MT_POSITION_X and ABS_MT_POSITION_Y, which +allows for multiple contacts to be tracked. If the device supports it, the +ABS_MT_TOUCH_MAJOR and ABS_MT_WIDTH_MAJOR may be used to provide the size +of the contact area and approaching tool, respectively. + +The TOUCH and WIDTH parameters have a geometrical interpretation; imagine +looking through a window at someone gently holding a finger against the +glass. You will see two regions, one inner region consisting of the part +of the finger actually touching the glass, and one outer region formed by +the perimeter of the finger. The center of the touching region (a) is +ABS_MT_POSITION_X/Y and the center of the approaching finger (b) is +ABS_MT_TOOL_X/Y. The touch diameter is ABS_MT_TOUCH_MAJOR and the finger +diameter is ABS_MT_WIDTH_MAJOR. Now imagine the person pressing the finger +harder against the glass. The touch region will increase, and in general, +the ratio ABS_MT_TOUCH_MAJOR / ABS_MT_WIDTH_MAJOR, which is always smaller +than unity, is related to the contact pressure. For pressure-based devices, +ABS_MT_PRESSURE may be used to provide the pressure on the contact area +instead. 
Devices capable of contact hovering can use ABS_MT_DISTANCE to
+indicate the distance between the contact and the surface.
+
+::
+
+
+        Linux MT                        Win8
+     __________                 _______________________
+    /          \               |                       |
+   /            \              |                       |
+  /     ____     \             |                       |
+ /     /    \     \            |                       |
+ \     \  a  \     \           |       a               |
+  \     \____/      \          |                       |
+   \                 \         |                       |
+    \       b         \        |           b           |
+     \                 \       |                       |
+      \                 \      |                       |
+       \                 \     |                       |
+        \                /     |                       |
+         \              /      |                       |
+          \            /       |                       |
+           \__________/        |_______________________|
+
+
+In addition to the MAJOR parameters, the oval shape of the touch and finger
+regions can be described by adding the MINOR parameters, such that MAJOR
+and MINOR are the major and minor axes of an ellipse. The orientation of
+the touch ellipse can be described with the ORIENTATION parameter, and the
+direction of the finger ellipse is given by the vector (a - b).
+
+For type A devices, further specification of the touch shape is possible
+via ABS_MT_BLOB_ID.
+
+The ABS_MT_TOOL_TYPE may be used to specify whether the touching tool is a
+finger or a pen or something else. Finally, the ABS_MT_TRACKING_ID event
+may be used to track identified contacts over time [#f5]_.
+
+In the type B protocol, ABS_MT_TOOL_TYPE and ABS_MT_TRACKING_ID are
+implicitly handled by input core; drivers should instead call
+input_mt_report_slot_state().
+
+
+Event Semantics
+---------------
+
+ABS_MT_TOUCH_MAJOR
+    The length of the major axis of the contact. The length should be given in
+    surface units. If the surface has an X times Y resolution, the largest
+    possible value of ABS_MT_TOUCH_MAJOR is sqrt(X^2 + Y^2), the diagonal [#f4]_.
+
+ABS_MT_TOUCH_MINOR
+    The length, in surface units, of the minor axis of the contact. If the
+    contact is circular, this event can be omitted [#f4]_.
+
+ABS_MT_WIDTH_MAJOR
+    The length, in surface units, of the major axis of the approaching
+    tool. This should be understood as the size of the tool itself. The
+    orientation of the contact and the approaching tool are assumed to be the
+    same [#f4]_.
+
+ABS_MT_WIDTH_MINOR
+    The length, in surface units, of the minor axis of the approaching
+    tool. Omit if circular [#f4]_.
+
+    The above four values can be used to derive additional information about
+    the contact. The ratio ABS_MT_TOUCH_MAJOR / ABS_MT_WIDTH_MAJOR approximates
+    the notion of pressure. The fingers of the hand and the palm all have
+    different characteristic widths.
+
+ABS_MT_PRESSURE
+    The pressure, in arbitrary units, on the contact area. May be used instead
+    of TOUCH and WIDTH for pressure-based devices or any device with a spatial
+    signal intensity distribution.
+
+ABS_MT_DISTANCE
+    The distance, in surface units, between the contact and the surface. Zero
+    distance means the contact is touching the surface. A positive number means
+    the contact is hovering above the surface.
+
+ABS_MT_ORIENTATION
+    The orientation of the touching ellipse. The value should describe a signed
+    quarter of a revolution clockwise around the touch center. The signed value
+    range is arbitrary, but zero should be returned for an ellipse aligned with
+    the Y axis of the surface, a negative value when the ellipse is turned to
+    the left, and a positive value when the ellipse is turned to the
+    right. When completely aligned with the X axis, the range max should be
+    returned.
+
+    Touch ellipses are symmetrical by default. For devices capable of true 360
+    degree orientation, the reported orientation must exceed the range max to
+    indicate more than a quarter of a revolution. For an upside-down finger,
+    range max * 2 should be returned.
+
+    Orientation can be omitted if the touch area is circular, or if the
+    information is not available in the kernel driver. Partial orientation
+    support is possible if the device can distinguish between the two axes, but
+    not (uniquely) any values in between. In such cases, the range of
+    ABS_MT_ORIENTATION should be [0, 1] [#f4]_.
+
+ABS_MT_POSITION_X
+    The surface X coordinate of the center of the touching ellipse.
+
+ABS_MT_POSITION_Y
+    The surface Y coordinate of the center of the touching ellipse.
+
+ABS_MT_TOOL_X
+    The surface X coordinate of the center of the approaching tool. Omit if
+    the device cannot distinguish between the intended touch point and the
+    tool itself.
+
+ABS_MT_TOOL_Y
+    The surface Y coordinate of the center of the approaching tool. Omit if the
+    device cannot distinguish between the intended touch point and the tool
+    itself.
+
+    The four position values can be used to separate the position of the touch
+    from the position of the tool. If both positions are present, the major
+    tool axis points towards the touch point [#f1]_. Otherwise, the tool axes are
+    aligned with the touch axes.
+
+ABS_MT_TOOL_TYPE
+    The type of approaching tool. Many kernel drivers cannot distinguish
+    between different tool types, such as a finger or a pen. In such cases, the
+    event should be omitted. The protocol currently supports MT_TOOL_FINGER,
+    MT_TOOL_PEN, and MT_TOOL_PALM [#f2]_. For type B devices, this event is
+    handled by input core; drivers should instead use
+    input_mt_report_slot_state(). A contact's ABS_MT_TOOL_TYPE may change over
+    time while still touching the device, because the firmware may not be able
+    to determine which tool is being used when it first appears.
+
+ABS_MT_BLOB_ID
+    The BLOB_ID groups several packets together into one arbitrarily shaped
+    contact. The sequence of points forms a polygon which defines the shape of
+    the contact. This is a low-level anonymous grouping for type A devices, and
+    should not be confused with the high-level trackingID [#f5]_. Most type A
+    devices do not have blob capability, so drivers can safely omit this event.
+
+ABS_MT_TRACKING_ID
+    The TRACKING_ID identifies an initiated contact throughout its life cycle
+    [#f5]_. The value range of the TRACKING_ID should be large enough to ensure
+    unique identification of a contact maintained over an extended period of
+    time. For type B devices, this event is handled by input core; drivers
+    should instead use input_mt_report_slot_state().
+
+
+Event Computation
+-----------------
+
+The wide variety of hardware unavoidably leads to some devices fitting the
+MT protocol better than others. To simplify and unify the mapping, this
+section gives recipes for how to compute certain events.
+
+For devices reporting contacts as rectangular shapes, signed orientation
+cannot be obtained. Assuming X and Y are the lengths of the sides of the
+touching rectangle, here is a simple formula that retains the most
+information possible::
+
+    ABS_MT_TOUCH_MAJOR := max(X, Y)
+    ABS_MT_TOUCH_MINOR := min(X, Y)
+    ABS_MT_ORIENTATION := bool(X > Y)
+
+The range of ABS_MT_ORIENTATION should be set to [0, 1], to indicate that
+the device can distinguish between a finger along the Y axis (0) and a
+finger along the X axis (1).
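+
+In driver code, the same recipe might look like the following sketch, where
+rect_w and rect_h are assumed names for the side lengths reported by the
+hardware::
+
+	/* Rectangular contact: the longer side becomes the major axis. */
+	input_report_abs(dev, ABS_MT_TOUCH_MAJOR, max(rect_w, rect_h));
+	input_report_abs(dev, ABS_MT_TOUCH_MINOR, min(rect_w, rect_h));
+	/* Range [0, 1]: 0 = aligned with the Y axis, 1 = with the X axis. */
+	input_report_abs(dev, ABS_MT_ORIENTATION, rect_w > rect_h);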
+
+For win8 devices with both T and C coordinates, the position mapping is::
+
+    ABS_MT_POSITION_X := T_X
+    ABS_MT_POSITION_Y := T_Y
+    ABS_MT_TOOL_X := C_X
+    ABS_MT_TOOL_Y := C_Y
+
+Unfortunately, there is not enough information to specify both the touching
+ellipse and the tool ellipse, so one has to resort to approximations. One
+simple scheme, which is compatible with earlier usage, is::
+
+    ABS_MT_TOUCH_MAJOR := min(X, Y)
+    ABS_MT_TOUCH_MINOR := <not used>
+    ABS_MT_ORIENTATION := <not used>
+    ABS_MT_WIDTH_MAJOR := min(X, Y) + distance(T, C)
+    ABS_MT_WIDTH_MINOR := min(X, Y)
+
+Rationale: We have no information about the orientation of the touching
+ellipse, so approximate it with an inscribed circle instead. The tool
+ellipse should align with the vector (T - C), so the diameter must
+increase with distance(T, C). Finally, assume that the touch diameter is
+equal to the tool thickness, and we arrive at the formulas above.
+
+Finger Tracking
+---------------
+
+The process of finger tracking, i.e., to assign a unique trackingID to each
+initiated contact on the surface, is a Euclidean Bipartite Matching
+problem. At each event synchronization, the set of actual contacts is
+matched to the set of contacts from the previous synchronization. A full
+implementation can be found in [#f3]_.
+
+
+Gestures
+--------
+
+In the specific application of creating gesture events, the TOUCH and WIDTH
+parameters can be used to, e.g., approximate finger pressure or distinguish
+between index finger and thumb. With the addition of the MINOR parameters,
+one can also distinguish between a sweeping finger and a pointing finger,
+and with ORIENTATION, one can detect twisting of fingers.
+
+
+Notes
+-----
+
+In order to stay compatible with existing applications, the data reported
+in a finger packet must not be recognized as single-touch events.
+
+For type A devices, all finger data bypasses input filtering, since
+subsequent events of the same type refer to different fingers.
+
+.. [#f1] Also, the difference (TOOL_X - POSITION_X) can be used to model tilt.
+.. [#f2] The list can of course be extended.
+.. [#f3] The mtdev project: http://bitmath.org/code/mtdev/.
+.. [#f4] See the section on event computation.
+.. [#f5] See the section on finger tracking.
diff --git a/Documentation/input/multi-touch-protocol.txt b/Documentation/input/multi-touch-protocol.txt
deleted file mode 100644
index c51f1146f3bd8396f572cddb9c87ae2620d236ba..0000000000000000000000000000000000000000
--- a/Documentation/input/multi-touch-protocol.txt
+++ /dev/null
@@ -1,418 +0,0 @@
-Multi-touch (MT) Protocol
--------------------------
-	Copyright (C) 2009-2010	Henrik Rydberg
-
-
-Introduction
-------------
-
-In order to utilize the full power of the new multi-touch and multi-user
-devices, a way to report detailed data from multiple contacts, i.e.,
-objects in direct contact with the device surface, is needed. This
-document describes the multi-touch (MT) protocol which allows kernel
-drivers to report details for an arbitrary number of contacts.
-
-The protocol is divided into two types, depending on the capabilities of the
-hardware. For devices handling anonymous contacts (type A), the protocol
-describes how to send the raw data for all contacts to the receiver. For
-devices capable of tracking identifiable contacts (type B), the protocol
-describes how to send updates for individual contacts via event slots.
-
-
-Protocol Usage
---------------
-
-Contact details are sent sequentially as separate packets of ABS_MT
-events.
Only the ABS_MT events are recognized as part of a contact -packet. Since these events are ignored by current single-touch (ST) -applications, the MT protocol can be implemented on top of the ST protocol -in an existing driver. - -Drivers for type A devices separate contact packets by calling -input_mt_sync() at the end of each packet. This generates a SYN_MT_REPORT -event, which instructs the receiver to accept the data for the current -contact and prepare to receive another. - -Drivers for type B devices separate contact packets by calling -input_mt_slot(), with a slot as argument, at the beginning of each packet. -This generates an ABS_MT_SLOT event, which instructs the receiver to -prepare for updates of the given slot. - -All drivers mark the end of a multi-touch transfer by calling the usual -input_sync() function. This instructs the receiver to act upon events -accumulated since last EV_SYN/SYN_REPORT and prepare to receive a new set -of events/packets. - -The main difference between the stateless type A protocol and the stateful -type B slot protocol lies in the usage of identifiable contacts to reduce -the amount of data sent to userspace. The slot protocol requires the use of -the ABS_MT_TRACKING_ID, either provided by the hardware or computed from -the raw data [5]. - -For type A devices, the kernel driver should generate an arbitrary -enumeration of the full set of anonymous contacts currently on the -surface. The order in which the packets appear in the event stream is not -important. Event filtering and finger tracking is left to user space [3]. - -For type B devices, the kernel driver should associate a slot with each -identified contact, and use that slot to propagate changes for the contact. -Creation, replacement and destruction of contacts is achieved by modifying -the ABS_MT_TRACKING_ID of the associated slot. A non-negative tracking id -is interpreted as a contact, and the value -1 denotes an unused slot. A -tracking id not previously present is considered new, and a tracking id no -longer present is considered removed. Since only changes are propagated, -the full state of each initiated contact has to reside in the receiving -end. Upon receiving an MT event, one simply updates the appropriate -attribute of the current slot. - -Some devices identify and/or track more contacts than they can report to the -driver. A driver for such a device should associate one type B slot with each -contact that is reported by the hardware. Whenever the identity of the -contact associated with a slot changes, the driver should invalidate that -slot by changing its ABS_MT_TRACKING_ID. If the hardware signals that it is -tracking more contacts than it is currently reporting, the driver should use -a BTN_TOOL_*TAP event to inform userspace of the total number of contacts -being tracked by the hardware at that moment. The driver should do this by -explicitly sending the corresponding BTN_TOOL_*TAP event and setting -use_count to false when calling input_mt_report_pointer_emulation(). -The driver should only advertise as many slots as the hardware can report. -Userspace can detect that a driver can report more total contacts than slots -by noting that the largest supported BTN_TOOL_*TAP event is larger than the -total number of type B slots reported in the absinfo for the ABS_MT_SLOT axis. - -The minimum value of the ABS_MT_SLOT axis must be 0. 
- -Protocol Example A ------------------- - -Here is what a minimal event sequence for a two-contact touch would look -like for a type A device: - - ABS_MT_POSITION_X x[0] - ABS_MT_POSITION_Y y[0] - SYN_MT_REPORT - ABS_MT_POSITION_X x[1] - ABS_MT_POSITION_Y y[1] - SYN_MT_REPORT - SYN_REPORT - -The sequence after moving one of the contacts looks exactly the same; the -raw data for all present contacts are sent between every synchronization -with SYN_REPORT. - -Here is the sequence after lifting the first contact: - - ABS_MT_POSITION_X x[1] - ABS_MT_POSITION_Y y[1] - SYN_MT_REPORT - SYN_REPORT - -And here is the sequence after lifting the second contact: - - SYN_MT_REPORT - SYN_REPORT - -If the driver reports one of BTN_TOUCH or ABS_PRESSURE in addition to the -ABS_MT events, the last SYN_MT_REPORT event may be omitted. Otherwise, the -last SYN_REPORT will be dropped by the input core, resulting in no -zero-contact event reaching userland. - - -Protocol Example B ------------------- - -Here is what a minimal event sequence for a two-contact touch would look -like for a type B device: - - ABS_MT_SLOT 0 - ABS_MT_TRACKING_ID 45 - ABS_MT_POSITION_X x[0] - ABS_MT_POSITION_Y y[0] - ABS_MT_SLOT 1 - ABS_MT_TRACKING_ID 46 - ABS_MT_POSITION_X x[1] - ABS_MT_POSITION_Y y[1] - SYN_REPORT - -Here is the sequence after moving contact 45 in the x direction: - - ABS_MT_SLOT 0 - ABS_MT_POSITION_X x[0] - SYN_REPORT - -Here is the sequence after lifting the contact in slot 0: - - ABS_MT_TRACKING_ID -1 - SYN_REPORT - -The slot being modified is already 0, so the ABS_MT_SLOT is omitted. The -message removes the association of slot 0 with contact 45, thereby -destroying contact 45 and freeing slot 0 to be reused for another contact. - -Finally, here is the sequence after lifting the second contact: - - ABS_MT_SLOT 1 - ABS_MT_TRACKING_ID -1 - SYN_REPORT - - -Event Usage ------------ - -A set of ABS_MT events with the desired properties is defined. The events -are divided into categories, to allow for partial implementation. The -minimum set consists of ABS_MT_POSITION_X and ABS_MT_POSITION_Y, which -allows for multiple contacts to be tracked. If the device supports it, the -ABS_MT_TOUCH_MAJOR and ABS_MT_WIDTH_MAJOR may be used to provide the size -of the contact area and approaching tool, respectively. - -The TOUCH and WIDTH parameters have a geometrical interpretation; imagine -looking through a window at someone gently holding a finger against the -glass. You will see two regions, one inner region consisting of the part -of the finger actually touching the glass, and one outer region formed by -the perimeter of the finger. The center of the touching region (a) is -ABS_MT_POSITION_X/Y and the center of the approaching finger (b) is -ABS_MT_TOOL_X/Y. The touch diameter is ABS_MT_TOUCH_MAJOR and the finger -diameter is ABS_MT_WIDTH_MAJOR. Now imagine the person pressing the finger -harder against the glass. The touch region will increase, and in general, -the ratio ABS_MT_TOUCH_MAJOR / ABS_MT_WIDTH_MAJOR, which is always smaller -than unity, is related to the contact pressure. For pressure-based devices, -ABS_MT_PRESSURE may be used to provide the pressure on the contact area -instead. Devices capable of contact hovering can use ABS_MT_DISTANCE to -indicate the distance between the contact and the surface. 
- - - Linux MT Win8 - __________ _______________________ - / \ | | - / \ | | - / ____ \ | | - / / \ \ | | - \ \ a \ \ | a | - \ \____/ \ | | - \ \ | | - \ b \ | b | - \ \ | | - \ \ | | - \ \ | | - \ / | | - \ / | | - \ / | | - \__________/ |_______________________| - - -In addition to the MAJOR parameters, the oval shape of the touch and finger -regions can be described by adding the MINOR parameters, such that MAJOR -and MINOR are the major and minor axis of an ellipse. The orientation of -the touch ellipse can be described with the ORIENTATION parameter, and the -direction of the finger ellipse is given by the vector (a - b). - -For type A devices, further specification of the touch shape is possible -via ABS_MT_BLOB_ID. - -The ABS_MT_TOOL_TYPE may be used to specify whether the touching tool is a -finger or a pen or something else. Finally, the ABS_MT_TRACKING_ID event -may be used to track identified contacts over time [5]. - -In the type B protocol, ABS_MT_TOOL_TYPE and ABS_MT_TRACKING_ID are -implicitly handled by input core; drivers should instead call -input_mt_report_slot_state(). - - -Event Semantics ---------------- - -ABS_MT_TOUCH_MAJOR - -The length of the major axis of the contact. The length should be given in -surface units. If the surface has an X times Y resolution, the largest -possible value of ABS_MT_TOUCH_MAJOR is sqrt(X^2 + Y^2), the diagonal [4]. - -ABS_MT_TOUCH_MINOR - -The length, in surface units, of the minor axis of the contact. If the -contact is circular, this event can be omitted [4]. - -ABS_MT_WIDTH_MAJOR - -The length, in surface units, of the major axis of the approaching -tool. This should be understood as the size of the tool itself. The -orientation of the contact and the approaching tool are assumed to be the -same [4]. - -ABS_MT_WIDTH_MINOR - -The length, in surface units, of the minor axis of the approaching -tool. Omit if circular [4]. - -The above four values can be used to derive additional information about -the contact. The ratio ABS_MT_TOUCH_MAJOR / ABS_MT_WIDTH_MAJOR approximates -the notion of pressure. The fingers of the hand and the palm all have -different characteristic widths. - -ABS_MT_PRESSURE - -The pressure, in arbitrary units, on the contact area. May be used instead -of TOUCH and WIDTH for pressure-based devices or any device with a spatial -signal intensity distribution. - -ABS_MT_DISTANCE - -The distance, in surface units, between the contact and the surface. Zero -distance means the contact is touching the surface. A positive number means -the contact is hovering above the surface. - -ABS_MT_ORIENTATION - -The orientation of the touching ellipse. The value should describe a signed -quarter of a revolution clockwise around the touch center. The signed value -range is arbitrary, but zero should be returned for an ellipse aligned with -the Y axis of the surface, a negative value when the ellipse is turned to -the left, and a positive value when the ellipse is turned to the -right. When completely aligned with the X axis, the range max should be -returned. - -Touch ellipsis are symmetrical by default. For devices capable of true 360 -degree orientation, the reported orientation must exceed the range max to -indicate more than a quarter of a revolution. For an upside-down finger, -range max * 2 should be returned. - -Orientation can be omitted if the touch area is circular, or if the -information is not available in the kernel driver. 
Partial orientation -support is possible if the device can distinguish between the two axis, but -not (uniquely) any values in between. In such cases, the range of -ABS_MT_ORIENTATION should be [0, 1] [4]. - -ABS_MT_POSITION_X - -The surface X coordinate of the center of the touching ellipse. - -ABS_MT_POSITION_Y - -The surface Y coordinate of the center of the touching ellipse. - -ABS_MT_TOOL_X - -The surface X coordinate of the center of the approaching tool. Omit if -the device cannot distinguish between the intended touch point and the -tool itself. - -ABS_MT_TOOL_Y - -The surface Y coordinate of the center of the approaching tool. Omit if the -device cannot distinguish between the intended touch point and the tool -itself. - -The four position values can be used to separate the position of the touch -from the position of the tool. If both positions are present, the major -tool axis points towards the touch point [1]. Otherwise, the tool axes are -aligned with the touch axes. - -ABS_MT_TOOL_TYPE - -The type of approaching tool. A lot of kernel drivers cannot distinguish -between different tool types, such as a finger or a pen. In such cases, the -event should be omitted. The protocol currently supports MT_TOOL_FINGER, -MT_TOOL_PEN, and MT_TOOL_PALM [2]. For type B devices, this event is handled -by input core; drivers should instead use input_mt_report_slot_state(). -A contact's ABS_MT_TOOL_TYPE may change over time while still touching the -device, because the firmware may not be able to determine which tool is being -used when it first appears. - -ABS_MT_BLOB_ID - -The BLOB_ID groups several packets together into one arbitrarily shaped -contact. The sequence of points forms a polygon which defines the shape of -the contact. This is a low-level anonymous grouping for type A devices, and -should not be confused with the high-level trackingID [5]. Most type A -devices do not have blob capability, so drivers can safely omit this event. - -ABS_MT_TRACKING_ID - -The TRACKING_ID identifies an initiated contact throughout its life cycle -[5]. The value range of the TRACKING_ID should be large enough to ensure -unique identification of a contact maintained over an extended period of -time. For type B devices, this event is handled by input core; drivers -should instead use input_mt_report_slot_state(). - - -Event Computation ------------------ - -The flora of different hardware unavoidably leads to some devices fitting -better to the MT protocol than others. To simplify and unify the mapping, -this section gives recipes for how to compute certain events. - -For devices reporting contacts as rectangular shapes, signed orientation -cannot be obtained. Assuming X and Y are the lengths of the sides of the -touching rectangle, here is a simple formula that retains the most -information possible: - - ABS_MT_TOUCH_MAJOR := max(X, Y) - ABS_MT_TOUCH_MINOR := min(X, Y) - ABS_MT_ORIENTATION := bool(X > Y) - -The range of ABS_MT_ORIENTATION should be set to [0, 1], to indicate that -the device can distinguish between a finger along the Y axis (0) and a -finger along the X axis (1). - -For win8 devices with both T and C coordinates, the position mapping is - - ABS_MT_POSITION_X := T_X - ABS_MT_POSITION_Y := T_Y - ABS_MT_TOOL_X := C_X - ABS_MT_TOOL_Y := C_Y - -Unfortunately, there is not enough information to specify both the touching -ellipse and the tool ellipse, so one has to resort to approximations. 
One
-simple scheme, which is compatible with earlier usage, is:
-
-	ABS_MT_TOUCH_MAJOR := min(X, Y)
-	ABS_MT_TOUCH_MINOR := <not used>
-	ABS_MT_ORIENTATION := <not used>
-	ABS_MT_WIDTH_MAJOR := min(X, Y) + distance(T, C)
-	ABS_MT_WIDTH_MINOR := min(X, Y)
-
-Rationale: We have no information about the orientation of the touching
-ellipse, so approximate it with an inscribed circle instead. The tool
-ellipse should align with the vector (T - C), so the diameter must
-increase with distance(T, C). Finally, assume that the touch diameter is
-equal to the tool thickness, and we arrive at the formulas above.
-
-Finger Tracking
----------------
-
-The process of finger tracking, i.e., to assign a unique trackingID to each
-initiated contact on the surface, is a Euclidian Bipartite Matching
-problem. At each event synchronization, the set of actual contacts is
-matched to the set of contacts from the previous synchronization. A full
-implementation can be found in [3].
-
-
-Gestures
---------
-
-In the specific application of creating gesture events, the TOUCH and WIDTH
-parameters can be used to, e.g., approximate finger pressure or distinguish
-between index finger and thumb. With the addition of the MINOR parameters,
-one can also distinguish between a sweeping finger and a pointing finger,
-and with ORIENTATION, one can detect twisting of fingers.
-
-
-Notes
------
-
-In order to stay compatible with existing applications, the data reported
-in a finger packet must not be recognized as single-touch events.
-
-For type A devices, all finger data bypasses input filtering, since
-subsequent events of the same type refer to different fingers.
-
-For example usage of the type A protocol, see the bcm5974 driver. For
-example usage of the type B protocol, see the hid-egalax driver.
-
-[1] Also, the difference (TOOL_X - POSITION_X) can be used to model tilt.
-[2] The list can of course be extended.
-[3] The mtdev project: http://bitmath.org/code/mtdev/.
-[4] See the section on event computation.
-[5] See the section on finger tracking.
diff --git a/Documentation/input/notifier.rst b/Documentation/input/notifier.rst
new file mode 100644
index 0000000000000000000000000000000000000000..161350cb865ebc78eaf1a94b898c962b047da6d8
--- /dev/null
+++ b/Documentation/input/notifier.rst
@@ -0,0 +1,54 @@
+=================
+Keyboard notifier
+=================
+
+One can use register_keyboard_notifier to get called back on keyboard
+events (see kbd_keycode() function for details). The passed structure is
+keyboard_notifier_param:
+
+- 'vc' always provides the VC for which the keyboard event applies;
+- 'down' is 1 for a key press event, 0 for a key release;
+- 'shift' is the current modifier state, mask bit indexes are KG_*;
+- 'value' depends on the type of event.
+
+- KBD_KEYCODE events are always sent before other events, value is the keycode.
+- KBD_UNBOUND_KEYCODE events are sent if the keycode is not bound to a keysym.
+  value is the keycode.
+- KBD_UNICODE events are sent if the keycode -> keysym translation produced a
+  unicode character. value is the unicode value.
+- KBD_KEYSYM events are sent if the keycode -> keysym translation produced a
+  non-unicode character. value is the keysym.
+- KBD_POST_KEYSYM events are sent after the treatment of non-unicode keysyms.
+  That permits one to inspect the resulting LEDs for instance.
+
+For each kind of event but the last, the callback may return NOTIFY_STOP in
+order to "eat" the event: the notify loop is stopped and the keyboard event is
+dropped.
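+
+For illustration, here is a minimal sketch of registering such a callback;
+the identifiers and the example policy of eating every KBD_KEYSYM event for
+a key press are invented for the example::
+
+	#include <linux/keyboard.h>
+	#include <linux/notifier.h>
+
+	static int example_kbd_cb(struct notifier_block *nb,
+				  unsigned long action, void *data)
+	{
+		struct keyboard_notifier_param *param = data;
+
+		/* Eat keysym events for key presses, pass everything else. */
+		if (action == KBD_KEYSYM && param->down)
+			return NOTIFY_STOP;
+		return NOTIFY_OK;
+	}
+
+	static struct notifier_block example_nb = {
+		.notifier_call = example_kbd_cb,
+	};
+
+	/* in module init / exit */
+	register_keyboard_notifier(&example_nb);
+	unregister_keyboard_notifier(&example_nb);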
+
+In a rough C snippet, we have::
+
+	kbd_keycode(keycode) {
+		...
+		params.value = keycode;
+		if (notifier_call_chain(KBD_KEYCODE, &params) == NOTIFY_STOP ||
+		    !bound) {
+			notifier_call_chain(KBD_UNBOUND_KEYCODE, &params);
+			return;
+		}
+
+		if (unicode) {
+			params.value = unicode;
+			if (notifier_call_chain(KBD_UNICODE, &params) == NOTIFY_STOP)
+				return;
+			emit unicode;
+			return;
+		}
+
+		params.value = keysym;
+		if (notifier_call_chain(KBD_KEYSYM, &params) == NOTIFY_STOP)
+			return;
+		apply keysym;
+		notifier_call_chain(KBD_POST_KEYSYM, &params);
+	}
+
+.. note:: This notifier is usually called from interrupt context.
diff --git a/Documentation/input/notifier.txt b/Documentation/input/notifier.txt
deleted file mode 100644
index 95172ca6f3d21c198a5adaa048aa4f08b27031b7..0000000000000000000000000000000000000000
--- a/Documentation/input/notifier.txt
+++ /dev/null
@@ -1,52 +0,0 @@
-Keyboard notifier
-
-One can use register_keyboard_notifier to get called back on keyboard
-events (see kbd_keycode() function for details). The passed structure is
-keyboard_notifier_param:
-
-- 'vc' always provide the VC for which the keyboard event applies;
-- 'down' is 1 for a key press event, 0 for a key release;
-- 'shift' is the current modifier state, mask bit indexes are KG_*;
-- 'value' depends on the type of event.
-
-- KBD_KEYCODE events are always sent before other events, value is the keycode.
-- KBD_UNBOUND_KEYCODE events are sent if the keycode is not bound to a keysym.
-  value is the keycode.
-- KBD_UNICODE events are sent if the keycode -> keysym translation produced a
-  unicode character. value is the unicode value.
-- KBD_KEYSYM events are sent if the keycode -> keysym translation produced a
-  non-unicode character. value is the keysym.
-- KBD_POST_KEYSYM events are sent after the treatment of non-unicode keysyms.
-  That permits one to inspect the resulting LEDs for instance.
-
-For each kind of event but the last, the callback may return NOTIFY_STOP in
-order to "eat" the event: the notify loop is stopped and the keyboard event is
-dropped.
-
-In a rough C snippet, we have:
-
-kbd_keycode(keycode) {
-	...
-	params.value = keycode;
-	if (notifier_call_chain(KBD_KEYCODE,&params) == NOTIFY_STOP)
-	    || !bound) {
-		notifier_call_chain(KBD_UNBOUND_KEYCODE,&params);
-		return;
-	}
-
-	if (unicode) {
-		param.value = unicode;
-		if (notifier_call_chain(KBD_UNICODE,&params) == NOTIFY_STOP)
-			return;
-		emit unicode;
-		return;
-	}
-
-	params.value = keysym;
-	if (notifier_call_chain(KBD_KEYSYM,&params) == NOTIFY_STOP)
-		return;
-	apply keysym;
-	notifier_call_chain(KBD_POST_KEYSYM,&params);
-}
-
-NOTE: This notifier is usually called from interrupt context.
diff --git a/Documentation/input/ntrig.txt b/Documentation/input/ntrig.txt
deleted file mode 100644
index be1fd981f73f7353241d5fddccd7c13da6e9c0f9..0000000000000000000000000000000000000000
--- a/Documentation/input/ntrig.txt
+++ /dev/null
@@ -1,126 +0,0 @@
-N-Trig touchscreen Driver
--------------------------
-	Copyright (c) 2008-2010 Rafi Rubin
-	Copyright (c) 2009-2010 Stephane Chatty
-
-This driver provides support for N-Trig pen and multi-touch sensors. Single
-and multi-touch events are translated to the appropriate protocols for
-the hid and input systems. Pen events are sufficiently hid compliant and
-are left to the hid core. The driver also provides additional filtering
-and utility functions accessible with sysfs and module parameters.
-
-This driver has been reported to work properly with multiple N-Trig devices
-attached.
- - -Parameters ----------- - -Note: values set at load time are global and will apply to all applicable -devices. Adjusting parameters with sysfs will override the load time values, -but only for that one device. - -The following parameters are used to configure filters to reduce noise: - -activate_slack number of fingers to ignore before processing events - -activation_height size threshold to activate immediately -activation_width - -min_height size threshold bellow which fingers are ignored -min_width both to decide activation and during activity - -deactivate_slack the number of "no contact" frames to ignore before - propagating the end of activity events - -When the last finger is removed from the device, it sends a number of empty -frames. By holding off on deactivation for a few frames we can tolerate false -erroneous disconnects, where the sensor may mistakenly not detect a finger that -is still present. Thus deactivate_slack addresses problems where a users might -see breaks in lines during drawing, or drop an object during a long drag. - - -Additional sysfs items ----------------------- - -These nodes just provide easy access to the ranges reported by the device. -sensor_logical_height the range for positions reported during activity -sensor_logical_width - -sensor_physical_height internal ranges not used for normal events but -sensor_physical_width useful for tuning - -All N-Trig devices with product id of 1 report events in the ranges of -X: 0-9600 -Y: 0-7200 -However not all of these devices have the same physical dimensions. Most -seem to be 12" sensors (Dell Latitude XT and XT2 and the HP TX2), and -at least one model (Dell Studio 17) has a 17" sensor. The ratio of physical -to logical sizes is used to adjust the size based filter parameters. - - -Filtering ---------- - -With the release of the early multi-touch firmwares it became increasingly -obvious that these sensors were prone to erroneous events. Users reported -seeing both inappropriately dropped contact and ghosts, contacts reported -where no finger was actually touching the screen. - -Deactivation slack helps prevent dropped contact for single touch use, but does -not address the problem of dropping one of more contacts while other contacts -are still active. Drops in the multi-touch context require additional -processing and should be handled in tandem with tacking. - -As observed ghost contacts are similar to actual use of the sensor, but they -seem to have different profiles. Ghost activity typically shows up as small -short lived touches. As such, I assume that the longer the continuous stream -of events the more likely those events are from a real contact, and that the -larger the size of each contact the more likely it is real. Balancing the -goals of preventing ghosts and accepting real events quickly (to minimize -user observable latency), the filter accumulates confidence for incoming -events until it hits thresholds and begins propagating. In the interest in -minimizing stored state as well as the cost of operations to make a decision, -I've kept that decision simple. - -Time is measured in terms of the number of fingers reported, not frames since -the probability of multiple simultaneous ghosts is expected to drop off -dramatically with increasing numbers. Rather than accumulate weight as a -function of size, I just use it as a binary threshold. A sufficiently large -contact immediately overrides the waiting period and leads to activation. 
- -Setting the activation size thresholds to large values will result in deciding -primarily on activation slack. If you see longer lived ghosts, turning up the -activation slack while reducing the size thresholds may suffice to eliminate -the ghosts while keeping the screen quite responsive to firm taps. - -Contacts continue to be filtered with min_height and min_width even after -the initial activation filter is satisfied. The intent is to provide -a mechanism for filtering out ghosts in the form of an extra finger while -you actually are using the screen. In practice this sort of ghost has -been far less problematic or relatively rare and I've left the defaults -set to 0 for both parameters, effectively turning off that filter. - -I don't know what the optimal values are for these filters. If the defaults -don't work for you, please play with the parameters. If you do find other -values more comfortable, I would appreciate feedback. - -The calibration of these devices does drift over time. If ghosts or contact -dropping worsen and interfere with the normal usage of your device, try -recalibrating it. - - -Calibration ------------ - -The N-Trig windows tools provide calibration and testing routines. Also an -unofficial unsupported set of user space tools including a calibrator is -available at: -http://code.launchpad.net/~rafi-seas/+junk/ntrig_calib - - -Tracking --------- - -As of yet, all tested N-Trig firmwares do not track fingers. When multiple -contacts are active they seem to be sorted primarily by Y position. diff --git a/Documentation/input/rotary-encoder.txt b/Documentation/input/rotary-encoder.txt deleted file mode 100644 index 46a74f0c551a1b5a1ada3dc7e3ae248df1edbbff..0000000000000000000000000000000000000000 --- a/Documentation/input/rotary-encoder.txt +++ /dev/null @@ -1,126 +0,0 @@ -rotary-encoder - a generic driver for GPIO connected devices -Daniel Mack , Feb 2009 - -0. Function ------------ - -Rotary encoders are devices which are connected to the CPU or other -peripherals with two wires. The outputs are phase-shifted by 90 degrees -and by triggering on falling and rising edges, the turn direction can -be determined. - -Some encoders have both outputs low in stable states, others also have -a stable state with both outputs high (half-period mode) and some have -a stable state in all steps (quarter-period mode). - -The phase diagram of these two outputs look like this: - - _____ _____ _____ - | | | | | | - Channel A ____| |_____| |_____| |____ - - : : : : : : : : : : : : - __ _____ _____ _____ - | | | | | | | - Channel B |_____| |_____| |_____| |__ - - : : : : : : : : : : : : - Event a b c d a b c d a b c d - - |<-------->| - one step - - |<-->| - one step (half-period mode) - - |<>| - one step (quarter-period mode) - -For more information, please see - https://en.wikipedia.org/wiki/Rotary_encoder - - -1. Events / state machine -------------------------- - -In half-period mode, state a) and c) above are used to determine the -rotational direction based on the last stable state. Events are reported in -states b) and d) given that the new stable state is different from the last -(i.e. the rotation was not reversed half-way). - -Otherwise, the following apply: - -a) Rising edge on channel A, channel B in low state - This state is used to recognize a clockwise turn - -b) Rising edge on channel B, channel A in high state - When entering this state, the encoder is put into 'armed' state, - meaning that there it has seen half the way of a one-step transition. 
- -c) Falling edge on channel A, channel B in high state - This state is used to recognize a counter-clockwise turn - -d) Falling edge on channel B, channel A in low state - Parking position. If the encoder enters this state, a full transition - should have happened, unless it flipped back on half the way. The - 'armed' state tells us about that. - -2. Platform requirements ------------------------- - -As there is no hardware dependent call in this driver, the platform it is -used with must support gpiolib. Another requirement is that IRQs must be -able to fire on both edges. - - -3. Board integration --------------------- - -To use this driver in your system, register a platform_device with the -name 'rotary-encoder' and associate the IRQs and some specific platform -data with it. - -struct rotary_encoder_platform_data is declared in -include/linux/rotary-encoder.h and needs to be filled with the number of -steps the encoder has and can carry information about externally inverted -signals (because of an inverting buffer or other reasons). The encoder -can be set up to deliver input information as either an absolute or relative -axes. For relative axes the input event returns +/-1 for each step. For -absolute axes the position of the encoder can either roll over between zero -and the number of steps or will clamp at the maximum and zero depending on -the configuration. - -Because GPIO to IRQ mapping is platform specific, this information must -be given in separately to the driver. See the example below. - ------------------- - -/* board support file example */ - -#include -#include - -#define GPIO_ROTARY_A 1 -#define GPIO_ROTARY_B 2 - -static struct rotary_encoder_platform_data my_rotary_encoder_info = { - .steps = 24, - .axis = ABS_X, - .relative_axis = false, - .rollover = false, - .gpio_a = GPIO_ROTARY_A, - .gpio_b = GPIO_ROTARY_B, - .inverted_a = 0, - .inverted_b = 0, - .half_period = false, - .wakeup_source = false, -}; - -static struct platform_device rotary_encoder_device = { - .name = "rotary-encoder", - .id = 0, - .dev = { - .platform_data = &my_rotary_encoder_info, - } -}; - diff --git a/Documentation/input/sentelic.txt b/Documentation/input/sentelic.txt deleted file mode 100644 index 89251e2a3eba101fddf89e7d4da3a2f1b8fb5ba8..0000000000000000000000000000000000000000 --- a/Documentation/input/sentelic.txt +++ /dev/null @@ -1,873 +0,0 @@ -Copyright (C) 2002-2011 Sentelic Corporation. -Last update: Dec-07-2011 - -============================================================================== -* Finger Sensing Pad Intellimouse Mode(scrolling wheel, 4th and 5th buttons) -============================================================================== -A) MSID 4: Scrolling wheel mode plus Forward page(4th button) and Backward - page (5th button) -@1. Set sample rate to 200; -@2. Set sample rate to 200; -@3. Set sample rate to 80; -@4. Issuing the "Get device ID" command (0xF2) and waits for the response; -@5. FSP will respond 0x04. - -Packet 1 - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|W|W|W|W| - |---------------| |---------------| |---------------| |---------------| - -Byte 1: Bit7 => Y overflow - Bit6 => X overflow - Bit5 => Y sign bit - Bit4 => X sign bit - Bit3 => 1 - Bit2 => Middle Button, 1 is pressed, 0 is not pressed. - Bit1 => Right Button, 1 is pressed, 0 is not pressed. 
- Bit0 => Left Button, 1 is pressed, 0 is not pressed. -Byte 2: X Movement(9-bit 2's complement integers) -Byte 3: Y Movement(9-bit 2's complement integers) -Byte 4: Bit3~Bit0 => the scrolling wheel's movement since the last data report. - valid values, -8 ~ +7 - Bit4 => 1 = 4th mouse button is pressed, Forward one page. - 0 = 4th mouse button is not pressed. - Bit5 => 1 = 5th mouse button is pressed, Backward one page. - 0 = 5th mouse button is not pressed. - -B) MSID 6: Horizontal and Vertical scrolling. -@ Set bit 1 in register 0x40 to 1 - -# FSP replaces scrolling wheel's movement as 4 bits to show horizontal and - vertical scrolling. - -Packet 1 - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |Y|X|y|x|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 | | |B|F|r|l|u|d| - |---------------| |---------------| |---------------| |---------------| - -Byte 1: Bit7 => Y overflow - Bit6 => X overflow - Bit5 => Y sign bit - Bit4 => X sign bit - Bit3 => 1 - Bit2 => Middle Button, 1 is pressed, 0 is not pressed. - Bit1 => Right Button, 1 is pressed, 0 is not pressed. - Bit0 => Left Button, 1 is pressed, 0 is not pressed. -Byte 2: X Movement(9-bit 2's complement integers) -Byte 3: Y Movement(9-bit 2's complement integers) -Byte 4: Bit0 => the Vertical scrolling movement downward. - Bit1 => the Vertical scrolling movement upward. - Bit2 => the Horizontal scrolling movement leftward. - Bit3 => the Horizontal scrolling movement rightward. - Bit4 => 1 = 4th mouse button is pressed, Forward one page. - 0 = 4th mouse button is not pressed. - Bit5 => 1 = 5th mouse button is pressed, Backward one page. - 0 = 5th mouse button is not pressed. - -C) MSID 7: -# FSP uses 2 packets (8 Bytes) to represent Absolute Position. - so we have PACKET NUMBER to identify packets. - If PACKET NUMBER is 0, the packet is Packet 1. - If PACKET NUMBER is 1, the packet is Packet 2. - Please count this number in program. - -# MSID6 special packet will be enable at the same time when enable MSID 7. - -============================================================================== -* Absolute position for STL3886-G0. -============================================================================== -@ Set bit 2 or 3 in register 0x40 to 1 -@ Set bit 6 in register 0x40 to 1 - -Packet 1 (ABSOLUTE POSITION) - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |0|1|V|1|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|d|u|X|X|Y|Y| - |---------------| |---------------| |---------------| |---------------| - -Byte 1: Bit7~Bit6 => 00, Normal data packet - => 01, Absolute coordination packet - => 10, Notify packet - Bit5 => valid bit - Bit4 => 1 - Bit3 => 1 - Bit2 => Middle Button, 1 is pressed, 0 is not pressed. - Bit1 => Right Button, 1 is pressed, 0 is not pressed. - Bit0 => Left Button, 1 is pressed, 0 is not pressed. 
-Byte 2: X coordinate (xpos[9:2]) -Byte 3: Y coordinate (ypos[9:2]) -Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) - Bit3~Bit2 => X coordinate (ypos[1:0]) - Bit4 => scroll up - Bit5 => scroll down - Bit6 => scroll left - Bit7 => scroll right - -Notify Packet for G0 - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |1|0|0|1|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |M|M|M|M|M|M|M|M| 4 |0|0|0|0|0|0|0|0| - |---------------| |---------------| |---------------| |---------------| - -Byte 1: Bit7~Bit6 => 00, Normal data packet - => 01, Absolute coordination packet - => 10, Notify packet - Bit5 => 0 - Bit4 => 1 - Bit3 => 1 - Bit2 => Middle Button, 1 is pressed, 0 is not pressed. - Bit1 => Right Button, 1 is pressed, 0 is not pressed. - Bit0 => Left Button, 1 is pressed, 0 is not pressed. -Byte 2: Message Type => 0x5A (Enable/Disable status packet) - Mode Type => 0xA5 (Normal/Icon mode status) -Byte 3: Message Type => 0x00 (Disabled) - => 0x01 (Enabled) - Mode Type => 0x00 (Normal) - => 0x01 (Icon) -Byte 4: Bit7~Bit0 => Don't Care - -============================================================================== -* Absolute position for STL3888-Ax. -============================================================================== -Packet 1 (ABSOLUTE POSITION) - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |0|1|V|A|1|L|0|1| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |x|x|y|y|X|X|Y|Y| - |---------------| |---------------| |---------------| |---------------| - -Byte 1: Bit7~Bit6 => 00, Normal data packet - => 01, Absolute coordination packet - => 10, Notify packet - => 11, Normal data packet with on-pad click - Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up. - When both fingers are up, the last two reports have zero valid - bit. - Bit4 => arc - Bit3 => 1 - Bit2 => Left Button, 1 is pressed, 0 is released. - Bit1 => 0 - Bit0 => 1 -Byte 2: X coordinate (xpos[9:2]) -Byte 3: Y coordinate (ypos[9:2]) -Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) - Bit3~Bit2 => X coordinate (ypos[1:0]) - Bit5~Bit4 => y1_g - Bit7~Bit6 => x1_g - -Packet 2 (ABSOLUTE POSITION) - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |0|1|V|A|1|R|1|0| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |x|x|y|y|X|X|Y|Y| - |---------------| |---------------| |---------------| |---------------| - -Byte 1: Bit7~Bit6 => 00, Normal data packet - => 01, Absolute coordinates packet - => 10, Notify packet - => 11, Normal data packet with on-pad click - Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up. - When both fingers are up, the last two reports have zero valid - bit. - Bit4 => arc - Bit3 => 1 - Bit2 => Right Button, 1 is pressed, 0 is released. 
- Bit1 => 1 - Bit0 => 0 -Byte 2: X coordinate (xpos[9:2]) -Byte 3: Y coordinate (ypos[9:2]) -Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) - Bit3~Bit2 => X coordinate (ypos[1:0]) - Bit5~Bit4 => y2_g - Bit7~Bit6 => x2_g - -Notify Packet for STL3888-Ax - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|d|u|0|0|0|0| - |---------------| |---------------| |---------------| |---------------| - -Byte 1: Bit7~Bit6 => 00, Normal data packet - => 01, Absolute coordinates packet - => 10, Notify packet - => 11, Normal data packet with on-pad click - Bit5 => 1 - Bit4 => when in absolute coordinates mode (valid when EN_PKT_GO is 1): - 0: left button is generated by the on-pad command - 1: left button is generated by the external button - Bit3 => 1 - Bit2 => Middle Button, 1 is pressed, 0 is not pressed. - Bit1 => Right Button, 1 is pressed, 0 is not pressed. - Bit0 => Left Button, 1 is pressed, 0 is not pressed. -Byte 2: Message Type => 0xB7 (Multi Finger, Multi Coordinate mode) -Byte 3: Bit7~Bit6 => Don't care - Bit5~Bit4 => Number of fingers - Bit3~Bit1 => Reserved - Bit0 => 1: enter gesture mode; 0: leaving gesture mode -Byte 4: Bit7 => scroll right button - Bit6 => scroll left button - Bit5 => scroll down button - Bit4 => scroll up button - * Note that if gesture and additional button (Bit4~Bit7) - happen at the same time, the button information will not - be sent. - Bit3~Bit0 => Reserved - -Sample sequence of Multi-finger, Multi-coordinate mode: - - notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1, - abs pkt 2, ..., notify packet (valid bit == 0) - -============================================================================== -* Absolute position for STL3888-B0. -============================================================================== -Packet 1(ABSOLUTE POSITION) - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |0|1|V|F|1|0|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|u|d|X|X|Y|Y| - |---------------| |---------------| |---------------| |---------------| - -Byte 1: Bit7~Bit6 => 00, Normal data packet - => 01, Absolute coordinates packet - => 10, Notify packet - => 11, Normal data packet with on-pad click - Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up. - When both fingers are up, the last two reports have zero valid - bit. - Bit4 => finger up/down information. 1: finger down, 0: finger up. - Bit3 => 1 - Bit2 => finger index, 0 is the first finger, 1 is the second finger. - Bit1 => Right Button, 1 is pressed, 0 is not pressed. - Bit0 => Left Button, 1 is pressed, 0 is not pressed. 
-Byte 2: X coordinate (xpos[9:2]) -Byte 3: Y coordinate (ypos[9:2]) -Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) - Bit3~Bit2 => X coordinate (ypos[1:0]) - Bit4 => scroll down button - Bit5 => scroll up button - Bit6 => scroll left button - Bit7 => scroll right button - -Packet 2 (ABSOLUTE POSITION) - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |0|1|V|F|1|1|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|u|d|X|X|Y|Y| - |---------------| |---------------| |---------------| |---------------| - -Byte 1: Bit7~Bit6 => 00, Normal data packet - => 01, Absolute coordination packet - => 10, Notify packet - => 11, Normal data packet with on-pad click - Bit5 => Valid bit, 0 means that the coordinate is invalid or finger up. - When both fingers are up, the last two reports have zero valid - bit. - Bit4 => finger up/down information. 1: finger down, 0: finger up. - Bit3 => 1 - Bit2 => finger index, 0 is the first finger, 1 is the second finger. - Bit1 => Right Button, 1 is pressed, 0 is not pressed. - Bit0 => Left Button, 1 is pressed, 0 is not pressed. -Byte 2: X coordinate (xpos[9:2]) -Byte 3: Y coordinate (ypos[9:2]) -Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) - Bit3~Bit2 => X coordinate (ypos[1:0]) - Bit4 => scroll down button - Bit5 => scroll up button - Bit6 => scroll left button - Bit7 => scroll right button - -Notify Packet for STL3888-B0 - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |1|0|1|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|u|d|0|0|0|0| - |---------------| |---------------| |---------------| |---------------| - -Byte 1: Bit7~Bit6 => 00, Normal data packet - => 01, Absolute coordination packet - => 10, Notify packet - => 11, Normal data packet with on-pad click - Bit5 => 1 - Bit4 => when in absolute coordinates mode (valid when EN_PKT_GO is 1): - 0: left button is generated by the on-pad command - 1: left button is generated by the external button - Bit3 => 1 - Bit2 => Middle Button, 1 is pressed, 0 is not pressed. - Bit1 => Right Button, 1 is pressed, 0 is not pressed. - Bit0 => Left Button, 1 is pressed, 0 is not pressed. -Byte 2: Message Type => 0xB7 (Multi Finger, Multi Coordinate mode) -Byte 3: Bit7~Bit6 => Don't care - Bit5~Bit4 => Number of fingers - Bit3~Bit1 => Reserved - Bit0 => 1: enter gesture mode; 0: leaving gesture mode -Byte 4: Bit7 => scroll right button - Bit6 => scroll left button - Bit5 => scroll up button - Bit4 => scroll down button - * Note that if gesture and additional button(Bit4~Bit7) - happen at the same time, the button information will not - be sent. - Bit3~Bit0 => Reserved - -Sample sequence of Multi-finger, Multi-coordinate mode: - - notify packet (valid bit == 1), abs pkt 1, abs pkt 2, abs pkt 1, - abs pkt 2, ..., notify packet (valid bit == 0) - -============================================================================== -* Absolute position for STL3888-Cx and STL3888-Dx. 
-============================================================================== -Single Finger, Absolute Coordinate Mode (SFAC) - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |0|1|0|P|1|M|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|B|F|X|X|Y|Y| - |---------------| |---------------| |---------------| |---------------| - -Byte 1: Bit7~Bit6 => 00, Normal data packet - => 01, Absolute coordinates packet - => 10, Notify packet - Bit5 => Coordinate mode(always 0 in SFAC mode): - 0: single-finger absolute coordinates (SFAC) mode - 1: multi-finger, multiple coordinates (MFMC) mode - Bit4 => 0: The LEFT button is generated by on-pad command (OPC) - 1: The LEFT button is generated by external button - Default is 1 even if the LEFT button is not pressed. - Bit3 => Always 1, as specified by PS/2 protocol. - Bit2 => Middle Button, 1 is pressed, 0 is not pressed. - Bit1 => Right Button, 1 is pressed, 0 is not pressed. - Bit0 => Left Button, 1 is pressed, 0 is not pressed. -Byte 2: X coordinate (xpos[9:2]) -Byte 3: Y coordinate (ypos[9:2]) -Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) - Bit3~Bit2 => X coordinate (ypos[1:0]) - Bit4 => 4th mouse button(forward one page) - Bit5 => 5th mouse button(backward one page) - Bit6 => scroll left button - Bit7 => scroll right button - -Multi Finger, Multiple Coordinates Mode (MFMC): - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |0|1|1|P|1|F|R|L| 2 |X|X|X|X|X|X|X|X| 3 |Y|Y|Y|Y|Y|Y|Y|Y| 4 |r|l|B|F|X|X|Y|Y| - |---------------| |---------------| |---------------| |---------------| - -Byte 1: Bit7~Bit6 => 00, Normal data packet - => 01, Absolute coordination packet - => 10, Notify packet - Bit5 => Coordinate mode (always 1 in MFMC mode): - 0: single-finger absolute coordinates (SFAC) mode - 1: multi-finger, multiple coordinates (MFMC) mode - Bit4 => 0: The LEFT button is generated by on-pad command (OPC) - 1: The LEFT button is generated by external button - Default is 1 even if the LEFT button is not pressed. - Bit3 => Always 1, as specified by PS/2 protocol. - Bit2 => Finger index, 0 is the first finger, 1 is the second finger. - If bit 1 and 0 are all 1 and bit 4 is 0, the middle external - button is pressed. - Bit1 => Right Button, 1 is pressed, 0 is not pressed. - Bit0 => Left Button, 1 is pressed, 0 is not pressed. -Byte 2: X coordinate (xpos[9:2]) -Byte 3: Y coordinate (ypos[9:2]) -Byte 4: Bit1~Bit0 => Y coordinate (xpos[1:0]) - Bit3~Bit2 => X coordinate (ypos[1:0]) - Bit4 => 4th mouse button(forward one page) - Bit5 => 5th mouse button(backward one page) - Bit6 => scroll left button - Bit7 => scroll right button - - When one of the two fingers is up, the device will output four consecutive -MFMC#0 report packets with zero X and Y to represent 1st finger is up or -four consecutive MFMC#1 report packets with zero X and Y to represent that -the 2nd finger is up. On the other hand, if both fingers are up, the device -will output four consecutive single-finger, absolute coordinate(SFAC) packets -with zero X and Y. 
- -Notify Packet for STL3888-Cx/Dx - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |1|0|0|P|1|M|R|L| 2 |C|C|C|C|C|C|C|C| 3 |0|0|F|F|0|0|0|i| 4 |r|l|u|d|0|0|0|0| - |---------------| |---------------| |---------------| |---------------| - -Byte 1: Bit7~Bit6 => 00, Normal data packet - => 01, Absolute coordinates packet - => 10, Notify packet - Bit5 => Always 0 - Bit4 => 0: The LEFT button is generated by on-pad command(OPC) - 1: The LEFT button is generated by external button - Default is 1 even if the LEFT button is not pressed. - Bit3 => 1 - Bit2 => Middle Button, 1 is pressed, 0 is not pressed. - Bit1 => Right Button, 1 is pressed, 0 is not pressed. - Bit0 => Left Button, 1 is pressed, 0 is not pressed. -Byte 2: Message type: - 0xba => gesture information - 0xc0 => one finger hold-rotating gesture -Byte 3: The first parameter for the received message: - 0xba => gesture ID (refer to the 'Gesture ID' section) - 0xc0 => region ID -Byte 4: The second parameter for the received message: - 0xba => N/A - 0xc0 => finger up/down information - -Sample sequence of Multi-finger, Multi-coordinates mode: - - notify packet (valid bit == 1), MFMC packet 1 (byte 1, bit 2 == 0), - MFMC packet 2 (byte 1, bit 2 == 1), MFMC packet 1, MFMC packet 2, - ..., notify packet (valid bit == 0) - - That is, when the device is in MFMC mode, the host will receive - interleaved absolute coordinate packets for each finger. - -============================================================================== -* FSP Enable/Disable packet -============================================================================== - Bit 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -BYTE |---------------|BYTE |---------------|BYTE|---------------|BYTE|---------------| - 1 |Y|X|0|0|1|M|R|L| 2 |0|1|0|1|1|0|1|E| 3 | | | | | | | | | 4 | | | | | | | | | - |---------------| |---------------| |---------------| |---------------| - -FSP will send out enable/disable packet when FSP receive PS/2 enable/disable -command. Host will receive the packet which Middle, Right, Left button will -be set. The packet only use byte 0 and byte 1 as a pattern of original packet. -Ignore the other bytes of the packet. - -Byte 1: Bit7 => 0, Y overflow - Bit6 => 0, X overflow - Bit5 => 0, Y sign bit - Bit4 => 0, X sign bit - Bit3 => 1 - Bit2 => 1, Middle Button - Bit1 => 1, Right Button - Bit0 => 1, Left Button -Byte 2: Bit7~1 => (0101101b) - Bit0 => 1 = Enable - 0 = Disable -Byte 3: Don't care -Byte 4: Don't care (MOUSE ID 3, 4) -Byte 5~8: Don't care (Absolute packet) - -============================================================================== -* PS/2 Command Set -============================================================================== - -FSP supports basic PS/2 commanding set and modes, refer to following URL for -details about PS/2 commands: - -http://www.computer-engineering.org/ps2mouse/ - -============================================================================== -* Programming Sequence for Determining Packet Parsing Flow -============================================================================== -1. Identify FSP by reading device ID(0x00) and version(0x01) register - -2a. 
For FSP version < STL3888 Cx, determine number of buttons by reading - the 'test mode status' (0x20) register: - - buttons = reg[0x20] & 0x30 - - if buttons == 0x30 or buttons == 0x20: - # two/four buttons - Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse' - section A for packet parsing detail(ignore byte 4, bit ~ 7) - elif buttons == 0x10: - # 6 buttons - Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse' - section B for packet parsing detail - elif buttons == 0x00: - # 6 buttons - Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse' - section A for packet parsing detail - -2b. For FSP version >= STL3888 Cx: - Refer to 'Finger Sensing Pad PS/2 Mouse Intellimouse' - section A for packet parsing detail (ignore byte 4, bit ~ 7) - -============================================================================== -* Programming Sequence for Register Reading/Writing -============================================================================== - -Register inversion requirement: - - Following values needed to be inverted(the '~' operator in C) before being -sent to FSP: - - 0xe8, 0xe9, 0xee, 0xf2, 0xf3 and 0xff. - -Register swapping requirement: - - Following values needed to have their higher 4 bits and lower 4 bits being -swapped before being sent to FSP: - - 10, 20, 40, 60, 80, 100 and 200. - -Register reading sequence: - - 1. send 0xf3 PS/2 command to FSP; - - 2. send 0x66 PS/2 command to FSP; - - 3. send 0x88 PS/2 command to FSP; - - 4. send 0xf3 PS/2 command to FSP; - - 5. if the register address being to read is not required to be - inverted(refer to the 'Register inversion requirement' section), - goto step 6 - - 5a. send 0x68 PS/2 command to FSP; - - 5b. send the inverted register address to FSP and goto step 8; - - 6. if the register address being to read is not required to be - swapped(refer to the 'Register swapping requirement' section), - goto step 7 - - 6a. send 0xcc PS/2 command to FSP; - - 6b. send the swapped register address to FSP and goto step 8; - - 7. send 0x66 PS/2 command to FSP; - - 7a. send the original register address to FSP and goto step 8; - - 8. send 0xe9(status request) PS/2 command to FSP; - - 9. the 4th byte of the response read from FSP should be the - requested register value(?? indicates don't care byte): - - host: 0xe9 - 3888: 0xfa (??) (??) (val) - - * Note that since the Cx release, the hardware will return 1's - complement of the register value at the 3rd byte of status request - result: - - host: 0xe9 - 3888: 0xfa (??) (~val) (val) - -Register writing sequence: - - 1. send 0xf3 PS/2 command to FSP; - - 2. if the register address being to write is not required to be - inverted(refer to the 'Register inversion requirement' section), - goto step 3 - - 2a. send 0x74 PS/2 command to FSP; - - 2b. send the inverted register address to FSP and goto step 5; - - 3. if the register address being to write is not required to be - swapped(refer to the 'Register swapping requirement' section), - goto step 4 - - 3a. send 0x77 PS/2 command to FSP; - - 3b. send the swapped register address to FSP and goto step 5; - - 4. send 0x55 PS/2 command to FSP; - - 4a. send the register address to FSP and goto step 5; - - 5. send 0xf3 PS/2 command to FSP; - - 6. if the register value being to write is not required to be - inverted(refer to the 'Register inversion requirement' section), - goto step 7 - - 6a. send 0x47 PS/2 command to FSP; - - 6b. send the inverted register value to FSP and goto step 9; - - 7. 
if the register value being to write is not required to be - swapped(refer to the 'Register swapping requirement' section), - goto step 8 - - 7a. send 0x44 PS/2 command to FSP; - - 7b. send the swapped register value to FSP and goto step 9; - - 8. send 0x33 PS/2 command to FSP; - - 8a. send the register value to FSP; - - 9. the register writing sequence is completed. - - * Note that since the Cx release, the hardware will return 1's - complement of the register value at the 3rd byte of status request - result. Host can optionally send another 0xe9 (status request) PS/2 - command to FSP at the end of register writing to verify that the - register writing operation is successful (?? indicates don't care - byte): - - host: 0xe9 - 3888: 0xfa (??) (~val) (val) - -============================================================================== -* Programming Sequence for Page Register Reading/Writing -============================================================================== - - In order to overcome the limitation of maximum number of registers -supported, the hardware separates register into different groups called -'pages.' Each page is able to include up to 255 registers. - - The default page after power up is 0x82; therefore, if one has to get -access to register 0x8301, one has to use following sequence to switch -to page 0x83, then start reading/writing from/to offset 0x01 by using -the register read/write sequence described in previous section. - -Page register reading sequence: - - 1. send 0xf3 PS/2 command to FSP; - - 2. send 0x66 PS/2 command to FSP; - - 3. send 0x88 PS/2 command to FSP; - - 4. send 0xf3 PS/2 command to FSP; - - 5. send 0x83 PS/2 command to FSP; - - 6. send 0x88 PS/2 command to FSP; - - 7. send 0xe9(status request) PS/2 command to FSP; - - 8. the response read from FSP should be the requested page value. - -Page register writing sequence: - - 1. send 0xf3 PS/2 command to FSP; - - 2. send 0x38 PS/2 command to FSP; - - 3. send 0x88 PS/2 command to FSP; - - 4. send 0xf3 PS/2 command to FSP; - - 5. if the page address being written is not required to be - inverted(refer to the 'Register inversion requirement' section), - goto step 6 - - 5a. send 0x47 PS/2 command to FSP; - - 5b. send the inverted page address to FSP and goto step 9; - - 6. if the page address being written is not required to be - swapped(refer to the 'Register swapping requirement' section), - goto step 7 - - 6a. send 0x44 PS/2 command to FSP; - - 6b. send the swapped page address to FSP and goto step 9; - - 7. send 0x33 PS/2 command to FSP; - - 8. send the page address to FSP; - - 9. the page register writing sequence is completed. - -============================================================================== -* Gesture ID -============================================================================== - - Unlike other devices which sends multiple fingers' coordinates to host, -FSP processes multiple fingers' coordinates internally and convert them -into a 8 bits integer, namely 'Gesture ID.' 
Following is a list of -supported gesture IDs: - - ID Description - 0x86 2 finger straight up - 0x82 2 finger straight down - 0x80 2 finger straight right - 0x84 2 finger straight left - 0x8f 2 finger zoom in - 0x8b 2 finger zoom out - 0xc0 2 finger curve, counter clockwise - 0xc4 2 finger curve, clockwise - 0x2e 3 finger straight up - 0x2a 3 finger straight down - 0x28 3 finger straight right - 0x2c 3 finger straight left - 0x38 palm - -============================================================================== -* Register Listing -============================================================================== - - Registers are represented in 16 bits values. The higher 8 bits represent -the page address and the lower 8 bits represent the relative offset within -that particular page. Refer to the 'Programming Sequence for Page Register -Reading/Writing' section for instructions on how to change current page -address. - -offset width default r/w name -0x8200 bit7~bit0 0x01 RO device ID - -0x8201 bit7~bit0 RW version ID - 0xc1: STL3888 Ax - 0xd0 ~ 0xd2: STL3888 Bx - 0xe0 ~ 0xe1: STL3888 Cx - 0xe2 ~ 0xe3: STL3888 Dx - -0x8202 bit7~bit0 0x01 RO vendor ID - -0x8203 bit7~bit0 0x01 RO product ID - -0x8204 bit3~bit0 0x01 RW revision ID - -0x820b test mode status 1 - bit3 1 RO 0: rotate 180 degree - 1: no rotation - *only supported by H/W prior to Cx - -0x820f register file page control - bit2 0 RW 1: rotate 180 degree - 0: no rotation - *supported since Cx - - bit0 0 RW 1 to enable page 1 register files - *only supported by H/W prior to Cx - -0x8210 RW system control 1 - bit0 1 RW Reserved, must be 1 - bit1 0 RW Reserved, must be 0 - bit4 0 RW Reserved, must be 0 - bit5 1 RW register clock gating enable - 0: read only, 1: read/write enable - (Note that following registers does not require clock gating being - enabled prior to write: 05 06 07 08 09 0c 0f 10 11 12 16 17 18 23 2e - 40 41 42 43. In addition to that, this bit must be 1 when gesture - mode is enabled) - -0x8220 test mode status - bit5~bit4 RO number of buttons - 11 => 2, lbtn/rbtn - 10 => 4, lbtn/rbtn/scru/scrd - 01 => 6, lbtn/rbtn/scru/scrd/scrl/scrr - 00 => 6, lbtn/rbtn/scru/scrd/fbtn/bbtn - *only supported by H/W prior to Cx - -0x8231 RW on-pad command detection - bit7 0 RW on-pad command left button down tag - enable - 0: disable, 1: enable - *only supported by H/W prior to Cx - -0x8234 RW on-pad command control 5 - bit4~bit0 0x05 RW XLO in 0s/4/1, so 03h = 0010.1b = 2.5 - (Note that position unit is in 0.5 scanline) - *only supported by H/W prior to Cx - - bit7 0 RW on-pad tap zone enable - 0: disable, 1: enable - *only supported by H/W prior to Cx - -0x8235 RW on-pad command control 6 - bit4~bit0 0x1d RW XHI in 0s/4/1, so 19h = 1100.1b = 12.5 - (Note that position unit is in 0.5 scanline) - *only supported by H/W prior to Cx - -0x8236 RW on-pad command control 7 - bit4~bit0 0x04 RW YLO in 0s/4/1, so 03h = 0010.1b = 2.5 - (Note that position unit is in 0.5 scanline) - *only supported by H/W prior to Cx - -0x8237 RW on-pad command control 8 - bit4~bit0 0x13 RW YHI in 0s/4/1, so 11h = 1000.1b = 8.5 - (Note that position unit is in 0.5 scanline) - *only supported by H/W prior to Cx - -0x8240 RW system control 5 - bit1 0 RW FSP Intellimouse mode enable - 0: disable, 1: enable - *only supported by H/W prior to Cx - - bit2 0 RW movement + abs. coordinate mode enable - 0: disable, 1: enable - (Note that this function has the functionality of bit 1 even when - bit 1 is not set. However, the format is different from that of bit 1. 
- In addition, when bit 1 and bit 2 are set at the same time, bit 2 will - override bit 1.) - *only supported by H/W prior to Cx - - bit3 0 RW abs. coordinate only mode enable - 0: disable, 1: enable - (Note that this function has the functionality of bit 1 even when - bit 1 is not set. However, the format is different from that of bit 1. - In addition, when bit 1, bit 2 and bit 3 are set at the same time, - bit 3 will override bit 1 and 2.) - *only supported by H/W prior to Cx - - bit5 0 RW auto switch enable - 0: disable, 1: enable - *only supported by H/W prior to Cx - - bit6 0 RW G0 abs. + notify packet format enable - 0: disable, 1: enable - (Note that the absolute/relative coordinate output still depends on - bit 2 and 3. That is, if any of those bit is 1, host will receive - absolute coordinates; otherwise, host only receives packets with - relative coordinate.) - *only supported by H/W prior to Cx - - bit7 0 RW EN_PS2_F2: PS/2 gesture mode 2nd - finger packet enable - 0: disable, 1: enable - *only supported by H/W prior to Cx - -0x8243 RW on-pad control - bit0 0 RW on-pad control enable - 0: disable, 1: enable - (Note that if this bit is cleared, bit 3/5 will be ineffective) - *only supported by H/W prior to Cx - - bit3 0 RW on-pad fix vertical scrolling enable - 0: disable, 1: enable - *only supported by H/W prior to Cx - - bit5 0 RW on-pad fix horizontal scrolling enable - 0: disable, 1: enable - *only supported by H/W prior to Cx - -0x8290 RW software control register 1 - bit0 0 RW absolute coordination mode - 0: disable, 1: enable - *supported since Cx - - bit1 0 RW gesture ID output - 0: disable, 1: enable - *supported since Cx - - bit2 0 RW two fingers' coordinates output - 0: disable, 1: enable - *supported since Cx - - bit3 0 RW finger up one packet output - 0: disable, 1: enable - *supported since Cx - - bit4 0 RW absolute coordination continuous mode - 0: disable, 1: enable - *supported since Cx - - bit6~bit5 00 RW gesture group selection - 00: basic - 01: suite - 10: suite pro - 11: advanced - *supported since Cx - - bit7 0 RW Bx packet output compatible mode - 0: disable, 1: enable *supported since Cx - *supported since Cx - - -0x833d RW on-pad command control 1 - bit7 1 RW on-pad command detection enable - 0: disable, 1: enable - *supported since Cx - -0x833e RW on-pad command detection - bit7 0 RW on-pad command left button down tag - enable. Works only in H/W based PS/2 - data packet mode. 
-		0: disable, 1: enable
-		*supported since Cx
diff --git a/Documentation/input/shape.fig b/Documentation/input/shape.fig
deleted file mode 100644
index c22bff83d06f74a116eb10020f1329ef814dd4ca..0000000000000000000000000000000000000000
--- a/Documentation/input/shape.fig
+++ /dev/null
@@ -1,65 +0,0 @@
-#FIG 3.2
-Landscape
-Center
-Inches
-Letter
-100.00
-Single
--2
-1200 2
-2 1 0 2 0 7 50 0 -1 0.000 0 0 -1 0 0 6
-	 4200 3600 4200 3075 4950 2325 7425 2325 8250 3150 8250 3600
-2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
-	 4200 3675 4200 5400
-2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
-	 8250 3675 8250 5400
-2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
-	 3675 3600 8700 3600
-2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
-	 8775 3600 10200 3600
-2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
-	 8325 3150 9075 3150
-2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
-	 7500 2325 10200 2325
-2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
-	 3600 3600 3000 3600
-2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
-	 4125 3075 3000 3075
-2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2
-	0 0 1.00 60.00 120.00
-	0 0 1.00 60.00 120.00
-	 4200 5400 8175 5400
-2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2
-	0 0 1.00 60.00 120.00
-	0 0 1.00 60.00 120.00
-	 10125 2325 10125 3600
-2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2
-	0 0 1.00 60.00 120.00
-	0 0 1.00 60.00 120.00
-	 3000 3150 3000 3600
-2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2
-	0 0 1.00 60.00 120.00
-	0 0 1.00 60.00 120.00
-	 9075 3150 9075 3600
-2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
-	 4950 2325 4950 1200
-2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 2
-	 7425 2325 7425 1200
-2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 4
-	 4200 3075 4200 2400 3600 1800 3600 1200
-2 1 1 1 0 7 50 0 -1 4.000 0 0 -1 0 0 4
-	 8250 3150 8250 2475 8775 1950 8775 1200
-2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2
-	0 0 1.00 60.00 120.00
-	0 0 1.00 60.00 120.00
-	 3600 1275 4950 1275
-2 1 0 1 0 7 50 0 -1 4.000 0 0 -1 1 1 2
-	0 0 1.00 60.00 120.00
-	0 0 1.00 60.00 120.00
-	 7425 1275 8700 1275
-4 1 0 50 0 0 12 0.0000 4 135 1140 6075 5325 Effect duration\001
-4 0 0 50 0 0 12 0.0000 4 180 1305 10200 3000 Effect magnitude\001
-4 0 0 50 0 0 12 0.0000 4 135 780 9150 3450 Fade level\001
-4 1 0 50 0 0 12 0.0000 4 180 1035 4275 1200 Attack length\001
-4 1 0 50 0 0 12 0.0000 4 180 885 8175 1200 Fade length\001
-4 2 0 50 0 0 12 0.0000 4 135 930 2925 3375 Attack level\001
diff --git a/Documentation/input/shape.svg b/Documentation/input/shape.svg
new file mode 100644
index 0000000000000000000000000000000000000000..fc0af44db3f79d3328434bf60785d62bb4f90a50
--- /dev/null
+++ b/Documentation/input/shape.svg
@@ -0,0 +1,39 @@
+  [The SVG markup was lost in extraction and is not reproduced here. The
+   image is the force-feedback effect envelope diagram, carrying the same
+   labels as the removed shape.fig: "Effect duration", "Effect magnitude",
+   "Fade level", "Attack length", "Fade length" and "Attack level".]
diff --git a/Documentation/input/uinput.rst b/Documentation/input/uinput.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b8e90b6a126cdb5e58c8feb81407bc4121df0b80
--- /dev/null
+++ b/Documentation/input/uinput.rst
@@ -0,0 +1,245 @@
+=============
+uinput module
+=============
+
+Introduction
+============
+
+uinput is a kernel module that makes it possible to emulate input devices
+from userspace. By writing to the /dev/uinput (or /dev/input/uinput) device,
+a process can create a virtual input device with specific capabilities. Once
+this virtual device is created, the process can send events through it that
+will be delivered to userspace and in-kernel consumers.
+
+Interface
+=========
+
+::
+
+  linux/uinput.h
+
+The uinput header defines ioctls to create, set up, and destroy virtual
+devices.
+
+libevdev
+========
+
+libevdev is a wrapper library for evdev devices that provides interfaces to
+create uinput devices and send events. libevdev is less error-prone than
+accessing uinput directly, and should be considered for new software.
+
+For examples and more information about libevdev:
+https://www.freedesktop.org/software/libevdev/doc/latest/
+
+Examples
+========
+
+Keyboard events
+---------------
+
+This first example shows how to create a new virtual device, and how to
+send a key event. All default imports and error handlers were removed for
+the sake of simplicity.
+
+.. code-block:: c
+
+   #include <linux/uinput.h>
+
+   void emit(int fd, int type, int code, int val)
+   {
+      struct input_event ie;
+
+      ie.type = type;
+      ie.code = code;
+      ie.value = val;
+      /* timestamp values below are ignored */
+      ie.time.tv_sec = 0;
+      ie.time.tv_usec = 0;
+
+      write(fd, &ie, sizeof(ie));
+   }
+
+   int main(void)
+   {
+      struct uinput_setup usetup;
+
+      int fd = open("/dev/uinput", O_WRONLY | O_NONBLOCK);
+
+      /*
+       * The ioctls below will enable the device that is about to be
+       * created to pass key events, in this case the space key.
+       */
+      ioctl(fd, UI_SET_EVBIT, EV_KEY);
+      ioctl(fd, UI_SET_KEYBIT, KEY_SPACE);
+
+      memset(&usetup, 0, sizeof(usetup));
+      usetup.id.bustype = BUS_USB;
+      usetup.id.vendor = 0x1234; /* sample vendor */
+      usetup.id.product = 0x5678; /* sample product */
+      strcpy(usetup.name, "Example device");
+
+      ioctl(fd, UI_DEV_SETUP, &usetup);
+      ioctl(fd, UI_DEV_CREATE);
+
+      /*
+       * On UI_DEV_CREATE the kernel will create the device node for this
+       * device. We are inserting a pause here so that userspace has time
+       * to detect and initialize the new device and start listening to
+       * it; otherwise it will not notice the events we are about to send.
+       * This pause is only needed in our example code!
+       */
+      sleep(1);
+
+      /* Key press, report the event, send key release, and report again */
+      emit(fd, EV_KEY, KEY_SPACE, 1);
+      emit(fd, EV_SYN, SYN_REPORT, 0);
+      emit(fd, EV_KEY, KEY_SPACE, 0);
+      emit(fd, EV_SYN, SYN_REPORT, 0);
+
+      /*
+       * Give userspace some time to read the events before we destroy the
+       * device with UI_DEV_DESTROY.
+       */
+      sleep(1);
+
+      ioctl(fd, UI_DEV_DESTROY);
+      close(fd);
+
+      return 0;
+   }
+
+Mouse movements
+---------------
+
+This example shows how to create a virtual device that behaves like a physical
+mouse.
+
+.. code-block:: c
+
+   #include <linux/uinput.h>
+
+   /* emit function is identical to that of the first example */
+
+   int main(void)
+   {
+      struct uinput_setup usetup;
+      int i = 50;
+
+      int fd = open("/dev/uinput", O_WRONLY | O_NONBLOCK);
+
+      /* enable mouse button left and relative events */
+      ioctl(fd, UI_SET_EVBIT, EV_KEY);
+      ioctl(fd, UI_SET_KEYBIT, BTN_LEFT);
+
+      ioctl(fd, UI_SET_EVBIT, EV_REL);
+      ioctl(fd, UI_SET_RELBIT, REL_X);
+      ioctl(fd, UI_SET_RELBIT, REL_Y);
+
+      memset(&usetup, 0, sizeof(usetup));
+      usetup.id.bustype = BUS_USB;
+      usetup.id.vendor = 0x1234; /* sample vendor */
+      usetup.id.product = 0x5678; /* sample product */
+      strcpy(usetup.name, "Example device");
+
+      ioctl(fd, UI_DEV_SETUP, &usetup);
+      ioctl(fd, UI_DEV_CREATE);
+
+      /*
+       * On UI_DEV_CREATE the kernel will create the device node for this
+       * device. We are inserting a pause here so that userspace has time
+       * to detect and initialize the new device and start listening to
+       * it; otherwise it will not notice the events we are about to send.
+       * This pause is only needed in our example code!
+       */
+      sleep(1);
+
+      /* Move the mouse diagonally, 5 units per axis */
+      while (i--) {
+         emit(fd, EV_REL, REL_X, 5);
+         emit(fd, EV_REL, REL_Y, 5);
+         emit(fd, EV_SYN, SYN_REPORT, 0);
+         usleep(15000);
+      }
+
+      /*
+       * Give userspace some time to read the events before we destroy the
+       * device with UI_DEV_DESTROY.
+       */
+      sleep(1);
+
+      ioctl(fd, UI_DEV_DESTROY);
+      close(fd);
+
+      return 0;
+   }
+
+
+uinput old interface
+--------------------
+
+Before uinput version 5, there wasn't a dedicated ioctl to set up a virtual
+device. Programs supporting older versions of the uinput interface need to
+fill a uinput_user_dev structure and write it to the uinput file descriptor
+to configure the new uinput device. New code should not use the old
+interface but interact with uinput via ioctl calls, or use libevdev.
+
+.. code-block:: c
+
+   #include <linux/uinput.h>
+
+   /* emit function is identical to that of the first example */
+
+   int main(void)
+   {
+      struct uinput_user_dev uud;
+      int version, rc, fd;
+
+      fd = open("/dev/uinput", O_WRONLY | O_NONBLOCK);
+      rc = ioctl(fd, UI_GET_VERSION, &version);
+
+      if (rc == 0 && version >= 5) {
+         /* use UI_DEV_SETUP */
+         return 0;
+      }
+
+      /*
+       * The ioctls below will enable the device that is about to be
+       * created to pass key events, in this case the space key.
+       */
+      ioctl(fd, UI_SET_EVBIT, EV_KEY);
+      ioctl(fd, UI_SET_KEYBIT, KEY_SPACE);
+
+      memset(&uud, 0, sizeof(uud));
+      snprintf(uud.name, UINPUT_MAX_NAME_SIZE, "uinput old interface");
+      write(fd, &uud, sizeof(uud));
+
+      ioctl(fd, UI_DEV_CREATE);
+
+      /*
+       * On UI_DEV_CREATE the kernel will create the device node for this
+       * device. We are inserting a pause here so that userspace has time
+       * to detect and initialize the new device and start listening to
+       * it; otherwise it will not notice the events we are about to send.
+       * This pause is only needed in our example code!
+       */
+      sleep(1);
+
+      /* Key press, report the event, send key release, and report again */
+      emit(fd, EV_KEY, KEY_SPACE, 1);
+      emit(fd, EV_SYN, SYN_REPORT, 0);
+      emit(fd, EV_KEY, KEY_SPACE, 0);
+      emit(fd, EV_SYN, SYN_REPORT, 0);
+
+      /*
+       * Give userspace some time to read the events before we destroy the
+       * device with UI_DEV_DESTROY.
+       */
+      sleep(1);
+
+      ioctl(fd, UI_DEV_DESTROY);
+
+      close(fd);
+      return 0;
+   }
+
diff --git a/Documentation/input/userio.rst b/Documentation/input/userio.rst
new file mode 100644
index 0000000000000000000000000000000000000000..f780c77931fe086dcc5f21648e0aac56566dec63
--- /dev/null
+++ b/Documentation/input/userio.rst
@@ -0,0 +1,85 @@
+.. include:: <isonum.txt>
+
+===================
+The userio Protocol
+===================
+
+
+:Copyright: |copy| 2015 Stephen Chandler Paul
+
+Sponsored by Red Hat
+
+
+Introduction
+=============
+
+This module is intended to try to make the lives of input driver developers
+easier by allowing them to test various serio devices (mainly the various
+touchpads found on laptops) without having to have the physical device in front
+of them. userio accomplishes this by allowing any privileged userspace program
+to directly interact with the kernel's serio driver and control a virtual serio
+port from there.
+
+Usage overview
+==============
+
+In order to interact with the userio kernel module, one simply opens the
+/dev/userio character device in their applications. Commands are sent to the
+kernel module by writing to the device, and any data received from the serio
+driver is read as-is from the /dev/userio device. All of the structures and
+macros you need to interact with the device are defined in <linux/userio.h> and
+<linux/serio.h>.
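+For example, a minimal sketch of a program driving a virtual PS/2 port could
+look like the following (error handling omitted; the commands used here are
+described in the next section):
+
+.. code-block:: c
+
+   #include <fcntl.h>
+   #include <unistd.h>
+   #include <linux/userio.h>
+   #include <linux/serio.h>
+
+   int main(void)
+   {
+      struct userio_cmd cmd;
+      int fd = open("/dev/userio", O_RDWR);
+
+      if (fd < 0)
+         return 1;
+
+      /* Emulate a normal PS/2 (i8042) port... */
+      cmd.type = USERIO_CMD_SET_PORT_TYPE;
+      cmd.data = SERIO_8042;
+      write(fd, &cmd, sizeof(cmd));
+
+      /* ...then register the port with the serio driver. */
+      cmd.type = USERIO_CMD_REGISTER;
+      cmd.data = 0;
+      write(fd, &cmd, sizeof(cmd));
+
+      /* Inject one byte of "device" data, e.g. a PS/2 ACK (0xfa). */
+      cmd.type = USERIO_CMD_SEND_INTERRUPT;
+      cmd.data = 0xfa;
+      write(fd, &cmd, sizeof(cmd));
+
+      /* Data written by the serio driver can now be read back from fd. */
+
+      close(fd); /* closing the fd tears down the virtual port */
+      return 0;
+   }
+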
+
+Command Structure
+=================
+
+The struct used for sending commands to /dev/userio is as follows::
+
+	struct userio_cmd {
+		__u8 type;
+		__u8 data;
+	};
+
+``type`` describes the type of command that is being sent. This can be any one
+of the USERIO_CMD macros defined in <linux/userio.h>. ``data`` is the argument
+that goes along with the command. In the event that the command doesn't have an
+argument, this field can be left untouched and will be ignored by the kernel.
+Each command should be sent by writing the struct directly to the character
+device. In the event that the command you send is invalid, an error will be
+returned by the character device and a more descriptive error will be printed
+to the kernel log. Only one command can be sent at a time; any additional data
+written to the character device after the initial command will be ignored.
+
+To close the virtual serio port, just close /dev/userio.
+
+Commands
+========
+
+USERIO_CMD_REGISTER
+~~~~~~~~~~~~~~~~~~~
+
+Registers the port with the serio driver and begins transmitting data back and
+forth. Registration can only be performed once a port type is set with
+USERIO_CMD_SET_PORT_TYPE. Has no argument.
+
+USERIO_CMD_SET_PORT_TYPE
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Sets the type of port we're emulating, where ``data`` is the port type being
+set. Can be any of the macros from <linux/serio.h>. For example: SERIO_8042
+would set the port type to be a normal PS/2 port.
+
+USERIO_CMD_SEND_INTERRUPT
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Sends an interrupt through the virtual serio port to the serio driver, where
+``data`` is the interrupt data being sent.
+
+Userspace tools
+===============
+
+The userio userspace tools are able to record PS/2 devices using some of the
+debugging information from i8042, and play back the devices on /dev/userio. The
+latest version of these tools can be found at:
+
+	https://github.com/Lyude/ps2emu
diff --git a/Documentation/input/userio.txt b/Documentation/input/userio.txt
deleted file mode 100644
index 0880c0f447a6cee8158432fff7bb6c30c6af8d8c..0000000000000000000000000000000000000000
--- a/Documentation/input/userio.txt
+++ /dev/null
@@ -1,70 +0,0 @@
-			     The userio Protocol
-	     (c) 2015 Stephen Chandler Paul
-			    Sponsored by Red Hat
--------------------------------------------------------------------------------
-
-1. Introduction
-~~~~~~~~~~~~~~~
- This module is intended to try to make the lives of input driver developers
-easier by allowing them to test various serio devices (mainly the various
-touchpads found on laptops) without having to have the physical device in front
-of them. userio accomplishes this by allowing any privileged userspace program
-to directly interact with the kernel's serio driver and control a virtual serio
-port from there.
-
-2. Usage overview
-~~~~~~~~~~~~~~~~~
- In order to interact with the userio kernel module, one simply opens the
-/dev/userio character device in their applications. Commands are sent to the
-kernel module by writing to the device, and any data received from the serio
-driver is read as-is from the /dev/userio device. All of the structures and
-macros you need to interact with the device are defined in <linux/userio.h> and
-<linux/serio.h>.
-
-3. Command Structure
-~~~~~~~~~~~~~~~~~~~~
- The struct used for sending commands to /dev/userio is as follows:
-
-	struct userio_cmd {
-		__u8 type;
-		__u8 data;
-	};
-
- "type" describes the type of command that is being sent. This can be any one
-of the USERIO_CMD macros defined in <linux/userio.h>. "data" is the argument
-that goes along with the command.
In the event that the command doesn't have an -argument, this field can be left untouched and will be ignored by the kernel. -Each command should be sent by writing the struct directly to the character -device. In the event that the command you send is invalid, an error will be -returned by the character device and a more descriptive error will be printed -to the kernel log. Only one command can be sent at a time, any additional data -written to the character device after the initial command will be ignored. - To close the virtual serio port, just close /dev/userio. - -4. Commands -~~~~~~~~~~~ - -4.1 USERIO_CMD_REGISTER -~~~~~~~~~~~~~~~~~~~~~~~ - Registers the port with the serio driver and begins transmitting data back and -forth. Registration can only be performed once a port type is set with -USERIO_CMD_SET_PORT_TYPE. Has no argument. - -4.2 USERIO_CMD_SET_PORT_TYPE -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Sets the type of port we're emulating, where "data" is the port type being -set. Can be any of the macros from . For example: SERIO_8042 -would set the port type to be a normal PS/2 port. - -4.3 USERIO_CMD_SEND_INTERRUPT -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Sends an interrupt through the virtual serio port to the serio driver, where -"data" is the interrupt data being sent. - -5. Userspace tools -~~~~~~~~~~~~~~~~~~ - The userio userspace tools are able to record PS/2 devices using some of the -debugging information from i8042, and play back the devices on /dev/userio. The -latest version of these tools can be found at: - - https://github.com/Lyude/ps2emu diff --git a/Documentation/input/walkera0701.txt b/Documentation/input/walkera0701.txt deleted file mode 100644 index 49e3ac60dcef57d045720f5d9f96fc2b522a0f62..0000000000000000000000000000000000000000 --- a/Documentation/input/walkera0701.txt +++ /dev/null @@ -1,109 +0,0 @@ - -Walkera WK-0701 transmitter is supplied with a ready to fly Walkera -helicopters such as HM36, HM37, HM60. The walkera0701 module enables to use -this transmitter as joystick - -Devel homepage and download: -http://zub.fei.tuke.sk/walkera-wk0701/ - -or use cogito: -cg-clone http://zub.fei.tuke.sk/GIT/walkera0701-joystick - - -Connecting to PC: - -At back side of transmitter S-video connector can be found. Modulation -pulses from processor to HF part can be found at pin 2 of this connector, -pin 3 is GND. Between pin 3 and CPU 5k6 resistor can be found. To get -modulation pulses to PC, signal pulses must be amplified. - -Cable: (walkera TX to parport) - -Walkera WK-0701 TX S-VIDEO connector: - (back side of TX) - __ __ S-video: canon25 - / |_| \ pin 2 (signal) NPN parport - / O 4 3 O \ pin 3 (GND) LED ________________ 10 ACK - ( O 2 1 O ) | C - \ ___ / 2 ________________________|\|_____|/ - | [___] | |/| B |\ - ------- 3 __________________________________|________________ 25 GND - E - - -I use green LED and BC109 NPN transistor. - -Software: - -Build kernel with walkera0701 module. Module walkera0701 need exclusive -access to parport, modules like lp must be unloaded before loading -walkera0701 module, check dmesg for error messages. Connect TX to PC by -cable and run jstest /dev/input/js0 to see values from TX. If no value can -be changed by TX "joystick", check output from /proc/interrupts. Value for -(usually irq7) parport must increase if TX is on. - - - -Technical details: - -Driver use interrupt from parport ACK input bit to measure pulse length -using hrtimers. - -Frame format: -Based on walkera WK-0701 PCM Format description by Shaul Eizikovich. 
-(downloaded from http://www.smartpropoplus.com/Docs/Walkera_Wk-0701_PCM.pdf) - -Signal pulses: - (ANALOG) - SYNC BIN OCT - +---------+ +------+ - | | | | ---+ +------+ +--- - -Frame: - SYNC , BIN1, OCT1, BIN2, OCT2 ... BIN24, OCT24, BIN25, next frame SYNC .. - -pulse length: - Binary values: Analog octal values: - - 288 uS Binary 0 318 uS 000 - 438 uS Binary 1 398 uS 001 - 478 uS 010 - 558 uS 011 - 638 uS 100 - 1306 uS SYNC 718 uS 101 - 798 uS 110 - 878 uS 111 - -24 bin+oct values + 1 bin value = 24*4+1 bits = 97 bits - -(Warning, pulses on ACK are inverted by transistor, irq is raised up on sync -to bin change or octal value to bin change). - -Binary data representations: - -One binary and octal value can be grouped to nibble. 24 nibbles + one binary -values can be sampled between sync pulses. - -Values for first four channels (analog joystick values) can be found in -first 10 nibbles. Analog value is represented by one sign bit and 9 bit -absolute binary value. (10 bits per channel). Next nibble is checksum for -first ten nibbles. - -Next nibbles 12 .. 21 represents four channels (not all channels can be -directly controlled from TX). Binary representations are the same as in first -four channels. In nibbles 22 and 23 is a special magic number. Nibble 24 is -checksum for nibbles 12..23. - -After last octal value for nibble 24 and next sync pulse one additional -binary value can be sampled. This bit and magic number is not used in -software driver. Some details about this magic numbers can be found in -Walkera_Wk-0701_PCM.pdf. - -Checksum calculation: - -Summary of octal values in nibbles must be same as octal value in checksum -nibble (only first 3 bits are used). Binary value for checksum nibble is -calculated by sum of binary values in checked nibbles + sum of octal values -in checked nibbles divided by 8. Only bit 0 of this sum is used. - diff --git a/Documentation/input/xpad.txt b/Documentation/input/xpad.txt deleted file mode 100644 index d1b23f295db4ef989008fa9ced872f33838f03ed..0000000000000000000000000000000000000000 --- a/Documentation/input/xpad.txt +++ /dev/null @@ -1,226 +0,0 @@ -xpad - Linux USB driver for Xbox compatible controllers - -This driver exposes all first-party and third-party Xbox compatible -controllers. It has a long history and has enjoyed considerable usage -as Window's xinput library caused most PC games to focus on Xbox -controller compatibility. - -Due to backwards compatibility all buttons are reported as digital. -This only effects Original Xbox controllers. All later controller models -have only digital face buttons. - -Rumble is supported on some models of Xbox 360 controllers but not of -Original Xbox controllers nor on Xbox One controllers. As of writing -the Xbox One's rumble protocol has not been reverse engineered but in -the future could be supported. - - -0. Notes --------- -The number of buttons/axes reported varies based on 3 things: -- if you are using a known controller -- if you are using a known dance pad -- if using an unknown device (one not listed below), what you set in the - module configuration for "Map D-PAD to buttons rather than axes for unknown - pads" (module option dpad_to_buttons) - -If you set dpad_to_buttons to N and you are using an unknown device -the driver will map the directional pad to axes (X/Y). -If you said Y it will map the d-pad to buttons, which is needed for dance -style games to function correctly. The default is Y. - -dpad_to_buttons has no effect for known pads. 
A erroneous commit message -claimed dpad_to_buttons could be used to force behavior on known devices. -This is not true. Both dpad_to_buttons and triggers_to_buttons only affect -unknown controllers. - - -0.1 Normal Controllers ----------------------- -With a normal controller, the directional pad is mapped to its own X/Y axes. -The jstest-program from joystick-1.2.15 (jstest-version 2.1.0) will report 8 -axes and 10 buttons. - -All 8 axes work, though they all have the same range (-32768..32767) -and the zero-setting is not correct for the triggers (I don't know if that -is some limitation of jstest, since the input device setup should be fine. I -didn't have a look at jstest itself yet). - -All of the 10 buttons work (in digital mode). The six buttons on the -right side (A, B, X, Y, black, white) are said to be "analog" and -report their values as 8 bit unsigned, not sure what this is good for. - -I tested the controller with quake3, and configuration and -in game functionality were OK. However, I find it rather difficult to -play first person shooters with a pad. Your mileage may vary. - - -0.2 Xbox Dance Pads -------------------- -When using a known dance pad, jstest will report 6 axes and 14 buttons. - -For dance style pads (like the redoctane pad) several changes -have been made. The old driver would map the d-pad to axes, resulting -in the driver being unable to report when the user was pressing both -left+right or up+down, making DDR style games unplayable. - -Known dance pads automatically map the d-pad to buttons and will work -correctly out of the box. - -If your dance pad is recognized by the driver but is using axes instead -of buttons, see section 0.3 - Unknown Controllers - -I've tested this with Stepmania, and it works quite well. - - -0.3 Unknown Controllers ----------------------- -If you have an unknown xbox controller, it should work just fine with -the default settings. - -HOWEVER if you have an unknown dance pad not listed below, it will not -work UNLESS you set "dpad_to_buttons" to 1 in the module configuration. - -PLEASE, if you have an unknown controller, email Dom with -a dump from /proc/bus/usb and a description of the pad (manufacturer, country, -whether it is a dance pad or normal controller) so that we can add your pad -to the list of supported devices, ensuring that it will work out of the -box in the future. - - -1. USB adapters --------------- -All generations of Xbox controllers speak USB over the wire. -- Original Xbox controllers use a proprietary connector and require adapters. -- Wireless Xbox 360 controllers require a 'Xbox 360 Wireless Gaming Receiver - for Windows' -- Wired Xbox 360 controllers use standard USB connectors. -- Xbox One controllers can be wireless but speak Wi-Fi Direct and are not - yet supported. -- Xbox One controllers can be wired and use standard Micro-USB connectors. - - - -1.1 Original Xbox USB adapters --------------- -Using this driver with an Original Xbox controller requires an -adapter cable to break out the proprietary connector's pins to USB. -You can buy these online fairly cheap, or build your own. - -Such a cable is pretty easy to build. The Controller itself is a USB -compound device (a hub with three ports for two expansion slots and -the controller device) with the only difference in a nonstandard connector -(5 pins vs. 4 on standard USB 1.0 connectors). - -You just need to solder a USB connector onto the cable and keep the -yellow wire unconnected. 
The other pins have the same order on both -connectors so there is no magic to it. Detailed info on these matters -can be found on the net ([1], [2], [3]). - -Thanks to the trip splitter found on the cable you don't even need to cut the -original one. You can buy an extension cable and cut that instead. That way, -you can still use the controller with your X-Box, if you have one ;) - - - -2. Driver Installation ----------------------- - -Once you have the adapter cable, if needed, and the controller connected -the xpad module should be auto loaded. To confirm you can cat -/proc/bus/usb/devices. There should be an entry like the one at the end [4]. - - - -3. Supported Controllers ------------------------- -For a full list of supported controllers and associated vendor and product -IDs see the xpad_device[] array[6]. - -As of the historic version 0.0.6 (2006-10-10) the following devices -were supported: - original Microsoft XBOX controller (US), vendor=0x045e, product=0x0202 - smaller Microsoft XBOX controller (US), vendor=0x045e, product=0x0289 - original Microsoft XBOX controller (Japan), vendor=0x045e, product=0x0285 - InterAct PowerPad Pro (Germany), vendor=0x05fd, product=0x107a - RedOctane Xbox Dance Pad (US), vendor=0x0c12, product=0x8809 - -Unrecognized models of Xbox controllers should function as Generic -Xbox controllers. Unrecognized Dance Pad controllers require setting -the module option 'dpad_to_buttons'. - -If you have an unrecognized controller please see 0.3 - Unknown Controllers - - -4. Manual Testing ------------------ -To test this driver's functionality you may use 'jstest'. - -For example: -> modprobe xpad -> modprobe joydev -> jstest /dev/js0 - -If you're using a normal controller, there should be a single line showing -18 inputs (8 axes, 10 buttons), and its values should change if you move -the sticks and push the buttons. If you're using a dance pad, it should -show 20 inputs (6 axes, 14 buttons). - -It works? Voila, you're done ;) - - - -5. Thanks ---------- - -I have to thank ITO Takayuki for the detailed info on his site - http://euc.jp/periphs/xbox-controller.ja.html. - -His useful info and both the usb-skeleton as well as the iforce input driver -(Greg Kroah-Hartmann; Vojtech Pavlik) helped a lot in rapid prototyping -the basic functionality. - - - -6. References -------------- - -[1]: http://euc.jp/periphs/xbox-controller.ja.html (ITO Takayuki) -[2]: http://xpad.xbox-scene.com/ -[3]: http://www.markosweb.com/www/xboxhackz.com/ -[4]: /proc/bus/usb/devices - dump from InterAct PowerPad Pro (Germany): - -T: Bus=01 Lev=03 Prnt=04 Port=00 Cnt=01 Dev#= 5 Spd=12 MxCh= 0 -D: Ver= 1.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=32 #Cfgs= 1 -P: Vendor=05fd ProdID=107a Rev= 1.00 -C:* #Ifs= 1 Cfg#= 1 Atr=80 MxPwr=100mA -I: If#= 0 Alt= 0 #EPs= 2 Cls=58(unk. ) Sub=42 Prot=00 Driver=(none) -E: Ad=81(I) Atr=03(Int.) MxPS= 32 Ivl= 10ms -E: Ad=02(O) Atr=03(Int.) MxPS= 32 Ivl= 10ms - -[5]: /proc/bus/usb/devices - dump from Redoctane Xbox Dance Pad (US): - -T: Bus=01 Lev=02 Prnt=09 Port=00 Cnt=01 Dev#= 10 Spd=12 MxCh= 0 -D: Ver= 1.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS= 8 #Cfgs= 1 -P: Vendor=0c12 ProdID=8809 Rev= 0.01 -S: Product=XBOX DDR -C:* #Ifs= 1 Cfg#= 1 Atr=80 MxPwr=100mA -I: If#= 0 Alt= 0 #EPs= 2 Cls=58(unk. ) Sub=42 Prot=00 Driver=xpad -E: Ad=82(I) Atr=03(Int.) MxPS= 32 Ivl=4ms -E: Ad=02(O) Atr=03(Int.) MxPS= 32 Ivl=4ms - -[6]: http://lxr.free-electrons.com/ident?i=xpad_device - - - -7. 
Historic Edits ------------------ -Marko Friedemann -2002-07-16 - - original doc - -Dominic Cerquetti -2005-03-19 - - added stuff for dance pads, new d-pad->axes mappings - -Later changes may be viewed with 'git log Documentation/input/xpad.txt' diff --git a/Documentation/input/yealink.txt b/Documentation/input/yealink.txt deleted file mode 100644 index 8277b76ec50649f3a35f5f5407d135fc37312a0e..0000000000000000000000000000000000000000 --- a/Documentation/input/yealink.txt +++ /dev/null @@ -1,216 +0,0 @@ -Driver documentation for yealink usb-p1k phones - -0. Status -~~~~~~~~~ -The p1k is a relatively cheap usb 1.1 phone with: - - keyboard full support, yealink.ko / input event API - - LCD full support, yealink.ko / sysfs API - - LED full support, yealink.ko / sysfs API - - dialtone full support, yealink.ko / sysfs API - - ringtone full support, yealink.ko / sysfs API - - audio playback full support, snd_usb_audio.ko / alsa API - - audio record full support, snd_usb_audio.ko / alsa API - -For vendor documentation see http://www.yealink.com - - -1. Compilation (stand alone version) -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Currently only kernel 2.6.x.y versions are supported. -In order to build the yealink.ko module do - - make - -If you encounter problems please check if in the MAKE_OPTS variable in -the Makefile is pointing to the location where your kernel sources -are located, default /usr/src/linux. - - -1.1 Troubleshooting -~~~~~~~~~~~~~~~~~~~ -Q: Module yealink compiled and installed without any problem but phone - is not initialized and does not react to any actions. -A: If you see something like: - hiddev0: USB HID v1.00 Device [Yealink Network Technology Ltd. VOIP USB Phone - in dmesg, it means that the hid driver has grabbed the device first. Try to - load module yealink before any other usb hid driver. Please see the - instructions provided by your distribution on module configuration. - -Q: Phone is working now (displays version and accepts keypad input) but I can't - find the sysfs files. -A: The sysfs files are located on the particular usb endpoint. On most - distributions you can do: "find /sys/ -name get_icons" for a hint. - - -2. keyboard features -~~~~~~~~~~~~~~~~~~~~ -The current mapping in the kernel is provided by the map_p1k_to_key -function: - - Physical USB-P1K button layout input events - - - up up - IN OUT left, right - down down - - pickup C hangup enter, backspace, escape - 1 2 3 1, 2, 3 - 4 5 6 4, 5, 6, - 7 8 9 7, 8, 9, - * 0 # *, 0, #, - - The "up" and "down" keys, are symbolised by arrows on the button. - The "pickup" and "hangup" keys are symbolised by a green and red phone - on the button. - - -3. LCD features -~~~~~~~~~~~~~~~ -The LCD is divided and organised as a 3 line display: - - |[] [][] [][] [][] in |[][] - |[] M [][] D [][] : [][] out |[][] - store - - NEW REP SU MO TU WE TH FR SA - - [] [] [] [] [] [] [] [] [] [] [] [] - [] [] [] [] [] [] [] [] [] [] [] [] - - -Line 1 Format (see below) : 18.e8.M8.88...188 - Icon names : M D : IN OUT STORE -Line 2 Format : ......... - Icon name : NEW REP SU MO TU WE TH FR SA -Line 3 Format : 888888888888 - - -Format description: - From a userspace perspective the world is separated into "digits" and "icons". - A digit can have a character set, an icon can only be ON or OFF. - - Format specifier - '8' : Generic 7 segment digit with individual addressable segments - - Reduced capability 7 segment digit, when segments are hard wired together. - '1' : 2 segments digit only able to produce a 1. 
- 'e' : Most significant day of the month digit, - able to produce at least 1 2 3. - 'M' : Most significant minute digit, - able to produce at least 0 1 2 3 4 5. - - Icons or pictograms: - '.' : For example like AM, PM, SU, a 'dot' .. or other single segment - elements. - - -4. Driver usage -~~~~~~~~~~~~~~~ -For userland the following interfaces are available using the sysfs interface: - /sys/.../ - line1 Read/Write, lcd line1 - line2 Read/Write, lcd line2 - line3 Read/Write, lcd line3 - - get_icons Read, returns a set of available icons. - hide_icon Write, hide the element by writing the icon name. - show_icon Write, display the element by writing the icon name. - - map_seg7 Read/Write, the 7 segments char set, common for all - yealink phones. (see map_to_7segment.h) - - ringtone Write, upload binary representation of a ringtone, - see yealink.c. status EXPERIMENTAL due to potential - races between async. and sync usb calls. - - -4.1 lineX -~~~~~~~~~ -Reading /sys/../lineX will return the format string with its current value: - - Example: - cat ./line3 - 888888888888 - Linux Rocks! - -Writing to /sys/../lineX will set the corresponding LCD line. - - Excess characters are ignored. - - If less characters are written than allowed, the remaining digits are - unchanged. - - The tab '\t'and '\n' char does not overwrite the original content. - - Writing a space to an icon will always hide its content. - - Example: - date +"%m.%e.%k:%M" | sed 's/^0/ /' > ./line1 - - Will update the LCD with the current date & time. - - -4.2 get_icons -~~~~~~~~~~~~~ -Reading will return all available icon names and its current settings: - - cat ./get_icons - on M - on D - on : - IN - OUT - STORE - NEW - REP - SU - MO - TU - WE - TH - FR - SA - LED - DIALTONE - RINGTONE - - -4.3 show/hide icons -~~~~~~~~~~~~~~~~~~~ -Writing to these files will update the state of the icon. -Only one icon at a time can be updated. - -If an icon is also on a ./lineX the corresponding value is -updated with the first letter of the icon. - - Example - light up the store icon: - echo -n "STORE" > ./show_icon - - cat ./line1 - 18.e8.M8.88...188 - S - - Example - sound the ringtone for 10 seconds: - echo -n RINGTONE > /sys/..../show_icon - sleep 10 - echo -n RINGTONE > /sys/..../hide_icon - - -5. Sound features -~~~~~~~~~~~~~~~~~ -Sound is supported by the ALSA driver: snd_usb_audio - -One 16-bit channel with sample and playback rates of 8000 Hz is the practical -limit of the device. - - Example - recording test: - arecord -v -d 10 -r 8000 -f S16_LE -t wav foobar.wav - - Example - playback test: - aplay foobar.wav - - -6. Credits & Acknowledgments -~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - Olivier Vandorpe, for starting the usbb2k-api project doing much of - the reverse engineering. - - Martin Diehl, for pointing out how to handle USB memory allocation. - - Dmitry Torokhov, for the numerous code reviews and suggestions. - diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt index 08244bea504877397ecabd2732e8e0cf00e6da09..1e9fcb4d0ec83ed4eaf2330f16775769ac366aa3 100644 --- a/Documentation/ioctl/ioctl-number.txt +++ b/Documentation/ioctl/ioctl-number.txt @@ -191,6 +191,7 @@ Code Seq#(hex) Include File Comments 'W' 00-1F linux/watchdog.h conflict! 'W' 00-1F linux/wanrouter.h conflict! (pre 3.9) 'W' 00-3F sound/asound.h conflict! +'W' 40-5F drivers/pci/switch/switchtec.c 'X' all fs/xfs/xfs_fs.h conflict! 
					and fs/xfs/linux-2.6/xfs_ioctl32.h
					and include/linux/falloc.h
@@ -212,7 +213,7 @@ Code  Seq#(hex)	Include File		Comments
 'c'	00-1F	linux/chio.h		conflict!
 'c'	80-9F	arch/s390/include/asm/chsc.h	conflict!
 'c'	A0-AF	arch/x86/include/asm/msr.h	conflict!
-'d'	00-FF	linux/char/drm/drm/h		conflict!
+'d'	00-FF	linux/char/drm/drm.h		conflict!
 'd'	02-40	pcmcia/ds.h			conflict!
 'd'	F0-FF	linux/digi1.h
 'e'	all	linux/digi1.h		conflict!
@@ -308,6 +309,7 @@ Code  Seq#(hex)	Include File		Comments
 0xA3	80-8F	Port ACL		in development:
 0xA3	90-9F	linux/dtlk.h
+0xA4	00-1F	uapi/linux/tee.h	Generic TEE subsystem
 0xAA	00-3F	linux/uapi/linux/userfaultfd.h
 0xAB	00-1F	linux/nbd.h
 0xAC	00-1F	linux/raw.h
diff --git a/Documentation/kbuild/makefiles.txt b/Documentation/kbuild/makefiles.txt
index 9b9c4797fc55653dec668d82ae8c43ccdab58e4b..e18daca65ccd2add180e5e74f97d0cc4743677cb 100644
--- a/Documentation/kbuild/makefiles.txt
+++ b/Documentation/kbuild/makefiles.txt
@@ -44,11 +44,11 @@ This document describes the Linux kernel Makefiles.
 	--- 6.11 Post-link pass
 
 	=== 7 Kbuild syntax for exported headers
-	--- 7.1 header-y
+	--- 7.1 no-export-headers
 	--- 7.2 genhdr-y
-	--- 7.3 destination-y
-	--- 7.4 generic-y
-	--- 7.5 generated-y
+	--- 7.3 generic-y
+	--- 7.4 generated-y
+	--- 7.5 mandatory-y
 
 	=== 8 Kbuild Variables
 	=== 9 Makefile language
@@ -1236,7 +1236,7 @@ When kbuild executes, the following steps are followed (roughly):
 	that may be shared between individual architectures.
 	The recommended approach how to use a generic header file is
 	to list the file in the Kbuild file.
-	See "7.4 generic-y" for further info on syntax etc.
+	See "7.3 generic-y" for further info on syntax etc.
 
 --- 6.11 Post-link pass
 
@@ -1263,53 +1263,32 @@ The pre-processing does:
 - drop include of compiler.h
 - drop all sections that are kernel internal (guarded by ifdef __KERNEL__)
 
-Each relevant directory contains a file name "Kbuild" which specifies the
-headers to be exported.
-See subsequent chapter for the syntax of the Kbuild file.
-
-	--- 7.1 header-y
-
-	header-y specifies header files to be exported.
-
-	Example:
-		#include/linux/Kbuild
-		header-y += usb/
-		header-y += aio_abi.h
+All headers under include/uapi/, include/generated/uapi/,
+arch/<arch>/include/uapi/ and arch/<arch>/include/generated/uapi/
+are exported.
 
-	The convention is to list one file per line and
-	preferably in alphabetic order.
+A Kbuild file may be defined under arch/<arch>/include/uapi/asm/ and
+arch/<arch>/include/asm/ to list asm files coming from asm-generic.
+See subsequent chapter for the syntax of the Kbuild file.
 
-	header-y also specifies which subdirectories to visit.
-	A subdirectory is identified by a trailing '/' which
-	can be seen in the example above for the usb subdirectory.
+	--- 7.1 no-export-headers
 
-	Subdirectories are visited before their parent directories.
+	no-export-headers is essentially used by include/uapi/linux/Kbuild to
+	avoid exporting specific headers (e.g. kvm.h) on architectures that do
+	not support it. It should be avoided as much as possible.
 
 	--- 7.2 genhdr-y
 
-	genhdr-y specifies generated files to be exported.
-	Generated files are special as they need to be looked
-	up in another directory when doing 'make O=...' builds.
+	genhdr-y specifies asm files to be generated.
	Example:
-		#include/linux/Kbuild
-		genhdr-y += version.h
+		#arch/x86/include/uapi/asm/Kbuild
+		genhdr-y += unistd_32.h
+		genhdr-y += unistd_64.h
+		genhdr-y += unistd_x32.h
 
-	--- 7.3 destination-y
-	When an architecture has a set of exported headers that needs to be
-	exported to a different directory destination-y is used.
-	destination-y specifies the destination directory for all exported
-	headers in the file where it is present.
-
-	Example:
-		#arch/xtensa/platforms/s6105/include/platform/Kbuild
-		destination-y := include/linux
-
-	In the example above all exported headers in the Kbuild file
-	will be located in the directory "include/linux" when exported.
-
-	--- 7.4 generic-y
+	--- 7.3 generic-y
 
 	If an architecture uses a verbatim copy of a header from
 	include/asm-generic then this is listed in the file
@@ -1336,7 +1315,7 @@ See subsequent chapter for the syntax of the Kbuild file.
 		Example: termios.h
 			#include <asm-generic/termios.h>
 
-	--- 7.5 generated-y
+	--- 7.4 generated-y
 
 	If an architecture generates other header files alongside generic-y
 	wrappers, and not included in genhdr-y, then generated-y specifies
@@ -1349,6 +1328,15 @@ See subsequent chapter for the syntax of the Kbuild file.
 		#arch/x86/include/asm/Kbuild
 		generated-y += syscalls_32.h
 
+	--- 7.5 mandatory-y
+
+	mandatory-y is essentially used by include/uapi/asm-generic/Kbuild.asm
+	to define the minimum set of headers that must be exported in
+	include/asm.
+
+	The convention is to list one subdir per line and
+	preferably in alphabetic order.
+
 === 8 Kbuild Variables
 
 The top Makefile exports the following variables:
diff --git a/Documentation/kdump/kdump.txt b/Documentation/kdump/kdump.txt
index b0eb27b956d95a2fc744d46b53647e05c7186e50..615434d81108ed28fc4f077a0e1b35cd609b2610 100644
--- a/Documentation/kdump/kdump.txt
+++ b/Documentation/kdump/kdump.txt
@@ -18,7 +18,7 @@ memory image to a dump file on the local disk, or across the network to
 a remote system.
 
 Kdump and kexec are currently supported on the x86, x86_64, ppc64, ia64,
-s390x and arm architectures.
+s390x, arm and arm64 architectures.
 
 When the system kernel boots, it reserves a small section of memory for
 the dump-capture kernel. This ensures that ongoing Direct Memory Access
@@ -249,6 +249,13 @@ Dump-capture kernel config options (Arch Dependent, arm)
 
     AUTO_ZRELADDR=y
 
+Dump-capture kernel config options (Arch Dependent, arm64)
+----------------------------------------------------------
+
+- Please note that kvm of the dump-capture kernel will not be enabled
+  on non-VHE systems even if it is configured. This is because the CPU
+  will not be reset to EL2 on panic.
+
 Extended crashkernel syntax
 ===========================
 
@@ -305,6 +312,8 @@ Boot into System Kernel
    kernel will automatically locate the crash kernel image within the
    first 512MB of RAM if X is not given.
 
+   On arm64, use "crashkernel=Y[@X]".  Note that the start address of
+   the kernel, X if explicitly specified, must be aligned to 2MiB (0x200000).
 
 Load the Dump-capture Kernel
 ============================
 
@@ -327,6 +336,8 @@ For s390x:
 	- Use image or bzImage
 For arm:
 	- Use zImage
+For arm64:
+	- Use vmlinux or Image
 
 If you are using an uncompressed vmlinux image then use the following command
 to load dump-capture kernel.
@@ -370,6 +381,9 @@ For s390x:
 For arm:
 	"1 maxcpus=1 reset_devices"
 
+For arm64:
+	"1 maxcpus=1 reset_devices"
+
 Notes on loading the dump-capture kernel:
 
 * By default, the ELF headers are stored in ELF64 format to support
diff --git a/Documentation/kref.txt b/Documentation/kref.txt
index ddf85a5dde0c12a435b9cbcc30f44159de5acc0b..d26a27ca964d4906e2955778393f99fa25d52153 100644
--- a/Documentation/kref.txt
+++ b/Documentation/kref.txt
@@ -84,6 +84,7 @@ int my_data_handler(void)
 	task = kthread_run(more_data_handling, data, "more_data_handling");
 	if (task == ERR_PTR(-ENOMEM)) {
 		rv = -ENOMEM;
+		kref_put(&data->refcount, data_release);
 		goto out;
 	}
 
diff --git a/Documentation/leds/leds-lp55xx.txt b/Documentation/leds/leds-lp55xx.txt
index bcea12a0c5847e12211a14d43f356d0f1a1e77f0..e23fa91ea722fd82882eb80937ff8f4e87e65140 100644
--- a/Documentation/leds/leds-lp55xx.txt
+++ b/Documentation/leds/leds-lp55xx.txt
@@ -117,7 +117,7 @@ As soon as 'loading' is set to 0, registered callback is called.
 Inside the callback, the selected engine is loaded and memory is updated.
 To run programmed pattern, 'run_engine' attribute should be enabled.
 
-The pattern sqeuence of LP8501 is similar to LP5523.
+The pattern sequence of LP8501 is similar to LP5523.
 However pattern data is specific.
 Ex 1) Engine 1 is used
 echo 1 > /sys/bus/i2c/devices/xxxx/select_engine
diff --git a/Documentation/lightnvm/pblk.txt b/Documentation/lightnvm/pblk.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1040ed1cec812dce0c17cadbb7c73a6febd28c61
--- /dev/null
+++ b/Documentation/lightnvm/pblk.txt
@@ -0,0 +1,21 @@
+pblk: Physical Block Device Target
+==================================
+
+pblk implements a fully associative, host-based FTL that exposes a traditional
+block I/O interface. Its primary responsibilities are:
+
+  - Map logical addresses onto physical addresses (4KB granularity) in a
+    logical-to-physical (L2P) table.
+  - Maintain the integrity and consistency of the L2P table as well as its
+    recovery from normal tear down and power outage.
+  - Deal with controller- and media-specific constraints.
+  - Handle I/O errors.
+  - Implement garbage collection.
+  - Maintain consistency across the I/O stack during synchronization points.
+
+For more information please refer to:
+
+  http://lightnvm.io
+
+which maintains updated FAQs, manual pages, technical documentation, tools,
+contacts, etc.
diff --git a/Documentation/livepatch/livepatch.txt b/Documentation/livepatch/livepatch.txt
index 9d2096c7160de53121ca723d2231db3a46a15e81..ecdb18104ab0e83f7d7639c590deba69f71213b6 100644
--- a/Documentation/livepatch/livepatch.txt
+++ b/Documentation/livepatch/livepatch.txt
@@ -72,7 +72,8 @@ example, they add a NULL pointer or a boundary check, fix a race by adding
 a missing memory barrier, or add some locking around a critical section.
 Most of these changes are self contained and the function presents itself
 the same way to the rest of the system. In this case, the functions might
-be updated independently one by one.
+be updated independently one by one. (This can be done by setting the
+'immediate' flag in the klp_patch struct.)
 
 But there are more complex fixes. For example, a patch might change
 ordering of locking in multiple functions at the same time. Or a patch
@@ -86,20 +87,141 @@ or no data are stored in the modified structures at the moment.
 
 The theory about how to apply functions a safe way is rather complex.
 The aim is to define a so-called consistency model.
It attempts to define conditions when the new implementation could be used so that the system -stays consistent. The theory is not yet finished. See the discussion at -https://lkml.kernel.org/r/20141107140458.GA21774@suse.cz - -The current consistency model is very simple. It guarantees that either -the old or the new function is called. But various functions get redirected -one by one without any synchronization. - -In other words, the current implementation _never_ modifies the behavior -in the middle of the call. It is because it does _not_ rewrite the entire -function in the memory. Instead, the function gets redirected at the -very beginning. But this redirection is used immediately even when -some other functions from the same patch have not been redirected yet. - -See also the section "Limitations" below. +stays consistent. + +Livepatch has a consistency model which is a hybrid of kGraft and +kpatch: it uses kGraft's per-task consistency and syscall barrier +switching combined with kpatch's stack trace switching. There are also +a number of fallback options which make it quite flexible. + +Patches are applied on a per-task basis, when the task is deemed safe to +switch over. When a patch is enabled, livepatch enters into a +transition state where tasks are converging to the patched state. +Usually this transition state can complete in a few seconds. The same +sequence occurs when a patch is disabled, except the tasks converge from +the patched state to the unpatched state. + +An interrupt handler inherits the patched state of the task it +interrupts. The same is true for forked tasks: the child inherits the +patched state of the parent. + +Livepatch uses several complementary approaches to determine when it's +safe to patch tasks: + +1. The first and most effective approach is stack checking of sleeping + tasks. If no affected functions are on the stack of a given task, + the task is patched. In most cases this will patch most or all of + the tasks on the first try. Otherwise it'll keep trying + periodically. This option is only available if the architecture has + reliable stacks (HAVE_RELIABLE_STACKTRACE). + +2. The second approach, if needed, is kernel exit switching. A + task is switched when it returns to user space from a system call, a + user space IRQ, or a signal. It's useful in the following cases: + + a) Patching I/O-bound user tasks which are sleeping on an affected + function. In this case you have to send SIGSTOP and SIGCONT to + force it to exit the kernel and be patched. + b) Patching CPU-bound user tasks. If the task is highly CPU-bound + then it will get patched the next time it gets interrupted by an + IRQ. + c) In the future it could be useful for applying patches for + architectures which don't yet have HAVE_RELIABLE_STACKTRACE. In + this case you would have to signal most of the tasks on the + system. However this isn't supported yet because there's + currently no way to patch kthreads without + HAVE_RELIABLE_STACKTRACE. + +3. For idle "swapper" tasks, since they don't ever exit the kernel, they + instead have a klp_update_patch_state() call in the idle loop which + allows them to be patched before the CPU enters the idle state. + + (Note there's not yet such an approach for kthreads.) + +All the above approaches may be skipped by setting the 'immediate' flag +in the 'klp_patch' struct, which will disable per-task consistency and +patch all tasks immediately. This can be useful if the patch doesn't +change any function or data semantics. 
Note that, even with this flag +set, it's possible that some tasks may still be running with an old +version of the function, until that function returns. + +There's also an 'immediate' flag in the 'klp_func' struct which allows +you to specify that certain functions in the patch can be applied +without per-task consistency. This might be useful if you want to patch +a common function like schedule(), and the function change doesn't need +consistency but the rest of the patch does. + +For architectures which don't have HAVE_RELIABLE_STACKTRACE, the user +must set patch->immediate which causes all tasks to be patched +immediately. This option should be used with care, only when the patch +doesn't change any function or data semantics. + +In the future, architectures which don't have HAVE_RELIABLE_STACKTRACE +may be allowed to use per-task consistency if we can come up with +another way to patch kthreads. + +The /sys/kernel/livepatch/<patch>/transition file shows whether a patch +is in transition. Only a single patch (the topmost patch on the stack) +can be in transition at a given time. A patch can remain in transition +indefinitely, if any of the tasks are stuck in the initial patch state. + +A transition can be reversed and effectively canceled by writing the +opposite value to the /sys/kernel/livepatch/<patch>/enabled file while +the transition is in progress. Then all the tasks will attempt to +converge back to the original patch state. + +There's also a /proc/<pid>/patch_state file which can be used to +determine which tasks are blocking completion of a patching operation. +If a patch is in transition, this file shows 0 to indicate the task is +unpatched and 1 to indicate it's patched. Otherwise, if no patch is in +transition, it shows -1. Any tasks which are blocking the transition +can be signaled with SIGSTOP and SIGCONT to force them to change their +patched state. + + +3.1 Adding consistency model support to new architectures +--------------------------------------------------------- + +For adding consistency model support to new architectures, there are a +few options: + +1) Add CONFIG_HAVE_RELIABLE_STACKTRACE. This means porting objtool, and + for non-DWARF unwinders, also making sure there's a way for the stack + tracing code to detect interrupts on the stack. + +2) Alternatively, ensure that every kthread has a call to + klp_update_patch_state() in a safe location. Kthreads are typically + in an infinite loop which does some action repeatedly. The safe + location to switch the kthread's patch state would be at a designated + point in the loop where there are no locks taken and all data + structures are in a well-defined state. + + The location is clear when using workqueues or the kthread worker + API. These kthreads process independent actions in a generic loop. + + It's much more complicated with kthreads which have a custom loop. + There the safe location must be carefully selected on a case-by-case + basis (see the sketch after this list). + + In that case, arches without HAVE_RELIABLE_STACKTRACE would still be + able to use the non-stack-checking parts of the consistency model: + + a) patching user tasks when they cross the kernel/user space + boundary; and + + b) patching kthreads and idle tasks at their designated patch points. + + This option isn't as good as option 1 because it requires signaling + user tasks and waking kthreads to patch them. But it could still be + a good backup option for those architectures which don't have + reliable stack traces yet.
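As a hedged illustration of option 2) above, a kthread with a custom work loop could place its designated patch point like this; my_do_work() and the sleep interval are hypothetical, while klp_update_patch_state() is the interface this patch introduces:

	#include <linux/kthread.h>
	#include <linux/livepatch.h>
	#include <linux/delay.h>

	void my_do_work(void);	/* hypothetical unit of work */

	static int my_kthread_fn(void *data)
	{
		while (!kthread_should_stop()) {
			my_do_work();

			/*
			 * Designated patch point: no locks held and all data
			 * structures in a well-defined state.
			 */
			klp_update_patch_state(current);

			msleep_interruptible(1000);
		}
		return 0;
	}
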
+ +In the meantime, patches for such architectures can bypass the +consistency model by setting klp_patch.immediate to true. This option +is perfectly fine for patches which don't change the semantics of the +patched functions. In practice, this is usable for ~90% of security +fixes. Use of this option also means the patch can't be unloaded after +it has been disabled. 4. Livepatch module @@ -134,7 +256,7 @@ Documentation/livepatch/module-elf-format.txt for more details. 4.2. Metadata ------------- +------------- The patch is described by several structures that split the information into three levels: @@ -156,6 +278,9 @@ into three levels: only for a particular object ( vmlinux or a kernel module ). Note that kallsyms allows for searching symbols according to the object name. + There's also an 'immediate' flag which, when set, patches the + function immediately, bypassing the consistency model safety checks. + + struct klp_object defines an array of patched functions (struct klp_func) in the same object. Where the object is either vmlinux (NULL) or a module name. @@ -172,10 +297,13 @@ into three levels: This structure handles all patched functions consistently and eventually, synchronously. The whole patch is applied only when all patched symbols are found. The only exception are symbols from objects - (kernel modules) that have not been loaded yet. Also if a more complex - consistency model is supported then a selected unit (thread, - kernel as a whole) will see the new code from the entire patch - only when it is in a safe state. + (kernel modules) that have not been loaded yet. + + Setting the 'immediate' flag applies the patch to all tasks + immediately, bypassing the consistency model safety checks. + + For more details on how the patch is applied on a per-task basis, + see the "Consistency model" section. 4.3. Livepatch module handling @@ -188,8 +316,15 @@ section "Livepatch life-cycle" below for more details about these two operations. Module removal is only safe when there are no users of the underlying -functions. The immediate consistency model is not able to detect this; -therefore livepatch modules cannot be removed. See "Limitations" below. +functions. The immediate consistency model is not able to detect this. The +code just redirects the functions at the very beginning and it does not +check if the functions are in use. In other words, it knows when the +functions get called but it does not know when the functions return. +Therefore it cannot be decided when the livepatch module can be safely +removed. This is solved by a hybrid consistency model. When the system is +transitioned to a new patch state (patched/unpatched) it is guaranteed that +no task sleeps or runs in the old code. + 5. Livepatch life-cycle ======================= @@ -239,9 +374,15 @@ Registered patches might be enabled either by calling klp_enable_patch() or by writing '1' to /sys/kernel/livepatch/<patch>/enabled. The system will start using the new implementation of the patched functions at this stage. -In particular, if an original function is patched for the first time, a -function specific struct klp_ops is created and an universal ftrace handler -is registered. +When a patch is enabled, livepatch enters into a transition state where +tasks are converging to the patched state. This is indicated by a value +of '1' in /sys/kernel/livepatch/<patch>/transition. Once all tasks have +been patched, the 'transition' value changes to '0'. For more +information about this process, see the "Consistency model" section.
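For reference, a minimal sketch of the three-level metadata described above, loosely following the in-tree livepatch sample of this era; the patched symbol orig_func and its replacement are hypothetical:

	#include <linux/module.h>
	#include <linux/livepatch.h>

	static int livepatch_fixed_func(void)	/* hypothetical new implementation */
	{
		return 0;
	}

	static struct klp_func funcs[] = {
		{
			.old_name = "orig_func",	/* hypothetical vmlinux symbol */
			.new_func = livepatch_fixed_func,
		}, { }
	};

	static struct klp_object objs[] = {
		{
			/* name being NULL means vmlinux */
			.funcs = funcs,
		}, { }
	};

	static struct klp_patch patch = {
		.mod = THIS_MODULE,
		.objs = objs,
		/* .immediate = true would bypass the consistency model */
	};

	static int livepatch_init(void)
	{
		int ret;

		ret = klp_register_patch(&patch);
		if (ret)
			return ret;
		ret = klp_enable_patch(&patch);
		if (ret) {
			WARN_ON(klp_unregister_patch(&patch));
			return ret;
		}
		return 0;
	}

	static void livepatch_exit(void)
	{
		WARN_ON(klp_unregister_patch(&patch));
	}

	module_init(livepatch_init);
	module_exit(livepatch_exit);
	MODULE_LICENSE("GPL");
	MODULE_INFO(livepatch, "Y");
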
+ +If an original function is patched for the first time, a function +specific struct klp_ops is created and a universal ftrace handler is +registered. Functions might be patched multiple times. The ftrace handler is registered only once for the given function. Further patches just add an entry to the @@ -261,6 +402,12 @@ by writing '0' to /sys/kernel/livepatch/<patch>/enabled. At this stage either the code from the previously enabled patch or even the original code gets used. +When a patch is disabled, livepatch enters into a transition state where +tasks are converging to the unpatched state. This is indicated by a +value of '1' in /sys/kernel/livepatch/<patch>/transition. Once all tasks +have been unpatched, the 'transition' value changes to '0'. For more +information about this process, see the "Consistency model" section. + Here all the functions (struct klp_func) associated with the to-be-disabled patch are removed from the corresponding struct klp_ops. The ftrace handler is unregistered and the struct klp_ops is freed when the func_stack list @@ -329,23 +476,6 @@ The current Livepatch implementation has several limitations: by "notrace". - + Livepatch modules can not be removed. - - The current implementation just redirects the functions at the very - beginning. It does not check if the functions are in use. In other - words, it knows when the functions get called but it does not - know when the functions return. Therefore it can not decide when - the livepatch module can be safely removed. - - This will get most likely solved once a more complex consistency model - is supported. The idea is that a safe state for patching should also - mean a safe state for removing the patch. - - Note that the patch itself might get disabled by writing zero - to /sys/kernel/livepatch/<patch>/enabled. It causes that the new - code will not longer get called. But it does not guarantee - that anyone is not sleeping anywhere in the new code. - + Livepatch works reliably only when the dynamic ftrace is located at the very beginning of the function. diff --git a/Documentation/md/md-cluster.txt b/Documentation/md/md-cluster.txt index 38883276d31c379fe6de344537b08d0b4e10e6a9..82ee51604e9ad498ca52a66dc95086ae10d1e98b 100644 --- a/Documentation/md/md-cluster.txt +++ b/Documentation/md/md-cluster.txt @@ -77,7 +77,7 @@ There are three groups of locks for managing the device: 3.1.2 RESYNCING: informs other nodes that a resync is initiated or ended so that each node may suspend or resume the region. Each RESYNCING message identifies a range of the devices that the - sending node is about to resync. This over-rides any pervious + sending node is about to resync. This overrides any previous notification from that node: only one ranged can be resynced at a time per-node. @@ -321,4 +321,4 @@ The algorithm is: There are somethings which are not supported by cluster MD yet. -- update size and change array_sectors. +- change array_sectors. diff --git a/Documentation/md/raid5-ppl.txt b/Documentation/md/raid5-ppl.txt new file mode 100644 index 0000000000000000000000000000000000000000..127072b093632701f7bb598bae603dd2ec3ede26 --- /dev/null +++ b/Documentation/md/raid5-ppl.txt @@ -0,0 +1,44 @@ +Partial Parity Log + +Partial Parity Log (PPL) is a feature available for RAID5 arrays. The issue +addressed by PPL is that after a dirty shutdown, parity of a particular stripe +may become inconsistent with data on other member disks. If the array is also +in degraded state, there is no way to recalculate parity, because one of the +disks is missing.
This can lead to silent data corruption when rebuilding the +array or using it as degraded - data calculated from parity for array blocks +that have not been touched by a write request during the unclean shutdown can +be incorrect. Such a condition is known as the RAID5 Write Hole. Because of +this, md by default does not allow starting a dirty degraded array. + +Partial parity for a write operation is the XOR of stripe data chunks not +modified by this write. It is just enough data needed for recovering from the +write hole. XORing partial parity with the modified chunks produces parity for +the stripe, consistent with its state before the write operation, regardless of +which chunk writes have completed. If one of the unmodified data disks of +this stripe is missing, this updated parity can be used to recover its +contents. PPL recovery is also performed when starting an array after an +unclean shutdown and all disks are available, eliminating the need to resync +the array. Because of this, using a write-intent bitmap and PPL together is not +supported. + +When handling a write request, PPL writes partial parity before new data and +parity are dispatched to disks. PPL is a distributed log - it is stored on +array member drives in the metadata area, on the parity drive of a particular +stripe. It does not require a dedicated journaling drive. Write performance is +reduced by up to 30%-40% but it scales with the number of drives in the array +and the journaling drive does not become a bottleneck or a single point of +failure. + +Unlike raid5-cache, the other solution in md for closing the write hole, PPL is +not a true journal. It does not protect from losing in-flight data, only from +silent data corruption. If a dirty disk of a stripe is lost, no PPL recovery is +performed for this stripe (parity is not updated). So it is possible to have +arbitrary data in the written part of a stripe if that disk is lost. In such +a case the behavior is the same as in plain raid5. + +PPL is available for md version-1 metadata and external (specifically IMSM) +metadata arrays. It can be enabled using mdadm option --consistency-policy=ppl. + +Currently, volatile write-back cache should be disabled on all member drives +when using PPL. Otherwise it cannot guarantee consistency in case of power +failure.
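To make the partial parity definition above concrete, here is a small illustrative sketch (not the md implementation) that computes partial parity as the XOR of the stripe's data chunks not modified by a write:

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <string.h>

	/*
	 * Illustrative only: XOR together the data chunks NOT touched by the
	 * write. XORing the result with the modified chunks afterwards yields
	 * the stripe's parity as it was before the write operation.
	 */
	static void compute_partial_parity(uint8_t *pp, uint8_t *const chunks[],
					   const bool modified[], int nchunks,
					   size_t chunk_len)
	{
		size_t i;
		int c;

		memset(pp, 0, chunk_len);
		for (c = 0; c < nchunks; c++) {
			if (modified[c])
				continue;
			for (i = 0; i < chunk_len; i++)
				pp[i] ^= chunks[c][i];
		}
	}
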
diff --git a/Documentation/media/Makefile b/Documentation/media/Makefile index 9b3e70b2cab29ddb02a3b61878f67569410764d1..36166952d555fbc5444e6c2c9c11c081c81df2f1 100644 --- a/Documentation/media/Makefile +++ b/Documentation/media/Makefile @@ -1,51 +1,6 @@ -# Rules to convert DOT and SVG to Sphinx images - -SRC_DIR=$(srctree)/Documentation/media - -DOTS = \ - uapi/v4l/pipeline.dot \ - -IMAGES = \ - typical_media_device.svg \ - uapi/dvb/dvbstb.svg \ - uapi/v4l/bayer.svg \ - uapi/v4l/constraints.svg \ - uapi/v4l/crop.svg \ - uapi/v4l/fieldseq_bt.svg \ - uapi/v4l/fieldseq_tb.svg \ - uapi/v4l/nv12mt.svg \ - uapi/v4l/nv12mt_example.svg \ - uapi/v4l/pipeline.svg \ - uapi/v4l/selection.svg \ - uapi/v4l/subdev-image-processing-full.svg \ - uapi/v4l/subdev-image-processing-scaling-multi-source.svg \ - uapi/v4l/subdev-image-processing-crop.svg \ - uapi/v4l/vbi_525.svg \ - uapi/v4l/vbi_625.svg \ - uapi/v4l/vbi_hsync.svg \ - -DOTTGT := $(patsubst %.dot,%.svg,$(DOTS)) -IMGDOT := $(patsubst %,$(SRC_DIR)/%,$(DOTTGT)) - -IMGTGT := $(patsubst %.svg,%.pdf,$(IMAGES)) -IMGPDF := $(patsubst %,$(SRC_DIR)/%,$(IMGTGT)) - -cmd = $(echo-cmd) $(cmd_$(1)) - -quiet_cmd_genpdf = GENPDF $2 - cmd_genpdf = convert $2 $3 - -quiet_cmd_gendot = DOT $2 - cmd_gendot = dot -Tsvg $2 > $3 || { rm -f $3; exit 1; } - -%.pdf: %.svg - @$(call cmd,genpdf,$<,$@) - -%.svg: %.dot - @$(call cmd,gendot,$<,$@) - # Rules to convert a .h file to inline RST documentation +SRC_DIR=$(srctree)/Documentation/media PARSER = $(srctree)/Documentation/sphinx/parse-headers.pl UAPI = $(srctree)/include/uapi/linux KAPI = $(srctree)/include/linux diff --git a/Documentation/media/intro.rst b/Documentation/media/intro.rst index 8f7490c9a8efcda19316968085fce830feeba624..9ce2e23a0236796c557b3dd371ad71db845a646a 100644 --- a/Documentation/media/intro.rst +++ b/Documentation/media/intro.rst @@ -13,9 +13,9 @@ A typical media device hardware is shown at :ref:`typical_media_device`. .. _typical_media_device: -.. figure:: typical_media_device.* - :alt: typical_media_device.pdf / typical_media_device.svg - :align: center +.. kernel-figure:: typical_media_device.svg + :alt: typical_media_device.svg + :align: center Typical Media Device diff --git a/Documentation/media/kapi/cec-core.rst b/Documentation/media/kapi/cec-core.rst index 81c6d8e93774a89bd4d1d745b512ea41e7d1a513..7a04c5386dc8a3ba91c87816eaf9295e194882c6 100644 --- a/Documentation/media/kapi/cec-core.rst +++ b/Documentation/media/kapi/cec-core.rst @@ -27,11 +27,8 @@ HDMI 1.3a specification is sufficient: http://www.microprocessor.org/HDMISpecification13a.pdf -The Kernel Interface -==================== - -CEC Adapter ------------ +CEC Adapter Interface +--------------------- The struct cec_adapter represents the CEC adapter hardware. It is created by calling cec_allocate_adapter() and deleted by calling cec_delete_adapter(): @@ -51,6 +48,7 @@ ops: priv: will be stored in adap->priv and can be used by the adapter ops. + Use cec_get_drvdata(adap) to get the priv pointer. name: the name of the CEC adapter. Note: this name will be copied. @@ -65,6 +63,10 @@ available_las: the number of simultaneous logical addresses that this adapter can handle. Must be 1 <= available_las <= CEC_MAX_LOG_ADDRS. +To obtain the priv pointer use this helper function: + +.. 
c:function:: + void *cec_get_drvdata(const struct cec_adapter *adap); To register the /dev/cecX device node and the remote control device (if CEC_CAP_RC is set) you call: diff --git a/Documentation/media/kapi/csi2.rst b/Documentation/media/kapi/csi2.rst index 2004db00b12b56a7b56639ec41589b80f2892440..e33fcb967922d460f41a865eb148ee46969427ac 100644 --- a/Documentation/media/kapi/csi2.rst +++ b/Documentation/media/kapi/csi2.rst @@ -45,10 +45,11 @@ where * - bits_per_sample - Number of bits per sample. -The transmitter drivers must configure the CSI-2 transmitter to *LP-11 -mode* whenever the transmitter is powered on but not active. Some -transmitters do this automatically but some have to be explicitly -programmed to do so. +The transmitter drivers must, if possible, configure the CSI-2 +transmitter to *LP-11 mode* whenever the transmitter is powered on but +not active. Some transmitters do this automatically but some have to +be explicitly programmed to do so, and some are unable to do so +altogether due to hardware constraints. Receiver drivers ---------------- diff --git a/Documentation/media/kapi/v4l2-core.rst b/Documentation/media/kapi/v4l2-core.rst index e9677150ed99d45c8922801e37599e8a93133d32..d8f6c46d26d58d58db6384434c4b2583ffc29155 100644 --- a/Documentation/media/kapi/v4l2-core.rst +++ b/Documentation/media/kapi/v4l2-core.rst @@ -1,4 +1,4 @@ -Video2Linux devices +Video4Linux devices ------------------- .. toctree:: diff --git a/Documentation/media/lirc.h.rst.exceptions b/Documentation/media/lirc.h.rst.exceptions index 246c850151d71343b0d212fbbc0f4f631590bfc0..c130617a9986ef4ccab1c060f319e00cb2251449 100644 --- a/Documentation/media/lirc.h.rst.exceptions +++ b/Documentation/media/lirc.h.rst.exceptions @@ -35,7 +35,6 @@ ignore define PULSE_MASK ignore define LIRC_MODE2_SPACE ignore define LIRC_MODE2_PULSE -ignore define LIRC_MODE2_TIMEOUT ignore define LIRC_VALUE_MASK ignore define LIRC_MODE2_MASK diff --git a/Documentation/media/uapi/cec/cec-func-ioctl.rst b/Documentation/media/uapi/cec/cec-func-ioctl.rst index 7dcfd178fb243c33f6ba23bb331236496018809c..22fb6304a2df048c20a5fb8b9046d69d78df3493 100644 --- a/Documentation/media/uapi/cec/cec-func-ioctl.rst +++ b/Documentation/media/uapi/cec/cec-func-ioctl.rst @@ -30,7 +30,7 @@ Arguments ``request`` CEC ioctl request code as defined in the cec.h header file, for - example :c:func:`CEC_ADAP_G_CAPS`. + example :ref:`CEC_ADAP_G_CAPS `. ``argp`` Pointer to a request-specific structure. diff --git a/Documentation/media/uapi/cec/cec-func-open.rst b/Documentation/media/uapi/cec/cec-func-open.rst index 0304388cd15976a7ff00eb46ca9bc9ef5f0a3c94..18dfb62f2efe7de04aa8812aa4d9f1bf0abf18fc 100644 --- a/Documentation/media/uapi/cec/cec-func-open.rst +++ b/Documentation/media/uapi/cec/cec-func-open.rst @@ -33,7 +33,7 @@ Arguments Open flags. Access mode must be ``O_RDWR``. 
When the ``O_NONBLOCK`` flag is given, the - :ref:`CEC_RECEIVE ` and :c:func:`CEC_DQEVENT` ioctls + :ref:`CEC_RECEIVE ` and :ref:`CEC_DQEVENT ` ioctls will return the ``EAGAIN`` error code when no message or event is available, and ioctls :ref:`CEC_TRANSMIT `, :ref:`CEC_ADAP_S_PHYS_ADDR ` and diff --git a/Documentation/media/uapi/cec/cec-func-poll.rst b/Documentation/media/uapi/cec/cec-func-poll.rst index 6a863cfda6e05172be7e60e930fbd0ff885b3ce4..fa0abd8fb16056ac370e35ae1f0a964ee1ead020 100644 --- a/Documentation/media/uapi/cec/cec-func-poll.rst +++ b/Documentation/media/uapi/cec/cec-func-poll.rst @@ -30,7 +30,7 @@ Arguments List of FD events to be watched ``nfds`` - Number of FD efents at the \*ufds array + Number of FD events at the \*ufds array ``timeout`` Timeout to wait for events @@ -49,7 +49,7 @@ is non-zero). CEC devices set the ``POLLIN`` and ``POLLRDNORM`` flags in the ``revents`` field if there are messages in the receive queue. If the transmit queue has room for new messages, the ``POLLOUT`` and ``POLLWRNORM`` flags are set. If there are events in the event queue, -then the ``POLLPRI`` flag is set. When the function timed out it returns +then the ``POLLPRI`` flag is set. When the function times out it returns a value of zero, on failure it returns -1 and the ``errno`` variable is set appropriately. diff --git a/Documentation/media/uapi/cec/cec-ioc-adap-g-log-addrs.rst b/Documentation/media/uapi/cec/cec-ioc-adap-g-log-addrs.rst index 09f09bbe28d4ffb1d5b3291221c4b678e4e38c6e..fcf863ab6f4378343dc764f29c5baca37438f6f1 100644 --- a/Documentation/media/uapi/cec/cec-ioc-adap-g-log-addrs.rst +++ b/Documentation/media/uapi/cec/cec-ioc-adap-g-log-addrs.rst @@ -351,3 +351,16 @@ On success 0 is returned, on error -1 and the ``errno`` variable is set appropriately. The generic error codes are described at the :ref:`Generic Error Codes ` chapter. +The :ref:`ioctl CEC_ADAP_S_LOG_ADDRS ` can return the following +error codes: + +ENOTTY + The ``CEC_CAP_LOG_ADDRS`` capability wasn't set, so this ioctl is not supported. + +EBUSY + The CEC adapter is currently configuring itself, or it is already configured and + ``num_log_addrs`` is non-zero, or another filehandle is in exclusive follower or + initiator mode, or the filehandle is in mode ``CEC_MODE_NO_INITIATOR``. + +EINVAL + The contents of struct :c:type:`cec_log_addrs` is invalid. diff --git a/Documentation/media/uapi/cec/cec-ioc-adap-g-phys-addr.rst b/Documentation/media/uapi/cec/cec-ioc-adap-g-phys-addr.rst index a3cdc75cec3e3c0754027869cee43ad8ca019260..9e49d4be35d5b6b7332ee5c6522400f57012b55e 100644 --- a/Documentation/media/uapi/cec/cec-ioc-adap-g-phys-addr.rst +++ b/Documentation/media/uapi/cec/cec-ioc-adap-g-phys-addr.rst @@ -78,3 +78,16 @@ Return Value On success 0 is returned, on error -1 and the ``errno`` variable is set appropriately. The generic error codes are described at the :ref:`Generic Error Codes ` chapter. + +The :ref:`ioctl CEC_ADAP_S_PHYS_ADDR ` can return the following +error codes: + +ENOTTY + The ``CEC_CAP_PHYS_ADDR`` capability wasn't set, so this ioctl is not supported. + +EBUSY + Another filehandle is in exclusive follower or initiator mode, or the filehandle + is in mode ``CEC_MODE_NO_INITIATOR``. + +EINVAL + The physical address is malformed. 
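A hedged userspace sketch exercising the ioctls whose error codes are listed above; the device node and the physical address value are hypothetical:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/cec.h>

	int main(void)
	{
		struct cec_caps caps = {};
		__u16 pa = 0x1000;	/* hypothetical physical address 1.0.0.0 */
		int fd = open("/dev/cec0", O_RDWR);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (ioctl(fd, CEC_ADAP_G_CAPS, &caps) == 0)
			printf("adapter: %s, caps 0x%08x\n",
			       caps.name, caps.capabilities);

		/* Fails with ENOTTY if CEC_CAP_PHYS_ADDR is not set, see above. */
		if ((caps.capabilities & CEC_CAP_PHYS_ADDR) &&
		    ioctl(fd, CEC_ADAP_S_PHYS_ADDR, &pa))
			perror("CEC_ADAP_S_PHYS_ADDR");
		return 0;
	}
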
diff --git a/Documentation/media/uapi/cec/cec-ioc-dqevent.rst b/Documentation/media/uapi/cec/cec-ioc-dqevent.rst index 6e589a1fae1704c0eea188c662537d37ea6142b5..4d3570c2e0b3662183743be39de726946a3b2595 100644 --- a/Documentation/media/uapi/cec/cec-ioc-dqevent.rst +++ b/Documentation/media/uapi/cec/cec-ioc-dqevent.rst @@ -56,7 +56,7 @@ it is guaranteed that the state did change in between the two events. * - __u16 - ``phys_addr`` - The current physical address. This is ``CEC_PHYS_ADDR_INVALID`` if no - valid physical address is set. + valid physical address is set. * - __u16 - ``log_addr_mask`` - The current set of claimed logical addresses. This is 0 if no logical @@ -174,3 +174,14 @@ Return Value On success 0 is returned, on error -1 and the ``errno`` variable is set appropriately. The generic error codes are described at the :ref:`Generic Error Codes ` chapter. + +The :ref:`ioctl CEC_DQEVENT ` can return the following +error codes: + +EAGAIN + This is returned when the filehandle is in non-blocking mode and there + are no pending events. + +ERESTARTSYS + An interrupt (e.g. Ctrl-C) arrived while in blocking mode waiting for + events to arrive. diff --git a/Documentation/media/uapi/cec/cec-ioc-g-mode.rst b/Documentation/media/uapi/cec/cec-ioc-g-mode.rst index e4ded9df0a84a3be52a4f2a5fb09359e1d4fe285..664f0d47bbcdc753a81587ce24b7bd4b4c1f3b30 100644 --- a/Documentation/media/uapi/cec/cec-ioc-g-mode.rst +++ b/Documentation/media/uapi/cec/cec-ioc-g-mode.rst @@ -249,3 +249,15 @@ Return Value On success 0 is returned, on error -1 and the ``errno`` variable is set appropriately. The generic error codes are described at the :ref:`Generic Error Codes ` chapter. + +The :ref:`ioctl CEC_S_MODE ` can return the following +error codes: + +EINVAL + The requested mode is invalid. + +EPERM + Monitor mode is requested without having root permissions + +EBUSY + Someone else is already an exclusive follower or initiator. diff --git a/Documentation/media/uapi/cec/cec-ioc-receive.rst b/Documentation/media/uapi/cec/cec-ioc-receive.rst index dc2adb391c0a2d2183e3e00bc90af520e727c6dd..267044f7ac301244ffeae3d07439b6fc4470b534 100644 --- a/Documentation/media/uapi/cec/cec-ioc-receive.rst +++ b/Documentation/media/uapi/cec/cec-ioc-receive.rst @@ -51,13 +51,13 @@ A received message can be: be non-zero). To send a CEC message the application has to fill in the struct -:c:type:` cec_msg` and pass it to :ref:`ioctl CEC_TRANSMIT `. +:c:type:`cec_msg` and pass it to :ref:`ioctl CEC_TRANSMIT `. The :ref:`ioctl CEC_TRANSMIT ` is only available if ``CEC_CAP_TRANSMIT`` is set. If there is no more room in the transmit queue, then it will return -1 and set errno to the ``EBUSY`` error code. The transmit queue has enough room for 18 messages (about 1 second worth of 2-byte messages). Note that the CEC kernel framework will also reply -to core messages (see :ref:cec-core-processing), so it is not a good +to core messages (see :ref:`cec-core-processing`), so it is not a good idea to fully fill up the transmit queue. If the file descriptor is in non-blocking mode then the transmit will @@ -69,6 +69,18 @@ The ``sequence`` field is filled in for every transmit and this can be checked against the received messages to find the corresponding transmit result. +Normally calling :ref:`ioctl CEC_TRANSMIT ` when the physical +address is invalid (due to e.g. a disconnect) will return ``ENONET``. 
+ +However, the CEC specification allows sending messages from 'Unregistered' to +'TV' when the physical address is invalid since some TVs pull the hotplug detect +pin of the HDMI connector low when they go into standby, or when switching to +another input. + +When the hotplug detect pin goes low the EDID disappears, and thus the +physical address, but the cable is still connected and CEC still works. +In order to detect/wake up the device it is allowed to send poll and 'Image/Text +View On' messages from initiator 0xf ('Unregistered') to destination 0 ('TV'). .. tabularcolumns:: |p{1.0cm}|p{3.5cm}|p{13.0cm}| @@ -289,3 +301,42 @@ Return Value On success 0 is returned, on error -1 and the ``errno`` variable is set appropriately. The generic error codes are described at the :ref:`Generic Error Codes ` chapter. + +The :ref:`ioctl CEC_RECEIVE ` can return the following +error codes: + +EAGAIN + No messages are in the receive queue, and the filehandle is in non-blocking mode. + +ETIMEDOUT + The ``timeout`` was reached while waiting for a message. + +ERESTARTSYS + The wait for a message was interrupted (e.g. by Ctrl-C). + +The :ref:`ioctl CEC_TRANSMIT ` can return the following +error codes: + +ENOTTY + The ``CEC_CAP_TRANSMIT`` capability wasn't set, so this ioctl is not supported. + +EPERM + The CEC adapter is not configured, i.e. :ref:`ioctl CEC_ADAP_S_LOG_ADDRS ` + has never been called. + +ENONET + The CEC adapter is not configured, i.e. :ref:`ioctl CEC_ADAP_S_LOG_ADDRS ` + was called, but the physical address is invalid so no logical address was claimed. + An exception is made in this case for transmits from initiator 0xf ('Unregistered') + to destination 0 ('TV'). In that case the transmit will proceed as usual. + +EBUSY + Another filehandle is in exclusive follower or initiator mode, or the filehandle + is in mode ``CEC_MODE_NO_INITIATOR``. This is also returned if the transmit + queue is full. + +EINVAL + The contents of struct :c:type:`cec_msg` is invalid. + +ERESTARTSYS + The wait for a successful transmit was interrupted (e.g. by Ctrl-C). diff --git a/Documentation/media/uapi/dvb/intro.rst b/Documentation/media/uapi/dvb/intro.rst index 2ed5c23102b492dc5cb134ac2b63c24a63dd466a..652c4aacd2c6da91b010e041c780c3bcb7f9ae17 100644 --- a/Documentation/media/uapi/dvb/intro.rst +++ b/Documentation/media/uapi/dvb/intro.rst @@ -55,9 +55,9 @@ Overview .. _stb_components: -.. figure:: dvbstb.* - :alt: dvbstb.pdf / dvbstb.svg - :align: center +.. kernel-figure:: dvbstb.svg + :alt: dvbstb.svg + :align: center Components of a DVB card/STB diff --git a/Documentation/media/uapi/mediactl/media-types.rst b/Documentation/media/uapi/mediactl/media-types.rst index 3e03dc2e60031b49636d4036a9a1943193cf494c..2a5164aea2b404b807033fd07bc5a6e896cac36c 100644 --- a/Documentation/media/uapi/mediactl/media-types.rst +++ b/Documentation/media/uapi/mediactl/media-types.rst @@ -284,7 +284,8 @@ Types and flags used to represent the media graph elements supported scaling ratios is entity-specific and can differ between the horizontal and vertical directions (in particular scaling can be supported in one direction only). Binning and - skipping are considered as scaling. + sub-sampling (occasionally also referred to as skipping) are + considered as scaling. - .. 
row 28 diff --git a/Documentation/media/uapi/rc/lirc-dev-intro.rst b/Documentation/media/uapi/rc/lirc-dev-intro.rst index ef97e40f2fd873c7f951f138d877bd4d656a314c..d1936eeb9ce06e607868072a2287b09a49f25dbf 100644 --- a/Documentation/media/uapi/rc/lirc-dev-intro.rst +++ b/Documentation/media/uapi/rc/lirc-dev-intro.rst @@ -27,6 +27,8 @@ What you should see for a chardev: $ ls -l /dev/lirc* crw-rw---- 1 root root 248, 0 Jul 2 22:20 /dev/lirc0 +.. _lirc_modes: + ********** LIRC modes ********** @@ -38,25 +40,62 @@ on the following table. ``LIRC_MODE_MODE2`` - The driver returns a sequence of pulse and space codes to userspace. + The driver returns a sequence of pulse and space codes to userspace, + as a series of u32 values. This mode is used only for IR receive. + The upper 8 bits determine the packet type, and the lower 24 bits + the payload. Use the ``LIRC_VALUE()`` macro to get the payload, and + the ``LIRC_MODE2()`` macro to get the type, which + is one of: + + ``LIRC_MODE2_PULSE`` + + Signifies the presence of IR in microseconds. + + ``LIRC_MODE2_SPACE`` + + Signifies the absence of IR in microseconds. + + ``LIRC_MODE2_FREQUENCY`` + + If measurement of the carrier frequency was enabled with + :ref:`lirc_set_measure_carrier_mode` then this packet gives you + the carrier frequency in Hertz. + + ``LIRC_MODE2_TIMEOUT`` + + If timeout reports are enabled with + :ref:`lirc_set_rec_timeout_reports`, when the timeout set with + :ref:`lirc_set_rec_timeout` expires due to no IR being detected, + this packet will be sent, with the number of microseconds with + no IR. + .. _lirc-mode-lirccode: ``LIRC_MODE_LIRCCODE`` - The IR signal is decoded internally by the receiver. The LIRC interface - returns the scancode as an integer value. This is the usual mode used - by several TV media cards. + This mode can be used for IR receive and send. - This mode is used only for IR receive. + The IR signal is decoded internally by the receiver, or encoded by the + transmitter. The LIRC interface represents the scancode as a byte string, + which might not be a u32; it can be any length. The value is entirely + driver dependent. This mode is used by some older lirc drivers. + + The length of each code depends on the driver, which can be retrieved + with :ref:`lirc_get_length`. This length is used both + for transmitting and receiving IR. .. _lirc-mode-pulse: ``LIRC_MODE_PULSE`` - On puse mode, a sequence of pulse/space integer values are written to the - lirc device using :Ref:`lirc-write`. + In pulse mode, a sequence of pulse/space integer values are written to the + lirc device using :ref:`lirc-write`. + + The values are alternating pulse and space lengths, in microseconds. The + first and last entry must be a pulse, so there must be an odd number + of entries. This mode is used only for IR send. diff --git a/Documentation/media/uapi/rc/lirc-get-features.rst b/Documentation/media/uapi/rc/lirc-get-features.rst index 79e07b4d44d681e04f2679ffb608d3943ecba905..64f89a4f9d9ce2d937cb6b518e900a96ba8b4975 100644 --- a/Documentation/media/uapi/rc/lirc-get-features.rst +++ b/Documentation/media/uapi/rc/lirc-get-features.rst @@ -48,8 +48,8 @@ LIRC features ``LIRC_CAN_REC_PULSE`` - The driver is capable of receiving using - :ref:`LIRC_MODE_PULSE <lirc-mode-pulse>`. + Unused. Kept just to avoid breaking uAPI. + :ref:`LIRC_MODE_PULSE <lirc-mode-pulse>` can only be used for transmitting. .. _LIRC-CAN-REC-MODE2: @@ -156,19 +156,22 @@ LIRC features ``LIRC_CAN_SEND_PULSE`` - The driver supports sending using :ref:`LIRC_MODE_PULSE <lirc-mode-pulse>`.
+ The driver supports sending (also called IR blasting or IR TX) using + :ref:`LIRC_MODE_PULSE <lirc-mode-pulse>`. .. _LIRC-CAN-SEND-MODE2: ``LIRC_CAN_SEND_MODE2`` - The driver supports sending using :ref:`LIRC_MODE_MODE2 <lirc-mode-mode2>`. + Unused. Kept just to avoid breaking uAPI. + :ref:`LIRC_MODE_MODE2 <lirc-mode-mode2>` can only be used for receiving. .. _LIRC-CAN-SEND-LIRCCODE: ``LIRC_CAN_SEND_LIRCCODE`` - The driver supports sending codes (also called as IR blasting or IR TX). + The driver supports sending (also called IR blasting or IR TX) using + :ref:`LIRC_MODE_LIRCCODE <lirc-mode-lirccode>`. Return Value diff --git a/Documentation/media/uapi/rc/lirc-get-length.rst b/Documentation/media/uapi/rc/lirc-get-length.rst index 8c2747c8d2c9e2cab495e47acd87b19d86984a1f..3990af5de0e917fe135b8333e7ada35b90a9481d 100644 --- a/Documentation/media/uapi/rc/lirc-get-length.rst +++ b/Documentation/media/uapi/rc/lirc-get-length.rst @@ -30,7 +30,8 @@ Arguments Description =========== -Retrieves the code length in bits (only for ``LIRC-MODE-LIRCCODE``). +Retrieves the code length in bits (only for +:ref:`LIRC_MODE_LIRCCODE <lirc-mode-lirccode>`). Reads on the device must be done in blocks matching the bit count. The bit count should be rounded up so that it matches full bytes. diff --git a/Documentation/media/uapi/rc/lirc-get-rec-mode.rst b/Documentation/media/uapi/rc/lirc-get-rec-mode.rst index a5023e0194c13223c2a6a9b433af41cbed5e3d58..a4eb6c0a26e9ca9481528b958086d945ca4524cc 100644 --- a/Documentation/media/uapi/rc/lirc-get-rec-mode.rst +++ b/Documentation/media/uapi/rc/lirc-get-rec-mode.rst @@ -35,8 +35,8 @@ Description Get/set supported receive modes. Only :ref:`LIRC_MODE_MODE2 <lirc-mode-mode2>` and :ref:`LIRC_MODE_LIRCCODE <lirc-mode-lirccode>` are supported for IR -receive. - +receive. Use :ref:`lirc_get_features` to find out which modes the driver +supports. Return Value ============ diff --git a/Documentation/media/uapi/rc/lirc-get-send-mode.rst b/Documentation/media/uapi/rc/lirc-get-send-mode.rst index 51ac13428969add93744281d38604411945ef6c4..a169b234290e9023572a1ae8d78fdc434a5f9b02 100644 --- a/Documentation/media/uapi/rc/lirc-get-send-mode.rst +++ b/Documentation/media/uapi/rc/lirc-get-send-mode.rst @@ -34,9 +34,12 @@ Arguments Description =========== -Get/set supported transmit mode. +Get/set current transmit mode. -Only :ref:`LIRC_MODE_PULSE <lirc-mode-pulse>` is supported by for IR send. +Only :ref:`LIRC_MODE_PULSE <lirc-mode-pulse>` and +:ref:`LIRC_MODE_LIRCCODE <lirc-mode-lirccode>` are supported for IR send, +depending on the driver. Use :ref:`lirc_get_features` to find out which +modes the driver supports. Return Value ============ diff --git a/Documentation/media/uapi/rc/lirc-read.rst b/Documentation/media/uapi/rc/lirc-read.rst index 4c678f60e87204908ad148b027ce60dc5ae4596b..ff14a69104e54f373a963eddacb259ffd4fc025d 100644 --- a/Documentation/media/uapi/rc/lirc-read.rst +++ b/Documentation/media/uapi/rc/lirc-read.rst @@ -44,17 +44,13 @@ descriptor ``fd`` into the buffer starting at ``buf``. If ``count`` is zero, :ref:`read() <func-read>` returns zero and has no other results. If ``count`` is greater than ``SSIZE_MAX``, the result is unspecified. -The lircd userspace daemon reads raw IR data from the LIRC chardev. The -exact format of the data depends on what modes a driver supports, and -what mode has been selected. lircd obtains supported modes and sets the -active mode via the ioctl interface, detailed at :ref:`lirc_func`. -The generally preferred mode for receive is -:ref:`LIRC_MODE_MODE2 <lirc-mode-mode2>`, in which packets containing an -int value describing an IR signal are read from the chardev.
+The exact format of the data depends on what :ref:`lirc_modes` a driver +uses. Use :ref:`lirc_get_features` to get the supported mode. -See also -`http://www.lirc.org/html/technical.html `__ -for more info. +The generally preferred mode for receive is +:ref:`LIRC_MODE_MODE2 `, +in which packets containing an int value describing an IR signal are +read from the chardev. Return Value ============ diff --git a/Documentation/media/uapi/rc/lirc-set-rec-carrier-range.rst b/Documentation/media/uapi/rc/lirc-set-rec-carrier-range.rst index a83fbbfa0d3bcd12517907c4f14535d1f4a4b18f..a89246806c4b908a52931200f3cd2b2a804dd13c 100644 --- a/Documentation/media/uapi/rc/lirc-set-rec-carrier-range.rst +++ b/Documentation/media/uapi/rc/lirc-set-rec-carrier-range.rst @@ -9,7 +9,7 @@ ioctl LIRC_SET_REC_CARRIER_RANGE Name ==== -LIRC_SET_REC_CARRIER_RANGE - Set lower bond of the carrier used to modulate +LIRC_SET_REC_CARRIER_RANGE - Set lower bound of the carrier used to modulate IR receive. Synopsis diff --git a/Documentation/media/uapi/rc/lirc-set-rec-timeout-reports.rst b/Documentation/media/uapi/rc/lirc-set-rec-timeout-reports.rst index 9c501bbf4c626c25401545c35d0546d2a827403a..86353e6026955f9ff85005325b3d1cc267886a76 100644 --- a/Documentation/media/uapi/rc/lirc-set-rec-timeout-reports.rst +++ b/Documentation/media/uapi/rc/lirc-set-rec-timeout-reports.rst @@ -31,6 +31,8 @@ Arguments Description =========== +.. _lirc-mode2-timeout: + Enable or disable timeout reports for IR receive. By default, timeout reports should be turned off. diff --git a/Documentation/media/uapi/rc/lirc-write.rst b/Documentation/media/uapi/rc/lirc-write.rst index 3b035c6613b1b4656ef4abdd84048c10d79c88a8..2aad0fef4a5b19f242ff4d42841551214005d4a8 100644 --- a/Documentation/media/uapi/rc/lirc-write.rst +++ b/Documentation/media/uapi/rc/lirc-write.rst @@ -42,13 +42,16 @@ Description referenced by the file descriptor ``fd`` from the buffer starting at ``buf``. -The data written to the chardev is a pulse/space sequence of integer -values. Pulses and spaces are only marked implicitly by their position. -The data must start and end with a pulse, therefore, the data must -always include an uneven number of samples. The write function must -block until the data has been transmitted by the hardware. If more data -is provided than the hardware can send, the driver returns ``EINVAL``. - +The exact format of the data depends on what mode a driver uses, use +:ref:`lirc_get_features` to get the supported mode. + +When in :ref:`LIRC_MODE_PULSE ` mode, the data written to +the chardev is a pulse/space sequence of integer values. Pulses and spaces +are only marked implicitly by their position. The data must start and end +with a pulse, therefore, the data must always include an uneven number of +samples. The write function must block until the data has been transmitted +by the hardware. If more data is provided than the hardware can send, the +driver returns ``EINVAL``. Return Value ============ diff --git a/Documentation/media/uapi/v4l/buffer.rst b/Documentation/media/uapi/v4l/buffer.rst index ac58966ccb9b22530f343eea2e4c7d62c5151ba7..ae6ee73f151c658dc613860f2d39da1cac7de0dd 100644 --- a/Documentation/media/uapi/v4l/buffer.rst +++ b/Documentation/media/uapi/v4l/buffer.rst @@ -34,6 +34,125 @@ flags are copied from the OUTPUT video buffer to the CAPTURE video buffer. 
+Interactions between formats, controls and buffers +================================================== + +V4L2 exposes parameters that influence the buffer size, or the way data is +laid out in the buffer. Those parameters are exposed through both formats and +controls. One example of such a control is the ``V4L2_CID_ROTATE`` control +that modifies the direction in which pixels are stored in the buffer, as well +as the buffer size when the selected format includes padding at the end of +lines. + +The set of information needed to interpret the content of a buffer (e.g. the +pixel format, the line stride, the tiling orientation or the rotation) is +collectively referred to in the rest of this section as the buffer layout. + +Controls that can modify the buffer layout shall set the +``V4L2_CTRL_FLAG_MODIFY_LAYOUT`` flag. + +Modifying formats or controls that influence the buffer size or layout requires +the stream to be stopped. Any attempt at such a modification while the stream +is active shall cause the ioctl setting the format or the control to return +the ``EBUSY`` error code. In that case drivers shall also set the +``V4L2_CTRL_FLAG_GRABBED`` flag when calling +:c:func:`VIDIOC_QUERYCTRL` or :c:func:`VIDIOC_QUERY_EXT_CTRL` for such a +control while the stream is active. + +.. note:: + + The :c:func:`VIDIOC_S_SELECTION` ioctl can, depending on the hardware (for + instance if the device doesn't include a scaler), modify the format in + addition to the selection rectangle. Similarly, the + :c:func:`VIDIOC_S_INPUT`, :c:func:`VIDIOC_S_OUTPUT`, :c:func:`VIDIOC_S_STD` + and :c:func:`VIDIOC_S_DV_TIMINGS` ioctls can also modify the format and + selection rectangles. When those ioctls result in a buffer size or layout + change, drivers shall handle that condition as they would handle it in the + :c:func:`VIDIOC_S_FMT` ioctl in all cases described in this section. + +Controls that only influence the buffer layout can be modified at any time +when the stream is stopped. As they don't influence the buffer size, no +special handling is needed to synchronize those controls with buffer +allocation and the ``V4L2_CTRL_FLAG_GRABBED`` flag is cleared once the +stream is stopped. + +Formats and controls that influence the buffer size interact with buffer +allocation. The simplest way to handle this is for drivers to always require +buffers to be reallocated in order to change those formats or controls. In +that case, to perform such changes, userspace applications shall first stop +the video stream with the :c:func:`VIDIOC_STREAMOFF` ioctl if it is running +and free all buffers with the :c:func:`VIDIOC_REQBUFS` ioctl if they are +allocated. After freeing all buffers the ``V4L2_CTRL_FLAG_GRABBED`` flag +for controls is cleared. The format or controls can then be modified, and +buffers shall then be reallocated and the stream restarted. A typical ioctl +sequence is + + #. VIDIOC_STREAMOFF + #. VIDIOC_REQBUFS(0) + #. VIDIOC_S_EXT_CTRLS + #. VIDIOC_S_FMT + #. VIDIOC_REQBUFS(n) + #. VIDIOC_QBUF + #. VIDIOC_STREAMON + +The second :c:func:`VIDIOC_REQBUFS` call will take the new format and control +value into account to compute the buffer size to allocate. Applications can +also retrieve the size by calling the :c:func:`VIDIOC_G_FMT` ioctl if needed. + +.. note:: + + The API doesn't mandate the above order for control (3.) and format (4.) + changes. Format and controls can be set in a different order, or even + interleaved, depending on the device and use case.
For instance some + controls might behave differently for different pixel formats, in which + case the format might need to be set first. + +When reallocation is required, any attempt to modify format or controls that +influence the buffer size while buffers are allocated shall cause the format +or control set ioctl to return the ``EBUSY`` error. Any attempt to queue a +buffer too small for the current format or controls shall cause the +:c:func:`VIDIOC_QBUF` ioctl to return an ``EINVAL`` error. + +Buffer reallocation is an expensive operation. To avoid that cost, drivers can +(and are encouraged to) allow format or controls that influence the buffer +size to be changed with buffers allocated. In that case, a typical ioctl +sequence to modify format and controls is + + #. VIDIOC_STREAMOFF + #. VIDIOC_S_EXT_CTRLS + #. VIDIOC_S_FMT + #. VIDIOC_QBUF + #. VIDIOC_STREAMON + +For this sequence to operate correctly, queued buffers need to be large enough +for the new format or controls. Drivers shall return an ``ENOSPC`` error in +response to format change (:c:func:`VIDIOC_S_FMT`) or control changes +(:c:func:`VIDIOC_S_CTRL` or :c:func:`VIDIOC_S_EXT_CTRLS`) if buffers too small +for the new format are currently queued. As a simplification, drivers are +allowed to return an ``EBUSY`` error from these ioctls if any buffer is +currently queued, without checking the queued buffers' sizes. + +Additionally, drivers shall return an ``EINVAL`` error from the +:c:func:`VIDIOC_QBUF` ioctl if the buffer being queued is too small for the +current format or controls. Together, these requirements ensure that queued +buffers will always be large enough for the configured format and controls. + +Userspace applications can query the buffer size required for a given format +and controls by first setting the desired control values and then trying the +desired format. The :c:func:`VIDIOC_TRY_FMT` ioctl will return the required +buffer size. + + #. VIDIOC_S_EXT_CTRLS(x) + #. VIDIOC_TRY_FMT() + #. VIDIOC_S_EXT_CTRLS(y) + #. VIDIOC_TRY_FMT() + +The :c:func:`VIDIOC_CREATE_BUFS` ioctl can then be used to allocate buffers +based on the queried sizes (for instance by allocating a set of buffers large +enough for all the desired formats and controls, or by allocating a separate set +of appropriately sized buffers for each use case). + + .. c:type:: v4l2_buffer struct v4l2_buffer @@ -330,6 +449,9 @@ enum v4l2_buf_type - 12 - Buffer for Software Defined Radio (SDR) output stream, see :ref:`sdr`. + * - ``V4L2_BUF_TYPE_META_CAPTURE`` + - 13 + - Buffer for metadata capture, see :ref:`metadata`. diff --git a/Documentation/media/uapi/v4l/crop.rst b/Documentation/media/uapi/v4l/crop.rst index be58894c9c89d48a7bb58fff741d0313d8d1d4fe..182565b9ace492fcb622493ca7d674b426a56ac6 100644 --- a/Documentation/media/uapi/v4l/crop.rst +++ b/Documentation/media/uapi/v4l/crop.rst @@ -53,8 +53,8 @@ Cropping Structures .. _crop-scale: -.. figure:: crop.* - :alt: crop.pdf / crop.svg +.. kernel-figure:: crop.svg + :alt: crop.svg :align: center Image Cropping, Insertion and Scaling diff --git a/Documentation/media/uapi/v4l/depth-formats.rst b/Documentation/media/uapi/v4l/depth-formats.rst index 82f183870aaeb7c60e182768ead193d5f0d0eae8..d1641e9687a61d8fba79a53da38b393cd50a85ec 100644 --- a/Documentation/media/uapi/v4l/depth-formats.rst +++ b/Documentation/media/uapi/v4l/depth-formats.rst @@ -12,4 +12,5 @@ Depth data provides distance to points, mapped onto the image plane ..
toctree:: :maxdepth: 1 + pixfmt-inzi pixfmt-z16 diff --git a/Documentation/media/uapi/v4l/dev-capture.rst b/Documentation/media/uapi/v4l/dev-capture.rst index 32b32055d0700666d450de225508accf68f62e26..4218742ab5d91364fc873ca049e878cfd053007e 100644 --- a/Documentation/media/uapi/v4l/dev-capture.rst +++ b/Documentation/media/uapi/v4l/dev-capture.rst @@ -42,8 +42,8 @@ Video capture devices shall support :ref:`audio input

    ,qdf2400_e44". - */ - if (!strcmp(device->options, "qdf2400_e44")) - device->con->write = qdf2400_e44_early_write; - else - device->con->write = pl011_early_write; + device->con->write = pl011_early_write; return 0; } OF_EARLYCON_DECLARE(pl011, "arm,pl011", pl011_early_console_setup); OF_EARLYCON_DECLARE(pl011, "arm,sbsa-uart", pl011_early_console_setup); -EARLYCON_DECLARE(qdf2400_e44, pl011_early_console_setup); + +/* + * On Qualcomm Datacenter Technologies QDF2400 SOCs affected by + * Erratum 44, traditional earlycon can be enabled by specifying + * "earlycon=qdf2400_e44,
    ". Any options are ignored. + * + * Alternatively, you can just specify "earlycon", and the early console + * will be enabled with the information from the SPCR table. In this + * case, the SPCR code will detect the need for the E44 work-around, + * and set the console name to "qdf2400_e44". + */ +static int __init +qdf2400_e44_early_console_setup(struct earlycon_device *device, + const char *opt) +{ + if (!device->port.membase) + return -ENODEV; + + device->con->write = qdf2400_e44_early_write; + return 0; +} +EARLYCON_DECLARE(qdf2400_e44, qdf2400_e44_early_console_setup); #else #define AMBA_CONSOLE NULL diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index 1f50a83ef958609c3f27473b135e84e65303330d..c355ac9abafcc12adb5f8fd3205adaf4a16f86c1 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include @@ -71,6 +70,7 @@ #include #include "serial_mctrl_gpio.h" +#include "atmel_serial.h" static void atmel_start_rx(struct uart_port *port); static void atmel_stop_rx(struct uart_port *port); @@ -119,8 +119,9 @@ struct atmel_uart_char { /* * at91: 6 USARTs and one DBGU port (SAM9260) * avr32: 4 + * samx7: 3 USARTs and 5 UARTs */ -#define ATMEL_MAX_UART 7 +#define ATMEL_MAX_UART 8 /* * We wrap our port structure around the generic uart_port. @@ -175,6 +176,7 @@ struct atmel_uart_port { unsigned int pending_status; spinlock_t lock_suspended; +#ifdef CONFIG_PM struct { u32 cr; u32 mr; @@ -185,6 +187,7 @@ struct atmel_uart_port { u32 fmr; u32 fimr; } cache; +#endif int (*prepare_rx)(struct uart_port *port); int (*prepare_tx)(struct uart_port *port); diff --git a/include/linux/atmel_serial.h b/drivers/tty/serial/atmel_serial.h similarity index 100% rename from include/linux/atmel_serial.h rename to drivers/tty/serial/atmel_serial.h diff --git a/drivers/tty/serial/fsl_lpuart.c b/drivers/tty/serial/fsl_lpuart.c index f02934ffb3293de45601f434be9dd7786cafdfe4..15df1ba7809510b45c6d96b17d72e92064301973 100644 --- a/drivers/tty/serial/fsl_lpuart.c +++ b/drivers/tty/serial/fsl_lpuart.c @@ -1705,6 +1705,13 @@ lpuart_console_write(struct console *co, const char *s, unsigned int count) { struct lpuart_port *sport = lpuart_ports[co->index]; unsigned char old_cr2, cr2; + unsigned long flags; + int locked = 1; + + if (sport->port.sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&sport->port.lock, flags); + else + spin_lock_irqsave(&sport->port.lock, flags); /* first save CR2 and then disable interrupts */ cr2 = old_cr2 = readb(sport->port.membase + UARTCR2); @@ -1719,6 +1726,9 @@ lpuart_console_write(struct console *co, const char *s, unsigned int count) barrier(); writeb(old_cr2, sport->port.membase + UARTCR2); + + if (locked) + spin_unlock_irqrestore(&sport->port.lock, flags); } static void @@ -1726,6 +1736,13 @@ lpuart32_console_write(struct console *co, const char *s, unsigned int count) { struct lpuart_port *sport = lpuart_ports[co->index]; unsigned long old_cr, cr; + unsigned long flags; + int locked = 1; + + if (sport->port.sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&sport->port.lock, flags); + else + spin_lock_irqsave(&sport->port.lock, flags); /* first save CR2 and then disable interrupts */ cr = old_cr = lpuart32_read(sport->port.membase + UARTCTRL); @@ -1740,6 +1757,9 @@ lpuart32_console_write(struct console *co, const char *s, unsigned int count) barrier(); lpuart32_write(old_cr, sport->port.membase + UARTCTRL); + + if (locked) + 
spin_unlock_irqrestore(&sport->port.lock, flags); } /* diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index e3e152cbc75e71d7bc4e966f8a4ec3a1884c2b95..33509b4beaec237715de1e8febd63e861fbb9c15 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -719,6 +719,27 @@ static irqreturn_t imx_rxint(int irq, void *dev_id) return IRQ_HANDLED; } +static void imx_disable_rx_int(struct imx_port *sport) +{ + unsigned long temp; + + sport->dma_is_rxing = 1; + + /* disable the receiver ready and aging timer interrupts */ + temp = readl(sport->port.membase + UCR1); + temp &= ~(UCR1_RRDYEN); + writel(temp, sport->port.membase + UCR1); + + temp = readl(sport->port.membase + UCR2); + temp &= ~(UCR2_ATEN); + writel(temp, sport->port.membase + UCR2); + + /* disable the rx errors interrupts */ + temp = readl(sport->port.membase + UCR4); + temp &= ~UCR4_OREN; + writel(temp, sport->port.membase + UCR4); +} + static void clear_rx_errors(struct imx_port *sport); static int start_rx_dma(struct imx_port *sport); /* @@ -734,21 +755,8 @@ static void imx_dma_rxint(struct imx_port *sport) temp = readl(sport->port.membase + USR2); if ((temp & USR2_RDR) && !sport->dma_is_rxing) { - sport->dma_is_rxing = 1; - /* disable the receiver ready and aging timer interrupts */ - temp = readl(sport->port.membase + UCR1); - temp &= ~(UCR1_RRDYEN); - writel(temp, sport->port.membase + UCR1); - - temp = readl(sport->port.membase + UCR2); - temp &= ~(UCR2_ATEN); - writel(temp, sport->port.membase + UCR2); - - /* disable the rx errors interrupts */ - temp = readl(sport->port.membase + UCR4); - temp &= ~UCR4_OREN; - writel(temp, sport->port.membase + UCR4); + imx_disable_rx_int(sport); /* tell the DMA to receive the data. */ start_rx_dma(sport); @@ -1317,19 +1325,10 @@ static int imx_startup(struct uart_port *port) if (!is_imx1_uart(sport)) { temp = readl(sport->port.membase + UCR3); - /* - * The effect of RI and DCD differs depending on the UFCR_DCEDTE - * bit. In DCE mode they control the outputs, in DTE mode they - * enable the respective irqs. At least the DCD irq cannot be - * cleared on i.MX25 at least, so it's not usable and must be - * disabled. I don't have test hardware to check if RI has the - * same problem but I consider this likely so it's disabled for - * now, too. - */ - temp |= IMX21_UCR3_RXDMUXSEL | UCR3_ADNIMP | - UCR3_DTRDEN | UCR3_RI | UCR3_DCD; + temp |= UCR3_DTRDEN | UCR3_RI | UCR3_DCD; if (sport->dte_mode) + /* disable broken interrupts */ temp &= ~(UCR3_RI | UCR3_DCD); writel(temp, sport->port.membase + UCR3); @@ -1339,6 +1338,33 @@ static int imx_startup(struct uart_port *port) * Enable modem status interrupts */ imx_enable_ms(&sport->port); + + /* + * If the serial port is opened for reading start RX DMA immediately + * instead of waiting for RX FIFO interrupts. In our iMX53 the average + * delay for the first reception dropped from approximately 35000 + * microseconds to 1000 microseconds. 
+ */ + if (sport->dma_is_enabled) { + struct tty_struct *tty = sport->port.state->port.tty; + struct tty_file_private *file_priv; + int readcnt = 0; + + spin_lock(&tty->files_lock); + + if (!list_empty(&tty->tty_files)) + list_for_each_entry(file_priv, &tty->tty_files, list) + if (!(file_priv->file->f_flags & O_WRONLY)) + readcnt++; + + spin_unlock(&tty->files_lock); + + if (readcnt > 0) { + imx_disable_rx_int(sport); + start_rx_dma(sport); + } + } + spin_unlock_irqrestore(&sport->port.lock, flags); return 0; @@ -1584,8 +1610,6 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios, ufcr = readl(sport->port.membase + UFCR); ufcr = (ufcr & (~UFCR_RFDIV)) | UFCR_RFDIV_REG(div); - if (sport->dte_mode) - ufcr |= UFCR_DCEDTE; writel(ufcr, sport->port.membase + UFCR); writel(num, sport->port.membase + UBIR); @@ -2153,6 +2177,27 @@ static int serial_imx_probe(struct platform_device *pdev) UCR1_TXMPTYEN | UCR1_RTSDEN); writel_relaxed(reg, sport->port.membase + UCR1); + if (!is_imx1_uart(sport) && sport->dte_mode) { + /* + * The DCEDTE bit changes the direction of DSR, DCD, DTR and RI + * and influences if UCR3_RI and UCR3_DCD changes the level of RI + * and DCD (when they are outputs) or enables the respective + * irqs. So set this bit early, i.e. before requesting irqs. + */ + writel(UFCR_DCEDTE, sport->port.membase + UFCR); + + /* + * Disable UCR3_RI and UCR3_DCD irqs. They are also not + * enabled later because they cannot be cleared + * (confirmed on i.MX25) which makes them unusable. + */ + writel(IMX21_UCR3_RXDMUXSEL | UCR3_ADNIMP | UCR3_DSR, + sport->port.membase + UCR3); + + } else { + writel(0, sport->port.membase + UFCR); + } + clk_disable_unprepare(sport->clk_ipg); /* diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c index 6c6f82ad8d5cb63982fc35c9047e64a8a18135a0..1ea05ac57aa7197b4f690776acc609d376faade4 100644 --- a/drivers/tty/serial/omap-serial.c +++ b/drivers/tty/serial/omap-serial.c @@ -1597,6 +1597,9 @@ static struct omap_uart_port_info *of_get_uart_port_info(struct device *dev) of_property_read_u32(dev->of_node, "clock-frequency", &omap_up_info->uartclk); + + omap_up_info->flags = UPF_BOOT_AUTOCONF; + return omap_up_info; } @@ -1767,7 +1770,8 @@ static int serial_omap_probe(struct platform_device *pdev) return 0; err_add_port: - pm_runtime_put(&pdev->dev); + pm_runtime_dont_use_autosuspend(&pdev->dev); + pm_runtime_put_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); pm_qos_remove_request(&up->pm_qos_request); device_init_wakeup(up->dev, false); @@ -1780,9 +1784,13 @@ static int serial_omap_remove(struct platform_device *dev) { struct uart_omap_port *up = platform_get_drvdata(dev); + pm_runtime_get_sync(up->dev); + + uart_remove_one_port(&serial_omap_reg, &up->port); + + pm_runtime_dont_use_autosuspend(up->dev); pm_runtime_put_sync(up->dev); pm_runtime_disable(up->dev); - uart_remove_one_port(&serial_omap_reg, &up->port); pm_qos_remove_request(&up->pm_qos_request); device_init_wakeup(&dev->dev, false); diff --git a/drivers/tty/serial/samsung.c b/drivers/tty/serial/samsung.c index 7a17aedbf902e05034129a832941f27fe5dc38c8..8aca18c4cdea4076e8666baff7d7f4953d6a6e1f 100644 --- a/drivers/tty/serial/samsung.c +++ b/drivers/tty/serial/samsung.c @@ -859,7 +859,7 @@ static void s3c24xx_serial_break_ctl(struct uart_port *port, int break_state) static int s3c24xx_serial_request_dma(struct s3c24xx_uart_port *p) { struct s3c24xx_uart_dma *dma = p->dma; - unsigned long flags; + int ret; /* Default slave configuration parameters */ 
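The imx_startup() hunk above decides whether to arm RX DMA at open time by walking the file handles attached to the tty and checking for one with read access. The same check, pulled out into a self-contained helper for readability (the helper name is hypothetical; the body mirrors the hunk):

    /* Count open handles on @tty that can read; sketch of the check above */
    static int foo_tty_readable_handles(struct tty_struct *tty)
    {
            struct tty_file_private *file_priv;
            int readcnt = 0;

            spin_lock(&tty->files_lock);
            list_for_each_entry(file_priv, &tty->tty_files, list)
                    if (!(file_priv->file->f_flags & O_WRONLY))
                            readcnt++;
            spin_unlock(&tty->files_lock);

            return readcnt;
    }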
dma->rx_conf.direction = DMA_DEV_TO_MEM; @@ -884,8 +884,8 @@ static int s3c24xx_serial_request_dma(struct s3c24xx_uart_port *p) dma->tx_chan = dma_request_chan(p->port.dev, "tx"); if (IS_ERR(dma->tx_chan)) { - dma_release_channel(dma->rx_chan); - return PTR_ERR(dma->tx_chan); + ret = PTR_ERR(dma->tx_chan); + goto err_release_rx; } dmaengine_slave_config(dma->tx_chan, &dma->tx_conf); @@ -894,26 +894,38 @@ static int s3c24xx_serial_request_dma(struct s3c24xx_uart_port *p) dma->rx_size = PAGE_SIZE; dma->rx_buf = kmalloc(dma->rx_size, GFP_KERNEL); - if (!dma->rx_buf) { - dma_release_channel(dma->rx_chan); - dma_release_channel(dma->tx_chan); - return -ENOMEM; + ret = -ENOMEM; + goto err_release_tx; } - dma->rx_addr = dma_map_single(dma->rx_chan->device->dev, dma->rx_buf, + dma->rx_addr = dma_map_single(p->port.dev, dma->rx_buf, dma->rx_size, DMA_FROM_DEVICE); - - spin_lock_irqsave(&p->port.lock, flags); + if (dma_mapping_error(p->port.dev, dma->rx_addr)) { + ret = -EIO; + goto err_free_rx; + } /* TX buffer */ - dma->tx_addr = dma_map_single(dma->tx_chan->device->dev, - p->port.state->xmit.buf, + dma->tx_addr = dma_map_single(p->port.dev, p->port.state->xmit.buf, UART_XMIT_SIZE, DMA_TO_DEVICE); - - spin_unlock_irqrestore(&p->port.lock, flags); + if (dma_mapping_error(p->port.dev, dma->tx_addr)) { + ret = -EIO; + goto err_unmap_rx; + } return 0; + +err_unmap_rx: + dma_unmap_single(p->port.dev, dma->rx_addr, dma->rx_size, + DMA_FROM_DEVICE); +err_free_rx: + kfree(dma->rx_buf); +err_release_tx: + dma_release_channel(dma->tx_chan); +err_release_rx: + dma_release_channel(dma->rx_chan); + return ret; } static void s3c24xx_serial_release_dma(struct s3c24xx_uart_port *p) @@ -922,7 +934,7 @@ static void s3c24xx_serial_release_dma(struct s3c24xx_uart_port *p) if (dma->rx_chan) { dmaengine_terminate_all(dma->rx_chan); - dma_unmap_single(dma->rx_chan->device->dev, dma->rx_addr, + dma_unmap_single(p->port.dev, dma->rx_addr, dma->rx_size, DMA_FROM_DEVICE); kfree(dma->rx_buf); dma_release_channel(dma->rx_chan); @@ -931,7 +943,7 @@ static void s3c24xx_serial_release_dma(struct s3c24xx_uart_port *p) if (dma->tx_chan) { dmaengine_terminate_all(dma->tx_chan); - dma_unmap_single(dma->tx_chan->device->dev, dma->tx_addr, + dma_unmap_single(p->port.dev, dma->tx_addr, UART_XMIT_SIZE, DMA_TO_DEVICE); dma_release_channel(dma->tx_chan); dma->tx_chan = NULL; diff --git a/drivers/tty/serial/sb1250-duart.c b/drivers/tty/serial/sb1250-duart.c index 771f361c47ea17850cf4b946d135795e100c0865..041625cc24bbe736cdc021fa0385fbe332f2d792 100644 --- a/drivers/tty/serial/sb1250-duart.c +++ b/drivers/tty/serial/sb1250-duart.c @@ -41,7 +41,7 @@ #include #include -#include +#include #include #include @@ -103,7 +103,7 @@ struct sbd_port { struct sbd_duart { struct sbd_port sport[2]; unsigned long mapctrl; - atomic_t map_guard; + refcount_t map_guard; }; #define to_sport(uport) container_of(uport, struct sbd_port, port) @@ -654,15 +654,13 @@ static void sbd_release_port(struct uart_port *uport) { struct sbd_port *sport = to_sport(uport); struct sbd_duart *duart = sport->duart; - int map_guard; iounmap(sport->memctrl); sport->memctrl = NULL; iounmap(uport->membase); uport->membase = NULL; - map_guard = atomic_add_return(-1, &duart->map_guard); - if (!map_guard) + if(refcount_dec_and_test(&duart->map_guard)) release_mem_region(duart->mapctrl, DUART_CHANREG_SPACING); release_mem_region(uport->mapbase, DUART_CHANREG_SPACING); } @@ -698,7 +696,6 @@ static int sbd_request_port(struct uart_port *uport) { const char *err = KERN_ERR "sbd: Unable to 
reserve MMIO resource\n"; struct sbd_duart *duart = to_sport(uport)->duart; - int map_guard; int ret = 0; if (!request_mem_region(uport->mapbase, DUART_CHANREG_SPACING, @@ -706,11 +703,11 @@ static int sbd_request_port(struct uart_port *uport) printk(err); return -EBUSY; } - map_guard = atomic_add_return(1, &duart->map_guard); - if (map_guard == 1) { + refcount_inc(&duart->map_guard); + if (refcount_read(&duart->map_guard) == 1) { if (!request_mem_region(duart->mapctrl, DUART_CHANREG_SPACING, "sb1250-duart")) { - atomic_add(-1, &duart->map_guard); + refcount_dec(&duart->map_guard); printk(err); ret = -EBUSY; } @@ -718,8 +715,7 @@ static int sbd_request_port(struct uart_port *uport) if (!ret) { ret = sbd_map_port(uport); if (ret) { - map_guard = atomic_add_return(-1, &duart->map_guard); - if (!map_guard) + if (refcount_dec_and_test(&duart->map_guard)) release_mem_region(duart->mapctrl, DUART_CHANREG_SPACING); } diff --git a/drivers/tty/serial/serial_core.c b/drivers/tty/serial/serial_core.c index 3fe56894974a7c4c3ae04a4d54946043be1d243c..0f45b7884a2c58a299668591e87bead1f6628f98 100644 --- a/drivers/tty/serial/serial_core.c +++ b/drivers/tty/serial/serial_core.c @@ -2117,9 +2117,8 @@ int uart_suspend_port(struct uart_driver *drv, struct uart_port *uport) for (tries = 3; !ops->tx_empty(uport) && tries; tries--) msleep(10); if (!tries) - dev_err(uport->dev, "%s%d: Unable to drain transmitter\n", - drv->dev_name, - drv->tty_driver->name_base + uport->line); + dev_err(uport->dev, "%s: Unable to drain transmitter\n", + uport->name); ops->shutdown(uport); } @@ -2248,11 +2247,10 @@ uart_report_port(struct uart_driver *drv, struct uart_port *port) break; } - printk(KERN_INFO "%s%s%s%d at %s (irq = %d, base_baud = %d) is a %s\n", + pr_info("%s%s%s at %s (irq = %d, base_baud = %d) is a %s\n", port->dev ? dev_name(port->dev) : "", port->dev ? 
": " : "", - drv->dev_name, - drv->tty_driver->name_base + port->line, + port->name, address, port->irq, port->uartclk / 16, uart_type(port)); } @@ -2331,9 +2329,6 @@ static int uart_poll_init(struct tty_driver *driver, int line, char *options) int flow = 'n'; int ret = 0; - if (!state) - return -1; - tport = &state->port; mutex_lock(&tport->mutex); @@ -2368,13 +2363,12 @@ static int uart_poll_get_char(struct tty_driver *driver, int line) struct uart_port *port; int ret = -1; - if (state) { - port = uart_port_ref(state); - if (port) { - ret = port->ops->poll_get_char(port); - uart_port_deref(port); - } + port = uart_port_ref(state); + if (port) { + ret = port->ops->poll_get_char(port); + uart_port_deref(port); } + return ret; } @@ -2384,9 +2378,6 @@ static void uart_poll_put_char(struct tty_driver *driver, int line, char ch) struct uart_state *state = drv->state + line; struct uart_port *port; - if (!state) - return; - port = uart_port_ref(state); if (!port) return; @@ -2751,6 +2742,12 @@ int uart_add_one_port(struct uart_driver *drv, struct uart_port *uport) state->pm_state = UART_PM_STATE_UNDEFINED; uport->cons = drv->cons; uport->minor = drv->tty_driver->minor_start + uport->line; + uport->name = kasprintf(GFP_KERNEL, "%s%d", drv->dev_name, + drv->tty_driver->name_base + uport->line); + if (!uport->name) { + ret = -ENOMEM; + goto out; + } /* * If this port is a console, then the spinlock is already @@ -2868,6 +2865,7 @@ int uart_remove_one_port(struct uart_driver *drv, struct uart_port *uport) if (uport->type != PORT_UNKNOWN && uport->ops->release_port) uport->ops->release_port(uport); kfree(uport->tty_groups); + kfree(uport->name); /* * Indicate that there isn't a port here anymore. diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index 9a47cc4f16a2798f7c0a084d44621d371a2f59a9..71707e8e6e3ffe768ad76e52b118b74e32ad35e0 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -683,24 +683,37 @@ static void sci_init_pins(struct uart_port *port, unsigned int cflag) } if (port->type == PORT_SCIFA || port->type == PORT_SCIFB) { + u16 data = serial_port_in(port, SCPDR); u16 ctrl = serial_port_in(port, SCPCR); /* Enable RXD and TXD pin functions */ ctrl &= ~(SCPCR_RXDC | SCPCR_TXDC); if (to_sci_port(port)->has_rtscts) { - /* RTS# is output, driven 1 */ - ctrl |= SCPCR_RTSC; - serial_port_out(port, SCPDR, - serial_port_in(port, SCPDR) | SCPDR_RTSD); + /* RTS# is output, active low, unless autorts */ + if (!(port->mctrl & TIOCM_RTS)) { + ctrl |= SCPCR_RTSC; + data |= SCPDR_RTSD; + } else if (!s->autorts) { + ctrl |= SCPCR_RTSC; + data &= ~SCPDR_RTSD; + } else { + /* Enable RTS# pin function */ + ctrl &= ~SCPCR_RTSC; + } /* Enable CTS# pin function */ ctrl &= ~SCPCR_CTSC; } + serial_port_out(port, SCPDR, data); serial_port_out(port, SCPCR, ctrl); } else if (sci_getreg(port, SCSPTR)->size) { u16 status = serial_port_in(port, SCSPTR); - /* RTS# is output, driven 1 */ - status |= SCSPTR_RTSIO | SCSPTR_RTSDT; + /* RTS# is always output; and active low, unless autorts */ + status |= SCSPTR_RTSIO; + if (!(port->mctrl & TIOCM_RTS)) + status |= SCSPTR_RTSDT; + else if (!s->autorts) + status &= ~SCSPTR_RTSDT; /* CTS# and SCK are inputs */ status &= ~(SCSPTR_CTSIO | SCSPTR_SCKIO); serial_port_out(port, SCSPTR, status); @@ -1985,11 +1998,13 @@ static int sci_startup(struct uart_port *port) dev_dbg(port->dev, "%s(%d)\n", __func__, port->line); + sci_request_dma(port); + ret = sci_request_irq(s); - if (unlikely(ret < 0)) + if (unlikely(ret < 0)) { + 
sci_free_dma(port); return ret; - - sci_request_dma(port); + } return 0; } @@ -2021,8 +2036,8 @@ static void sci_shutdown(struct uart_port *port) } #endif - sci_free_dma(port); sci_free_irq(s); + sci_free_dma(port); } static int sci_sck_calc(struct sci_port *s, unsigned int bps, @@ -2157,10 +2172,6 @@ static void sci_reset(struct uart_port *port) unsigned int status; struct sci_port *s = to_sci_port(port); - do { - status = serial_port_in(port, SCxSR); - } while (!(status & SCxSR_TEND(port))); - serial_port_out(port, SCSCR, 0x00); /* TE=0, RE=0, CKE1=0 */ reg = sci_getreg(port, SCFCR); @@ -2374,6 +2385,10 @@ static void sci_set_termios(struct uart_port *port, struct ktermios *termios, serial_port_out(port, SCFCR, ctrl); } + if (port->flags & UPF_HARD_FLOW) { + /* Refresh (Auto) RTS */ + sci_set_mctrl(port, port->mctrl); + } scr_val |= SCSCR_RE | SCSCR_TE | (s->cfg->scscr & ~(SCSCR_CKE1 | SCSCR_CKE0)); diff --git a/drivers/tty/serial/sprd_serial.c b/drivers/tty/serial/sprd_serial.c index d98e3dc4838e637b34d3e3984c9570c656fa37b3..90996ad97b3718cfa85a0505a47b7e8d7be770c9 100644 --- a/drivers/tty/serial/sprd_serial.c +++ b/drivers/tty/serial/sprd_serial.c @@ -36,7 +36,7 @@ #define SPRD_FIFO_SIZE 128 #define SPRD_DEF_RATE 26000000 #define SPRD_BAUD_IO_LIMIT 3000000 -#define SPRD_TIMEOUT 256 +#define SPRD_TIMEOUT 256000 /* the offset of serial registers and BITs for them */ /* data registers */ diff --git a/drivers/tty/serial/st-asc.c b/drivers/tty/serial/st-asc.c index c334bcc59c649eedc2933ac29c4dc1ef45ae21d2..f5335be344f67ced2a601c81e3713a7d90e275c7 100644 --- a/drivers/tty/serial/st-asc.c +++ b/drivers/tty/serial/st-asc.c @@ -887,13 +887,12 @@ static void asc_console_write(struct console *co, const char *s, unsigned count) int locked = 1; u32 intenable; - local_irq_save(flags); if (port->sysrq) locked = 0; /* asc_interrupt has already claimed the lock */ else if (oops_in_progress) - locked = spin_trylock(&port->lock); + locked = spin_trylock_irqsave(&port->lock, flags); else - spin_lock(&port->lock); + spin_lock_irqsave(&port->lock, flags); /* * Disable interrupts so we don't get the IRQ line bouncing @@ -911,14 +910,13 @@ static void asc_console_write(struct console *co, const char *s, unsigned count) asc_out(port, ASC_INTEN, intenable); if (locked) - spin_unlock(&port->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&port->lock, flags); } static int asc_console_setup(struct console *co, char *options) { struct asc_port *ascport; - int baud = 9600; + int baud = 115200; int bits = 8; int parity = 'n'; int flow = 'n'; @@ -986,7 +984,7 @@ static struct platform_driver asc_serial_driver = { static int __init asc_init(void) { int ret; - static char banner[] __initdata = + static const char banner[] __initconst = KERN_INFO "STMicroelectronics ASC driver initialized\n"; printk(banner); diff --git a/drivers/tty/serial/uartlite.c b/drivers/tty/serial/uartlite.c index 817bb0d3f326ed982631c135ddd5f7a4b170b7a5..c9b8d702dadc35abcb6f3b74a07f62e836b2f9e2 100644 --- a/drivers/tty/serial/uartlite.c +++ b/drivers/tty/serial/uartlite.c @@ -28,7 +28,7 @@ #define ULITE_NAME "ttyUL" #define ULITE_MAJOR 204 #define ULITE_MINOR 187 -#define ULITE_NR_UARTS 16 +#define ULITE_NR_UARTS CONFIG_SERIAL_UARTLITE_NR_UARTS /* --------------------------------------------------------------------- * Register definitions diff --git a/drivers/tty/serial/xilinx_uartps.c b/drivers/tty/serial/xilinx_uartps.c index ad77d0ed0c467dbc2b20f41a9959ed10cf3e8390..c0539950f8d7f29725dd4433e6905d5e3d07b1c7 100644 --- 
a/drivers/tty/serial/xilinx_uartps.c +++ b/drivers/tty/serial/xilinx_uartps.c @@ -30,6 +30,7 @@ #include #include #include +#include #define CDNS_UART_TTY_NAME "ttyPS" #define CDNS_UART_NAME "xuartps" @@ -176,6 +177,7 @@ MODULE_PARM_DESC(rx_timeout, "Rx timeout, 1-255"); #define CDNS_UART_BDIV_MIN 4 #define CDNS_UART_BDIV_MAX 255 #define CDNS_UART_CD_MAX 65535 +#define UART_AUTOSUSPEND_TIMEOUT 3000 /** * struct cdns_uart - device data @@ -1065,16 +1067,13 @@ static void cdns_uart_poll_put_char(struct uart_port *port, unsigned char c) static void cdns_uart_pm(struct uart_port *port, unsigned int state, unsigned int oldstate) { - struct cdns_uart *cdns_uart = port->private_data; - switch (state) { case UART_PM_STATE_OFF: - clk_disable(cdns_uart->uartclk); - clk_disable(cdns_uart->pclk); + pm_runtime_mark_last_busy(port->dev); + pm_runtime_put_autosuspend(port->dev); break; default: - clk_enable(cdns_uart->pclk); - clk_enable(cdns_uart->uartclk); + pm_runtime_get_sync(port->dev); break; } } @@ -1353,12 +1352,7 @@ static int cdns_uart_suspend(struct device *device) * the suspend. */ uart_suspend_port(&cdns_uart_uart_driver, port); - if (console_suspend_enabled && !may_wake) { - struct cdns_uart *cdns_uart = port->private_data; - - clk_disable(cdns_uart->uartclk); - clk_disable(cdns_uart->pclk); - } else { + if (!(console_suspend_enabled && !may_wake)) { unsigned long flags = 0; spin_lock_irqsave(&port->lock, flags); @@ -1423,6 +1417,8 @@ static int cdns_uart_resume(struct device *device) ctrl_reg |= CDNS_UART_CR_TX_EN | CDNS_UART_CR_RX_EN; writel(ctrl_reg, port->membase + CDNS_UART_CR); + clk_disable(cdns_uart->uartclk); + clk_disable(cdns_uart->pclk); spin_unlock_irqrestore(&port->lock, flags); } else { spin_lock_irqsave(&port->lock, flags); @@ -1436,9 +1432,33 @@ static int cdns_uart_resume(struct device *device) return uart_resume_port(&cdns_uart_uart_driver, port); } #endif /* ! 
CONFIG_PM_SLEEP */ +static int __maybe_unused cdns_runtime_suspend(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct uart_port *port = platform_get_drvdata(pdev); + struct cdns_uart *cdns_uart = port->private_data; + + clk_disable(cdns_uart->uartclk); + clk_disable(cdns_uart->pclk); + return 0; +}; + +static int __maybe_unused cdns_runtime_resume(struct device *dev) +{ + struct platform_device *pdev = to_platform_device(dev); + struct uart_port *port = platform_get_drvdata(pdev); + struct cdns_uart *cdns_uart = port->private_data; -static SIMPLE_DEV_PM_OPS(cdns_uart_dev_pm_ops, cdns_uart_suspend, - cdns_uart_resume); + clk_enable(cdns_uart->pclk); + clk_enable(cdns_uart->uartclk); + return 0; +}; + +static const struct dev_pm_ops cdns_uart_dev_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(cdns_uart_suspend, cdns_uart_resume) + SET_RUNTIME_PM_OPS(cdns_runtime_suspend, + cdns_runtime_resume, NULL) +}; static const struct cdns_platform_data zynqmp_uart_def = { .quirks = CDNS_UART_RXBS_SUPPORT, }; @@ -1501,12 +1521,12 @@ static int cdns_uart_probe(struct platform_device *pdev) return PTR_ERR(cdns_uart_data->uartclk); } - rc = clk_prepare(cdns_uart_data->pclk); + rc = clk_prepare_enable(cdns_uart_data->pclk); if (rc) { dev_err(&pdev->dev, "Unable to enable pclk clock.\n"); return rc; } - rc = clk_prepare(cdns_uart_data->uartclk); + rc = clk_prepare_enable(cdns_uart_data->uartclk); if (rc) { dev_err(&pdev->dev, "Unable to enable device clock.\n"); goto err_out_clk_dis_pclk; @@ -1558,6 +1578,11 @@ static int cdns_uart_probe(struct platform_device *pdev) cdns_uart_data->port = port; platform_set_drvdata(pdev, port); + pm_runtime_use_autosuspend(&pdev->dev); + pm_runtime_set_autosuspend_delay(&pdev->dev, UART_AUTOSUSPEND_TIMEOUT); + pm_runtime_set_active(&pdev->dev); + pm_runtime_enable(&pdev->dev); + rc = uart_add_one_port(&cdns_uart_uart_driver, port); if (rc) { dev_err(&pdev->dev, @@ -1573,9 +1598,12 @@ static int cdns_uart_probe(struct platform_device *pdev) &cdns_uart_data->clk_rate_change_nb); #endif err_out_clk_disable: - clk_unprepare(cdns_uart_data->uartclk); + pm_runtime_disable(&pdev->dev); + pm_runtime_set_suspended(&pdev->dev); + pm_runtime_dont_use_autosuspend(&pdev->dev); + clk_disable_unprepare(cdns_uart_data->uartclk); err_out_clk_dis_pclk: - clk_unprepare(cdns_uart_data->pclk); + clk_disable_unprepare(cdns_uart_data->pclk); return rc; } @@ -1599,8 +1627,11 @@ static int cdns_uart_remove(struct platform_device *pdev) #endif rc = uart_remove_one_port(&cdns_uart_uart_driver, port); port->mapbase = 0; - clk_unprepare(cdns_uart_data->uartclk); - clk_unprepare(cdns_uart_data->pclk); + clk_disable_unprepare(cdns_uart_data->uartclk); + clk_disable_unprepare(cdns_uart_data->pclk); + pm_runtime_disable(&pdev->dev); + pm_runtime_set_suspended(&pdev->dev); + pm_runtime_dont_use_autosuspend(&pdev->dev); return rc; } diff --git a/drivers/tty/synclink.c b/drivers/tty/synclink.c index 657eed82eeb372d96e5f480bae3eb7c92dee06fa..a2c308f7d6374515bf8a0ac0a9e6ea8b59f5032b 100644 --- a/drivers/tty/synclink.c +++ b/drivers/tty/synclink.c @@ -869,9 +869,9 @@ static int txholdbufs[MAX_TOTAL_DEVICES]; module_param(break_on_load, bool, 0); module_param(ttymajor, int, 0); -module_param_array(io, int, NULL, 0); -module_param_array(irq, int, NULL, 0); -module_param_array(dma, int, NULL, 0); +module_param_hw_array(io, int, ioport, NULL, 0); +module_param_hw_array(irq, int, irq, NULL, 0); +module_param_hw_array(dma, int, dma, NULL, 0); module_param(debug_level, int, 0); 
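The cdns_uart changes above follow the usual runtime-PM conversion shape: probe brings the clocks up once with clk_prepare_enable(), tells the PM core the device is active, and from then on clock gating lives in the runtime callbacks instead of in the uart ->pm() hook. A minimal sketch of that shape, assuming a hypothetical single-clock foo driver:

    /* Hedged sketch; foo_* names and the single-clock layout are assumptions */
    struct foo_uart {
            struct clk *clk;
    };

    static int __maybe_unused foo_runtime_suspend(struct device *dev)
    {
            struct foo_uart *priv = dev_get_drvdata(dev);

            clk_disable(priv->clk);         /* gate only; it stays prepared */
            return 0;
    }

    static int __maybe_unused foo_runtime_resume(struct device *dev)
    {
            struct foo_uart *priv = dev_get_drvdata(dev);

            return clk_enable(priv->clk);
    }

    static const struct dev_pm_ops foo_pm_ops = {
            SET_RUNTIME_PM_OPS(foo_runtime_suspend, foo_runtime_resume, NULL)
    };

    /* probe-time bring-up, after the clocks were clk_prepare_enable()d: */
    /*   pm_runtime_set_autosuspend_delay(&pdev->dev, 3000);               */
    /*   pm_runtime_use_autosuspend(&pdev->dev);                           */
    /*   pm_runtime_set_active(&pdev->dev);   (matches the live clocks)    */
    /*   pm_runtime_enable(&pdev->dev);                                    */

pm_runtime_get_sync()/pm_runtime_put_autosuspend() from the ->pm() hook then hold the device up only while the port is in use, which is what lets the 3000 ms UART_AUTOSUSPEND_TIMEOUT gate the clocks of idle ports.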
module_param_array(maxframe, int, NULL, 0); module_param_array(txdmabufs, int, NULL, 0); diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index c6fc7141d7b2814cc122d430c38d61bbde675358..3ffc1ce29023b5f6476d0006aa94b8b1b858c480 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -372,7 +372,7 @@ static void moom_callback(struct work_struct *ignored) mutex_lock(&oom_lock); if (!out_of_memory(&oc)) - pr_info("OOM request ignored because killer is disabled\n"); + pr_info("OOM request ignored. No task eligible\n"); mutex_unlock(&oom_lock); } @@ -446,7 +446,7 @@ static struct sysrq_key_op *sysrq_key_table[36] = { */ NULL, /* a */ &sysrq_reboot_op, /* b */ - &sysrq_crash_op, /* c & ibm_emac driver debug */ + &sysrq_crash_op, /* c */ &sysrq_showlocks_op, /* d */ &sysrq_term_op, /* e */ &sysrq_moom_op, /* f */ diff --git a/drivers/tty/tty_baudrate.c b/drivers/tty/tty_baudrate.c new file mode 100644 index 0000000000000000000000000000000000000000..5c33fd25676d59bc6453d7f4a5f4a46b2aadabf1 --- /dev/null +++ b/drivers/tty/tty_baudrate.c @@ -0,0 +1,232 @@ +/* + * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds + */ + +#include +#include +#include +#include +#include + + +/* + * Routine which returns the baud rate of the tty + * + * Note that the baud_table needs to be kept in sync with the + * include/asm/termbits.h file. + */ +static const speed_t baud_table[] = { + 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400, 4800, + 9600, 19200, 38400, 57600, 115200, 230400, 460800, +#ifdef __sparc__ + 76800, 153600, 307200, 614400, 921600 +#else + 500000, 576000, 921600, 1000000, 1152000, 1500000, 2000000, + 2500000, 3000000, 3500000, 4000000 +#endif +}; + +#ifndef __sparc__ +static const tcflag_t baud_bits[] = { + B0, B50, B75, B110, B134, B150, B200, B300, B600, + B1200, B1800, B2400, B4800, B9600, B19200, B38400, + B57600, B115200, B230400, B460800, B500000, B576000, + B921600, B1000000, B1152000, B1500000, B2000000, B2500000, + B3000000, B3500000, B4000000 +}; +#else +static const tcflag_t baud_bits[] = { + B0, B50, B75, B110, B134, B150, B200, B300, B600, + B1200, B1800, B2400, B4800, B9600, B19200, B38400, + B57600, B115200, B230400, B460800, B76800, B153600, + B307200, B614400, B921600 +}; +#endif + +static int n_baud_table = ARRAY_SIZE(baud_table); + +/** + * tty_termios_baud_rate + * @termios: termios structure + * + * Convert termios baud rate data into a speed. This should be called + * with the termios lock held if this termios is a terminal termios + * structure. May change the termios data. Device drivers can call this + * function but should use ->c_[io]speed directly as they are updated. + * + * Locking: none + */ + +speed_t tty_termios_baud_rate(struct ktermios *termios) +{ + unsigned int cbaud; + + cbaud = termios->c_cflag & CBAUD; + +#ifdef BOTHER + /* Magic token for arbitrary speed via c_ispeed/c_ospeed */ + if (cbaud == BOTHER) + return termios->c_ospeed; +#endif + if (cbaud & CBAUDEX) { + cbaud &= ~CBAUDEX; + + if (cbaud < 1 || cbaud + 15 > n_baud_table) + termios->c_cflag &= ~CBAUDEX; + else + cbaud += 15; + } + return baud_table[cbaud]; +} +EXPORT_SYMBOL(tty_termios_baud_rate); + +/** + * tty_termios_input_baud_rate + * @termios: termios structure + * + * Convert termios baud rate data into a speed. This should be called + * with the termios lock held if this termios is a terminal termios + * structure. May change the termios data. Device drivers can call this + * function but should use ->c_[io]speed directly as they are updated. 
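To make the table lookup in tty_termios_baud_rate() above concrete: with the common asm-generic termbits encoding, B38400 is a plain index (15), while B57600 carries CBAUDEX, so the flag is stripped (leaving 1) and 15 is added, giving baud_table[16] == 57600. As a snippet (the constants' values are architecture-specific, hence the hedging):

    struct ktermios kt = { .c_cflag = B57600 | CS8 | CREAD };
    speed_t sp = tty_termios_baud_rate(&kt);    /* -> 57600 */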
+ *
+ * Locking: none
+ */
+
+speed_t tty_termios_input_baud_rate(struct ktermios *termios)
+{
+#ifdef IBSHIFT
+	unsigned int cbaud = (termios->c_cflag >> IBSHIFT) & CBAUD;
+
+	if (cbaud == B0)
+		return tty_termios_baud_rate(termios);
+
+	/* Magic token for arbitrary speed via c_ispeed */
+	if (cbaud == BOTHER)
+		return termios->c_ispeed;
+
+	if (cbaud & CBAUDEX) {
+		cbaud &= ~CBAUDEX;
+
+		if (cbaud < 1 || cbaud + 15 > n_baud_table)
+			termios->c_cflag &= ~(CBAUDEX << IBSHIFT);
+		else
+			cbaud += 15;
+	}
+	return baud_table[cbaud];
+#else
+	return tty_termios_baud_rate(termios);
+#endif
+}
+EXPORT_SYMBOL(tty_termios_input_baud_rate);
+
+/**
+ * tty_termios_encode_baud_rate
+ * @termios: ktermios structure holding user requested state
+ * @ispeed: input speed
+ * @ospeed: output speed
+ *
+ * Encode the speeds set into the passed termios structure. This is
+ * used as a library helper for drivers so that they can report back
+ * the actual speed selected when it differs from the speed requested.
+ *
+ * For maximal backward compatibility with legacy SYS5/POSIX *nix behaviour
+ * we need to carefully set the bits when the user does not get the
+ * desired speed. We allow small margins and preserve as much as possible
+ * of the input intent to keep compatibility.
+ *
+ * Locking: Caller should hold termios lock. This is already held
+ * when calling this function from the driver termios handler.
+ *
+ * The ifdefs deal with platforms whose owners have yet to update them
+ * and will all go away once this is done.
+ */
+
+void tty_termios_encode_baud_rate(struct ktermios *termios,
+				  speed_t ibaud, speed_t obaud)
+{
+	int i = 0;
+	int ifound = -1, ofound = -1;
+	int iclose = ibaud/50, oclose = obaud/50;
+	int ibinput = 0;
+
+	if (obaud == 0)			/* CD dropped */
+		ibaud = 0;		/* Clear ibaud to be sure */
+
+	termios->c_ispeed = ibaud;
+	termios->c_ospeed = obaud;
+
+#ifdef BOTHER
+	/* If the user asked for a precise weird speed give a precise weird
+	   answer. If they asked for a Bfoo speed they may have problems
+	   digesting non-exact replies so fuzz a bit */
+
+	if ((termios->c_cflag & CBAUD) == BOTHER)
+		oclose = 0;
+	if (((termios->c_cflag >> IBSHIFT) & CBAUD) == BOTHER)
+		iclose = 0;
+	if ((termios->c_cflag >> IBSHIFT) & CBAUD)
+		ibinput = 1;	/* An input speed was specified */
+#endif
+	termios->c_cflag &= ~CBAUD;
+
+	/*
+	 * Our goal is to find a close match to the standard baud rate
+	 * returned. Walk the baud rate table and if we get a very close
+	 * match then report back the speed as a POSIX Bxxxx value by
+	 * preference.
+	 */
+
+	do {
+		if (obaud - oclose <= baud_table[i] &&
+		    obaud + oclose >= baud_table[i]) {
+			termios->c_cflag |= baud_bits[i];
+			ofound = i;
+		}
+		if (ibaud - iclose <= baud_table[i] &&
+		    ibaud + iclose >= baud_table[i]) {
+			/* For the case input == output don't set IBAUD bits
+			   if the user didn't do so */
+			if (ofound == i && !ibinput)
+				ifound = i;
+#ifdef IBSHIFT
+			else {
+				ifound = i;
+				termios->c_cflag |= (baud_bits[i] << IBSHIFT);
+			}
+#endif
+		}
+	} while (++i < n_baud_table);
+
+	/*
+	 * If we found no match then use BOTHER if provided or warn
+	 * the user their platform maintainer needs to wake up if not.
+ */ +#ifdef BOTHER + if (ofound == -1) + termios->c_cflag |= BOTHER; + /* Set exact input bits only if the input and output differ or the + user already did */ + if (ifound == -1 && (ibaud != obaud || ibinput)) + termios->c_cflag |= (BOTHER << IBSHIFT); +#else + if (ifound == -1 || ofound == -1) + pr_warn_once("tty: Unable to return correct speed data as your architecture needs updating.\n"); +#endif +} +EXPORT_SYMBOL_GPL(tty_termios_encode_baud_rate); + +/** + * tty_encode_baud_rate - set baud rate of the tty + * @ibaud: input baud rate + * @obad: output baud rate + * + * Update the current termios data for the tty with the new speed + * settings. The caller must hold the termios_rwsem for the tty in + * question. + */ + +void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud) +{ + tty_termios_encode_baud_rate(&tty->termios, ibaud, obaud); +} +EXPORT_SYMBOL_GPL(tty_encode_baud_rate); diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c index e6d1a6510886c5a807cda3b65d651046afe3a48d..0c150b5a9dd68d7bba0e91a50c45ecd5df831cac 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -377,65 +377,6 @@ struct tty_driver *tty_find_polling_driver(char *name, int *line) EXPORT_SYMBOL_GPL(tty_find_polling_driver); #endif -static int is_ignored(int sig) -{ - return (sigismember(¤t->blocked, sig) || - current->sighand->action[sig-1].sa.sa_handler == SIG_IGN); -} - -/** - * tty_check_change - check for POSIX terminal changes - * @tty: tty to check - * - * If we try to write to, or set the state of, a terminal and we're - * not in the foreground, send a SIGTTOU. If the signal is blocked or - * ignored, go ahead and perform the operation. (POSIX 7.2) - * - * Locking: ctrl_lock - */ - -int __tty_check_change(struct tty_struct *tty, int sig) -{ - unsigned long flags; - struct pid *pgrp, *tty_pgrp; - int ret = 0; - - if (current->signal->tty != tty) - return 0; - - rcu_read_lock(); - pgrp = task_pgrp(current); - - spin_lock_irqsave(&tty->ctrl_lock, flags); - tty_pgrp = tty->pgrp; - spin_unlock_irqrestore(&tty->ctrl_lock, flags); - - if (tty_pgrp && pgrp != tty->pgrp) { - if (is_ignored(sig)) { - if (sig == SIGTTIN) - ret = -EIO; - } else if (is_current_pgrp_orphaned()) - ret = -EIO; - else { - kill_pgrp(pgrp, sig, 1); - set_thread_flag(TIF_SIGPENDING); - ret = -ERESTARTSYS; - } - } - rcu_read_unlock(); - - if (!tty_pgrp) - tty_warn(tty, "sig=%d, tty->pgrp == NULL!\n", sig); - - return ret; -} - -int tty_check_change(struct tty_struct *tty) -{ - return __tty_check_change(tty, SIGTTOU); -} -EXPORT_SYMBOL(tty_check_change); - static ssize_t hung_up_tty_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { @@ -509,79 +450,6 @@ static const struct file_operations hung_up_tty_fops = { static DEFINE_SPINLOCK(redirect_lock); static struct file *redirect; - -void proc_clear_tty(struct task_struct *p) -{ - unsigned long flags; - struct tty_struct *tty; - spin_lock_irqsave(&p->sighand->siglock, flags); - tty = p->signal->tty; - p->signal->tty = NULL; - spin_unlock_irqrestore(&p->sighand->siglock, flags); - tty_kref_put(tty); -} - -/** - * proc_set_tty - set the controlling terminal - * - * Only callable by the session leader and only if it does not already have - * a controlling terminal. 
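The fuzz windows in tty_termios_encode_baud_rate() above are one part in fifty, i.e. 2%, per direction. With hypothetical numbers: a UART whose divisor only permits 115942 baud reports obaud = 115942, so oclose = 2318, and since 115200 lies within [113624, 118260] the user reads back a clean B115200 rather than a BOTHER pair. The typical driver-side caller looks roughly like this sketch (not any specific driver):

    static void foo_set_termios(struct uart_port *port, struct ktermios *termios,
                                struct ktermios *old)
    {
            unsigned int baud = uart_get_baud_rate(port, termios, old, 50, 115200);
            unsigned int quot = uart_get_divisor(port, baud);

            /* ... program @quot into the divisor registers here ... */

            /* report the rate actually achieved back into @termios */
            tty_termios_encode_baud_rate(termios, baud, baud);
    }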
- * - * Caller must hold: tty_lock() - * a readlock on tasklist_lock - * sighand lock - */ -static void __proc_set_tty(struct tty_struct *tty) -{ - unsigned long flags; - - spin_lock_irqsave(&tty->ctrl_lock, flags); - /* - * The session and fg pgrp references will be non-NULL if - * tiocsctty() is stealing the controlling tty - */ - put_pid(tty->session); - put_pid(tty->pgrp); - tty->pgrp = get_pid(task_pgrp(current)); - spin_unlock_irqrestore(&tty->ctrl_lock, flags); - tty->session = get_pid(task_session(current)); - if (current->signal->tty) { - tty_debug(tty, "current tty %s not NULL!!\n", - current->signal->tty->name); - tty_kref_put(current->signal->tty); - } - put_pid(current->signal->tty_old_pgrp); - current->signal->tty = tty_kref_get(tty); - current->signal->tty_old_pgrp = NULL; -} - -static void proc_set_tty(struct tty_struct *tty) -{ - spin_lock_irq(¤t->sighand->siglock); - __proc_set_tty(tty); - spin_unlock_irq(¤t->sighand->siglock); -} - -struct tty_struct *get_current_tty(void) -{ - struct tty_struct *tty; - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - tty = tty_kref_get(current->signal->tty); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - return tty; -} -EXPORT_SYMBOL_GPL(get_current_tty); - -static void session_clear_tty(struct pid *session) -{ - struct task_struct *p; - do_each_pid_task(session, PIDTYPE_SID, p) { - proc_clear_tty(p); - } while_each_pid_task(session, PIDTYPE_SID, p); -} - /** * tty_wakeup - request more data * @tty: terminal @@ -608,60 +476,6 @@ void tty_wakeup(struct tty_struct *tty) EXPORT_SYMBOL_GPL(tty_wakeup); -/** - * tty_signal_session_leader - sends SIGHUP to session leader - * @tty controlling tty - * @exit_session if non-zero, signal all foreground group processes - * - * Send SIGHUP and SIGCONT to the session leader and its process group. - * Optionally, signal all processes in the foreground process group. - * - * Returns the number of processes in the session with this tty - * as their controlling terminal. This value is used to drop - * tty references for those processes. - */ -static int tty_signal_session_leader(struct tty_struct *tty, int exit_session) -{ - struct task_struct *p; - int refs = 0; - struct pid *tty_pgrp = NULL; - - read_lock(&tasklist_lock); - if (tty->session) { - do_each_pid_task(tty->session, PIDTYPE_SID, p) { - spin_lock_irq(&p->sighand->siglock); - if (p->signal->tty == tty) { - p->signal->tty = NULL; - /* We defer the dereferences outside fo - the tasklist lock */ - refs++; - } - if (!p->signal->leader) { - spin_unlock_irq(&p->sighand->siglock); - continue; - } - __group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p); - __group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p); - put_pid(p->signal->tty_old_pgrp); /* A noop */ - spin_lock(&tty->ctrl_lock); - tty_pgrp = get_pid(tty->pgrp); - if (tty->pgrp) - p->signal->tty_old_pgrp = get_pid(tty->pgrp); - spin_unlock(&tty->ctrl_lock); - spin_unlock_irq(&p->sighand->siglock); - } while_each_pid_task(tty->session, PIDTYPE_SID, p); - } - read_unlock(&tasklist_lock); - - if (tty_pgrp) { - if (exit_session) - kill_pgrp(tty_pgrp, SIGHUP, exit_session); - put_pid(tty_pgrp); - } - - return refs; -} - /** * __tty_hangup - actual handler for hangup events * @work: tty device @@ -840,7 +654,7 @@ void tty_vhangup_self(void) * is complete. That guarantee is necessary for security reasons. 
*/ -static void tty_vhangup_session(struct tty_struct *tty) +void tty_vhangup_session(struct tty_struct *tty) { tty_debug_hangup(tty, "session hangup\n"); __tty_hangup(tty, 1); @@ -861,106 +675,6 @@ int tty_hung_up_p(struct file *filp) EXPORT_SYMBOL(tty_hung_up_p); -/** - * disassociate_ctty - disconnect controlling tty - * @on_exit: true if exiting so need to "hang up" the session - * - * This function is typically called only by the session leader, when - * it wants to disassociate itself from its controlling tty. - * - * It performs the following functions: - * (1) Sends a SIGHUP and SIGCONT to the foreground process group - * (2) Clears the tty from being controlling the session - * (3) Clears the controlling tty for all processes in the - * session group. - * - * The argument on_exit is set to 1 if called when a process is - * exiting; it is 0 if called by the ioctl TIOCNOTTY. - * - * Locking: - * BTM is taken for hysterical raisins, and held when - * called from no_tty(). - * tty_mutex is taken to protect tty - * ->siglock is taken to protect ->signal/->sighand - * tasklist_lock is taken to walk process list for sessions - * ->siglock is taken to protect ->signal/->sighand - */ - -void disassociate_ctty(int on_exit) -{ - struct tty_struct *tty; - - if (!current->signal->leader) - return; - - tty = get_current_tty(); - if (tty) { - if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY) { - tty_vhangup_session(tty); - } else { - struct pid *tty_pgrp = tty_get_pgrp(tty); - if (tty_pgrp) { - kill_pgrp(tty_pgrp, SIGHUP, on_exit); - if (!on_exit) - kill_pgrp(tty_pgrp, SIGCONT, on_exit); - put_pid(tty_pgrp); - } - } - tty_kref_put(tty); - - } else if (on_exit) { - struct pid *old_pgrp; - spin_lock_irq(¤t->sighand->siglock); - old_pgrp = current->signal->tty_old_pgrp; - current->signal->tty_old_pgrp = NULL; - spin_unlock_irq(¤t->sighand->siglock); - if (old_pgrp) { - kill_pgrp(old_pgrp, SIGHUP, on_exit); - kill_pgrp(old_pgrp, SIGCONT, on_exit); - put_pid(old_pgrp); - } - return; - } - - spin_lock_irq(¤t->sighand->siglock); - put_pid(current->signal->tty_old_pgrp); - current->signal->tty_old_pgrp = NULL; - - tty = tty_kref_get(current->signal->tty); - if (tty) { - unsigned long flags; - spin_lock_irqsave(&tty->ctrl_lock, flags); - put_pid(tty->session); - put_pid(tty->pgrp); - tty->session = NULL; - tty->pgrp = NULL; - spin_unlock_irqrestore(&tty->ctrl_lock, flags); - tty_kref_put(tty); - } else - tty_debug_hangup(tty, "no current tty\n"); - - spin_unlock_irq(¤t->sighand->siglock); - /* Now clear signal->tty under the lock */ - read_lock(&tasklist_lock); - session_clear_tty(task_session(current)); - read_unlock(&tasklist_lock); -} - -/** - * - * no_tty - Ensure the current process does not have a controlling tty - */ -void no_tty(void) -{ - /* FIXME: Review locking here. The tty_lock never covered any race - between a new association and proc_clear_tty but possible we need - to protect against this anyway */ - struct task_struct *tsk = current; - disassociate_ctty(0); - proc_clear_tty(tsk); -} - - /** * stop_tty - propagate flow control * @tty: tty to stop @@ -1520,7 +1234,7 @@ struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx) * This code guarantees that either everything succeeds and the * TTY is ready for operation, or else the table slots are vacated * and the allocated memory released. (Except that the termios - * and locked termios may be retained.) + * may be retained.) 
*/ if (!try_module_get(driver->owner)) @@ -2163,38 +1877,13 @@ static int tty_open(struct inode *inode, struct file *filp) } clear_bit(TTY_HUPPED, &tty->flags); - - read_lock(&tasklist_lock); - spin_lock_irq(¤t->sighand->siglock); noctty = (filp->f_flags & O_NOCTTY) || - (IS_ENABLED(CONFIG_VT) && device == MKDEV(TTY_MAJOR, 0)) || - device == MKDEV(TTYAUX_MAJOR, 1) || - (tty->driver->type == TTY_DRIVER_TYPE_PTY && - tty->driver->subtype == PTY_TYPE_MASTER); - - if (!noctty && - current->signal->leader && - !current->signal->tty && - tty->session == NULL) { - /* - * Don't let a process that only has write access to the tty - * obtain the privileges associated with having a tty as - * controlling terminal (being able to reopen it with full - * access through /dev/tty, being able to perform pushback). - * Many distributions set the group of all ttys to "tty" and - * grant write-only access to all terminals for setgid tty - * binaries, which should not imply full privileges on all ttys. - * - * This could theoretically break old code that performs open() - * on a write-only file descriptor. In that case, it might be - * necessary to also permit this if - * inode_permission(inode, MAY_READ) == 0. - */ - if (filp->f_mode & FMODE_READ) - __proc_set_tty(tty); - } - spin_unlock_irq(¤t->sighand->siglock); - read_unlock(&tasklist_lock); + (IS_ENABLED(CONFIG_VT) && device == MKDEV(TTY_MAJOR, 0)) || + device == MKDEV(TTYAUX_MAJOR, 1) || + (tty->driver->type == TTY_DRIVER_TYPE_PTY && + tty->driver->subtype == PTY_TYPE_MASTER); + if (!noctty) + tty_open_proc_set_tty(filp, tty); tty_unlock(tty); return 0; } @@ -2456,211 +2145,6 @@ static int fionbio(struct file *file, int __user *p) return 0; } -/** - * tiocsctty - set controlling tty - * @tty: tty structure - * @arg: user argument - * - * This ioctl is used to manage job control. It permits a session - * leader to set this tty as the controlling tty for the session. - * - * Locking: - * Takes tty_lock() to serialize proc_set_tty() for this tty - * Takes tasklist_lock internally to walk sessions - * Takes ->siglock() when updating signal->tty - */ - -static int tiocsctty(struct tty_struct *tty, struct file *file, int arg) -{ - int ret = 0; - - tty_lock(tty); - read_lock(&tasklist_lock); - - if (current->signal->leader && (task_session(current) == tty->session)) - goto unlock; - - /* - * The process must be a session leader and - * not have a controlling tty already. - */ - if (!current->signal->leader || current->signal->tty) { - ret = -EPERM; - goto unlock; - } - - if (tty->session) { - /* - * This tty is already the controlling - * tty for another session group! - */ - if (arg == 1 && capable(CAP_SYS_ADMIN)) { - /* - * Steal it away - */ - session_clear_tty(tty->session); - } else { - ret = -EPERM; - goto unlock; - } - } - - /* See the comment in tty_open(). */ - if ((file->f_mode & FMODE_READ) == 0 && !capable(CAP_SYS_ADMIN)) { - ret = -EPERM; - goto unlock; - } - - proc_set_tty(tty); -unlock: - read_unlock(&tasklist_lock); - tty_unlock(tty); - return ret; -} - -/** - * tty_get_pgrp - return a ref counted pgrp pid - * @tty: tty to read - * - * Returns a refcounted instance of the pid struct for the process - * group controlling the tty. 
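The rewritten tty_open() above keeps the old policy but funnels it through tty_open_proc_set_tty(): an open acquires the controlling terminal only when none of the noctty conditions hold. From userspace, the traditional opt-out is O_NOCTTY (illustrative snippet, device path arbitrary):

    #include <fcntl.h>

    /* O_NOCTTY opts out of becoming the controlling tty, even for a leader */
    static int open_modem(void)
    {
            return open("/dev/ttyS0", O_RDWR | O_NOCTTY);
    }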
- */ - -struct pid *tty_get_pgrp(struct tty_struct *tty) -{ - unsigned long flags; - struct pid *pgrp; - - spin_lock_irqsave(&tty->ctrl_lock, flags); - pgrp = get_pid(tty->pgrp); - spin_unlock_irqrestore(&tty->ctrl_lock, flags); - - return pgrp; -} -EXPORT_SYMBOL_GPL(tty_get_pgrp); - -/* - * This checks not only the pgrp, but falls back on the pid if no - * satisfactory pgrp is found. I dunno - gdb doesn't work correctly - * without this... - * - * The caller must hold rcu lock or the tasklist lock. - */ -static struct pid *session_of_pgrp(struct pid *pgrp) -{ - struct task_struct *p; - struct pid *sid = NULL; - - p = pid_task(pgrp, PIDTYPE_PGID); - if (p == NULL) - p = pid_task(pgrp, PIDTYPE_PID); - if (p != NULL) - sid = task_session(p); - - return sid; -} - -/** - * tiocgpgrp - get process group - * @tty: tty passed by user - * @real_tty: tty side of the tty passed by the user if a pty else the tty - * @p: returned pid - * - * Obtain the process group of the tty. If there is no process group - * return an error. - * - * Locking: none. Reference to current->signal->tty is safe. - */ - -static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) -{ - struct pid *pid; - int ret; - /* - * (tty == real_tty) is a cheap way of - * testing if the tty is NOT a master pty. - */ - if (tty == real_tty && current->signal->tty != real_tty) - return -ENOTTY; - pid = tty_get_pgrp(real_tty); - ret = put_user(pid_vnr(pid), p); - put_pid(pid); - return ret; -} - -/** - * tiocspgrp - attempt to set process group - * @tty: tty passed by user - * @real_tty: tty side device matching tty passed by user - * @p: pid pointer - * - * Set the process group of the tty to the session passed. Only - * permitted where the tty session is our session. - * - * Locking: RCU, ctrl lock - */ - -static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) -{ - struct pid *pgrp; - pid_t pgrp_nr; - int retval = tty_check_change(real_tty); - - if (retval == -EIO) - return -ENOTTY; - if (retval) - return retval; - if (!current->signal->tty || - (current->signal->tty != real_tty) || - (real_tty->session != task_session(current))) - return -ENOTTY; - if (get_user(pgrp_nr, p)) - return -EFAULT; - if (pgrp_nr < 0) - return -EINVAL; - rcu_read_lock(); - pgrp = find_vpid(pgrp_nr); - retval = -ESRCH; - if (!pgrp) - goto out_unlock; - retval = -EPERM; - if (session_of_pgrp(pgrp) != task_session(current)) - goto out_unlock; - retval = 0; - spin_lock_irq(&tty->ctrl_lock); - put_pid(real_tty->pgrp); - real_tty->pgrp = get_pid(pgrp); - spin_unlock_irq(&tty->ctrl_lock); -out_unlock: - rcu_read_unlock(); - return retval; -} - -/** - * tiocgsid - get session id - * @tty: tty passed by user - * @real_tty: tty side of the tty passed by the user if a pty else the tty - * @p: pointer to returned session id - * - * Obtain the session id of the tty. If there is no session - * return an error. - * - * Locking: none. Reference to current->signal->tty is safe. - */ - -static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) -{ - /* - * (tty == real_tty) is a cheap way of - * testing if the tty is NOT a master pty. 
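tiocgpgrp() and tiocspgrp(), removed here, are the kernel half of tcgetpgrp()/tcsetpgrp(); they move verbatim into tty_jobctrl.c along with the rest of this block. A shell hands the terminal to a foreground job roughly like so (a sketch; job_pgid is hypothetical and error handling is omitted):

    #include <unistd.h>
    #include <sys/types.h>

    static void foreground(pid_t job_pgid)
    {
            tcsetpgrp(STDIN_FILENO, job_pgid);      /* TIOCSPGRP under the hood */
    }

    static pid_t who_owns_terminal(void)
    {
            return tcgetpgrp(STDIN_FILENO);         /* TIOCGPGRP under the hood */
    }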
- */ - if (tty == real_tty && current->signal->tty != real_tty) - return -ENOTTY; - if (!real_tty->session) - return -ENOTTY; - return put_user(pid_vnr(real_tty->session), p); -} - /** * tiocsetd - set line discipline * @tty: tty device @@ -2843,8 +2327,8 @@ static void tty_warn_deprecated_flags(struct serial_struct __user *ss) flags &= ASYNC_DEPRECATED; if (flags && __ratelimit(&depr_flags)) - pr_warning("%s: '%s' is using deprecated serial flags (with no effect): %.8x\n", - __func__, get_task_comm(comm, current), flags); + pr_warn("%s: '%s' is using deprecated serial flags (with no effect): %.8x\n", + __func__, get_task_comm(comm, current), flags); } /* @@ -2920,19 +2404,6 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) int excl = test_bit(TTY_EXCLUSIVE, &tty->flags); return put_user(excl, (int __user *)p); } - case TIOCNOTTY: - if (current->signal->tty != tty) - return -ENOTTY; - no_tty(); - return 0; - case TIOCSCTTY: - return tiocsctty(real_tty, file, arg); - case TIOCGPGRP: - return tiocgpgrp(tty, real_tty, p); - case TIOCSPGRP: - return tiocspgrp(tty, real_tty, p); - case TIOCGSID: - return tiocgsid(tty, real_tty, p); case TIOCGETD: return tiocgetd(tty, p); case TIOCSETD: @@ -2993,6 +2464,10 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case TIOCSSERIAL: tty_warn_deprecated_flags(p); break; + default: + retval = tty_jobctrl_ioctl(tty, real_tty, file, cmd, arg); + if (retval != -ENOIOCTLCMD) + return retval; } if (tty->ops->ioctl) { retval = tty->ops->ioctl(tty, cmd, arg); @@ -3293,9 +2768,9 @@ struct device *tty_register_device_attr(struct tty_driver *driver, { char name[64]; dev_t devt = MKDEV(driver->major, driver->minor_start) + index; - struct device *dev = NULL; - int retval = -ENODEV; - bool cdev = false; + struct ktermios *tp; + struct device *dev; + int retval; if (index >= driver->num) { pr_err("%s: Attempt to register invalid tty line number (%d)\n", @@ -3308,18 +2783,9 @@ struct device *tty_register_device_attr(struct tty_driver *driver, else tty_line_name(driver, index, name); - if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { - retval = tty_cdev_add(driver, devt, index, 1); - if (retval) - goto error; - cdev = true; - } - dev = kzalloc(sizeof(*dev), GFP_KERNEL); - if (!dev) { - retval = -ENOMEM; - goto error; - } + if (!dev) + return ERR_PTR(-ENOMEM); dev->devt = devt; dev->class = tty_class; @@ -3329,18 +2795,38 @@ struct device *tty_register_device_attr(struct tty_driver *driver, dev->groups = attr_grp; dev_set_drvdata(dev, drvdata); + dev_set_uevent_suppress(dev, 1); + retval = device_register(dev); if (retval) - goto error; + goto err_put; + + if (!(driver->flags & TTY_DRIVER_DYNAMIC_ALLOC)) { + /* + * Free any saved termios data so that the termios state is + * reset when reusing a minor number. 
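The ordering in the reworked tty_register_device_attr() above is the point: the ADD uevent is suppressed across device_register() and only fired once the cdev exists, so udev can no longer race in and open a device node whose character device is not yet live. The pattern in isolation:

    int retval;

    dev_set_uevent_suppress(dev, 1);        /* hold back the ADD uevent */
    retval = device_register(dev);          /* sysfs appears, udev stays silent */
    if (retval)
            goto err_put;

    /* ... cdev_add() and friends: userspace cannot open us yet ... */

    dev_set_uevent_suppress(dev, 0);
    kobject_uevent(&dev->kobj, KOBJ_ADD);   /* now announce the device */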
+ */ + tp = driver->termios[index]; + if (tp) { + driver->termios[index] = NULL; + kfree(tp); + } + + retval = tty_cdev_add(driver, devt, index, 1); + if (retval) + goto err_del; + } + + dev_set_uevent_suppress(dev, 0); + kobject_uevent(&dev->kobj, KOBJ_ADD); return dev; -error: +err_del: + device_del(dev); +err_put: put_device(dev); - if (cdev) { - cdev_del(driver->cdevs[index]); - driver->cdevs[index] = NULL; - } + return ERR_PTR(retval); } EXPORT_SYMBOL_GPL(tty_register_device_attr); @@ -3370,7 +2856,7 @@ EXPORT_SYMBOL(tty_unregister_device); /** * __tty_alloc_driver -- allocate tty driver * @lines: count of lines this driver can handle at most - * @owner: module which is repsonsible for this driver + * @owner: module which is responsible for this driver * @flags: some of TTY_DRIVER_* flags, will be set in driver->flags * * This should not be called directly, some of the provided macros should be @@ -3441,11 +2927,6 @@ static void destruct_tty_driver(struct kref *kref) struct ktermios *tp; if (driver->flags & TTY_DRIVER_INSTALLED) { - /* - * Free the termios and termios_locked structures because - * we don't want to get memory leaks when modular tty - * drivers are removed from the kernel. - */ for (i = 0; i < driver->num; i++) { tp = driver->termios[i]; if (tp) { @@ -3578,30 +3059,6 @@ void tty_default_fops(struct file_operations *fops) *fops = tty_fops; } -/* - * Initialize the console device. This is called *early*, so - * we can't necessarily depend on lots of kernel help here. - * Just do some early initializations, and do the complex setup - * later. - */ -void __init console_init(void) -{ - initcall_t *call; - - /* Setup the default TTY line discipline. */ - n_tty_init(); - - /* - * set up the console device so that later boot sequences can - * inform about problems etc.. - */ - call = __con_initcall_start; - while (call < __con_initcall_end) { - (*call)(); - call++; - } -} - static char *tty_devnode(struct device *dev, umode_t *mode) { if (!mode) diff --git a/drivers/tty/tty_ioctl.c b/drivers/tty/tty_ioctl.c index a9a978731c5b0d66f3a96b8cb076ef84ac7b3b0d..efa96e6c4c1b44f1e37a8fa94c8a4caa52e9a688 100644 --- a/drivers/tty/tty_ioctl.c +++ b/drivers/tty/tty_ioctl.c @@ -258,228 +258,6 @@ static void unset_locked_termios(struct tty_struct *tty, struct ktermios *old) /* FIXME: What should we do for i/ospeed */ } -/* - * Routine which returns the baud rate of the tty - * - * Note that the baud_table needs to be kept in sync with the - * include/asm/termbits.h file. 
- */ -static const speed_t baud_table[] = { - 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, 1800, 2400, 4800, - 9600, 19200, 38400, 57600, 115200, 230400, 460800, -#ifdef __sparc__ - 76800, 153600, 307200, 614400, 921600 -#else - 500000, 576000, 921600, 1000000, 1152000, 1500000, 2000000, - 2500000, 3000000, 3500000, 4000000 -#endif -}; - -#ifndef __sparc__ -static const tcflag_t baud_bits[] = { - B0, B50, B75, B110, B134, B150, B200, B300, B600, - B1200, B1800, B2400, B4800, B9600, B19200, B38400, - B57600, B115200, B230400, B460800, B500000, B576000, - B921600, B1000000, B1152000, B1500000, B2000000, B2500000, - B3000000, B3500000, B4000000 -}; -#else -static const tcflag_t baud_bits[] = { - B0, B50, B75, B110, B134, B150, B200, B300, B600, - B1200, B1800, B2400, B4800, B9600, B19200, B38400, - B57600, B115200, B230400, B460800, B76800, B153600, - B307200, B614400, B921600 -}; -#endif - -static int n_baud_table = ARRAY_SIZE(baud_table); - -/** - * tty_termios_baud_rate - * @termios: termios structure - * - * Convert termios baud rate data into a speed. This should be called - * with the termios lock held if this termios is a terminal termios - * structure. May change the termios data. Device drivers can call this - * function but should use ->c_[io]speed directly as they are updated. - * - * Locking: none - */ - -speed_t tty_termios_baud_rate(struct ktermios *termios) -{ - unsigned int cbaud; - - cbaud = termios->c_cflag & CBAUD; - -#ifdef BOTHER - /* Magic token for arbitrary speed via c_ispeed/c_ospeed */ - if (cbaud == BOTHER) - return termios->c_ospeed; -#endif - if (cbaud & CBAUDEX) { - cbaud &= ~CBAUDEX; - - if (cbaud < 1 || cbaud + 15 > n_baud_table) - termios->c_cflag &= ~CBAUDEX; - else - cbaud += 15; - } - return baud_table[cbaud]; -} -EXPORT_SYMBOL(tty_termios_baud_rate); - -/** - * tty_termios_input_baud_rate - * @termios: termios structure - * - * Convert termios baud rate data into a speed. This should be called - * with the termios lock held if this termios is a terminal termios - * structure. May change the termios data. Device drivers can call this - * function but should use ->c_[io]speed directly as they are updated. - * - * Locking: none - */ - -speed_t tty_termios_input_baud_rate(struct ktermios *termios) -{ -#ifdef IBSHIFT - unsigned int cbaud = (termios->c_cflag >> IBSHIFT) & CBAUD; - - if (cbaud == B0) - return tty_termios_baud_rate(termios); - - /* Magic token for arbitrary speed via c_ispeed*/ - if (cbaud == BOTHER) - return termios->c_ispeed; - - if (cbaud & CBAUDEX) { - cbaud &= ~CBAUDEX; - - if (cbaud < 1 || cbaud + 15 > n_baud_table) - termios->c_cflag &= ~(CBAUDEX << IBSHIFT); - else - cbaud += 15; - } - return baud_table[cbaud]; -#else - return tty_termios_baud_rate(termios); -#endif -} -EXPORT_SYMBOL(tty_termios_input_baud_rate); - -/** - * tty_termios_encode_baud_rate - * @termios: ktermios structure holding user requested state - * @ispeed: input speed - * @ospeed: output speed - * - * Encode the speeds set into the passed termios structure. This is - * used as a library helper for drivers so that they can report back - * the actual speed selected when it differs from the speed requested - * - * For maximal back compatibility with legacy SYS5/POSIX *nix behaviour - * we need to carefully set the bits when the user does not get the - * desired speed. We allow small margins and preserve as much of possible - * of the input intent to keep compatibility. - * - * Locking: Caller should hold termios lock. 
This is already held - * when calling this function from the driver termios handler. - * - * The ifdefs deal with platforms whose owners have yet to update them - * and will all go away once this is done. - */ - -void tty_termios_encode_baud_rate(struct ktermios *termios, - speed_t ibaud, speed_t obaud) -{ - int i = 0; - int ifound = -1, ofound = -1; - int iclose = ibaud/50, oclose = obaud/50; - int ibinput = 0; - - if (obaud == 0) /* CD dropped */ - ibaud = 0; /* Clear ibaud to be sure */ - - termios->c_ispeed = ibaud; - termios->c_ospeed = obaud; - -#ifdef BOTHER - /* If the user asked for a precise weird speed give a precise weird - answer. If they asked for a Bfoo speed they may have problems - digesting non-exact replies so fuzz a bit */ - - if ((termios->c_cflag & CBAUD) == BOTHER) - oclose = 0; - if (((termios->c_cflag >> IBSHIFT) & CBAUD) == BOTHER) - iclose = 0; - if ((termios->c_cflag >> IBSHIFT) & CBAUD) - ibinput = 1; /* An input speed was specified */ -#endif - termios->c_cflag &= ~CBAUD; - - /* - * Our goal is to find a close match to the standard baud rate - * returned. Walk the baud rate table and if we get a very close - * match then report back the speed as a POSIX Bxxxx value by - * preference - */ - - do { - if (obaud - oclose <= baud_table[i] && - obaud + oclose >= baud_table[i]) { - termios->c_cflag |= baud_bits[i]; - ofound = i; - } - if (ibaud - iclose <= baud_table[i] && - ibaud + iclose >= baud_table[i]) { - /* For the case input == output don't set IBAUD bits - if the user didn't do so */ - if (ofound == i && !ibinput) - ifound = i; -#ifdef IBSHIFT - else { - ifound = i; - termios->c_cflag |= (baud_bits[i] << IBSHIFT); - } -#endif - } - } while (++i < n_baud_table); - - /* - * If we found no match then use BOTHER if provided or warn - * the user their platform maintainer needs to wake up if not. - */ -#ifdef BOTHER - if (ofound == -1) - termios->c_cflag |= BOTHER; - /* Set exact input bits only if the input and output differ or the - user already did */ - if (ifound == -1 && (ibaud != obaud || ibinput)) - termios->c_cflag |= (BOTHER << IBSHIFT); -#else - if (ifound == -1 || ofound == -1) - pr_warn_once("tty: Unable to return correct speed data as your architecture needs updating.\n"); -#endif -} -EXPORT_SYMBOL_GPL(tty_termios_encode_baud_rate); - -/** - * tty_encode_baud_rate - set baud rate of the tty - * @ibaud: input baud rate - * @obad: output baud rate - * - * Update the current termios data for the tty with the new speed - * settings. The caller must hold the termios_rwsem for the tty in - * question. 
- */
-
-void tty_encode_baud_rate(struct tty_struct *tty, speed_t ibaud, speed_t obaud)
-{
-	tty_termios_encode_baud_rate(&tty->termios, ibaud, obaud);
-}
-EXPORT_SYMBOL_GPL(tty_encode_baud_rate);
-
 /**
  * tty_termios_copy_hw - copy hardware settings
  * @new: New termios
diff --git a/drivers/tty/tty_jobctrl.c b/drivers/tty/tty_jobctrl.c
new file mode 100644
index 0000000000000000000000000000000000000000..e7032309ee8714e4a173de8fe17b993660d01d3f
--- /dev/null
+++ b/drivers/tty/tty_jobctrl.c
@@ -0,0 +1,554 @@
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+static int is_ignored(int sig)
+{
+	return (sigismember(&current->blocked, sig) ||
+		current->sighand->action[sig-1].sa.sa_handler == SIG_IGN);
+}
+
+/**
+ * tty_check_change - check for POSIX terminal changes
+ * @tty: tty to check
+ *
+ * If we try to write to, or set the state of, a terminal and we're
+ * not in the foreground, send a SIGTTOU. If the signal is blocked or
+ * ignored, go ahead and perform the operation. (POSIX 7.2)
+ *
+ * Locking: ctrl_lock
+ */
+int __tty_check_change(struct tty_struct *tty, int sig)
+{
+	unsigned long flags;
+	struct pid *pgrp, *tty_pgrp;
+	int ret = 0;
+
+	if (current->signal->tty != tty)
+		return 0;
+
+	rcu_read_lock();
+	pgrp = task_pgrp(current);
+
+	spin_lock_irqsave(&tty->ctrl_lock, flags);
+	tty_pgrp = tty->pgrp;
+	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+
+	if (tty_pgrp && pgrp != tty->pgrp) {
+		if (is_ignored(sig)) {
+			if (sig == SIGTTIN)
+				ret = -EIO;
+		} else if (is_current_pgrp_orphaned())
+			ret = -EIO;
+		else {
+			kill_pgrp(pgrp, sig, 1);
+			set_thread_flag(TIF_SIGPENDING);
+			ret = -ERESTARTSYS;
+		}
+	}
+	rcu_read_unlock();
+
+	if (!tty_pgrp)
+		tty_warn(tty, "sig=%d, tty->pgrp == NULL!\n", sig);
+
+	return ret;
+}
+
+int tty_check_change(struct tty_struct *tty)
+{
+	return __tty_check_change(tty, SIGTTOU);
+}
+EXPORT_SYMBOL(tty_check_change);
+
+void proc_clear_tty(struct task_struct *p)
+{
+	unsigned long flags;
+	struct tty_struct *tty;
+	spin_lock_irqsave(&p->sighand->siglock, flags);
+	tty = p->signal->tty;
+	p->signal->tty = NULL;
+	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	tty_kref_put(tty);
+}
+
+/**
+ * proc_set_tty - set the controlling terminal
+ *
+ * Only callable by the session leader and only if it does not already have
+ * a controlling terminal.
+ *
+ * Caller must hold: tty_lock()
+ *		     a readlock on tasklist_lock
+ *		     sighand lock
+ */
+static void __proc_set_tty(struct tty_struct *tty)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&tty->ctrl_lock, flags);
+	/*
+	 * The session and fg pgrp references will be non-NULL if
+	 * tiocsctty() is stealing the controlling tty
+	 */
+	put_pid(tty->session);
+	put_pid(tty->pgrp);
+	tty->pgrp = get_pid(task_pgrp(current));
+	spin_unlock_irqrestore(&tty->ctrl_lock, flags);
+	tty->session = get_pid(task_session(current));
+	if (current->signal->tty) {
+		tty_debug(tty, "current tty %s not NULL!!\n",
+			  current->signal->tty->name);
+		tty_kref_put(current->signal->tty);
+	}
+	put_pid(current->signal->tty_old_pgrp);
+	current->signal->tty = tty_kref_get(tty);
+	current->signal->tty_old_pgrp = NULL;
+}
+
+static void proc_set_tty(struct tty_struct *tty)
+{
+	spin_lock_irq(&current->sighand->siglock);
+	__proc_set_tty(tty);
+	spin_unlock_irq(&current->sighand->siglock);
+}
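tty_check_change() above is the POSIX job-control gate: a background process that tries to change the state of its controlling terminal is stopped with SIGTTOU unless the signal is blocked or ignored. The userspace-visible effect (illustrative):

    #include <termios.h>
    #include <unistd.h>

    /* run from a background job, the tcsetattr() stops it with SIGTTOU */
    static void drop_echo(void)
    {
            struct termios tio;

            tcgetattr(STDIN_FILENO, &tio);          /* reading is not gated */
            tio.c_lflag &= ~ECHO;
            tcsetattr(STDIN_FILENO, TCSANOW, &tio); /* this write of state is */
    }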
+ */ +void tty_open_proc_set_tty(struct file *filp, struct tty_struct *tty) +{ + read_lock(&tasklist_lock); + spin_lock_irq(&current->sighand->siglock); + if (current->signal->leader && + !current->signal->tty && + tty->session == NULL) { + /* + * Don't let a process that only has write access to the tty + * obtain the privileges associated with having a tty as + * controlling terminal (being able to reopen it with full + * access through /dev/tty, being able to perform pushback). + * Many distributions set the group of all ttys to "tty" and + * grant write-only access to all terminals for setgid tty + * binaries, which should not imply full privileges on all ttys. + * + * This could theoretically break old code that performs open() + * on a write-only file descriptor. In that case, it might be + * necessary to also permit this if + * inode_permission(inode, MAY_READ) == 0. + */ + if (filp->f_mode & FMODE_READ) + __proc_set_tty(tty); + } + spin_unlock_irq(&current->sighand->siglock); + read_unlock(&tasklist_lock); +} + +struct tty_struct *get_current_tty(void) +{ + struct tty_struct *tty; + unsigned long flags; + + spin_lock_irqsave(&current->sighand->siglock, flags); + tty = tty_kref_get(current->signal->tty); + spin_unlock_irqrestore(&current->sighand->siglock, flags); + return tty; +} +EXPORT_SYMBOL_GPL(get_current_tty); + +/* + * Called from tty_release(). + */ +void session_clear_tty(struct pid *session) +{ + struct task_struct *p; + do_each_pid_task(session, PIDTYPE_SID, p) { + proc_clear_tty(p); + } while_each_pid_task(session, PIDTYPE_SID, p); +} + +/** + * tty_signal_session_leader - sends SIGHUP to session leader + * @tty: controlling tty + * @exit_session: if non-zero, signal all foreground group processes + * + * Send SIGHUP and SIGCONT to the session leader and its process group. + * Optionally, signal all processes in the foreground process group. + * + * Returns the number of processes in the session with this tty + * as their controlling terminal. This value is used to drop + * tty references for those processes. + */ +int tty_signal_session_leader(struct tty_struct *tty, int exit_session) +{ + struct task_struct *p; + int refs = 0; + struct pid *tty_pgrp = NULL; + + read_lock(&tasklist_lock); + if (tty->session) { + do_each_pid_task(tty->session, PIDTYPE_SID, p) { + spin_lock_irq(&p->sighand->siglock); + if (p->signal->tty == tty) { + p->signal->tty = NULL; + /* We defer the dereferences outside of + the tasklist lock */ + refs++; + } + if (!p->signal->leader) { + spin_unlock_irq(&p->sighand->siglock); + continue; + } + __group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p); + __group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p); + put_pid(p->signal->tty_old_pgrp); /* A noop */ + spin_lock(&tty->ctrl_lock); + tty_pgrp = get_pid(tty->pgrp); + if (tty->pgrp) + p->signal->tty_old_pgrp = get_pid(tty->pgrp); + spin_unlock(&tty->ctrl_lock); + spin_unlock_irq(&p->sighand->siglock); + } while_each_pid_task(tty->session, PIDTYPE_SID, p); + } + read_unlock(&tasklist_lock); + + if (tty_pgrp) { + if (exit_session) + kill_pgrp(tty_pgrp, SIGHUP, exit_session); + put_pid(tty_pgrp); + } + + return refs; +} + +/** + * disassociate_ctty - disconnect controlling tty + * @on_exit: true if exiting so need to "hang up" the session + * + * This function is typically called only by the session leader, when + * it wants to disassociate itself from its controlling tty.
+ * + * It performs the following functions: + * (1) Sends a SIGHUP and SIGCONT to the foreground process group + * (2) Clears the tty from being controlling the session + * (3) Clears the controlling tty for all processes in the + * session group. + * + * The argument on_exit is set to 1 if called when a process is + * exiting; it is 0 if called by the ioctl TIOCNOTTY. + * + * Locking: + * BTM is taken for hysterical raisons, and held when + * called from no_tty(). + * tty_mutex is taken to protect tty + * ->siglock is taken to protect ->signal/->sighand + * tasklist_lock is taken to walk process list for sessions + * ->siglock is taken to protect ->signal/->sighand + */ +void disassociate_ctty(int on_exit) +{ + struct tty_struct *tty; + + if (!current->signal->leader) + return; + + tty = get_current_tty(); + if (tty) { + if (on_exit && tty->driver->type != TTY_DRIVER_TYPE_PTY) { + tty_vhangup_session(tty); + } else { + struct pid *tty_pgrp = tty_get_pgrp(tty); + if (tty_pgrp) { + kill_pgrp(tty_pgrp, SIGHUP, on_exit); + if (!on_exit) + kill_pgrp(tty_pgrp, SIGCONT, on_exit); + put_pid(tty_pgrp); + } + } + tty_kref_put(tty); + + } else if (on_exit) { + struct pid *old_pgrp; + spin_lock_irq(&current->sighand->siglock); + old_pgrp = current->signal->tty_old_pgrp; + current->signal->tty_old_pgrp = NULL; + spin_unlock_irq(&current->sighand->siglock); + if (old_pgrp) { + kill_pgrp(old_pgrp, SIGHUP, on_exit); + kill_pgrp(old_pgrp, SIGCONT, on_exit); + put_pid(old_pgrp); + } + return; + } + + spin_lock_irq(&current->sighand->siglock); + put_pid(current->signal->tty_old_pgrp); + current->signal->tty_old_pgrp = NULL; + + tty = tty_kref_get(current->signal->tty); + if (tty) { + unsigned long flags; + spin_lock_irqsave(&tty->ctrl_lock, flags); + put_pid(tty->session); + put_pid(tty->pgrp); + tty->session = NULL; + tty->pgrp = NULL; + spin_unlock_irqrestore(&tty->ctrl_lock, flags); + tty_kref_put(tty); + } + + spin_unlock_irq(&current->sighand->siglock); + /* Now clear signal->tty under the lock */ + read_lock(&tasklist_lock); + session_clear_tty(task_session(current)); + read_unlock(&tasklist_lock); +} + +/** + * no_tty - Ensure the current process does not have a controlling tty + */ +void no_tty(void) +{ + /* FIXME: Review locking here. The tty_lock never covered any race + between a new association and proc_clear_tty but possible we need + to protect against this anyway */ + struct task_struct *tsk = current; + disassociate_ctty(0); + proc_clear_tty(tsk); +} + +/** + * tiocsctty - set controlling tty + * @tty: tty structure + * @arg: user argument + * + * This ioctl is used to manage job control. It permits a session + * leader to set this tty as the controlling tty for the session. + * + * Locking: + * Takes tty_lock() to serialize proc_set_tty() for this tty + * Takes tasklist_lock internally to walk sessions + * Takes ->siglock() when updating signal->tty + */ +static int tiocsctty(struct tty_struct *tty, struct file *file, int arg) +{ + int ret = 0; + + tty_lock(tty); + read_lock(&tasklist_lock); + + if (current->signal->leader && (task_session(current) == tty->session)) + goto unlock; + + /* + * The process must be a session leader and + * not have a controlling tty already. + */ + if (!current->signal->leader || current->signal->tty) { + ret = -EPERM; + goto unlock; + } + + if (tty->session) { + /* + * This tty is already the controlling + * tty for another session group!
+ */ + if (arg == 1 && capable(CAP_SYS_ADMIN)) { + /* + * Steal it away + */ + session_clear_tty(tty->session); + } else { + ret = -EPERM; + goto unlock; + } + } + + /* See the comment in tty_open_proc_set_tty(). */ + if ((file->f_mode & FMODE_READ) == 0 && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto unlock; + } + + proc_set_tty(tty); +unlock: + read_unlock(&tasklist_lock); + tty_unlock(tty); + return ret; +} + +/** + * tty_get_pgrp - return a ref counted pgrp pid + * @tty: tty to read + * + * Returns a refcounted instance of the pid struct for the process + * group controlling the tty. + */ +struct pid *tty_get_pgrp(struct tty_struct *tty) +{ + unsigned long flags; + struct pid *pgrp; + + spin_lock_irqsave(&tty->ctrl_lock, flags); + pgrp = get_pid(tty->pgrp); + spin_unlock_irqrestore(&tty->ctrl_lock, flags); + + return pgrp; +} +EXPORT_SYMBOL_GPL(tty_get_pgrp); + +/* + * This checks not only the pgrp, but falls back on the pid if no + * satisfactory pgrp is found. I dunno - gdb doesn't work correctly + * without this... + * + * The caller must hold rcu lock or the tasklist lock. + */ +static struct pid *session_of_pgrp(struct pid *pgrp) +{ + struct task_struct *p; + struct pid *sid = NULL; + + p = pid_task(pgrp, PIDTYPE_PGID); + if (p == NULL) + p = pid_task(pgrp, PIDTYPE_PID); + if (p != NULL) + sid = task_session(p); + + return sid; +} + +/** + * tiocgpgrp - get process group + * @tty: tty passed by user + * @real_tty: tty side of the tty passed by the user if a pty else the tty + * @p: returned pid + * + * Obtain the process group of the tty. If there is no process group + * return an error. + * + * Locking: none. Reference to current->signal->tty is safe. + */ +static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) +{ + struct pid *pid; + int ret; + /* + * (tty == real_tty) is a cheap way of + * testing if the tty is NOT a master pty. + */ + if (tty == real_tty && current->signal->tty != real_tty) + return -ENOTTY; + pid = tty_get_pgrp(real_tty); + ret = put_user(pid_vnr(pid), p); + put_pid(pid); + return ret; +} + +/** + * tiocspgrp - attempt to set process group + * @tty: tty passed by user + * @real_tty: tty side device matching tty passed by user + * @p: pid pointer + * + * Set the process group of the tty to the session passed. Only + * permitted where the tty session is our session. + * + * Locking: RCU, ctrl lock + */ +static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) +{ + struct pid *pgrp; + pid_t pgrp_nr; + int retval = tty_check_change(real_tty); + + if (retval == -EIO) + return -ENOTTY; + if (retval) + return retval; + if (!current->signal->tty || + (current->signal->tty != real_tty) || + (real_tty->session != task_session(current))) + return -ENOTTY; + if (get_user(pgrp_nr, p)) + return -EFAULT; + if (pgrp_nr < 0) + return -EINVAL; + rcu_read_lock(); + pgrp = find_vpid(pgrp_nr); + retval = -ESRCH; + if (!pgrp) + goto out_unlock; + retval = -EPERM; + if (session_of_pgrp(pgrp) != task_session(current)) + goto out_unlock; + retval = 0; + spin_lock_irq(&tty->ctrl_lock); + put_pid(real_tty->pgrp); + real_tty->pgrp = get_pid(pgrp); + spin_unlock_irq(&tty->ctrl_lock); +out_unlock: + rcu_read_unlock(); + return retval; +} + +/** + * tiocgsid - get session id + * @tty: tty passed by user + * @real_tty: tty side of the tty passed by the user if a pty else the tty + * @p: pointer to returned session id + * + * Obtain the session id of the tty. If there is no session + * return an error. 
+ * + * Locking: none. Reference to current->signal->tty is safe. + */ +static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) +{ + /* + * (tty == real_tty) is a cheap way of + * testing if the tty is NOT a master pty. + */ + if (tty == real_tty && current->signal->tty != real_tty) + return -ENOTTY; + if (!real_tty->session) + return -ENOTTY; + return put_user(pid_vnr(real_tty->session), p); +} + +/* + * Called from tty_ioctl(). If tty is a pty then real_tty is the slave side, + * if not then tty == real_tty. + */ +long tty_jobctrl_ioctl(struct tty_struct *tty, struct tty_struct *real_tty, + struct file *file, unsigned int cmd, unsigned long arg) +{ + void __user *p = (void __user *)arg; + + switch (cmd) { + case TIOCNOTTY: + if (current->signal->tty != tty) + return -ENOTTY; + no_tty(); + return 0; + case TIOCSCTTY: + return tiocsctty(real_tty, file, arg); + case TIOCGPGRP: + return tiocgpgrp(tty, real_tty, p); + case TIOCSPGRP: + return tiocspgrp(tty, real_tty, p); + case TIOCGSID: + return tiocgsid(tty, real_tty, p); + } + return -ENOIOCTLCMD; +} diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c index 36e1b8c7680f1d8fc5a64ba16e2fc52a0a379b40..accbd1257bc4a36938918965592e4a1e470098a6 100644 --- a/drivers/tty/vt/selection.c +++ b/drivers/tty/vt/selection.c @@ -80,21 +80,17 @@ void clear_selection(void) /* * User settable table: what characters are to be considered alphabetic? - * 256 bits. Locked by the console lock. + * 128 bits. Locked by the console lock. */ -static u32 inwordLut[8]={ +static u32 inwordLut[]={ 0x00000000, /* control chars */ - 0x03FF0000, /* digits */ + 0x03FFE000, /* digits and "-./" */ 0x87FFFFFE, /* uppercase and '_' */ 0x07FFFFFE, /* lowercase */ - 0x00000000, - 0x00000000, - 0xFF7FFFFF, /* latin-1 accented letters, not multiplication sign */ - 0xFF7FFFFF /* latin-1 accented letters, not division sign */ }; static inline int inword(const u16 c) { - return c > 0xff || (( inwordLut[c>>5] >> (c & 0x1F) ) & 1); + return c > 0x7f || (( inwordLut[c>>5] >> (c & 0x1F) ) & 1); } /** @@ -106,10 +102,10 @@ static inline int inword(const u16 c) { */ int sel_loadlut(char __user *p) { - u32 tmplut[8]; - if (copy_from_user(tmplut, (u32 __user *)(p+4), 32)) + u32 tmplut[ARRAY_SIZE(inwordLut)]; + if (copy_from_user(tmplut, (u32 __user *)(p+4), sizeof(inwordLut))) return -EFAULT; - memcpy(inwordLut, tmplut, 32); + memcpy(inwordLut, tmplut, sizeof(inwordLut)); return 0; } diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c index 5c4933bb4b5336258f3d31f22971a1aea87c0dd9..9c9945284bcf240d319b98f986c2573b4d16e307 100644 --- a/drivers/tty/vt/vt.c +++ b/drivers/tty/vt/vt.c @@ -181,7 +181,7 @@ int console_blanked; static int vesa_blank_mode; /* 0:none 1:suspendV 2:suspendH 3:powerdown */ static int vesa_off_interval; -static int blankinterval = 10*60; +static int blankinterval; core_param(consoleblank, blankinterval, int, 0444); static DECLARE_WORK(console_work, console_callback); diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 60ce7fd54e890e445daf81e04bcdd70d0ae09599..1c196f87e9d967886c88b1ae7fe05cc5f01ed4ee 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -66,7 +66,7 @@ static ssize_t map_size_show(struct uio_mem *mem, char *buf) static ssize_t map_offset_show(struct uio_mem *mem, char *buf) { - return sprintf(buf, "0x%llx\n", (unsigned long long)mem->addr & ~PAGE_MASK); + return sprintf(buf, "0x%llx\n", (unsigned long long)mem->offs); } struct map_sysfs_entry { diff --git a/drivers/uio/uio_mf624.c 
b/drivers/uio/uio_mf624.c index d1f95a1567bba0a3f598538d5e11dca8308b7f82..35187c052e8c105c972067400c15f06e7749df93 100644 --- a/drivers/uio/uio_mf624.c +++ b/drivers/uio/uio_mf624.c @@ -127,6 +127,24 @@ static int mf624_irqcontrol(struct uio_info *info, s32 irq_on) return 0; } +static int mf624_setup_mem(struct pci_dev *dev, int bar, struct uio_mem *mem, const char *name) +{ + resource_size_t start = pci_resource_start(dev, bar); + resource_size_t len = pci_resource_len(dev, bar); + + mem->name = name; + mem->addr = start & PAGE_MASK; + mem->offs = start & ~PAGE_MASK; + if (!mem->addr) + return -ENODEV; + mem->size = ((start & ~PAGE_MASK) + len + PAGE_SIZE - 1) & PAGE_MASK; + mem->memtype = UIO_MEM_PHYS; + mem->internal_addr = pci_ioremap_bar(dev, bar); + if (!mem->internal_addr) + return -ENODEV; + return 0; +} + static int mf624_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) { struct uio_info *info; @@ -147,37 +165,15 @@ static int mf624_pci_probe(struct pci_dev *dev, const struct pci_device_id *id) /* Note: Datasheet says device uses BAR0, BAR1, BAR2 -- do not trust it */ /* BAR0 */ - info->mem[0].name = "PCI chipset, interrupts, status " - "bits, special functions"; - info->mem[0].addr = pci_resource_start(dev, 0); - if (!info->mem[0].addr) + if (mf624_setup_mem(dev, 0, &info->mem[0], "PCI chipset, interrupts, status " + "bits, special functions")) goto out_release; - info->mem[0].size = pci_resource_len(dev, 0); - info->mem[0].memtype = UIO_MEM_PHYS; - info->mem[0].internal_addr = pci_ioremap_bar(dev, 0); - if (!info->mem[0].internal_addr) - goto out_release; - /* BAR2 */ - info->mem[1].name = "ADC, DAC, DIO"; - info->mem[1].addr = pci_resource_start(dev, 2); - if (!info->mem[1].addr) - goto out_unmap0; - info->mem[1].size = pci_resource_len(dev, 2); - info->mem[1].memtype = UIO_MEM_PHYS; - info->mem[1].internal_addr = pci_ioremap_bar(dev, 2); - if (!info->mem[1].internal_addr) + if (mf624_setup_mem(dev, 2, &info->mem[1], "ADC, DAC, DIO")) goto out_unmap0; /* BAR4 */ - info->mem[2].name = "Counter/timer chip"; - info->mem[2].addr = pci_resource_start(dev, 4); - if (!info->mem[2].addr) - goto out_unmap1; - info->mem[2].size = pci_resource_len(dev, 4); - info->mem[2].memtype = UIO_MEM_PHYS; - info->mem[2].internal_addr = pci_ioremap_bar(dev, 4); - if (!info->mem[2].internal_addr) + if (mf624_setup_mem(dev, 4, &info->mem[2], "Counter/timer chip")) goto out_unmap1; info->irq = dev->irq; diff --git a/drivers/usb/Kconfig b/drivers/usb/Kconfig index fbe493d44e816970fa367cfc71a986fbd1a27909..939a63bca82fb43c39dc9f46722262aa3b0037d1 100644 --- a/drivers/usb/Kconfig +++ b/drivers/usb/Kconfig @@ -35,7 +35,6 @@ config USB_COMMON config USB_ARCH_HAS_HCD def_bool y -# ARM SA1111 chips have a non-PCI based "OHCI-compatible" USB host interface. config USB tristate "Support for Host-side USB" depends on USB_ARCH_HAS_HCD @@ -73,6 +72,17 @@ config USB To compile this driver as a module, choose M here: the module will be called usbcore. +config USB_PCI + bool "PCI based USB host interface" + depends on PCI + default y + ---help--- + Many embedded system SoCs (e.g. the Freescale T2080) have both + PCI and USB modules, but the USB module is controlled by registers + directly and has no relationship with the PCI module. + + If you say N here, the PCI-related code in the USB driver will + not be built.
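The new USB_PCI symbol lets host-side USB code drop its PCI glue on SoCs whose controllers are register-mapped rather than PCI devices. As a minimal sketch of how a driver can honor the option (the example-hcd module and its init function below are hypothetical, not part of this patch), branching on IS_ENABLED(CONFIG_USB_PCI) keeps both paths compile-checked while the dead one is discarded:

#include <linux/module.h>
#include <linux/printk.h>

static int __init example_hcd_init(void)
{
	/*
	 * IS_ENABLED() expands to a compile-time constant, so the
	 * untaken branch is eliminated while both branches are still
	 * parsed and type-checked.
	 */
	if (IS_ENABLED(CONFIG_USB_PCI))
		pr_info("example-hcd: registering PCI front end\n");
	else
		pr_info("example-hcd: register-mapped SoC, platform probing only\n");
	return 0;
}
module_init(example_hcd_init);

MODULE_LICENSE("GPL");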
+ if USB source "drivers/usb/core/Kconfig" @@ -152,6 +162,8 @@ source "drivers/usb/phy/Kconfig" source "drivers/usb/gadget/Kconfig" +source "drivers/usb/typec/Kconfig" + config USB_LED_TRIG bool "USB LED Triggers" depends on LEDS_CLASS && LEDS_TRIGGERS diff --git a/drivers/usb/Makefile b/drivers/usb/Makefile index 7791af6c102c7f4ea2c8ecdeda5cbc08fb03a55c..9650b351c26c7f2064facc118435cb1cea1b12ee 100644 --- a/drivers/usb/Makefile +++ b/drivers/usb/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_USB_ISP1760) += isp1760/ obj-$(CONFIG_USB_MON) += mon/ obj-$(CONFIG_USB_MTU3) += mtu3/ -obj-$(CONFIG_PCI) += host/ +obj-$(CONFIG_USB_PCI) += host/ obj-$(CONFIG_USB_EHCI_HCD) += host/ obj-$(CONFIG_USB_ISP116X_HCD) += host/ obj-$(CONFIG_USB_OHCI_HCD) += host/ @@ -49,7 +49,7 @@ obj-$(CONFIG_USB_MICROTEK) += image/ obj-$(CONFIG_USB_SERIAL) += serial/ obj-$(CONFIG_USB) += misc/ -obj-$(CONFIG_EARLY_PRINTK_DBGP) += early/ +obj-$(CONFIG_EARLY_PRINTK_USB) += early/ obj-$(CONFIG_USB_ATM) += atm/ obj-$(CONFIG_USB_SPEEDTOUCH) += atm/ @@ -62,3 +62,5 @@ obj-$(CONFIG_USB_GADGET) += gadget/ obj-$(CONFIG_USB_COMMON) += common/ obj-$(CONFIG_USBIP_CORE) += usbip/ + +obj-$(CONFIG_TYPEC) += typec/ diff --git a/drivers/usb/atm/cxacru.c b/drivers/usb/atm/cxacru.c index f9fe86b6f7b5b6ac5e35d944500cd74781f6dd26..d65a64c29b852af9ec47f48fe4d3e0024aa66743 100644 --- a/drivers/usb/atm/cxacru.c +++ b/drivers/usb/atm/cxacru.c @@ -474,7 +474,7 @@ static ssize_t cxacru_sysfs_store_adsl_config(struct device *dev, ret = sscanf(buf + pos, "%x=%x%n", &index, &value, &tmp); if (ret < 2) return -EINVAL; - if (index < 0 || index > 0x7f) + if (index > 0x7f) return -EINVAL; if (tmp < 0 || tmp > len - pos) return -EINVAL; diff --git a/drivers/usb/chipidea/Kconfig b/drivers/usb/chipidea/Kconfig index fc96f5cdcb5cc7744913f560f1c23f70fe6b8250..51f4157bbecfd9972d5c334c688eceeaa9f3ff74 100644 --- a/drivers/usb/chipidea/Kconfig +++ b/drivers/usb/chipidea/Kconfig @@ -20,7 +20,7 @@ config USB_CHIPIDEA_OF config USB_CHIPIDEA_PCI tristate - depends on PCI + depends on USB_PCI depends on NOP_USB_XCEIV default USB_CHIPIDEA diff --git a/drivers/usb/chipidea/ci.h b/drivers/usb/chipidea/ci.h index 59e22389c10b01c7a37c719c84c32b9b19b9a3e4..6743f85b1b7ad816397825b70ac20c59089f26c2 100644 --- a/drivers/usb/chipidea/ci.h +++ b/drivers/usb/chipidea/ci.h @@ -177,6 +177,7 @@ struct hw_bank { * @td_pool: allocation pool for transfer descriptors * @gadget: device side representation for peripheral controller * @driver: gadget driver + * @resume_state: save the state of gadget suspend from * @hw_ep_max: total number of endpoints supported by hardware * @ci_hw_ep: array of endpoints * @ep0_dir: ep0 direction @@ -227,6 +228,7 @@ struct ci_hdrc { struct usb_gadget gadget; struct usb_gadget_driver *driver; + enum usb_device_state resume_state; unsigned hw_ep_max; struct ci_hw_ep ci_hw_ep[ENDPT_MAX]; u32 ep0_dir; diff --git a/drivers/usb/chipidea/core.c b/drivers/usb/chipidea/core.c index 79ad8e91632e6d29f3a67a328b559d2b2be7f793..9e217b1361ea76af9b9a5bb999b81559643e8e25 100644 --- a/drivers/usb/chipidea/core.c +++ b/drivers/usb/chipidea/core.c @@ -783,9 +783,6 @@ struct platform_device *ci_hdrc_add_device(struct device *dev, } pdev->dev.parent = dev; - pdev->dev.dma_mask = dev->dma_mask; - pdev->dev.dma_parms = dev->dma_parms; - dma_set_coherent_mask(&pdev->dev, dev->coherent_dma_mask); ret = platform_device_add_resources(pdev, res, nres); if (ret) @@ -841,6 +838,56 @@ static void ci_get_otg_capable(struct ci_hdrc *ci) } } +static ssize_t ci_role_show(struct device *dev, struct 
device_attribute *attr, + char *buf) +{ + struct ci_hdrc *ci = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", ci_role(ci)->name); +} + +static ssize_t ci_role_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t n) +{ + struct ci_hdrc *ci = dev_get_drvdata(dev); + enum ci_role role; + int ret; + + if (!(ci->roles[CI_ROLE_HOST] && ci->roles[CI_ROLE_GADGET])) { + dev_warn(dev, "Current configuration is not dual-role, quit\n"); + return -EPERM; + } + + for (role = CI_ROLE_HOST; role < CI_ROLE_END; role++) + if (!strncmp(buf, ci->roles[role]->name, + strlen(ci->roles[role]->name))) + break; + + if (role == CI_ROLE_END || role == ci->role) + return -EINVAL; + + pm_runtime_get_sync(dev); + disable_irq(ci->irq); + ci_role_stop(ci); + ret = ci_role_start(ci, role); + if (!ret && ci->role == CI_ROLE_GADGET) + ci_handle_vbus_change(ci); + enable_irq(ci->irq); + pm_runtime_put_sync(dev); + + return (ret == 0) ? n : ret; +} +static DEVICE_ATTR(role, 0644, ci_role_show, ci_role_store); + +static struct attribute *ci_attrs[] = { + &dev_attr_role.attr, + NULL, +}; + +static struct attribute_group ci_attr_group = { + .attrs = ci_attrs, +}; + static int ci_hdrc_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; @@ -1007,11 +1054,18 @@ static int ci_hdrc_probe(struct platform_device *pdev) ci_hdrc_otg_fsm_start(ci); device_set_wakeup_capable(&pdev->dev, true); - ret = dbg_create_files(ci); - if (!ret) - return 0; + if (ret) + goto stop; + + ret = sysfs_create_group(&dev->kobj, &ci_attr_group); + if (ret) + goto remove_debug; + + return 0; +remove_debug: + dbg_remove_files(ci); stop: ci_role_destroy(ci); deinit_phy: @@ -1033,6 +1087,7 @@ static int ci_hdrc_remove(struct platform_device *pdev) } dbg_remove_files(ci); + sysfs_remove_group(&ci->dev->kobj, &ci_attr_group); ci_role_destroy(ci); ci_hdrc_enter_lpm(ci, true); ci_usb_phy_exit(ci); diff --git a/drivers/usb/chipidea/host.c b/drivers/usb/chipidea/host.c index 915f3e91586e68adddce0367c3179753a90fd078..18cb8e46262d3ed3492d1045b7e15b9b4496f479 100644 --- a/drivers/usb/chipidea/host.c +++ b/drivers/usb/chipidea/host.c @@ -123,7 +123,8 @@ static int host_start(struct ci_hdrc *ci) if (usb_disabled()) return -ENODEV; - hcd = usb_create_hcd(&ci_ehci_hc_driver, ci->dev, dev_name(ci->dev)); + hcd = __usb_create_hcd(&ci_ehci_hc_driver, ci->dev->parent, + ci->dev, dev_name(ci->dev), NULL); if (!hcd) return -ENOMEM; diff --git a/drivers/usb/chipidea/udc.c b/drivers/usb/chipidea/udc.c index f88e9157fad07ea8443a8c72f036133418721cdf..56d2d32130765dfc655ecbb6e105f8d411070d1d 100644 --- a/drivers/usb/chipidea/udc.c +++ b/drivers/usb/chipidea/udc.c @@ -423,7 +423,8 @@ static int _hardware_enqueue(struct ci_hw_ep *hwep, struct ci_hw_req *hwreq) hwreq->req.status = -EALREADY; - ret = usb_gadget_map_request(&ci->gadget, &hwreq->req, hwep->dir); + ret = usb_gadget_map_request_by_dev(ci->dev->parent, + &hwreq->req, hwep->dir); if (ret) return ret; @@ -603,7 +604,8 @@ static int _hardware_dequeue(struct ci_hw_ep *hwep, struct ci_hw_req *hwreq) list_del_init(&node->td); } - usb_gadget_unmap_request(&hwep->ci->gadget, &hwreq->req, hwep->dir); + usb_gadget_unmap_request_by_dev(hwep->ci->dev->parent, + &hwreq->req, hwep->dir); hwreq->req.actual += actual; @@ -1845,27 +1847,32 @@ static irqreturn_t udc_irq(struct ci_hdrc *ci) if (USBi_PCI & intr) { ci->gadget.speed = hw_port_is_high_speed(ci) ? 
USB_SPEED_HIGH : USB_SPEED_FULL; - if (ci->suspended && ci->driver->resume) { - spin_unlock(&ci->lock); - ci->driver->resume(&ci->gadget); - spin_lock(&ci->lock); + if (ci->suspended) { + if (ci->driver->resume) { + spin_unlock(&ci->lock); + ci->driver->resume(&ci->gadget); + spin_lock(&ci->lock); + } ci->suspended = 0; + usb_gadget_set_state(&ci->gadget, + ci->resume_state); } } if (USBi_UI & intr) isr_tr_complete_handler(ci); - if (USBi_SLI & intr) { + if ((USBi_SLI & intr) && !(ci->suspended)) { + ci->suspended = 1; + ci->resume_state = ci->gadget.state; if (ci->gadget.speed != USB_SPEED_UNKNOWN && ci->driver->suspend) { - ci->suspended = 1; spin_unlock(&ci->lock); ci->driver->suspend(&ci->gadget); - usb_gadget_set_state(&ci->gadget, - USB_STATE_SUSPENDED); spin_lock(&ci->lock); } + usb_gadget_set_state(&ci->gadget, + USB_STATE_SUSPENDED); } retval = IRQ_HANDLED; } else { @@ -1899,13 +1906,13 @@ static int udc_start(struct ci_hdrc *ci) INIT_LIST_HEAD(&ci->gadget.ep_list); /* alloc resources */ - ci->qh_pool = dma_pool_create("ci_hw_qh", dev, + ci->qh_pool = dma_pool_create("ci_hw_qh", dev->parent, sizeof(struct ci_hw_qh), 64, CI_HDRC_PAGE_SIZE); if (ci->qh_pool == NULL) return -ENOMEM; - ci->td_pool = dma_pool_create("ci_hw_td", dev, + ci->td_pool = dma_pool_create("ci_hw_td", dev->parent, sizeof(struct ci_hw_td), 64, CI_HDRC_PAGE_SIZE); if (ci->td_pool == NULL) { @@ -1973,6 +1980,8 @@ static void udc_id_switch_for_host(struct ci_hdrc *ci) */ if (ci->is_otg) hw_write_otgsc(ci, OTGSC_BSVIE | OTGSC_BSVIS, OTGSC_BSVIS); + + ci->vbus_active = 0; } /** diff --git a/drivers/usb/class/Kconfig b/drivers/usb/class/Kconfig index bb8b73682a70ccf28dc7669368c892a187594007..971385fe9abce65689baf82988c5183edb054b82 100644 --- a/drivers/usb/class/Kconfig +++ b/drivers/usb/class/Kconfig @@ -12,7 +12,7 @@ config USB_ACM Please read for details. If your modem only reports "Cls=ff(vend.)" in the descriptors in - /proc/bus/usb/devices, then your modem will not work with this + /sys/kernel/debug/usb/devices, then your modem will not work with this driver. 
To compile this driver as a module, choose M here: the diff --git a/drivers/usb/class/cdc-acm.c b/drivers/usb/class/cdc-acm.c index d5388938bc7aaf56accce547046429eaf4fe419e..5357d83bbda2b87d8823b24995dbfcd626c2262f 100644 --- a/drivers/usb/class/cdc-acm.c +++ b/drivers/usb/class/cdc-acm.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -283,39 +284,13 @@ static DEVICE_ATTR(iCountryCodeRelDate, S_IRUGO, show_country_rel_date, NULL); * Interrupt handlers for various ACM device responses */ -/* control interface reports status changes with "interrupt" transfers */ -static void acm_ctrl_irq(struct urb *urb) +static void acm_process_notification(struct acm *acm, unsigned char *buf) { - struct acm *acm = urb->context; - struct usb_cdc_notification *dr = urb->transfer_buffer; - unsigned char *data; int newctrl; int difference; - int retval; - int status = urb->status; + struct usb_cdc_notification *dr = (struct usb_cdc_notification *)buf; + unsigned char *data = buf + sizeof(struct usb_cdc_notification); - switch (status) { - case 0: - /* success */ - break; - case -ECONNRESET: - case -ENOENT: - case -ESHUTDOWN: - /* this urb is terminated, clean up */ - dev_dbg(&acm->control->dev, - "%s - urb shutting down with status: %d\n", - __func__, status); - return; - default: - dev_dbg(&acm->control->dev, - "%s - nonzero urb status received: %d\n", - __func__, status); - goto exit; - } - - usb_mark_last_busy(acm->dev); - - data = (unsigned char *)(dr + 1); switch (dr->bNotificationType) { case USB_CDC_NOTIFY_NETWORK_CONNECTION: dev_dbg(&acm->control->dev, @@ -323,7 +298,15 @@ static void acm_ctrl_irq(struct urb *urb) break; case USB_CDC_NOTIFY_SERIAL_STATE: + if (le16_to_cpu(dr->wLength) != 2) { + dev_dbg(&acm->control->dev, + "%s - malformed serial state\n", __func__); + break; + } + newctrl = get_unaligned_le16(data); + dev_dbg(&acm->control->dev, + "%s - serial state: 0x%x\n", __func__, newctrl); if (!acm->clocal && (acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) { dev_dbg(&acm->control->dev, @@ -359,13 +342,86 @@ static void acm_ctrl_irq(struct urb *urb) default: dev_dbg(&acm->control->dev, - "%s - unknown notification %d received: index %d " - "len %d data0 %d data1 %d\n", + "%s - unknown notification %d received: index %d len %d\n", __func__, - dr->bNotificationType, dr->wIndex, - dr->wLength, data[0], data[1]); + dr->bNotificationType, dr->wIndex, dr->wLength); + } +} + +/* control interface reports status changes with "interrupt" transfers */ +static void acm_ctrl_irq(struct urb *urb) +{ + struct acm *acm = urb->context; + struct usb_cdc_notification *dr = urb->transfer_buffer; + unsigned int current_size = urb->actual_length; + unsigned int expected_size, copy_size, alloc_size; + int retval; + int status = urb->status; + + switch (status) { + case 0: + /* success */ break; + case -ECONNRESET: + case -ENOENT: + case -ESHUTDOWN: + /* this urb is terminated, clean up */ + acm->nb_index = 0; + dev_dbg(&acm->control->dev, + "%s - urb shutting down with status: %d\n", + __func__, status); + return; + default: + dev_dbg(&acm->control->dev, + "%s - nonzero urb status received: %d\n", + __func__, status); + goto exit; + } + + usb_mark_last_busy(acm->dev); + + if (acm->nb_index) + dr = (struct usb_cdc_notification *)acm->notification_buffer; + + /* size = notification-header + (optional) data */ + expected_size = sizeof(struct usb_cdc_notification) + + le16_to_cpu(dr->wLength); + + if (current_size < expected_size) { + /* notification is transmitted fragmented, reassemble */ 
+ if (acm->nb_size < expected_size) { + if (acm->nb_size) { + kfree(acm->notification_buffer); + acm->nb_size = 0; + } + alloc_size = roundup_pow_of_two(expected_size); + /* + * kmalloc ensures a valid notification_buffer after a + * use of kfree in case the previous allocation was too + * small. Final freeing is done on disconnect. + */ + acm->notification_buffer = + kmalloc(alloc_size, GFP_ATOMIC); + if (!acm->notification_buffer) + goto exit; + acm->nb_size = alloc_size; + } + + copy_size = min(current_size, + expected_size - acm->nb_index); + + memcpy(&acm->notification_buffer[acm->nb_index], + urb->transfer_buffer, copy_size); + acm->nb_index += copy_size; + current_size = acm->nb_index; + } + + if (current_size >= expected_size) { + /* notification complete */ + acm_process_notification(acm, (unsigned char *)dr); + acm->nb_index = 0; } + exit: retval = usb_submit_urb(urb, GFP_ATOMIC); if (retval && retval != -EPERM) @@ -1174,6 +1230,7 @@ static int acm_probe(struct usb_interface *intf, int combined_interfaces = 0; struct device *tty_dev; int rv = -ENOMEM; + int res; /* normal quirks */ quirks = (unsigned long)id->driver_info; @@ -1274,23 +1331,12 @@ static int acm_probe(struct usb_interface *intf, return -EINVAL; } look_for_collapsed_interface: - for (i = 0; i < 3; i++) { - struct usb_endpoint_descriptor *ep; - ep = &data_interface->cur_altsetting->endpoint[i].desc; - - if (usb_endpoint_is_int_in(ep)) - epctrl = ep; - else if (usb_endpoint_is_bulk_out(ep)) - epwrite = ep; - else if (usb_endpoint_is_bulk_in(ep)) - epread = ep; - else - return -EINVAL; - } - if (!epctrl || !epread || !epwrite) - return -ENODEV; - else - goto made_compressed_probe; + res = usb_find_common_endpoints(data_interface->cur_altsetting, + &epread, &epwrite, &epctrl, NULL); + if (res) + return res; + + goto made_compressed_probe; } skip_normal_probe: @@ -1488,6 +1534,9 @@ static int acm_probe(struct usb_interface *intf, epctrl->bInterval ? epctrl->bInterval : 16); acm->ctrlurb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP; acm->ctrlurb->transfer_dma = acm->ctrl_dma; + acm->notification_buffer = NULL; + acm->nb_index = 0; + acm->nb_size = 0; dev_info(&intf->dev, "ttyACM%d: USB ACM device\n", minor); @@ -1580,6 +1629,8 @@ static void acm_disconnect(struct usb_interface *intf) usb_free_coherent(acm->dev, acm->ctrlsize, acm->ctrl_buffer, acm->ctrl_dma); acm_read_buffers_free(acm); + kfree(acm->notification_buffer); + if (!acm->combined_interfaces) usb_driver_release_interface(&acm_driver, intf == acm->control ? 
acm->data : acm->control); diff --git a/drivers/usb/class/cdc-acm.h b/drivers/usb/class/cdc-acm.h index c980f11cdf560a773e6d6bc4b8fccd232c0df200..7a2b3deafc90abc3d524463d32b49172a7778695 100644 --- a/drivers/usb/class/cdc-acm.h +++ b/drivers/usb/class/cdc-acm.h @@ -98,7 +98,9 @@ struct acm { struct acm_wb *putbuffer; /* for acm_tty_put_char() */ int rx_buflimit; spinlock_t read_lock; - int write_used; /* number of non-empty write buffers */ + u8 *notification_buffer; /* to reassemble fragmented notifications */ + unsigned int nb_index; + unsigned int nb_size; int transmitting; spinlock_t write_lock; struct mutex mutex; diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 8fda45a45bd3713398922f3158f48674153514a6..08669fee6d7f646c4a52816a9d240c441edd21bf 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -58,7 +58,6 @@ MODULE_DEVICE_TABLE (usb, wdm_ids); #define WDM_SUSPENDING 8 #define WDM_RESETTING 9 #define WDM_OVERFLOW 10 -#define WDM_DRAIN_ON_OPEN 11 #define WDM_MAX 16 @@ -182,7 +181,7 @@ static void wdm_in_callback(struct urb *urb) "nonzero urb status received: -ESHUTDOWN\n"); goto skip_error; case -EPIPE: - dev_dbg(&desc->intf->dev, + dev_err(&desc->intf->dev, "nonzero urb status received: -EPIPE\n"); break; default: @@ -210,25 +209,6 @@ static void wdm_in_callback(struct urb *urb) desc->reslength = length; } } - - /* - * Handling devices with the WDM_DRAIN_ON_OPEN flag set: - * If desc->resp_count is unset, then the urb was submitted - * without a prior notification. If the device returned any - * data, then this implies that it had messages queued without - * notifying us. Continue reading until that queue is flushed. - */ - if (!desc->resp_count) { - if (!length) { - /* do not propagate the expected -EPIPE */ - desc->rerr = 0; - goto unlock; - } - dev_dbg(&desc->intf->dev, "got %d bytes without notification\n", length); - set_bit(WDM_RESPONDING, &desc->flags); - usb_submit_urb(desc->response, GFP_ATOMIC); - } - skip_error: set_bit(WDM_READ, &desc->flags); wake_up(&desc->wait); @@ -243,7 +223,6 @@ static void wdm_in_callback(struct urb *urb) service_outstanding_interrupt(desc); } -unlock: spin_unlock(&desc->iuspin); } @@ -686,17 +665,6 @@ static int wdm_open(struct inode *inode, struct file *file) dev_err(&desc->intf->dev, "Error submitting int urb - %d\n", rv); rv = usb_translate_errors(rv); - } else if (test_bit(WDM_DRAIN_ON_OPEN, &desc->flags)) { - /* - * Some devices keep pending messages queued - * without resending notifications. We must - * flush the message queue before we can - * assume a one-to-one relationship between - * notifications and messages in the queue - */ - dev_dbg(&desc->intf->dev, "draining queued data\n"); - set_bit(WDM_RESPONDING, &desc->flags); - rv = usb_submit_urb(desc->response, GFP_KERNEL); } } else { rv = 0; @@ -803,8 +771,7 @@ static void wdm_rxwork(struct work_struct *work) /* --- hotplug --- */ static int wdm_create(struct usb_interface *intf, struct usb_endpoint_descriptor *ep, - u16 bufsize, int (*manage_power)(struct usb_interface *, int), - bool drain_on_open) + u16 bufsize, int (*manage_power)(struct usb_interface *, int)) { int rv = -ENOMEM; struct wdm_device *desc; @@ -891,68 +858,6 @@ static int wdm_create(struct usb_interface *intf, struct usb_endpoint_descriptor desc->manage_power = manage_power; - /* - * "drain_on_open" enables a hack to work around a firmware - * issue observed on network functions, in particular MBIM - * functions. 
- * - * Quoting section 7 of the CDC-WMC r1.1 specification: - * - * "The firmware shall interpret GetEncapsulatedResponse as a - * request to read response bytes. The firmware shall send - * the next wLength bytes from the response. The firmware - * shall allow the host to retrieve data using any number of - * GetEncapsulatedResponse requests. The firmware shall - * return a zero- length reply if there are no data bytes - * available. - * - * The firmware shall send ResponseAvailable notifications - * periodically, using any appropriate algorithm, to inform - * the host that there is data available in the reply - * buffer. The firmware is allowed to send ResponseAvailable - * notifications even if there is no data available, but - * this will obviously reduce overall performance." - * - * These requirements, although they make equally sense, are - * often not implemented by network functions. Some firmwares - * will queue data indefinitely, without ever resending a - * notification. The result is that the driver and firmware - * loses "syncronization" if the driver ever fails to respond - * to a single notification, something which easily can happen - * on release(). When this happens, the driver will appear to - * never receive notifications for the most current data. Each - * notification will only cause a single read, which returns - * the oldest data in the firmware's queue. - * - * The "drain_on_open" hack resolves the situation by draining - * data from the firmware until none is returned, without a - * prior notification. - * - * This will inevitably race with the firmware, risking that - * we read data from the device before handling the associated - * notification. To make things worse, some of the devices - * needing the hack do not implement the "return zero if no - * data is available" requirement either. Instead they return - * an error on the subsequent read in this case. This means - * that "winning" the race can cause an unexpected EIO to - * userspace. - * - * "winning" the race is more likely on resume() than on - * open(), and the unexpected error is more harmful in the - * middle of an open session. The hack is therefore only - * applied on open(), and not on resume() where it logically - * would be equally necessary. So we define open() as the only - * driver <-> device "syncronization point". Should we happen - * to lose a notification after open(), then syncronization - * will be lost until release() - * - * The hack should not be enabled for CDC WDM devices - * conforming to the CDC-WMC r1.1 specification. This is - * ensured by setting drain_on_open to false in wdm_probe(). 
- */ - if (drain_on_open) - set_bit(WDM_DRAIN_ON_OPEN, &desc->flags); - spin_lock(&wdm_device_list_lock); list_add(&desc->device_list, &wdm_device_list); spin_unlock(&wdm_device_list_lock); @@ -1006,7 +911,7 @@ static int wdm_probe(struct usb_interface *intf, const struct usb_device_id *id) goto err; ep = &iface->endpoint[0].desc; - rv = wdm_create(intf, ep, maxcom, &wdm_manage_power, false); + rv = wdm_create(intf, ep, maxcom, &wdm_manage_power); err: return rv; @@ -1038,7 +943,7 @@ struct usb_driver *usb_cdc_wdm_register(struct usb_interface *intf, { int rv = -EINVAL; - rv = wdm_create(intf, ep, bufsize, manage_power, true); + rv = wdm_create(intf, ep, bufsize, manage_power); if (rv < 0) goto err; diff --git a/drivers/usb/class/usblp.c b/drivers/usb/class/usblp.c index cc61055fb9befcbc00a6a32b85996518d1aebbe4..fb87c17ed6faf8302c1eebd210a10be4a23f69ce 100644 --- a/drivers/usb/class/usblp.c +++ b/drivers/usb/class/usblp.c @@ -294,7 +294,7 @@ static int usblp_ctrl_msg(struct usblp *usblp, int request, int type, int dir, i /* * See the description for usblp_select_alts() below for the usage - * explanation. Look into your /proc/bus/usb/devices and dmesg in + * explanation. Look into your /sys/kernel/debug/usb/devices and dmesg in * case of any trouble. */ static int proto_bias = -1; @@ -1239,8 +1239,9 @@ static int usblp_select_alts(struct usblp *usblp) { struct usb_interface *if_alt; struct usb_host_interface *ifd; - struct usb_endpoint_descriptor *epd, *epwrite, *epread; - int p, i, e; + struct usb_endpoint_descriptor *epwrite, *epread; + int p, i; + int res; if_alt = usblp->intf; @@ -1260,31 +1261,21 @@ static int usblp_select_alts(struct usblp *usblp) ifd->desc.bInterfaceProtocol > USBLP_LAST_PROTOCOL) continue; - /* Look for bulk OUT and IN endpoints. */ - epwrite = epread = NULL; - for (e = 0; e < ifd->desc.bNumEndpoints; e++) { - epd = &ifd->endpoint[e].desc; - - if (usb_endpoint_is_bulk_out(epd)) - if (!epwrite) - epwrite = epd; - - if (usb_endpoint_is_bulk_in(epd)) - if (!epread) - epread = epd; + /* Look for the expected bulk endpoints. */ + if (ifd->desc.bInterfaceProtocol > 1) { + res = usb_find_common_endpoints(ifd, + &epread, &epwrite, NULL, NULL); + } else { + epread = NULL; + res = usb_find_bulk_out_endpoint(ifd, &epwrite); } /* Ignore buggy hardware without the right endpoints. */ - if (!epwrite || (ifd->desc.bInterfaceProtocol > 1 && !epread)) + if (res) continue; - /* - * Turn off reads for USB_CLASS_PRINTER/1/1 (unidirectional) - * interfaces and buggy bidirectional printers. - */ - if (ifd->desc.bInterfaceProtocol == 1) { - epread = NULL; - } else if (usblp->quirks & USBLP_QUIRK_BIDIR) { + /* Turn off reads for buggy bidirectional printers. 
*/ + if (usblp->quirks & USBLP_QUIRK_BIDIR) { printk(KERN_INFO "usblp%d: Disabling reads from " "problematic bidirectional printer\n", usblp->minor); diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c index 8fb309a0ff6b5dae0c6f867cd323585aeff17252..578f424decc22b9f340732f6f381fc0d23135e72 100644 --- a/drivers/usb/class/usbtmc.c +++ b/drivers/usb/class/usbtmc.c @@ -1375,7 +1375,7 @@ static int usbtmc_probe(struct usb_interface *intf, { struct usbtmc_device_data *data; struct usb_host_interface *iface_desc; - struct usb_endpoint_descriptor *endpoint; + struct usb_endpoint_descriptor *bulk_in, *bulk_out, *int_in; int n; int retcode; @@ -1421,49 +1421,29 @@ static int usbtmc_probe(struct usb_interface *intf, iface_desc = data->intf->cur_altsetting; data->ifnum = iface_desc->desc.bInterfaceNumber; - /* Find bulk in endpoint */ - for (n = 0; n < iface_desc->desc.bNumEndpoints; n++) { - endpoint = &iface_desc->endpoint[n].desc; - - if (usb_endpoint_is_bulk_in(endpoint)) { - data->bulk_in = endpoint->bEndpointAddress; - dev_dbg(&intf->dev, "Found bulk in endpoint at %u\n", - data->bulk_in); - break; - } - } - - /* Find bulk out endpoint */ - for (n = 0; n < iface_desc->desc.bNumEndpoints; n++) { - endpoint = &iface_desc->endpoint[n].desc; - - if (usb_endpoint_is_bulk_out(endpoint)) { - data->bulk_out = endpoint->bEndpointAddress; - dev_dbg(&intf->dev, "Found Bulk out endpoint at %u\n", - data->bulk_out); - break; - } - } - - if (!data->bulk_out || !data->bulk_in) { + /* Find bulk endpoints */ + retcode = usb_find_common_endpoints(iface_desc, + &bulk_in, &bulk_out, NULL, NULL); + if (retcode) { dev_err(&intf->dev, "bulk endpoints not found\n"); - retcode = -ENODEV; goto err_put; } + data->bulk_in = bulk_in->bEndpointAddress; + dev_dbg(&intf->dev, "Found bulk in endpoint at %u\n", data->bulk_in); + + data->bulk_out = bulk_out->bEndpointAddress; + dev_dbg(&intf->dev, "Found Bulk out endpoint at %u\n", data->bulk_out); + /* Find int endpoint */ - for (n = 0; n < iface_desc->desc.bNumEndpoints; n++) { - endpoint = &iface_desc->endpoint[n].desc; - - if (usb_endpoint_is_int_in(endpoint)) { - data->iin_ep_present = 1; - data->iin_ep = endpoint->bEndpointAddress; - data->iin_wMaxPacketSize = usb_endpoint_maxp(endpoint); - data->iin_interval = endpoint->bInterval; - dev_dbg(&intf->dev, "Found Int in endpoint at %u\n", + retcode = usb_find_int_in_endpoint(iface_desc, &int_in); + if (!retcode) { + data->iin_ep_present = 1; + data->iin_ep = int_in->bEndpointAddress; + data->iin_wMaxPacketSize = usb_endpoint_maxp(int_in); + data->iin_interval = int_in->bInterval; + dev_dbg(&intf->dev, "Found Int in endpoint at %u\n", data->iin_ep); - break; - } } retcode = get_capabilities(data); diff --git a/drivers/usb/common/ulpi.c b/drivers/usb/common/ulpi.c index c9480d77810c9ff93db21c18ac262a3a046c62b2..930e8f35f8df39255be60e2e8267f3acff72b2f9 100644 --- a/drivers/usb/common/ulpi.c +++ b/drivers/usb/common/ulpi.c @@ -107,7 +107,7 @@ static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, int len; struct ulpi *ulpi = to_ulpi_dev(dev); - len = of_device_get_modalias(dev, buf, PAGE_SIZE - 1); + len = of_device_modalias(dev, buf, PAGE_SIZE); if (len != -ENODEV) return len; diff --git a/drivers/usb/common/usb-otg-fsm.c b/drivers/usb/common/usb-otg-fsm.c index 2f537bbdda093d90285789cd92f77da608dbbe14..b8fe31e409a5d6d13f04995a0707fafcf1e9daa0 100644 --- a/drivers/usb/common/usb-otg-fsm.c +++ b/drivers/usb/common/usb-otg-fsm.c @@ -31,6 +31,13 @@ #include #include +#ifdef VERBOSE +#define 
VDBG(fmt, args...) pr_debug("[%s] " fmt, \ + __func__, ## args) +#else +#define VDBG(stuff...) do {} while (0) +#endif + /* Change USB protocol when there is a protocol change */ static int otg_set_protocol(struct otg_fsm *fsm, int protocol) { diff --git a/drivers/usb/core/Kconfig b/drivers/usb/core/Kconfig index 0e5a889742b34b5ee6f55597741aeeb70f984527..4d75d9a80001eda9039d853871969938b4b82a41 100644 --- a/drivers/usb/core/Kconfig +++ b/drivers/usb/core/Kconfig @@ -26,7 +26,7 @@ config USB_DEFAULT_PERSIST unplugged, causing any mounted filesystems to be lost. The persist feature can still be enabled for individual devices through the power/persist sysfs node. See - Documentation/usb/persist.txt for more info. + Documentation/driver-api/usb/persist.rst for more info. If you have any questions about this, say Y here, only say N if you know exactly what you are doing. diff --git a/drivers/usb/core/Makefile b/drivers/usb/core/Makefile index b99b871c4b9d9b05c2f4a2f843485fc4ca965506..250ec1d662d9ecbb53ec59449f5cd229a5071e77 100644 --- a/drivers/usb/core/Makefile +++ b/drivers/usb/core/Makefile @@ -8,7 +8,7 @@ usbcore-y += devio.o notify.o generic.o quirks.o devices.o usbcore-y += port.o usbcore-$(CONFIG_OF) += of.o -usbcore-$(CONFIG_PCI) += hcd-pci.o +usbcore-$(CONFIG_USB_PCI) += hcd-pci.o usbcore-$(CONFIG_ACPI) += usb-acpi.o obj-$(CONFIG_USB) += usbcore.o diff --git a/drivers/usb/core/buffer.c b/drivers/usb/core/buffer.c index b9bf6e2eb6feabc49daebff549b361d06622d8e9..b64568cf572c9eade1bb5121c4118760bd845ef5 100644 --- a/drivers/usb/core/buffer.c +++ b/drivers/usb/core/buffer.c @@ -66,7 +66,7 @@ int hcd_buffer_create(struct usb_hcd *hcd) int i, size; if (!IS_ENABLED(CONFIG_HAS_DMA) || - (!hcd->self.controller->dma_mask && + (!is_device_dma_capable(hcd->self.sysdev) && !(hcd->driver->flags & HCD_LOCAL_MEM))) return 0; @@ -75,7 +75,7 @@ int hcd_buffer_create(struct usb_hcd *hcd) if (!size) continue; snprintf(name, sizeof(name), "buffer-%d", size); - hcd->pool[i] = dma_pool_create(name, hcd->self.controller, + hcd->pool[i] = dma_pool_create(name, hcd->self.sysdev, size, size, 0); if (!hcd->pool[i]) { hcd_buffer_destroy(hcd); @@ -130,7 +130,7 @@ void *hcd_buffer_alloc( /* some USB hosts just use PIO */ if (!IS_ENABLED(CONFIG_HAS_DMA) || - (!bus->controller->dma_mask && + (!is_device_dma_capable(bus->sysdev) && !(hcd->driver->flags & HCD_LOCAL_MEM))) { *dma = ~(dma_addr_t) 0; return kmalloc(size, mem_flags); @@ -140,7 +140,7 @@ void *hcd_buffer_alloc( if (size <= pool_max[i]) return dma_pool_alloc(hcd->pool[i], mem_flags, dma); } - return dma_alloc_coherent(hcd->self.controller, size, dma, mem_flags); + return dma_alloc_coherent(hcd->self.sysdev, size, dma, mem_flags); } void hcd_buffer_free( @@ -157,7 +157,7 @@ void hcd_buffer_free( return; if (!IS_ENABLED(CONFIG_HAS_DMA) || - (!bus->controller->dma_mask && + (!is_device_dma_capable(bus->sysdev) && !(hcd->driver->flags & HCD_LOCAL_MEM))) { kfree(addr); return; @@ -169,5 +169,5 @@ void hcd_buffer_free( return; } } - dma_free_coherent(hcd->self.controller, size, addr, dma); + dma_free_coherent(hcd->self.sysdev, size, addr, dma); } diff --git a/drivers/usb/core/devices.c b/drivers/usb/core/devices.c index f2987ddb1cde3492189f21caa6b23e807218abe2..55dea2e7828fe027c632444c7888c29c204e47c1 100644 --- a/drivers/usb/core/devices.c +++ b/drivers/usb/core/devices.c @@ -24,7 +24,7 @@ * /devices contains USB topology, device, config, class, * interface, & endpoint data. 
* - * I considered using /proc/bus/usb/devices/device# for each device + * I considered using /dev/bus/usb/device# for each device * as it is attached or detached, but I didn't like this for some * reason -- maybe it's just too deep of a directory structure. * I also don't like looking in multiple places to gather and view @@ -40,7 +40,7 @@ * Converted the whole proc stuff to real * read methods. Now not the whole device list needs to fit * into one page, only the device list for one bus. - * Added a poll method to /proc/bus/usb/devices, to wake + * Added a poll method to /sys/kernel/debug/usb/devices, to wake * up an eventual usbd * 2000-01-04: Thomas Sailer * Turned into its own filesystem diff --git a/drivers/usb/core/driver.c b/drivers/usb/core/driver.c index cdee5130638b109e5be899c41b8a5f432620c4e3..eb87a259d55c3abcc4d54e929cba861a0b429fae 100644 --- a/drivers/usb/core/driver.c +++ b/drivers/usb/core/driver.c @@ -1331,6 +1331,24 @@ static int usb_suspend_both(struct usb_device *udev, pm_message_t msg) */ if (udev->parent && !PMSG_IS_AUTO(msg)) status = 0; + + /* + * If the device is inaccessible, don't try to resume + * suspended interfaces and just return the error. + */ + if (status && status != -EBUSY) { + int err; + u16 devstat; + + err = usb_get_status(udev, USB_RECIP_DEVICE, 0, + &devstat); + if (err) { + dev_err(&udev->dev, + "Failed to suspend device, error %d\n", + status); + goto done; + } + } } /* If the suspend failed, resume interfaces that did get suspended */ @@ -1763,6 +1781,9 @@ static int autosuspend_check(struct usb_device *udev) int w, i; struct usb_interface *intf; + if (udev->state == USB_STATE_NOTATTACHED) + return -ENODEV; + /* Fail if autosuspend is disabled, or any interfaces are in use, or * any interface drivers require remote wakeup but it isn't available. */ diff --git a/drivers/usb/core/file.c b/drivers/usb/core/file.c index e26bd5e773ada0685329ec8ed65084ff72d7242d..87ad6b6bfee8f724cf0400633a352b6d7ae21797 100644 --- a/drivers/usb/core/file.c +++ b/drivers/usb/core/file.c @@ -29,6 +29,7 @@ #define MAX_USB_MINORS 256 static const struct file_operations *usb_minors[MAX_USB_MINORS]; static DECLARE_RWSEM(minor_rwsem); +static DEFINE_MUTEX(init_usb_class_mutex); static int usb_open(struct inode *inode, struct file *file) { @@ -111,8 +112,9 @@ static void release_usb_class(struct kref *kref) static void destroy_usb_class(void) { - if (usb_class) - kref_put(&usb_class->kref, release_usb_class); + mutex_lock(&init_usb_class_mutex); + kref_put(&usb_class->kref, release_usb_class); + mutex_unlock(&init_usb_class_mutex); } int usb_major_init(void) @@ -173,7 +175,10 @@ int usb_register_dev(struct usb_interface *intf, if (intf->minor >= 0) return -EADDRINUSE; + mutex_lock(&init_usb_class_mutex); retval = init_usb_class(); + mutex_unlock(&init_usb_class_mutex); + if (retval) return retval; diff --git a/drivers/usb/core/hcd.c b/drivers/usb/core/hcd.c index 79bdca5cb9c7ae8f01b1f4a25c7ceacd9c716e2c..49550790a3cba255df329dd73ca6e08e17a13af7 100644 --- a/drivers/usb/core/hcd.c +++ b/drivers/usb/core/hcd.c @@ -1076,6 +1076,7 @@ static void usb_deregister_bus (struct usb_bus *bus) static int register_root_hub(struct usb_hcd *hcd) { struct device *parent_dev = hcd->self.controller; + struct device *sysdev = hcd->self.sysdev; struct usb_device *usb_dev = hcd->self.root_hub; const int devnum = 1; int retval; @@ -1122,7 +1123,7 @@ static int register_root_hub(struct usb_hcd *hcd) /* Did the HC die before the root hub was registered? 
*/ if (HCD_DEAD(hcd)) usb_hc_died (hcd); /* This time clean up */ - usb_dev->dev.of_node = parent_dev->of_node; + usb_dev->dev.of_node = sysdev->of_node; } mutex_unlock(&usb_bus_idr_lock); @@ -1435,7 +1436,7 @@ void usb_hcd_unmap_urb_setup_for_dma(struct usb_hcd *hcd, struct urb *urb) { if (IS_ENABLED(CONFIG_HAS_DMA) && (urb->transfer_flags & URB_SETUP_MAP_SINGLE)) - dma_unmap_single(hcd->self.controller, + dma_unmap_single(hcd->self.sysdev, urb->setup_dma, sizeof(struct usb_ctrlrequest), DMA_TO_DEVICE); @@ -1468,19 +1469,19 @@ void usb_hcd_unmap_urb_for_dma(struct usb_hcd *hcd, struct urb *urb) dir = usb_urb_dir_in(urb) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; if (IS_ENABLED(CONFIG_HAS_DMA) && (urb->transfer_flags & URB_DMA_MAP_SG)) - dma_unmap_sg(hcd->self.controller, + dma_unmap_sg(hcd->self.sysdev, urb->sg, urb->num_sgs, dir); else if (IS_ENABLED(CONFIG_HAS_DMA) && (urb->transfer_flags & URB_DMA_MAP_PAGE)) - dma_unmap_page(hcd->self.controller, + dma_unmap_page(hcd->self.sysdev, urb->transfer_dma, urb->transfer_buffer_length, dir); else if (IS_ENABLED(CONFIG_HAS_DMA) && (urb->transfer_flags & URB_DMA_MAP_SINGLE)) - dma_unmap_single(hcd->self.controller, + dma_unmap_single(hcd->self.sysdev, urb->transfer_dma, urb->transfer_buffer_length, dir); @@ -1523,11 +1524,11 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, return ret; if (IS_ENABLED(CONFIG_HAS_DMA) && hcd->self.uses_dma) { urb->setup_dma = dma_map_single( - hcd->self.controller, + hcd->self.sysdev, urb->setup_packet, sizeof(struct usb_ctrlrequest), DMA_TO_DEVICE); - if (dma_mapping_error(hcd->self.controller, + if (dma_mapping_error(hcd->self.sysdev, urb->setup_dma)) return -EAGAIN; urb->transfer_flags |= URB_SETUP_MAP_SINGLE; @@ -1558,7 +1559,7 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, } n = dma_map_sg( - hcd->self.controller, + hcd->self.sysdev, urb->sg, urb->num_sgs, dir); @@ -1573,12 +1574,12 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, } else if (urb->sg) { struct scatterlist *sg = urb->sg; urb->transfer_dma = dma_map_page( - hcd->self.controller, + hcd->self.sysdev, sg_page(sg), sg->offset, urb->transfer_buffer_length, dir); - if (dma_mapping_error(hcd->self.controller, + if (dma_mapping_error(hcd->self.sysdev, urb->transfer_dma)) ret = -EAGAIN; else @@ -1588,11 +1589,11 @@ int usb_hcd_map_urb_for_dma(struct usb_hcd *hcd, struct urb *urb, ret = -EAGAIN; } else { urb->transfer_dma = dma_map_single( - hcd->self.controller, + hcd->self.sysdev, urb->transfer_buffer, urb->transfer_buffer_length, dir); - if (dma_mapping_error(hcd->self.controller, + if (dma_mapping_error(hcd->self.sysdev, urb->transfer_dma)) ret = -EAGAIN; else @@ -2498,24 +2499,8 @@ static void init_giveback_urb_bh(struct giveback_urb_bh *bh) tasklet_init(&bh->bh, usb_giveback_urb_bh, (unsigned long)bh); } -/** - * usb_create_shared_hcd - create and initialize an HCD structure - * @driver: HC driver that will use this hcd - * @dev: device for this HC, stored in hcd->self.controller - * @bus_name: value to store in hcd->self.bus_name - * @primary_hcd: a pointer to the usb_hcd structure that is sharing the - * PCI device. Only allocate certain resources for the primary HCD - * Context: !in_interrupt() - * - * Allocate a struct usb_hcd, with extra space at the end for the - * HC driver's private data. Initialize the generic members of the - * hcd structure. - * - * Return: On success, a pointer to the created and initialized HCD structure. - * On failure (e.g. if memory is unavailable), %NULL. 
- */ -struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, - struct device *dev, const char *bus_name, +struct usb_hcd *__usb_create_hcd(const struct hc_driver *driver, + struct device *sysdev, struct device *dev, const char *bus_name, struct usb_hcd *primary_hcd) { struct usb_hcd *hcd; @@ -2556,8 +2541,9 @@ struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, usb_bus_init(&hcd->self); hcd->self.controller = dev; + hcd->self.sysdev = sysdev; hcd->self.bus_name = bus_name; - hcd->self.uses_dma = (dev->dma_mask != NULL); + hcd->self.uses_dma = (sysdev->dma_mask != NULL); init_timer(&hcd->rh_timer); hcd->rh_timer.function = rh_timer_func; @@ -2572,6 +2558,30 @@ struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, "USB Host Controller"; return hcd; } +EXPORT_SYMBOL_GPL(__usb_create_hcd); + +/** + * usb_create_shared_hcd - create and initialize an HCD structure + * @driver: HC driver that will use this hcd + * @dev: device for this HC, stored in hcd->self.controller + * @bus_name: value to store in hcd->self.bus_name + * @primary_hcd: a pointer to the usb_hcd structure that is sharing the + * PCI device. Only allocate certain resources for the primary HCD + * Context: !in_interrupt() + * + * Allocate a struct usb_hcd, with extra space at the end for the + * HC driver's private data. Initialize the generic members of the + * hcd structure. + * + * Return: On success, a pointer to the created and initialized HCD structure. + * On failure (e.g. if memory is unavailable), %NULL. + */ +struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, + struct device *dev, const char *bus_name, + struct usb_hcd *primary_hcd) +{ + return __usb_create_hcd(driver, dev, dev, bus_name, primary_hcd); +} EXPORT_SYMBOL_GPL(usb_create_shared_hcd); /** @@ -2591,7 +2601,7 @@ EXPORT_SYMBOL_GPL(usb_create_shared_hcd); struct usb_hcd *usb_create_hcd(const struct hc_driver *driver, struct device *dev, const char *bus_name) { - return usb_create_shared_hcd(driver, dev, bus_name, NULL); + return __usb_create_hcd(driver, dev, dev, bus_name, NULL); } EXPORT_SYMBOL_GPL(usb_create_hcd); @@ -2718,7 +2728,7 @@ int usb_add_hcd(struct usb_hcd *hcd, struct usb_device *rhdev; if (IS_ENABLED(CONFIG_USB_PHY) && !hcd->usb_phy) { - struct usb_phy *phy = usb_get_phy_dev(hcd->self.controller, 0); + struct usb_phy *phy = usb_get_phy_dev(hcd->self.sysdev, 0); if (IS_ERR(phy)) { retval = PTR_ERR(phy); @@ -2736,7 +2746,7 @@ int usb_add_hcd(struct usb_hcd *hcd, } if (IS_ENABLED(CONFIG_GENERIC_PHY) && !hcd->phy) { - struct phy *phy = phy_get(hcd->self.controller, "usb"); + struct phy *phy = phy_get(hcd->self.sysdev, "usb"); if (IS_ERR(phy)) { retval = PTR_ERR(phy); @@ -2784,7 +2794,7 @@ int usb_add_hcd(struct usb_hcd *hcd, */ retval = hcd_buffer_create(hcd); if (retval != 0) { - dev_dbg(hcd->self.controller, "pool alloc failed\n"); + dev_dbg(hcd->self.sysdev, "pool alloc failed\n"); goto err_create_buf; } @@ -2794,7 +2804,7 @@ int usb_add_hcd(struct usb_hcd *hcd, rhdev = usb_alloc_dev(NULL, &hcd->self, 0); if (rhdev == NULL) { - dev_err(hcd->self.controller, "unable to allocate root hub\n"); + dev_err(hcd->self.sysdev, "unable to allocate root hub\n"); retval = -ENOMEM; goto err_allocate_root_hub; } diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index 5286bf67869a83e1d7e1d3f1ca0ebc87db5cf7a4..9dca59ef18b3454437da9affb5e6d93619e88ba3 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -1066,6 +1066,9 @@ static void hub_activate(struct usb_hub *hub, enum 
hub_activation_type type) portstatus = portchange = 0; status = hub_port_status(hub, port1, &portstatus, &portchange); + if (status) + goto abort; + if (udev || (portstatus & USB_PORT_STAT_CONNECTION)) dev_dbg(&port_dev->dev, "status %04x change %04x\n", portstatus, portchange); @@ -1198,7 +1201,7 @@ static void hub_activate(struct usb_hub *hub, enum hub_activation_type type) /* Scan all ports that need attention */ kick_hub_wq(hub); - + abort: if (type == HUB_INIT2 || type == HUB_INIT3) { /* Allow autosuspend if it was suppressed */ disconnected: @@ -2084,6 +2087,12 @@ void usb_disconnect(struct usb_device **pdev) dev_info(&udev->dev, "USB disconnect, device number %d\n", udev->devnum); + /* + * Ensure that the pm runtime code knows that the USB device + * is in the process of being disconnected. + */ + pm_runtime_barrier(&udev->dev); + usb_lock_device(udev); hub_disconnect_children(udev); diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index 2184ef40a82ae494c11e82b4090545bd0dad30c8..4c38ea41ae969e6fac8ced209a655957322ba36f 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -474,6 +474,7 @@ EXPORT_SYMBOL_GPL(usb_sg_init); * significantly improve USB throughput. * * There are three kinds of completion for this function. + * * (1) success, where io->status is zero. The number of io->bytes * transferred is as requested. * (2) error, where io->status is a negative errno value. The number diff --git a/drivers/usb/core/of.c b/drivers/usb/core/of.c index 3de4f8873984d2d04749f2f87b8b5c0fb2b9f59b..d787f195a9a682c99e6289166d364d9aed110b2b 100644 --- a/drivers/usb/core/of.c +++ b/drivers/usb/core/of.c @@ -18,6 +18,7 @@ */ #include <linux/of.h> +#include <linux/of_platform.h> #include <linux/usb/of.h> /** @@ -46,3 +47,25 @@ struct device_node *usb_of_get_child_node(struct device_node *parent, } EXPORT_SYMBOL_GPL(usb_of_get_child_node); +/** + * usb_of_get_companion_dev - Find the companion device + * @dev: the device pointer to find a companion + * + * Find the companion device from the platform bus. + * + * Return: On success, a pointer to the companion device, %NULL on failure. + */ +struct device *usb_of_get_companion_dev(struct device *dev) +{ + struct device_node *node; + struct platform_device *pdev = NULL; + + node = of_parse_phandle(dev->of_node, "companion", 0); + if (node) + pdev = of_find_device_by_node(node); + + of_node_put(node); + + return pdev ?
&pdev->dev : NULL; +} +EXPORT_SYMBOL_GPL(usb_of_get_companion_dev); diff --git a/drivers/usb/core/usb.c b/drivers/usb/core/usb.c index a2ccc69fb45c092a5220474c64b03418133f2178..28b053cacc90556acf7df562fdc502c51b25a02e 100644 --- a/drivers/usb/core/usb.c +++ b/drivers/usb/core/usb.c @@ -74,6 +74,140 @@ MODULE_PARM_DESC(autosuspend, "default autosuspend delay"); #define usb_autosuspend_delay 0 #endif +static bool match_endpoint(struct usb_endpoint_descriptor *epd, + struct usb_endpoint_descriptor **bulk_in, + struct usb_endpoint_descriptor **bulk_out, + struct usb_endpoint_descriptor **int_in, + struct usb_endpoint_descriptor **int_out) +{ + switch (usb_endpoint_type(epd)) { + case USB_ENDPOINT_XFER_BULK: + if (usb_endpoint_dir_in(epd)) { + if (bulk_in && !*bulk_in) { + *bulk_in = epd; + break; + } + } else { + if (bulk_out && !*bulk_out) { + *bulk_out = epd; + break; + } + } + + return false; + case USB_ENDPOINT_XFER_INT: + if (usb_endpoint_dir_in(epd)) { + if (int_in && !*int_in) { + *int_in = epd; + break; + } + } else { + if (int_out && !*int_out) { + *int_out = epd; + break; + } + } + + return false; + default: + return false; + } + + return (!bulk_in || *bulk_in) && (!bulk_out || *bulk_out) && + (!int_in || *int_in) && (!int_out || *int_out); +} + +/** + * usb_find_common_endpoints() -- look up common endpoint descriptors + * @alt: alternate setting to search + * @bulk_in: pointer to descriptor pointer, or NULL + * @bulk_out: pointer to descriptor pointer, or NULL + * @int_in: pointer to descriptor pointer, or NULL + * @int_out: pointer to descriptor pointer, or NULL + * + * Search the alternate setting's endpoint descriptors for the first bulk-in, + * bulk-out, interrupt-in and interrupt-out endpoints and return them in the + * provided pointers (unless they are NULL). + * + * If a requested endpoint is not found, the corresponding pointer is set to + * NULL. + * + * Return: Zero if all requested descriptors were found, or -ENXIO otherwise. + */ +int usb_find_common_endpoints(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_in, + struct usb_endpoint_descriptor **bulk_out, + struct usb_endpoint_descriptor **int_in, + struct usb_endpoint_descriptor **int_out) +{ + struct usb_endpoint_descriptor *epd; + int i; + + if (bulk_in) + *bulk_in = NULL; + if (bulk_out) + *bulk_out = NULL; + if (int_in) + *int_in = NULL; + if (int_out) + *int_out = NULL; + + for (i = 0; i < alt->desc.bNumEndpoints; ++i) { + epd = &alt->endpoint[i].desc; + + if (match_endpoint(epd, bulk_in, bulk_out, int_in, int_out)) + return 0; + } + + return -ENXIO; +} +EXPORT_SYMBOL_GPL(usb_find_common_endpoints); + +/** + * usb_find_common_endpoints_reverse() -- look up common endpoint descriptors + * @alt: alternate setting to search + * @bulk_in: pointer to descriptor pointer, or NULL + * @bulk_out: pointer to descriptor pointer, or NULL + * @int_in: pointer to descriptor pointer, or NULL + * @int_out: pointer to descriptor pointer, or NULL + * + * Search the alternate setting's endpoint descriptors for the last bulk-in, + * bulk-out, interrupt-in and interrupt-out endpoints and return them in the + * provided pointers (unless they are NULL). + * + * If a requested endpoint is not found, the corresponding pointer is set to + * NULL. + * + * Return: Zero if all requested descriptors were found, or -ENXIO otherwise. 
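usb_find_common_endpoints() and its _reverse variant let interface drivers replace the open-coded endpoint-scanning loops found in many probe routines. A hedged sketch of a typical call site (the foo_probe() driver is made up; only the helper itself comes from this series):

/* Hypothetical interface driver probe: needs one bulk-in and one
 * bulk-out endpoint, does not care about interrupt endpoints. */
static int foo_probe(struct usb_interface *intf,
		     const struct usb_device_id *id)
{
	struct usb_endpoint_descriptor *bulk_in, *bulk_out;
	int ret;

	ret = usb_find_common_endpoints(intf->cur_altsetting,
					&bulk_in, &bulk_out, NULL, NULL);
	if (ret) {
		dev_err(&intf->dev, "required endpoints not found\n");
		return ret;
	}

	/* bulk_in->bEndpointAddress etc. can now be used to set up URBs */
	return 0;
}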
+ */ +int usb_find_common_endpoints_reverse(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_in, + struct usb_endpoint_descriptor **bulk_out, + struct usb_endpoint_descriptor **int_in, + struct usb_endpoint_descriptor **int_out) +{ + struct usb_endpoint_descriptor *epd; + int i; + + if (bulk_in) + *bulk_in = NULL; + if (bulk_out) + *bulk_out = NULL; + if (int_in) + *int_in = NULL; + if (int_out) + *int_out = NULL; + + for (i = alt->desc.bNumEndpoints - 1; i >= 0; --i) { + epd = &alt->endpoint[i].desc; + + if (match_endpoint(epd, bulk_in, bulk_out, int_in, int_out)) + return 0; + } + + return -ENXIO; +} +EXPORT_SYMBOL_GPL(usb_find_common_endpoints_reverse); /** * usb_find_alt_setting() - Given a configuration, find the alternate setting @@ -453,9 +587,9 @@ struct usb_device *usb_alloc_dev(struct usb_device *parent, * Note: calling dma_set_mask() on a USB device would set the * mask for the entire HCD, so don't do that. */ - dev->dev.dma_mask = bus->controller->dma_mask; - dev->dev.dma_pfn_offset = bus->controller->dma_pfn_offset; - set_dev_node(&dev->dev, dev_to_node(bus->controller)); + dev->dev.dma_mask = bus->sysdev->dma_mask; + dev->dev.dma_pfn_offset = bus->sysdev->dma_pfn_offset; + set_dev_node(&dev->dev, dev_to_node(bus->sysdev)); dev->state = USB_STATE_ATTACHED; dev->lpm_disable_count = 1; atomic_set(&dev->urbnum, 0); @@ -803,7 +937,7 @@ struct urb *usb_buffer_map(struct urb *urb) if (!urb || !urb->dev || !(bus = urb->dev->bus) - || !(controller = bus->controller)) + || !(controller = bus->sysdev)) return NULL; if (controller->dma_mask) { @@ -841,7 +975,7 @@ void usb_buffer_dmasync(struct urb *urb) || !(urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) || !urb->dev || !(bus = urb->dev->bus) - || !(controller = bus->controller)) + || !(controller = bus->sysdev)) return; if (controller->dma_mask) { @@ -875,7 +1009,7 @@ void usb_buffer_unmap(struct urb *urb) || !(urb->transfer_flags & URB_NO_TRANSFER_DMA_MAP) || !urb->dev || !(bus = urb->dev->bus) - || !(controller = bus->controller)) + || !(controller = bus->sysdev)) return; if (controller->dma_mask) { @@ -925,7 +1059,7 @@ int usb_buffer_map_sg(const struct usb_device *dev, int is_in, if (!dev || !(bus = dev->bus) - || !(controller = bus->controller) + || !(controller = bus->sysdev) || !controller->dma_mask) return -EINVAL; @@ -961,7 +1095,7 @@ void usb_buffer_dmasync_sg(const struct usb_device *dev, int is_in, if (!dev || !(bus = dev->bus) - || !(controller = bus->controller) + || !(controller = bus->sysdev) || !controller->dma_mask) return; @@ -989,7 +1123,7 @@ void usb_buffer_unmap_sg(const struct usb_device *dev, int is_in, if (!dev || !(bus = dev->bus) - || !(controller = bus->controller) + || !(controller = bus->sysdev) || !controller->dma_mask) return; diff --git a/drivers/usb/dwc2/Kconfig b/drivers/usb/dwc2/Kconfig index e838701d6dd54e624c356171833b929889027c38..b6a495e98fd848a4965f586ca216beab346f53fc 100644 --- a/drivers/usb/dwc2/Kconfig +++ b/drivers/usb/dwc2/Kconfig @@ -54,7 +54,7 @@ endchoice config USB_DWC2_PCI tristate "DWC2 PCI" - depends on PCI + depends on USB_PCI depends on USB_GADGET || !USB_GADGET default n select NOP_USB_XCEIV diff --git a/drivers/usb/dwc2/core.h b/drivers/usb/dwc2/core.h index 1a7e8300508251f8b0c2a7274acadc63c9c42111..8367d4f985c1220773a7c6efc38df94d837c7f28 100644 --- a/drivers/usb/dwc2/core.h +++ b/drivers/usb/dwc2/core.h @@ -423,6 +423,10 @@ enum dwc2_ep0_state { * needed. 
* 0 - No (default) * 1 - Yes + * @activate_stm_fs_transceiver: Activate internal transceiver using GGPIO + * register. + * 0 - Deactivate the transceiver (default) + * 1 - Activate the transceiver * @g_dma: Enables gadget dma usage (default: autodetect). * @g_dma_desc: Enables gadget descriptor DMA (default: autodetect). * @g_rx_fifo_size: The periodic rx fifo size for the device, in @@ -477,6 +481,7 @@ struct dwc2_core_params { bool uframe_sched; bool external_id_pin_ctl; bool hibernation; + bool activate_stm_fs_transceiver; u16 max_packet_count; u32 max_transfer_size; u32 ahbcfg; diff --git a/drivers/usb/dwc2/hcd.c b/drivers/usb/dwc2/hcd.c index a73722e27d07835f6a02f53fbc702c7e27fcf11a..740c7e86d31b912bf0f3c861a687b369acdc5b3d 100644 --- a/drivers/usb/dwc2/hcd.c +++ b/drivers/usb/dwc2/hcd.c @@ -121,7 +121,7 @@ static void dwc2_init_fs_ls_pclk_sel(struct dwc2_hsotg *hsotg) static int dwc2_fs_phy_init(struct dwc2_hsotg *hsotg, bool select_phy) { - u32 usbcfg, i2cctl; + u32 usbcfg, ggpio, i2cctl; int retval = 0; /* @@ -145,6 +145,19 @@ static int dwc2_fs_phy_init(struct dwc2_hsotg *hsotg, bool select_phy) return retval; } } + + if (hsotg->params.activate_stm_fs_transceiver) { + ggpio = dwc2_readl(hsotg->regs + GGPIO); + if (!(ggpio & GGPIO_STM32_OTG_GCCFG_PWRDWN)) { + dev_dbg(hsotg->dev, "Activating transceiver\n"); + /* + * STM32F4x9 uses the GGPIO register as general + * core configuration register. + */ + ggpio |= GGPIO_STM32_OTG_GCCFG_PWRDWN; + dwc2_writel(ggpio, hsotg->regs + GGPIO); + } + } } /* @@ -3264,6 +3277,7 @@ static void dwc2_conn_id_status_change(struct work_struct *work) dwc2_core_init(hsotg, false); dwc2_enable_global_interrupts(hsotg); spin_lock_irqsave(&hsotg->lock, flags); + dwc2_hsotg_disconnect(hsotg); dwc2_hsotg_core_init_disconnected(hsotg, false); spin_unlock_irqrestore(&hsotg->lock, flags); dwc2_hsotg_core_connect(hsotg); diff --git a/drivers/usb/dwc2/hw.h b/drivers/usb/dwc2/hw.h index bde72489ae6625b7a666f9a6e8c79e895d4d1441..4592012c47436c422e4840761717b48929e239bc 100644 --- a/drivers/usb/dwc2/hw.h +++ b/drivers/usb/dwc2/hw.h @@ -225,6 +225,8 @@ #define GPVNDCTL HSOTG_REG(0x0034) #define GGPIO HSOTG_REG(0x0038) +#define GGPIO_STM32_OTG_GCCFG_PWRDWN BIT(16) + #define GUID HSOTG_REG(0x003c) #define GSNPSID HSOTG_REG(0x0040) #define GHWCFG1 HSOTG_REG(0x0044) diff --git a/drivers/usb/dwc2/params.c b/drivers/usb/dwc2/params.c index 2990c347289fcd877b831d12cc7372ee5f2b1166..9cd8722f24f65994d83cdb0378387c021e773e5f 100644 --- a/drivers/usb/dwc2/params.c +++ b/drivers/usb/dwc2/params.c @@ -120,6 +120,22 @@ static void dwc2_set_amcc_params(struct dwc2_hsotg *hsotg) p->ahbcfg = GAHBCFG_HBSTLEN_INCR16 << GAHBCFG_HBSTLEN_SHIFT; } +static void dwc2_set_stm32f4x9_fsotg_params(struct dwc2_hsotg *hsotg) +{ + struct dwc2_core_params *p = &hsotg->params; + + p->otg_cap = DWC2_CAP_PARAM_NO_HNP_SRP_CAPABLE; + p->speed = DWC2_SPEED_PARAM_FULL; + p->host_rx_fifo_size = 128; + p->host_nperio_tx_fifo_size = 96; + p->host_perio_tx_fifo_size = 96; + p->max_packet_count = 256; + p->phy_type = DWC2_PHY_TYPE_PARAM_FS; + p->i2c_enable = false; + p->uframe_sched = false; + p->activate_stm_fs_transceiver = true; +} + const struct of_device_id dwc2_of_match_table[] = { { .compatible = "brcm,bcm2835-usb", .data = dwc2_set_bcm_params }, { .compatible = "hisilicon,hi6220-usb", .data = dwc2_set_his_params }, @@ -133,6 +149,9 @@ const struct of_device_id dwc2_of_match_table[] = { { .compatible = "amlogic,meson-gxbb-usb", .data = dwc2_set_amlogic_params }, { .compatible = "amcc,dwc-otg", .data = 
dwc2_set_amcc_params }, + { .compatible = "st,stm32f4x9-fsotg", + .data = dwc2_set_stm32f4x9_fsotg_params }, + { .compatible = "st,stm32f4x9-hsotg" }, {}, }; MODULE_DEVICE_TABLE(of, dwc2_of_match_table); diff --git a/drivers/usb/dwc2/platform.c b/drivers/usb/dwc2/platform.c index 9564bc76c56f333fbb6535bdf2487f65c6241039..daf0d37acb37f2ccd2714e6be6bc14832b1cb937 100644 --- a/drivers/usb/dwc2/platform.c +++ b/drivers/usb/dwc2/platform.c @@ -214,20 +214,11 @@ static int dwc2_lowlevel_hw_init(struct dwc2_hsotg *hsotg) hsotg->reset = devm_reset_control_get_optional(hsotg->dev, "dwc2"); if (IS_ERR(hsotg->reset)) { ret = PTR_ERR(hsotg->reset); - switch (ret) { - case -ENOENT: - case -ENOTSUPP: - hsotg->reset = NULL; - break; - default: - dev_err(hsotg->dev, "error getting reset control %d\n", - ret); - return ret; - } + dev_err(hsotg->dev, "error getting reset control %d\n", ret); + return ret; } - if (hsotg->reset) - reset_control_deassert(hsotg->reset); + reset_control_deassert(hsotg->reset); /* Set default UTMI width */ hsotg->phyif = GUSBCFG_PHYIF16; @@ -326,8 +317,7 @@ static int dwc2_driver_remove(struct platform_device *dev) if (hsotg->ll_hw_enabled) dwc2_lowlevel_hw_disable(hsotg); - if (hsotg->reset) - reset_control_assert(hsotg->reset); + reset_control_assert(hsotg->reset); return 0; } diff --git a/drivers/usb/dwc3/Kconfig b/drivers/usb/dwc3/Kconfig index c5aa235863e8c7e88b30544f8e1aba98c9a18d03..ab8c0e0d3b60d54110c6a9a0daba4c10ef086eb1 100644 --- a/drivers/usb/dwc3/Kconfig +++ b/drivers/usb/dwc3/Kconfig @@ -41,6 +41,7 @@ config USB_DWC3_GADGET config USB_DWC3_DUAL_ROLE bool "Dual Role mode" depends on ((USB=y || USB=USB_DWC3) && (USB_GADGET=y || USB_GADGET=USB_DWC3)) + depends on (EXTCON=y || EXTCON=USB_DWC3) help This is the default mode of working of DWC3 controller where both host and gadget features are enabled. 
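For reference, the .data callbacks in the dwc2 match table above take effect at probe time through the usual of_device_id pattern, roughly as sketched here (the wrapper function name is illustrative; dwc2_of_match_table and the dwc2_set_*_params callbacks are the driver's own):

/* Sketch: apply the per-SoC parameter callback if the matched
 * compatible carries one (e.g. dwc2_set_stm32f4x9_fsotg_params for
 * "st,stm32f4x9-fsotg"; "st,stm32f4x9-hsotg" has no .data and keeps
 * the autodetected defaults). */
static void dwc2_apply_soc_params(struct dwc2_hsotg *hsotg)
{
	const struct of_device_id *match;
	void (*set_params)(struct dwc2_hsotg *hsotg);

	match = of_match_device(dwc2_of_match_table, hsotg->dev);
	if (match && match->data) {
		set_params = match->data;
		set_params(hsotg);
	}
}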
@@ -70,7 +71,7 @@ config USB_DWC3_EXYNOS config USB_DWC3_PCI tristate "PCIe-based Platforms" - depends on PCI && ACPI + depends on USB_PCI && ACPI default USB_DWC3 help If you're using the DesignWare Core IP with a PCIe, please say diff --git a/drivers/usb/dwc3/Makefile b/drivers/usb/dwc3/Makefile index ffca34029b21ff6c3bdb3e44ae5531c74d78856a..f15fabbd1e59e9d23e07936be6cfa05a3489975f 100644 --- a/drivers/usb/dwc3/Makefile +++ b/drivers/usb/dwc3/Makefile @@ -17,6 +17,10 @@ ifneq ($(filter y,$(CONFIG_USB_DWC3_GADGET) $(CONFIG_USB_DWC3_DUAL_ROLE)),) dwc3-y += gadget.o ep0.o endif +ifneq ($(CONFIG_USB_DWC3_DUAL_ROLE),) + dwc3-y += drd.o +endif + ifneq ($(CONFIG_USB_DWC3_ULPI),) dwc3-y += ulpi.o endif diff --git a/drivers/usb/dwc3/core.c b/drivers/usb/dwc3/core.c index 369bab16a824599f2295dbdf91e825e1a69e4446..455d89a1cd6dbbb3e3f707bc42bada4a9569e5d7 100644 --- a/drivers/usb/dwc3/core.c +++ b/drivers/usb/dwc3/core.c @@ -100,7 +100,10 @@ static int dwc3_get_dr_mode(struct dwc3 *dwc) return 0; } -void dwc3_set_mode(struct dwc3 *dwc, u32 mode) +static void dwc3_event_buffers_cleanup(struct dwc3 *dwc); +static int dwc3_event_buffers_setup(struct dwc3 *dwc); + +static void dwc3_set_prtcap(struct dwc3 *dwc, u32 mode) { u32 reg; @@ -110,6 +113,69 @@ void dwc3_set_mode(struct dwc3 *dwc, u32 mode) dwc3_writel(dwc->regs, DWC3_GCTL, reg); } +static void __dwc3_set_mode(struct work_struct *work) +{ + struct dwc3 *dwc = work_to_dwc(work); + unsigned long flags; + int ret; + + if (!dwc->desired_dr_role) + return; + + if (dwc->desired_dr_role == dwc->current_dr_role) + return; + + if (dwc->dr_mode != USB_DR_MODE_OTG) + return; + + switch (dwc->current_dr_role) { + case DWC3_GCTL_PRTCAP_HOST: + dwc3_host_exit(dwc); + break; + case DWC3_GCTL_PRTCAP_DEVICE: + dwc3_gadget_exit(dwc); + dwc3_event_buffers_cleanup(dwc); + break; + default: + break; + } + + spin_lock_irqsave(&dwc->lock, flags); + + dwc3_set_prtcap(dwc, dwc->desired_dr_role); + + dwc->current_dr_role = dwc->desired_dr_role; + + spin_unlock_irqrestore(&dwc->lock, flags); + + switch (dwc->desired_dr_role) { + case DWC3_GCTL_PRTCAP_HOST: + ret = dwc3_host_init(dwc); + if (ret) + dev_err(dwc->dev, "failed to initialize host\n"); + break; + case DWC3_GCTL_PRTCAP_DEVICE: + dwc3_event_buffers_setup(dwc); + ret = dwc3_gadget_init(dwc); + if (ret) + dev_err(dwc->dev, "failed to initialize peripheral\n"); + break; + default: + break; + } +} + +void dwc3_set_mode(struct dwc3 *dwc, u32 mode) +{ + unsigned long flags; + + spin_lock_irqsave(&dwc->lock, flags); + dwc->desired_dr_role = mode; + spin_unlock_irqrestore(&dwc->lock, flags); + + queue_work(system_power_efficient_wq, &dwc->drd_work); +} + u32 dwc3_core_fifo_space(struct dwc3_ep *dep, u8 type) { struct dwc3 *dwc = dep->dwc; @@ -397,8 +463,7 @@ static void dwc3_core_num_eps(struct dwc3 *dwc) { struct dwc3_hwparams *parms = &dwc->hwparams; - dwc->num_in_eps = DWC3_NUM_IN_EPS(parms); - dwc->num_out_eps = DWC3_NUM_EPS(parms) - dwc->num_in_eps; + dwc->num_eps = DWC3_NUM_EPS(parms); } static void dwc3_cache_hwparams(struct dwc3 *dwc) @@ -431,6 +496,12 @@ static int dwc3_phy_setup(struct dwc3 *dwc) reg = dwc3_readl(dwc->regs, DWC3_GUSB3PIPECTL(0)); + /* + * Make sure UX_EXIT_PX is cleared as that causes issues with some + * PHYs. Also, this bit is not supposed to be used in normal operation. + */ + reg &= ~DWC3_GUSB3PIPECTL_UX_EXIT_PX; + /* * Above 1.94a, it is recommended to set DWC3_GUSB3PIPECTL_SUSPHY * to '0' during coreConsultant configuration. 
So default value @@ -714,21 +785,6 @@ static int dwc3_core_init(struct dwc3 *dwc) goto err4; } - switch (dwc->dr_mode) { - case USB_DR_MODE_PERIPHERAL: - dwc3_set_mode(dwc, DWC3_GCTL_PRTCAP_DEVICE); - break; - case USB_DR_MODE_HOST: - dwc3_set_mode(dwc, DWC3_GCTL_PRTCAP_HOST); - break; - case USB_DR_MODE_OTG: - dwc3_set_mode(dwc, DWC3_GCTL_PRTCAP_OTG); - break; - default: - dev_warn(dwc->dev, "Unsupported mode %d\n", dwc->dr_mode); - break; - } - /* * ENDXFER polling is available on version 3.10a and later of * the DWC_usb3 controller. It is NOT available in the @@ -846,6 +902,7 @@ static int dwc3_core_init_mode(struct dwc3 *dwc) switch (dwc->dr_mode) { case USB_DR_MODE_PERIPHERAL: + dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_DEVICE); ret = dwc3_gadget_init(dwc); if (ret) { if (ret != -EPROBE_DEFER) @@ -854,6 +911,7 @@ static int dwc3_core_init_mode(struct dwc3 *dwc) } break; case USB_DR_MODE_HOST: + dwc3_set_prtcap(dwc, DWC3_GCTL_PRTCAP_HOST); ret = dwc3_host_init(dwc); if (ret) { if (ret != -EPROBE_DEFER) @@ -862,17 +920,11 @@ static int dwc3_core_init_mode(struct dwc3 *dwc) } break; case USB_DR_MODE_OTG: - ret = dwc3_host_init(dwc); - if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(dev, "failed to initialize host\n"); - return ret; - } - - ret = dwc3_gadget_init(dwc); + INIT_WORK(&dwc->drd_work, __dwc3_set_mode); + ret = dwc3_drd_init(dwc); if (ret) { if (ret != -EPROBE_DEFER) - dev_err(dev, "failed to initialize gadget\n"); + dev_err(dev, "failed to initialize dual-role\n"); return ret; } break; @@ -894,8 +946,7 @@ static void dwc3_core_exit_mode(struct dwc3 *dwc) dwc3_host_exit(dwc); break; case USB_DR_MODE_OTG: - dwc3_host_exit(dwc); - dwc3_gadget_exit(dwc); + dwc3_drd_exit(dwc); break; default: /* do nothing */ diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index 2b9e4ca3c932e5691376dc9504d870804cccc695..981c77f5628e105731b41756b798c988b6c6692d 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -23,10 +23,12 @@ #include <linux/spinlock.h> #include <linux/ioport.h> #include <linux/list.h> +#include <linux/bitops.h> #include <linux/dma-mapping.h> #include <linux/mm.h> #include <linux/debugfs.h> #include <linux/wait.h> +#include <linux/workqueue.h> #include <linux/usb/ch9.h> #include <linux/usb/gadget.h> @@ -39,9 +41,8 @@ /* Global constants */ #define DWC3_PULL_UP_TIMEOUT 500 /* ms */ -#define DWC3_ZLP_BUF_SIZE 1024 /* size of a superspeed bulk */ #define DWC3_BOUNCE_SIZE 1024 /* size of a superspeed bulk */ -#define DWC3_EP0_BOUNCE_SIZE 512 +#define DWC3_EP0_SETUP_SIZE 512 #define DWC3_ENDPOINTS_NUM 32 #define DWC3_XHCI_RESOURCES_NUM 2 @@ -66,7 +67,7 @@ #define DWC3_DEVICE_EVENT_OVERFLOW 11 #define DWC3_GEVNTCOUNT_MASK 0xfffc -#define DWC3_GEVNTCOUNT_EHB (1 << 31) +#define DWC3_GEVNTCOUNT_EHB BIT(31) #define DWC3_GSNPSID_MASK 0xffff0000 #define DWC3_GSNPSREV_MASK 0xffff @@ -116,20 +117,20 @@ #define DWC3_VER_NUMBER 0xc1a0 #define DWC3_VER_TYPE 0xc1a4 -#define DWC3_GUSB2PHYCFG(n) (0xc200 + (n * 0x04)) -#define DWC3_GUSB2I2CCTL(n) (0xc240 + (n * 0x04)) +#define DWC3_GUSB2PHYCFG(n) (0xc200 + ((n) * 0x04)) +#define DWC3_GUSB2I2CCTL(n) (0xc240 + ((n) * 0x04)) -#define DWC3_GUSB2PHYACC(n) (0xc280 + (n * 0x04)) +#define DWC3_GUSB2PHYACC(n) (0xc280 + ((n) * 0x04)) -#define DWC3_GUSB3PIPECTL(n) (0xc2c0 + (n * 0x04)) +#define DWC3_GUSB3PIPECTL(n) (0xc2c0 + ((n) * 0x04)) -#define DWC3_GTXFIFOSIZ(n) (0xc300 + (n * 0x04)) -#define DWC3_GRXFIFOSIZ(n) (0xc380 + (n * 0x04)) +#define DWC3_GTXFIFOSIZ(n) (0xc300 + ((n) * 0x04)) +#define DWC3_GRXFIFOSIZ(n) (0xc380 + ((n) * 0x04)) -#define DWC3_GEVNTADRLO(n) (0xc400 + (n * 0x10)) -#define DWC3_GEVNTADRHI(n) (0xc404 + (n * 0x10)) -#define DWC3_GEVNTSIZ(n) (0xc408 + (n * 0x10)) -#define DWC3_GEVNTCOUNT(n) (0xc40c + 
(n * 0x10)) +#define DWC3_GEVNTADRLO(n) (0xc400 + ((n) * 0x10)) +#define DWC3_GEVNTADRHI(n) (0xc404 + ((n) * 0x10)) +#define DWC3_GEVNTSIZ(n) (0xc408 + ((n) * 0x10)) +#define DWC3_GEVNTCOUNT(n) (0xc40c + ((n) * 0x10)) #define DWC3_GHWPARAMS8 0xc600 #define DWC3_GFLADJ 0xc630 @@ -143,13 +144,13 @@ #define DWC3_DGCMD 0xc714 #define DWC3_DALEPENA 0xc720 -#define DWC3_DEP_BASE(n) (0xc800 + (n * 0x10)) +#define DWC3_DEP_BASE(n) (0xc800 + ((n) * 0x10)) #define DWC3_DEPCMDPAR2 0x00 #define DWC3_DEPCMDPAR1 0x04 #define DWC3_DEPCMDPAR0 0x08 #define DWC3_DEPCMD 0x0c -#define DWC3_DEV_IMOD(n) (0xca00 + (n * 0x4)) +#define DWC3_DEV_IMOD(n) (0xca00 + ((n) * 0x4)) /* OTG Registers */ #define DWC3_OCFG 0xcc00 @@ -176,11 +177,11 @@ /* Global RX Threshold Configuration Register */ #define DWC3_GRXTHRCFG_MAXRXBURSTSIZE(n) (((n) & 0x1f) << 19) #define DWC3_GRXTHRCFG_RXPKTCNT(n) (((n) & 0xf) << 24) -#define DWC3_GRXTHRCFG_PKTCNTSEL (1 << 29) +#define DWC3_GRXTHRCFG_PKTCNTSEL BIT(29) /* Global Configuration Register */ #define DWC3_GCTL_PWRDNSCALE(n) ((n) << 19) -#define DWC3_GCTL_U2RSTECN (1 << 16) +#define DWC3_GCTL_U2RSTECN BIT(16) #define DWC3_GCTL_RAMCLKSEL(x) (((x) & DWC3_GCTL_CLK_MASK) << 6) #define DWC3_GCTL_CLK_BUS (0) #define DWC3_GCTL_CLK_PIPE (1) @@ -193,24 +194,24 @@ #define DWC3_GCTL_PRTCAP_DEVICE 2 #define DWC3_GCTL_PRTCAP_OTG 3 -#define DWC3_GCTL_CORESOFTRESET (1 << 11) -#define DWC3_GCTL_SOFITPSYNC (1 << 10) +#define DWC3_GCTL_CORESOFTRESET BIT(11) +#define DWC3_GCTL_SOFITPSYNC BIT(10) #define DWC3_GCTL_SCALEDOWN(n) ((n) << 4) #define DWC3_GCTL_SCALEDOWN_MASK DWC3_GCTL_SCALEDOWN(3) -#define DWC3_GCTL_DISSCRAMBLE (1 << 3) -#define DWC3_GCTL_U2EXIT_LFPS (1 << 2) -#define DWC3_GCTL_GBLHIBERNATIONEN (1 << 1) -#define DWC3_GCTL_DSBLCLKGTNG (1 << 0) +#define DWC3_GCTL_DISSCRAMBLE BIT(3) +#define DWC3_GCTL_U2EXIT_LFPS BIT(2) +#define DWC3_GCTL_GBLHIBERNATIONEN BIT(1) +#define DWC3_GCTL_DSBLCLKGTNG BIT(0) /* Global User Control 1 Register */ -#define DWC3_GUCTL1_DEV_L1_EXIT_BY_HW (1 << 24) +#define DWC3_GUCTL1_DEV_L1_EXIT_BY_HW BIT(24) /* Global USB2 PHY Configuration Register */ -#define DWC3_GUSB2PHYCFG_PHYSOFTRST (1 << 31) -#define DWC3_GUSB2PHYCFG_U2_FREECLK_EXISTS (1 << 30) -#define DWC3_GUSB2PHYCFG_SUSPHY (1 << 6) -#define DWC3_GUSB2PHYCFG_ULPI_UTMI (1 << 4) -#define DWC3_GUSB2PHYCFG_ENBLSLPM (1 << 8) +#define DWC3_GUSB2PHYCFG_PHYSOFTRST BIT(31) +#define DWC3_GUSB2PHYCFG_U2_FREECLK_EXISTS BIT(30) +#define DWC3_GUSB2PHYCFG_SUSPHY BIT(6) +#define DWC3_GUSB2PHYCFG_ULPI_UTMI BIT(4) +#define DWC3_GUSB2PHYCFG_ENBLSLPM BIT(8) #define DWC3_GUSB2PHYCFG_PHYIF(n) (n << 3) #define DWC3_GUSB2PHYCFG_PHYIF_MASK DWC3_GUSB2PHYCFG_PHYIF(1) #define DWC3_GUSB2PHYCFG_USBTRDTIM(n) (n << 10) @@ -221,25 +222,26 @@ #define UTMI_PHYIF_8_BIT 0 /* Global USB2 PHY Vendor Control Register */ -#define DWC3_GUSB2PHYACC_NEWREGREQ (1 << 25) -#define DWC3_GUSB2PHYACC_BUSY (1 << 23) -#define DWC3_GUSB2PHYACC_WRITE (1 << 22) +#define DWC3_GUSB2PHYACC_NEWREGREQ BIT(25) +#define DWC3_GUSB2PHYACC_BUSY BIT(23) +#define DWC3_GUSB2PHYACC_WRITE BIT(22) #define DWC3_GUSB2PHYACC_ADDR(n) (n << 16) #define DWC3_GUSB2PHYACC_EXTEND_ADDR(n) (n << 8) #define DWC3_GUSB2PHYACC_DATA(n) (n & 0xff) /* Global USB3 PIPE Control Register */ -#define DWC3_GUSB3PIPECTL_PHYSOFTRST (1 << 31) -#define DWC3_GUSB3PIPECTL_U2SSINP3OK (1 << 29) -#define DWC3_GUSB3PIPECTL_DISRXDETINP3 (1 << 28) -#define DWC3_GUSB3PIPECTL_REQP1P2P3 (1 << 24) +#define DWC3_GUSB3PIPECTL_PHYSOFTRST BIT(31) +#define DWC3_GUSB3PIPECTL_U2SSINP3OK BIT(29) +#define 
DWC3_GUSB3PIPECTL_DISRXDETINP3 BIT(28) +#define DWC3_GUSB3PIPECTL_UX_EXIT_PX BIT(27) +#define DWC3_GUSB3PIPECTL_REQP1P2P3 BIT(24) #define DWC3_GUSB3PIPECTL_DEP1P2P3(n) ((n) << 19) #define DWC3_GUSB3PIPECTL_DEP1P2P3_MASK DWC3_GUSB3PIPECTL_DEP1P2P3(7) #define DWC3_GUSB3PIPECTL_DEP1P2P3_EN DWC3_GUSB3PIPECTL_DEP1P2P3(1) -#define DWC3_GUSB3PIPECTL_DEPOCHANGE (1 << 18) -#define DWC3_GUSB3PIPECTL_SUSPHY (1 << 17) -#define DWC3_GUSB3PIPECTL_LFPSFILT (1 << 9) -#define DWC3_GUSB3PIPECTL_RX_DETOPOLL (1 << 8) +#define DWC3_GUSB3PIPECTL_DEPOCHANGE BIT(18) +#define DWC3_GUSB3PIPECTL_SUSPHY BIT(17) +#define DWC3_GUSB3PIPECTL_LFPSFILT BIT(9) +#define DWC3_GUSB3PIPECTL_RX_DETOPOLL BIT(8) #define DWC3_GUSB3PIPECTL_TX_DEEPH_MASK DWC3_GUSB3PIPECTL_TX_DEEPH(3) #define DWC3_GUSB3PIPECTL_TX_DEEPH(n) ((n) << 1) @@ -248,7 +250,7 @@ #define DWC3_GTXFIFOSIZ_TXFSTADDR(n) ((n) & 0xffff0000) /* Global Event Size Registers */ -#define DWC3_GEVNTSIZ_INTMASK (1 << 31) +#define DWC3_GEVNTSIZ_INTMASK BIT(31) #define DWC3_GEVNTSIZ_SIZE(n) ((n) & 0xffff) /* Global HWPARAMS0 Register */ @@ -289,18 +291,18 @@ #define DWC3_MAX_HIBER_SCRATCHBUFS 15 /* Global HWPARAMS6 Register */ -#define DWC3_GHWPARAMS6_EN_FPGA (1 << 7) +#define DWC3_GHWPARAMS6_EN_FPGA BIT(7) /* Global HWPARAMS7 Register */ #define DWC3_GHWPARAMS7_RAM1_DEPTH(n) ((n) & 0xffff) #define DWC3_GHWPARAMS7_RAM2_DEPTH(n) (((n) >> 16) & 0xffff) /* Global Frame Length Adjustment Register */ -#define DWC3_GFLADJ_30MHZ_SDBND_SEL (1 << 7) +#define DWC3_GFLADJ_30MHZ_SDBND_SEL BIT(7) #define DWC3_GFLADJ_30MHZ_MASK 0x3f /* Global User Control Register 2 */ -#define DWC3_GUCTL2_RST_ACTBITLATER (1 << 14) +#define DWC3_GUCTL2_RST_ACTBITLATER BIT(14) /* Device Configuration Register */ #define DWC3_DCFG_DEVADDR(addr) ((addr) << 3) @@ -310,23 +312,23 @@ #define DWC3_DCFG_SUPERSPEED_PLUS (5 << 0) /* DWC_usb31 only */ #define DWC3_DCFG_SUPERSPEED (4 << 0) #define DWC3_DCFG_HIGHSPEED (0 << 0) -#define DWC3_DCFG_FULLSPEED (1 << 0) +#define DWC3_DCFG_FULLSPEED BIT(0) #define DWC3_DCFG_LOWSPEED (2 << 0) #define DWC3_DCFG_NUMP_SHIFT 17 #define DWC3_DCFG_NUMP(n) (((n) >> DWC3_DCFG_NUMP_SHIFT) & 0x1f) #define DWC3_DCFG_NUMP_MASK (0x1f << DWC3_DCFG_NUMP_SHIFT) -#define DWC3_DCFG_LPM_CAP (1 << 22) +#define DWC3_DCFG_LPM_CAP BIT(22) /* Device Control Register */ -#define DWC3_DCTL_RUN_STOP (1 << 31) -#define DWC3_DCTL_CSFTRST (1 << 30) -#define DWC3_DCTL_LSFTRST (1 << 29) +#define DWC3_DCTL_RUN_STOP BIT(31) +#define DWC3_DCTL_CSFTRST BIT(30) +#define DWC3_DCTL_LSFTRST BIT(29) #define DWC3_DCTL_HIRD_THRES_MASK (0x1f << 24) #define DWC3_DCTL_HIRD_THRES(n) ((n) << 24) -#define DWC3_DCTL_APPL1RES (1 << 23) +#define DWC3_DCTL_APPL1RES BIT(23) /* These apply for core versions 1.87a and earlier */ #define DWC3_DCTL_TRGTULST_MASK (0x0f << 17) @@ -341,15 +343,15 @@ #define DWC3_DCTL_LPM_ERRATA_MASK DWC3_DCTL_LPM_ERRATA(0xf) #define DWC3_DCTL_LPM_ERRATA(n) ((n) << 20) -#define DWC3_DCTL_KEEP_CONNECT (1 << 19) -#define DWC3_DCTL_L1_HIBER_EN (1 << 18) -#define DWC3_DCTL_CRS (1 << 17) -#define DWC3_DCTL_CSS (1 << 16) +#define DWC3_DCTL_KEEP_CONNECT BIT(19) +#define DWC3_DCTL_L1_HIBER_EN BIT(18) +#define DWC3_DCTL_CRS BIT(17) +#define DWC3_DCTL_CSS BIT(16) -#define DWC3_DCTL_INITU2ENA (1 << 12) -#define DWC3_DCTL_ACCEPTU2ENA (1 << 11) -#define DWC3_DCTL_INITU1ENA (1 << 10) -#define DWC3_DCTL_ACCEPTU1ENA (1 << 9) +#define DWC3_DCTL_INITU2ENA BIT(12) +#define DWC3_DCTL_ACCEPTU2ENA BIT(11) +#define DWC3_DCTL_INITU1ENA BIT(10) +#define DWC3_DCTL_ACCEPTU1ENA BIT(9) #define DWC3_DCTL_TSTCTRL_MASK (0xf << 1) 
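The long runs of conversions in this header are mechanical: BIT(n), from <linux/bitops.h>, expands to an unsigned long shift, which reads better and avoids shifting into the sign bit of a signed int for bit 31. For reference:

/* From <linux/bitops.h>: */
#define BIT(nr)		(1UL << (nr))

/*
 * Equivalent for bits 0..30; for the high bit the BIT() form is well
 * defined, whereas (1 << 31) shifts into the sign bit of a 32-bit int:
 */
u32 ehb_old = (1 << 31);	/* undefined behavior in C */
u32 ehb_new = BIT(31);		/* 1UL << 31, well defined */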
#define DWC3_DCTL_ULSTCHNGREQ_MASK (0x0f << 5) @@ -364,36 +366,36 @@ #define DWC3_DCTL_ULSTCHNG_LOOPBACK (DWC3_DCTL_ULSTCHNGREQ(11)) /* Device Event Enable Register */ -#define DWC3_DEVTEN_VNDRDEVTSTRCVEDEN (1 << 12) -#define DWC3_DEVTEN_EVNTOVERFLOWEN (1 << 11) -#define DWC3_DEVTEN_CMDCMPLTEN (1 << 10) -#define DWC3_DEVTEN_ERRTICERREN (1 << 9) -#define DWC3_DEVTEN_SOFEN (1 << 7) -#define DWC3_DEVTEN_EOPFEN (1 << 6) -#define DWC3_DEVTEN_HIBERNATIONREQEVTEN (1 << 5) -#define DWC3_DEVTEN_WKUPEVTEN (1 << 4) -#define DWC3_DEVTEN_ULSTCNGEN (1 << 3) -#define DWC3_DEVTEN_CONNECTDONEEN (1 << 2) -#define DWC3_DEVTEN_USBRSTEN (1 << 1) -#define DWC3_DEVTEN_DISCONNEVTEN (1 << 0) +#define DWC3_DEVTEN_VNDRDEVTSTRCVEDEN BIT(12) +#define DWC3_DEVTEN_EVNTOVERFLOWEN BIT(11) +#define DWC3_DEVTEN_CMDCMPLTEN BIT(10) +#define DWC3_DEVTEN_ERRTICERREN BIT(9) +#define DWC3_DEVTEN_SOFEN BIT(7) +#define DWC3_DEVTEN_EOPFEN BIT(6) +#define DWC3_DEVTEN_HIBERNATIONREQEVTEN BIT(5) +#define DWC3_DEVTEN_WKUPEVTEN BIT(4) +#define DWC3_DEVTEN_ULSTCNGEN BIT(3) +#define DWC3_DEVTEN_CONNECTDONEEN BIT(2) +#define DWC3_DEVTEN_USBRSTEN BIT(1) +#define DWC3_DEVTEN_DISCONNEVTEN BIT(0) /* Device Status Register */ -#define DWC3_DSTS_DCNRD (1 << 29) +#define DWC3_DSTS_DCNRD BIT(29) /* This applies for core versions 1.87a and earlier */ -#define DWC3_DSTS_PWRUPREQ (1 << 24) +#define DWC3_DSTS_PWRUPREQ BIT(24) /* These apply for core versions 1.94a and later */ -#define DWC3_DSTS_RSS (1 << 25) -#define DWC3_DSTS_SSS (1 << 24) +#define DWC3_DSTS_RSS BIT(25) +#define DWC3_DSTS_SSS BIT(24) -#define DWC3_DSTS_COREIDLE (1 << 23) -#define DWC3_DSTS_DEVCTRLHLT (1 << 22) +#define DWC3_DSTS_COREIDLE BIT(23) +#define DWC3_DSTS_DEVCTRLHLT BIT(22) #define DWC3_DSTS_USBLNKST_MASK (0x0f << 18) #define DWC3_DSTS_USBLNKST(n) (((n) & DWC3_DSTS_USBLNKST_MASK) >> 18) -#define DWC3_DSTS_RXFIFOEMPTY (1 << 17) +#define DWC3_DSTS_RXFIFOEMPTY BIT(17) #define DWC3_DSTS_SOFFN_MASK (0x3fff << 3) #define DWC3_DSTS_SOFFN(n) (((n) & DWC3_DSTS_SOFFN_MASK) >> 3) @@ -403,7 +405,7 @@ #define DWC3_DSTS_SUPERSPEED_PLUS (5 << 0) /* DWC_usb31 only */ #define DWC3_DSTS_SUPERSPEED (4 << 0) #define DWC3_DSTS_HIGHSPEED (0 << 0) -#define DWC3_DSTS_FULLSPEED (1 << 0) +#define DWC3_DSTS_FULLSPEED BIT(0) #define DWC3_DSTS_LOWSPEED (2 << 0) /* Device Generic Command Register */ @@ -421,26 +423,26 @@ #define DWC3_DGCMD_RUN_SOC_BUS_LOOPBACK 0x10 #define DWC3_DGCMD_STATUS(n) (((n) >> 12) & 0x0F) -#define DWC3_DGCMD_CMDACT (1 << 10) -#define DWC3_DGCMD_CMDIOC (1 << 8) +#define DWC3_DGCMD_CMDACT BIT(10) +#define DWC3_DGCMD_CMDIOC BIT(8) /* Device Generic Command Parameter Register */ -#define DWC3_DGCMDPAR_FORCE_LINKPM_ACCEPT (1 << 0) +#define DWC3_DGCMDPAR_FORCE_LINKPM_ACCEPT BIT(0) #define DWC3_DGCMDPAR_FIFO_NUM(n) ((n) << 0) #define DWC3_DGCMDPAR_RX_FIFO (0 << 5) -#define DWC3_DGCMDPAR_TX_FIFO (1 << 5) +#define DWC3_DGCMDPAR_TX_FIFO BIT(5) #define DWC3_DGCMDPAR_LOOPBACK_DIS (0 << 0) -#define DWC3_DGCMDPAR_LOOPBACK_ENA (1 << 0) +#define DWC3_DGCMDPAR_LOOPBACK_ENA BIT(0) /* Device Endpoint Command Register */ #define DWC3_DEPCMD_PARAM_SHIFT 16 #define DWC3_DEPCMD_PARAM(x) ((x) << DWC3_DEPCMD_PARAM_SHIFT) #define DWC3_DEPCMD_GET_RSC_IDX(x) (((x) >> DWC3_DEPCMD_PARAM_SHIFT) & 0x7f) #define DWC3_DEPCMD_STATUS(x) (((x) >> 12) & 0x0F) -#define DWC3_DEPCMD_HIPRI_FORCERM (1 << 11) -#define DWC3_DEPCMD_CLEARPENDIN (1 << 11) -#define DWC3_DEPCMD_CMDACT (1 << 10) -#define DWC3_DEPCMD_CMDIOC (1 << 8) +#define DWC3_DEPCMD_HIPRI_FORCERM BIT(11) +#define DWC3_DEPCMD_CLEARPENDIN BIT(11) +#define 
DWC3_DEPCMD_CMDACT BIT(10) +#define DWC3_DEPCMD_CMDIOC BIT(8) #define DWC3_DEPCMD_DEPSTARTCFG (0x09 << 0) #define DWC3_DEPCMD_ENDTRANSFER (0x08 << 0) @@ -458,7 +460,7 @@ #define DWC3_DEPCMD_CMD(x) ((x) & 0xf) /* The EP number goes 0..31 so ep0 is always out and ep1 is always in */ -#define DWC3_DALEPENA_EP(n) (1 << n) +#define DWC3_DALEPENA_EP(n) BIT(n) #define DWC3_DEPCMD_TYPE_CONTROL 0 #define DWC3_DEPCMD_TYPE_ISOC 1 @@ -500,8 +502,8 @@ struct dwc3_event_buffer { struct dwc3 *dwc; }; -#define DWC3_EP_FLAG_STALLED (1 << 0) -#define DWC3_EP_FLAG_WEDGED (1 << 1) +#define DWC3_EP_FLAG_STALLED BIT(0) +#define DWC3_EP_FLAG_WEDGED BIT(1) #define DWC3_EP_DIRECTION_TX true #define DWC3_EP_DIRECTION_RX false @@ -550,17 +552,17 @@ struct dwc3_ep { u32 saved_state; unsigned flags; -#define DWC3_EP_ENABLED (1 << 0) -#define DWC3_EP_STALL (1 << 1) -#define DWC3_EP_WEDGE (1 << 2) -#define DWC3_EP_BUSY (1 << 4) -#define DWC3_EP_PENDING_REQUEST (1 << 5) -#define DWC3_EP_MISSED_ISOC (1 << 6) -#define DWC3_EP_END_TRANSFER_PENDING (1 << 7) -#define DWC3_EP_TRANSFER_STARTED (1 << 8) +#define DWC3_EP_ENABLED BIT(0) +#define DWC3_EP_STALL BIT(1) +#define DWC3_EP_WEDGE BIT(2) +#define DWC3_EP_BUSY BIT(4) +#define DWC3_EP_PENDING_REQUEST BIT(5) +#define DWC3_EP_MISSED_ISOC BIT(6) +#define DWC3_EP_END_TRANSFER_PENDING BIT(7) +#define DWC3_EP_TRANSFER_STARTED BIT(8) /* This last one is specific to EP0 */ -#define DWC3_EP0_DIR_IN (1 << 31) +#define DWC3_EP0_DIR_IN BIT(31) /* * IMPORTANT: we *know* we have 256 TRBs in our @trb_pool, so we will @@ -638,13 +640,13 @@ enum dwc3_link_state { #define DWC3_TRB_STS_XFER_IN_PROG 4 /* TRB Control */ -#define DWC3_TRB_CTRL_HWO (1 << 0) -#define DWC3_TRB_CTRL_LST (1 << 1) -#define DWC3_TRB_CTRL_CHN (1 << 2) -#define DWC3_TRB_CTRL_CSP (1 << 3) +#define DWC3_TRB_CTRL_HWO BIT(0) +#define DWC3_TRB_CTRL_LST BIT(1) +#define DWC3_TRB_CTRL_CHN BIT(2) +#define DWC3_TRB_CTRL_CSP BIT(3) #define DWC3_TRB_CTRL_TRBCTL(n) (((n) & 0x3f) << 4) -#define DWC3_TRB_CTRL_ISP_IMI (1 << 10) -#define DWC3_TRB_CTRL_IOC (1 << 11) +#define DWC3_TRB_CTRL_ISP_IMI BIT(10) +#define DWC3_TRB_CTRL_IOC BIT(11) #define DWC3_TRB_CTRL_SID_SOFN(n) (((n) & 0xffff) << 14) #define DWC3_TRBCTL_TYPE(n) ((n) & (0x3f << 4)) @@ -746,6 +748,7 @@ struct dwc3_request { unsigned direction:1; unsigned mapped:1; unsigned started:1; + unsigned zero:1; }; /* @@ -758,15 +761,11 @@ struct dwc3_scratchpad_array { */ /** * struct dwc3 - representation of our controller - * @ctrl_req: usb control request which is used for ep0 + * @drd_work: workqueue used for role swapping * @ep0_trb: trb which is used for the ctrl_req - * @ep0_bounce: bounce buffer for ep0 - * @zlp_buf: used when request->zero is set * @setup_buf: used while processing STD USB requests - * @ctrl_req_addr: dma address of ctrl_req * @ep0_trb: dma address of ep0_trb * @ep0_usb_req: dummy req used while handling STD USB requests - * @ep0_bounce_addr: dma address of ep0_bounce * @scratch_addr: dma address of scratchbuf * @ep0_in_setup: one control transfer is completed and the setup phase is entered * @lock: for synchronizing @@ -784,6 +783,10 @@ struct dwc3_scratchpad_array { * @maximum_speed: maximum speed requested (mainly for testing purposes) * @revision: revision register contents * @dr_mode: requested mode of operation + * @current_dr_role: current role of operation when in dual-role mode + * @desired_dr_role: desired role of operation when in dual-role mode + * @edev: extcon handle + * @edev_nb: extcon notifier * @hsphy_mode: UTMI phy mode, one of following: * - 
USBPHY_INTERFACE_MODE_UTMI * - USBPHY_INTERFACE_MODE_UTMIW @@ -799,8 +802,7 @@ struct dwc3_scratchpad_array { * @u2pel: parameter from Set SEL request. * @u1sel: parameter from Set SEL request. * @u1pel: parameter from Set SEL request. - * @num_out_eps: number of out endpoints - * @num_in_eps: number of in endpoints + * @num_eps: number of endpoints * @ep0_next_event: hold the next expected event * @ep0state: state of endpoint zero * @link_state: link state @@ -858,17 +860,13 @@ struct dwc3_scratchpad_array { * increments or 0 to disable. */ struct dwc3 { - struct usb_ctrlrequest *ctrl_req; + struct work_struct drd_work; struct dwc3_trb *ep0_trb; void *bounce; - void *ep0_bounce; - void *zlp_buf; void *scratchbuf; u8 *setup_buf; - dma_addr_t ctrl_req_addr; dma_addr_t ep0_trb_addr; dma_addr_t bounce_addr; - dma_addr_t ep0_bounce_addr; dma_addr_t scratch_addr; struct dwc3_request ep0_usb_req; struct completion ep0_in_setup; @@ -900,6 +898,10 @@ struct dwc3 { size_t regs_size; enum usb_dr_mode dr_mode; + u32 current_dr_role; + u32 desired_dr_role; + struct extcon_dev *edev; + struct notifier_block edev_nb; enum usb_phy_interface hsphy_mode; u32 fladj; @@ -960,8 +962,7 @@ struct dwc3 { u8 speed; - u8 num_out_eps; - u8 num_in_eps; + u8 num_eps; struct dwc3_hwparams hwparams; struct dentry *root; @@ -1010,7 +1011,7 @@ struct dwc3 { u16 imod_interval; }; -/* -------------------------------------------------------------------------- */ +#define work_to_dwc(w) (container_of((w), struct dwc3, drd_work)) /* -------------------------------------------------------------------------- */ @@ -1054,13 +1055,13 @@ struct dwc3_event_depevt { u32 status:4; /* Within XferNotReady */ -#define DEPEVT_STATUS_TRANSFER_ACTIVE (1 << 3) +#define DEPEVT_STATUS_TRANSFER_ACTIVE BIT(3) /* Within XferComplete */ -#define DEPEVT_STATUS_BUSERR (1 << 0) -#define DEPEVT_STATUS_SHORT (1 << 1) -#define DEPEVT_STATUS_IOC (1 << 2) -#define DEPEVT_STATUS_LST (1 << 3) +#define DEPEVT_STATUS_BUSERR BIT(0) +#define DEPEVT_STATUS_SHORT BIT(1) +#define DEPEVT_STATUS_IOC BIT(2) +#define DEPEVT_STATUS_LST BIT(3) /* Stream event only */ #define DEPEVT_STREAMEVT_FOUND 1 @@ -1221,6 +1222,16 @@ static inline int dwc3_send_gadget_generic_command(struct dwc3 *dwc, { return 0; } #endif +#if IS_ENABLED(CONFIG_USB_DWC3_DUAL_ROLE) +int dwc3_drd_init(struct dwc3 *dwc); +void dwc3_drd_exit(struct dwc3 *dwc); +#else +static inline int dwc3_drd_init(struct dwc3 *dwc) +{ return 0; } +static inline void dwc3_drd_exit(struct dwc3 *dwc) +{ } +#endif + /* power management interface */ #if !IS_ENABLED(CONFIG_USB_DWC3_HOST) int dwc3_gadget_suspend(struct dwc3 *dwc); diff --git a/drivers/usb/dwc3/debug.h b/drivers/usb/dwc3/debug.h index eeed4ffd81312c9922c4fe2e4b83627a8c41059c..cb2d8d3f7f3d8d052bb3a94a924ce37100683bb4 100644 --- a/drivers/usb/dwc3/debug.h +++ b/drivers/usb/dwc3/debug.h @@ -124,6 +124,34 @@ dwc3_gadget_link_string(enum dwc3_link_state link_state) } } +/** + * dwc3_trb_type_string - returns TRB type as a string + * @type: the type of the TRB + */ +static inline const char *dwc3_trb_type_string(unsigned int type) +{ + switch (type) { + case DWC3_TRBCTL_NORMAL: + return "normal"; + case DWC3_TRBCTL_CONTROL_SETUP: + return "setup"; + case DWC3_TRBCTL_CONTROL_STATUS2: + return "status2"; + case DWC3_TRBCTL_CONTROL_STATUS3: + return "status3"; + case DWC3_TRBCTL_CONTROL_DATA: + return "data"; + case DWC3_TRBCTL_ISOCHRONOUS_FIRST: + return "isoc-first"; + case DWC3_TRBCTL_ISOCHRONOUS: + return "isoc"; + case DWC3_TRBCTL_LINK_TRB: + return "link"; + 
default: + return "UNKNOWN"; + } +} + static inline const char *dwc3_ep0_state_string(enum dwc3_ep0_state state) { switch (state) { diff --git a/drivers/usb/dwc3/debugfs.c b/drivers/usb/dwc3/debugfs.c index 31926dda43c9638f96a3fa6a51a2f5ef6a6d2359..7be963dd8e3baecd4538a56c3179787cd9b85d29 100644 --- a/drivers/usb/dwc3/debugfs.c +++ b/drivers/usb/dwc3/debugfs.c @@ -300,7 +300,7 @@ static int dwc3_mode_show(struct seq_file *s, void *unused) seq_printf(s, "device\n"); break; case DWC3_GCTL_PRTCAP_OTG: - seq_printf(s, "OTG\n"); + seq_printf(s, "otg\n"); break; default: seq_printf(s, "UNKNOWN %08x\n", DWC3_GCTL_PRTCAP(reg)); @@ -319,7 +319,6 @@ static ssize_t dwc3_mode_write(struct file *file, { struct seq_file *s = file->private_data; struct dwc3 *dwc = s->private; - unsigned long flags; u32 mode = 0; char buf[32]; @@ -327,19 +326,16 @@ static ssize_t dwc3_mode_write(struct file *file, return -EFAULT; if (!strncmp(buf, "host", 4)) - mode |= DWC3_GCTL_PRTCAP_HOST; + mode = DWC3_GCTL_PRTCAP_HOST; if (!strncmp(buf, "device", 6)) - mode |= DWC3_GCTL_PRTCAP_DEVICE; + mode = DWC3_GCTL_PRTCAP_DEVICE; if (!strncmp(buf, "otg", 3)) - mode |= DWC3_GCTL_PRTCAP_OTG; + mode = DWC3_GCTL_PRTCAP_OTG; + + dwc3_set_mode(dwc, mode); - if (mode) { - spin_lock_irqsave(&dwc->lock, flags); - dwc3_set_mode(dwc, mode); - spin_unlock_irqrestore(&dwc->lock, flags); - } return count; } @@ -446,52 +442,7 @@ static int dwc3_link_state_show(struct seq_file *s, void *unused) state = DWC3_DSTS_USBLNKST(reg); spin_unlock_irqrestore(&dwc->lock, flags); - switch (state) { - case DWC3_LINK_STATE_U0: - seq_printf(s, "U0\n"); - break; - case DWC3_LINK_STATE_U1: - seq_printf(s, "U1\n"); - break; - case DWC3_LINK_STATE_U2: - seq_printf(s, "U2\n"); - break; - case DWC3_LINK_STATE_U3: - seq_printf(s, "U3\n"); - break; - case DWC3_LINK_STATE_SS_DIS: - seq_printf(s, "SS.Disabled\n"); - break; - case DWC3_LINK_STATE_RX_DET: - seq_printf(s, "Rx.Detect\n"); - break; - case DWC3_LINK_STATE_SS_INACT: - seq_printf(s, "SS.Inactive\n"); - break; - case DWC3_LINK_STATE_POLL: - seq_printf(s, "Poll\n"); - break; - case DWC3_LINK_STATE_RECOV: - seq_printf(s, "Recovery\n"); - break; - case DWC3_LINK_STATE_HRESET: - seq_printf(s, "HRESET\n"); - break; - case DWC3_LINK_STATE_CMPLY: - seq_printf(s, "Compliance\n"); - break; - case DWC3_LINK_STATE_LPBK: - seq_printf(s, "Loopback\n"); - break; - case DWC3_LINK_STATE_RESET: - seq_printf(s, "Reset\n"); - break; - case DWC3_LINK_STATE_RESUME: - seq_printf(s, "Resume\n"); - break; - default: - seq_printf(s, "UNKNOWN %d\n", state); - } + seq_printf(s, "%s\n", dwc3_gadget_link_string(state)); return 0; } @@ -689,30 +640,6 @@ static int dwc3_ep_transfer_type_show(struct seq_file *s, void *unused) return 0; } -static inline const char *dwc3_trb_type_string(struct dwc3_trb *trb) -{ - switch (DWC3_TRBCTL_TYPE(trb->ctrl)) { - case DWC3_TRBCTL_NORMAL: - return "normal"; - case DWC3_TRBCTL_CONTROL_SETUP: - return "control-setup"; - case DWC3_TRBCTL_CONTROL_STATUS2: - return "control-status2"; - case DWC3_TRBCTL_CONTROL_STATUS3: - return "control-status3"; - case DWC3_TRBCTL_CONTROL_DATA: - return "control-data"; - case DWC3_TRBCTL_ISOCHRONOUS_FIRST: - return "isoc-first"; - case DWC3_TRBCTL_ISOCHRONOUS: - return "isoc"; - case DWC3_TRBCTL_LINK_TRB: - return "link"; - default: - return "UNKNOWN"; - } -} - static int dwc3_ep_trb_ring_show(struct seq_file *s, void *unused) { struct dwc3_ep *dep = s->private; @@ -733,10 +660,11 @@ static int dwc3_ep_trb_ring_show(struct seq_file *s, void *unused) for (i = 0; i < 
DWC3_TRB_NUM; i++) { struct dwc3_trb *trb = &dep->trb_pool[i]; + unsigned int type = DWC3_TRBCTL_TYPE(trb->ctrl); seq_printf(s, "%08x%08x,%d,%s,%d,%d,%d,%d,%d,%d\n", trb->bph, trb->bpl, trb->size, - dwc3_trb_type_string(trb), + dwc3_trb_type_string(type), !!(trb->ctrl & DWC3_TRB_CTRL_IOC), !!(trb->ctrl & DWC3_TRB_CTRL_ISP_IMI), !!(trb->ctrl & DWC3_TRB_CTRL_CSP), @@ -822,19 +750,8 @@ static void dwc3_debugfs_create_endpoint_dirs(struct dwc3 *dwc, { int i; - for (i = 0; i < dwc->num_in_eps; i++) { - u8 epnum = (i << 1) | 1; - struct dwc3_ep *dep = dwc->eps[epnum]; - - if (!dep) - continue; - - dwc3_debugfs_create_endpoint_dir(dep, parent); - } - - for (i = 0; i < dwc->num_out_eps; i++) { - u8 epnum = (i << 1); - struct dwc3_ep *dep = dwc->eps[epnum]; + for (i = 0; i < dwc->num_eps; i++) { + struct dwc3_ep *dep = dwc->eps[i]; if (!dep) continue; diff --git a/drivers/usb/dwc3/drd.c b/drivers/usb/dwc3/drd.c new file mode 100644 index 0000000000000000000000000000000000000000..2765c51c7ef5fe0c0242ef0f09750fae2d727598 --- /dev/null +++ b/drivers/usb/dwc3/drd.c @@ -0,0 +1,85 @@ +/** + * drd.c - DesignWare USB3 DRD Controller Dual-role support + * + * Copyright (C) 2017 Texas Instruments Incorporated - http://www.ti.com + * + * Authors: Roger Quadros <rogerq@ti.com> + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 of + * the License as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <linux/extcon.h> + +#include "debug.h" +#include "core.h" +#include "gadget.h" + +static void dwc3_drd_update(struct dwc3 *dwc) +{ + int id; + + id = extcon_get_state(dwc->edev, EXTCON_USB_HOST); + if (id < 0) + id = 0; + + dwc3_set_mode(dwc, id ? + DWC3_GCTL_PRTCAP_HOST : + DWC3_GCTL_PRTCAP_DEVICE); +} + +static int dwc3_drd_notifier(struct notifier_block *nb, + unsigned long event, void *ptr) +{ + struct dwc3 *dwc = container_of(nb, struct dwc3, edev_nb); + + dwc3_set_mode(dwc, event ? 
+ DWC3_GCTL_PRTCAP_HOST : + DWC3_GCTL_PRTCAP_DEVICE); + + return NOTIFY_DONE; +} + +int dwc3_drd_init(struct dwc3 *dwc) +{ + int ret; + + if (dwc->dev->of_node) { + if (of_property_read_bool(dwc->dev->of_node, "extcon")) + dwc->edev = extcon_get_edev_by_phandle(dwc->dev, 0); + + if (IS_ERR(dwc->edev)) + return PTR_ERR(dwc->edev); + + dwc->edev_nb.notifier_call = dwc3_drd_notifier; + ret = extcon_register_notifier(dwc->edev, EXTCON_USB_HOST, + &dwc->edev_nb); + if (ret < 0) { + dev_err(dwc->dev, "couldn't register cable notifier\n"); + return ret; + } + } + + dwc3_drd_update(dwc); + + return 0; +} + +void dwc3_drd_exit(struct dwc3 *dwc) +{ + extcon_unregister_notifier(dwc->edev, EXTCON_USB_HOST, + &dwc->edev_nb); + + dwc3_set_mode(dwc, DWC3_GCTL_PRTCAP_DEVICE); + flush_work(&dwc->drd_work); + dwc3_gadget_exit(dwc); +} diff --git a/drivers/usb/dwc3/dwc3-exynos.c b/drivers/usb/dwc3/dwc3-exynos.c index 1515d45ebcec0dd412b52550930053ab9a9a2fe9..98f74ff66120936bad7bb166c8cd2c5fb040b460 100644 --- a/drivers/usb/dwc3/dwc3-exynos.c +++ b/drivers/usb/dwc3/dwc3-exynos.c @@ -147,53 +147,53 @@ static int dwc3_exynos_probe(struct platform_device *pdev) exynos->vdd33 = devm_regulator_get(dev, "vdd33"); if (IS_ERR(exynos->vdd33)) { ret = PTR_ERR(exynos->vdd33); - goto err2; + goto vdd33_err; } ret = regulator_enable(exynos->vdd33); if (ret) { dev_err(dev, "Failed to enable VDD33 supply\n"); - goto err2; + goto vdd33_err; } exynos->vdd10 = devm_regulator_get(dev, "vdd10"); if (IS_ERR(exynos->vdd10)) { ret = PTR_ERR(exynos->vdd10); - goto err3; + goto vdd10_err; } ret = regulator_enable(exynos->vdd10); if (ret) { dev_err(dev, "Failed to enable VDD10 supply\n"); - goto err3; + goto vdd10_err; } ret = dwc3_exynos_register_phys(exynos); if (ret) { dev_err(dev, "couldn't register PHYs\n"); - goto err4; + goto phys_err; } if (node) { ret = of_platform_populate(node, NULL, NULL, dev); if (ret) { dev_err(dev, "failed to add dwc3 core\n"); - goto err5; + goto populate_err; } } else { dev_err(dev, "no device node, failed to add dwc3 core\n"); ret = -ENODEV; - goto err5; + goto populate_err; } return 0; -err5: +populate_err: platform_device_unregister(exynos->usb2_phy); platform_device_unregister(exynos->usb3_phy); -err4: +phys_err: regulator_disable(exynos->vdd10); -err3: +vdd10_err: regulator_disable(exynos->vdd33); -err2: +vdd33_err: clk_disable_unprepare(exynos->axius_clk); axius_clk_err: clk_disable_unprepare(exynos->susp_clk); diff --git a/drivers/usb/dwc3/dwc3-omap.c b/drivers/usb/dwc3/dwc3-omap.c index f8d0747810e78d7cc7930fed1b71cfc9c5aeb048..98926504b55b5b425eb3c9dce66e889a524d2152 100644 --- a/drivers/usb/dwc3/dwc3-omap.c +++ b/drivers/usb/dwc3/dwc3-omap.c @@ -79,40 +79,40 @@ #define USBOTGSS_DEBUG_OFFSET 0x0600 /* SYSCONFIG REGISTER */ -#define USBOTGSS_SYSCONFIG_DMADISABLE (1 << 16) +#define USBOTGSS_SYSCONFIG_DMADISABLE BIT(16) /* IRQ_EOI REGISTER */ -#define USBOTGSS_IRQ_EOI_LINE_NUMBER (1 << 0) +#define USBOTGSS_IRQ_EOI_LINE_NUMBER BIT(0) /* IRQS0 BITS */ -#define USBOTGSS_IRQO_COREIRQ_ST (1 << 0) +#define USBOTGSS_IRQO_COREIRQ_ST BIT(0) /* IRQMISC BITS */ -#define USBOTGSS_IRQMISC_DMADISABLECLR (1 << 17) -#define USBOTGSS_IRQMISC_OEVT (1 << 16) -#define USBOTGSS_IRQMISC_DRVVBUS_RISE (1 << 13) -#define USBOTGSS_IRQMISC_CHRGVBUS_RISE (1 << 12) -#define USBOTGSS_IRQMISC_DISCHRGVBUS_RISE (1 << 11) -#define USBOTGSS_IRQMISC_IDPULLUP_RISE (1 << 8) -#define USBOTGSS_IRQMISC_DRVVBUS_FALL (1 << 5) -#define USBOTGSS_IRQMISC_CHRGVBUS_FALL (1 << 4) -#define USBOTGSS_IRQMISC_DISCHRGVBUS_FALL (1 << 3) -#define 
USBOTGSS_IRQMISC_IDPULLUP_FALL (1 << 0) +#define USBOTGSS_IRQMISC_DMADISABLECLR BIT(17) +#define USBOTGSS_IRQMISC_OEVT BIT(16) +#define USBOTGSS_IRQMISC_DRVVBUS_RISE BIT(13) +#define USBOTGSS_IRQMISC_CHRGVBUS_RISE BIT(12) +#define USBOTGSS_IRQMISC_DISCHRGVBUS_RISE BIT(11) +#define USBOTGSS_IRQMISC_IDPULLUP_RISE BIT(8) +#define USBOTGSS_IRQMISC_DRVVBUS_FALL BIT(5) +#define USBOTGSS_IRQMISC_CHRGVBUS_FALL BIT(4) +#define USBOTGSS_IRQMISC_DISCHRGVBUS_FALL BIT(3) +#define USBOTGSS_IRQMISC_IDPULLUP_FALL BIT(0) /* UTMI_OTG_STATUS REGISTER */ -#define USBOTGSS_UTMI_OTG_STATUS_DRVVBUS (1 << 5) -#define USBOTGSS_UTMI_OTG_STATUS_CHRGVBUS (1 << 4) -#define USBOTGSS_UTMI_OTG_STATUS_DISCHRGVBUS (1 << 3) -#define USBOTGSS_UTMI_OTG_STATUS_IDPULLUP (1 << 0) +#define USBOTGSS_UTMI_OTG_STATUS_DRVVBUS BIT(5) +#define USBOTGSS_UTMI_OTG_STATUS_CHRGVBUS BIT(4) +#define USBOTGSS_UTMI_OTG_STATUS_DISCHRGVBUS BIT(3) +#define USBOTGSS_UTMI_OTG_STATUS_IDPULLUP BIT(0) /* UTMI_OTG_CTRL REGISTER */ -#define USBOTGSS_UTMI_OTG_CTRL_SW_MODE (1 << 31) -#define USBOTGSS_UTMI_OTG_CTRL_POWERPRESENT (1 << 9) -#define USBOTGSS_UTMI_OTG_CTRL_TXBITSTUFFENABLE (1 << 8) -#define USBOTGSS_UTMI_OTG_CTRL_IDDIG (1 << 4) -#define USBOTGSS_UTMI_OTG_CTRL_SESSEND (1 << 3) -#define USBOTGSS_UTMI_OTG_CTRL_SESSVALID (1 << 2) -#define USBOTGSS_UTMI_OTG_CTRL_VBUSVALID (1 << 1) +#define USBOTGSS_UTMI_OTG_CTRL_SW_MODE BIT(31) +#define USBOTGSS_UTMI_OTG_CTRL_POWERPRESENT BIT(9) +#define USBOTGSS_UTMI_OTG_CTRL_TXBITSTUFFENABLE BIT(8) +#define USBOTGSS_UTMI_OTG_CTRL_IDDIG BIT(4) +#define USBOTGSS_UTMI_OTG_CTRL_SESSEND BIT(3) +#define USBOTGSS_UTMI_OTG_CTRL_SESSVALID BIT(2) +#define USBOTGSS_UTMI_OTG_CTRL_VBUSVALID BIT(1) struct dwc3_omap { struct device *dev; diff --git a/drivers/usb/dwc3/dwc3-pci.c b/drivers/usb/dwc3/dwc3-pci.c index cce0a220b6b0437f23102da3fa56f4f7373f9082..a15ec71d04233a3ac9e93fe5d9063edb0a4d89af 100644 --- a/drivers/usb/dwc3/dwc3-pci.c +++ b/drivers/usb/dwc3/dwc3-pci.c @@ -125,8 +125,10 @@ static int dwc3_pci_quirks(struct dwc3_pci *dwc) if (pdev->device == PCI_DEVICE_ID_INTEL_BYT) { struct gpio_desc *gpio; - acpi_dev_add_driver_gpios(ACPI_COMPANION(&pdev->dev), + ret = devm_acpi_dev_add_driver_gpios(&pdev->dev, acpi_dwc3_byt_gpios); + if (ret) + dev_dbg(&pdev->dev, "failed to add mapping table\n"); /* * These GPIOs will turn on the USB2 PHY. 
Note that we have to @@ -242,7 +244,6 @@ static void dwc3_pci_remove(struct pci_dev *pci) device_init_wakeup(&pci->dev, false); pm_runtime_get(&pci->dev); - acpi_dev_remove_driver_gpios(ACPI_COMPANION(&pci->dev)); platform_device_unregister(dwc->dwc3); } diff --git a/drivers/usb/dwc3/ep0.c b/drivers/usb/dwc3/ep0.c index e689cede9b0e526ed4bc98e02d5e7b6294ce6d1d..a78c78e7a8c3c18ac3d50f1b4dc18ec9517ae248 100644 --- a/drivers/usb/dwc3/ep0.c +++ b/drivers/usb/dwc3/ep0.c @@ -39,14 +39,13 @@ static void __dwc3_ep0_do_control_status(struct dwc3 *dwc, struct dwc3_ep *dep); static void __dwc3_ep0_do_control_data(struct dwc3 *dwc, struct dwc3_ep *dep, struct dwc3_request *req); -static void dwc3_ep0_prepare_one_trb(struct dwc3 *dwc, u8 epnum, +static void dwc3_ep0_prepare_one_trb(struct dwc3_ep *dep, dma_addr_t buf_dma, u32 len, u32 type, bool chain) { struct dwc3_trb *trb; - struct dwc3_ep *dep; - - dep = dwc->eps[epnum]; + struct dwc3 *dwc; + dwc = dep->dwc; trb = &dwc->ep0_trb[dep->trb_enqueue]; if (chain) @@ -69,16 +68,17 @@ static void dwc3_ep0_prepare_one_trb(struct dwc3 *dwc, u8 epnum, trace_dwc3_prepare_trb(dep, trb); } -static int dwc3_ep0_start_trans(struct dwc3 *dwc, u8 epnum) +static int dwc3_ep0_start_trans(struct dwc3_ep *dep) { struct dwc3_gadget_ep_cmd_params params; - struct dwc3_ep *dep; + struct dwc3 *dwc; int ret; - dep = dwc->eps[epnum]; if (dep->flags & DWC3_EP_BUSY) return 0; + dwc = dep->dwc; + memset(¶ms, 0, sizeof(params)); params.param0 = upper_32_bits(dwc->ep0_trb_addr); params.param1 = lower_32_bits(dwc->ep0_trb_addr); @@ -279,13 +279,15 @@ int dwc3_gadget_ep0_set_halt(struct usb_ep *ep, int value) void dwc3_ep0_out_start(struct dwc3 *dwc) { + struct dwc3_ep *dep; int ret; complete(&dwc->ep0_in_setup); - dwc3_ep0_prepare_one_trb(dwc, 0, dwc->ctrl_req_addr, 8, + dep = dwc->eps[0]; + dwc3_ep0_prepare_one_trb(dep, dwc->ep0_trb_addr, 8, DWC3_TRBCTL_CONTROL_SETUP, false); - ret = dwc3_ep0_start_trans(dwc, 0); + ret = dwc3_ep0_start_trans(dep); WARN_ON(ret < 0); } @@ -794,7 +796,7 @@ static int dwc3_ep0_std_request(struct dwc3 *dwc, struct usb_ctrlrequest *ctrl) static void dwc3_ep0_inspect_setup(struct dwc3 *dwc, const struct dwc3_event_depevt *event) { - struct usb_ctrlrequest *ctrl = dwc->ctrl_req; + struct usb_ctrlrequest *ctrl = (void *) dwc->ep0_trb; int ret = -EINVAL; u32 len; @@ -834,7 +836,6 @@ static void dwc3_ep0_complete_data(struct dwc3 *dwc, struct usb_request *ur; struct dwc3_trb *trb; struct dwc3_ep *ep0; - unsigned transfer_size = 0; unsigned maxp; unsigned remaining_ur_length; void *buf; @@ -847,9 +848,7 @@ static void dwc3_ep0_complete_data(struct dwc3 *dwc, ep0 = dwc->eps[0]; dwc->ep0_next_event = DWC3_EP0_NRDY_STATUS; - trb = dwc->ep0_trb; - trace_dwc3_complete_trb(ep0, trb); r = next_request(&ep0->pending_list); @@ -870,58 +869,23 @@ static void dwc3_ep0_complete_data(struct dwc3 *dwc, remaining_ur_length = ur->length; length = trb->size & DWC3_TRB_SIZE_MASK; - maxp = ep0->endpoint.maxpacket; - - if (dwc->ep0_bounced) { - /* - * Handle the first TRB before handling the bounce buffer if - * the request length is greater than the bounce buffer size - */ - if (ur->length > DWC3_EP0_BOUNCE_SIZE) { - transfer_size = ALIGN(ur->length - maxp, maxp); - transferred = transfer_size - length; - buf = (u8 *)buf + transferred; - ur->actual += transferred; - remaining_ur_length -= transferred; - - trb++; - length = trb->size & DWC3_TRB_SIZE_MASK; - - ep0->trb_enqueue = 0; - } - - transfer_size = roundup((ur->length - transfer_size), - maxp); - - transferred = min_t(u32, 
remaining_ur_length, - transfer_size - length); - memcpy(buf, dwc->ep0_bounce, transferred); - } else { - transferred = ur->length - length; - } - + transferred = ur->length - length; ur->actual += transferred; - if ((epnum & 1) && ur->actual < ur->length) { - /* for some reason we did not get everything out */ + if ((IS_ALIGNED(ur->length, ep0->endpoint.maxpacket) && + ur->length && ur->zero) || dwc->ep0_bounced) { + trb++; + trb->ctrl &= ~DWC3_TRB_CTRL_HWO; + trace_dwc3_complete_trb(ep0, trb); + ep0->trb_enqueue = 0; + dwc->ep0_bounced = false; + } + if ((epnum & 1) && ur->actual < ur->length) dwc3_ep0_stall_and_restart(dwc); - } else { + else dwc3_gadget_giveback(ep0, r, 0); - - if (IS_ALIGNED(ur->length, ep0->endpoint.maxpacket) && - ur->length && ur->zero) { - int ret; - - dwc->ep0_next_event = DWC3_EP0_COMPLETE; - - dwc3_ep0_prepare_one_trb(dwc, epnum, dwc->ctrl_req_addr, - 0, DWC3_TRBCTL_CONTROL_DATA, false); - ret = dwc3_ep0_start_trans(dwc, epnum); - WARN_ON(ret < 0); - } - } } static void dwc3_ep0_complete_status(struct dwc3 *dwc, @@ -997,14 +961,13 @@ static void __dwc3_ep0_do_control_data(struct dwc3 *dwc, req->direction = !!dep->number; if (req->request.length == 0) { - dwc3_ep0_prepare_one_trb(dwc, dep->number, - dwc->ctrl_req_addr, 0, + dwc3_ep0_prepare_one_trb(dep, dwc->ep0_trb_addr, 0, DWC3_TRBCTL_CONTROL_DATA, false); - ret = dwc3_ep0_start_trans(dwc, dep->number); + ret = dwc3_ep0_start_trans(dep); } else if (!IS_ALIGNED(req->request.length, dep->endpoint.maxpacket) && (dep->number == 0)) { - u32 transfer_size = 0; u32 maxpacket; + u32 rem; ret = usb_gadget_map_request_by_dev(dwc->sysdev, &req->request, dep->number); @@ -1012,36 +975,55 @@ static void __dwc3_ep0_do_control_data(struct dwc3 *dwc, return; maxpacket = dep->endpoint.maxpacket; + rem = req->request.length % maxpacket; + dwc->ep0_bounced = true; - if (req->request.length > DWC3_EP0_BOUNCE_SIZE) { - transfer_size = ALIGN(req->request.length - maxpacket, - maxpacket); - dwc3_ep0_prepare_one_trb(dwc, dep->number, - req->request.dma, - transfer_size, - DWC3_TRBCTL_CONTROL_DATA, - true); - } - - transfer_size = roundup((req->request.length - transfer_size), - maxpacket); + /* prepare normal TRB */ + dwc3_ep0_prepare_one_trb(dep, req->request.dma, + req->request.length, + DWC3_TRBCTL_CONTROL_DATA, + true); + + /* Now prepare one extra TRB to align transfer size */ + dwc3_ep0_prepare_one_trb(dep, dwc->bounce_addr, + maxpacket - rem, + DWC3_TRBCTL_CONTROL_DATA, + false); + ret = dwc3_ep0_start_trans(dep); + } else if (IS_ALIGNED(req->request.length, dep->endpoint.maxpacket) && + req->request.length && req->request.zero) { + u32 maxpacket; + u32 rem; - dwc->ep0_bounced = true; + ret = usb_gadget_map_request_by_dev(dwc->sysdev, + &req->request, dep->number); + if (ret) + return; - dwc3_ep0_prepare_one_trb(dwc, dep->number, - dwc->ep0_bounce_addr, transfer_size, - DWC3_TRBCTL_CONTROL_DATA, false); - ret = dwc3_ep0_start_trans(dwc, dep->number); + maxpacket = dep->endpoint.maxpacket; + rem = req->request.length % maxpacket; + + /* prepare normal TRB */ + dwc3_ep0_prepare_one_trb(dep, req->request.dma, + req->request.length, + DWC3_TRBCTL_CONTROL_DATA, + true); + + /* Now prepare one extra TRB to align transfer size */ + dwc3_ep0_prepare_one_trb(dep, dwc->bounce_addr, + 0, DWC3_TRBCTL_CONTROL_DATA, + false); + ret = dwc3_ep0_start_trans(dep); } else { ret = usb_gadget_map_request_by_dev(dwc->sysdev, &req->request, dep->number); if (ret) return; - dwc3_ep0_prepare_one_trb(dwc, dep->number, req->request.dma, + 
dwc3_ep0_prepare_one_trb(dep, req->request.dma, req->request.length, DWC3_TRBCTL_CONTROL_DATA, false); - ret = dwc3_ep0_start_trans(dwc, dep->number); + ret = dwc3_ep0_start_trans(dep); } WARN_ON(ret < 0); @@ -1055,9 +1037,8 @@ static int dwc3_ep0_start_control_status(struct dwc3_ep *dep) type = dwc->three_stage_setup ? DWC3_TRBCTL_CONTROL_STATUS3 : DWC3_TRBCTL_CONTROL_STATUS2; - dwc3_ep0_prepare_one_trb(dwc, dep->number, - dwc->ctrl_req_addr, 0, type, false); - return dwc3_ep0_start_trans(dwc, dep->number); + dwc3_ep0_prepare_one_trb(dep, dwc->ep0_trb_addr, 0, type, false); + return dwc3_ep0_start_trans(dep); } static void __dwc3_ep0_do_control_status(struct dwc3 *dwc, struct dwc3_ep *dep) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 79e7a3480d51b07abf3988a51f1790f6caec7ca3..6f6f0b3be3ad7a3491d244ff5648a3ef852479c4 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -171,7 +171,6 @@ void dwc3_gadget_giveback(struct dwc3_ep *dep, struct dwc3_request *req, int status) { struct dwc3 *dwc = dep->dwc; - unsigned int unmap_after_complete = false; req->started = false; list_del(&req->list); @@ -181,19 +180,8 @@ void dwc3_gadget_giveback(struct dwc3_ep *dep, struct dwc3_request *req, if (req->request.status == -EINPROGRESS) req->request.status = status; - /* - * NOTICE we don't want to unmap before calling ->complete() if we're - * dealing with a bounced ep0 request. If we unmap it here, we would end - * up overwritting the contents of req->buf and this could confuse the - * gadget driver. - */ - if (dwc->ep0_bounced && dep->number <= 1) { - dwc->ep0_bounced = false; - unmap_after_complete = true; - } else { - usb_gadget_unmap_request_by_dev(dwc->sysdev, - &req->request, req->direction); - } + usb_gadget_unmap_request_by_dev(dwc->sysdev, + &req->request, req->direction); trace_dwc3_gadget_giveback(req); @@ -201,10 +189,6 @@ void dwc3_gadget_giveback(struct dwc3_ep *dep, struct dwc3_request *req, usb_gadget_giveback_request(&dep->endpoint, &req->request); spin_lock(&dwc->lock); - if (unmap_after_complete) - usb_gadget_unmap_request_by_dev(dwc->sysdev, - &req->request, req->direction); - if (dep->number > 1) pm_runtime_put(dwc->dev); } @@ -1060,6 +1044,22 @@ static void dwc3_prepare_one_trb_linear(struct dwc3_ep *dep, false, 0, req->request.stream_id, req->request.short_not_ok, req->request.no_interrupt); + } else if (req->request.zero && req->request.length && + (IS_ALIGNED(req->request.length,dep->endpoint.maxpacket))) { + struct dwc3 *dwc = dep->dwc; + struct dwc3_trb *trb; + + req->zero = true; + + /* prepare normal TRB */ + dwc3_prepare_one_trb(dep, req, true, 0); + + /* Now prepare one extra TRB to handle ZLP */ + trb = &dep->trb_pool[dep->trb_enqueue]; + __dwc3_prepare_one_trb(dep, trb, dwc->bounce_addr, 0, + false, 0, req->request.stream_id, + req->request.short_not_ok, + req->request.no_interrupt); } else { dwc3_prepare_one_trb(dep, req, false, 0); } @@ -1184,8 +1184,11 @@ static void __dwc3_gadget_start_isoc(struct dwc3 *dwc, return; } - /* 4 micro frames in the future */ - uf = cur_uf + dep->interval * 4; + /* + * Schedule the first trb for one interval in the future or at + * least 4 microframes. 
+ */ + uf = cur_uf + max_t(u32, 4, dep->interval); __dwc3_gadget_kick_transfer(dep, uf); } @@ -1272,31 +1275,6 @@ static int __dwc3_gadget_ep_queue(struct dwc3_ep *dep, struct dwc3_request *req) return ret; } -static void __dwc3_gadget_ep_zlp_complete(struct usb_ep *ep, - struct usb_request *request) -{ - dwc3_gadget_ep_free_request(ep, request); -} - -static int __dwc3_gadget_ep_queue_zlp(struct dwc3 *dwc, struct dwc3_ep *dep) -{ - struct dwc3_request *req; - struct usb_request *request; - struct usb_ep *ep = &dep->endpoint; - - request = dwc3_gadget_ep_alloc_request(ep, GFP_ATOMIC); - if (!request) - return -ENOMEM; - - request->length = 0; - request->buf = dwc->zlp_buf; - request->complete = __dwc3_gadget_ep_zlp_complete; - - req = to_dwc3_request(request); - - return __dwc3_gadget_ep_queue(dep, req); -} - static int dwc3_gadget_ep_queue(struct usb_ep *ep, struct usb_request *request, gfp_t gfp_flags) { @@ -1310,17 +1288,6 @@ static int dwc3_gadget_ep_queue(struct usb_ep *ep, struct usb_request *request, spin_lock_irqsave(&dwc->lock, flags); ret = __dwc3_gadget_ep_queue(dep, req); - - /* - * Okay, here's the thing, if gadget driver has requested for a ZLP by - * setting request->zero, instead of doing magic, we will just queue an - * extra usb_request ourselves so that it gets handled the same way as - * any other request. - */ - if (ret == 0 && request->zero && request->length && - (request->length % ep->maxpacket == 0)) - ret = __dwc3_gadget_ep_queue_zlp(dwc, dep); - spin_unlock_irqrestore(&dwc->lock, flags); return ret; @@ -1400,7 +1367,7 @@ static int dwc3_gadget_ep_dequeue(struct usb_ep *ep, dwc3_ep_inc_deq(dep); } - if (r->unaligned) { + if (r->unaligned || r->zero) { trb = r->trb + r->num_pending_sgs + 1; trb->ctrl &= ~DWC3_TRB_CTRL_HWO; dwc3_ep_inc_deq(dep); @@ -1411,7 +1378,7 @@ static int dwc3_gadget_ep_dequeue(struct usb_ep *ep, trb->ctrl &= ~DWC3_TRB_CTRL_HWO; dwc3_ep_inc_deq(dep); - if (r->unaligned) { + if (r->unaligned || r->zero) { trb = r->trb + 1; trb->ctrl &= ~DWC3_TRB_CTRL_HWO; dwc3_ep_inc_deq(dep); @@ -2006,14 +1973,15 @@ static const struct usb_gadget_ops dwc3_gadget_ops = { /* -------------------------------------------------------------------------- */ -static int dwc3_gadget_init_hw_endpoints(struct dwc3 *dwc, - u8 num, u32 direction) +static int dwc3_gadget_init_endpoints(struct dwc3 *dwc, u8 num) { struct dwc3_ep *dep; - u8 i; + u8 epnum; + + INIT_LIST_HEAD(&dwc->gadget.ep_list); - for (i = 0; i < num; i++) { - u8 epnum = (i << 1) | (direction ? 1 : 0); + for (epnum = 0; epnum < num; epnum++) { + bool direction = epnum & 1; dep = kzalloc(sizeof(*dep), GFP_KERNEL); if (!dep) @@ -2021,12 +1989,12 @@ static int dwc3_gadget_init_hw_endpoints(struct dwc3 *dwc, dep->dwc = dwc; dep->number = epnum; - dep->direction = !!direction; + dep->direction = direction; dep->regs = dwc->regs + DWC3_DEP_BASE(epnum); dwc->eps[epnum] = dep; snprintf(dep->name, sizeof(dep->name), "ep%d%s", epnum >> 1, - (epnum & 1) ? "in" : "out"); + direction ? "in" : "out"); dep->endpoint.name = dep->name; @@ -2053,7 +2021,7 @@ static int dwc3_gadget_init_hw_endpoints(struct dwc3 *dwc, /* MDWIDTH is represented in bits, we need it in bytes */ mdwidth /= 8; - size = dwc3_readl(dwc->regs, DWC3_GTXFIFOSIZ(i)); + size = dwc3_readl(dwc->regs, DWC3_GTXFIFOSIZ(epnum >> 1)); size = DWC3_GTXFIFOSIZ_TXFDEF(size); /* FIFO Depth is in MDWDITH bytes. 
Multiply */ @@ -2103,7 +2071,7 @@ static int dwc3_gadget_init_hw_endpoints(struct dwc3 *dwc, dep->endpoint.caps.type_int = true; } - dep->endpoint.caps.dir_in = !!direction; + dep->endpoint.caps.dir_in = direction; dep->endpoint.caps.dir_out = !direction; INIT_LIST_HEAD(&dep->pending_list); @@ -2113,27 +2081,6 @@ static int dwc3_gadget_init_hw_endpoints(struct dwc3 *dwc, return 0; } -static int dwc3_gadget_init_endpoints(struct dwc3 *dwc) -{ - int ret; - - INIT_LIST_HEAD(&dwc->gadget.ep_list); - - ret = dwc3_gadget_init_hw_endpoints(dwc, dwc->num_out_eps, 0); - if (ret < 0) { - dev_err(dwc->dev, "failed to initialize OUT endpoints\n"); - return ret; - } - - ret = dwc3_gadget_init_hw_endpoints(dwc, dwc->num_in_eps, 1); - if (ret < 0) { - dev_err(dwc->dev, "failed to initialize IN endpoints\n"); - return ret; - } - - return 0; -} - static void dwc3_gadget_free_endpoints(struct dwc3 *dwc) { struct dwc3_ep *dep; @@ -2197,7 +2144,7 @@ static int __dwc3_cleanup_done_trbs(struct dwc3 *dwc, struct dwc3_ep *dep, * with one TRB pending in the ring. We need to manually clear HWO bit * from that TRB. */ - if (req->unaligned && (trb->ctrl & DWC3_TRB_CTRL_HWO)) { + if ((req->zero || req->unaligned) && (trb->ctrl & DWC3_TRB_CTRL_HWO)) { trb->ctrl &= ~DWC3_TRB_CTRL_HWO; return 1; } @@ -2291,11 +2238,12 @@ static int dwc3_cleanup_done_reqs(struct dwc3 *dwc, struct dwc3_ep *dep, event, status, chain); } - if (req->unaligned) { + if (req->unaligned || req->zero) { trb = &dep->trb_pool[dep->trb_dequeue]; ret = __dwc3_cleanup_done_trbs(dwc, dep, req, trb, event, status, false); req->unaligned = false; + req->zero = false; } req->request.actual = length - req->remaining; @@ -3161,49 +3109,26 @@ int dwc3_gadget_init(struct dwc3 *dwc) dwc->irq_gadget = irq; - dwc->ctrl_req = dma_alloc_coherent(dwc->sysdev, sizeof(*dwc->ctrl_req), - &dwc->ctrl_req_addr, GFP_KERNEL); - if (!dwc->ctrl_req) { - dev_err(dwc->dev, "failed to allocate ctrl request\n"); - ret = -ENOMEM; - goto err0; - } - dwc->ep0_trb = dma_alloc_coherent(dwc->sysdev, sizeof(*dwc->ep0_trb) * 2, &dwc->ep0_trb_addr, GFP_KERNEL); if (!dwc->ep0_trb) { dev_err(dwc->dev, "failed to allocate ep0 trb\n"); ret = -ENOMEM; - goto err1; + goto err0; } - dwc->setup_buf = kzalloc(DWC3_EP0_BOUNCE_SIZE, GFP_KERNEL); + dwc->setup_buf = kzalloc(DWC3_EP0_SETUP_SIZE, GFP_KERNEL); if (!dwc->setup_buf) { ret = -ENOMEM; - goto err2; - } - - dwc->ep0_bounce = dma_alloc_coherent(dwc->sysdev, - DWC3_EP0_BOUNCE_SIZE, &dwc->ep0_bounce_addr, - GFP_KERNEL); - if (!dwc->ep0_bounce) { - dev_err(dwc->dev, "failed to allocate ep0 bounce buffer\n"); - ret = -ENOMEM; - goto err3; - } - - dwc->zlp_buf = kzalloc(DWC3_ZLP_BUF_SIZE, GFP_KERNEL); - if (!dwc->zlp_buf) { - ret = -ENOMEM; - goto err4; + goto err1; } dwc->bounce = dma_alloc_coherent(dwc->sysdev, DWC3_BOUNCE_SIZE, &dwc->bounce_addr, GFP_KERNEL); if (!dwc->bounce) { ret = -ENOMEM; - goto err5; + goto err2; } init_completion(&dwc->ep0_in_setup); @@ -3241,39 +3166,31 @@ int dwc3_gadget_init(struct dwc3 *dwc) * sure we're starting from a well known location. 
*/ - ret = dwc3_gadget_init_endpoints(dwc); + ret = dwc3_gadget_init_endpoints(dwc, dwc->num_eps); if (ret) - goto err6; + goto err3; ret = usb_add_gadget_udc(dwc->dev, &dwc->gadget); if (ret) { dev_err(dwc->dev, "failed to register udc\n"); - goto err6; + goto err4; } return 0; -err6: - dma_free_coherent(dwc->sysdev, DWC3_BOUNCE_SIZE, dwc->bounce, - dwc->bounce_addr); - -err5: - kfree(dwc->zlp_buf); err4: dwc3_gadget_free_endpoints(dwc); - dma_free_coherent(dwc->sysdev, DWC3_EP0_BOUNCE_SIZE, - dwc->ep0_bounce, dwc->ep0_bounce_addr); err3: - kfree(dwc->setup_buf); + dma_free_coherent(dwc->sysdev, DWC3_BOUNCE_SIZE, dwc->bounce, + dwc->bounce_addr); err2: - dma_free_coherent(dwc->sysdev, sizeof(*dwc->ep0_trb) * 2, - dwc->ep0_trb, dwc->ep0_trb_addr); + kfree(dwc->setup_buf); err1: - dma_free_coherent(dwc->sysdev, sizeof(*dwc->ctrl_req), - dwc->ctrl_req, dwc->ctrl_req_addr); + dma_free_coherent(dwc->sysdev, sizeof(*dwc->ep0_trb) * 2, + dwc->ep0_trb, dwc->ep0_trb_addr); err0: return ret; @@ -3284,22 +3201,12 @@ int dwc3_gadget_init(struct dwc3 *dwc) void dwc3_gadget_exit(struct dwc3 *dwc) { usb_del_gadget_udc(&dwc->gadget); - dwc3_gadget_free_endpoints(dwc); - dma_free_coherent(dwc->sysdev, DWC3_BOUNCE_SIZE, dwc->bounce, - dwc->bounce_addr); - dma_free_coherent(dwc->sysdev, DWC3_EP0_BOUNCE_SIZE, - dwc->ep0_bounce, dwc->ep0_bounce_addr); - + dwc->bounce_addr); kfree(dwc->setup_buf); - kfree(dwc->zlp_buf); - dma_free_coherent(dwc->sysdev, sizeof(*dwc->ep0_trb) * 2, - dwc->ep0_trb, dwc->ep0_trb_addr); - - dma_free_coherent(dwc->sysdev, sizeof(*dwc->ctrl_req), - dwc->ctrl_req, dwc->ctrl_req_addr); + dwc->ep0_trb, dwc->ep0_trb_addr); } int dwc3_gadget_suspend(struct dwc3 *dwc) diff --git a/drivers/usb/dwc3/gadget.h b/drivers/usb/dwc3/gadget.h index 265e223ab64554f6be78d37d15e36a86bb571c74..e4602d0e515bbd6473c534b227844a07bc792b30 100644 --- a/drivers/usb/dwc3/gadget.h +++ b/drivers/usb/dwc3/gadget.h @@ -29,16 +29,16 @@ struct dwc3; /* DEPCFG parameter 1 */ #define DWC3_DEPCFG_INT_NUM(n) (((n) & 0x1f) << 0) -#define DWC3_DEPCFG_XFER_COMPLETE_EN (1 << 8) -#define DWC3_DEPCFG_XFER_IN_PROGRESS_EN (1 << 9) -#define DWC3_DEPCFG_XFER_NOT_READY_EN (1 << 10) -#define DWC3_DEPCFG_FIFO_ERROR_EN (1 << 11) -#define DWC3_DEPCFG_STREAM_EVENT_EN (1 << 13) +#define DWC3_DEPCFG_XFER_COMPLETE_EN BIT(8) +#define DWC3_DEPCFG_XFER_IN_PROGRESS_EN BIT(9) +#define DWC3_DEPCFG_XFER_NOT_READY_EN BIT(10) +#define DWC3_DEPCFG_FIFO_ERROR_EN BIT(11) +#define DWC3_DEPCFG_STREAM_EVENT_EN BIT(13) #define DWC3_DEPCFG_BINTERVAL_M1(n) (((n) & 0xff) << 16) -#define DWC3_DEPCFG_STREAM_CAPABLE (1 << 24) +#define DWC3_DEPCFG_STREAM_CAPABLE BIT(24) #define DWC3_DEPCFG_EP_NUMBER(n) (((n) & 0x1f) << 25) -#define DWC3_DEPCFG_BULK_BASED (1 << 30) -#define DWC3_DEPCFG_FIFO_BASED (1 << 31) +#define DWC3_DEPCFG_BULK_BASED BIT(30) +#define DWC3_DEPCFG_FIFO_BASED BIT(31) /* DEPCFG parameter 0 */ #define DWC3_DEPCFG_EP_TYPE(n) (((n) & 0x3) << 1) @@ -47,10 +47,10 @@ struct dwc3; #define DWC3_DEPCFG_BURST_SIZE(n) (((n) & 0xf) << 22) #define DWC3_DEPCFG_DATA_SEQ_NUM(n) ((n) << 26) /* This applies for core versions earlier than 1.94a */ -#define DWC3_DEPCFG_IGN_SEQ_NUM (1 << 31) +#define DWC3_DEPCFG_IGN_SEQ_NUM BIT(31) /* These apply for core versions 1.94a and later */ #define DWC3_DEPCFG_ACTION_INIT (0 << 30) -#define DWC3_DEPCFG_ACTION_RESTORE (1 << 30) +#define DWC3_DEPCFG_ACTION_RESTORE BIT(30) #define DWC3_DEPCFG_ACTION_MODIFY (2 << 30) /* DEPXFERCFG parameter 0 */ diff --git a/drivers/usb/dwc3/trace.h b/drivers/usb/dwc3/trace.h index
2b124f94d858487ec9694683f618c0b1448e64d4..f1bd444d22a36b5a331bee6f2187d8a50094085d 100644 --- a/drivers/usb/dwc3/trace.h +++ b/drivers/usb/dwc3/trace.h @@ -27,31 +27,6 @@ #include "core.h" #include "debug.h" -DECLARE_EVENT_CLASS(dwc3_log_msg, - TP_PROTO(struct va_format *vaf), - TP_ARGS(vaf), - TP_STRUCT__entry(__dynamic_array(char, msg, DWC3_MSG_MAX)), - TP_fast_assign( - vsnprintf(__get_str(msg), DWC3_MSG_MAX, vaf->fmt, *vaf->va); - ), - TP_printk("%s", __get_str(msg)) -); - -DEFINE_EVENT(dwc3_log_msg, dwc3_gadget, - TP_PROTO(struct va_format *vaf), - TP_ARGS(vaf) -); - -DEFINE_EVENT(dwc3_log_msg, dwc3_core, - TP_PROTO(struct va_format *vaf), - TP_ARGS(vaf) -); - -DEFINE_EVENT(dwc3_log_msg, dwc3_ep0, - TP_PROTO(struct va_format *vaf), - TP_ARGS(vaf) -); - DECLARE_EVENT_CLASS(dwc3_log_io, TP_PROTO(void *base, u32 offset, u32 value), TP_ARGS(base, offset, value), @@ -198,7 +173,7 @@ DECLARE_EVENT_CLASS(dwc3_log_generic_cmd, __entry->param = param; __entry->status = status; ), - TP_printk("cmd '%s' [%d] param %08x --> status: %s", + TP_printk("cmd '%s' [%x] param %08x --> status: %s", dwc3_gadget_generic_cmd_string(__entry->cmd), __entry->cmd, __entry->param, dwc3_gadget_generic_cmd_status_string(__entry->status) @@ -298,36 +273,7 @@ DECLARE_EVENT_CLASS(dwc3_log_trb, __entry->ctrl & DWC3_TRB_CTRL_CSP ? 'S' : 's', __entry->ctrl & DWC3_TRB_CTRL_ISP_IMI ? 'S' : 's', __entry->ctrl & DWC3_TRB_CTRL_IOC ? 'C' : 'c', - ({char *s; - switch (__entry->ctrl & 0x3f0) { - case DWC3_TRBCTL_NORMAL: - s = "normal"; - break; - case DWC3_TRBCTL_CONTROL_SETUP: - s = "setup"; - break; - case DWC3_TRBCTL_CONTROL_STATUS2: - s = "status2"; - break; - case DWC3_TRBCTL_CONTROL_STATUS3: - s = "status3"; - break; - case DWC3_TRBCTL_CONTROL_DATA: - s = "data"; - break; - case DWC3_TRBCTL_ISOCHRONOUS_FIRST: - s = "isoc-first"; - break; - case DWC3_TRBCTL_ISOCHRONOUS: - s = "isoc"; - break; - case DWC3_TRBCTL_LINK_TRB: - s = "link"; - break; - default: - s = "UNKNOWN"; - break; - } s; }) + dwc3_trb_type_string(DWC3_TRBCTL_TYPE(__entry->ctrl)) ) ); diff --git a/drivers/usb/early/Makefile b/drivers/usb/early/Makefile index 24bbe519c737647b202dfd00c020a32e1721830e..fcde2286da1cc18fecff8ccb6289dd733bd1e4de 100644 --- a/drivers/usb/early/Makefile +++ b/drivers/usb/early/Makefile @@ -3,3 +3,4 @@ # obj-$(CONFIG_EARLY_PRINTK_DBGP) += ehci-dbgp.o +obj-$(CONFIG_EARLY_PRINTK_USB_XDBC) += xhci-dbc.o diff --git a/drivers/usb/early/xhci-dbc.c b/drivers/usb/early/xhci-dbc.c new file mode 100644 index 0000000000000000000000000000000000000000..1268818e2263a28645642e9789360eade8ee6b2b --- /dev/null +++ b/drivers/usb/early/xhci-dbc.c @@ -0,0 +1,1014 @@ +/** + * xhci-dbc.c - xHCI debug capability early driver + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Lu Baolu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s: " fmt, __func__ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../host/xhci.h" +#include "xhci-dbc.h" + +static struct xdbc_state xdbc; +static bool early_console_keep; + +#define XDBC_TRACE +#ifdef XDBC_TRACE +#define xdbc_trace trace_printk +#else +static inline void xdbc_trace(const char *fmt, ...) 
{ } +#endif /* XDBC_TRACE */ + +static void __iomem * __init xdbc_map_pci_mmio(u32 bus, u32 dev, u32 func) +{ + u64 val64, sz64, mask64; + void __iomem *base; + u32 val, sz; + u8 byte; + + val = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0); + write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0, ~0); + sz = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0); + write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0, val); + + if (val == 0xffffffff || sz == 0xffffffff) { + pr_notice("invalid mmio bar\n"); + return NULL; + } + + val64 = val & PCI_BASE_ADDRESS_MEM_MASK; + sz64 = sz & PCI_BASE_ADDRESS_MEM_MASK; + mask64 = PCI_BASE_ADDRESS_MEM_MASK; + + if ((val & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == PCI_BASE_ADDRESS_MEM_TYPE_64) { + val = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4); + write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4, ~0); + sz = read_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4); + write_pci_config(bus, dev, func, PCI_BASE_ADDRESS_0 + 4, val); + + val64 |= (u64)val << 32; + sz64 |= (u64)sz << 32; + mask64 |= ~0ULL << 32; + } + + sz64 &= mask64; + + if (!sz64) { + pr_notice("invalid mmio address\n"); + return NULL; + } + + sz64 = 1ULL << __ffs64(sz64); + + /* Check if the mem space is enabled: */ + byte = read_pci_config_byte(bus, dev, func, PCI_COMMAND); + if (!(byte & PCI_COMMAND_MEMORY)) { + byte |= PCI_COMMAND_MEMORY; + write_pci_config_byte(bus, dev, func, PCI_COMMAND, byte); + } + + xdbc.xhci_start = val64; + xdbc.xhci_length = sz64; + base = early_ioremap(val64, sz64); + + return base; +} + +static void * __init xdbc_get_page(dma_addr_t *dma_addr) +{ + void *virt; + + virt = alloc_bootmem_pages_nopanic(PAGE_SIZE); + if (!virt) + return NULL; + + if (dma_addr) + *dma_addr = (dma_addr_t)__pa(virt); + + return virt; +} + +static u32 __init xdbc_find_dbgp(int xdbc_num, u32 *b, u32 *d, u32 *f) +{ + u32 bus, dev, func, class; + + for (bus = 0; bus < XDBC_PCI_MAX_BUSES; bus++) { + for (dev = 0; dev < XDBC_PCI_MAX_DEVICES; dev++) { + for (func = 0; func < XDBC_PCI_MAX_FUNCTION; func++) { + + class = read_pci_config(bus, dev, func, PCI_CLASS_REVISION); + if ((class >> 8) != PCI_CLASS_SERIAL_USB_XHCI) + continue; + + if (xdbc_num-- != 0) + continue; + + *b = bus; + *d = dev; + *f = func; + + return 0; + } + } + } + + return -1; +} + +static int handshake(void __iomem *ptr, u32 mask, u32 done, int wait, int delay) +{ + u32 result; + + do { + result = readl(ptr); + result &= mask; + if (result == done) + return 0; + udelay(delay); + wait -= delay; + } while (wait > 0); + + return -ETIMEDOUT; +} + +static void __init xdbc_bios_handoff(void) +{ + int offset, timeout; + u32 val; + + offset = xhci_find_next_ext_cap(xdbc.xhci_base, 0, XHCI_EXT_CAPS_LEGACY); + val = readl(xdbc.xhci_base + offset); + + if (val & XHCI_HC_BIOS_OWNED) { + writel(val | XHCI_HC_OS_OWNED, xdbc.xhci_base + offset); + timeout = handshake(xdbc.xhci_base + offset, XHCI_HC_BIOS_OWNED, 0, 5000, 10); + + if (timeout) { + pr_notice("failed to hand over xHCI control from BIOS\n"); + writel(val & ~XHCI_HC_BIOS_OWNED, xdbc.xhci_base + offset); + } + } + + /* Disable BIOS SMIs and clear all SMI events: */ + val = readl(xdbc.xhci_base + offset + XHCI_LEGACY_CONTROL_OFFSET); + val &= XHCI_LEGACY_DISABLE_SMI; + val |= XHCI_LEGACY_SMI_EVENTS; + writel(val, xdbc.xhci_base + offset + XHCI_LEGACY_CONTROL_OFFSET); +} + +static int __init +xdbc_alloc_ring(struct xdbc_segment *seg, struct xdbc_ring *ring) +{ + seg->trbs = xdbc_get_page(&seg->dma); + if (!seg->trbs) + return -ENOMEM; + + ring->segment = seg; + 
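The handshake() helper defined just above is the only synchronization this early driver has against the controller: read a register, mask it, compare against the expected value, and burn down a timeout budget in fixed delay steps. A standalone sketch of that pattern follows; reg_read() and poll_reg() are invented stand-ins for readl() plus udelay(), not driver symbols, and the simulated register simply reports "done" from the third poll on.

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* Simulated MMIO register: becomes ready after a few polls. */
static uint32_t reg_read(void)
{
	static int reads;
	return ++reads >= 3 ? 0x1 : 0x0;
}

/* Same shape as handshake(): mask, compare, step the budget down. */
static int poll_reg(uint32_t mask, uint32_t done, int budget_us, int step_us)
{
	do {
		if ((reg_read() & mask) == done)
			return 0;
		/* the real helper calls udelay(step_us) here */
		budget_us -= step_us;
	} while (budget_us > 0);

	return -ETIMEDOUT;
}

int main(void)
{
	printf("poll result: %d\n", poll_reg(0x1, 0x1, 100000, 100));
	return 0;
}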
+ return 0; +} + +static void __init xdbc_free_ring(struct xdbc_ring *ring) +{ + struct xdbc_segment *seg = ring->segment; + + if (!seg) + return; + + free_bootmem(seg->dma, PAGE_SIZE); + ring->segment = NULL; +} + +static void xdbc_reset_ring(struct xdbc_ring *ring) +{ + struct xdbc_segment *seg = ring->segment; + struct xdbc_trb *link_trb; + + memset(seg->trbs, 0, PAGE_SIZE); + + ring->enqueue = seg->trbs; + ring->dequeue = seg->trbs; + ring->cycle_state = 1; + + if (ring != &xdbc.evt_ring) { + link_trb = &seg->trbs[XDBC_TRBS_PER_SEGMENT - 1]; + link_trb->field[0] = cpu_to_le32(lower_32_bits(seg->dma)); + link_trb->field[1] = cpu_to_le32(upper_32_bits(seg->dma)); + link_trb->field[3] = cpu_to_le32(TRB_TYPE(TRB_LINK)) | cpu_to_le32(LINK_TOGGLE); + } +} + +static inline void xdbc_put_utf16(u16 *s, const char *c, size_t size) +{ + int i; + + for (i = 0; i < size; i++) + s[i] = cpu_to_le16(c[i]); +} + +static void xdbc_mem_init(void) +{ + struct xdbc_ep_context *ep_in, *ep_out; + struct usb_string_descriptor *s_desc; + struct xdbc_erst_entry *entry; + struct xdbc_strings *strings; + struct xdbc_context *ctx; + unsigned int max_burst; + u32 string_length; + int index = 0; + u32 dev_info; + + xdbc_reset_ring(&xdbc.evt_ring); + xdbc_reset_ring(&xdbc.in_ring); + xdbc_reset_ring(&xdbc.out_ring); + memset(xdbc.table_base, 0, PAGE_SIZE); + memset(xdbc.out_buf, 0, PAGE_SIZE); + + /* Initialize event ring segment table: */ + xdbc.erst_size = 16; + xdbc.erst_base = xdbc.table_base + index * XDBC_TABLE_ENTRY_SIZE; + xdbc.erst_dma = xdbc.table_dma + index * XDBC_TABLE_ENTRY_SIZE; + + index += XDBC_ERST_ENTRY_NUM; + entry = (struct xdbc_erst_entry *)xdbc.erst_base; + + entry->seg_addr = cpu_to_le64(xdbc.evt_seg.dma); + entry->seg_size = cpu_to_le32(XDBC_TRBS_PER_SEGMENT); + entry->__reserved_0 = 0; + + /* Initialize ERST registers: */ + writel(1, &xdbc.xdbc_reg->ersts); + xdbc_write64(xdbc.erst_dma, &xdbc.xdbc_reg->erstba); + xdbc_write64(xdbc.evt_seg.dma, &xdbc.xdbc_reg->erdp); + + /* Debug capability contexts: */ + xdbc.dbcc_size = 64 * 3; + xdbc.dbcc_base = xdbc.table_base + index * XDBC_TABLE_ENTRY_SIZE; + xdbc.dbcc_dma = xdbc.table_dma + index * XDBC_TABLE_ENTRY_SIZE; + + index += XDBC_DBCC_ENTRY_NUM; + + /* Popluate the strings: */ + xdbc.string_size = sizeof(struct xdbc_strings); + xdbc.string_base = xdbc.table_base + index * XDBC_TABLE_ENTRY_SIZE; + xdbc.string_dma = xdbc.table_dma + index * XDBC_TABLE_ENTRY_SIZE; + strings = (struct xdbc_strings *)xdbc.string_base; + + index += XDBC_STRING_ENTRY_NUM; + + /* Serial string: */ + s_desc = (struct usb_string_descriptor *)strings->serial; + s_desc->bLength = (strlen(XDBC_STRING_SERIAL) + 1) * 2; + s_desc->bDescriptorType = USB_DT_STRING; + + xdbc_put_utf16(s_desc->wData, XDBC_STRING_SERIAL, strlen(XDBC_STRING_SERIAL)); + string_length = s_desc->bLength; + string_length <<= 8; + + /* Product string: */ + s_desc = (struct usb_string_descriptor *)strings->product; + s_desc->bLength = (strlen(XDBC_STRING_PRODUCT) + 1) * 2; + s_desc->bDescriptorType = USB_DT_STRING; + + xdbc_put_utf16(s_desc->wData, XDBC_STRING_PRODUCT, strlen(XDBC_STRING_PRODUCT)); + string_length += s_desc->bLength; + string_length <<= 8; + + /* Manufacture string: */ + s_desc = (struct usb_string_descriptor *)strings->manufacturer; + s_desc->bLength = (strlen(XDBC_STRING_MANUFACTURER) + 1) * 2; + s_desc->bDescriptorType = USB_DT_STRING; + + xdbc_put_utf16(s_desc->wData, XDBC_STRING_MANUFACTURER, strlen(XDBC_STRING_MANUFACTURER)); + string_length += s_desc->bLength; + string_length <<= 
8; + + /* String0: */ + strings->string0[0] = 4; + strings->string0[1] = USB_DT_STRING; + strings->string0[2] = 0x09; + strings->string0[3] = 0x04; + + string_length += 4; + + /* Populate info Context: */ + ctx = (struct xdbc_context *)xdbc.dbcc_base; + + ctx->info.string0 = cpu_to_le64(xdbc.string_dma); + ctx->info.manufacturer = cpu_to_le64(xdbc.string_dma + XDBC_MAX_STRING_LENGTH); + ctx->info.product = cpu_to_le64(xdbc.string_dma + XDBC_MAX_STRING_LENGTH * 2); + ctx->info.serial = cpu_to_le64(xdbc.string_dma + XDBC_MAX_STRING_LENGTH * 3); + ctx->info.length = cpu_to_le32(string_length); + + /* Populate bulk out endpoint context: */ + max_burst = DEBUG_MAX_BURST(readl(&xdbc.xdbc_reg->control)); + ep_out = (struct xdbc_ep_context *)&ctx->out; + + ep_out->ep_info1 = 0; + ep_out->ep_info2 = cpu_to_le32(EP_TYPE(BULK_OUT_EP) | MAX_PACKET(1024) | MAX_BURST(max_burst)); + ep_out->deq = cpu_to_le64(xdbc.out_seg.dma | xdbc.out_ring.cycle_state); + + /* Populate bulk in endpoint context: */ + ep_in = (struct xdbc_ep_context *)&ctx->in; + + ep_in->ep_info1 = 0; + ep_in->ep_info2 = cpu_to_le32(EP_TYPE(BULK_IN_EP) | MAX_PACKET(1024) | MAX_BURST(max_burst)); + ep_in->deq = cpu_to_le64(xdbc.in_seg.dma | xdbc.in_ring.cycle_state); + + /* Set DbC context and info registers: */ + xdbc_write64(xdbc.dbcc_dma, &xdbc.xdbc_reg->dccp); + + dev_info = cpu_to_le32((XDBC_VENDOR_ID << 16) | XDBC_PROTOCOL); + writel(dev_info, &xdbc.xdbc_reg->devinfo1); + + dev_info = cpu_to_le32((XDBC_DEVICE_REV << 16) | XDBC_PRODUCT_ID); + writel(dev_info, &xdbc.xdbc_reg->devinfo2); + + xdbc.in_buf = xdbc.out_buf + XDBC_MAX_PACKET; + xdbc.in_dma = xdbc.out_dma + XDBC_MAX_PACKET; +} + +static void xdbc_do_reset_debug_port(u32 id, u32 count) +{ + void __iomem *ops_reg; + void __iomem *portsc; + u32 val, cap_length; + int i; + + cap_length = readl(xdbc.xhci_base) & 0xff; + ops_reg = xdbc.xhci_base + cap_length; + + id--; + for (i = id; i < (id + count); i++) { + portsc = ops_reg + 0x400 + i * 0x10; + val = readl(portsc); + if (!(val & PORT_CONNECT)) + writel(val | PORT_RESET, portsc); + } +} + +static void xdbc_reset_debug_port(void) +{ + u32 val, port_offset, port_count; + int offset = 0; + + do { + offset = xhci_find_next_ext_cap(xdbc.xhci_base, offset, XHCI_EXT_CAPS_PROTOCOL); + if (!offset) + break; + + val = readl(xdbc.xhci_base + offset); + if (XHCI_EXT_PORT_MAJOR(val) != 0x3) + continue; + + val = readl(xdbc.xhci_base + offset + 8); + port_offset = XHCI_EXT_PORT_OFF(val); + port_count = XHCI_EXT_PORT_COUNT(val); + + xdbc_do_reset_debug_port(port_offset, port_count); + } while (1); +} + +static void +xdbc_queue_trb(struct xdbc_ring *ring, u32 field1, u32 field2, u32 field3, u32 field4) +{ + struct xdbc_trb *trb, *link_trb; + + trb = ring->enqueue; + trb->field[0] = cpu_to_le32(field1); + trb->field[1] = cpu_to_le32(field2); + trb->field[2] = cpu_to_le32(field3); + trb->field[3] = cpu_to_le32(field4); + + ++(ring->enqueue); + if (ring->enqueue >= &ring->segment->trbs[TRBS_PER_SEGMENT - 1]) { + link_trb = ring->enqueue; + if (ring->cycle_state) + link_trb->field[3] |= cpu_to_le32(TRB_CYCLE); + else + link_trb->field[3] &= cpu_to_le32(~TRB_CYCLE); + + ring->enqueue = ring->segment->trbs; + ring->cycle_state ^= 1; + } +} + +static void xdbc_ring_doorbell(int target) +{ + writel(DOOR_BELL_TARGET(target), &xdbc.xdbc_reg->doorbell); +} + +static int xdbc_start(void) +{ + u32 ctrl, status; + int ret; + + ctrl = readl(&xdbc.xdbc_reg->control); + writel(ctrl | CTRL_DBC_ENABLE | CTRL_PORT_ENABLE, &xdbc.xdbc_reg->control); + ret =
handshake(&xdbc.xdbc_reg->control, CTRL_DBC_ENABLE, CTRL_DBC_ENABLE, 100000, 100); + if (ret) { + xdbc_trace("failed to initialize hardware\n"); + return ret; + } + + /* Reset port to avoid bus hang: */ + if (xdbc.vendor == PCI_VENDOR_ID_INTEL) + xdbc_reset_debug_port(); + + /* Wait for port connection: */ + ret = handshake(&xdbc.xdbc_reg->portsc, PORTSC_CONN_STATUS, PORTSC_CONN_STATUS, 5000000, 100); + if (ret) { + xdbc_trace("waiting for connection timed out\n"); + return ret; + } + + /* Wait for debug device to be configured: */ + ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_RUN, CTRL_DBC_RUN, 5000000, 100); + if (ret) { + xdbc_trace("waiting for device configuration timed out\n"); + return ret; + } + + /* Check port number: */ + status = readl(&xdbc.xdbc_reg->status); + if (!DCST_DEBUG_PORT(status)) { + xdbc_trace("invalid root hub port number\n"); + return -ENODEV; + } + + xdbc.port_number = DCST_DEBUG_PORT(status); + + xdbc_trace("DbC is running now, control 0x%08x port ID %d\n", + readl(&xdbc.xdbc_reg->control), xdbc.port_number); + + return 0; +} + +static int xdbc_bulk_transfer(void *data, int size, bool read) +{ + struct xdbc_ring *ring; + struct xdbc_trb *trb; + u32 length, control; + u32 cycle; + u64 addr; + + if (size > XDBC_MAX_PACKET) { + xdbc_trace("bad parameter, size %d\n", size); + return -EINVAL; + } + + if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED) || + !(xdbc.flags & XDBC_FLAGS_CONFIGURED) || + (!read && (xdbc.flags & XDBC_FLAGS_OUT_STALL)) || + (read && (xdbc.flags & XDBC_FLAGS_IN_STALL))) { + + xdbc_trace("connection not ready, flags %08x\n", xdbc.flags); + return -EIO; + } + + ring = (read ? &xdbc.in_ring : &xdbc.out_ring); + trb = ring->enqueue; + cycle = ring->cycle_state; + length = TRB_LEN(size); + control = TRB_TYPE(TRB_NORMAL) | TRB_IOC; + + if (cycle) + control &= cpu_to_le32(~TRB_CYCLE); + else + control |= cpu_to_le32(TRB_CYCLE); + + if (read) { + memset(xdbc.in_buf, 0, XDBC_MAX_PACKET); + addr = xdbc.in_dma; + xdbc.flags |= XDBC_FLAGS_IN_PROCESS; + } else { + memset(xdbc.out_buf, 0, XDBC_MAX_PACKET); + memcpy(xdbc.out_buf, data, size); + addr = xdbc.out_dma; + xdbc.flags |= XDBC_FLAGS_OUT_PROCESS; + } + + xdbc_queue_trb(ring, lower_32_bits(addr), upper_32_bits(addr), length, control); + + /* + * Add a barrier between writes of trb fields and flipping + * the cycle bit: + */ + wmb(); + if (cycle) + trb->field[3] |= cpu_to_le32(cycle); + else + trb->field[3] &= cpu_to_le32(~TRB_CYCLE); + + xdbc_ring_doorbell(read ? 
IN_EP_DOORBELL : OUT_EP_DOORBELL); + + return size; +} + +static int xdbc_handle_external_reset(void) +{ + int ret = 0; + + xdbc.flags = 0; + writel(0, &xdbc.xdbc_reg->control); + ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_ENABLE, 0, 100000, 10); + if (ret) + goto reset_out; + + xdbc_mem_init(); + + mmiowb(); + + ret = xdbc_start(); + if (ret < 0) + goto reset_out; + + xdbc_trace("dbc recovered\n"); + + xdbc.flags |= XDBC_FLAGS_INITIALIZED | XDBC_FLAGS_CONFIGURED; + + xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true); + + return 0; + +reset_out: + xdbc_trace("failed to recover from external reset\n"); + return ret; +} + +static int __init xdbc_early_setup(void) +{ + int ret; + + writel(0, &xdbc.xdbc_reg->control); + ret = handshake(&xdbc.xdbc_reg->control, CTRL_DBC_ENABLE, 0, 100000, 100); + if (ret) + return ret; + + /* Allocate the table page: */ + xdbc.table_base = xdbc_get_page(&xdbc.table_dma); + if (!xdbc.table_base) + return -ENOMEM; + + /* Get and store the transfer buffer: */ + xdbc.out_buf = xdbc_get_page(&xdbc.out_dma); + if (!xdbc.out_buf) + return -ENOMEM; + + /* Allocate the event ring: */ + ret = xdbc_alloc_ring(&xdbc.evt_seg, &xdbc.evt_ring); + if (ret < 0) + return ret; + + /* Allocate IN/OUT endpoint transfer rings: */ + ret = xdbc_alloc_ring(&xdbc.in_seg, &xdbc.in_ring); + if (ret < 0) + return ret; + + ret = xdbc_alloc_ring(&xdbc.out_seg, &xdbc.out_ring); + if (ret < 0) + return ret; + + xdbc_mem_init(); + + mmiowb(); + + ret = xdbc_start(); + if (ret < 0) { + writel(0, &xdbc.xdbc_reg->control); + return ret; + } + + xdbc.flags |= XDBC_FLAGS_INITIALIZED | XDBC_FLAGS_CONFIGURED; + + xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true); + + return 0; +} + +int __init early_xdbc_parse_parameter(char *s) +{ + unsigned long dbgp_num = 0; + u32 bus, dev, func, offset; + int ret; + + if (!early_pci_allowed()) + return -EPERM; + + if (strstr(s, "keep")) + early_console_keep = true; + + if (xdbc.xdbc_reg) + return 0; + + if (*s && kstrtoul(s, 0, &dbgp_num)) + dbgp_num = 0; + + pr_notice("dbgp_num: %lu\n", dbgp_num); + + /* Locate the host controller: */ + ret = xdbc_find_dbgp(dbgp_num, &bus, &dev, &func); + if (ret) { + pr_notice("failed to locate xhci host\n"); + return -ENODEV; + } + + xdbc.vendor = read_pci_config_16(bus, dev, func, PCI_VENDOR_ID); + xdbc.device = read_pci_config_16(bus, dev, func, PCI_DEVICE_ID); + xdbc.bus = bus; + xdbc.dev = dev; + xdbc.func = func; + + /* Map the IO memory: */ + xdbc.xhci_base = xdbc_map_pci_mmio(bus, dev, func); + if (!xdbc.xhci_base) + return -EINVAL; + + /* Locate DbC registers: */ + offset = xhci_find_next_ext_cap(xdbc.xhci_base, 0, XHCI_EXT_CAPS_DEBUG); + if (!offset) { + pr_notice("xhci host doesn't support debug capability\n"); + early_iounmap(xdbc.xhci_base, xdbc.xhci_length); + xdbc.xhci_base = NULL; + xdbc.xhci_length = 0; + + return -ENODEV; + } + xdbc.xdbc_reg = (struct xdbc_regs __iomem *)(xdbc.xhci_base + offset); + + return 0; +} + +int __init early_xdbc_setup_hardware(void) +{ + int ret; + + if (!xdbc.xdbc_reg) + return -ENODEV; + + xdbc_bios_handoff(); + + raw_spin_lock_init(&xdbc.lock); + + ret = xdbc_early_setup(); + if (ret) { + pr_notice("failed to setup the connection to host\n"); + + xdbc_free_ring(&xdbc.evt_ring); + xdbc_free_ring(&xdbc.out_ring); + xdbc_free_ring(&xdbc.in_ring); + + if (xdbc.table_dma) + free_bootmem(xdbc.table_dma, PAGE_SIZE); + + if (xdbc.out_dma) + free_bootmem(xdbc.out_dma, PAGE_SIZE); + + xdbc.table_base = NULL; + xdbc.out_buf = NULL; + } + + return ret; +} + +static void 
xdbc_handle_port_status(struct xdbc_trb *evt_trb) +{ + u32 port_reg; + + port_reg = readl(&xdbc.xdbc_reg->portsc); + if (port_reg & PORTSC_CONN_CHANGE) { + xdbc_trace("connect status change event\n"); + + /* Check whether cable unplugged: */ + if (!(port_reg & PORTSC_CONN_STATUS)) { + xdbc.flags = 0; + xdbc_trace("cable unplugged\n"); + } + } + + if (port_reg & PORTSC_RESET_CHANGE) + xdbc_trace("port reset change event\n"); + + if (port_reg & PORTSC_LINK_CHANGE) + xdbc_trace("port link status change event\n"); + + if (port_reg & PORTSC_CONFIG_CHANGE) + xdbc_trace("config error change\n"); + + /* Write back the value to clear RW1C bits: */ + writel(port_reg, &xdbc.xdbc_reg->portsc); +} + +static void xdbc_handle_tx_event(struct xdbc_trb *evt_trb) +{ + size_t remain_length; + u32 comp_code; + int ep_id; + + comp_code = GET_COMP_CODE(le32_to_cpu(evt_trb->field[2])); + remain_length = EVENT_TRB_LEN(le32_to_cpu(evt_trb->field[2])); + ep_id = TRB_TO_EP_ID(le32_to_cpu(evt_trb->field[3])); + + switch (comp_code) { + case COMP_SUCCESS: + remain_length = 0; + case COMP_SHORT_PACKET: + break; + case COMP_TRB_ERROR: + case COMP_BABBLE_DETECTED_ERROR: + case COMP_USB_TRANSACTION_ERROR: + case COMP_STALL_ERROR: + default: + if (ep_id == XDBC_EPID_OUT) + xdbc.flags |= XDBC_FLAGS_OUT_STALL; + if (ep_id == XDBC_EPID_IN) + xdbc.flags |= XDBC_FLAGS_IN_STALL; + + xdbc_trace("endpoint %d stalled\n", ep_id); + break; + } + + if (ep_id == XDBC_EPID_IN) { + xdbc.flags &= ~XDBC_FLAGS_IN_PROCESS; + xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true); + } else if (ep_id == XDBC_EPID_OUT) { + xdbc.flags &= ~XDBC_FLAGS_OUT_PROCESS; + } else { + xdbc_trace("invalid endpoint id %d\n", ep_id); + } +} + +static void xdbc_handle_events(void) +{ + struct xdbc_trb *evt_trb; + bool update_erdp = false; + u32 reg; + u8 cmd; + + cmd = read_pci_config_byte(xdbc.bus, xdbc.dev, xdbc.func, PCI_COMMAND); + if (!(cmd & PCI_COMMAND_MASTER)) { + cmd |= PCI_COMMAND_MASTER | PCI_COMMAND_MEMORY; + write_pci_config_byte(xdbc.bus, xdbc.dev, xdbc.func, PCI_COMMAND, cmd); + } + + if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED)) + return; + + /* Handle external reset events: */ + reg = readl(&xdbc.xdbc_reg->control); + if (!(reg & CTRL_DBC_ENABLE)) { + if (xdbc_handle_external_reset()) { + xdbc_trace("failed to recover connection\n"); + return; + } + } + + /* Handle configure-exit event: */ + reg = readl(&xdbc.xdbc_reg->control); + if (reg & CTRL_DBC_RUN_CHANGE) { + writel(reg, &xdbc.xdbc_reg->control); + if (reg & CTRL_DBC_RUN) + xdbc.flags |= XDBC_FLAGS_CONFIGURED; + else + xdbc.flags &= ~XDBC_FLAGS_CONFIGURED; + } + + /* Handle endpoint stall event: */ + reg = readl(&xdbc.xdbc_reg->control); + if (reg & CTRL_HALT_IN_TR) { + xdbc.flags |= XDBC_FLAGS_IN_STALL; + } else { + xdbc.flags &= ~XDBC_FLAGS_IN_STALL; + if (!(xdbc.flags & XDBC_FLAGS_IN_PROCESS)) + xdbc_bulk_transfer(NULL, XDBC_MAX_PACKET, true); + } + + if (reg & CTRL_HALT_OUT_TR) + xdbc.flags |= XDBC_FLAGS_OUT_STALL; + else + xdbc.flags &= ~XDBC_FLAGS_OUT_STALL; + + /* Handle the events in the event ring: */ + evt_trb = xdbc.evt_ring.dequeue; + while ((le32_to_cpu(evt_trb->field[3]) & TRB_CYCLE) == xdbc.evt_ring.cycle_state) { + /* + * Add a barrier between reading the cycle flag and any + * reads of the event's flags/data below: + */ + rmb(); + + switch ((le32_to_cpu(evt_trb->field[3]) & TRB_TYPE_BITMASK)) { + case TRB_TYPE(TRB_PORT_STATUS): + xdbc_handle_port_status(evt_trb); + break; + case TRB_TYPE(TRB_TRANSFER): + xdbc_handle_tx_event(evt_trb); + break; + default: + break; + } + + 
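Before the dequeue pointer is advanced below, it may help to see the cycle-bit ownership rule this loop depends on in isolation: a TRB belongs to the consumer only while its cycle bit matches the ring's expected cycle state, and the expected state flips on every wrap. A minimal software-only model follows; the ring size, slot contents, and the in-memory producer are invented for illustration and do not model the hardware protocol.

#include <stdint.h>
#include <stdio.h>

#define NTRB  4
#define CYCLE 0x1u

struct trb { uint32_t field[4]; };

static struct trb ring[NTRB];

int main(void)
{
	unsigned int deq = 0, cycle = 1;

	/* Producer side: two completed events carry the current cycle bit. */
	ring[0].field[3] = CYCLE;
	ring[1].field[3] = CYCLE;

	/* Consumer side: stop as soon as the cycle bits stop matching. */
	while ((ring[deq].field[3] & CYCLE) == cycle) {
		printf("consumed event in slot %u\n", deq);
		if (++deq == NTRB) {	/* wrap: flip the expected state */
			deq = 0;
			cycle ^= 1;
		}
	}
	return 0;
}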
++(xdbc.evt_ring.dequeue); + if (xdbc.evt_ring.dequeue == &xdbc.evt_seg.trbs[TRBS_PER_SEGMENT]) { + xdbc.evt_ring.dequeue = xdbc.evt_seg.trbs; + xdbc.evt_ring.cycle_state ^= 1; + } + + evt_trb = xdbc.evt_ring.dequeue; + update_erdp = true; + } + + /* Update event ring dequeue pointer: */ + if (update_erdp) + xdbc_write64(__pa(xdbc.evt_ring.dequeue), &xdbc.xdbc_reg->erdp); +} + +static int xdbc_bulk_write(const char *bytes, int size) +{ + int ret, timeout = 0; + unsigned long flags; + +retry: + if (in_nmi()) { + if (!raw_spin_trylock_irqsave(&xdbc.lock, flags)) + return -EAGAIN; + } else { + raw_spin_lock_irqsave(&xdbc.lock, flags); + } + + xdbc_handle_events(); + + /* Check completion of the previous request: */ + if ((xdbc.flags & XDBC_FLAGS_OUT_PROCESS) && (timeout < 2000000)) { + raw_spin_unlock_irqrestore(&xdbc.lock, flags); + udelay(100); + timeout += 100; + goto retry; + } + + if (xdbc.flags & XDBC_FLAGS_OUT_PROCESS) { + raw_spin_unlock_irqrestore(&xdbc.lock, flags); + xdbc_trace("previous transfer not completed yet\n"); + + return -ETIMEDOUT; + } + + ret = xdbc_bulk_transfer((void *)bytes, size, false); + raw_spin_unlock_irqrestore(&xdbc.lock, flags); + + return ret; +} + +static void early_xdbc_write(struct console *con, const char *str, u32 n) +{ + static char buf[XDBC_MAX_PACKET]; + int chunk, ret; + int use_cr = 0; + + if (!xdbc.xdbc_reg) + return; + memset(buf, 0, XDBC_MAX_PACKET); + while (n > 0) { + for (chunk = 0; chunk < XDBC_MAX_PACKET && n > 0; str++, chunk++, n--) { + + if (!use_cr && *str == '\n') { + use_cr = 1; + buf[chunk] = '\r'; + str--; + n++; + continue; + } + + if (use_cr) + use_cr = 0; + buf[chunk] = *str; + } + + if (chunk > 0) { + ret = xdbc_bulk_write(buf, chunk); + if (ret < 0) + xdbc_trace("missed message {%s}\n", buf); + } + } +} + +static struct console early_xdbc_console = { + .name = "earlyxdbc", + .write = early_xdbc_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; + +void __init early_xdbc_register_console(void) +{ + if (early_console) + return; + + early_console = &early_xdbc_console; + if (early_console_keep) + early_console->flags &= ~CON_BOOT; + else + early_console->flags |= CON_BOOT; + register_console(early_console); +} + +static void xdbc_unregister_console(void) +{ + if (early_xdbc_console.flags & CON_ENABLED) + unregister_console(&early_xdbc_console); +} + +static int xdbc_scrub_function(void *ptr) +{ + unsigned long flags; + + while (true) { + raw_spin_lock_irqsave(&xdbc.lock, flags); + xdbc_handle_events(); + + if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED)) { + raw_spin_unlock_irqrestore(&xdbc.lock, flags); + break; + } + + raw_spin_unlock_irqrestore(&xdbc.lock, flags); + schedule_timeout_interruptible(1); + } + + xdbc_unregister_console(); + writel(0, &xdbc.xdbc_reg->control); + xdbc_trace("dbc scrub function exits\n"); + + return 0; +} + +static int __init xdbc_init(void) +{ + unsigned long flags; + void __iomem *base; + int ret = 0; + u32 offset; + + if (!(xdbc.flags & XDBC_FLAGS_INITIALIZED)) + return 0; + + /* + * It's time to shut down the DbC, so that the debug + * port can be reused by the host controller: + */ + if (early_xdbc_console.index == -1 || + (early_xdbc_console.flags & CON_BOOT)) { + xdbc_trace("hardware not used anymore\n"); + goto free_and_quit; + } + + base = ioremap_nocache(xdbc.xhci_start, xdbc.xhci_length); + if (!base) { + xdbc_trace("failed to remap the io address\n"); + ret = -ENOMEM; + goto free_and_quit; + } + + raw_spin_lock_irqsave(&xdbc.lock, flags); + early_iounmap(xdbc.xhci_base, xdbc.xhci_length); + 
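A few hunks up, early_xdbc_write() feeds console output through fixed-size bulk packets and expands every bare '\n' into a '\r\n' pair on the way. A user-space rendering of that chunking loop, assuming nothing beyond standard C: the packet size is shrunk for readability (XDBC_MAX_PACKET is 1024 in the driver) and emit_chunk() is a stand-in for xdbc_bulk_write().

#include <stdio.h>
#include <string.h>

#define PKT 8	/* stand-in for XDBC_MAX_PACKET */

static void emit_chunk(const char *buf, int len)	/* xdbc_bulk_write() stand-in */
{
	printf("packet: %.*s\n", len, buf);
}

static void write_crlf(const char *str, size_t n)
{
	char buf[PKT];
	int chunk, use_cr = 0;

	while (n > 0) {
		for (chunk = 0; chunk < PKT && n > 0; str++, chunk++, n--) {
			if (!use_cr && *str == '\n') {
				use_cr = 1;
				buf[chunk] = '\r';
				str--;	/* revisit the '\n' in the next slot */
				n++;
				continue;
			}
			use_cr = 0;
			buf[chunk] = *str;
		}
		if (chunk > 0)
			emit_chunk(buf, chunk);
	}
}

int main(void)
{
	const char *msg = "one\ntwo\n";
	write_crlf(msg, strlen(msg));
	return 0;
}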
xdbc.xhci_base = base; + offset = xhci_find_next_ext_cap(xdbc.xhci_base, 0, XHCI_EXT_CAPS_DEBUG); + xdbc.xdbc_reg = (struct xdbc_regs __iomem *)(xdbc.xhci_base + offset); + raw_spin_unlock_irqrestore(&xdbc.lock, flags); + + kthread_run(xdbc_scrub_function, NULL, "%s", "xdbc"); + + return 0; + +free_and_quit: + xdbc_free_ring(&xdbc.evt_ring); + xdbc_free_ring(&xdbc.out_ring); + xdbc_free_ring(&xdbc.in_ring); + free_bootmem(xdbc.table_dma, PAGE_SIZE); + free_bootmem(xdbc.out_dma, PAGE_SIZE); + writel(0, &xdbc.xdbc_reg->control); + early_iounmap(xdbc.xhci_base, xdbc.xhci_length); + + return ret; +} +subsys_initcall(xdbc_init); diff --git a/drivers/usb/early/xhci-dbc.h b/drivers/usb/early/xhci-dbc.h new file mode 100644 index 0000000000000000000000000000000000000000..2df0f6e613fede532503e2bfdf94a0b0eca29937 --- /dev/null +++ b/drivers/usb/early/xhci-dbc.h @@ -0,0 +1,211 @@ +/* + * xhci-dbc.h - xHCI debug capability early driver + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Lu Baolu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_XHCI_DBC_H +#define __LINUX_XHCI_DBC_H + +#include +#include + +/* + * xHCI Debug Capability Register interfaces: + */ +struct xdbc_regs { + __le32 capability; + __le32 doorbell; + __le32 ersts; /* Event Ring Segment Table Size*/ + __le32 __reserved_0; /* 0c~0f reserved bits */ + __le64 erstba; /* Event Ring Segment Table Base Address */ + __le64 erdp; /* Event Ring Dequeue Pointer */ + __le32 control; + __le32 status; + __le32 portsc; /* Port status and control */ + __le32 __reserved_1; /* 2b~28 reserved bits */ + __le64 dccp; /* Debug Capability Context Pointer */ + __le32 devinfo1; /* Device Descriptor Info Register 1 */ + __le32 devinfo2; /* Device Descriptor Info Register 2 */ +}; + +#define DEBUG_MAX_BURST(p) (((p) >> 16) & 0xff) + +#define CTRL_DBC_RUN BIT(0) +#define CTRL_PORT_ENABLE BIT(1) +#define CTRL_HALT_OUT_TR BIT(2) +#define CTRL_HALT_IN_TR BIT(3) +#define CTRL_DBC_RUN_CHANGE BIT(4) +#define CTRL_DBC_ENABLE BIT(31) + +#define DCST_DEBUG_PORT(p) (((p) >> 24) & 0xff) + +#define PORTSC_CONN_STATUS BIT(0) +#define PORTSC_CONN_CHANGE BIT(17) +#define PORTSC_RESET_CHANGE BIT(21) +#define PORTSC_LINK_CHANGE BIT(22) +#define PORTSC_CONFIG_CHANGE BIT(23) + +/* + * xHCI Debug Capability data structures: + */ +struct xdbc_trb { + __le32 field[4]; +}; + +struct xdbc_erst_entry { + __le64 seg_addr; + __le32 seg_size; + __le32 __reserved_0; +}; + +struct xdbc_info_context { + __le64 string0; + __le64 manufacturer; + __le64 product; + __le64 serial; + __le32 length; + __le32 __reserved_0[7]; +}; + +struct xdbc_ep_context { + __le32 ep_info1; + __le32 ep_info2; + __le64 deq; + __le32 tx_info; + __le32 __reserved_0[11]; +}; + +struct xdbc_context { + struct xdbc_info_context info; + struct xdbc_ep_context out; + struct xdbc_ep_context in; +}; + +#define XDBC_INFO_CONTEXT_SIZE 48 +#define XDBC_MAX_STRING_LENGTH 64 +#define XDBC_STRING_MANUFACTURER "Linux" +#define XDBC_STRING_PRODUCT "Remote GDB" +#define XDBC_STRING_SERIAL "0001" + +struct xdbc_strings { + char string0[XDBC_MAX_STRING_LENGTH]; + char manufacturer[XDBC_MAX_STRING_LENGTH]; + char product[XDBC_MAX_STRING_LENGTH]; + char serial[XDBC_MAX_STRING_LENGTH]; +}; + +#define XDBC_PROTOCOL 1 /* GNU Remote Debug Command Set */ +#define XDBC_VENDOR_ID 0x1d6b /* Linux Foundation 0x1d6b */ +#define XDBC_PRODUCT_ID 0x0004 /* __le16 
idProduct; device 0004 */ +#define XDBC_DEVICE_REV 0x0010 /* 0.10 */ + +/* + * xHCI Debug Capability software state structures: + */ +struct xdbc_segment { + struct xdbc_trb *trbs; + dma_addr_t dma; +}; + +#define XDBC_TRBS_PER_SEGMENT 256 + +struct xdbc_ring { + struct xdbc_segment *segment; + struct xdbc_trb *enqueue; + struct xdbc_trb *dequeue; + u32 cycle_state; +}; + +#define XDBC_EPID_OUT 2 +#define XDBC_EPID_IN 3 + +struct xdbc_state { + u16 vendor; + u16 device; + u32 bus; + u32 dev; + u32 func; + void __iomem *xhci_base; + u64 xhci_start; + size_t xhci_length; + int port_number; + + /* DbC register base */ + struct xdbc_regs __iomem *xdbc_reg; + + /* DbC table page */ + dma_addr_t table_dma; + void *table_base; + + /* event ring segment table */ + dma_addr_t erst_dma; + size_t erst_size; + void *erst_base; + + /* event ring segments */ + struct xdbc_ring evt_ring; + struct xdbc_segment evt_seg; + + /* debug capability contexts */ + dma_addr_t dbcc_dma; + size_t dbcc_size; + void *dbcc_base; + + /* descriptor strings */ + dma_addr_t string_dma; + size_t string_size; + void *string_base; + + /* bulk OUT endpoint */ + struct xdbc_ring out_ring; + struct xdbc_segment out_seg; + void *out_buf; + dma_addr_t out_dma; + + /* bulk IN endpoint */ + struct xdbc_ring in_ring; + struct xdbc_segment in_seg; + void *in_buf; + dma_addr_t in_dma; + + u32 flags; + + /* spinlock for early_xdbc_write() reentrancy */ + raw_spinlock_t lock; +}; + +#define XDBC_PCI_MAX_BUSES 256 +#define XDBC_PCI_MAX_DEVICES 32 +#define XDBC_PCI_MAX_FUNCTION 8 + +#define XDBC_TABLE_ENTRY_SIZE 64 +#define XDBC_ERST_ENTRY_NUM 1 +#define XDBC_DBCC_ENTRY_NUM 3 +#define XDBC_STRING_ENTRY_NUM 4 + +/* Bits definitions for xdbc_state.flags: */ +#define XDBC_FLAGS_INITIALIZED BIT(0) +#define XDBC_FLAGS_IN_STALL BIT(1) +#define XDBC_FLAGS_OUT_STALL BIT(2) +#define XDBC_FLAGS_IN_PROCESS BIT(3) +#define XDBC_FLAGS_OUT_PROCESS BIT(4) +#define XDBC_FLAGS_CONFIGURED BIT(5) + +#define XDBC_MAX_PACKET 1024 + +/* Door bell target: */ +#define OUT_EP_DOORBELL 0 +#define IN_EP_DOORBELL 1 +#define DOOR_BELL_TARGET(p) (((p) & 0xff) << 8) + +#define xdbc_read64(regs) xhci_read_64(NULL, (regs)) +#define xdbc_write64(val, regs) xhci_write_64(NULL, (val), (regs)) + +#endif /* __LINUX_XHCI_DBC_H */ diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig index 8ad203296079f6a9cf5e7457db1017784725a710..c164d6b788c31f9922836f407c21e50bfce73f5a 100644 --- a/drivers/usb/gadget/Kconfig +++ b/drivers/usb/gadget/Kconfig @@ -212,7 +212,7 @@ config USB_F_TCM # this first set of drivers all depend on bulk-capable hardware. config USB_CONFIGFS - tristate "USB functions configurable through configfs" + tristate "USB Gadget functions configurable through configfs" select USB_LIBCOMPOSITE help A Linux USB "gadget" can be set up through configfs. @@ -458,8 +458,9 @@ config USB_CONFIGFS_F_TCM UAS utilizes the USB 3.0 feature called streams support. choice - tristate "USB Gadget Drivers" + tristate "USB Gadget precomposed configurations" default USB_ETH + optional help A Linux "Gadget Driver" talks to the USB Peripheral Controller driver through the abstract "gadget" API. Some other operating @@ -476,6 +477,12 @@ choice not be able work with that controller, or might need to implement a less common variant of a device class protocol. + The available choices each represent a single precomposed USB + gadget configuration. 
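Returning briefly to the debug-capability code above: several paths locate their register blocks with xhci_find_next_ext_cap(). Per the xHCI specification, extended capabilities form a linked list in MMIO space whose 32-bit header keeps the capability ID in bits 7:0 and the offset to the next entry, counted in 32-bit words, in bits 15:8 (zero terminating the list); the Debug Capability uses ID 10. A sketch of that walk over a fake register file; find_ext_cap() and the sample offsets are illustrative only.

#include <stdint.h>
#include <stdio.h>

#define CAP_ID(hdr)   ((hdr) & 0xff)		/* capability ID, bits 7:0 */
#define CAP_NEXT(hdr) (((hdr) >> 8) & 0xff)	/* next pointer in dwords */

static uint32_t find_ext_cap(const uint32_t *regs, uint32_t start, uint32_t id)
{
	uint32_t off = start;

	while (off) {
		uint32_t hdr = regs[off];

		if (CAP_ID(hdr) == id)
			return off;
		if (!CAP_NEXT(hdr))
			break;		/* end of list */
		off += CAP_NEXT(hdr);
	}
	return 0;
}

int main(void)
{
	uint32_t regs[16] = { 0 };

	regs[4] = (2u << 8) | 1;	/* USB legacy support, ID 1, next +2 */
	regs[6] = 10;			/* debug capability, ID 10, last entry */

	printf("DbC found at dword offset %u\n", find_ext_cap(regs, 4, 10));
	return 0;
}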
In the device model, each option contains + both the device instantiation as a child for a USB gadget + controller, and the relevant drivers for each function declared + by the device. + source "drivers/usb/gadget/legacy/Kconfig" endchoice diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index a0085571824d9b4352c7245625a30c3248d789f5..71dd27c0d7f27daf9d2db08b6a0b72901844ba3f 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -246,7 +246,6 @@ EXPORT_SYMBOL_GPL(ffs_lock); static struct ffs_dev *_ffs_find_dev(const char *name); static struct ffs_dev *_ffs_alloc_dev(void); -static int _ffs_name_dev(struct ffs_dev *dev, const char *name); static void _ffs_free_dev(struct ffs_dev *dev); static void *ffs_acquire_dev(const char *dev_name); static void ffs_release_dev(struct ffs_data *ffs_data); @@ -1571,14 +1570,14 @@ static void ffs_data_get(struct ffs_data *ffs) { ENTER(); - atomic_inc(&ffs->ref); + refcount_inc(&ffs->ref); } static void ffs_data_opened(struct ffs_data *ffs) { ENTER(); - atomic_inc(&ffs->ref); + refcount_inc(&ffs->ref); if (atomic_add_return(1, &ffs->opened) == 1 && ffs->state == FFS_DEACTIVATED) { ffs->state = FFS_CLOSING; @@ -1590,7 +1589,7 @@ static void ffs_data_put(struct ffs_data *ffs) { ENTER(); - if (unlikely(atomic_dec_and_test(&ffs->ref))) { + if (unlikely(refcount_dec_and_test(&ffs->ref))) { pr_info("%s(): freeing\n", __func__); ffs_data_clear(ffs); BUG_ON(waitqueue_active(&ffs->ev.waitq) || @@ -1635,7 +1634,7 @@ static struct ffs_data *ffs_data_new(void) ENTER(); - atomic_set(&ffs->ref, 1); + refcount_set(&ffs->ref, 1); atomic_set(&ffs->opened, 0); ffs->state = FFS_READ_DESCRIPTORS; mutex_init(&ffs->mutex); @@ -3302,9 +3301,10 @@ static struct ffs_dev *_ffs_do_find_dev(const char *name) { struct ffs_dev *dev; + if (!name) + return NULL; + list_for_each_entry(dev, &ffs_devices, entry) { - if (!dev->name || !name) - continue; if (strcmp(dev->name, name) == 0) return dev; } @@ -3380,42 +3380,11 @@ static void ffs_free_inst(struct usb_function_instance *f) kfree(opts); } -#define MAX_INST_NAME_LEN 40 - static int ffs_set_inst_name(struct usb_function_instance *fi, const char *name) { - struct f_fs_opts *opts; - char *ptr; - const char *tmp; - int name_len, ret; - - name_len = strlen(name) + 1; - if (name_len > MAX_INST_NAME_LEN) + if (strlen(name) >= FIELD_SIZEOF(struct ffs_dev, name)) return -ENAMETOOLONG; - - ptr = kstrndup(name, name_len, GFP_KERNEL); - if (!ptr) - return -ENOMEM; - - opts = to_f_fs_opts(fi); - tmp = NULL; - - ffs_dev_lock(); - - tmp = opts->dev->name_allocated ? 
opts->dev->name : NULL; - ret = _ffs_name_dev(opts->dev, ptr); - if (ret) { - kfree(ptr); - ffs_dev_unlock(); - return ret; - } - opts->dev->name_allocated = true; - - ffs_dev_unlock(); - - kfree(tmp); - - return 0; + return ffs_name_dev(to_f_fs_opts(fi)->dev, name); } static struct usb_function_instance *ffs_alloc_inst(void) @@ -3545,32 +3514,19 @@ static struct ffs_dev *_ffs_alloc_dev(void) return dev; } -/* - * ffs_lock must be taken by the caller of this function - * The caller is responsible for "name" being available whenever f_fs needs it - */ -static int _ffs_name_dev(struct ffs_dev *dev, const char *name) +int ffs_name_dev(struct ffs_dev *dev, const char *name) { struct ffs_dev *existing; + int ret = 0; - existing = _ffs_do_find_dev(name); - if (existing) - return -EBUSY; - - dev->name = name; - - return 0; -} + ffs_dev_lock(); -/* - * The caller is responsible for "name" being available whenever f_fs needs it - */ -int ffs_name_dev(struct ffs_dev *dev, const char *name) -{ - int ret; + existing = _ffs_do_find_dev(name); + if (!existing) + strlcpy(dev->name, name, ARRAY_SIZE(dev->name)); + else if (existing != dev) + ret = -EBUSY; - ffs_dev_lock(); - ret = _ffs_name_dev(dev, name); ffs_dev_unlock(); return ret; @@ -3600,8 +3556,6 @@ EXPORT_SYMBOL_GPL(ffs_single_dev); static void _ffs_free_dev(struct ffs_dev *dev) { list_del(&dev->entry); - if (dev->name_allocated) - kfree(dev->name); /* Clear the private_data pointer to stop incorrect dev access */ if (dev->ffs_data) diff --git a/drivers/usb/gadget/function/f_ncm.c b/drivers/usb/gadget/function/f_ncm.c index 224717e63a5300970867a663cd030d8cd62068f6..864819ff9a7d362962eb837755886423b57906fb 100644 --- a/drivers/usb/gadget/function/f_ncm.c +++ b/drivers/usb/gadget/function/f_ncm.c @@ -16,6 +16,7 @@ */ #include +#include #include #include #include diff --git a/drivers/usb/gadget/function/u_ether.c b/drivers/usb/gadget/function/u_ether.c index c3cab77181d4c2677ea5214d6c9abc0b120d83e8..a8b40d07e927bb78e835b40efeea9bb83fb19bf8 100644 --- a/drivers/usb/gadget/function/u_ether.c +++ b/drivers/usb/gadget/function/u_ether.c @@ -178,6 +178,7 @@ static void rx_complete(struct usb_ep *ep, struct usb_request *req); static int rx_submit(struct eth_dev *dev, struct usb_request *req, gfp_t gfp_flags) { + struct usb_gadget *g = dev->gadget; struct sk_buff *skb; int retval = -ENOMEM; size_t size = 0; @@ -209,8 +210,11 @@ rx_submit(struct eth_dev *dev, struct usb_request *req, gfp_t gfp_flags) */ size += sizeof(struct ethhdr) + dev->net->mtu + RX_EXTRA; size += dev->port_usb->header_len; - size += out->maxpacket - 1; - size -= size % out->maxpacket; + + if (g->quirk_ep_out_aligned_size) { + size += out->maxpacket - 1; + size -= size % out->maxpacket; + } if (dev->port_usb->is_fixed) size = max_t(size_t, size, dev->port_usb->fixed_out_len); @@ -401,13 +405,12 @@ static int alloc_requests(struct eth_dev *dev, struct gether *link, unsigned n) static void rx_fill(struct eth_dev *dev, gfp_t gfp_flags) { struct usb_request *req; + struct usb_request *tmp; unsigned long flags; /* fill unused rxq slots with some skb */ spin_lock_irqsave(&dev->req_lock, flags); - while (!list_empty(&dev->rx_reqs)) { - req = container_of(dev->rx_reqs.next, - struct usb_request, list); + list_for_each_entry_safe(req, tmp, &dev->rx_reqs, list) { list_del_init(&req->list); spin_unlock_irqrestore(&dev->req_lock, flags); @@ -527,7 +530,7 @@ static netdev_tx_t eth_start_xmit(struct sk_buff *skb, return NETDEV_TX_BUSY; } - req = container_of(dev->tx_reqs.next, struct usb_request, 
list); + req = list_first_entry(&dev->tx_reqs, struct usb_request, list); list_del(&req->list); /* temporarily stop TX queue when the freelist empties */ @@ -1122,6 +1125,7 @@ void gether_disconnect(struct gether *link) { struct eth_dev *dev = link->ioport; struct usb_request *req; + struct usb_request *tmp; WARN_ON(!dev); if (!dev) @@ -1138,9 +1142,7 @@ void gether_disconnect(struct gether *link) */ usb_ep_disable(link->in_ep); spin_lock(&dev->req_lock); - while (!list_empty(&dev->tx_reqs)) { - req = container_of(dev->tx_reqs.next, - struct usb_request, list); + list_for_each_entry_safe(req, tmp, &dev->tx_reqs, list) { list_del(&req->list); spin_unlock(&dev->req_lock); @@ -1152,9 +1154,7 @@ void gether_disconnect(struct gether *link) usb_ep_disable(link->out_ep); spin_lock(&dev->req_lock); - while (!list_empty(&dev->rx_reqs)) { - req = container_of(dev->rx_reqs.next, - struct usb_request, list); + list_for_each_entry_safe(req, tmp, &dev->rx_reqs, list) { list_del(&req->list); spin_unlock(&dev->req_lock); diff --git a/drivers/usb/gadget/function/u_fs.h b/drivers/usb/gadget/function/u_fs.h index 4b6969451cdc28dcfabb4db8806a403df1199e0d..4378cc2fcac36fd69c71cdea53fef6e7ce6ce6e9 100644 --- a/drivers/usb/gadget/function/u_fs.h +++ b/drivers/usb/gadget/function/u_fs.h @@ -20,6 +20,7 @@ #include #include #include +#include #ifdef VERBOSE_DEBUG #ifndef pr_vdebug @@ -39,15 +40,16 @@ struct f_fs_opts; struct ffs_dev { - const char *name; - bool name_allocated; - bool mounted; - bool desc_ready; - bool single; struct ffs_data *ffs_data; struct f_fs_opts *opts; struct list_head entry; + char name[41]; + + bool mounted; + bool desc_ready; + bool single; + int (*ffs_ready_callback)(struct ffs_data *ffs); void (*ffs_closed_callback)(struct ffs_data *ffs); void *(*ffs_acquire_dev_callback)(struct ffs_dev *dev); @@ -177,7 +179,7 @@ struct ffs_data { struct completion ep0req_completion; /* P: mutex */ /* reference counter */ - atomic_t ref; + refcount_t ref; /* how many files are opened (EP0 and others) */ atomic_t opened; diff --git a/drivers/usb/gadget/function/uvc_configfs.c b/drivers/usb/gadget/function/uvc_configfs.c index 4e037d2a7a60c3eefdd9f6585def29b23c02984b..844cb738bafd00537b567038301beb7f5850f034 100644 --- a/drivers/usb/gadget/function/uvc_configfs.c +++ b/drivers/usb/gadget/function/uvc_configfs.c @@ -2125,7 +2125,7 @@ static struct configfs_item_operations uvc_item_ops = { .release = uvc_attr_release, }; -#define UVCG_OPTS_ATTR(cname, conv, str2u, uxx, vnoc, limit) \ +#define UVCG_OPTS_ATTR(cname, aname, conv, str2u, uxx, vnoc, limit) \ static ssize_t f_uvc_opts_##cname##_show( \ struct config_item *item, char *page) \ { \ @@ -2168,16 +2168,16 @@ end: \ return ret; \ } \ \ -UVC_ATTR(f_uvc_opts_, cname, aname) +UVC_ATTR(f_uvc_opts_, cname, cname) #define identity_conv(x) (x) -UVCG_OPTS_ATTR(streaming_interval, identity_conv, kstrtou8, u8, identity_conv, - 16); -UVCG_OPTS_ATTR(streaming_maxpacket, le16_to_cpu, kstrtou16, u16, le16_to_cpu, - 3072); -UVCG_OPTS_ATTR(streaming_maxburst, identity_conv, kstrtou8, u8, identity_conv, - 15); +UVCG_OPTS_ATTR(streaming_interval, streaming_interval, identity_conv, + kstrtou8, u8, identity_conv, 16); +UVCG_OPTS_ATTR(streaming_maxpacket, streaming_maxpacket, le16_to_cpu, + kstrtou16, u16, le16_to_cpu, 3072); +UVCG_OPTS_ATTR(streaming_maxburst, streaming_maxburst, identity_conv, + kstrtou8, u8, identity_conv, 15); #undef identity_conv diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 
a2c916869293720e378ced6b265532846eca52a3..b9ca0a26cbd93e540a2df8f78544adb81e298805 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -114,7 +115,7 @@ enum ep0_state { struct dev_data { spinlock_t lock; - atomic_t count; + refcount_t count; enum ep0_state state; /* P: lock */ struct usb_gadgetfs_event event [N_EVENT]; unsigned ev_next; @@ -150,12 +151,12 @@ struct dev_data { static inline void get_dev (struct dev_data *data) { - atomic_inc (&data->count); + refcount_inc (&data->count); } static void put_dev (struct dev_data *data) { - if (likely (!atomic_dec_and_test (&data->count))) + if (likely (!refcount_dec_and_test (&data->count))) return; /* needs no more cleanup */ BUG_ON (waitqueue_active (&data->wait)); @@ -170,7 +171,7 @@ static struct dev_data *dev_new (void) if (!dev) return NULL; dev->state = STATE_DEV_DISABLED; - atomic_set (&dev->count, 1); + refcount_set (&dev->count, 1); spin_lock_init (&dev->lock); INIT_LIST_HEAD (&dev->epfiles); init_waitqueue_head (&dev->wait); @@ -190,7 +191,7 @@ enum ep_state { struct ep_data { struct mutex lock; enum ep_state state; - atomic_t count; + refcount_t count; struct dev_data *dev; /* must hold dev->lock before accessing ep or req */ struct usb_ep *ep; @@ -205,12 +206,12 @@ struct ep_data { static inline void get_ep (struct ep_data *data) { - atomic_inc (&data->count); + refcount_inc (&data->count); } static void put_ep (struct ep_data *data) { - if (likely (!atomic_dec_and_test (&data->count))) + if (likely (!refcount_dec_and_test (&data->count))) return; put_dev (data->dev); /* needs no more cleanup */ @@ -1561,7 +1562,7 @@ static int activate_ep_files (struct dev_data *dev) init_waitqueue_head (&data->wait); strncpy (data->name, ep->name, sizeof (data->name) - 1); - atomic_set (&data->count, 1); + refcount_set (&data->count, 1); data->dev = dev; get_dev (dev); diff --git a/drivers/usb/gadget/udc/Kconfig b/drivers/usb/gadget/udc/Kconfig index 4b69f28a9af972dc5fc5584b9f8bbd6a38774f5a..1c14c283cc47de111d8f6e4990b62f5906dc89a1 100644 --- a/drivers/usb/gadget/udc/Kconfig +++ b/drivers/usb/gadget/udc/Kconfig @@ -62,8 +62,9 @@ config USB_ATMEL_USBA The fifo_mode parameter is used to select endpoint allocation mode. fifo_mode = 0 is used to let the driver autoconfigure the endpoints. - In this case 2 banks are allocated for isochronous endpoints and - only one bank is allocated for the rest of the endpoints. + In this case, ep1 is allocated 2 banks if it operates in isochronous + mode and only 1 bank otherwise. Each of the remaining endpoints + is allocated only 1 bank. fifo_mode = 1 is a generic maximum fifo size (1024 bytes) configuration allowing the usage of ep1 - ep6 @@ -191,6 +192,7 @@ config USB_RENESAS_USBHS_UDC config USB_RENESAS_USB3 tristate 'Renesas USB3.0 Peripheral controller' depends on ARCH_RENESAS || COMPILE_TEST + depends on EXTCON help Renesas USB3.0 Peripheral controller is a USB peripheral controller that supports super, high, and full speed USB 3.0 data transfers. @@ -253,6 +255,20 @@ config USB_MV_U3D MARVELL PXA2128 Processor series include a super speed USB3.0 device controller, which support super speed USB peripheral. +config USB_SNP_CORE + depends on USB_AMD5536UDC + tristate + help + This enables core driver support for the Synopsys USB 2.0 Device + controller. + + This will be enabled when the PCI or platform driver for this UDC is + selected.
Currently, this will be enabled by the USB_SNP_UDC_PLAT or + USB_AMD5536UDC options. + + This IP is different from the High Speed OTG IP that can be enabled + by selecting the USB_DWC2 or USB_DWC3 options. + # # Controllers available in both integrated and discrete versions # @@ -277,7 +293,8 @@ source "drivers/usb/gadget/udc/bdc/Kconfig" config USB_AMD5536UDC tristate "AMD5536 UDC" - depends on PCI + depends on USB_PCI + select USB_SNP_CORE help The AMD5536 UDC is part of the AMD Geode CS5536, an x86 southbridge. It is a USB Highspeed DMA capable USB device controller. Beside ep0 @@ -285,6 +302,9 @@ The UDC port supports OTG operation, and may be used as a host port if it's not being used to implement peripheral or OTG roles. + This UDC is based on the Synopsys USB device controller IP and selects + the CONFIG_USB_SNP_CORE option to build the core driver. + Say "y" to link the driver statically, or "m" to build a dynamically linked module called "amd5536udc" and force all gadget drivers to also be dynamically linked. @@ -327,7 +347,7 @@ config USB_NET2272_DMA config USB_NET2280 tristate "NetChip NET228x / PLX USB3x8x" - depends on PCI + depends on USB_PCI help NetChip 2280 / 2282 is a PCI based USB peripheral controller which supports both full and high speed USB 2.0 data transfers. @@ -352,7 +372,7 @@ config USB_NET2280 config USB_GOKU tristate "Toshiba TC86C001 'Goku-S'" - depends on PCI + depends on USB_PCI help The Toshiba TC86C001 is a PCI device which includes controllers for full speed USB devices, IDE, I2C, SIO, plus a USB host (OHCI). @@ -366,7 +386,7 @@ config USB_GOKU config USB_EG20T tristate "Intel QUARK X1000/EG20T PCH/LAPIS Semiconductor IOH(ML7213/ML7831) UDC" - depends on PCI + depends on USB_PCI help This is a USB device driver for EG20T PCH. EG20T PCH is the platform controller hub that is used in Intel's diff --git a/drivers/usb/gadget/udc/Makefile b/drivers/usb/gadget/udc/Makefile index 98e74ed9f555547d8d4b778bb7d2fc4d19e553f1..626e1f1c62da9ad0430b68bfd721a14f6f1965ab 100644 --- a/drivers/usb/gadget/udc/Makefile +++ b/drivers/usb/gadget/udc/Makefile @@ -10,7 +10,8 @@ obj-$(CONFIG_USB_GADGET) += udc-core.o obj-$(CONFIG_USB_DUMMY_HCD) += dummy_hcd.o obj-$(CONFIG_USB_NET2272) += net2272.o obj-$(CONFIG_USB_NET2280) += net2280.o -obj-$(CONFIG_USB_AMD5536UDC) += amd5536udc.o +obj-$(CONFIG_USB_SNP_CORE) += amd5536udc.o +obj-$(CONFIG_USB_AMD5536UDC) += amd5536udc_pci.o obj-$(CONFIG_USB_PXA25X) += pxa25x_udc.o obj-$(CONFIG_USB_PXA27X) += pxa27x_udc.o obj-$(CONFIG_USB_GOKU) += goku_udc.o diff --git a/drivers/usb/gadget/udc/amd5536udc.c b/drivers/usb/gadget/udc/amd5536udc.c index ea03ca7ae29a22cd171584846891883e426da570..4ecd2f20ea480082a5a3abf2f9a7f8e756e6c495 100644 --- a/drivers/usb/gadget/udc/amd5536udc.c +++ b/drivers/usb/gadget/udc/amd5536udc.c @@ -11,27 +11,15 @@ */ /* - * The AMD5536 UDC is part of the x86 southbridge AMD Geode CS5536. - * It is a USB Highspeed DMA capable USB device controller. Beside ep0 it - * provides 4 IN and 4 OUT endpoints (bulk or interrupt type). - * - * Make sure that UDC is assigned to port 4 by BIOS settings (port can also - * be used as host port) and UOC bits PAD_EN and APU are set (should be done - * by BIOS init). - * - * UDC DMA requires 32-bit aligned buffers so DMA with gadget ether does not - * work without updating NET_IP_ALIGN. Or PIO mode (module param "use_dma=0") - * can be used with gadget ether.
+ * This file implements the core driver for UDCs based on the Synopsys + * device controller IP (different from the HS OTG IP) that are either + * connected through a PCI bus or integrated into SoC platforms. */ -/* debug control */ -/* #define UDC_VERBOSE */ - /* Driver strings */ -#define UDC_MOD_DESCRIPTION "AMD 5536 UDC - USB Device Controller" +#define UDC_MOD_DESCRIPTION "Synopsys USB Device Controller" #define UDC_DRIVER_VERSION_STRING "01.00.0206" -/* system */ #include #include #include @@ -46,23 +34,12 @@ #include #include #include -#include -#include -#include -#include #include - +#include #include #include - -/* gadget stack */ -#include -#include - -/* udc specific */ #include "amd5536udc.h" - static void udc_tasklet_disconnect(unsigned long); static void empty_req_queue(struct udc_ep *); static void udc_setup_endpoints(struct udc *dev); @@ -72,7 +49,7 @@ static void udc_free_request(struct usb_ep *usbep, struct usb_request *usbreq); /* description */ static const char mod_desc[] = UDC_MOD_DESCRIPTION; -static const char name[] = "amd5536udc"; +static const char name[] = "udc"; /* structure to hold endpoint function pointers */ static const struct usb_ep_ops udc_ep_ops; @@ -208,30 +185,11 @@ static const struct { #undef EP_INFO }; -/* DMA usage flag */ -static bool use_dma = 1; -/* packet per buffer dma */ -static bool use_dma_ppb = 1; -/* with per descr. update */ -static bool use_dma_ppb_du; /* buffer fill mode */ static int use_dma_bufferfill_mode; -/* full speed only mode */ -static bool use_fullspeed; /* tx buffer size for high speed */ static unsigned long hs_tx_buf = UDC_EPIN_BUFF_SIZE; -/* module parameters */ -module_param(use_dma, bool, S_IRUGO); -MODULE_PARM_DESC(use_dma, "true for DMA"); -module_param(use_dma_ppb, bool, S_IRUGO); -MODULE_PARM_DESC(use_dma_ppb, "true for DMA in packet per buffer mode"); -module_param(use_dma_ppb_du, bool, S_IRUGO); -MODULE_PARM_DESC(use_dma_ppb_du, - "true for DMA in packet per buffer mode with descriptor update"); -module_param(use_fullspeed, bool, S_IRUGO); -MODULE_PARM_DESC(use_fullspeed, "true for fullspeed only"); - /*---------------------------------------------------------------------------*/ /* Prints UDC device registers and endpoint irq registers */ static void print_regs(struct udc *dev) @@ -267,7 +225,7 @@ static void print_regs(struct udc *dev) } /* Masks unused interrupts */ -static int udc_mask_unused_interrupts(struct udc *dev) +int udc_mask_unused_interrupts(struct udc *dev) { u32 tmp; @@ -287,6 +245,7 @@ static int udc_mask_unused_interrupts(struct udc *dev) return 0; } +EXPORT_SYMBOL_GPL(udc_mask_unused_interrupts); /* Enables endpoint 0 interrupts */ static int udc_enable_ep0_interrupts(struct udc *dev) @@ -306,7 +265,7 @@ static int udc_enable_ep0_interrupts(struct udc *dev) } /* Enables device interrupts for SET_INTF and SET_CONFIG */ -static int udc_enable_dev_setup_interrupts(struct udc *dev) +int udc_enable_dev_setup_interrupts(struct udc *dev) { u32 tmp; @@ -325,6 +284,7 @@ static int udc_enable_dev_setup_interrupts(struct udc *dev) return 0; } +EXPORT_SYMBOL_GPL(udc_enable_dev_setup_interrupts); /* Calculates fifo start of endpoint based on preceding endpoints */ static int udc_set_txfifo_addr(struct udc_ep *ep) @@ -583,7 +543,7 @@ udc_alloc_request(struct usb_ep *usbep, gfp_t gfp) if (ep->dma) { /* ep0 in requests are allocated from data pool here */ - dma_desc = pci_pool_alloc(ep->dev->data_requests, gfp, + dma_desc = dma_pool_alloc(ep->dev->data_requests, gfp, &req->td_phys); if (!dma_desc) {
kfree(req); @@ -608,27 +568,23 @@ udc_alloc_request(struct usb_ep *usbep, gfp_t gfp) } /* frees pci pool descriptors of a DMA chain */ -static int udc_free_dma_chain(struct udc *dev, struct udc_request *req) +static void udc_free_dma_chain(struct udc *dev, struct udc_request *req) { - int ret_val = 0; - struct udc_data_dma *td; - struct udc_data_dma *td_last = NULL; + struct udc_data_dma *td = req->td_data; unsigned int i; + dma_addr_t addr_next = 0x00; + dma_addr_t addr = (dma_addr_t)td->next; + DBG(dev, "free chain req = %p\n", req); /* do not free first desc., will be done by free for request */ - td_last = req->td_data; - td = phys_to_virt(td_last->next); - for (i = 1; i < req->chain_len; i++) { - pci_pool_free(dev->data_requests, td, - (dma_addr_t)td_last->next); - td_last = td; - td = phys_to_virt(td_last->next); + td = phys_to_virt(addr); + addr_next = (dma_addr_t)td->next; + dma_pool_free(dev->data_requests, td, addr); + addr = addr_next; } - - return ret_val; } /* Frees request packet, called by gadget driver */ @@ -652,7 +608,7 @@ udc_free_request(struct usb_ep *usbep, struct usb_request *usbreq) if (req->chain_len > 1) udc_free_dma_chain(ep->dev, req); - pci_pool_free(ep->dev->data_requests, req->td_data, + dma_pool_free(ep->dev->data_requests, req->td_data, req->td_phys); } kfree(req); @@ -847,7 +803,7 @@ static int udc_create_dma_chain( for (i = buf_len; i < bytes; i += buf_len) { /* create or determine next desc. */ if (create_new_chain) { - td = pci_pool_alloc(ep->dev->data_requests, + td = dma_pool_alloc(ep->dev->data_requests, gfp_flags, &dma_addr); if (!td) return -ENOMEM; @@ -1507,7 +1463,7 @@ static void make_ep_lists(struct udc *dev) } /* Inits UDC context */ -static void udc_basic_init(struct udc *dev) +void udc_basic_init(struct udc *dev) { u32 tmp; @@ -1543,6 +1499,7 @@ static void udc_basic_init(struct udc *dev) dev->data_ep_enabled = 0; dev->data_ep_queued = 0; } +EXPORT_SYMBOL_GPL(udc_basic_init); /* init registers at driver load time */ static int startup_registers(struct udc *dev) @@ -3031,7 +2988,7 @@ __acquires(dev->lock) } /* Interrupt Service Routine, see Linux Kernel Doc for parameters */ -static irqreturn_t udc_irq(int irq, void *pdev) +irqreturn_t udc_irq(int irq, void *pdev) { struct udc *dev = pdev; u32 reg; @@ -3083,16 +3040,18 @@ static irqreturn_t udc_irq(int irq, void *pdev) spin_unlock(&dev->lock); return ret_val; } +EXPORT_SYMBOL_GPL(udc_irq); /* Tears down device */ -static void gadget_release(struct device *pdev) +void gadget_release(struct device *pdev) { struct amd5536udc *dev = dev_get_drvdata(pdev); kfree(dev); } +EXPORT_SYMBOL_GPL(gadget_release); /* Cleanup on device remove */ -static void udc_remove(struct udc *dev) +void udc_remove(struct udc *dev) { /* remove timer */ stop_timer++; @@ -3108,9 +3067,10 @@ static void udc_remove(struct udc *dev) del_timer_sync(&udc_pollstall_timer); udc = NULL; } +EXPORT_SYMBOL_GPL(udc_remove); /* free all the dma pools */ -static void free_dma_pools(struct udc *dev) +void free_dma_pools(struct udc *dev) { dma_pool_free(dev->stp_requests, dev->ep[UDC_EP0OUT_IX].td, dev->ep[UDC_EP0OUT_IX].td_phys); @@ -3119,35 +3079,10 @@ static void free_dma_pools(struct udc *dev) dma_pool_destroy(dev->stp_requests); dma_pool_destroy(dev->data_requests); } - -/* Reset all pci context */ -static void udc_pci_remove(struct pci_dev *pdev) -{ - struct udc *dev; - - dev = pci_get_drvdata(pdev); - - usb_del_gadget_udc(&udc->gadget); - /* gadget driver must not be registered */ - if (WARN_ON(dev->driver)) - return; - - /* dma 
pool cleanup */ - free_dma_pools(dev); - - /* reset controller */ - writel(AMD_BIT(UDC_DEVCFG_SOFTRESET), &dev->regs->cfg); - free_irq(pdev->irq, dev); - iounmap(dev->virt_addr); - release_mem_region(pci_resource_start(pdev, 0), - pci_resource_len(pdev, 0)); - pci_disable_device(pdev); - - udc_remove(dev); -} +EXPORT_SYMBOL_GPL(free_dma_pools); /* create dma pools on init */ -static int init_dma_pools(struct udc *dev) +int init_dma_pools(struct udc *dev) { struct udc_stp_dma *td_stp; struct udc_data_dma *td_data; @@ -3210,9 +3145,10 @@ static int init_dma_pools(struct udc *dev) dev->data_requests = NULL; return retval; } +EXPORT_SYMBOL_GPL(init_dma_pools); /* general probe */ -static int udc_probe(struct udc *dev) +int udc_probe(struct udc *dev) { char tmp[128]; u32 reg; @@ -3276,137 +3212,7 @@ static int udc_probe(struct udc *dev) finished: return retval; } - -/* Called by pci bus driver to init pci context */ -static int udc_pci_probe( - struct pci_dev *pdev, - const struct pci_device_id *id -) -{ - struct udc *dev; - unsigned long resource; - unsigned long len; - int retval = 0; - - /* one udc only */ - if (udc) { - dev_dbg(&pdev->dev, "already probed\n"); - return -EBUSY; - } - - /* init */ - dev = kzalloc(sizeof(struct udc), GFP_KERNEL); - if (!dev) - return -ENOMEM; - - /* pci setup */ - if (pci_enable_device(pdev) < 0) { - retval = -ENODEV; - goto err_pcidev; - } - - /* PCI resource allocation */ - resource = pci_resource_start(pdev, 0); - len = pci_resource_len(pdev, 0); - - if (!request_mem_region(resource, len, name)) { - dev_dbg(&pdev->dev, "pci device used already\n"); - retval = -EBUSY; - goto err_memreg; - } - - dev->virt_addr = ioremap_nocache(resource, len); - if (!dev->virt_addr) { - dev_dbg(&pdev->dev, "start address cannot be mapped\n"); - retval = -EFAULT; - goto err_ioremap; - } - - if (!pdev->irq) { - dev_err(&pdev->dev, "irq not set\n"); - retval = -ENODEV; - goto err_irq; - } - - spin_lock_init(&dev->lock); - /* udc csr registers base */ - dev->csr = dev->virt_addr + UDC_CSR_ADDR; - /* dev registers base */ - dev->regs = dev->virt_addr + UDC_DEVCFG_ADDR; - /* ep registers base */ - dev->ep_regs = dev->virt_addr + UDC_EPREGS_ADDR; - /* fifo's base */ - dev->rxfifo = (u32 __iomem *)(dev->virt_addr + UDC_RXFIFO_ADDR); - dev->txfifo = (u32 __iomem *)(dev->virt_addr + UDC_TXFIFO_ADDR); - - if (request_irq(pdev->irq, udc_irq, IRQF_SHARED, name, dev) != 0) { - dev_dbg(&pdev->dev, "request_irq(%d) fail\n", pdev->irq); - retval = -EBUSY; - goto err_irq; - } - - pci_set_drvdata(pdev, dev); - - /* chip revision for Hs AMD5536 */ - dev->chiprev = pdev->revision; - - pci_set_master(pdev); - pci_try_set_mwi(pdev); - - /* init dma pools */ - if (use_dma) { - retval = init_dma_pools(dev); - if (retval != 0) - goto err_dma; - } - - dev->phys_addr = resource; - dev->irq = pdev->irq; - dev->pdev = pdev; - - /* general probing */ - if (udc_probe(dev)) { - retval = -ENODEV; - goto err_probe; - } - return 0; - -err_probe: - if (use_dma) - free_dma_pools(dev); -err_dma: - free_irq(pdev->irq, dev); -err_irq: - iounmap(dev->virt_addr); -err_ioremap: - release_mem_region(resource, len); -err_memreg: - pci_disable_device(pdev); -err_pcidev: - kfree(dev); - return retval; -} - -/* PCI device parameters */ -static const struct pci_device_id pci_id[] = { - { - PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x2096), - .class = PCI_CLASS_SERIAL_USB_DEVICE, - .class_mask = 0xffffffff, - }, - {}, -}; -MODULE_DEVICE_TABLE(pci, pci_id); - -/* PCI functions */ -static struct pci_driver udc_pci_driver = { - .name = 
(char *) name, - .id_table = pci_id, - .probe = udc_pci_probe, - .remove = udc_pci_remove, -}; - -module_pci_driver(udc_pci_driver); +EXPORT_SYMBOL_GPL(udc_probe); MODULE_DESCRIPTION(UDC_MOD_DESCRIPTION); MODULE_AUTHOR("Thomas Dahlmann"); diff --git a/drivers/usb/gadget/udc/amd5536udc.h b/drivers/usb/gadget/udc/amd5536udc.h index 4638d707f1698940eea06131fe0baf0cfbb2032f..fae49bf3833e603a8bb7e0a305d0655bea97a73e 100644 --- a/drivers/usb/gadget/udc/amd5536udc.h +++ b/drivers/usb/gadget/udc/amd5536udc.h @@ -13,6 +13,12 @@ #ifndef AMD5536UDC_H #define AMD5536UDC_H +/* debug control */ +/* #define UDC_VERBOSE */ + +#include +#include + /* various constants */ #define UDC_RDE_TIMER_SECONDS 1 #define UDC_RDE_TIMER_DIV 10 @@ -545,8 +551,8 @@ struct udc { u32 __iomem *txfifo; /* DMA desc pools */ - struct pci_pool *data_requests; - struct pci_pool *stp_requests; + struct dma_pool *data_requests; + struct dma_pool *stp_requests; /* device data */ unsigned long phys_addr; @@ -567,6 +573,36 @@ union udc_setup_data { struct usb_ctrlrequest request; }; +/* Function declarations */ +int udc_enable_dev_setup_interrupts(struct udc *dev); +int udc_mask_unused_interrupts(struct udc *dev); +irqreturn_t udc_irq(int irq, void *pdev); +void gadget_release(struct device *pdev); +void udc_basic_init(struct udc *dev); +void free_dma_pools(struct udc *dev); +int init_dma_pools(struct udc *dev); +void udc_remove(struct udc *dev); +int udc_probe(struct udc *dev); + +/* DMA usage flag */ +static bool use_dma = 1; +/* packet per buffer dma */ +static bool use_dma_ppb = 1; +/* with per descr. update */ +static bool use_dma_ppb_du; +/* full speed only mode */ +static bool use_fullspeed; + +/* module parameters */ +module_param(use_dma, bool, S_IRUGO); +MODULE_PARM_DESC(use_dma, "true for DMA"); +module_param(use_dma_ppb, bool, S_IRUGO); +MODULE_PARM_DESC(use_dma_ppb, "true for DMA in packet per buffer mode"); +module_param(use_dma_ppb_du, bool, S_IRUGO); +MODULE_PARM_DESC(use_dma_ppb_du, + "true for DMA in packet per buffer mode with descriptor update"); +module_param(use_fullspeed, bool, S_IRUGO); +MODULE_PARM_DESC(use_fullspeed, "true for fullspeed only"); /* *--------------------------------------------------------------------------- * SET and GET bitfields in u32 values diff --git a/drivers/usb/gadget/udc/amd5536udc_pci.c b/drivers/usb/gadget/udc/amd5536udc_pci.c new file mode 100644 index 0000000000000000000000000000000000000000..2a2d0a96fe246d4be570da05c228ea97528e86d3 --- /dev/null +++ b/drivers/usb/gadget/udc/amd5536udc_pci.c @@ -0,0 +1,217 @@ +/* + * amd5536udc_pci.c -- AMD 5536 UDC high/full speed USB device controller + * + * Copyright (C) 2005-2007 AMD (http://www.amd.com) + * Author: Thomas Dahlmann + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +/* + * The AMD5536 UDC is part of the x86 southbridge AMD Geode CS5536. + * It is a USB Highspeed DMA capable USB device controller. Beside ep0 it + * provides 4 IN and 4 OUT endpoints (bulk or interrupt type). + * + * Make sure that UDC is assigned to port 4 by BIOS settings (port can also + * be used as host port) and UOC bits PAD_EN and APU are set (should be done + * by BIOS init). + * + * UDC DMA requires 32-bit aligned buffers so DMA with gadget ether does not + * work without updating NET_IP_ALIGN. 
Or PIO mode (module param "use_dma=0") + * can be used with gadget ether. + * + * This file handles PCI device registration; the core driver implementation + * lives in amd5536udc.c. + * + * The driver is split so that the core UDC driver, which is based on the + * Synopsys device controller IP (different from the HS OTG IP), can also + * be used in UDCs integrated into SoC platforms. + * + */ + +/* Driver strings */ +#define UDC_MOD_DESCRIPTION "AMD 5536 UDC - USB Device Controller" + +/* system */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* udc specific */ +#include "amd5536udc.h" + +/* pointer to device object */ +static struct udc *udc; + +/* description */ +static const char mod_desc[] = UDC_MOD_DESCRIPTION; +static const char name[] = "amd5536udc-pci"; + +/* Reset all pci context */ +static void udc_pci_remove(struct pci_dev *pdev) +{ + struct udc *dev; + + dev = pci_get_drvdata(pdev); + + usb_del_gadget_udc(&udc->gadget); + /* gadget driver must not be registered */ + if (WARN_ON(dev->driver)) + return; + + /* dma pool cleanup */ + free_dma_pools(dev); + + /* reset controller */ + writel(AMD_BIT(UDC_DEVCFG_SOFTRESET), &dev->regs->cfg); + free_irq(pdev->irq, dev); + iounmap(dev->virt_addr); + release_mem_region(pci_resource_start(pdev, 0), + pci_resource_len(pdev, 0)); + pci_disable_device(pdev); + + udc_remove(dev); +} + +/* Called by pci bus driver to init pci context */ +static int udc_pci_probe( + struct pci_dev *pdev, + const struct pci_device_id *id +) +{ + struct udc *dev; + unsigned long resource; + unsigned long len; + int retval = 0; + + /* one udc only */ + if (udc) { + dev_dbg(&pdev->dev, "already probed\n"); + return -EBUSY; + } + + /* init */ + dev = kzalloc(sizeof(struct udc), GFP_KERNEL); + if (!dev) + return -ENOMEM; + + /* pci setup */ + if (pci_enable_device(pdev) < 0) { + retval = -ENODEV; + goto err_pcidev; + } + + /* PCI resource allocation */ + resource = pci_resource_start(pdev, 0); + len = pci_resource_len(pdev, 0); + + if (!request_mem_region(resource, len, name)) { + dev_dbg(&pdev->dev, "pci device used already\n"); + retval = -EBUSY; + goto err_memreg; + } + + dev->virt_addr = ioremap_nocache(resource, len); + if (!dev->virt_addr) { + dev_dbg(&pdev->dev, "start address cannot be mapped\n"); + retval = -EFAULT; + goto err_ioremap; + } + + if (!pdev->irq) { + dev_err(&pdev->dev, "irq not set\n"); + retval = -ENODEV; + goto err_irq; + } + + spin_lock_init(&dev->lock); + /* udc csr registers base */ + dev->csr = dev->virt_addr + UDC_CSR_ADDR; + /* dev registers base */ + dev->regs = dev->virt_addr + UDC_DEVCFG_ADDR; + /* ep registers base */ + dev->ep_regs = dev->virt_addr + UDC_EPREGS_ADDR; + /* fifo's base */ + dev->rxfifo = (u32 __iomem *)(dev->virt_addr + UDC_RXFIFO_ADDR); + dev->txfifo = (u32 __iomem *)(dev->virt_addr + UDC_TXFIFO_ADDR); + + if (request_irq(pdev->irq, udc_irq, IRQF_SHARED, name, dev) != 0) { + dev_dbg(&pdev->dev, "request_irq(%d) fail\n", pdev->irq); + retval = -EBUSY; + goto err_irq; + } + + pci_set_drvdata(pdev, dev); + + /* chip revision for Hs AMD5536 */ + dev->chiprev = pdev->revision; + + pci_set_master(pdev); + pci_try_set_mwi(pdev); + + /* init dma pools */ + if (use_dma) { + retval = init_dma_pools(dev); + if (retval != 0) + goto err_dma; + } + + dev->phys_addr = resource; + dev->irq = pdev->irq; + dev->pdev = pdev; + + /* general probing */ + if (udc_probe(dev)) { + retval = -ENODEV; + goto err_probe; + } + return 0; + +err_probe: + if (use_dma) + free_dma_pools(dev); +err_dma: + free_irq(pdev->irq,
dev); +err_irq: + iounmap(dev->virt_addr); +err_ioremap: + release_mem_region(resource, len); +err_memreg: + pci_disable_device(pdev); +err_pcidev: + kfree(dev); + return retval; +} + +/* PCI device parameters */ +static const struct pci_device_id pci_id[] = { + { + PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x2096), + .class = PCI_CLASS_SERIAL_USB_DEVICE, + .class_mask = 0xffffffff, + }, + {}, +}; +MODULE_DEVICE_TABLE(pci, pci_id); + +/* PCI functions */ +static struct pci_driver udc_pci_driver = { + .name = (char *) name, + .id_table = pci_id, + .probe = udc_pci_probe, + .remove = udc_pci_remove, +}; +module_pci_driver(udc_pci_driver); + +MODULE_DESCRIPTION(UDC_MOD_DESCRIPTION); +MODULE_AUTHOR("Thomas Dahlmann"); +MODULE_LICENSE("GPL"); diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c index 2035906b8ced173c2e869a3272334d4265acf79c..3ccc34176a5aabb43cc4ee0bba90efa6fb32f551 100644 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c @@ -321,7 +321,6 @@ static inline void usba_cleanup_debugfs(struct usba_udc *udc) static ushort fifo_mode; -/* "modprobe ... fifo_mode=1" etc */ module_param(fifo_mode, ushort, 0x0); MODULE_PARM_DESC(fifo_mode, "Endpoint configuration mode"); @@ -371,7 +370,7 @@ static struct usba_fifo_cfg mode_4_cfg[] = { }; /* Add additional configurations here */ -int usba_config_fifo_table(struct usba_udc *udc) +static int usba_config_fifo_table(struct usba_udc *udc) { int n; @@ -1076,11 +1075,9 @@ static int atmel_usba_start(struct usb_gadget *gadget, struct usb_gadget_driver *driver); static int atmel_usba_stop(struct usb_gadget *gadget); -static struct usb_ep *atmel_usba_match_ep( - struct usb_gadget *gadget, - struct usb_endpoint_descriptor *desc, - struct usb_ss_ep_comp_descriptor *ep_comp -) +static struct usb_ep *atmel_usba_match_ep(struct usb_gadget *gadget, + struct usb_endpoint_descriptor *desc, + struct usb_ss_ep_comp_descriptor *ep_comp) { struct usb_ep *_ep; struct usba_ep *ep; @@ -1100,7 +1097,6 @@ static struct usb_ep *atmel_usba_match_ep( ep = to_usba_ep(_ep); switch (usb_endpoint_type(desc)) { - case USB_ENDPOINT_XFER_CONTROL: break; @@ -1141,7 +1137,7 @@ static struct usb_ep *atmel_usba_match_ep( ep->udc->configured_ep++; } -return _ep; + return _ep; } static const struct usb_gadget_ops usba_udc_ops = { @@ -1855,8 +1851,8 @@ static irqreturn_t usba_udc_irq(int irq, void *devid) * but it's clearly harmless... */ if (!(usba_ep_readl(ep0, CFG) & USBA_EPT_MAPPED)) - dev_dbg(&udc->pdev->dev, - "ODD: EP0 configuration is invalid!\n"); + dev_err(&udc->pdev->dev, + "ODD: EP0 configuration is invalid!\n"); /* Preallocate other endpoints */ n = fifo_mode ? 
udc->num_ep : udc->configured_ep; @@ -1864,8 +1860,8 @@ static irqreturn_t usba_udc_irq(int irq, void *devid) ep = &udc->usba_ep[i]; usba_ep_writel(ep, CFG, ep->ept_cfg); if (!(usba_ep_readl(ep, CFG) & USBA_EPT_MAPPED)) - dev_dbg(&udc->pdev->dev, - "ODD: EP%d configuration is invalid!\n", i); + dev_err(&udc->pdev->dev, + "ODD: EP%d configuration is invalid!\n", i); } } @@ -2089,8 +2085,9 @@ static struct usba_ep * atmel_udc_of_init(struct platform_device *pdev, while ((pp = of_get_next_child(np, pp))) udc->num_ep++; udc->configured_ep = 1; - } else + } else { udc->num_ep = usba_config_fifo_table(udc); + } eps = devm_kzalloc(&pdev->dev, sizeof(struct usba_ep) * udc->num_ep, GFP_KERNEL); @@ -2118,14 +2115,34 @@ static struct usba_ep * atmel_udc_of_init(struct platform_device *pdev, dev_err(&pdev->dev, "of_probe: fifo-size error(%d)\n", ret); goto err; } - ep->fifo_size = fifo_mode ? udc->fifo_cfg[i].fifo_size : val; + if (fifo_mode) { + if (val < udc->fifo_cfg[i].fifo_size) { + dev_warn(&pdev->dev, + "Using max fifo-size value from DT\n"); + ep->fifo_size = val; + } else { + ep->fifo_size = udc->fifo_cfg[i].fifo_size; + } + } else { + ep->fifo_size = val; + } ret = of_property_read_u32(pp, "atmel,nb-banks", &val); if (ret) { dev_err(&pdev->dev, "of_probe: nb-banks error(%d)\n", ret); goto err; } - ep->nr_banks = fifo_mode ? udc->fifo_cfg[i].nr_banks : val; + if (fifo_mode) { + if (val < udc->fifo_cfg[i].nr_banks) { + dev_warn(&pdev->dev, + "Using max nb-banks value from DT\n"); + ep->nr_banks = val; + } else { + ep->nr_banks = udc->fifo_cfg[i].nr_banks; + } + } else { + ep->nr_banks = val; + } ep->can_dma = of_property_read_bool(pp, "atmel,can-dma"); ep->can_isoc = of_property_read_bool(pp, "atmel,can-isoc"); diff --git a/drivers/usb/gadget/udc/bdc/Kconfig b/drivers/usb/gadget/udc/bdc/Kconfig index 0d7b8c9f72fda17e4a5cfb904592806af817b2ae..eb8b553923609fbd09a0631ae3eaed3b1c5fff87 100644 --- a/drivers/usb/gadget/udc/bdc/Kconfig +++ b/drivers/usb/gadget/udc/bdc/Kconfig @@ -14,7 +14,7 @@ if USB_BDC_UDC comment "Platform Support" config USB_BDC_PCI tristate "BDC support for PCIe based platforms" - depends on PCI + depends on USB_PCI default USB_BDC_UDC help Enable support for platforms which have BDC connected through PCIe, such as Lego3 FPGA platform. 
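The atmel_usba_udc probe change above makes the DT-supplied fifo-size and nb-banks act as upper bounds when a fifo_mode table is in force, instead of letting the table value win unconditionally. A minimal C sketch of that clamping rule, with a hypothetical helper name (the driver itself open-codes the logic separately for each property):

#include <linux/device.h>
#include <linux/types.h>

/*
 * Hypothetical helper mirroring the probe logic above: with fifo_mode
 * set, the fifo_cfg table value is used unless the device tree
 * advertises a smaller (more restrictive) limit, in which case the DT
 * value wins and a warning notes that DT capped the table value.
 */
static u32 usba_clamped_ep_val(struct device *dev, const char *prop,
			       u32 dt_val, u32 cfg_val)
{
	if (dt_val < cfg_val) {
		dev_warn(dev, "Using max %s value from DT\n", prop);
		return dt_val;
	}
	return cfg_val;
}

With fifo_mode == 0 the DT value is still used directly, exactly as before the change.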
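The gadgetfs (legacy/inode.c) and u_fs.h hunks earlier in this section are part of a wider atomic_t-to-refcount_t conversion: refcount_t saturates instead of wrapping and warns on misuse such as incrementing from zero, so reference-count bugs fail loudly rather than turning into use-after-free. A minimal, self-contained sketch of the get/put pattern being converted (the struct and function names are illustrative only, not driver code):

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
	refcount_t count;
	/* ... payload ... */
};

static struct obj *obj_new(void)
{
	struct obj *o = kzalloc(sizeof(*o), GFP_KERNEL);

	if (o)
		refcount_set(&o->count, 1);	/* initial reference */
	return o;
}

static inline void obj_get(struct obj *o)
{
	refcount_inc(&o->count);	/* WARNs on 0, saturates on overflow */
}

static void obj_put(struct obj *o)
{
	/* free only when the last reference is dropped */
	if (refcount_dec_and_test(&o->count))
		kfree(o);
}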
diff --git a/drivers/usb/gadget/udc/core.c b/drivers/usb/gadget/udc/core.c index d685d82dcf487c9f53bb3cd821424159d2e445d1..efce68e9a8e038557bdc36c8263bad03cef7e108 100644 --- a/drivers/usb/gadget/udc/core.c +++ b/drivers/usb/gadget/udc/core.c @@ -1273,6 +1273,7 @@ void usb_del_gadget_udc(struct usb_gadget *gadget) flush_work(&gadget->work); device_unregister(&udc->dev); device_unregister(&gadget->dev); + memset(&gadget->dev, 0x00, sizeof(gadget->dev)); } EXPORT_SYMBOL_GPL(usb_del_gadget_udc); diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index 8cabc5944d5f1d834db7dd2186777cd79536016b..c79081952ea068c770887298ed1c5483f0e61dac 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -2062,16 +2062,13 @@ static int dummy_hub_control( } break; case USB_PORT_FEAT_POWER: - if (hcd->speed == HCD_USB3) { - if (dum_hcd->port_status & USB_PORT_STAT_POWER) - dev_dbg(dummy_dev(dum_hcd), - "power-off\n"); - } else - if (dum_hcd->port_status & - USB_SS_PORT_STAT_POWER) - dev_dbg(dummy_dev(dum_hcd), - "power-off\n"); - /* FALLS THROUGH */ + dev_dbg(dummy_dev(dum_hcd), "power-off\n"); + if (hcd->speed == HCD_USB3) + dum_hcd->port_status &= ~USB_SS_PORT_STAT_POWER; + else + dum_hcd->port_status &= ~USB_PORT_STAT_POWER; + set_link_state(dum_hcd); + break; default: dum_hcd->port_status &= ~(1 << wValue); set_link_state(dum_hcd); @@ -2242,14 +2239,13 @@ static int dummy_hub_control( if ((dum_hcd->port_status & USB_SS_PORT_STAT_POWER) != 0) { dum_hcd->port_status |= (1 << wValue); - set_link_state(dum_hcd); } } else if ((dum_hcd->port_status & USB_PORT_STAT_POWER) != 0) { dum_hcd->port_status |= (1 << wValue); - set_link_state(dum_hcd); } + set_link_state(dum_hcd); } break; case GetPortErrorCount: diff --git a/drivers/usb/gadget/udc/fsl_udc_core.c b/drivers/usb/gadget/udc/fsl_udc_core.c index b76fcdb763a0d27c5606e97807b18bf14e05ba9f..6f2f71c054be27f66cf56d56012aaf7332d2dad4 100644 --- a/drivers/usb/gadget/udc/fsl_udc_core.c +++ b/drivers/usb/gadget/udc/fsl_udc_core.c @@ -2675,6 +2675,8 @@ static const struct platform_device_id fsl_udc_devtype[] = { .name = "imx-udc-mx27", }, { .name = "imx-udc-mx51", + }, { + .name = "fsl-usb2-udc", }, { /* sentinel */ } diff --git a/drivers/usb/gadget/udc/mv_u3d_core.c b/drivers/usb/gadget/udc/mv_u3d_core.c index d365449a295a8ab823f9358146a0a910bcabba2d..772049afe1666404dcdf0ee1310feb344d574f33 100644 --- a/drivers/usb/gadget/udc/mv_u3d_core.c +++ b/drivers/usb/gadget/udc/mv_u3d_core.c @@ -1835,13 +1835,18 @@ static int mv_u3d_probe(struct platform_device *dev) } /* we will access controller register, so enable the u3d controller */ - clk_enable(u3d->clk); + retval = clk_enable(u3d->clk); + if (retval) { + dev_err(&dev->dev, "clk_enable error %d\n", retval); + goto err_u3d_enable; + } if (pdata->phy_init) { retval = pdata->phy_init(u3d->phy_regs); if (retval) { dev_err(&dev->dev, "init phy error %d\n", retval); - goto err_u3d_enable; + clk_disable(u3d->clk); + goto err_phy_init; } } @@ -1974,15 +1979,13 @@ static int mv_u3d_probe(struct platform_device *dev) dma_free_coherent(&dev->dev, u3d->ep_context_size, u3d->ep_context, u3d->ep_context_dma); err_alloc_ep_context: - if (pdata->phy_deinit) - pdata->phy_deinit(u3d->phy_regs); - clk_disable(u3d->clk); +err_phy_init: err_u3d_enable: iounmap(u3d->cap_regs); err_map_cap_regs: err_get_cap_regs: -err_get_clk: clk_put(u3d->clk); +err_get_clk: kfree(u3d); err_alloc_private: err_pdata: diff --git a/drivers/usb/gadget/udc/mv_udc_core.c 
b/drivers/usb/gadget/udc/mv_udc_core.c index 27ebb0d5449d0dda58863ffb28bcbd8f3514f93f..76f56c5762f9659e27aeb1304a4ff145986685ab 100644 --- a/drivers/usb/gadget/udc/mv_udc_core.c +++ b/drivers/usb/gadget/udc/mv_udc_core.c @@ -445,7 +445,8 @@ static int mv_ep_enable(struct usb_ep *_ep, struct mv_dqh *dqh; u16 max = 0; u32 bit_pos, epctrlx, direction; - unsigned char zlt = 0, ios = 0, mult = 0; + const unsigned char zlt = 1; + unsigned char ios, mult; unsigned long flags; ep = container_of(_ep, struct mv_ep, ep); @@ -465,8 +466,6 @@ static int mv_ep_enable(struct usb_ep *_ep, * disable HW zero length termination select * driver handles zero length packet through req->req.zero */ - zlt = 1; - bit_pos = 1 << ((direction == EP_DIR_OUT ? 0 : 16) + ep->ep_num); /* Check if the Endpoint is Primed */ @@ -481,16 +480,16 @@ static int mv_ep_enable(struct usb_ep *_ep, (unsigned)bit_pos); goto en_done; } + /* Set the max packet length, interrupt on Setup and Mult fields */ + ios = 0; + mult = 0; switch (desc->bmAttributes & USB_ENDPOINT_XFERTYPE_MASK) { case USB_ENDPOINT_XFER_BULK: - zlt = 1; - mult = 0; + case USB_ENDPOINT_XFER_INT: break; case USB_ENDPOINT_XFER_CONTROL: ios = 1; - case USB_ENDPOINT_XFER_INT: - mult = 0; break; case USB_ENDPOINT_XFER_ISOC: /* Calculate transactions needed for high bandwidth iso */ diff --git a/drivers/usb/gadget/udc/net2272.c b/drivers/usb/gadget/udc/net2272.c index 7dc0102abdfe240ade3729e847b57b43fde768f1..8f85a51bd2b3ca5237f078ed00ee4d7ad201e10a 100644 --- a/drivers/usb/gadget/udc/net2272.c +++ b/drivers/usb/gadget/udc/net2272.c @@ -653,7 +653,7 @@ net2272_request_dma(struct net2272 *dev, unsigned ep, u32 buf, dev->dma_busy = 1; /* initialize platform's dma */ -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI /* NET2272 addr, buffer addr, length, etc. 
*/ switch (dev->dev_id) { case PCI_DEVICE_ID_RDK1: @@ -701,7 +701,7 @@ static void net2272_start_dma(struct net2272 *dev) { /* start platform's dma controller */ -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI switch (dev->dev_id) { case PCI_DEVICE_ID_RDK1: writeb((1 << CHANNEL_ENABLE) | (1 << CHANNEL_START), @@ -797,7 +797,7 @@ net2272_kick_dma(struct net2272_ep *ep, struct net2272_request *req) static void net2272_cancel_dma(struct net2272 *dev) { -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI switch (dev->dev_id) { case PCI_DEVICE_ID_RDK1: writeb(0, dev->rdk1.plx9054_base_addr + DMACSR0); @@ -2306,7 +2306,7 @@ net2272_probe_fin(struct net2272 *dev, unsigned int irqflags) return ret; } -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI /* * wrap this driver around the specified device, but diff --git a/drivers/usb/gadget/udc/net2272.h b/drivers/usb/gadget/udc/net2272.h index 127ab03fcde3ba40bf539d231ac92baaeff7f265..69bc9c3c6ce40b871cdf508fb403da74cb7c8ac5 100644 --- a/drivers/usb/gadget/udc/net2272.h +++ b/drivers/usb/gadget/udc/net2272.h @@ -472,7 +472,7 @@ struct net2272 { unsigned int base_shift; u16 __iomem *base_addr; union { -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI struct { void __iomem *plx9054_base_addr; void __iomem *epld_base_addr; diff --git a/drivers/usb/gadget/udc/net2280.c b/drivers/usb/gadget/udc/net2280.c index 3828c2ec8623b155c9948ae90dcebdc4a0106ce6..6cf07857eacaa19b873b18c299079fd07507a5cf 100644 --- a/drivers/usb/gadget/udc/net2280.c +++ b/drivers/usb/gadget/udc/net2280.c @@ -569,7 +569,7 @@ static struct usb_request if (ep->dma) { struct net2280_dma *td; - td = pci_pool_alloc(ep->dev->requests, gfp_flags, + td = dma_pool_alloc(ep->dev->requests, gfp_flags, &req->td_dma); if (!td) { kfree(req); @@ -597,7 +597,7 @@ static void net2280_free_request(struct usb_ep *_ep, struct usb_request *_req) req = container_of(_req, struct net2280_request, req); WARN_ON(!list_empty(&req->queue)); if (req->td) - pci_pool_free(ep->dev->requests, req->td, req->td_dma); + dma_pool_free(ep->dev->requests, req->td, req->td_dma); kfree(req); } @@ -3579,10 +3579,10 @@ static void net2280_remove(struct pci_dev *pdev) for (i = 1; i < 5; i++) { if (!dev->ep[i].dummy) continue; - pci_pool_free(dev->requests, dev->ep[i].dummy, + dma_pool_free(dev->requests, dev->ep[i].dummy, dev->ep[i].td_dma); } - pci_pool_destroy(dev->requests); + dma_pool_destroy(dev->requests); } if (dev->got_irq) free_irq(pdev->irq, dev); @@ -3724,7 +3724,7 @@ static int net2280_probe(struct pci_dev *pdev, const struct pci_device_id *id) /* DMA setup */ /* NOTE: we know only the 32 LSBs of dma addresses may be nonzero */ - dev->requests = pci_pool_create("requests", pdev, + dev->requests = dma_pool_create("requests", &pdev->dev, sizeof(struct net2280_dma), 0 /* no alignment requirements */, 0 /* or page-crossing issues */); @@ -3736,7 +3736,7 @@ static int net2280_probe(struct pci_dev *pdev, const struct pci_device_id *id) for (i = 1; i < 5; i++) { struct net2280_dma *td; - td = pci_pool_alloc(dev->requests, GFP_KERNEL, + td = dma_pool_alloc(dev->requests, GFP_KERNEL, &dev->ep[i].td_dma); if (!td) { ep_dbg(dev, "can't get dummy %d\n", i); diff --git a/drivers/usb/gadget/udc/net2280.h b/drivers/usb/gadget/udc/net2280.h index 2736a95751c3834fe5110f36201fa6363a00f33f..1088c3745999a364c225a809f68e3bd237faed95 100644 --- a/drivers/usb/gadget/udc/net2280.h +++ b/drivers/usb/gadget/udc/net2280.h @@ -187,7 +187,7 @@ struct net2280 { struct usb338x_ll_chi_regs __iomem *ll_chicken_reg; struct usb338x_pl_regs __iomem *plregs; - struct pci_pool *requests; 
+ struct dma_pool *requests; /* statistics...*/ }; diff --git a/drivers/usb/gadget/udc/pch_udc.c b/drivers/usb/gadget/udc/pch_udc.c index 8a365aad66fe2e38eaf0a869ae0f774349f694f2..84dcbcd756f04062b423fe95bd7b7938b206c4e6 100644 --- a/drivers/usb/gadget/udc/pch_udc.c +++ b/drivers/usb/gadget/udc/pch_udc.c @@ -355,8 +355,8 @@ struct pch_udc_dev { vbus_session:1, set_cfg_not_acked:1, waiting_zlp_ack:1; - struct pci_pool *data_requests; - struct pci_pool *stp_requests; + struct dma_pool *data_requests; + struct dma_pool *stp_requests; dma_addr_t dma_addr; struct usb_ctrlrequest setup_data; void __iomem *base_addr; @@ -1522,7 +1522,8 @@ static void pch_udc_free_dma_chain(struct pch_udc_dev *dev, /* do not free first desc., will be done by free for request */ td = phys_to_virt(addr); addr2 = (dma_addr_t)td->next; - pci_pool_free(dev->data_requests, td, addr); + dma_pool_free(dev->data_requests, td, addr); + td->next = 0x00; addr = addr2; } req->chain_len = 1; @@ -1538,7 +1539,7 @@ static void pch_udc_free_dma_chain(struct pch_udc_dev *dev, * * Return codes: * 0: success, - * -ENOMEM: pci_pool_alloc invocation fails + * -ENOMEM: dma_pool_alloc invocation fails */ static int pch_udc_create_dma_chain(struct pch_udc_ep *ep, struct pch_udc_request *req, @@ -1564,7 +1565,7 @@ static int pch_udc_create_dma_chain(struct pch_udc_ep *ep, if (bytes <= buf_len) break; last = td; - td = pci_pool_alloc(ep->dev->data_requests, gfp_flags, + td = dma_pool_alloc(ep->dev->data_requests, gfp_flags, &dma_addr); if (!td) goto nomem; @@ -1769,7 +1770,7 @@ static struct usb_request *pch_udc_alloc_request(struct usb_ep *usbep, if (!ep->dev->dma_addr) return &req->req; /* ep0 in requests are allocated from data pool here */ - dma_desc = pci_pool_alloc(ep->dev->data_requests, gfp, + dma_desc = dma_pool_alloc(ep->dev->data_requests, gfp, &req->td_data_phys); if (NULL == dma_desc) { kfree(req); @@ -1808,7 +1809,7 @@ static void pch_udc_free_request(struct usb_ep *usbep, if (req->td_data != NULL) { if (req->chain_len > 1) pch_udc_free_dma_chain(ep->dev, req); - pci_pool_free(ep->dev->data_requests, req->td_data, + dma_pool_free(ep->dev->data_requests, req->td_data, req->td_data_phys); } kfree(req); @@ -2913,7 +2914,7 @@ static int init_dma_pools(struct pch_udc_dev *dev) void *ep0out_buf; /* DMA setup */ - dev->data_requests = pci_pool_create("data_requests", dev->pdev, + dev->data_requests = dma_pool_create("data_requests", &dev->pdev->dev, sizeof(struct pch_udc_data_dma_desc), 0, 0); if (!dev->data_requests) { dev_err(&dev->pdev->dev, "%s: can't get request data pool\n", @@ -2922,7 +2923,7 @@ static int init_dma_pools(struct pch_udc_dev *dev) } /* dma desc for setup data */ - dev->stp_requests = pci_pool_create("setup requests", dev->pdev, + dev->stp_requests = dma_pool_create("setup requests", &dev->pdev->dev, sizeof(struct pch_udc_stp_dma_desc), 0, 0); if (!dev->stp_requests) { dev_err(&dev->pdev->dev, "%s: can't get setup request pool\n", @@ -2930,7 +2931,7 @@ static int init_dma_pools(struct pch_udc_dev *dev) return -ENOMEM; } /* setup */ - td_stp = pci_pool_alloc(dev->stp_requests, GFP_KERNEL, + td_stp = dma_pool_alloc(dev->stp_requests, GFP_KERNEL, &dev->ep[UDC_EP0OUT_IDX].td_stp_phys); if (!td_stp) { dev_err(&dev->pdev->dev, @@ -2940,7 +2941,7 @@ static int init_dma_pools(struct pch_udc_dev *dev) dev->ep[UDC_EP0OUT_IDX].td_stp = td_stp; /* data: 0 packets !? 
*/ - td_data = pci_pool_alloc(dev->data_requests, GFP_KERNEL, + td_data = dma_pool_alloc(dev->data_requests, GFP_KERNEL, &dev->ep[UDC_EP0OUT_IDX].td_data_phys); if (!td_data) { dev_err(&dev->pdev->dev, @@ -3020,22 +3021,21 @@ static void pch_udc_remove(struct pci_dev *pdev) dev_err(&pdev->dev, "%s: gadget driver still bound!!!\n", __func__); /* dma pool cleanup */ - if (dev->data_requests) - pci_pool_destroy(dev->data_requests); + dma_pool_destroy(dev->data_requests); if (dev->stp_requests) { /* cleanup DMA desc's for ep0in */ if (dev->ep[UDC_EP0OUT_IDX].td_stp) { - pci_pool_free(dev->stp_requests, + dma_pool_free(dev->stp_requests, dev->ep[UDC_EP0OUT_IDX].td_stp, dev->ep[UDC_EP0OUT_IDX].td_stp_phys); } if (dev->ep[UDC_EP0OUT_IDX].td_data) { - pci_pool_free(dev->stp_requests, + dma_pool_free(dev->stp_requests, dev->ep[UDC_EP0OUT_IDX].td_data, dev->ep[UDC_EP0OUT_IDX].td_data_phys); } - pci_pool_destroy(dev->stp_requests); + dma_pool_destroy(dev->stp_requests); } if (dev->dma_addr) diff --git a/drivers/usb/gadget/udc/pxa27x_udc.c b/drivers/usb/gadget/udc/pxa27x_udc.c index 832c4fdbe98512a2b70b6b9e9424acc21a09b83b..d48e239660c31170696042313ebd1a66e79621db 100644 --- a/drivers/usb/gadget/udc/pxa27x_udc.c +++ b/drivers/usb/gadget/udc/pxa27x_udc.c @@ -1608,9 +1608,6 @@ static int pxa_udc_pullup(struct usb_gadget *_gadget, int is_active) return 0; } -static void udc_enable(struct pxa_udc *udc); -static void udc_disable(struct pxa_udc *udc); - /** * pxa_udc_vbus_session - Called by external transceiver to enable/disable udc * @_gadget: usb gadget diff --git a/drivers/usb/gadget/udc/renesas_usb3.c b/drivers/usb/gadget/udc/renesas_usb3.c index 2218f91e92a61cd8078afe857ba347463aa8ff63..5a2d845fb1a68708a3ea27b3822ae8de7afb85df 100644 --- a/drivers/usb/gadget/udc/renesas_usb3.c +++ b/drivers/usb/gadget/udc/renesas_usb3.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -37,6 +38,9 @@ #define USB3_USB_INT_ENA_2 0x22c #define USB3_STUP_DAT_0 0x230 #define USB3_STUP_DAT_1 0x234 +#define USB3_USB_OTG_STA 0x268 +#define USB3_USB_OTG_INT_STA 0x26c +#define USB3_USB_OTG_INT_ENA 0x270 #define USB3_P0_MOD 0x280 #define USB3_P0_CON 0x288 #define USB3_P0_STA 0x28c @@ -124,6 +128,9 @@ /* USB_INT_ENA_2 and USB_INT_STA_2 */ #define USB_INT_2_PIPE(n) BIT(n) +/* USB_OTG_STA, USB_OTG_INT_STA and USB_OTG_INT_ENA */ +#define USB_OTG_IDMON BIT(4) + /* P0_MOD */ #define P0_MOD_DIR BIT(6) @@ -257,6 +264,8 @@ struct renesas_usb3 { struct usb_gadget gadget; struct usb_gadget_driver *driver; + struct extcon_dev *extcon; + struct work_struct extcon_work; struct renesas_usb3_ep *usb3_ep; int num_usb3_eps; @@ -269,6 +278,8 @@ struct renesas_usb3 { u8 ep0_buf[USB3_EP0_BUF_SIZE]; bool softconnect; bool workaround_for_vbus; + bool extcon_host; /* check id and set EXTCON_USB_HOST */ + bool extcon_usb; /* check vbus and set EXTCON_USB */ }; #define gadget_to_renesas_usb3(_gadget) \ @@ -332,6 +343,15 @@ static int usb3_wait(struct renesas_usb3 *usb3, u32 reg, u32 mask, return -EBUSY; } +static void renesas_usb3_extcon_work(struct work_struct *work) +{ + struct renesas_usb3 *usb3 = container_of(work, struct renesas_usb3, + extcon_work); + + extcon_set_state_sync(usb3->extcon, EXTCON_USB_HOST, usb3->extcon_host); + extcon_set_state_sync(usb3->extcon, EXTCON_USB, usb3->extcon_usb); +} + static void usb3_enable_irq_1(struct renesas_usb3 *usb3, u32 bits) { usb3_set_bit(usb3, bits, USB3_USB_INT_ENA_1); @@ -352,6 +372,11 @@ static void usb3_disable_pipe_irq(struct renesas_usb3 *usb3, int num) usb3_clear_bit(usb3, 
USB_INT_2_PIPE(num), USB3_USB_INT_ENA_2); } +static bool usb3_is_host(struct renesas_usb3 *usb3) +{ + return !(usb3_read(usb3, USB3_DRD_CON) & DRD_CON_PERI_CON); +} + static void usb3_init_axi_bridge(struct renesas_usb3 *usb3) { /* Set AXI_INT */ @@ -362,10 +387,6 @@ static void usb3_init_axi_bridge(struct renesas_usb3 *usb3) static void usb3_init_epc_registers(struct renesas_usb3 *usb3) { - /* FIXME: How to change host / peripheral mode as well? */ - usb3_set_bit(usb3, DRD_CON_PERI_CON, USB3_DRD_CON); - usb3_clear_bit(usb3, DRD_CON_VBOUT, USB3_DRD_CON); - usb3_write(usb3, ~0, USB3_USB_INT_STA_1); usb3_enable_irq_1(usb3, USB_INT_1_VBUS_CNG); } @@ -531,18 +552,70 @@ static void usb3_check_vbus(struct renesas_usb3 *usb3) if (usb3->workaround_for_vbus) { usb3_connect(usb3); } else { - if (usb3_read(usb3, USB3_USB_STA) & USB_STA_VBUS_STA) + usb3->extcon_usb = !!(usb3_read(usb3, USB3_USB_STA) & + USB_STA_VBUS_STA); + if (usb3->extcon_usb) usb3_connect(usb3); else usb3_disconnect(usb3); + + schedule_work(&usb3->extcon_work); } } +static void usb3_set_mode(struct renesas_usb3 *usb3, bool host) +{ + if (host) + usb3_clear_bit(usb3, DRD_CON_PERI_CON, USB3_DRD_CON); + else + usb3_set_bit(usb3, DRD_CON_PERI_CON, USB3_DRD_CON); +} + +static void usb3_vbus_out(struct renesas_usb3 *usb3, bool enable) +{ + if (enable) + usb3_set_bit(usb3, DRD_CON_VBOUT, USB3_DRD_CON); + else + usb3_clear_bit(usb3, DRD_CON_VBOUT, USB3_DRD_CON); +} + +static void usb3_mode_config(struct renesas_usb3 *usb3, bool host, bool a_dev) +{ + unsigned long flags; + + spin_lock_irqsave(&usb3->lock, flags); + usb3_set_mode(usb3, host); + usb3_vbus_out(usb3, a_dev); + if (!host && a_dev) /* for A-Peripheral */ + usb3_connect(usb3); + spin_unlock_irqrestore(&usb3->lock, flags); +} + +static bool usb3_is_a_device(struct renesas_usb3 *usb3) +{ + return !(usb3_read(usb3, USB3_USB_OTG_STA) & USB_OTG_IDMON); +} + +static void usb3_check_id(struct renesas_usb3 *usb3) +{ + usb3->extcon_host = usb3_is_a_device(usb3); + + if (usb3->extcon_host) + usb3_mode_config(usb3, true, true); + else + usb3_mode_config(usb3, false, false); + + schedule_work(&usb3->extcon_work); +} + static void renesas_usb3_init_controller(struct renesas_usb3 *usb3) { usb3_init_axi_bridge(usb3); usb3_init_epc_registers(usb3); + usb3_write(usb3, USB_OTG_IDMON, USB3_USB_OTG_INT_STA); + usb3_write(usb3, USB_OTG_IDMON, USB3_USB_OTG_INT_ENA); + usb3_check_id(usb3); usb3_check_vbus(usb3); } @@ -551,6 +624,7 @@ static void renesas_usb3_stop_controller(struct renesas_usb3 *usb3) usb3_disconnect(usb3); usb3_write(usb3, 0, USB3_P0_INT_ENA); usb3_write(usb3, 0, USB3_PN_INT_ENA); + usb3_write(usb3, 0, USB3_USB_OTG_INT_ENA); usb3_write(usb3, 0, USB3_USB_INT_ENA_1); usb3_write(usb3, 0, USB3_USB_INT_ENA_2); usb3_write(usb3, 0, USB3_AXI_INT_ENA); @@ -1474,10 +1548,22 @@ static void usb3_irq_epc_int_2(struct renesas_usb3 *usb3, u32 int_sta_2) } } +static void usb3_irq_idmon_change(struct renesas_usb3 *usb3) +{ + usb3_check_id(usb3); +} + +static void usb3_irq_otg_int(struct renesas_usb3 *usb3, u32 otg_int_sta) +{ + if (otg_int_sta & USB_OTG_IDMON) + usb3_irq_idmon_change(usb3); +} + static void usb3_irq_epc(struct renesas_usb3 *usb3) { u32 int_sta_1 = usb3_read(usb3, USB3_USB_INT_STA_1); u32 int_sta_2 = usb3_read(usb3, USB3_USB_INT_STA_2); + u32 otg_int_sta = usb3_read(usb3, USB3_USB_OTG_INT_STA); int_sta_1 &= usb3_read(usb3, USB3_USB_INT_ENA_1); if (int_sta_1) { @@ -1488,6 +1574,12 @@ static void usb3_irq_epc(struct renesas_usb3 *usb3) int_sta_2 &= usb3_read(usb3, USB3_USB_INT_ENA_2); if 
(int_sta_2) usb3_irq_epc_int_2(usb3, int_sta_2); + + otg_int_sta &= usb3_read(usb3, USB3_USB_OTG_INT_ENA); + if (otg_int_sta) { + usb3_write(usb3, otg_int_sta, USB3_USB_OTG_INT_STA); + usb3_irq_otg_int(usb3, otg_int_sta); + } } static irqreturn_t renesas_usb3_irq(int irq, void *_usb3) @@ -1756,11 +1848,49 @@ static const struct usb_gadget_ops renesas_usb3_gadget_ops = { .set_selfpowered = renesas_usb3_set_selfpowered, }; +static ssize_t role_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct renesas_usb3 *usb3 = dev_get_drvdata(dev); + bool new_mode_is_host; + + if (!usb3->driver) + return -ENODEV; + + if (!strncmp(buf, "host", strlen("host"))) + new_mode_is_host = true; + else if (!strncmp(buf, "peripheral", strlen("peripheral"))) + new_mode_is_host = false; + else + return -EINVAL; + + if (new_mode_is_host == usb3_is_host(usb3)) + return -EINVAL; + + usb3_mode_config(usb3, new_mode_is_host, usb3_is_a_device(usb3)); + + return count; +} + +static ssize_t role_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct renesas_usb3 *usb3 = dev_get_drvdata(dev); + + if (!usb3->driver) + return -ENODEV; + + return sprintf(buf, "%s\n", usb3_is_host(usb3) ? "host" : "peripheral"); +} +static DEVICE_ATTR_RW(role); + /*------- platform_driver ------------------------------------------------*/ static int renesas_usb3_remove(struct platform_device *pdev) { struct renesas_usb3 *usb3 = platform_get_drvdata(pdev); + device_remove_file(&pdev->dev, &dev_attr_role); + pm_runtime_put(&pdev->dev); pm_runtime_disable(&pdev->dev); @@ -1894,6 +2024,12 @@ static const struct of_device_id usb3_of_match[] = { }; MODULE_DEVICE_TABLE(of, usb3_of_match); +static const unsigned int renesas_usb3_cable[] = { + EXTCON_USB, + EXTCON_USB_HOST, + EXTCON_NONE, +}; + static int renesas_usb3_probe(struct platform_device *pdev) { struct renesas_usb3 *usb3; @@ -1937,6 +2073,17 @@ static int renesas_usb3_probe(struct platform_device *pdev) if (ret < 0) return ret; + INIT_WORK(&usb3->extcon_work, renesas_usb3_extcon_work); + usb3->extcon = devm_extcon_dev_allocate(&pdev->dev, renesas_usb3_cable); + if (IS_ERR(usb3->extcon)) + return PTR_ERR(usb3->extcon); + + ret = devm_extcon_dev_register(&pdev->dev, usb3->extcon); + if (ret < 0) { + dev_err(&pdev->dev, "Failed to register extcon\n"); + return ret; + } + /* for ep0 handling */ usb3->ep0_req = __renesas_usb3_ep_alloc_request(GFP_KERNEL); if (!usb3->ep0_req) @@ -1946,6 +2093,10 @@ static int renesas_usb3_probe(struct platform_device *pdev) if (ret < 0) goto err_add_udc; + ret = device_create_file(&pdev->dev, &dev_attr_role); + if (ret < 0) + goto err_dev_create; + usb3->workaround_for_vbus = priv->workaround_for_vbus; pm_runtime_enable(&pdev->dev); @@ -1955,6 +2106,9 @@ static int renesas_usb3_probe(struct platform_device *pdev) return 0; +err_dev_create: + usb_del_gadget_udc(&usb3->gadget); + err_add_udc: __renesas_usb3_ep_free_request(usb3->ep0_req); diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig index 407d947b34ea5b524c467c7c992d9774ded5414a..ababb91d654ab2a7ee3885da9c35ae8465290013 100644 --- a/drivers/usb/host/Kconfig +++ b/drivers/usb/host/Kconfig @@ -30,7 +30,7 @@ if USB_XHCI_HCD config USB_XHCI_PCI tristate - depends on PCI + depends on USB_PCI default y config USB_XHCI_PLATFORM @@ -139,7 +139,7 @@ if USB_EHCI_HCD config USB_EHCI_PCI tristate - depends on PCI + depends on USB_PCI default y config USB_EHCI_HCD_PMC_MSP @@ -188,7 +188,7 @@ config USB_EHCI_HCD_OMAP config 
USB_EHCI_HCD_ORION tristate "Support for Marvell EBU on-chip EHCI USB controller" - depends on USB_EHCI_HCD && PLAT_ORION + depends on USB_EHCI_HCD && (PLAT_ORION || ARCH_MVEBU) default y ---help--- Enables support for the on-chip EHCI controller on Marvell's @@ -525,7 +525,7 @@ config USB_OHCI_HCD_PPC_OF config USB_OHCI_HCD_PCI tristate "OHCI support for PCI-bus USB controllers" - depends on PCI + depends on USB_PCI default y select USB_OHCI_LITTLE_ENDIAN ---help--- @@ -606,7 +606,7 @@ endif # USB_OHCI_HCD config USB_UHCI_HCD tristate "UHCI HCD (most Intel and VIA) support" - depends on PCI || USB_UHCI_SUPPORT_NON_PCI_HC + depends on USB_PCI || USB_UHCI_SUPPORT_NON_PCI_HC ---help--- The Universal Host Controller Interface is a standard by Intel for accessing the USB hardware in the PC (which is also called the USB @@ -739,7 +739,7 @@ config USB_RENESAS_USBHS_HCD config USB_WHCI_HCD tristate "Wireless USB Host Controller Interface (WHCI) driver" - depends on PCI && USB && UWB + depends on USB_PCI && USB && UWB select USB_WUSB select UWB_WHCI help diff --git a/drivers/usb/host/Makefile b/drivers/usb/host/Makefile index 2644537b7bcfb5cc8452243838815d3aa5669409..c77b0a38557b5bfcd6339302fc780ccdf9a14e35 100644 --- a/drivers/usb/host/Makefile +++ b/drivers/usb/host/Makefile @@ -27,9 +27,7 @@ endif obj-$(CONFIG_USB_WHCI_HCD) += whci/ -ifneq ($(CONFIG_USB), ) - obj-$(CONFIG_PCI) += pci-quirks.o -endif +obj-$(CONFIG_USB_PCI) += pci-quirks.o obj-$(CONFIG_USB_EHCI_HCD) += ehci-hcd.o obj-$(CONFIG_USB_EHCI_PCI) += ehci-pci.o diff --git a/drivers/usb/host/ehci-dbg.c b/drivers/usb/host/ehci-dbg.c index 1a2614aae42cabd8859f1ab327034d18c5f46c12..cbb9b8e12c3ce7e8f18cd8d0765dfc2dc34bd29b 100644 --- a/drivers/usb/host/ehci-dbg.c +++ b/drivers/usb/host/ehci-dbg.c @@ -803,7 +803,7 @@ static ssize_t fill_registers_buffer(struct debug_buffer *buf) size -= temp; next += temp; -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI /* EHCI 0.96 and later may have "extended capabilities" */ if (dev_is_pci(hcd->self.controller)) { struct pci_dev *pdev; diff --git a/drivers/usb/host/ehci-fsl.c b/drivers/usb/host/ehci-fsl.c index 3733aab46efece4fada516c9a590ed8050a467d1..4a08b70c81aa8f31b7baa1c1ea122416448a6c03 100644 --- a/drivers/usb/host/ehci-fsl.c +++ b/drivers/usb/host/ehci-fsl.c @@ -96,8 +96,8 @@ static int fsl_ehci_drv_probe(struct platform_device *pdev) } irq = res->start; - hcd = usb_create_hcd(&fsl_ehci_hc_driver, &pdev->dev, - dev_name(&pdev->dev)); + hcd = __usb_create_hcd(&fsl_ehci_hc_driver, pdev->dev.parent, + &pdev->dev, dev_name(&pdev->dev), NULL); if (!hcd) { retval = -ENOMEM; goto err1; diff --git a/drivers/usb/host/ehci-hcd.c b/drivers/usb/host/ehci-hcd.c index ac2c4eab478db7f91c8b0f705eeadd22cd818604..6e834b83a1040de0a59a9f4cd30a185f558e3fd2 100644 --- a/drivers/usb/host/ehci-hcd.c +++ b/drivers/usb/host/ehci-hcd.c @@ -597,7 +597,7 @@ static int ehci_run (struct usb_hcd *hcd) /* * hcc_params controls whether ehci->regs->segment must (!!!) * be used; it constrains QH/ITD/SITD and QTD locations. - * pci_pool consistent memory always uses segment zero. + * dma_pool consistent memory always uses segment zero. * streaming mappings for I/O buffers, like pci_map_single(), * can return segments above 4GB, if the device allows. 
* diff --git a/drivers/usb/host/ehci-mem.c b/drivers/usb/host/ehci-mem.c index 4de43011df23aa970472eda236edb1e334dad0a3..9b7e639772153b43e8f14a7d18df8094f9339662 100644 --- a/drivers/usb/host/ehci-mem.c +++ b/drivers/usb/host/ehci-mem.c @@ -138,7 +138,7 @@ static void ehci_mem_cleanup (struct ehci_hcd *ehci) ehci->sitd_pool = NULL; if (ehci->periodic) - dma_free_coherent (ehci_to_hcd(ehci)->self.controller, + dma_free_coherent(ehci_to_hcd(ehci)->self.sysdev, ehci->periodic_size * sizeof (u32), ehci->periodic, ehci->periodic_dma); ehci->periodic = NULL; @@ -155,7 +155,7 @@ static int ehci_mem_init (struct ehci_hcd *ehci, gfp_t flags) /* QTDs for control/bulk/intr transfers */ ehci->qtd_pool = dma_pool_create ("ehci_qtd", - ehci_to_hcd(ehci)->self.controller, + ehci_to_hcd(ehci)->self.sysdev, sizeof (struct ehci_qtd), 32 /* byte alignment (for hw parts) */, 4096 /* can't cross 4K */); @@ -165,7 +165,7 @@ static int ehci_mem_init (struct ehci_hcd *ehci, gfp_t flags) /* QHs for control/bulk/intr transfers */ ehci->qh_pool = dma_pool_create ("ehci_qh", - ehci_to_hcd(ehci)->self.controller, + ehci_to_hcd(ehci)->self.sysdev, sizeof(struct ehci_qh_hw), 32 /* byte alignment (for hw parts) */, 4096 /* can't cross 4K */); @@ -179,7 +179,7 @@ static int ehci_mem_init (struct ehci_hcd *ehci, gfp_t flags) /* ITD for high speed ISO transfers */ ehci->itd_pool = dma_pool_create ("ehci_itd", - ehci_to_hcd(ehci)->self.controller, + ehci_to_hcd(ehci)->self.sysdev, sizeof (struct ehci_itd), 32 /* byte alignment (for hw parts) */, 4096 /* can't cross 4K */); @@ -189,7 +189,7 @@ static int ehci_mem_init (struct ehci_hcd *ehci, gfp_t flags) /* SITD for full/low speed split ISO transfers */ ehci->sitd_pool = dma_pool_create ("ehci_sitd", - ehci_to_hcd(ehci)->self.controller, + ehci_to_hcd(ehci)->self.sysdev, sizeof (struct ehci_sitd), 32 /* byte alignment (for hw parts) */, 4096 /* can't cross 4K */); @@ -199,7 +199,7 @@ static int ehci_mem_init (struct ehci_hcd *ehci, gfp_t flags) /* Hardware periodic table */ ehci->periodic = (__le32 *) - dma_alloc_coherent (ehci_to_hcd(ehci)->self.controller, + dma_alloc_coherent(ehci_to_hcd(ehci)->self.sysdev, ehci->periodic_size * sizeof(__le32), &ehci->periodic_dma, flags); if (ehci->periodic == NULL) { diff --git a/drivers/usb/host/ehci-orion.c b/drivers/usb/host/ehci-orion.c index ee8d5faa01947cd6d2c8cf708273c0521762a797..1aec87ec68df6768090581ccac4c836dbee651dd 100644 --- a/drivers/usb/host/ehci-orion.c +++ b/drivers/usb/host/ehci-orion.c @@ -47,6 +47,18 @@ #define USB_PHY_IVREF_CTRL 0x440 #define USB_PHY_TST_GRP_CTRL 0x450 +#define USB_SBUSCFG 0x90 + +/* BAWR = BARD = 3 : Align read/write bursts packets larger than 128 bytes */ +#define USB_SBUSCFG_BAWR_ALIGN_128B (0x3 << 6) +#define USB_SBUSCFG_BARD_ALIGN_128B (0x3 << 3) +/* AHBBRST = 3 : Align AHB Burst to INCR16 (64 bytes) */ +#define USB_SBUSCFG_AHBBRST_INCR16 (0x3 << 0) + +#define USB_SBUSCFG_DEF_VAL (USB_SBUSCFG_BAWR_ALIGN_128B \ + | USB_SBUSCFG_BARD_ALIGN_128B \ + | USB_SBUSCFG_AHBBRST_INCR16) + #define DRIVER_DESC "EHCI orion driver" #define hcd_to_orion_priv(h) ((struct orion_ehci_hcd *)hcd_to_ehci(h)->priv) @@ -151,8 +163,31 @@ ehci_orion_conf_mbus_windows(struct usb_hcd *hcd, } } +static int ehci_orion_drv_reset(struct usb_hcd *hcd) +{ + struct device *dev = hcd->self.controller; + int ret; + + ret = ehci_setup(hcd); + if (ret) + return ret; + + /* + * For SoC without hlock, need to program sbuscfg value to guarantee + * AHB master's burst would not overrun or underrun FIFO. 
+ * + * sbuscfg reg has to be set after usb controller reset, otherwise + * the value would be overridden to 0. + */ + if (of_device_is_compatible(dev->of_node, "marvell,armada-3700-ehci")) + wrl(USB_SBUSCFG, USB_SBUSCFG_DEF_VAL); + + return ret; +} + static const struct ehci_driver_overrides orion_overrides __initconst = { .extra_priv_size = sizeof(struct orion_ehci_hcd), + .reset = ehci_orion_drv_reset, }; static int ehci_orion_drv_probe(struct platform_device *pdev) @@ -310,6 +345,7 @@ static int ehci_orion_drv_remove(struct platform_device *pdev) static const struct of_device_id ehci_orion_dt_ids[] = { { .compatible = "marvell,orion-ehci", }, + { .compatible = "marvell,armada-3700-ehci", }, {}, }; MODULE_DEVICE_TABLE(of, ehci_orion_dt_ids); diff --git a/drivers/usb/host/ehci-platform.c b/drivers/usb/host/ehci-platform.c index a268d9e8d6cfb17b214278e23c44267cf8c06e33..bc7b9be12f540cb22dc3a5e9ab93aa49f4c722ef 100644 --- a/drivers/usb/host/ehci-platform.c +++ b/drivers/usb/host/ehci-platform.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "ehci.h" @@ -220,6 +221,9 @@ static int ehci_platform_probe(struct platform_device *dev) if (IS_ERR(priv->phys[phy_num])) { err = PTR_ERR(priv->phys[phy_num]); goto err_put_hcd; + } else if (!hcd->phy) { + /* Avoiding phy_get() in usb_add_hcd() */ + hcd->phy = priv->phys[phy_num]; } } @@ -297,6 +301,7 @@ static int ehci_platform_probe(struct platform_device *dev) goto err_power; device_wakeup_enable(hcd->self.controller); + device_enable_async_suspend(hcd->self.controller); platform_set_drvdata(dev, hcd); return err; @@ -370,6 +375,7 @@ static int ehci_platform_resume(struct device *dev) struct usb_ehci_pdata *pdata = dev_get_platdata(dev); struct platform_device *pdev = to_platform_device(dev); struct ehci_platform_priv *priv = hcd_to_ehci_priv(hcd); + struct device *companion_dev; if (pdata->power_on) { int err = pdata->power_on(pdev); @@ -377,6 +383,10 @@ return err; } + companion_dev = usb_of_get_companion_dev(hcd->self.controller); + if (companion_dev) + device_pm_wait_for_dev(hcd->self.controller, companion_dev); + ehci_resume(hcd, priv->reset_on_resume); return 0; } diff --git a/drivers/usb/host/fotg210-hcd.c b/drivers/usb/host/fotg210-hcd.c index 1c5b34b74860f1c3aed374f2cfcd0668b1124bcf..ced08dc229ad02ef394816b657e7d5343b1d2e84 100644 --- a/drivers/usb/host/fotg210-hcd.c +++ b/drivers/usb/host/fotg210-hcd.c @@ -5047,7 +5047,7 @@ static int fotg210_run(struct usb_hcd *hcd) /* * hcc_params controls whether fotg210->regs->segment must (!!!) * be used; it constrains QH/ITD/SITD and QTD locations. - * pci_pool consistent memory always uses segment zero. + * dma_pool consistent memory always uses segment zero. * streaming mappings for I/O buffers, like pci_map_single(), * can return segments above 4GB, if the device allows.
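The new .reset override in ehci-orion only takes effect once the overrides are folded into the hc_driver. A sketch of the usual wiring, assuming the generic ehci_init_driver() helper and the file's existing ehci_orion_driver platform driver:

static struct hc_driver __read_mostly ehci_orion_hc_driver;

static int __init ehci_orion_init(void)
{
	if (usb_disabled())
		return -ENODEV;

	/* Copy the generic EHCI ops, then apply the per-SoC overrides,
	 * including the ehci_orion_drv_reset() hook added above. */
	ehci_init_driver(&ehci_orion_hc_driver, &orion_overrides);
	return platform_driver_register(&ehci_orion_driver);
}
module_init(ehci_orion_init);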
* diff --git a/drivers/usb/host/ohci-hcd.c b/drivers/usb/host/ohci-hcd.c index b6daf2e6998936021e590a444633dbe28e100878..44924824fa414a28dfbf10a715b658e6b493a462 100644 --- a/drivers/usb/host/ohci-hcd.c +++ b/drivers/usb/host/ohci-hcd.c @@ -231,7 +231,8 @@ static int ohci_urb_enqueue ( /* Start up the I/O watchdog timer, if it's not running */ if (!timer_pending(&ohci->io_watchdog) && - list_empty(&ohci->eds_in_use)) { + list_empty(&ohci->eds_in_use) && + !(ohci->flags & OHCI_QUIRK_QEMU)) { ohci->prev_frame_no = ohci_frame_no(ohci); mod_timer(&ohci->io_watchdog, jiffies + IO_WATCHDOG_DELAY); @@ -994,7 +995,7 @@ static void ohci_stop (struct usb_hcd *hcd) /*-------------------------------------------------------------------------*/ -#if defined(CONFIG_PM) || defined(CONFIG_PCI) +#if defined(CONFIG_PM) || defined(CONFIG_USB_PCI) /* must not be called from interrupt context */ int ohci_restart(struct ohci_hcd *ohci) diff --git a/drivers/usb/host/ohci-pci.c b/drivers/usb/host/ohci-pci.c index bb1509675727b374586d61917920578cc7631a45..a84aebe9b0a97880ef1f83a17b483e7586f190b1 100644 --- a/drivers/usb/host/ohci-pci.c +++ b/drivers/usb/host/ohci-pci.c @@ -164,6 +164,15 @@ static int ohci_quirk_amd700(struct usb_hcd *hcd) return 0; } +static int ohci_quirk_qemu(struct usb_hcd *hcd) +{ + struct ohci_hcd *ohci = hcd_to_ohci(hcd); + + ohci->flags |= OHCI_QUIRK_QEMU; + ohci_dbg(ohci, "enabled qemu quirk\n"); + return 0; +} + /* List of quirks for OHCI */ static const struct pci_device_id ohci_pci_quirks[] = { { @@ -214,6 +223,13 @@ static const struct pci_device_id ohci_pci_quirks[] = { PCI_DEVICE(PCI_VENDOR_ID_ATI, 0x4399), .driver_data = (unsigned long)ohci_quirk_amd700, }, + { + .vendor = PCI_VENDOR_ID_APPLE, + .device = 0x003f, + .subvendor = PCI_SUBVENDOR_ID_REDHAT_QUMRANET, + .subdevice = PCI_SUBDEVICE_ID_QEMU, + .driver_data = (unsigned long)ohci_quirk_qemu, + }, /* FIXME for some of the early AMD 760 southbridges, OHCI * won't work at all. blacklist them. 
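For context on the QEMU entry added above: the ohci_pci_quirks table is scanned once at reset time, and a match runs the init function stored in driver_data. A condensed sketch of that lookup, assuming the standard pci_match_id() helper:

static int ohci_pci_apply_quirk(struct usb_hcd *hcd)
{
	struct pci_dev *pdev = to_pci_dev(hcd->self.controller);
	const struct pci_device_id *id;

	id = pci_match_id(ohci_pci_quirks, pdev);
	if (id && id->driver_data) {
		int (*quirk)(struct usb_hcd *);

		/* e.g. ohci_quirk_qemu() sets OHCI_QUIRK_QEMU, which the
		 * ohci-hcd.c hunk above uses to skip the I/O watchdog. */
		quirk = (int (*)(struct usb_hcd *))id->driver_data;
		return quirk(hcd);
	}
	return 0;
}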
diff --git a/drivers/usb/host/ohci-platform.c b/drivers/usb/host/ohci-platform.c index 898b74086c1292d1c63110795557155962adc1ba..6368fce43197c9bdfe80d5fd5718eedbea50e9fb 100644 --- a/drivers/usb/host/ohci-platform.c +++ b/drivers/usb/host/ohci-platform.c @@ -183,6 +183,9 @@ static int ohci_platform_probe(struct platform_device *dev) if (IS_ERR(priv->phys[phy_num])) { err = PTR_ERR(priv->phys[phy_num]); goto err_put_hcd; + } else if (!hcd->phy) { + /* Avoiding phy_get() in usb_add_hcd() */ + hcd->phy = priv->phys[phy_num]; } } diff --git a/drivers/usb/host/ohci.h b/drivers/usb/host/ohci.h index 37f1725e7a469c45fa035a46ed9827a5b92bc6c7..12742d002d2dc421c61dc25352ff687247b1b3cf 100644 --- a/drivers/usb/host/ohci.h +++ b/drivers/usb/host/ohci.h @@ -418,6 +418,7 @@ struct ohci_hcd { #define OHCI_QUIRK_AMD_PLL 0x200 /* AMD PLL quirk*/ #define OHCI_QUIRK_AMD_PREFETCH 0x400 /* pre-fetch for ISO transfer */ #define OHCI_QUIRK_GLOBAL_SUSPEND 0x800 /* must suspend ports */ +#define OHCI_QUIRK_QEMU 0x1000 /* relax timing expectations */ // there are also chip quirks/bugs in init logic @@ -438,7 +439,7 @@ struct ohci_hcd { }; -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI static inline int quirk_nec(struct ohci_hcd *ohci) { return ohci->flags & OHCI_QUIRK_NEC; diff --git a/drivers/usb/host/oxu210hp-hcd.c b/drivers/usb/host/oxu210hp-hcd.c index bcf531c44c70cf220c9ab268cdbed1b13144c91e..ed20fb34c897f5ce0fe4f004f1dd35ebb9c35e36 100644 --- a/drivers/usb/host/oxu210hp-hcd.c +++ b/drivers/usb/host/oxu210hp-hcd.c @@ -2708,7 +2708,7 @@ static int oxu_run(struct usb_hcd *hcd) /* hcc_params controls whether oxu->regs->segment must (!!!) * be used; it constrains QH/ITD/SITD and QTD locations. - * pci_pool consistent memory always uses segment zero. + * dma_pool consistent memory always uses segment zero. * streaming mappings for I/O buffers, like pci_map_single(), * can return segments above 4GB, if the device allows. 
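The ohci-platform hunk above (and its ehci-platform counterpart) hands the first PHY the glue driver acquired to the core, so usb_add_hcd() skips its own phy_get() and the PHY is not claimed twice. The same pattern in isolation, for a hypothetical single-PHY driver (the "usb" lookup name is illustrative):

	priv->phys[0] = devm_phy_get(&pdev->dev, "usb");
	if (IS_ERR(priv->phys[0]))
		return PTR_ERR(priv->phys[0]);
	if (!hcd->phy)
		hcd->phy = priv->phys[0];	/* core will skip phy_get() */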
* diff --git a/drivers/usb/host/pci-quirks.h b/drivers/usb/host/pci-quirks.h index c622ddf21c94e61f6c8089cdc493088f33973599..0222195bd5b0e2a954ec8d2e841720b577c7282a 100644 --- a/drivers/usb/host/pci-quirks.h +++ b/drivers/usb/host/pci-quirks.h @@ -1,7 +1,7 @@ #ifndef __LINUX_USB_PCI_QUIRKS_H #define __LINUX_USB_PCI_QUIRKS_H -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI void uhci_reset_hc(struct pci_dev *pdev, unsigned long base); int uhci_check_and_reset_hc(struct pci_dev *pdev, unsigned long base); int usb_amd_find_chipset_info(void); @@ -21,6 +21,6 @@ static inline void usb_amd_quirk_pll_enable(void) {} static inline void usb_amd_dev_put(void) {} static inline void usb_disable_xhci_ports(struct pci_dev *xhci_pdev) {} static inline void sb800_prefetch(struct device *dev, int on) {} -#endif /* CONFIG_PCI */ +#endif /* CONFIG_USB_PCI */ #endif /* __LINUX_USB_PCI_QUIRKS_H */ diff --git a/drivers/usb/host/uhci-hcd.c b/drivers/usb/host/uhci-hcd.c index 683098afa93ea150ad9888875b6e11ad2d5d33be..94b150196d4f5d575ba5775aae1a29f112df0f55 100644 --- a/drivers/usb/host/uhci-hcd.c +++ b/drivers/usb/host/uhci-hcd.c @@ -837,7 +837,7 @@ static int uhci_count_ports(struct usb_hcd *hcd) static const char hcd_name[] = "uhci_hcd"; -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI #include "uhci-pci.c" #define PCI_DRIVER uhci_pci_driver #endif diff --git a/drivers/usb/host/uhci-hcd.h b/drivers/usb/host/uhci-hcd.h index 6f986d82472d7daf2fad8ff14bac8b201d3391fb..7fa318a3091d6556a1e9bb46fa3eda0f4ba39625 100644 --- a/drivers/usb/host/uhci-hcd.h +++ b/drivers/usb/host/uhci-hcd.h @@ -530,7 +530,7 @@ static inline void uhci_writeb(const struct uhci_hcd *uhci, u8 val, int reg) #else /* Support non-PCI host controllers */ -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI /* Support PCI and non-PCI host controllers */ #define uhci_has_pci_registers(u) ((u)->io_addr != 0) #else diff --git a/drivers/usb/host/xhci-dbg.c b/drivers/usb/host/xhci-dbg.c index 2b4a00fa735dfef5c51b4a9f583e5783e87ad957..2c83b37ae8f26168c1f09b5471285241c12f41f4 100644 --- a/drivers/usb/host/xhci-dbg.c +++ b/drivers/usb/host/xhci-dbg.c @@ -254,157 +254,6 @@ void xhci_print_registers(struct xhci_hcd *xhci) xhci_print_ports(xhci); } -void xhci_print_trb_offsets(struct xhci_hcd *xhci, union xhci_trb *trb) -{ - int i; - for (i = 0; i < 4; i++) - xhci_dbg(xhci, "Offset 0x%x = 0x%x\n", - i*4, trb->generic.field[i]); -} - -/** - * Debug a transfer request block (TRB). - */ -void xhci_debug_trb(struct xhci_hcd *xhci, union xhci_trb *trb) -{ - u64 address; - u32 type = le32_to_cpu(trb->link.control) & TRB_TYPE_BITMASK; - - switch (type) { - case TRB_TYPE(TRB_LINK): - xhci_dbg(xhci, "Link TRB:\n"); - xhci_print_trb_offsets(xhci, trb); - - address = le64_to_cpu(trb->link.segment_ptr); - xhci_dbg(xhci, "Next ring segment DMA address = 0x%llx\n", address); - - xhci_dbg(xhci, "Interrupter target = 0x%x\n", - GET_INTR_TARGET(le32_to_cpu(trb->link.intr_target))); - xhci_dbg(xhci, "Cycle bit = %u\n", - le32_to_cpu(trb->link.control) & TRB_CYCLE); - xhci_dbg(xhci, "Toggle cycle bit = %u\n", - le32_to_cpu(trb->link.control) & LINK_TOGGLE); - xhci_dbg(xhci, "No Snoop bit = %u\n", - le32_to_cpu(trb->link.control) & TRB_NO_SNOOP); - break; - case TRB_TYPE(TRB_TRANSFER): - address = le64_to_cpu(trb->trans_event.buffer); - /* - * FIXME: look at flags to figure out if it's an address or if - * the data is directly in the buffer field. 
- */ - xhci_dbg(xhci, "DMA address or buffer contents= %llu\n", address); - break; - case TRB_TYPE(TRB_COMPLETION): - address = le64_to_cpu(trb->event_cmd.cmd_trb); - xhci_dbg(xhci, "Command TRB pointer = %llu\n", address); - xhci_dbg(xhci, "Completion status = %u\n", - GET_COMP_CODE(le32_to_cpu(trb->event_cmd.status))); - xhci_dbg(xhci, "Flags = 0x%x\n", - le32_to_cpu(trb->event_cmd.flags)); - break; - default: - xhci_dbg(xhci, "Unknown TRB with TRB type ID %u\n", - (unsigned int) type>>10); - xhci_print_trb_offsets(xhci, trb); - break; - } -} - -/** - * Debug a segment with an xHCI ring. - * - * @return The Link TRB of the segment, or NULL if there is no Link TRB - * (which is a bug, since all segments must have a Link TRB). - * - * Prints out all TRBs in the segment, even those after the Link TRB. - * - * XXX: should we print out TRBs that the HC owns? As long as we don't - * write, that should be fine... We shouldn't expect that the memory pointed to - * by the TRB is valid at all. Do we care about ones the HC owns? Probably, - * for HC debugging. - */ -void xhci_debug_segment(struct xhci_hcd *xhci, struct xhci_segment *seg) -{ - int i; - u64 addr = seg->dma; - union xhci_trb *trb = seg->trbs; - - for (i = 0; i < TRBS_PER_SEGMENT; i++) { - trb = &seg->trbs[i]; - xhci_dbg(xhci, "@%016llx %08x %08x %08x %08x\n", addr, - lower_32_bits(le64_to_cpu(trb->link.segment_ptr)), - upper_32_bits(le64_to_cpu(trb->link.segment_ptr)), - le32_to_cpu(trb->link.intr_target), - le32_to_cpu(trb->link.control)); - addr += sizeof(*trb); - } -} - -void xhci_dbg_ring_ptrs(struct xhci_hcd *xhci, struct xhci_ring *ring) -{ - xhci_dbg(xhci, "Ring deq = %p (virt), 0x%llx (dma)\n", - ring->dequeue, - (unsigned long long)xhci_trb_virt_to_dma(ring->deq_seg, - ring->dequeue)); - xhci_dbg(xhci, "Ring deq updated %u times\n", - ring->deq_updates); - xhci_dbg(xhci, "Ring enq = %p (virt), 0x%llx (dma)\n", - ring->enqueue, - (unsigned long long)xhci_trb_virt_to_dma(ring->enq_seg, - ring->enqueue)); - xhci_dbg(xhci, "Ring enq updated %u times\n", - ring->enq_updates); -} - -/** - * Debugging for an xHCI ring, which is a queue broken into multiple segments. - * - * Print out each segment in the ring. Check that the DMA address in - * each link segment actually matches the segment's stored DMA address. - * Check that the link end bit is only set at the end of the ring. - * Check that the dequeue and enqueue pointers point to real data in this ring - * (not some other ring). 
- */ -void xhci_debug_ring(struct xhci_hcd *xhci, struct xhci_ring *ring) -{ - /* FIXME: Throw an error if any segment doesn't have a Link TRB */ - struct xhci_segment *seg; - struct xhci_segment *first_seg = ring->first_seg; - xhci_debug_segment(xhci, first_seg); - - if (!ring->enq_updates && !ring->deq_updates) { - xhci_dbg(xhci, " Ring has not been updated\n"); - return; - } - for (seg = first_seg->next; seg != first_seg; seg = seg->next) - xhci_debug_segment(xhci, seg); -} - -void xhci_dbg_ep_rings(struct xhci_hcd *xhci, - unsigned int slot_id, unsigned int ep_index, - struct xhci_virt_ep *ep) -{ - int i; - struct xhci_ring *ring; - - if (ep->ep_state & EP_HAS_STREAMS) { - for (i = 1; i < ep->stream_info->num_streams; i++) { - ring = ep->stream_info->stream_rings[i]; - xhci_dbg(xhci, "Dev %d endpoint %d stream ID %d:\n", - slot_id, ep_index, i); - xhci_debug_segment(xhci, ring->deq_seg); - } - } else { - ring = ep->ring; - if (!ring) - return; - xhci_dbg(xhci, "Dev %d endpoint ring %d:\n", - slot_id, ep_index); - xhci_debug_segment(xhci, ring->deq_seg); - } -} - void xhci_dbg_erst(struct xhci_hcd *xhci, struct xhci_erst *erst) { u64 addr = erst->erst_dma_addr; @@ -434,166 +283,13 @@ void xhci_dbg_cmd_ptrs(struct xhci_hcd *xhci) upper_32_bits(val)); } -/* Print the last 32 bytes for 64-byte contexts */ -static void dbg_rsvd64(struct xhci_hcd *xhci, u64 *ctx, dma_addr_t dma) -{ - int i; - for (i = 0; i < 4; i++) { - xhci_dbg(xhci, "@%p (virt) @%08llx " - "(dma) %#08llx - rsvd64[%d]\n", - &ctx[4 + i], (unsigned long long)dma, - ctx[4 + i], i); - dma += 8; - } -} - char *xhci_get_slot_state(struct xhci_hcd *xhci, struct xhci_container_ctx *ctx) { struct xhci_slot_ctx *slot_ctx = xhci_get_slot_ctx(xhci, ctx); + int state = GET_SLOT_STATE(le32_to_cpu(slot_ctx->dev_state)); - switch (GET_SLOT_STATE(le32_to_cpu(slot_ctx->dev_state))) { - case SLOT_STATE_ENABLED: - return "enabled/disabled"; - case SLOT_STATE_DEFAULT: - return "default"; - case SLOT_STATE_ADDRESSED: - return "addressed"; - case SLOT_STATE_CONFIGURED: - return "configured"; - default: - return "reserved"; - } -} - -static void xhci_dbg_slot_ctx(struct xhci_hcd *xhci, struct xhci_container_ctx *ctx) -{ - /* Fields are 32 bits wide, DMA addresses are in bytes */ - int field_size = 32 / 8; - int i; - - struct xhci_slot_ctx *slot_ctx = xhci_get_slot_ctx(xhci, ctx); - dma_addr_t dma = ctx->dma + - ((unsigned long)slot_ctx - (unsigned long)ctx->bytes); - int csz = HCC_64BYTE_CONTEXT(xhci->hcc_params); - - xhci_dbg(xhci, "Slot Context:\n"); - xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - dev_info\n", - &slot_ctx->dev_info, - (unsigned long long)dma, slot_ctx->dev_info); - dma += field_size; - xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - dev_info2\n", - &slot_ctx->dev_info2, - (unsigned long long)dma, slot_ctx->dev_info2); - dma += field_size; - xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - tt_info\n", - &slot_ctx->tt_info, - (unsigned long long)dma, slot_ctx->tt_info); - dma += field_size; - xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - dev_state\n", - &slot_ctx->dev_state, - (unsigned long long)dma, slot_ctx->dev_state); - dma += field_size; - for (i = 0; i < 4; i++) { - xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - rsvd[%d]\n", - &slot_ctx->reserved[i], (unsigned long long)dma, - slot_ctx->reserved[i], i); - dma += field_size; - } - - if (csz) - dbg_rsvd64(xhci, (u64 *)slot_ctx, dma); -} - -static void xhci_dbg_ep_ctx(struct xhci_hcd *xhci, - struct xhci_container_ctx *ctx, - unsigned int last_ep) -{ - int i, j; - int 
last_ep_ctx = 31; - /* Fields are 32 bits wide, DMA addresses are in bytes */ - int field_size = 32 / 8; - int csz = HCC_64BYTE_CONTEXT(xhci->hcc_params); - - if (last_ep < 31) - last_ep_ctx = last_ep + 1; - for (i = 0; i < last_ep_ctx; i++) { - unsigned int epaddr = xhci_get_endpoint_address(i); - struct xhci_ep_ctx *ep_ctx = xhci_get_ep_ctx(xhci, ctx, i); - dma_addr_t dma = ctx->dma + - ((unsigned long)ep_ctx - (unsigned long)ctx->bytes); - - xhci_dbg(xhci, "%s Endpoint %02d Context (ep_index %02d):\n", - usb_endpoint_out(epaddr) ? "OUT" : "IN", - epaddr & USB_ENDPOINT_NUMBER_MASK, i); - xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - ep_info\n", - &ep_ctx->ep_info, - (unsigned long long)dma, ep_ctx->ep_info); - dma += field_size; - xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - ep_info2\n", - &ep_ctx->ep_info2, - (unsigned long long)dma, ep_ctx->ep_info2); - dma += field_size; - xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08llx - deq\n", - &ep_ctx->deq, - (unsigned long long)dma, ep_ctx->deq); - dma += 2*field_size; - xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - tx_info\n", - &ep_ctx->tx_info, - (unsigned long long)dma, ep_ctx->tx_info); - dma += field_size; - for (j = 0; j < 3; j++) { - xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - rsvd[%d]\n", - &ep_ctx->reserved[j], - (unsigned long long)dma, - ep_ctx->reserved[j], j); - dma += field_size; - } - - if (csz) - dbg_rsvd64(xhci, (u64 *)ep_ctx, dma); - } -} - -void xhci_dbg_ctx(struct xhci_hcd *xhci, - struct xhci_container_ctx *ctx, - unsigned int last_ep) -{ - int i; - /* Fields are 32 bits wide, DMA addresses are in bytes */ - int field_size = 32 / 8; - dma_addr_t dma = ctx->dma; - int csz = HCC_64BYTE_CONTEXT(xhci->hcc_params); - - if (ctx->type == XHCI_CTX_TYPE_INPUT) { - struct xhci_input_control_ctx *ctrl_ctx = - xhci_get_input_control_ctx(ctx); - if (!ctrl_ctx) { - xhci_warn(xhci, "Could not get input context, bad type.\n"); - return; - } - - xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - drop flags\n", - &ctrl_ctx->drop_flags, (unsigned long long)dma, - ctrl_ctx->drop_flags); - dma += field_size; - xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - add flags\n", - &ctrl_ctx->add_flags, (unsigned long long)dma, - ctrl_ctx->add_flags); - dma += field_size; - for (i = 0; i < 6; i++) { - xhci_dbg(xhci, "@%p (virt) @%08llx (dma) %#08x - rsvd2[%d]\n", - &ctrl_ctx->rsvd2[i], (unsigned long long)dma, - ctrl_ctx->rsvd2[i], i); - dma += field_size; - } - - if (csz) - dbg_rsvd64(xhci, (u64 *)ctrl_ctx, dma); - } - - xhci_dbg_slot_ctx(xhci, ctx); - xhci_dbg_ep_ctx(xhci, ctx, last_ep); + return xhci_slot_state_string(state); } void xhci_dbg_trace(struct xhci_hcd *xhci, void (*trace)(struct va_format *), diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index 3bddeaa1e2d768feebb99c5e091de947562b799c..5e3e9d4c6956bab71a815a86ef8e658cce12b48e 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -392,10 +392,8 @@ static int xhci_stop_device(struct xhci_hcd *xhci, int slot_id, int suspend) trace_xhci_stop_device(virt_dev); cmd = xhci_alloc_command(xhci, false, true, GFP_NOIO); - if (!cmd) { - xhci_dbg(xhci, "Couldn't allocate command structure.\n"); + if (!cmd) return -ENOMEM; - } spin_lock_irqsave(&xhci->lock, flags); for (i = LAST_EP_INDEX; i > 0; i--) { @@ -540,6 +538,119 @@ static int xhci_get_ports(struct usb_hcd *hcd, __le32 __iomem ***port_array) return max_ports; } +static __le32 __iomem *xhci_get_port_io_addr(struct usb_hcd *hcd, int index) +{ + __le32 __iomem **port_array; + + 
xhci_get_ports(hcd, &port_array); + return port_array[index]; +} + +/* + * xhci_set_port_power() must be called with xhci->lock held. + * It will release and re-acquire the lock while calling ACPI + * method. + */ +static void xhci_set_port_power(struct xhci_hcd *xhci, struct usb_hcd *hcd, + u16 index, bool on, unsigned long *flags) +{ + __le32 __iomem *addr; + u32 temp; + + addr = xhci_get_port_io_addr(hcd, index); + temp = readl(addr); + temp = xhci_port_state_to_neutral(temp); + if (on) { + /* Power on */ + writel(temp | PORT_POWER, addr); + temp = readl(addr); + xhci_dbg(xhci, "set port power, actual port %d status = 0x%x\n", + index, temp); + } else { + /* Power off */ + writel(temp & ~PORT_POWER, addr); + } + + spin_unlock_irqrestore(&xhci->lock, *flags); + temp = usb_acpi_power_manageable(hcd->self.root_hub, + index); + if (temp) + usb_acpi_set_power_state(hcd->self.root_hub, + index, on); + spin_lock_irqsave(&xhci->lock, *flags); +} + +static void xhci_port_set_test_mode(struct xhci_hcd *xhci, + u16 test_mode, u16 wIndex) +{ + u32 temp; + __le32 __iomem *addr; + + /* xhci only supports test mode for usb2 ports, i.e. xhci->main_hcd */ + addr = xhci_get_port_io_addr(xhci->main_hcd, wIndex); + temp = readl(addr + PORTPMSC); + temp |= test_mode << PORT_TEST_MODE_SHIFT; + writel(temp, addr + PORTPMSC); + xhci->test_mode = test_mode; + if (test_mode == TEST_FORCE_EN) + xhci_start(xhci); +} + +static int xhci_enter_test_mode(struct xhci_hcd *xhci, + u16 test_mode, u16 wIndex, unsigned long *flags) +{ + int i, retval; + + /* Disable all Device Slots */ + xhci_dbg(xhci, "Disable all slots\n"); + for (i = 1; i <= HCS_MAX_SLOTS(xhci->hcs_params1); i++) { + retval = xhci_disable_slot(xhci, NULL, i); + if (retval) + xhci_err(xhci, "Failed to disable slot %d, %d. Enter test mode anyway\n", + i, retval); + } + /* Put all ports to the Disable state by clearing PP */ + xhci_dbg(xhci, "Disable all port (PP = 0)\n"); + /* Power off USB3 ports */ + for (i = 0; i < xhci->num_usb3_ports; i++) + xhci_set_port_power(xhci, xhci->shared_hcd, i, false, flags); + /* Power off USB2 ports */ + for (i = 0; i < xhci->num_usb2_ports; i++) + xhci_set_port_power(xhci, xhci->main_hcd, i, false, flags); + /* Stop the controller */ + xhci_dbg(xhci, "Stop controller\n"); + retval = xhci_halt(xhci); + if (retval) + return retval; + /* Disable runtime PM for test mode */ + pm_runtime_forbid(xhci_to_hcd(xhci)->self.controller); + /* Set PORTPMSC.PTC field to enter selected test mode */ + /* Port is selected by wIndex.
port_id = wIndex + 1 */ + xhci_dbg(xhci, "Enter Test Mode: %d, Port_id=%d\n", + test_mode, wIndex + 1); + xhci_port_set_test_mode(xhci, test_mode, wIndex); + return retval; +} + +static int xhci_exit_test_mode(struct xhci_hcd *xhci) +{ + int retval; + + if (!xhci->test_mode) { + xhci_err(xhci, "Not in test mode, do nothing.\n"); + return 0; + } + if (xhci->test_mode == TEST_FORCE_EN && + !(xhci->xhc_state & XHCI_STATE_HALTED)) { + retval = xhci_halt(xhci); + if (retval) + return retval; + } + pm_runtime_allow(xhci_to_hcd(xhci)->self.controller); + xhci->test_mode = 0; + return xhci_reset(xhci); +} + void xhci_set_link_state(struct xhci_hcd *xhci, __le32 __iomem **port_array, int port_id, u32 link_state) { @@ -895,6 +1006,7 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 link_state = 0; u16 wake_mask = 0; u16 timeout = 0; + u16 test_mode = 0; max_ports = xhci_get_ports(hcd, &port_array); bus_state = &xhci->bus_state[hcd_index(hcd)]; @@ -935,7 +1047,8 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, goto error; wIndex--; temp = readl(port_array[wIndex]); - if (temp == 0xffffffff) { + if (temp == ~(u32)0) { + xhci_hc_died(xhci); retval = -ENODEV; break; } @@ -968,6 +1081,8 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, link_state = (wIndex & 0xff00) >> 3; if (wValue == USB_PORT_FEAT_REMOTE_WAKE_MASK) wake_mask = wIndex & 0xff00; + if (wValue == USB_PORT_FEAT_TEST) + test_mode = (wIndex & 0xff00) >> 8; /* The MSB of wIndex is the U1/U2 timeout */ timeout = (wIndex & 0xff00) >> 8; wIndex &= 0xff; @@ -975,7 +1090,8 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, goto error; wIndex--; temp = readl(port_array[wIndex]); - if (temp == 0xffffffff) { + if (temp == ~(u32)0) { + xhci_hc_died(xhci); retval = -ENODEV; break; } @@ -1092,18 +1208,7 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, * However, hub_wq will ignore the roothub events until * the roothub is registered. 
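The USB_PORT_FEAT_TEST handling above relies on the standard hub encoding of SetPortFeature(PORT_TEST): the test selector rides in the high byte of wIndex and the 1-based port number in the low byte, which is exactly what the new test_mode parsing extracts. A hedged sketch of issuing such a request to a root hub from kernel code, where udev (the root-hub usb_device) and port are assumed from context:

	int ret;

	/* Request Test_J (selector 1) on the given root-hub port. */
	ret = usb_control_msg(udev, usb_sndctrlpipe(udev, 0),
			      USB_REQ_SET_FEATURE,
			      USB_TYPE_CLASS | USB_RECIP_OTHER,
			      USB_PORT_FEAT_TEST,
			      (1 << 8) | port,	/* selector | port number */
			      NULL, 0, USB_CTRL_SET_TIMEOUT);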
*/ - writel(temp | PORT_POWER, port_array[wIndex]); - - temp = readl(port_array[wIndex]); - xhci_dbg(xhci, "set port power, actual port %d status = 0x%x\n", wIndex, temp); - - spin_unlock_irqrestore(&xhci->lock, flags); - temp = usb_acpi_power_manageable(hcd->self.root_hub, - wIndex); - if (temp) - usb_acpi_set_power_state(hcd->self.root_hub, - wIndex, true); - spin_lock_irqsave(&xhci->lock, flags); + xhci_set_port_power(xhci, hcd, wIndex, true, &flags); break; case USB_PORT_FEAT_RESET: temp = (temp | PORT_RESET); @@ -1142,6 +1247,15 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, temp |= PORT_U2_TIMEOUT(timeout); writel(temp, port_array[wIndex] + PORTPMSC); break; + case USB_PORT_FEAT_TEST: + /* 4.19.6 Port Test Modes (USB2 Test Mode) */ + if (hcd->speed != HCD_USB2) + goto error; + if (test_mode > TEST_FORCE_EN || test_mode < TEST_J) + goto error; + retval = xhci_enter_test_mode(xhci, test_mode, wIndex, + &flags); + break; default: goto error; } @@ -1153,7 +1267,8 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, goto error; wIndex--; temp = readl(port_array[wIndex]); - if (temp == 0xffffffff) { + if (temp == ~(u32)0) { + xhci_hc_died(xhci); retval = -ENODEV; break; } @@ -1207,15 +1322,10 @@ int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, port_array[wIndex], temp); break; case USB_PORT_FEAT_POWER: - writel(temp & ~PORT_POWER, port_array[wIndex]); - - spin_unlock_irqrestore(&xhci->lock, flags); - temp = usb_acpi_power_manageable(hcd->self.root_hub, - wIndex); - if (temp) - usb_acpi_set_power_state(hcd->self.root_hub, - wIndex, false); - spin_lock_irqsave(&xhci->lock, flags); + xhci_set_port_power(xhci, hcd, wIndex, false, &flags); + break; + case USB_PORT_FEAT_TEST: + retval = xhci_exit_test_mode(xhci); break; default: goto error; @@ -1269,7 +1379,8 @@ int xhci_hub_status_data(struct usb_hcd *hcd, char *buf) /* For each port, did anything change? If so, set that bit in buf. */ for (i = 0; i < max_ports; i++) { temp = readl(port_array[i]); - if (temp == 0xffffffff) { + if (temp == ~(u32)0) { + xhci_hc_died(xhci); retval = -ENODEV; break; } diff --git a/drivers/usb/host/xhci-mem.c b/drivers/usb/host/xhci-mem.c index ba1853f4e407a9558af25e6573ced5a7ec6d01f6..bbe22bcc550a7f2641c8be1f379510c2ba0d9162 100644 --- a/drivers/usb/host/xhci-mem.c +++ b/drivers/usb/host/xhci-mem.c @@ -288,6 +288,8 @@ void xhci_ring_free(struct xhci_hcd *xhci, struct xhci_ring *ring) if (!ring) return; + trace_xhci_ring_free(ring); + if (ring->first_seg) { if (ring->type == TYPE_STREAM) xhci_remove_stream_mapping(ring); @@ -313,9 +315,6 @@ static void xhci_initialize_ring_info(struct xhci_ring *ring, * handling ring expansion, set the cycle state equal to the old ring. 
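Note the unusual unsigned long *flags parameter of xhci_set_port_power(): the function drops xhci->lock around the ACPI call and re-takes it before returning, so it must update the caller's saved IRQ flags in place. The calling pattern used by xhci_hub_control() above, sketched on its own:

	unsigned long flags;

	spin_lock_irqsave(&xhci->lock, flags);
	/* may release and re-acquire xhci->lock internally */
	xhci_set_port_power(xhci, hcd, wIndex, true, &flags);
	spin_unlock_irqrestore(&xhci->lock, flags);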
*/ ring->cycle_state = cycle_state; - /* Not necessary for new rings, but needed for re-initialized rings */ - ring->enq_updates = 0; - ring->deq_updates = 0; /* * Each segment has a link TRB, and leave an extra TRB for SW @@ -400,6 +399,7 @@ static struct xhci_ring *xhci_ring_alloc(struct xhci_hcd *xhci, cpu_to_le32(LINK_TOGGLE); } xhci_initialize_ring_info(ring, cycle_state); + trace_xhci_ring_alloc(ring); return ring; fail: @@ -504,6 +504,7 @@ int xhci_ring_expansion(struct xhci_hcd *xhci, struct xhci_ring *ring, } xhci_link_rings(xhci, ring, first, last, num_segs); + trace_xhci_ring_expansion(ring); xhci_dbg_trace(xhci, trace_xhci_dbg_ring_expansion, "ring expansion succeed, now has %d segments", ring->num_segs); @@ -586,7 +587,7 @@ static void xhci_free_stream_ctx(struct xhci_hcd *xhci, unsigned int num_stream_ctxs, struct xhci_stream_ctx *stream_ctx, dma_addr_t dma) { - struct device *dev = xhci_to_hcd(xhci)->self.controller; + struct device *dev = xhci_to_hcd(xhci)->self.sysdev; size_t size = sizeof(struct xhci_stream_ctx) * num_stream_ctxs; if (size > MEDIUM_STREAM_ARRAY_SIZE) @@ -614,7 +615,7 @@ static struct xhci_stream_ctx *xhci_alloc_stream_ctx(struct xhci_hcd *xhci, unsigned int num_stream_ctxs, dma_addr_t *dma, gfp_t mem_flags) { - struct device *dev = xhci_to_hcd(xhci)->self.controller; + struct device *dev = xhci_to_hcd(xhci)->self.sysdev; size_t size = sizeof(struct xhci_stream_ctx) * num_stream_ctxs; if (size > MEDIUM_STREAM_ARRAY_SIZE) @@ -1502,6 +1503,17 @@ int xhci_endpoint_init(struct xhci_hcd *xhci, */ max_esit_payload = xhci_get_max_esit_payload(udev, ep); interval = xhci_get_endpoint_interval(udev, ep); + + /* Periodic endpoint bInterval limit quirk */ + if (usb_endpoint_xfer_int(&ep->desc) || + usb_endpoint_xfer_isoc(&ep->desc)) { + if ((xhci->quirks & XHCI_LIMIT_ENDPOINT_INTERVAL_7) && + udev->speed >= USB_SPEED_HIGH && + interval >= 7) { + interval = 6; + } + } + mult = xhci_get_endpoint_mult(udev, ep); max_packet = usb_endpoint_maxp(&ep->desc); max_burst = xhci_get_endpoint_max_burst(udev, ep); @@ -1686,7 +1698,7 @@ void xhci_slot_copy(struct xhci_hcd *xhci, static int scratchpad_alloc(struct xhci_hcd *xhci, gfp_t flags) { int i; - struct device *dev = xhci_to_hcd(xhci)->self.controller; + struct device *dev = xhci_to_hcd(xhci)->self.sysdev; int num_sp = HCS_MAX_SCRATCHPAD(xhci->hcs_params2); xhci_dbg_trace(xhci, trace_xhci_dbg_init, @@ -1709,36 +1721,27 @@ static int scratchpad_alloc(struct xhci_hcd *xhci, gfp_t flags) if (!xhci->scratchpad->sp_buffers) goto fail_sp3; - xhci->scratchpad->sp_dma_buffers = - kzalloc(sizeof(dma_addr_t) * num_sp, flags); - - if (!xhci->scratchpad->sp_dma_buffers) - goto fail_sp4; - xhci->dcbaa->dev_context_ptrs[0] = cpu_to_le64(xhci->scratchpad->sp_dma); for (i = 0; i < num_sp; i++) { dma_addr_t dma; void *buf = dma_alloc_coherent(dev, xhci->page_size, &dma, flags); if (!buf) - goto fail_sp5; + goto fail_sp4; xhci->scratchpad->sp_array[i] = dma; xhci->scratchpad->sp_buffers[i] = buf; - xhci->scratchpad->sp_dma_buffers[i] = dma; } return 0; - fail_sp5: + fail_sp4: for (i = i - 1; i >= 0; i--) { dma_free_coherent(dev, xhci->page_size, xhci->scratchpad->sp_buffers[i], - xhci->scratchpad->sp_dma_buffers[i]); + xhci->scratchpad->sp_array[i]); } - kfree(xhci->scratchpad->sp_dma_buffers); - fail_sp4: kfree(xhci->scratchpad->sp_buffers); fail_sp3: @@ -1758,7 +1761,7 @@ static void scratchpad_free(struct xhci_hcd *xhci) { int num_sp; int i; - struct device *dev = xhci_to_hcd(xhci)->self.controller; + struct device *dev = 
xhci_to_hcd(xhci)->self.sysdev; if (!xhci->scratchpad) return; @@ -1768,9 +1771,8 @@ static void scratchpad_free(struct xhci_hcd *xhci) for (i = 0; i < num_sp; i++) { dma_free_coherent(dev, xhci->page_size, xhci->scratchpad->sp_buffers[i], - xhci->scratchpad->sp_dma_buffers[i]); + xhci->scratchpad->sp_array[i]); } - kfree(xhci->scratchpad->sp_dma_buffers); kfree(xhci->scratchpad->sp_buffers); dma_free_coherent(dev, num_sp * sizeof(u64), xhci->scratchpad->sp_array, @@ -1831,7 +1833,7 @@ void xhci_free_command(struct xhci_hcd *xhci, void xhci_mem_cleanup(struct xhci_hcd *xhci) { - struct device *dev = xhci_to_hcd(xhci)->self.controller; + struct device *dev = xhci_to_hcd(xhci)->self.sysdev; int size; int i, j, num_ports; @@ -2373,7 +2375,7 @@ static int xhci_setup_port_arrays(struct xhci_hcd *xhci, gfp_t flags) int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags) { dma_addr_t dma; - struct device *dev = xhci_to_hcd(xhci)->self.controller; + struct device *dev = xhci_to_hcd(xhci)->self.sysdev; unsigned int val, val2; u64 val_64; struct xhci_segment *seg; @@ -2419,7 +2421,7 @@ int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags) writel(val, &xhci->op_regs->config_reg); /* - * Section 5.4.8 - doorbell array must be + * xHCI section 5.4.6 - doorbell array must be * "physically contiguous and 64-byte (cache line) aligned". */ xhci->dcbaa = dma_alloc_coherent(dev, sizeof(*xhci->dcbaa), &dma, @@ -2480,7 +2482,7 @@ int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags) (xhci->cmd_ring->first_seg->dma & (u64) ~CMD_RING_RSVD_BITS) | xhci->cmd_ring->cycle_state; xhci_dbg_trace(xhci, trace_xhci_dbg_init, - "// Setting command ring address to 0x%x", val); + "// Setting command ring address to 0x%016llx", val_64); xhci_write_64(xhci, val_64, &xhci->op_regs->cmd_ring); xhci_dbg_cmd_ptrs(xhci); @@ -2601,7 +2603,6 @@ int xhci_mem_init(struct xhci_hcd *xhci, gfp_t flags) return 0; fail: - xhci_warn(xhci, "Couldn't initialize memory\n"); xhci_halt(xhci); xhci_reset(xhci); xhci_mem_cleanup(xhci); diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c index fc99f51d12e183603cd692592b6d77351131a1da..7b86508ac8cf522e1143561a6e9a7d8f40f5a964 100644 --- a/drivers/usb/host/xhci-pci.c +++ b/drivers/usb/host/xhci-pci.c @@ -199,6 +199,9 @@ static void xhci_pci_quirks(struct device *dev, struct xhci_hcd *xhci) pdev->device == 0x1042) xhci->quirks |= XHCI_BROKEN_STREAMS; + if (pdev->vendor == PCI_VENDOR_ID_TI && pdev->device == 0x8241) + xhci->quirks |= XHCI_LIMIT_ENDPOINT_INTERVAL_7; + if (xhci->quirks & XHCI_RESET_ON_RESUME) xhci_dbg_trace(xhci, trace_xhci_dbg_quirks, "QUIRK: Resetting on resume"); diff --git a/drivers/usb/host/xhci-plat.c b/drivers/usb/host/xhci-plat.c index 6ed468fa7d5e593ca2e0fdabc6048fb7f2dd9f5d..7c2a9e7c8e0f2aa4d5cc7f5e413f23ccc75c116d 100644 --- a/drivers/usb/host/xhci-plat.c +++ b/drivers/usb/host/xhci-plat.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,16 @@ static int xhci_priv_init_quirk(struct usb_hcd *hcd) return priv->init_quirk(hcd); } +static int xhci_priv_resume_quirk(struct usb_hcd *hcd) +{ + struct xhci_plat_priv *priv = hcd_to_xhci_priv(hcd); + + if (!priv->resume_quirk) + return 0; + + return priv->resume_quirk(hcd); +} + static void xhci_plat_quirks(struct device *dev, struct xhci_hcd *xhci) { /* @@ -92,18 +103,21 @@ static const struct xhci_plat_priv xhci_plat_renesas_rcar_gen2 = { .firmware_name = XHCI_RCAR_FIRMWARE_NAME_V1, .init_quirk = xhci_rcar_init_quirk, .plat_start = xhci_rcar_start, + .resume_quirk = 
xhci_rcar_resume_quirk, }; static const struct xhci_plat_priv xhci_plat_renesas_rcar_gen3 = { .firmware_name = XHCI_RCAR_FIRMWARE_NAME_V2, .init_quirk = xhci_rcar_init_quirk, .plat_start = xhci_rcar_start, + .resume_quirk = xhci_rcar_resume_quirk, }; static const struct xhci_plat_priv xhci_plat_renesas_rcar_r8a7796 = { .firmware_name = XHCI_RCAR_FIRMWARE_NAME_V3, .init_quirk = xhci_rcar_init_quirk, .plat_start = xhci_rcar_start, + .resume_quirk = xhci_rcar_resume_quirk, }; static const struct of_device_id usb_xhci_of_match[] = { @@ -148,6 +162,7 @@ static int xhci_plat_probe(struct platform_device *pdev) { const struct of_device_id *match; const struct hc_driver *driver; + struct device *sysdev; struct xhci_hcd *xhci; struct resource *res; struct usb_hcd *hcd; @@ -164,24 +179,47 @@ static int xhci_plat_probe(struct platform_device *pdev) if (irq < 0) return -ENODEV; + /* + * sysdev must point to a device that is known to the system firmware + * or PCI hardware. We handle these three cases here: + * 1. xhci_plat comes from firmware + * 2. xhci_plat is child of a device from firmware (dwc3-plat) + * 3. xhci_plat is grandchild of a pci device (dwc3-pci) + */ + sysdev = &pdev->dev; + if (sysdev->parent && !sysdev->of_node && sysdev->parent->of_node) + sysdev = sysdev->parent; +#ifdef CONFIG_PCI + else if (sysdev->parent && sysdev->parent->parent && + sysdev->parent->parent->bus == &pci_bus_type) + sysdev = sysdev->parent->parent; +#endif + /* Try to set 64-bit DMA first */ - if (!pdev->dev.dma_mask) + if (WARN_ON(!sysdev->dma_mask)) /* Platform did not initialize dma_mask */ - ret = dma_coerce_mask_and_coherent(&pdev->dev, + ret = dma_coerce_mask_and_coherent(sysdev, DMA_BIT_MASK(64)); else - ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); + ret = dma_set_mask_and_coherent(sysdev, DMA_BIT_MASK(64)); /* If seting 64-bit DMA mask fails, fall back to 32-bit DMA mask */ if (ret) { - ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); + ret = dma_set_mask_and_coherent(sysdev, DMA_BIT_MASK(32)); if (ret) return ret; } - hcd = usb_create_hcd(driver, &pdev->dev, dev_name(&pdev->dev)); - if (!hcd) - return -ENOMEM; + pm_runtime_set_active(&pdev->dev); + pm_runtime_enable(&pdev->dev); + pm_runtime_get_noresume(&pdev->dev); + + hcd = __usb_create_hcd(driver, sysdev, &pdev->dev, + dev_name(&pdev->dev), NULL); + if (!hcd) { + ret = -ENOMEM; + goto disable_runtime; + } res = platform_get_resource(pdev, IORESOURCE_MEM, 0); hcd->regs = devm_ioremap_resource(&pdev->dev, res); @@ -222,20 +260,20 @@ static int xhci_plat_probe(struct platform_device *pdev) xhci->clk = clk; xhci->main_hcd = hcd; - xhci->shared_hcd = usb_create_shared_hcd(driver, &pdev->dev, + xhci->shared_hcd = __usb_create_hcd(driver, sysdev, &pdev->dev, dev_name(&pdev->dev), hcd); if (!xhci->shared_hcd) { ret = -ENOMEM; goto disable_clk; } - if (device_property_read_bool(&pdev->dev, "usb3-lpm-capable")) + if (device_property_read_bool(sysdev, "usb3-lpm-capable")) xhci->quirks |= XHCI_LPM_SUPPORT; if (device_property_read_bool(&pdev->dev, "quirk-broken-port-ped")) xhci->quirks |= XHCI_BROKEN_PORT_PED; - hcd->usb_phy = devm_usb_get_phy_by_phandle(&pdev->dev, "usb-phy", 0); + hcd->usb_phy = devm_usb_get_phy_by_phandle(sysdev, "usb-phy", 0); if (IS_ERR(hcd->usb_phy)) { ret = PTR_ERR(hcd->usb_phy); if (ret == -EPROBE_DEFER) @@ -258,6 +296,15 @@ static int xhci_plat_probe(struct platform_device *pdev) if (ret) goto dealloc_usb2_hcd; + device_enable_async_suspend(&pdev->dev); + pm_runtime_put_noidle(&pdev->dev); + + /* + * 
Prevent runtime pm from being on as default, users should enable + * runtime pm using power/control in sysfs. + */ + pm_runtime_forbid(&pdev->dev); + return 0; @@ -277,6 +324,10 @@ static int xhci_plat_probe(struct platform_device *pdev) put_hcd: usb_put_hcd(hcd); +disable_runtime: + pm_runtime_put_noidle(&pdev->dev); + pm_runtime_disable(&pdev->dev); + return ret; } @@ -298,14 +349,17 @@ static int xhci_plat_remove(struct platform_device *dev) clk_disable_unprepare(clk); usb_put_hcd(hcd); + pm_runtime_set_suspended(&dev->dev); + pm_runtime_disable(&dev->dev); + return 0; } -#ifdef CONFIG_PM_SLEEP -static int xhci_plat_suspend(struct device *dev) +static int __maybe_unused xhci_plat_suspend(struct device *dev) { struct usb_hcd *hcd = dev_get_drvdata(dev); struct xhci_hcd *xhci = hcd_to_xhci(hcd); + int ret; /* * xhci_suspend() needs `do_wakeup` to know whether host is allowed @@ -315,24 +369,53 @@ static int xhci_plat_suspend(struct device *dev) * reconsider this when xhci_plat_suspend enlarges its scope, e.g., * also applies to runtime suspend. */ - return xhci_suspend(xhci, device_may_wakeup(dev)); + ret = xhci_suspend(xhci, device_may_wakeup(dev)); + + if (!device_may_wakeup(dev) && !IS_ERR(xhci->clk)) + clk_disable_unprepare(xhci->clk); + + return ret; } -static int xhci_plat_resume(struct device *dev) +static int __maybe_unused xhci_plat_resume(struct device *dev) { struct usb_hcd *hcd = dev_get_drvdata(dev); struct xhci_hcd *xhci = hcd_to_xhci(hcd); + int ret; + + if (!device_may_wakeup(dev) && !IS_ERR(xhci->clk)) + clk_prepare_enable(xhci->clk); + + ret = xhci_priv_resume_quirk(hcd); + if (ret) + return ret; + + return xhci_resume(xhci, 0); +} + +static int __maybe_unused xhci_plat_runtime_suspend(struct device *dev) +{ + struct usb_hcd *hcd = dev_get_drvdata(dev); + struct xhci_hcd *xhci = hcd_to_xhci(hcd); + + return xhci_suspend(xhci, true); +} + +static int __maybe_unused xhci_plat_runtime_resume(struct device *dev) +{ + struct usb_hcd *hcd = dev_get_drvdata(dev); + struct xhci_hcd *xhci = hcd_to_xhci(hcd); return xhci_resume(xhci, 0); } static const struct dev_pm_ops xhci_plat_pm_ops = { SET_SYSTEM_SLEEP_PM_OPS(xhci_plat_suspend, xhci_plat_resume) + + SET_RUNTIME_PM_OPS(xhci_plat_runtime_suspend, + xhci_plat_runtime_resume, + NULL) }; -#define DEV_PM_OPS (&xhci_plat_pm_ops) -#else -#define DEV_PM_OPS NULL -#endif /* CONFIG_PM */ static const struct acpi_device_id usb_xhci_acpi_match[] = { /* XHCI-compliant USB Controller */ @@ -347,7 +430,7 @@ static struct platform_driver usb_xhci_driver = { .shutdown = usb_hcd_platform_shutdown, .driver = { .name = "xhci-hcd", - .pm = DEV_PM_OPS, + .pm = &xhci_plat_pm_ops, .of_match_table = of_match_ptr(usb_xhci_of_match), .acpi_match_table = ACPI_PTR(usb_xhci_acpi_match), }, diff --git a/drivers/usb/host/xhci-plat.h b/drivers/usb/host/xhci-plat.h index 9af0cb48053f27bd07cd3ad1d79b3a6815700fa4..29b227895b0732352765afa042819b4d0ae9c864 100644 --- a/drivers/usb/host/xhci-plat.h +++ b/drivers/usb/host/xhci-plat.h @@ -17,6 +17,7 @@ struct xhci_plat_priv { const char *firmware_name; void (*plat_start)(struct usb_hcd *); int (*init_quirk)(struct usb_hcd *); + int (*resume_quirk)(struct usb_hcd *); }; #define hcd_to_xhci_priv(h) ((struct xhci_plat_priv *)hcd_to_xhci(h)->priv) diff --git a/drivers/usb/host/xhci-rcar.c b/drivers/usb/host/xhci-rcar.c index d28df386e780bd04608ad199535ee326119b932c..07278228214b80066df2ea32a5828fdf1066f34d 100644 --- a/drivers/usb/host/xhci-rcar.c +++ b/drivers/usb/host/xhci-rcar.c @@ -198,3 +198,14 @@ int 
xhci_rcar_init_quirk(struct usb_hcd *hcd) return xhci_rcar_download_firmware(hcd); } + +int xhci_rcar_resume_quirk(struct usb_hcd *hcd) +{ + int ret; + + ret = xhci_rcar_download_firmware(hcd); + if (!ret) + xhci_rcar_start(hcd); + + return ret; +} diff --git a/drivers/usb/host/xhci-rcar.h b/drivers/usb/host/xhci-rcar.h index d2ffe20401cf9f372bbcc0be4a7443e0e3b92b60..d247951147a1b9b99772bbd0093667c3f5976634 100644 --- a/drivers/usb/host/xhci-rcar.h +++ b/drivers/usb/host/xhci-rcar.h @@ -18,6 +18,7 @@ #if IS_ENABLED(CONFIG_USB_XHCI_RCAR) void xhci_rcar_start(struct usb_hcd *hcd); int xhci_rcar_init_quirk(struct usb_hcd *hcd); +int xhci_rcar_resume_quirk(struct usb_hcd *hcd); #else static inline void xhci_rcar_start(struct usb_hcd *hcd) { @@ -27,5 +28,10 @@ static inline int xhci_rcar_init_quirk(struct usb_hcd *hcd) { return 0; } + +static inline int xhci_rcar_resume_quirk(struct usb_hcd *hcd) +{ + return 0; +} #endif #endif /* _XHCI_RCAR_H */ diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index a3309aa02993dfa79e52a2b93a87b9efa289d498..74bf5c60a26075fff92cf74f6d3b2a2d4af4f0d4 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -167,8 +167,6 @@ static void next_trb(struct xhci_hcd *xhci, */ static void inc_deq(struct xhci_hcd *xhci, struct xhci_ring *ring) { - ring->deq_updates++; - /* event ring doesn't have link trbs, check for last trb */ if (ring->type == TYPE_EVENT) { if (!last_trb_on_seg(ring->deq_seg, ring->dequeue)) { @@ -191,6 +189,9 @@ static void inc_deq(struct xhci_hcd *xhci, struct xhci_ring *ring) ring->deq_seg = ring->deq_seg->next; ring->dequeue = ring->deq_seg->trbs; } + + trace_xhci_inc_deq(ring); + return; } @@ -223,7 +224,6 @@ static void inc_enq(struct xhci_hcd *xhci, struct xhci_ring *ring, ring->num_trbs_free--; next = ++(ring->enqueue); - ring->enq_updates++; /* Update the dequeue pointer further if that was a link TRB */ while (trb_is_link(next)) { @@ -259,6 +259,8 @@ static void inc_enq(struct xhci_hcd *xhci, struct xhci_ring *ring, ring->enqueue = ring->enq_seg->trbs; next = ring->enqueue; } + + trace_xhci_inc_enq(ring); } /* @@ -359,21 +361,19 @@ static int xhci_abort_cmd_ring(struct xhci_hcd *xhci, unsigned long flags) xhci_write_64(xhci, temp_64 | CMD_RING_ABORT, &xhci->op_regs->cmd_ring); - /* Section 4.6.1.2 of xHCI 1.0 spec says software should - * time the completion od all xHCI commands, including - * the Command Abort operation. If software doesn't see - * CRR negated in a timely manner (e.g. longer than 5 - * seconds), then it should assume that the there are - * larger problems with the xHC and assert HCRST. + /* Section 4.6.1.2 of xHCI 1.0 spec says software should also time the + * completion of the Command Abort operation. If CRR is not negated in 5 + * seconds then driver handles it as if host died (-ENODEV). + * In the future we should distinguish between -ENODEV and -ETIMEDOUT + * and try to recover a -ETIMEDOUT with a host controller reset. 
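The xhci-rcar.h hunk above uses the usual pattern for optional quirk hooks: when CONFIG_USB_XHCI_RCAR is not enabled, static inline no-ops stand in so xhci-plat.c can call the hooks without any #ifdefs. The same pattern generalized, with a hypothetical Kconfig symbol and hook name:

#if IS_ENABLED(CONFIG_USB_XHCI_MYSOC)		/* hypothetical symbol */
int mysoc_resume_quirk(struct usb_hcd *hcd);
#else
static inline int mysoc_resume_quirk(struct usb_hcd *hcd)
{
	return 0;	/* support compiled out: nothing extra on resume */
}
#endif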
*/ ret = xhci_handshake(&xhci->op_regs->cmd_ring, CMD_RING_RUNNING, 0, 5 * 1000 * 1000); if (ret < 0) { - xhci_err(xhci, - "Stop command ring failed, maybe the host is dead\n"); - xhci->xhc_state |= XHCI_STATE_DYING; + xhci_err(xhci, "Abort failed to stop command ring: %d\n", ret); xhci_halt(xhci); - return -ESHUTDOWN; + xhci_hc_died(xhci); + return ret; } /* * Writing the CMD_RING_ABORT bit should cause a cmd completion event, @@ -689,6 +689,8 @@ static void xhci_handle_cmd_stop_ep(struct xhci_hcd *xhci, int slot_id, struct xhci_virt_ep *ep; struct xhci_td *cur_td = NULL; struct xhci_td *last_unlinked_td; + struct xhci_ep_ctx *ep_ctx; + struct xhci_virt_device *vdev; struct xhci_dequeue_state deq_state; @@ -702,6 +704,11 @@ static void xhci_handle_cmd_stop_ep(struct xhci_hcd *xhci, int slot_id, memset(&deq_state, 0, sizeof(deq_state)); ep_index = TRB_TO_EP_INDEX(le32_to_cpu(trb->generic.field[3])); + + vdev = xhci->devs[slot_id]; + ep_ctx = xhci_get_ep_ctx(xhci, vdev->out_ctx, ep_index); + trace_xhci_handle_cmd_stop_ep(ep_ctx); + ep = &xhci->devs[slot_id]->eps[ep_index]; last_unlinked_td = list_last_entry(&ep->cancelled_td_list, struct xhci_td, cancelled_td_list); @@ -866,6 +873,40 @@ static void xhci_kill_endpoint_urbs(struct xhci_hcd *xhci, } } +/* + * host controller died, register read returns 0xffffffff + * Complete pending commands, mark them ABORTED. + * URBs need to be given back as usb core might be waiting with device locks + * held for the URBs to finish during device disconnect, blocking host remove. + * + * Call with xhci->lock held. + * lock is released and re-acquired while giving back URBs. + */ +void xhci_hc_died(struct xhci_hcd *xhci) +{ + int i, j; + + if (xhci->xhc_state & XHCI_STATE_DYING) + return; + + xhci_err(xhci, "xHCI host controller not responding, assume dead\n"); + xhci->xhc_state |= XHCI_STATE_DYING; + + xhci_cleanup_command_queue(xhci); + + /* return any pending urbs, remove may be waiting for them */ + for (i = 0; i <= HCS_MAX_SLOTS(xhci->hcs_params1); i++) { + if (!xhci->devs[i]) + continue; + for (j = 0; j < 31; j++) + xhci_kill_endpoint_urbs(xhci, i, j); + } + + /* inform usb core hc died if PCI remove isn't already handling it */ + if (!(xhci->xhc_state & XHCI_STATE_REMOVING)) + usb_hc_died(xhci_to_hcd(xhci)); +} + /* Watchdog timer function for when a stop endpoint command fails to complete. * In this case, we assume the host controller is broken or dying or dead. The * host may still be completing some other events, so we have to be careful to @@ -887,7 +928,6 @@ void xhci_stop_endpoint_command_watchdog(unsigned long arg) { struct xhci_hcd *xhci; struct xhci_virt_ep *ep; - int ret, i, j; unsigned long flags; ep = (struct xhci_virt_ep *) arg; @@ -904,52 +944,22 @@ void xhci_stop_endpoint_command_watchdog(unsigned long arg) } xhci_warn(xhci, "xHCI host not responding to stop endpoint command.\n"); - xhci_warn(xhci, "Assuming host is dying, halting host.\n"); - /* Oops, HC is dead or dying or at least not responding to the stop - * endpoint command. - */ - - xhci->xhc_state |= XHCI_STATE_DYING; ep->ep_state &= ~EP_STOP_CMD_PENDING; - /* Disable interrupts from the host controller and start halting it */ - xhci_quiesce(xhci); - spin_unlock_irqrestore(&xhci->lock, flags); + xhci_halt(xhci); - ret = xhci_halt(xhci); + /* + * handle a stop endpoint cmd timeout as if host died (-ENODEV).
+ * In the future we could distinguish between -ENODEV and -ETIMEDOUT + * and try to recover a -ETIMEDOUT with a host controller reset + */ + xhci_hc_died(xhci); - spin_lock_irqsave(&xhci->lock, flags); - if (ret < 0) { - /* This is bad; the host is not responding to commands and it's - * not allowing itself to be halted. At least interrupts are - * disabled. If we call usb_hc_died(), it will attempt to - * disconnect all device drivers under this host. Those - * disconnect() methods will wait for all URBs to be unlinked, - * so we must complete them. - */ - xhci_warn(xhci, "Non-responsive xHCI host is not halting.\n"); - xhci_warn(xhci, "Completing active URBs anyway.\n"); - /* We could turn all TDs on the rings to no-ops. This won't - * help if the host has cached part of the ring, and is slow if - * we want to preserve the cycle bit. Skip it and hope the host - * doesn't touch the memory. - */ - } - for (i = 0; i < MAX_HC_SLOTS; i++) { - if (!xhci->devs[i]) - continue; - for (j = 0; j < 31; j++) - xhci_kill_endpoint_urbs(xhci, i, j); - } spin_unlock_irqrestore(&xhci->lock, flags); - xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb, - "Calling usb_hc_died()"); - usb_hc_died(xhci_to_hcd(xhci)); xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb, "xHCI host controller is dead."); } - static void update_ring_for_set_deq_completion(struct xhci_hcd *xhci, struct xhci_virt_device *dev, struct xhci_ring *ep_ring, @@ -1029,6 +1039,8 @@ static void xhci_handle_cmd_set_deq(struct xhci_hcd *xhci, int slot_id, ep_ctx = xhci_get_ep_ctx(xhci, dev->out_ctx, ep_index); slot_ctx = xhci_get_slot_ctx(xhci, dev->out_ctx); + trace_xhci_handle_cmd_set_deq(slot_ctx); + trace_xhci_handle_cmd_set_deq_ep(ep_ctx); if (cmd_comp_code != COMP_SUCCESS) { unsigned int ep_state; @@ -1099,9 +1111,15 @@ static void xhci_handle_cmd_set_deq(struct xhci_hcd *xhci, int slot_id, static void xhci_handle_cmd_reset_ep(struct xhci_hcd *xhci, int slot_id, union xhci_trb *trb, u32 cmd_comp_code) { + struct xhci_virt_device *vdev; + struct xhci_ep_ctx *ep_ctx; unsigned int ep_index; ep_index = TRB_TO_EP_INDEX(le32_to_cpu(trb->generic.field[3])); + vdev = xhci->devs[slot_id]; + ep_ctx = xhci_get_ep_ctx(xhci, vdev->out_ctx, ep_index); + trace_xhci_handle_cmd_reset_ep(ep_ctx); + /* This command will only fail if the endpoint wasn't halted, * but we don't care. 
*/ @@ -1114,11 +1132,11 @@ static void xhci_handle_cmd_reset_ep(struct xhci_hcd *xhci, int slot_id, */ if (xhci->quirks & XHCI_RESET_EP_QUIRK) { struct xhci_command *command; + command = xhci_alloc_command(xhci, false, false, GFP_ATOMIC); - if (!command) { - xhci_warn(xhci, "WARN Cannot submit cfg ep: ENOMEM\n"); + if (!command) return; - } + xhci_dbg_trace(xhci, trace_xhci_dbg_quirks, "Queueing configure endpoint command"); xhci_queue_configure_endpoint(xhci, command, @@ -1143,10 +1161,15 @@ static void xhci_handle_cmd_enable_slot(struct xhci_hcd *xhci, int slot_id, static void xhci_handle_cmd_disable_slot(struct xhci_hcd *xhci, int slot_id) { struct xhci_virt_device *virt_dev; + struct xhci_slot_ctx *slot_ctx; virt_dev = xhci->devs[slot_id]; if (!virt_dev) return; + + slot_ctx = xhci_get_slot_ctx(xhci, virt_dev->out_ctx); + trace_xhci_handle_cmd_disable_slot(slot_ctx); + if (xhci->quirks & XHCI_EP_LIMIT_QUIRK) /* Delete default control endpoint resources */ xhci_free_device_endpoint_resources(xhci, virt_dev, true); @@ -1158,6 +1181,7 @@ static void xhci_handle_cmd_config_ep(struct xhci_hcd *xhci, int slot_id, { struct xhci_virt_device *virt_dev; struct xhci_input_control_ctx *ctrl_ctx; + struct xhci_ep_ctx *ep_ctx; unsigned int ep_index; unsigned int ep_state; u32 add_flags, drop_flags; @@ -1182,6 +1206,9 @@ static void xhci_handle_cmd_config_ep(struct xhci_hcd *xhci, int slot_id, /* Input ctx add_flags are the endpoint index plus one */ ep_index = xhci_last_valid_endpoint(add_flags) - 1; + ep_ctx = xhci_get_ep_ctx(xhci, virt_dev->out_ctx, ep_index); + trace_xhci_handle_cmd_config_ep(ep_ctx); + /* A usb_set_interface() call directly after clearing a halted * condition may race on this quirky hardware. Not worth * worrying about, since this is prototype hardware. 
Not sure @@ -1206,9 +1233,26 @@ static void xhci_handle_cmd_config_ep(struct xhci_hcd *xhci, int slot_id, return; } +static void xhci_handle_cmd_addr_dev(struct xhci_hcd *xhci, int slot_id) +{ + struct xhci_virt_device *vdev; + struct xhci_slot_ctx *slot_ctx; + + vdev = xhci->devs[slot_id]; + slot_ctx = xhci_get_slot_ctx(xhci, vdev->out_ctx); + trace_xhci_handle_cmd_addr_dev(slot_ctx); +} + static void xhci_handle_cmd_reset_dev(struct xhci_hcd *xhci, int slot_id, struct xhci_event_cmd *event) { + struct xhci_virt_device *vdev; + struct xhci_slot_ctx *slot_ctx; + + vdev = xhci->devs[slot_id]; + slot_ctx = xhci_get_slot_ctx(xhci, vdev->out_ctx); + trace_xhci_handle_cmd_reset_dev(slot_ctx); + xhci_dbg(xhci, "Completed reset device command.\n"); if (!xhci->devs[slot_id]) xhci_warn(xhci, "Reset device command completion " @@ -1250,7 +1294,6 @@ void xhci_cleanup_command_queue(struct xhci_hcd *xhci) void xhci_handle_command_timeout(struct work_struct *work) { struct xhci_hcd *xhci; - int ret; unsigned long flags; u64 hw_ring_state; @@ -1271,22 +1314,17 @@ void xhci_handle_command_timeout(struct work_struct *work) /* Make sure command ring is running before aborting it */ hw_ring_state = xhci_read_64(xhci, &xhci->op_regs->cmd_ring); + if (hw_ring_state == ~(u64)0) { + xhci_hc_died(xhci); + goto time_out_completed; + } + if ((xhci->cmd_ring_state & CMD_RING_STATE_RUNNING) && (hw_ring_state & CMD_RING_RUNNING)) { /* Prevent new doorbell, and start command abort */ xhci->cmd_ring_state = CMD_RING_STATE_ABORTED; xhci_dbg(xhci, "Command timeout\n"); - ret = xhci_abort_cmd_ring(xhci, flags); - if (unlikely(ret == -ESHUTDOWN)) { - xhci_err(xhci, "Abort command ring failed\n"); - xhci_cleanup_command_queue(xhci); - spin_unlock_irqrestore(&xhci->lock, flags); - usb_hc_died(xhci_to_hcd(xhci)->primary_hcd); - xhci_dbg(xhci, "xHCI host controller is dead.\n"); - - return; - } - + xhci_abort_cmd_ring(xhci, flags); goto time_out_completed; } @@ -1384,6 +1422,7 @@ static void handle_cmd_completion(struct xhci_hcd *xhci, case TRB_EVAL_CONTEXT: break; case TRB_ADDR_DEV: + xhci_handle_cmd_addr_dev(xhci, slot_id); break; case TRB_STOP_RING: WARN_ON(slot_id != TRB_TO_SLOT_ID( @@ -2243,7 +2282,8 @@ static int handle_tx_event(struct xhci_hcd *xhci, slot_id = TRB_TO_SLOT_ID(le32_to_cpu(event->flags)); xdev = xhci->devs[slot_id]; if (!xdev) { - xhci_err(xhci, "ERROR Transfer event pointed to bad slot\n"); + xhci_err(xhci, "ERROR Transfer event pointed to bad slot %u\n", + slot_id); xhci_err(xhci, "@%016llx %08x %08x %08x %08x\n", (unsigned long long) xhci_trb_virt_to_dma( xhci->event_ring->deq_seg, @@ -2252,8 +2292,6 @@ static int handle_tx_event(struct xhci_hcd *xhci, upper_32_bits(le64_to_cpu(event->buffer)), le32_to_cpu(event->transfer_len), le32_to_cpu(event->flags)); - xhci_dbg(xhci, "Event ring:\n"); - xhci_debug_segment(xhci, xhci->event_ring->deq_seg); return -ENODEV; } @@ -2263,8 +2301,9 @@ static int handle_tx_event(struct xhci_hcd *xhci, ep_ring = xhci_dma_to_transfer_ring(ep, le64_to_cpu(event->buffer)); ep_ctx = xhci_get_ep_ctx(xhci, xdev->out_ctx, ep_index); if (!ep_ring || GET_EP_CTX_STATE(ep_ctx) == EP_STATE_DISABLED) { - xhci_err(xhci, "ERROR Transfer event for disabled endpoint " - "or incorrect stream ring\n"); + xhci_err(xhci, + "ERROR Transfer event for disabled endpoint slot %u ep %u or incorrect stream ring\n", + slot_id, ep_index); xhci_err(xhci, "@%016llx %08x %08x %08x %08x\n", (unsigned long long) xhci_trb_virt_to_dma( xhci->event_ring->deq_seg, @@ -2273,8 +2312,6 @@ static int 
handle_tx_event(struct xhci_hcd *xhci, upper_32_bits(le64_to_cpu(event->buffer)), le32_to_cpu(event->transfer_len), le32_to_cpu(event->flags)); - xhci_dbg(xhci, "Event ring:\n"); - xhci_debug_segment(xhci, xhci->event_ring->deq_seg); return -ENODEV; } @@ -2298,45 +2335,62 @@ static int handle_tx_event(struct xhci_hcd *xhci, trb_comp_code = COMP_SHORT_PACKET; else xhci_warn_ratelimited(xhci, - "WARN Successful completion on short TX: needs XHCI_TRUST_TX_LENGTH quirk?\n"); + "WARN Successful completion on short TX for slot %u ep %u: needs XHCI_TRUST_TX_LENGTH quirk?\n", + slot_id, ep_index); case COMP_SHORT_PACKET: break; case COMP_STOPPED: - xhci_dbg(xhci, "Stopped on Transfer TRB\n"); + xhci_dbg(xhci, "Stopped on Transfer TRB for slot %u ep %u\n", + slot_id, ep_index); break; case COMP_STOPPED_LENGTH_INVALID: - xhci_dbg(xhci, "Stopped on No-op or Link TRB\n"); + xhci_dbg(xhci, + "Stopped on No-op or Link TRB for slot %u ep %u\n", + slot_id, ep_index); break; case COMP_STOPPED_SHORT_PACKET: - xhci_dbg(xhci, "Stopped with short packet transfer detected\n"); + xhci_dbg(xhci, + "Stopped with short packet transfer detected for slot %u ep %u\n", + slot_id, ep_index); break; case COMP_STALL_ERROR: - xhci_dbg(xhci, "Stalled endpoint\n"); + xhci_dbg(xhci, "Stalled endpoint for slot %u ep %u\n", slot_id, + ep_index); ep->ep_state |= EP_HALTED; status = -EPIPE; break; case COMP_TRB_ERROR: - xhci_warn(xhci, "WARN: TRB error on endpoint\n"); + xhci_warn(xhci, + "WARN: TRB error for slot %u ep %u on endpoint\n", + slot_id, ep_index); status = -EILSEQ; break; case COMP_SPLIT_TRANSACTION_ERROR: case COMP_USB_TRANSACTION_ERROR: - xhci_dbg(xhci, "Transfer error on endpoint\n"); + xhci_dbg(xhci, "Transfer error for slot %u ep %u on endpoint\n", + slot_id, ep_index); status = -EPROTO; break; case COMP_BABBLE_DETECTED_ERROR: - xhci_dbg(xhci, "Babble error on endpoint\n"); + xhci_dbg(xhci, "Babble error for slot %u ep %u on endpoint\n", + slot_id, ep_index); status = -EOVERFLOW; break; case COMP_DATA_BUFFER_ERROR: - xhci_warn(xhci, "WARN: HC couldn't access mem fast enough\n"); + xhci_warn(xhci, + "WARN: HC couldn't access mem fast enough for slot %u ep %u\n", + slot_id, ep_index); status = -ENOSR; break; case COMP_BANDWIDTH_OVERRUN_ERROR: - xhci_warn(xhci, "WARN: bandwidth overrun event on endpoint\n"); + xhci_warn(xhci, + "WARN: bandwidth overrun event for slot %u ep %u on endpoint\n", + slot_id, ep_index); break; case COMP_ISOCH_BUFFER_OVERRUN: - xhci_warn(xhci, "WARN: buffer overrun event on endpoint\n"); + xhci_warn(xhci, + "WARN: buffer overrun event for slot %u ep %u on endpoint", + slot_id, ep_index); break; case COMP_RING_UNDERRUN: /* @@ -2360,7 +2414,9 @@ static int handle_tx_event(struct xhci_hcd *xhci, ep_index); goto cleanup; case COMP_INCOMPATIBLE_DEVICE_ERROR: - xhci_warn(xhci, "WARN: detect an incompatible device"); + xhci_warn(xhci, + "WARN: detect an incompatible device for slot %u ep %u", + slot_id, ep_index); status = -EPROTO; break; case COMP_MISSED_SERVICE_ERROR: @@ -2371,19 +2427,24 @@ static int handle_tx_event(struct xhci_hcd *xhci, * short transfer when process the ep_ring next time. 
*/ ep->skip = true; - xhci_dbg(xhci, "Miss service interval error, set skip flag\n"); + xhci_dbg(xhci, + "Miss service interval error for slot %u ep %u, set skip flag\n", + slot_id, ep_index); goto cleanup; case COMP_NO_PING_RESPONSE_ERROR: ep->skip = true; - xhci_dbg(xhci, "No Ping response error, Skip one Isoc TD\n"); + xhci_dbg(xhci, + "No Ping response error for slot %u ep %u, Skip one Isoc TD\n", + slot_id, ep_index); goto cleanup; default: if (xhci_is_vendor_info_code(xhci, trb_comp_code)) { status = 0; break; } - xhci_warn(xhci, "ERROR Unknown event condition %u, HC probably busted\n", - trb_comp_code); + xhci_warn(xhci, + "ERROR Unknown event condition %u for slot %u ep %u , HC probably busted\n", + trb_comp_code, slot_id, ep_index); goto cleanup; } @@ -2402,15 +2463,11 @@ static int handle_tx_event(struct xhci_hcd *xhci, xhci_warn(xhci, "WARN Event TRB for slot %d ep %d with no TDs queued?\n", TRB_TO_SLOT_ID(le32_to_cpu(event->flags)), ep_index); - xhci_dbg(xhci, "Event TRB with TRB type ID %u\n", - (le32_to_cpu(event->flags) & - TRB_TYPE_BITMASK)>>10); - xhci_print_trb_offsets(xhci, (union xhci_trb *) event); } if (ep->skip) { ep->skip = false; - xhci_dbg(xhci, "td_list is empty while skip " - "flag set. Clear skip flag.\n"); + xhci_dbg(xhci, "td_list is empty while skip flag set. Clear skip flag for slot %u ep %u.\n", + slot_id, ep_index); } goto cleanup; } @@ -2418,8 +2475,8 @@ static int handle_tx_event(struct xhci_hcd *xhci, /* We've skipped all the TDs on the ep ring when ep->skip set */ if (ep->skip && td_num == 0) { ep->skip = false; - xhci_dbg(xhci, "All tds on the ep_ring skipped. " - "Clear skip flag.\n"); + xhci_dbg(xhci, "All tds on the ep_ring skipped. Clear skip flag for slot %u ep %u.\n", + slot_id, ep_index); goto cleanup; } @@ -2478,7 +2535,9 @@ static int handle_tx_event(struct xhci_hcd *xhci, ep_ring->last_td_was_short = false; if (ep->skip) { - xhci_dbg(xhci, "Found td. Clear skip flag.\n"); + xhci_dbg(xhci, + "Found td. Clear skip flag for slot %u ep %u.\n", + slot_id, ep_index); ep->skip = false; } @@ -2495,7 +2554,9 @@ static int handle_tx_event(struct xhci_hcd *xhci, * the TD. */ if (trb_is_noop(ep_trb)) { - xhci_dbg(xhci, "ep_trb is a no-op TRB. Skip it\n"); + xhci_dbg(xhci, + "ep_trb is a no-op TRB. 
Skip it for slot %u ep %u\n", + slot_id, ep_index); goto cleanup; } @@ -2623,7 +2684,8 @@ irqreturn_t xhci_irq(struct usb_hcd *hcd) spin_lock(&xhci->lock); /* Check if the xHC generated the interrupt, or the irq is shared */ status = readl(&xhci->op_regs->status); - if (status == 0xffffffff) { + if (status == ~(u32)0) { + xhci_hc_died(xhci); ret = IRQ_HANDLED; goto out; } @@ -3942,10 +4004,8 @@ void xhci_queue_new_dequeue_state(struct xhci_hcd *xhci, /* This function gets called from contexts where it cannot sleep */ cmd = xhci_alloc_command(xhci, false, false, GFP_ATOMIC); - if (!cmd) { - xhci_warn(xhci, "WARN Cannot submit Set TR Deq Ptr: ENOMEM\n"); + if (!cmd) return; - } ep->queued_deq_seg = deq_state->new_deq_seg; ep->queued_deq_ptr = deq_state->new_deq_ptr; diff --git a/drivers/usb/host/xhci-trace.h b/drivers/usb/host/xhci-trace.h index 1ac2cdf8eece9542b551a525a338b67d2e8b30c0..8ce96de10e8a1cb0adbb1eb61b7abe3e5b084573 100644 --- a/drivers/usb/host/xhci-trace.h +++ b/drivers/usb/host/xhci-trace.h @@ -231,6 +231,7 @@ DECLARE_EVENT_CLASS(xhci_log_urb, __field(int, epnum) __field(int, dir_in) __field(int, type) + __field(int, slot_id) ), TP_fast_assign( __entry->urb = urb; @@ -245,8 +246,9 @@ DECLARE_EVENT_CLASS(xhci_log_urb, __entry->epnum = usb_endpoint_num(&urb->ep->desc); __entry->dir_in = usb_endpoint_dir_in(&urb->ep->desc); __entry->type = usb_endpoint_type(&urb->ep->desc); + __entry->slot_id = urb->dev->slot_id; ), - TP_printk("ep%d%s-%s: urb %p pipe %u length %d/%d sgs %d/%d stream %d flags %08x", + TP_printk("ep%d%s-%s: urb %p pipe %u slot %d length %d/%d sgs %d/%d stream %d flags %08x", __entry->epnum, __entry->dir_in ? "in" : "out", ({ char *s; switch (__entry->type) { @@ -264,8 +266,8 @@ DECLARE_EVENT_CLASS(xhci_log_urb, break; default: s = "UNKNOWN"; - } s; }), __entry->urb, __entry->pipe, __entry->actual, - __entry->length, __entry->num_mapped_sgs, + } s; }), __entry->urb, __entry->pipe, __entry->slot_id, + __entry->actual, __entry->length, __entry->num_mapped_sgs, __entry->num_sgs, __entry->stream, __entry->flags ) ); @@ -285,6 +287,172 @@ DEFINE_EVENT(xhci_log_urb, xhci_urb_dequeue, TP_ARGS(urb) ); +DECLARE_EVENT_CLASS(xhci_log_ep_ctx, + TP_PROTO(struct xhci_ep_ctx *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(u32, info) + __field(u32, info2) + __field(u64, deq) + __field(u32, tx_info) + ), + TP_fast_assign( + __entry->info = le32_to_cpu(ctx->ep_info); + __entry->info2 = le32_to_cpu(ctx->ep_info2); + __entry->deq = le64_to_cpu(ctx->deq); + __entry->tx_info = le32_to_cpu(ctx->tx_info); + ), + TP_printk("%s", xhci_decode_ep_context(__entry->info, + __entry->info2, __entry->deq, __entry->tx_info) + ) +); + +DEFINE_EVENT(xhci_log_ep_ctx, xhci_handle_cmd_stop_ep, + TP_PROTO(struct xhci_ep_ctx *ctx), + TP_ARGS(ctx) +); + +DEFINE_EVENT(xhci_log_ep_ctx, xhci_handle_cmd_set_deq_ep, + TP_PROTO(struct xhci_ep_ctx *ctx), + TP_ARGS(ctx) +); + +DEFINE_EVENT(xhci_log_ep_ctx, xhci_handle_cmd_reset_ep, + TP_PROTO(struct xhci_ep_ctx *ctx), + TP_ARGS(ctx) +); + +DEFINE_EVENT(xhci_log_ep_ctx, xhci_handle_cmd_config_ep, + TP_PROTO(struct xhci_ep_ctx *ctx), + TP_ARGS(ctx) +); + +DECLARE_EVENT_CLASS(xhci_log_slot_ctx, + TP_PROTO(struct xhci_slot_ctx *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(u32, info) + __field(u32, info2) + __field(u32, tt_info) + __field(u32, state) + ), + TP_fast_assign( + __entry->info = le32_to_cpu(ctx->dev_info); + __entry->info2 = le32_to_cpu(ctx->dev_info2); + __entry->tt_info = le64_to_cpu(ctx->tt_info); + __entry->state = 
le32_to_cpu(ctx->dev_state); + ), + TP_printk("%s", xhci_decode_slot_context(__entry->info, + __entry->info2, __entry->tt_info, + __entry->state) + ) +); + +DEFINE_EVENT(xhci_log_slot_ctx, xhci_alloc_dev, + TP_PROTO(struct xhci_slot_ctx *ctx), + TP_ARGS(ctx) +); + +DEFINE_EVENT(xhci_log_slot_ctx, xhci_free_dev, + TP_PROTO(struct xhci_slot_ctx *ctx), + TP_ARGS(ctx) +); + +DEFINE_EVENT(xhci_log_slot_ctx, xhci_handle_cmd_disable_slot, + TP_PROTO(struct xhci_slot_ctx *ctx), + TP_ARGS(ctx) +); + +DEFINE_EVENT(xhci_log_slot_ctx, xhci_discover_or_reset_device, + TP_PROTO(struct xhci_slot_ctx *ctx), + TP_ARGS(ctx) +); + +DEFINE_EVENT(xhci_log_slot_ctx, xhci_setup_device_slot, + TP_PROTO(struct xhci_slot_ctx *ctx), + TP_ARGS(ctx) +); + +DEFINE_EVENT(xhci_log_slot_ctx, xhci_handle_cmd_addr_dev, + TP_PROTO(struct xhci_slot_ctx *ctx), + TP_ARGS(ctx) +); + +DEFINE_EVENT(xhci_log_slot_ctx, xhci_handle_cmd_reset_dev, + TP_PROTO(struct xhci_slot_ctx *ctx), + TP_ARGS(ctx) +); + +DEFINE_EVENT(xhci_log_slot_ctx, xhci_handle_cmd_set_deq, + TP_PROTO(struct xhci_slot_ctx *ctx), + TP_ARGS(ctx) +); + +DECLARE_EVENT_CLASS(xhci_log_ring, + TP_PROTO(struct xhci_ring *ring), + TP_ARGS(ring), + TP_STRUCT__entry( + __field(u32, type) + __field(void *, ring) + __field(dma_addr_t, enq) + __field(dma_addr_t, deq) + __field(dma_addr_t, enq_seg) + __field(dma_addr_t, deq_seg) + __field(unsigned int, num_segs) + __field(unsigned int, stream_id) + __field(unsigned int, cycle_state) + __field(unsigned int, num_trbs_free) + __field(unsigned int, bounce_buf_len) + ), + TP_fast_assign( + __entry->ring = ring; + __entry->type = ring->type; + __entry->num_segs = ring->num_segs; + __entry->stream_id = ring->stream_id; + __entry->enq_seg = ring->enq_seg->dma; + __entry->deq_seg = ring->deq_seg->dma; + __entry->cycle_state = ring->cycle_state; + __entry->num_trbs_free = ring->num_trbs_free; + __entry->bounce_buf_len = ring->bounce_buf_len; + __entry->enq = xhci_trb_virt_to_dma(ring->enq_seg, ring->enqueue); + __entry->deq = xhci_trb_virt_to_dma(ring->deq_seg, ring->dequeue); + ), + TP_printk("%s %p: enq %pad(%pad) deq %pad(%pad) segs %d stream %d free_trbs %d bounce %d cycle %d", + xhci_ring_type_string(__entry->type), __entry->ring, + &__entry->enq, &__entry->enq_seg, + &__entry->deq, &__entry->deq_seg, + __entry->num_segs, + __entry->stream_id, + __entry->num_trbs_free, + __entry->bounce_buf_len, + __entry->cycle_state + ) +); + +DEFINE_EVENT(xhci_log_ring, xhci_ring_alloc, + TP_PROTO(struct xhci_ring *ring), + TP_ARGS(ring) +); + +DEFINE_EVENT(xhci_log_ring, xhci_ring_free, + TP_PROTO(struct xhci_ring *ring), + TP_ARGS(ring) +); + +DEFINE_EVENT(xhci_log_ring, xhci_ring_expansion, + TP_PROTO(struct xhci_ring *ring), + TP_ARGS(ring) +); + +DEFINE_EVENT(xhci_log_ring, xhci_inc_enq, + TP_PROTO(struct xhci_ring *ring), + TP_ARGS(ring) +); + +DEFINE_EVENT(xhci_log_ring, xhci_inc_deq, + TP_PROTO(struct xhci_ring *ring), + TP_ARGS(ring) +); #endif /* __XHCI_TRACE_H */ /* this part must be outside header guard */ diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 953fd8f62df0787b0479286c12513656f141614f..2d1310220832c3792f7290f1095e8bd23ea71fd2 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -125,7 +125,7 @@ int xhci_halt(struct xhci_hcd *xhci) /* * Set the run bit and wait for the host to be running. 
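The tracepoints added above follow the standard DECLARE_EVENT_CLASS()/DEFINE_EVENT() split: the class declares the record layout, the assignment code, and the format string once, and each DEFINE_EVENT() stamps out a named tracepoint that reuses all three. A stripped-down illustration of the same pattern (class and event names here are invented, not part of this patch):

DECLARE_EVENT_CLASS(xhci_log_example,
	TP_PROTO(unsigned int slot_id),
	TP_ARGS(slot_id),
	TP_STRUCT__entry(
		__field(unsigned int, slot_id)	/* one field in the trace record */
	),
	TP_fast_assign(
		__entry->slot_id = slot_id;	/* runs when the event fires */
	),
	TP_printk("slot %u", __entry->slot_id)
);

/* Each DEFINE_EVENT adds one named tracepoint sharing the class above. */
DEFINE_EVENT(xhci_log_example, xhci_example_event,
	TP_PROTO(unsigned int slot_id),
	TP_ARGS(slot_id)
);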
*/ -static int xhci_start(struct xhci_hcd *xhci) +int xhci_start(struct xhci_hcd *xhci) { u32 temp; int ret; @@ -216,31 +216,21 @@ int xhci_reset(struct xhci_hcd *xhci) return ret; } -#ifdef CONFIG_PCI -static int xhci_free_msi(struct xhci_hcd *xhci) -{ - int i; - - if (!xhci->msix_entries) - return -EINVAL; - - for (i = 0; i < xhci->msix_count; i++) - if (xhci->msix_entries[i].vector) - free_irq(xhci->msix_entries[i].vector, - xhci_to_hcd(xhci)); - return 0; -} +#ifdef CONFIG_USB_PCI /* * Set up MSI */ static int xhci_setup_msi(struct xhci_hcd *xhci) { int ret; + /* + * TODO:Check with MSI Soc for sysdev + */ struct pci_dev *pdev = to_pci_dev(xhci_to_hcd(xhci)->self.controller); - ret = pci_enable_msi(pdev); - if (ret) { + ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI); + if (ret < 0) { xhci_dbg_trace(xhci, trace_xhci_dbg_init, "failed to allocate MSI entry"); return ret; @@ -251,34 +241,12 @@ static int xhci_setup_msi(struct xhci_hcd *xhci) if (ret) { xhci_dbg_trace(xhci, trace_xhci_dbg_init, "disable MSI interrupt"); - pci_disable_msi(pdev); + pci_free_irq_vectors(pdev); } return ret; } -/* - * Free IRQs - * free all IRQs request - */ -static void xhci_free_irq(struct xhci_hcd *xhci) -{ - struct pci_dev *pdev = to_pci_dev(xhci_to_hcd(xhci)->self.controller); - int ret; - - /* return if using legacy interrupt */ - if (xhci_to_hcd(xhci)->irq > 0) - return; - - ret = xhci_free_msi(xhci); - if (!ret) - return; - if (pdev->irq > 0) - free_irq(pdev->irq, xhci_to_hcd(xhci)); - - return; -} - /* * Set up MSI-X */ @@ -298,28 +266,17 @@ static int xhci_setup_msix(struct xhci_hcd *xhci) xhci->msix_count = min(num_online_cpus() + 1, HCS_MAX_INTRS(xhci->hcs_params1)); - xhci->msix_entries = - kmalloc((sizeof(struct msix_entry))*xhci->msix_count, - GFP_KERNEL); - if (!xhci->msix_entries) - return -ENOMEM; - - for (i = 0; i < xhci->msix_count; i++) { - xhci->msix_entries[i].entry = i; - xhci->msix_entries[i].vector = 0; - } - - ret = pci_enable_msix_exact(pdev, xhci->msix_entries, xhci->msix_count); - if (ret) { + ret = pci_alloc_irq_vectors(pdev, xhci->msix_count, xhci->msix_count, + PCI_IRQ_MSIX); + if (ret < 0) { xhci_dbg_trace(xhci, trace_xhci_dbg_init, "Failed to enable MSI-X"); - goto free_entries; + return ret; } for (i = 0; i < xhci->msix_count; i++) { - ret = request_irq(xhci->msix_entries[i].vector, - xhci_msi_irq, - 0, "xhci_hcd", xhci_to_hcd(xhci)); + ret = request_irq(pci_irq_vector(pdev, i), xhci_msi_irq, 0, + "xhci_hcd", xhci_to_hcd(xhci)); if (ret) goto disable_msix; } @@ -329,11 +286,9 @@ static int xhci_setup_msix(struct xhci_hcd *xhci) disable_msix: xhci_dbg_trace(xhci, trace_xhci_dbg_init, "disable MSI-X interrupt"); - xhci_free_irq(xhci); - pci_disable_msix(pdev); -free_entries: - kfree(xhci->msix_entries); - xhci->msix_entries = NULL; + while (--i >= 0) + free_irq(pci_irq_vector(pdev, i), xhci_to_hcd(xhci)); + pci_free_irq_vectors(pdev); return ret; } @@ -346,27 +301,33 @@ static void xhci_cleanup_msix(struct xhci_hcd *xhci) if (xhci->quirks & XHCI_PLAT) return; - xhci_free_irq(xhci); + /* return if using legacy interrupt */ + if (hcd->irq > 0) + return; + + if (hcd->msix_enabled) { + int i; - if (xhci->msix_entries) { - pci_disable_msix(pdev); - kfree(xhci->msix_entries); - xhci->msix_entries = NULL; + for (i = 0; i < xhci->msix_count; i++) + free_irq(pci_irq_vector(pdev, i), xhci_to_hcd(xhci)); } else { - pci_disable_msi(pdev); + free_irq(pci_irq_vector(pdev, 0), xhci_to_hcd(xhci)); } + pci_free_irq_vectors(pdev); hcd->msix_enabled = 0; - return; } static void 
__maybe_unused xhci_msix_sync_irqs(struct xhci_hcd *xhci) { - int i; + struct usb_hcd *hcd = xhci_to_hcd(xhci); + + if (hcd->msix_enabled) { + struct pci_dev *pdev = to_pci_dev(hcd->self.controller); + int i; - if (xhci->msix_entries) { for (i = 0; i < xhci->msix_count; i++) - synchronize_irq(xhci->msix_entries[i].vector); + synchronize_irq(pci_irq_vector(pdev, i)); } } @@ -539,7 +500,7 @@ static int xhci_all_ports_seen_u0(struct xhci_hcd *xhci) * device contexts (?), set up a command ring segment (or two?), create event * ring (one for now). */ -int xhci_init(struct usb_hcd *hcd) +static int xhci_init(struct usb_hcd *hcd) { struct xhci_hcd *xhci = hcd_to_xhci(hcd); int retval = 0; @@ -619,16 +580,10 @@ int xhci_run(struct usb_hcd *hcd) if (ret) return ret; - xhci_dbg(xhci, "Command ring memory map follows:\n"); - xhci_debug_ring(xhci, xhci->cmd_ring); - xhci_dbg_ring_ptrs(xhci, xhci->cmd_ring); xhci_dbg_cmd_ptrs(xhci); xhci_dbg(xhci, "ERST memory map follows:\n"); xhci_dbg_erst(xhci, &xhci->erst); - xhci_dbg(xhci, "Event ring:\n"); - xhci_debug_ring(xhci, xhci->event_ring); - xhci_dbg_ring_ptrs(xhci, xhci->event_ring); temp_64 = xhci_read_64(xhci, &xhci->ir_set->erst_dequeue); temp_64 &= ~ERST_PTR_MASK; xhci_dbg_trace(xhci, trace_xhci_dbg_init, @@ -661,9 +616,11 @@ int xhci_run(struct usb_hcd *hcd) if (xhci->quirks & XHCI_NEC_HOST) { struct xhci_command *command; + command = xhci_alloc_command(xhci, false, false, GFP_KERNEL); if (!command) return -ENOMEM; + xhci_queue_vendor_command(xhci, command, 0, 0, 0, TRB_TYPE(TRB_NEC_GET_FW)); } @@ -682,28 +639,28 @@ EXPORT_SYMBOL_GPL(xhci_run); * Disable device contexts, disable IRQs, and quiesce the HC. * Reset the HC, finish any completed transactions, and cleanup memory. */ -void xhci_stop(struct usb_hcd *hcd) +static void xhci_stop(struct usb_hcd *hcd) { u32 temp; struct xhci_hcd *xhci = hcd_to_xhci(hcd); mutex_lock(&xhci->mutex); - if (!(xhci->xhc_state & XHCI_STATE_HALTED)) { - spin_lock_irq(&xhci->lock); - - xhci->xhc_state |= XHCI_STATE_HALTED; - xhci->cmd_ring_state = CMD_RING_STATE_STOPPED; - xhci_halt(xhci); - xhci_reset(xhci); - spin_unlock_irq(&xhci->lock); - } - + /* Only halt host and free memory after both hcds are removed */ if (!usb_hcd_is_primary_hcd(hcd)) { + /* usb core will free this hcd shortly, unset pointer */ + xhci->shared_hcd = NULL; mutex_unlock(&xhci->mutex); return; } + spin_lock_irq(&xhci->lock); + xhci->xhc_state |= XHCI_STATE_HALTED; + xhci->cmd_ring_state = CMD_RING_STATE_STOPPED; + xhci_halt(xhci); + xhci_reset(xhci); + spin_unlock_irq(&xhci->lock); + xhci_cleanup_msix(xhci); /* Deleting Compliance Mode Recovery Timer */ @@ -721,7 +678,7 @@ void xhci_stop(struct usb_hcd *hcd) xhci_dbg_trace(xhci, trace_xhci_dbg_init, "// Disabling event ring interrupts"); temp = readl(&xhci->op_regs->status); - writel(temp & ~STS_EINT, &xhci->op_regs->status); + writel((temp & ~0x1fff) | STS_EINT, &xhci->op_regs->status); temp = readl(&xhci->ir_set->irq_pending); writel(ER_IRQ_DISABLE(temp), &xhci->ir_set->irq_pending); xhci_print_ir_set(xhci, 0); @@ -743,12 +700,12 @@ void xhci_stop(struct usb_hcd *hcd) * * This will only ever be called with the main usb_hcd (the USB3 roothub). 
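The MSI/MSI-X hunks above drop the driver-maintained struct msix_entry array in favour of the managed pci_alloc_irq_vectors()/pci_irq_vector() API, which is also why xhci->msix_entries disappears from xhci.h further down. The general shape of that API, as a hedged sketch (handler and function names are hypothetical):

#include <linux/pci.h>
#include <linux/interrupt.h>

static irqreturn_t example_irq(int irq, void *data)
{
	return IRQ_HANDLED;		/* stand-in handler */
}

static int example_setup_irqs(struct pci_dev *pdev, int want)
{
	int i, nvec, ret;

	/* One call tries MSI-X first, then falls back to plain MSI. */
	nvec = pci_alloc_irq_vectors(pdev, 1, want,
				     PCI_IRQ_MSIX | PCI_IRQ_MSI);
	if (nvec < 0)
		return nvec;

	for (i = 0; i < nvec; i++) {
		ret = request_irq(pci_irq_vector(pdev, i), example_irq, 0,
				  "example", pdev);
		if (ret)
			goto unwind;
	}
	return 0;

unwind:
	while (--i >= 0)
		free_irq(pci_irq_vector(pdev, i), pdev);
	pci_free_irq_vectors(pdev);
	return ret;
}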
*/ -void xhci_shutdown(struct usb_hcd *hcd) +static void xhci_shutdown(struct usb_hcd *hcd) { struct xhci_hcd *xhci = hcd_to_xhci(hcd); if (xhci->quirks & XHCI_SPURIOUS_REBOOT) - usb_disable_xhci_ports(to_pci_dev(hcd->self.controller)); + usb_disable_xhci_ports(to_pci_dev(hcd->self.sysdev)); spin_lock_irq(&xhci->lock); xhci_halt(xhci); @@ -765,7 +722,7 @@ void xhci_shutdown(struct usb_hcd *hcd) /* Yet another workaround for spurious wakeups at shutdown with HSW */ if (xhci->quirks & XHCI_SPURIOUS_WAKEUP) - pci_set_power_state(to_pci_dev(hcd->self.controller), PCI_D3hot); + pci_set_power_state(to_pci_dev(hcd->self.sysdev), PCI_D3hot); } #ifdef CONFIG_PM @@ -1054,7 +1011,7 @@ int xhci_resume(struct xhci_hcd *xhci, bool hibernated) xhci_dbg(xhci, "// Disabling event ring interrupts\n"); temp = readl(&xhci->op_regs->status); - writel(temp & ~STS_EINT, &xhci->op_regs->status); + writel((temp & ~0x1fff) | STS_EINT, &xhci->op_regs->status); temp = readl(&xhci->ir_set->irq_pending); writel(ER_IRQ_DISABLE(temp), &xhci->ir_set->irq_pending); xhci_print_ir_set(xhci, 0); @@ -1176,7 +1133,7 @@ unsigned int xhci_get_endpoint_address(unsigned int ep_index) * endpoint index to create a bitmask. The slot context is bit 0, endpoint 0 is * bit 1, etc. */ -unsigned int xhci_get_endpoint_flag(struct usb_endpoint_descriptor *desc) +static unsigned int xhci_get_endpoint_flag(struct usb_endpoint_descriptor *desc) { return 1 << (xhci_get_endpoint_index(desc) + 1); } @@ -1185,7 +1142,7 @@ unsigned int xhci_get_endpoint_flag(struct usb_endpoint_descriptor *desc) * endpoint index to create a bitmask. The slot context is bit 0, endpoint 0 is * bit 1, etc. */ -unsigned int xhci_get_endpoint_flag_from_index(unsigned int ep_index) +static unsigned int xhci_get_endpoint_flag_from_index(unsigned int ep_index) { return 1 << (ep_index + 1); } @@ -1306,11 +1263,6 @@ static int xhci_check_maxpacket(struct xhci_hcd *xhci, unsigned int slot_id, ctrl_ctx->add_flags = cpu_to_le32(EP0_FLAG); ctrl_ctx->drop_flags = 0; - xhci_dbg(xhci, "Slot %d input context\n", slot_id); - xhci_dbg_ctx(xhci, command->in_ctx, ep_index); - xhci_dbg(xhci, "Slot %d output context\n", slot_id); - xhci_dbg_ctx(xhci, out_ctx, ep_index); - ret = xhci_configure_endpoint(xhci, urb->dev, command, true, false); @@ -1329,7 +1281,7 @@ static int xhci_check_maxpacket(struct xhci_hcd *xhci, unsigned int slot_id, * non-error returns are a promise to giveback() the urb later * we drop ownership so next owner (or urb unlink) can get it */ -int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags) +static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags) { struct xhci_hcd *xhci = hcd_to_xhci(hcd); unsigned long flags; @@ -1465,7 +1417,7 @@ int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flags) * Note that this function can be called in any context, or so says * usb_hcd_unlink_urb() */ -int xhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) +static int xhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) { unsigned long flags; int ret, i; @@ -1501,10 +1453,16 @@ int xhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) if (!ep || !ep_ring) goto err_giveback; + /* If xHC is dead take it down and return ALL URBs in xhci_hc_died() */ temp = readl(&xhci->op_regs->status); - if (temp == 0xffffffff || (xhci->xhc_state & XHCI_STATE_HALTED)) { + if (temp == ~(u32)0 || xhci->xhc_state & XHCI_STATE_DYING) { + xhci_hc_died(xhci); + goto done; + } + + if (xhci->xhc_state & 
XHCI_STATE_HALTED) { xhci_dbg_trace(xhci, trace_xhci_dbg_cancel_urb, - "HW died, freeing TD."); + "HC halted, freeing TD manually."); for (i = urb_priv->num_tds_done; i < urb_priv->num_tds; i++) { @@ -1576,7 +1534,7 @@ int xhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) * disabled, so there's no need for mutual exclusion to protect * the xhci->devs[slot_id] structure. */ -int xhci_drop_endpoint(struct usb_hcd *hcd, struct usb_device *udev, +static int xhci_drop_endpoint(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint *ep) { struct xhci_hcd *xhci; @@ -1659,7 +1617,7 @@ int xhci_drop_endpoint(struct usb_hcd *hcd, struct usb_device *udev, * configuration or alt setting is installed in the device, so there's no need * for mutual exclusion to protect the xhci->devs[slot_id] structure. */ -int xhci_add_endpoint(struct usb_hcd *hcd, struct usb_device *udev, +static int xhci_add_endpoint(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint *ep) { struct xhci_hcd *xhci; @@ -1852,7 +1810,6 @@ static int xhci_evaluate_context_result(struct xhci_hcd *xhci, struct usb_device *udev, u32 *cmd_status) { int ret; - struct xhci_virt_device *virt_dev = xhci->devs[udev->slot_id]; switch (*cmd_status) { case COMP_COMMAND_ABORTED: @@ -1873,7 +1830,6 @@ static int xhci_evaluate_context_result(struct xhci_hcd *xhci, case COMP_CONTEXT_STATE_ERROR: dev_warn(&udev->dev, "WARN: invalid context state for evaluate context command.\n"); - xhci_dbg_ctx(xhci, virt_dev->out_ctx, 1); ret = -EINVAL; break; case COMP_INCOMPATIBLE_DEVICE_ERROR: @@ -2330,7 +2286,7 @@ static unsigned int xhci_get_ss_bw_consumed(struct xhci_bw_info *ep_bw) } -void xhci_drop_ep_from_interval_table(struct xhci_hcd *xhci, +static void xhci_drop_ep_from_interval_table(struct xhci_hcd *xhci, struct xhci_bw_info *ep_bw, struct xhci_interval_bw_table *bw_table, struct usb_device *udev, @@ -2595,6 +2551,12 @@ static int xhci_configure_endpoint(struct xhci_hcd *xhci, return -EINVAL; spin_lock_irqsave(&xhci->lock, flags); + + if (xhci->xhc_state & XHCI_STATE_DYING) { + spin_unlock_irqrestore(&xhci->lock, flags); + return -ESHUTDOWN; + } + virt_dev = xhci->devs[udev->slot_id]; ctrl_ctx = xhci_get_input_control_ctx(command->in_ctx); @@ -2689,7 +2651,7 @@ static void xhci_check_bw_drop_ep_streams(struct xhci_hcd *xhci, * else should be touching the xhci->devs[slot_id] structure, so we * don't need to take the xhci->lock for manipulating that. */ -int xhci_check_bandwidth(struct usb_hcd *hcd, struct usb_device *udev) +static int xhci_check_bandwidth(struct usb_hcd *hcd, struct usb_device *udev) { int i; int ret = 0; @@ -2746,9 +2708,6 @@ int xhci_check_bandwidth(struct usb_hcd *hcd, struct usb_device *udev) break; } } - xhci_dbg(xhci, "New Input Control Context:\n"); - xhci_dbg_ctx(xhci, virt_dev->in_ctx, - LAST_CTX_TO_EP_NUM(le32_to_cpu(slot_ctx->dev_info))); ret = xhci_configure_endpoint(xhci, udev, command, false, false); @@ -2756,10 +2715,6 @@ int xhci_check_bandwidth(struct usb_hcd *hcd, struct usb_device *udev) /* Callee should call reset_bandwidth() */ goto command_cleanup; - xhci_dbg(xhci, "Output context after successful config ep cmd:\n"); - xhci_dbg_ctx(xhci, virt_dev->out_ctx, - LAST_CTX_TO_EP_NUM(le32_to_cpu(slot_ctx->dev_info))); - /* Free any rings that were dropped, but not changed. 
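Note the recurring == ~(u32)0 and == ~(u64)0 tests introduced above: on PCI, a read from a surprise-removed device returns all ones, so an all-ones register value is taken as proof the controller is gone, and the interrupt, dequeue, and command-timeout paths now all converge on xhci_hc_died() instead of ad-hoc cleanup. The check in isolation (hypothetical helper, not part of the patch):

/* PCI reads from a removed device return all ones. */
static bool example_host_vanished(struct xhci_hcd *xhci)
{
	u32 status = readl(&xhci->op_regs->status);

	return status == ~(u32)0;
}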
*/ for (i = 1; i < 31; i++) { if ((le32_to_cpu(ctrl_ctx->drop_flags) & (1 << (i + 1))) && @@ -2793,7 +2748,7 @@ int xhci_check_bandwidth(struct usb_hcd *hcd, struct usb_device *udev) return ret; } -void xhci_reset_bandwidth(struct usb_hcd *hcd, struct usb_device *udev) +static void xhci_reset_bandwidth(struct usb_hcd *hcd, struct usb_device *udev) { struct xhci_hcd *xhci; struct xhci_virt_device *virt_dev; @@ -2826,9 +2781,6 @@ static void xhci_setup_input_ctx_for_config_ep(struct xhci_hcd *xhci, ctrl_ctx->drop_flags = cpu_to_le32(drop_flags); xhci_slot_copy(xhci, in_ctx, out_ctx); ctrl_ctx->add_flags |= cpu_to_le32(SLOT_FLAG); - - xhci_dbg(xhci, "Input Context:\n"); - xhci_dbg_ctx(xhci, in_ctx, xhci_last_valid_endpoint(add_flags)); } static void xhci_setup_input_ctx_for_quirk(struct xhci_hcd *xhci, @@ -2919,7 +2871,7 @@ void xhci_cleanup_stalled_ring(struct xhci_hcd *xhci, * Context: in_interrupt */ -void xhci_endpoint_reset(struct usb_hcd *hcd, +static void xhci_endpoint_reset(struct usb_hcd *hcd, struct usb_host_endpoint *ep) { struct xhci_hcd *xhci; @@ -3095,7 +3047,7 @@ static u32 xhci_calculate_no_streams_bitmask(struct xhci_hcd *xhci, * hardware or endpoints claim they can't support the number of requested * stream IDs. */ -int xhci_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev, +static int xhci_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, unsigned int num_streams, gfp_t mem_flags) { @@ -3129,10 +3081,9 @@ int xhci_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev, } config_cmd = xhci_alloc_command(xhci, true, true, mem_flags); - if (!config_cmd) { - xhci_dbg(xhci, "Could not allocate xHCI command structure.\n"); + if (!config_cmd) return -ENOMEM; - } + ctrl_ctx = xhci_get_input_control_ctx(config_cmd->in_ctx); if (!ctrl_ctx) { xhci_warn(xhci, "%s: Could not get input context, bad type.\n", @@ -3259,7 +3210,7 @@ int xhci_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev, * Modify the endpoint context state, submit a configure endpoint command, * and free all endpoint rings for streams if that completes successfully. */ -int xhci_free_streams(struct usb_hcd *hcd, struct usb_device *udev, +static int xhci_free_streams(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint **eps, unsigned int num_eps, gfp_t mem_flags) { @@ -3391,7 +3342,8 @@ void xhci_free_device_endpoint_resources(struct xhci_hcd *xhci, * re-initialization during S3/S4. In this case, call xhci_alloc_dev() to * re-allocate the device. */ -int xhci_discover_or_reset_device(struct usb_hcd *hcd, struct usb_device *udev) +static int xhci_discover_or_reset_device(struct usb_hcd *hcd, + struct usb_device *udev) { int ret, i; unsigned long flags; @@ -3443,6 +3395,8 @@ int xhci_discover_or_reset_device(struct usb_hcd *hcd, struct usb_device *udev) SLOT_STATE_DISABLED) return 0; + trace_xhci_discover_or_reset_device(slot_ctx); + xhci_dbg(xhci, "Resetting device with slot ID %u\n", slot_id); /* Allocate the command structure that holds the struct completion. 
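The reset-device path uses the usual xHCI command pattern: allocate an xhci_command that carries a struct completion, queue it, ring the doorbell, and sleep until handle_cmd_completion() completes it. Roughly, using the xhci_alloc_command(xhci, in_ctx, completion, gfp) signature seen elsewhere in this patch (sketch only, error handling trimmed):

struct xhci_command *cmd;

/* false: no input context needed; true: allocate cmd->completion. */
cmd = xhci_alloc_command(xhci, false, true, GFP_NOIO);
if (!cmd)
	return -ENOMEM;

/* ... queue the command and ring the host controller doorbell ... */

wait_for_completion(cmd->completion);	/* woken from the event handler */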
* Assume we're in process context, since the normal device reset @@ -3539,9 +3493,6 @@ int xhci_discover_or_reset_device(struct usb_hcd *hcd, struct usb_device *udev) } /* If necessary, update the number of active TTs on this root port */ xhci_update_tt_active_eps(xhci, virt_dev, old_active_eps); - - xhci_dbg(xhci, "Output context after successful reset device cmd:\n"); - xhci_dbg_ctx(xhci, virt_dev->out_ctx, last_freed_endpoint); ret = 0; command_cleanup: @@ -3554,12 +3505,11 @@ int xhci_discover_or_reset_device(struct usb_hcd *hcd, struct usb_device *udev) * disconnected, and all traffic has been stopped and the endpoints have been * disabled. Free any HC data structures associated with that device. */ -void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev) +static void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev) { struct xhci_hcd *xhci = hcd_to_xhci(hcd); struct xhci_virt_device *virt_dev; - unsigned long flags; - u32 state; + struct xhci_slot_ctx *slot_ctx; int i, ret; struct xhci_command *command; @@ -3587,6 +3537,8 @@ void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev) } virt_dev = xhci->devs[udev->slot_id]; + slot_ctx = xhci_get_slot_ctx(xhci, virt_dev->out_ctx); + trace_xhci_free_dev(slot_ctx); /* Stop any wayward timer functions (which may grab the lock) */ for (i = 0; i < 31; i++) { @@ -3594,30 +3546,50 @@ void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev) del_timer_sync(&virt_dev->eps[i].stop_cmd_timer); } + xhci_disable_slot(xhci, command, udev->slot_id); + /* + * Event command completion handler will free any data structures + * associated with the slot. XXX Can free sleep? + */ +} + +int xhci_disable_slot(struct xhci_hcd *xhci, struct xhci_command *command, + u32 slot_id) +{ + unsigned long flags; + u32 state; + int ret = 0; + struct xhci_virt_device *virt_dev; + + virt_dev = xhci->devs[slot_id]; + if (!virt_dev) + return -EINVAL; + if (!command) + command = xhci_alloc_command(xhci, false, false, GFP_KERNEL); + if (!command) + return -ENOMEM; + spin_lock_irqsave(&xhci->lock, flags); /* Don't disable the slot if the host controller is dead. */ state = readl(&xhci->op_regs->status); if (state == 0xffffffff || (xhci->xhc_state & XHCI_STATE_DYING) || (xhci->xhc_state & XHCI_STATE_HALTED)) { - xhci_free_virt_device(xhci, udev->slot_id); + xhci_free_virt_device(xhci, slot_id); spin_unlock_irqrestore(&xhci->lock, flags); kfree(command); - return; + return ret; } - if (xhci_queue_slot_control(xhci, command, TRB_DISABLE_SLOT, - udev->slot_id)) { + ret = xhci_queue_slot_control(xhci, command, TRB_DISABLE_SLOT, + slot_id); + if (ret) { spin_unlock_irqrestore(&xhci->lock, flags); xhci_dbg(xhci, "FIXME: allocate a command ring segment\n"); - return; + return ret; } xhci_ring_cmd_db(xhci); spin_unlock_irqrestore(&xhci->lock, flags); - - /* - * Event command completion handler will free any data structures - * associated with the slot. XXX Can free sleep? 
- */ + return ret; } /* @@ -3650,6 +3622,8 @@ static int xhci_reserve_host_control_ep_resources(struct xhci_hcd *xhci) int xhci_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev) { struct xhci_hcd *xhci = hcd_to_xhci(hcd); + struct xhci_virt_device *vdev; + struct xhci_slot_ctx *slot_ctx; unsigned long flags; int ret, slot_id; struct xhci_command *command; @@ -3705,6 +3679,10 @@ int xhci_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev) xhci_warn(xhci, "Could not allocate xHCI USB device data structures\n"); goto disable_slot; } + vdev = xhci->devs[slot_id]; + slot_ctx = xhci_get_slot_ctx(xhci, vdev->out_ctx); + trace_xhci_alloc_dev(slot_ctx); + udev->slot_id = slot_id; #ifndef CONFIG_USB_DEFAULT_PERSIST @@ -3724,15 +3702,10 @@ int xhci_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev) disable_slot: /* Disable slot, if we can do it without mem alloc */ - spin_lock_irqsave(&xhci->lock, flags); kfree(command->completion); command->completion = NULL; command->status = 0; - if (!xhci_queue_slot_control(xhci, command, TRB_DISABLE_SLOT, - udev->slot_id)) - xhci_ring_cmd_db(xhci); - spin_unlock_irqrestore(&xhci->lock, flags); - return 0; + return xhci_disable_slot(xhci, command, udev->slot_id); } /* @@ -3779,9 +3752,10 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev, ret = -EINVAL; goto out; } + slot_ctx = xhci_get_slot_ctx(xhci, virt_dev->out_ctx); + trace_xhci_setup_device_slot(slot_ctx); if (setup == SETUP_CONTEXT_ONLY) { - slot_ctx = xhci_get_slot_ctx(xhci, virt_dev->out_ctx); if (GET_SLOT_STATE(le32_to_cpu(slot_ctx->dev_state)) == SLOT_STATE_DEFAULT) { xhci_dbg(xhci, "Slot already in default state\n"); @@ -3818,8 +3792,6 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev, ctrl_ctx->add_flags = cpu_to_le32(SLOT_FLAG | EP0_FLAG); ctrl_ctx->drop_flags = 0; - xhci_dbg(xhci, "Slot ID %d Input Context:\n", udev->slot_id); - xhci_dbg_ctx(xhci, virt_dev->in_ctx, 2); trace_xhci_address_ctx(xhci, virt_dev->in_ctx, le32_to_cpu(slot_ctx->dev_info) >> 27); @@ -3872,8 +3844,6 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev, xhci_err(xhci, "ERROR: unexpected setup %s command completion code 0x%x.\n", act, command->status); - xhci_dbg(xhci, "Slot ID %d Output Context:\n", udev->slot_id); - xhci_dbg_ctx(xhci, virt_dev->out_ctx, 2); trace_xhci_address_ctx(xhci, virt_dev->out_ctx, 1); ret = -EINVAL; break; @@ -3892,17 +3862,12 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev, xhci_dbg_trace(xhci, trace_xhci_dbg_address, "Output Context DMA address = %#08llx", (unsigned long long)virt_dev->out_ctx->dma); - xhci_dbg(xhci, "Slot ID %d Input Context:\n", udev->slot_id); - xhci_dbg_ctx(xhci, virt_dev->in_ctx, 2); trace_xhci_address_ctx(xhci, virt_dev->in_ctx, le32_to_cpu(slot_ctx->dev_info) >> 27); - xhci_dbg(xhci, "Slot ID %d Output Context:\n", udev->slot_id); - xhci_dbg_ctx(xhci, virt_dev->out_ctx, 2); /* * USB core uses address 1 for the roothubs, so we add one to the * address given back to us by the HC. 
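With the disable-slot logic factored into the new xhci_disable_slot(), both xhci_free_dev() and the xhci_alloc_dev() error path above converge on one helper. A caller without a pre-allocated command can pass NULL and let the helper allocate its own (hypothetical usage, not taken from the patch):

int ret;

/* NULL command: xhci_disable_slot() allocates one with GFP_KERNEL. */
ret = xhci_disable_slot(xhci, NULL, udev->slot_id);
if (ret)
	xhci_warn(xhci, "disable slot %u failed: %d\n",
		  udev->slot_id, ret);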
*/ - slot_ctx = xhci_get_slot_ctx(xhci, virt_dev->out_ctx); trace_xhci_address_ctx(xhci, virt_dev->out_ctx, le32_to_cpu(slot_ctx->dev_info) >> 27); /* Zero the input context control for later use */ @@ -3921,12 +3886,12 @@ static int xhci_setup_device(struct usb_hcd *hcd, struct usb_device *udev, return ret; } -int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev) +static int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev) { return xhci_setup_device(hcd, udev, SETUP_CONTEXT_ADDRESS); } -int xhci_enable_device(struct usb_hcd *hcd, struct usb_device *udev) +static int xhci_enable_device(struct usb_hcd *hcd, struct usb_device *udev) { return xhci_setup_device(hcd, udev, SETUP_CONTEXT_ONLY); } @@ -4003,14 +3968,10 @@ static int __maybe_unused xhci_change_max_exit_latency(struct xhci_hcd *xhci, xhci_dbg_trace(xhci, trace_xhci_dbg_context_change, "Set up evaluate context for LPM MEL change."); - xhci_dbg(xhci, "Slot %u Input Context:\n", udev->slot_id); - xhci_dbg_ctx(xhci, command->in_ctx, 0); /* Issue and wait for the evaluate context command. */ ret = xhci_configure_endpoint(xhci, udev, command, true, true); - xhci_dbg(xhci, "Slot %u Output Context:\n", udev->slot_id); - xhci_dbg_ctx(xhci, virt_dev->out_ctx, 0); if (!ret) { spin_lock_irqsave(&xhci->lock, flags); @@ -4083,7 +4044,7 @@ static int xhci_calculate_usb2_hw_lpm_params(struct usb_device *udev) return PORT_BESLD(besld) | PORT_L1_TIMEOUT(l1) | PORT_HIRDM(hirdm); } -int xhci_set_usb2_hardware_lpm(struct usb_hcd *hcd, +static int xhci_set_usb2_hardware_lpm(struct usb_hcd *hcd, struct usb_device *udev, int enable) { struct xhci_hcd *xhci = hcd_to_xhci(hcd); @@ -4207,7 +4168,7 @@ static int xhci_check_usb2_port_capability(struct xhci_hcd *xhci, int port, return 0; } -int xhci_update_device(struct usb_hcd *hcd, struct usb_device *udev) +static int xhci_update_device(struct usb_hcd *hcd, struct usb_device *udev) { struct xhci_hcd *xhci = hcd_to_xhci(hcd); int portnum = udev->portnum - 1; @@ -4616,7 +4577,7 @@ static int calculate_max_exit_latency(struct usb_device *udev, } /* Returns the USB3 hub-encoded value for the U1/U2 timeout. 
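For reference, xhci_calculate_usb2_hw_lpm_params() above packs three fields into a single hardware-LPM port register value using the shift/mask macros from xhci.h; for example (field values invented purely for illustration):

u32 besld = 4;	/* deep BESL */
u32 l1 = 3;	/* L1 timeout, in 256 us units */
u32 hirdm = 1;	/* use the BESL rather than the HIRD encoding */
u32 field = PORT_BESLD(besld) | PORT_L1_TIMEOUT(l1) | PORT_HIRDM(hirdm);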
*/ -int xhci_enable_usb3_lpm_timeout(struct usb_hcd *hcd, +static int xhci_enable_usb3_lpm_timeout(struct usb_hcd *hcd, struct usb_device *udev, enum usb3_link_state state) { struct xhci_hcd *xhci; @@ -4647,7 +4608,7 @@ int xhci_enable_usb3_lpm_timeout(struct usb_hcd *hcd, return hub_encoded_timeout; } -int xhci_disable_usb3_lpm_timeout(struct usb_hcd *hcd, +static int xhci_disable_usb3_lpm_timeout(struct usb_hcd *hcd, struct usb_device *udev, enum usb3_link_state state) { struct xhci_hcd *xhci; @@ -4663,24 +4624,24 @@ int xhci_disable_usb3_lpm_timeout(struct usb_hcd *hcd, } #else /* CONFIG_PM */ -int xhci_set_usb2_hardware_lpm(struct usb_hcd *hcd, +static int xhci_set_usb2_hardware_lpm(struct usb_hcd *hcd, struct usb_device *udev, int enable) { return 0; } -int xhci_update_device(struct usb_hcd *hcd, struct usb_device *udev) +static int xhci_update_device(struct usb_hcd *hcd, struct usb_device *udev) { return 0; } -int xhci_enable_usb3_lpm_timeout(struct usb_hcd *hcd, +static int xhci_enable_usb3_lpm_timeout(struct usb_hcd *hcd, struct usb_device *udev, enum usb3_link_state state) { return USB3_LPM_DISABLED; } -int xhci_disable_usb3_lpm_timeout(struct usb_hcd *hcd, +static int xhci_disable_usb3_lpm_timeout(struct usb_hcd *hcd, struct usb_device *udev, enum usb3_link_state state) { return 0; @@ -4692,7 +4653,7 @@ int xhci_disable_usb3_lpm_timeout(struct usb_hcd *hcd, /* Once a hub descriptor is fetched for a device, we need to update the xHC's * internal data structures for the device. */ -int xhci_update_hub_device(struct usb_hcd *hcd, struct usb_device *hdev, +static int xhci_update_hub_device(struct usb_hcd *hcd, struct usb_device *hdev, struct usb_tt *tt, gfp_t mem_flags) { struct xhci_hcd *xhci = hcd_to_xhci(hcd); @@ -4713,11 +4674,11 @@ int xhci_update_hub_device(struct usb_hcd *hcd, struct usb_device *hdev, xhci_warn(xhci, "Cannot update hub desc for unknown device.\n"); return -EINVAL; } + config_cmd = xhci_alloc_command(xhci, true, true, mem_flags); - if (!config_cmd) { - xhci_dbg(xhci, "Could not allocate xHCI command structure.\n"); + if (!config_cmd) return -ENOMEM; - } + ctrl_ctx = xhci_get_input_control_ctx(config_cmd->in_ctx); if (!ctrl_ctx) { xhci_warn(xhci, "%s: Could not get input context, bad type.\n", @@ -4778,8 +4739,6 @@ int xhci_update_hub_device(struct usb_hcd *hcd, struct usb_device *hdev, xhci_dbg(xhci, "Set up %s for hub device.\n", (xhci->hci_version > 0x95) ? "configure endpoint" : "evaluate context"); - xhci_dbg(xhci, "Slot %u Input Context:\n", hdev->slot_id); - xhci_dbg_ctx(xhci, config_cmd->in_ctx, 0); /* Issue and wait for the configure endpoint or * evaluate context command. @@ -4791,14 +4750,11 @@ int xhci_update_hub_device(struct usb_hcd *hcd, struct usb_device *hdev, ret = xhci_configure_endpoint(xhci, hdev, config_cmd, true, false); - xhci_dbg(xhci, "Slot %u Output Context:\n", hdev->slot_id); - xhci_dbg_ctx(xhci, vdev->out_ctx, 0); - xhci_free_command(xhci, config_cmd); return ret; } -int xhci_get_frame(struct usb_hcd *hcd) +static int xhci_get_frame(struct usb_hcd *hcd) { struct xhci_hcd *xhci = hcd_to_xhci(hcd); /* EHCI mods by the periodic size. Why? 
*/ @@ -4808,7 +4764,11 @@ int xhci_get_frame(struct usb_hcd *hcd) int xhci_gen_setup(struct usb_hcd *hcd, xhci_get_quirks_t get_quirks) { struct xhci_hcd *xhci; - struct device *dev = hcd->self.controller; + /* + * TODO: Check with DWC3 clients for sysdev according to + * quirks + */ + struct device *dev = hcd->self.sysdev; int retval; /* Accept arbitrarily long scatter-gather lists */ diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index da3eb695fe5407a6e8fc5dbab8ac5b97a54e32bd..73a28a986d5e408f14e9b526d0d66ad18072d77d 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -425,6 +425,7 @@ struct xhci_op_regs { #define PORT_L1DS_MASK (0xff << 8) #define PORT_L1DS(p) (((p) & 0xff) << 8) #define PORT_HLE (1 << 16) +#define PORT_TEST_MODE_SHIFT 28 /* USB3 Protocol PORTLI Port Link Information */ #define PORT_RX_LANES(p) (((p) >> 16) & 0xf) @@ -617,6 +618,7 @@ struct xhci_slot_ctx { #define ROUTE_STRING_MASK (0xfffff) /* Device speed - values defined by PORTSC Device Speed field - 20:23 */ #define DEV_SPEED (0xf << 20) +#define GET_DEV_SPEED(n) (((n) & DEV_SPEED) >> 20) /* bit 24 reserved */ /* Is this LS/FS device connected through a HS hub? - bit 25 */ #define DEV_MTT (0x1 << 25) @@ -637,6 +639,7 @@ struct xhci_slot_ctx { #define DEVINFO_TO_ROOT_HUB_PORT(p) (((p) >> 16) & 0xff) /* Maximum number of ports under a hub device */ #define XHCI_MAX_PORTS(p) (((p) & 0xff) << 24) +#define DEVINFO_TO_MAX_PORTS(p) (((p) & (0xff << 24)) >> 24) /* tt_info bitmasks */ /* @@ -651,6 +654,7 @@ struct xhci_slot_ctx { */ #define TT_PORT (0xff << 8) #define TT_THINK_TIME(p) (((p) & 0x3) << 16) +#define GET_TT_THINK_TIME(p) (((p) & (0x3 << 16)) >> 16) /* dev_state bitmasks */ /* USB device address - assigned by the HC */ @@ -1562,10 +1566,8 @@ struct xhci_ring { struct xhci_segment *last_seg; union xhci_trb *enqueue; struct xhci_segment *enq_seg; - unsigned int enq_updates; union xhci_trb *dequeue; struct xhci_segment *deq_seg; - unsigned int deq_updates; struct list_head td_list; /* * Write the cycle state into the TRB cycle field to give ownership of @@ -1604,7 +1606,6 @@ struct xhci_scratchpad { u64 *sp_array; dma_addr_t sp_dma; void **sp_buffers; - dma_addr_t *sp_dma_buffers; }; struct urb_priv { @@ -1722,7 +1723,6 @@ struct xhci_hcd { int page_shift; /* msi-x vectors */ int msix_count; - struct msix_entry *msix_entries; /* optional clock */ struct clk *clk; /* data structures */ @@ -1818,6 +1818,7 @@ struct xhci_hcd { #define XHCI_MISSING_CAS (1 << 24) /* For controller with a broken Port Disable implementation */ #define XHCI_BROKEN_PORT_PED (1 << 25) +#define XHCI_LIMIT_ENDPOINT_INTERVAL_7 (1 << 26) unsigned int num_active_eps; unsigned int limit_active_eps; @@ -1843,6 +1844,7 @@ struct xhci_hcd { /* Compliance Mode Recovery Data */ struct timer_list comp_mode_recovery_timer; u32 port_status_u0; + u16 test_mode; /* Compliance Mode Timer Triggered every 2 seconds */ #define COMP_MODE_RCVRY_MSECS 2000 @@ -1918,19 +1920,10 @@ void xhci_print_ir_set(struct xhci_hcd *xhci, int set_num); void xhci_print_registers(struct xhci_hcd *xhci); void xhci_dbg_regs(struct xhci_hcd *xhci); void xhci_print_run_regs(struct xhci_hcd *xhci); -void xhci_print_trb_offsets(struct xhci_hcd *xhci, union xhci_trb *trb); -void xhci_debug_trb(struct xhci_hcd *xhci, union xhci_trb *trb); -void xhci_debug_segment(struct xhci_hcd *xhci, struct xhci_segment *seg); -void xhci_debug_ring(struct xhci_hcd *xhci, struct xhci_ring *ring); void xhci_dbg_erst(struct xhci_hcd *xhci, struct xhci_erst *erst); void 
xhci_dbg_cmd_ptrs(struct xhci_hcd *xhci); -void xhci_dbg_ring_ptrs(struct xhci_hcd *xhci, struct xhci_ring *ring); -void xhci_dbg_ctx(struct xhci_hcd *xhci, struct xhci_container_ctx *ctx, unsigned int last_ep); char *xhci_get_slot_state(struct xhci_hcd *xhci, struct xhci_container_ctx *ctx); -void xhci_dbg_ep_rings(struct xhci_hcd *xhci, - unsigned int slot_id, unsigned int ep_index, - struct xhci_virt_ep *ep); void xhci_dbg_trace(struct xhci_hcd *xhci, void (*trace)(struct va_format *), const char *fmt, ...); @@ -1944,16 +1937,8 @@ void xhci_copy_ep0_dequeue_into_input_ctx(struct xhci_hcd *xhci, struct usb_device *udev); unsigned int xhci_get_endpoint_index(struct usb_endpoint_descriptor *desc); unsigned int xhci_get_endpoint_address(unsigned int ep_index); -unsigned int xhci_get_endpoint_flag(struct usb_endpoint_descriptor *desc); -unsigned int xhci_get_endpoint_flag_from_index(unsigned int ep_index); unsigned int xhci_last_valid_endpoint(u32 added_ctxs); void xhci_endpoint_zero(struct xhci_hcd *xhci, struct xhci_virt_device *virt_dev, struct usb_host_endpoint *ep); -void xhci_drop_ep_from_interval_table(struct xhci_hcd *xhci, - struct xhci_bw_info *ep_bw, - struct xhci_interval_bw_table *bw_table, - struct usb_device *udev, - struct xhci_virt_ep *virt_ep, - struct xhci_tt_bw_info *tt_info); void xhci_update_tt_active_eps(struct xhci_hcd *xhci, struct xhci_virt_device *virt_dev, int old_active_eps); @@ -2010,53 +1995,25 @@ typedef void (*xhci_get_quirks_t)(struct device *, struct xhci_hcd *); int xhci_handshake(void __iomem *ptr, u32 mask, u32 done, int usec); void xhci_quiesce(struct xhci_hcd *xhci); int xhci_halt(struct xhci_hcd *xhci); +int xhci_start(struct xhci_hcd *xhci); int xhci_reset(struct xhci_hcd *xhci); -int xhci_init(struct usb_hcd *hcd); int xhci_run(struct usb_hcd *hcd); -void xhci_stop(struct usb_hcd *hcd); -void xhci_shutdown(struct usb_hcd *hcd); int xhci_gen_setup(struct usb_hcd *hcd, xhci_get_quirks_t get_quirks); void xhci_init_driver(struct hc_driver *drv, const struct xhci_driver_overrides *over); +int xhci_disable_slot(struct xhci_hcd *xhci, + struct xhci_command *command, u32 slot_id); -#ifdef CONFIG_PM int xhci_suspend(struct xhci_hcd *xhci, bool do_wakeup); int xhci_resume(struct xhci_hcd *xhci, bool hibernated); -#else -#define xhci_suspend NULL -#define xhci_resume NULL -#endif -int xhci_get_frame(struct usb_hcd *hcd); irqreturn_t xhci_irq(struct usb_hcd *hcd); irqreturn_t xhci_msi_irq(int irq, void *hcd); int xhci_alloc_dev(struct usb_hcd *hcd, struct usb_device *udev); -void xhci_free_dev(struct usb_hcd *hcd, struct usb_device *udev); int xhci_alloc_tt_info(struct xhci_hcd *xhci, struct xhci_virt_device *virt_dev, struct usb_device *hdev, struct usb_tt *tt, gfp_t mem_flags); -int xhci_alloc_streams(struct usb_hcd *hcd, struct usb_device *udev, - struct usb_host_endpoint **eps, unsigned int num_eps, - unsigned int num_streams, gfp_t mem_flags); -int xhci_free_streams(struct usb_hcd *hcd, struct usb_device *udev, - struct usb_host_endpoint **eps, unsigned int num_eps, - gfp_t mem_flags); -int xhci_address_device(struct usb_hcd *hcd, struct usb_device *udev); -int xhci_enable_device(struct usb_hcd *hcd, struct usb_device *udev); -int xhci_update_device(struct usb_hcd *hcd, struct usb_device *udev); -int xhci_set_usb2_hardware_lpm(struct usb_hcd *hcd, - struct usb_device *udev, int enable); -int xhci_update_hub_device(struct usb_hcd *hcd, struct usb_device *hdev, - struct usb_tt *tt, gfp_t mem_flags); -int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb 
*urb, gfp_t mem_flags); -int xhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status); -int xhci_add_endpoint(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint *ep); -int xhci_drop_endpoint(struct usb_hcd *hcd, struct usb_device *udev, struct usb_host_endpoint *ep); -void xhci_endpoint_reset(struct usb_hcd *hcd, struct usb_host_endpoint *ep); -int xhci_discover_or_reset_device(struct usb_hcd *hcd, struct usb_device *udev); -int xhci_check_bandwidth(struct usb_hcd *hcd, struct usb_device *udev); -void xhci_reset_bandwidth(struct usb_hcd *hcd, struct usb_device *udev); /* xHCI ring, segment, TRB, and TD functions */ dma_addr_t xhci_trb_virt_to_dma(struct xhci_segment *seg, union xhci_trb *trb); @@ -2100,9 +2057,6 @@ void xhci_queue_new_dequeue_state(struct xhci_hcd *xhci, struct xhci_dequeue_state *deq_state); void xhci_cleanup_stalled_ring(struct xhci_hcd *xhci, unsigned int ep_index, struct xhci_td *td); -void xhci_queue_config_ep_quirk(struct xhci_hcd *xhci, - unsigned int slot_id, unsigned int ep_index, - struct xhci_dequeue_state *deq_state); void xhci_stop_endpoint_command_watchdog(unsigned long arg); void xhci_handle_command_timeout(struct work_struct *work); @@ -2113,16 +2067,13 @@ void xhci_cleanup_command_queue(struct xhci_hcd *xhci); /* xHCI roothub code */ void xhci_set_link_state(struct xhci_hcd *xhci, __le32 __iomem **port_array, int port_id, u32 link_state); -int xhci_enable_usb3_lpm_timeout(struct usb_hcd *hcd, - struct usb_device *udev, enum usb3_link_state state); -int xhci_disable_usb3_lpm_timeout(struct usb_hcd *hcd, - struct usb_device *udev, enum usb3_link_state state); void xhci_test_and_clear_bit(struct xhci_hcd *xhci, __le32 __iomem **port_array, int port_id, u32 port_bit); int xhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, u16 wIndex, char *buf, u16 wLength); int xhci_hub_status_data(struct usb_hcd *hcd, char *buf); int xhci_find_raw_port_number(struct usb_hcd *hcd, int port1); +void xhci_hc_died(struct xhci_hcd *xhci); #ifdef CONFIG_PM int xhci_bus_suspend(struct usb_hcd *hcd); @@ -2153,6 +2104,22 @@ static inline struct xhci_ring *xhci_urb_to_transfer_ring(struct xhci_hcd *xhci, urb->stream_id); } +static inline char *xhci_slot_state_string(u32 state) +{ + switch (state) { + case SLOT_STATE_ENABLED: + return "enabled/disabled"; + case SLOT_STATE_DEFAULT: + return "default"; + case SLOT_STATE_ADDRESSED: + return "addressed"; + case SLOT_STATE_CONFIGURED: + return "configured"; + default: + return "reserved"; + } +} + static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, u32 field3) { @@ -2162,14 +2129,12 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, switch (type) { case TRB_LINK: sprintf(str, - "TRB %08x%08x status '%s' len %d slot %d ep %d type '%s' flags %c:%c", - field1, field0, - xhci_trb_comp_code_string(GET_COMP_CODE(field2)), - EVENT_TRB_LEN(field2), TRB_TO_SLOT_ID(field3), - /* Macro decrements 1, maybe it shouldn't?!? */ - TRB_TO_EP_INDEX(field3) + 1, - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), - field3 & EVENT_DATA ? 'E' : 'e', + "LINK %08x%08x intr %d type '%s' flags %c:%c:%c:%c", + field1, field0, GET_INTR_TARGET(field2), + xhci_trb_type_string(type), + field3 & TRB_IOC ? 'I' : 'i', + field3 & TRB_CHAIN ? 'C' : 'c', + field3 & TRB_TC ? 'T' : 't', field3 & TRB_CYCLE ? 
'C' : 'c'); break; case TRB_TRANSFER: @@ -2187,37 +2152,52 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, EVENT_TRB_LEN(field2), TRB_TO_SLOT_ID(field3), /* Macro decrements 1, maybe it shouldn't?!? */ TRB_TO_EP_INDEX(field3) + 1, - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), field3 & EVENT_DATA ? 'E' : 'e', field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_SETUP: - sprintf(str, - "bRequestType %02x bRequest %02x wValue %02x%02x wIndex %02x%02x wLength %d length %d TD size %d intr %d type '%s' flags %c:%c:%c:%c:%c:%c:%c:%c", - field0 & 0xff, - (field0 & 0xff00) >> 8, - (field0 & 0xff000000) >> 24, - (field0 & 0xff0000) >> 16, - (field1 & 0xff00) >> 8, - field1 & 0xff, - (field1 & 0xff000000) >> 16 | - (field1 & 0xff0000) >> 16, - TRB_LEN(field2), GET_TD_SIZE(field2), - GET_INTR_TARGET(field2), - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), - field3 & TRB_BEI ? 'B' : 'b', - field3 & TRB_IDT ? 'I' : 'i', - field3 & TRB_IOC ? 'I' : 'i', - field3 & TRB_CHAIN ? 'C' : 'c', - field3 & TRB_NO_SNOOP ? 'S' : 's', - field3 & TRB_ISP ? 'I' : 'i', - field3 & TRB_ENT ? 'E' : 'e', - field3 & TRB_CYCLE ? 'C' : 'c'); + sprintf(str, "bRequestType %02x bRequest %02x wValue %02x%02x wIndex %02x%02x wLength %d length %d TD size %d intr %d type '%s' flags %c:%c:%c", + field0 & 0xff, + (field0 & 0xff00) >> 8, + (field0 & 0xff000000) >> 24, + (field0 & 0xff0000) >> 16, + (field1 & 0xff00) >> 8, + field1 & 0xff, + (field1 & 0xff000000) >> 16 | + (field1 & 0xff0000) >> 16, + TRB_LEN(field2), GET_TD_SIZE(field2), + GET_INTR_TARGET(field2), + xhci_trb_type_string(type), + field3 & TRB_IDT ? 'I' : 'i', + field3 & TRB_IOC ? 'I' : 'i', + field3 & TRB_CYCLE ? 'C' : 'c'); break; - case TRB_NORMAL: case TRB_DATA: + sprintf(str, "Buffer %08x%08x length %d TD size %d intr %d type '%s' flags %c:%c:%c:%c:%c:%c:%c", + field1, field0, TRB_LEN(field2), GET_TD_SIZE(field2), + GET_INTR_TARGET(field2), + xhci_trb_type_string(type), + field3 & TRB_IDT ? 'I' : 'i', + field3 & TRB_IOC ? 'I' : 'i', + field3 & TRB_CHAIN ? 'C' : 'c', + field3 & TRB_NO_SNOOP ? 'S' : 's', + field3 & TRB_ISP ? 'I' : 'i', + field3 & TRB_ENT ? 'E' : 'e', + field3 & TRB_CYCLE ? 'C' : 'c'); + break; case TRB_STATUS: + sprintf(str, "Buffer %08x%08x length %d TD size %d intr %d type '%s' flags %c:%c:%c:%c", + field1, field0, TRB_LEN(field2), GET_TD_SIZE(field2), + GET_INTR_TARGET(field2), + xhci_trb_type_string(type), + field3 & TRB_IOC ? 'I' : 'i', + field3 & TRB_CHAIN ? 'C' : 'c', + field3 & TRB_ENT ? 'E' : 'e', + field3 & TRB_CYCLE ? 'C' : 'c'); + break; + case TRB_NORMAL: case TRB_ISOC: case TRB_EVENT_DATA: case TRB_TR_NOOP: @@ -2225,7 +2205,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, "Buffer %08x%08x length %d TD size %d intr %d type '%s' flags %c:%c:%c:%c:%c:%c:%c:%c", field1, field0, TRB_LEN(field2), GET_TD_SIZE(field2), GET_INTR_TARGET(field2), - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), field3 & TRB_BEI ? 'B' : 'b', field3 & TRB_IDT ? 'I' : 'i', field3 & TRB_IOC ? 'I' : 'i', @@ -2240,21 +2220,21 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, case TRB_ENABLE_SLOT: sprintf(str, "%s: flags %c", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), field3 & TRB_CYCLE ? 
'C' : 'c'); break; case TRB_DISABLE_SLOT: case TRB_NEG_BANDWIDTH: sprintf(str, "%s: slot %d flags %c", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), TRB_TO_SLOT_ID(field3), field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_ADDR_DEV: sprintf(str, "%s: ctx %08x%08x slot %d flags %c:%c", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), field1, field0, TRB_TO_SLOT_ID(field3), field3 & TRB_BSR ? 'B' : 'b', @@ -2263,7 +2243,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, case TRB_CONFIG_EP: sprintf(str, "%s: ctx %08x%08x slot %d flags %c:%c", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), field1, field0, TRB_TO_SLOT_ID(field3), field3 & TRB_DC ? 'D' : 'd', @@ -2272,7 +2252,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, case TRB_EVAL_CONTEXT: sprintf(str, "%s: ctx %08x%08x slot %d flags %c", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), field1, field0, TRB_TO_SLOT_ID(field3), field3 & TRB_CYCLE ? 'C' : 'c'); @@ -2280,7 +2260,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, case TRB_RESET_EP: sprintf(str, "%s: ctx %08x%08x slot %d ep %d flags %c", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), field1, field0, TRB_TO_SLOT_ID(field3), /* Macro decrements 1, maybe it shouldn't?!? */ @@ -2290,7 +2270,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, case TRB_STOP_RING: sprintf(str, "%s: slot %d sp %d ep %d flags %c", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), TRB_TO_SLOT_ID(field3), TRB_TO_SUSPEND_PORT(field3), /* Macro decrements 1, maybe it shouldn't?!? */ @@ -2300,7 +2280,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, case TRB_SET_DEQ: sprintf(str, "%s: deq %08x%08x stream %d slot %d ep %d flags %c", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), field1, field0, TRB_TO_STREAM_ID(field2), TRB_TO_SLOT_ID(field3), @@ -2311,14 +2291,14 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, case TRB_RESET_DEV: sprintf(str, "%s: slot %d flags %c", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), TRB_TO_SLOT_ID(field3), field3 & TRB_CYCLE ? 'C' : 'c'); break; case TRB_FORCE_EVENT: sprintf(str, "%s: event %08x%08x vf intr %d vf id %d flags %c", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), field1, field0, TRB_TO_VF_INTR_TARGET(field2), TRB_TO_VF_ID(field3), @@ -2327,14 +2307,14 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, case TRB_SET_LT: sprintf(str, "%s: belt %d flags %c", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), TRB_TO_BELT(field3), field3 & TRB_CYCLE ? 
'C' : 'c'); break; case TRB_GET_BW: sprintf(str, "%s: ctx %08x%08x slot %d speed %d flags %c", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), field1, field0, TRB_TO_SLOT_ID(field3), TRB_TO_DEV_SPEED(field3), @@ -2343,7 +2323,7 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, case TRB_FORCE_HEADER: sprintf(str, "%s: info %08x%08x%08x pkt type %d roothub port %d flags %c", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), field2, field1, field0 & 0xffffffe0, TRB_TO_PACKET_TYPE(field0), TRB_TO_ROOTHUB_PORT(field3), @@ -2352,12 +2332,155 @@ static inline const char *xhci_decode_trb(u32 field0, u32 field1, u32 field2, default: sprintf(str, "type '%s' -> raw %08x %08x %08x %08x", - xhci_trb_type_string(TRB_FIELD_TO_TYPE(field3)), + xhci_trb_type_string(type), field0, field1, field2, field3); } return str; } +static inline const char *xhci_decode_slot_context(u32 info, u32 info2, + u32 tt_info, u32 state) +{ + static char str[1024]; + u32 speed; + u32 hub; + u32 mtt; + int ret = 0; + + speed = info & DEV_SPEED; + hub = info & DEV_HUB; + mtt = info & DEV_MTT; + + ret = sprintf(str, "RS %05x %s%s%s Ctx Entries %d MEL %d us Port# %d/%d", + info & ROUTE_STRING_MASK, + ({ char *s; + switch (speed) { + case SLOT_SPEED_FS: + s = "full-speed"; + break; + case SLOT_SPEED_LS: + s = "low-speed"; + break; + case SLOT_SPEED_HS: + s = "high-speed"; + break; + case SLOT_SPEED_SS: + s = "super-speed"; + break; + case SLOT_SPEED_SSP: + s = "super-speed plus"; + break; + default: + s = "UNKNOWN speed"; + } s; }), + mtt ? " multi-TT" : "", + hub ? " Hub" : "", + (info & LAST_CTX_MASK) >> 27, + info2 & MAX_EXIT, + DEVINFO_TO_ROOT_HUB_PORT(info2), + DEVINFO_TO_MAX_PORTS(info2)); + + ret += sprintf(str + ret, " [TT Slot %d Port# %d TTT %d Intr %d] Addr %d State %s", + tt_info & TT_SLOT, (tt_info & TT_PORT) >> 8, + GET_TT_THINK_TIME(tt_info), GET_INTR_TARGET(tt_info), + state & DEV_ADDR_MASK, + xhci_slot_state_string(GET_SLOT_STATE(state))); + + return str; +} + +static inline const char *xhci_ep_state_string(u8 state) +{ + switch (state) { + case EP_STATE_DISABLED: + return "disabled"; + case EP_STATE_RUNNING: + return "running"; + case EP_STATE_HALTED: + return "halted"; + case EP_STATE_STOPPED: + return "stopped"; + case EP_STATE_ERROR: + return "error"; + default: + return "INVALID"; + } +} + +static inline const char *xhci_ep_type_string(u8 type) +{ + switch (type) { + case ISOC_OUT_EP: + return "Isoc OUT"; + case BULK_OUT_EP: + return "Bulk OUT"; + case INT_OUT_EP: + return "Int OUT"; + case CTRL_EP: + return "Ctrl"; + case ISOC_IN_EP: + return "Isoc IN"; + case BULK_IN_EP: + return "Bulk IN"; + case INT_IN_EP: + return "Int IN"; + default: + return "INVALID"; + } +} + +static inline const char *xhci_decode_ep_context(u32 info, u32 info2, u64 deq, + u32 tx_info) +{ + static char str[1024]; + int ret; + + u32 esit; + u16 maxp; + u16 avg; + + u8 max_pstr; + u8 ep_state; + u8 interval; + u8 ep_type; + u8 burst; + u8 cerr; + u8 mult; + u8 lsa; + u8 hid; + + esit = EP_MAX_ESIT_PAYLOAD_HI(info) << 16 | + EP_MAX_ESIT_PAYLOAD_LO(tx_info); + + ep_state = info & EP_STATE_MASK; + max_pstr = info & EP_MAXPSTREAMS_MASK; + interval = CTX_TO_EP_INTERVAL(info); + mult = CTX_TO_EP_MULT(info) + 1; + lsa = info & EP_HAS_LSA; + + cerr = (info2 & (3 << 1)) >> 1; + ep_type = CTX_TO_EP_TYPE(info2); + hid = info2 & (1 << 7); + burst = CTX_TO_MAX_BURST(info2); + maxp = MAX_PACKET_DECODED(info2); + + avg = EP_AVG_TRB_LENGTH(tx_info); + + ret = 
sprintf(str, "State %s mult %d max P. Streams %d %s", + xhci_ep_state_string(ep_state), mult, + max_pstr, lsa ? "LSA " : ""); + + ret += sprintf(str + ret, "interval %d us max ESIT payload %d CErr %d ", + (1 << interval) * 125, esit, cerr); + + ret += sprintf(str + ret, "Type %s %sburst %d maxp %d deq %016llx ", + xhci_ep_type_string(ep_type), hid ? "HID" : "", + burst, maxp, deq); + + ret += sprintf(str + ret, "avg trb len %d", avg); + + return str; +} #endif /* __LINUX_XHCI_HCD_H */ diff --git a/drivers/usb/isp1760/isp1760-if.c b/drivers/usb/isp1760/isp1760-if.c index 79205b31e4a9d5caf01d5c2e06f0349805cb9846..bc68bbab7fa170796e06efc11080c67598bcc32b 100644 --- a/drivers/usb/isp1760/isp1760-if.c +++ b/drivers/usb/isp1760/isp1760-if.c @@ -21,11 +21,11 @@ #include "isp1760-core.h" #include "isp1760-regs.h" -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI #include #endif -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI static int isp1761_pci_init(struct pci_dev *dev) { resource_size_t mem_start; @@ -286,7 +286,7 @@ static int __init isp1760_init(void) ret = platform_driver_register(&isp1760_plat_driver); if (!ret) any_ret = 0; -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI ret = pci_register_driver(&isp1761_pci_driver); if (!ret) any_ret = 0; @@ -301,7 +301,7 @@ module_init(isp1760_init); static void __exit isp1760_exit(void) { platform_driver_unregister(&isp1760_plat_driver); -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI pci_unregister_driver(&isp1761_pci_driver); #endif isp1760_deinit_kmem_cache(); diff --git a/drivers/usb/misc/adutux.c b/drivers/usb/misc/adutux.c index db9a9e6ff6bee9c74840b7f5734b086795a090fa..dfd54ea4808faa1efbd98921887cf29b3bd3f1d3 100644 --- a/drivers/usb/misc/adutux.c +++ b/drivers/usb/misc/adutux.c @@ -655,24 +655,15 @@ static int adu_probe(struct usb_interface *interface, { struct usb_device *udev = interface_to_usbdev(interface); struct adu_device *dev = NULL; - struct usb_host_interface *iface_desc; - struct usb_endpoint_descriptor *endpoint; - int retval = -ENODEV; + int retval = -ENOMEM; int in_end_size; int out_end_size; - int i; - - if (udev == NULL) { - dev_err(&interface->dev, "udev is NULL.\n"); - goto exit; - } + int res; /* allocate memory for our device state and initialize it */ dev = kzalloc(sizeof(struct adu_device), GFP_KERNEL); - if (!dev) { - retval = -ENOMEM; - goto exit; - } + if (!dev) + return -ENOMEM; mutex_init(&dev->mtx); spin_lock_init(&dev->buflock); @@ -680,24 +671,13 @@ static int adu_probe(struct usb_interface *interface, init_waitqueue_head(&dev->read_wait); init_waitqueue_head(&dev->write_wait); - iface_desc = &interface->altsetting[0]; - - /* set up the endpoint information */ - for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { - endpoint = &iface_desc->endpoint[i].desc; - - if (usb_endpoint_is_int_in(endpoint)) - dev->interrupt_in_endpoint = endpoint; - - if (usb_endpoint_is_int_out(endpoint)) - dev->interrupt_out_endpoint = endpoint; - } - if (dev->interrupt_in_endpoint == NULL) { - dev_err(&interface->dev, "interrupt in endpoint not found\n"); - goto error; - } - if (dev->interrupt_out_endpoint == NULL) { - dev_err(&interface->dev, "interrupt out endpoint not found\n"); + res = usb_find_common_endpoints_reverse(&interface->altsetting[0], + NULL, NULL, + &dev->interrupt_in_endpoint, + &dev->interrupt_out_endpoint); + if (res) { + dev_err(&interface->dev, "interrupt endpoints not found\n"); + retval = res; goto error; } @@ -705,10 +685,8 @@ static int adu_probe(struct usb_interface *interface, out_end_size = 
usb_endpoint_maxp(dev->interrupt_out_endpoint); dev->read_buffer_primary = kmalloc((4 * in_end_size), GFP_KERNEL); - if (!dev->read_buffer_primary) { - retval = -ENOMEM; + if (!dev->read_buffer_primary) goto error; - } /* debug code prime the buffer */ memset(dev->read_buffer_primary, 'a', in_end_size); @@ -717,10 +695,8 @@ static int adu_probe(struct usb_interface *interface, memset(dev->read_buffer_primary + (3 * in_end_size), 'd', in_end_size); dev->read_buffer_secondary = kmalloc((4 * in_end_size), GFP_KERNEL); - if (!dev->read_buffer_secondary) { - retval = -ENOMEM; + if (!dev->read_buffer_secondary) goto error; - } /* debug code prime the buffer */ memset(dev->read_buffer_secondary, 'e', in_end_size); @@ -748,6 +724,7 @@ static int adu_probe(struct usb_interface *interface, if (!usb_string(udev, udev->descriptor.iSerialNumber, dev->serial_number, sizeof(dev->serial_number))) { dev_err(&interface->dev, "Could not retrieve serial number\n"); + retval = -EIO; goto error; } dev_dbg(&interface->dev,"serial_number=%s", dev->serial_number); @@ -770,8 +747,8 @@ static int adu_probe(struct usb_interface *interface, dev_info(&interface->dev, "ADU%d %s now attached to /dev/usb/adutux%d\n", le16_to_cpu(udev->descriptor.idProduct), dev->serial_number, (dev->minor - ADU_MINOR_BASE)); -exit: - return retval; + + return 0; error: adu_delete(dev); diff --git a/drivers/usb/misc/appledisplay.c b/drivers/usb/misc/appledisplay.c index da5ff401a35489614db878afadf8cc8bbdffc852..8efdc500e790b5b5d15705f0a211b7e69fe05532 100644 --- a/drivers/usb/misc/appledisplay.c +++ b/drivers/usb/misc/appledisplay.c @@ -212,28 +212,21 @@ static int appledisplay_probe(struct usb_interface *iface, struct backlight_properties props; struct appledisplay *pdata; struct usb_device *udev = interface_to_usbdev(iface); - struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *endpoint; int int_in_endpointAddr = 0; - int i, retval = -ENOMEM, brightness; + int retval, brightness; char bl_name[20]; /* set up the endpoint information */ /* use only the first interrupt-in endpoint */ - iface_desc = iface->cur_altsetting; - for (i = 0; i < iface_desc->desc.bNumEndpoints; i++) { - endpoint = &iface_desc->endpoint[i].desc; - if (!int_in_endpointAddr && usb_endpoint_is_int_in(endpoint)) { - /* we found an interrupt in endpoint */ - int_in_endpointAddr = endpoint->bEndpointAddress; - break; - } - } - if (!int_in_endpointAddr) { + retval = usb_find_int_in_endpoint(iface->cur_altsetting, &endpoint); + if (retval) { dev_err(&iface->dev, "Could not find int-in endpoint\n"); - return -EIO; + return retval; } + int_in_endpointAddr = endpoint->bEndpointAddress; + /* allocate memory for our device state and initialize it */ pdata = kzalloc(sizeof(struct appledisplay), GFP_KERNEL); if (!pdata) { diff --git a/drivers/usb/misc/chaoskey.c b/drivers/usb/misc/chaoskey.c index aa350dc9eb25c2db5a785fbd5147229d0cf06963..e9cae4d82af2ec80a1bfa321871aa75895dda170 100644 --- a/drivers/usb/misc/chaoskey.c +++ b/drivers/usb/misc/chaoskey.c @@ -117,28 +117,26 @@ static int chaoskey_probe(struct usb_interface *interface, { struct usb_device *udev = interface_to_usbdev(interface); struct usb_host_interface *altsetting = interface->cur_altsetting; - int i; - int in_ep = -1; + struct usb_endpoint_descriptor *epd; + int in_ep; struct chaoskey *dev; int result = -ENOMEM; int size; + int res; usb_dbg(interface, "probe %s-%s", udev->product, udev->serial); /* Find the first bulk IN endpoint and its packet size */ - for (i = 0; i < 
altsetting->desc.bNumEndpoints; i++) { - if (usb_endpoint_is_bulk_in(&altsetting->endpoint[i].desc)) { - in_ep = usb_endpoint_num(&altsetting->endpoint[i].desc); - size = usb_endpoint_maxp(&altsetting->endpoint[i].desc); - break; - } + res = usb_find_bulk_in_endpoint(altsetting, &epd); + if (res) { + usb_dbg(interface, "no IN endpoint found"); + return res; } + in_ep = usb_endpoint_num(epd); + size = usb_endpoint_maxp(epd); + /* Validate endpoint and size */ - if (in_ep == -1) { - usb_dbg(interface, "no IN endpoint found"); - return -ENODEV; - } if (size <= 0) { usb_dbg(interface, "invalid size (%d)", size); return -ENODEV; diff --git a/drivers/usb/misc/ftdi-elan.c b/drivers/usb/misc/ftdi-elan.c index 01a9373b7e18a16fa0b8bc35625fffbe0eefc15b..8291499d058178a27b3b951427e8ef5cf2fcb907 100644 --- a/drivers/usb/misc/ftdi-elan.c +++ b/drivers/usb/misc/ftdi-elan.c @@ -2700,10 +2700,8 @@ static int ftdi_elan_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_host_interface *iface_desc; - struct usb_endpoint_descriptor *endpoint; - size_t buffer_size; - int i; - int retval = -ENOMEM; + struct usb_endpoint_descriptor *bulk_in, *bulk_out; + int retval; struct usb_ftdi *ftdi; ftdi = kzalloc(sizeof(struct usb_ftdi), GFP_KERNEL); @@ -2720,31 +2718,25 @@ static int ftdi_elan_probe(struct usb_interface *interface, ftdi->interface = interface; mutex_init(&ftdi->u132_lock); ftdi->expected = 4; + iface_desc = interface->cur_altsetting; - for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { - endpoint = &iface_desc->endpoint[i].desc; - if (!ftdi->bulk_in_endpointAddr && - usb_endpoint_is_bulk_in(endpoint)) { - buffer_size = usb_endpoint_maxp(endpoint); - ftdi->bulk_in_size = buffer_size; - ftdi->bulk_in_endpointAddr = endpoint->bEndpointAddress; - ftdi->bulk_in_buffer = kmalloc(buffer_size, GFP_KERNEL); - if (!ftdi->bulk_in_buffer) { - retval = -ENOMEM; - goto error; - } - } - if (!ftdi->bulk_out_endpointAddr && - usb_endpoint_is_bulk_out(endpoint)) { - ftdi->bulk_out_endpointAddr = - endpoint->bEndpointAddress; - } - } - if (!(ftdi->bulk_in_endpointAddr && ftdi->bulk_out_endpointAddr)) { + retval = usb_find_common_endpoints(iface_desc, + &bulk_in, &bulk_out, NULL, NULL); + if (retval) { dev_err(&ftdi->udev->dev, "Could not find both bulk-in and bulk-out endpoints\n"); - retval = -ENODEV; goto error; } + + ftdi->bulk_in_size = usb_endpoint_maxp(bulk_in); + ftdi->bulk_in_endpointAddr = bulk_in->bEndpointAddress; + ftdi->bulk_in_buffer = kmalloc(ftdi->bulk_in_size, GFP_KERNEL); + if (!ftdi->bulk_in_buffer) { + retval = -ENOMEM; + goto error; + } + + ftdi->bulk_out_endpointAddr = bulk_out->bEndpointAddress; + dev_info(&ftdi->udev->dev, "interface %d has I=%02X O=%02X\n", iface_desc->desc.bInterfaceNumber, ftdi->bulk_in_endpointAddr, ftdi->bulk_out_endpointAddr); diff --git a/drivers/usb/misc/idmouse.c b/drivers/usb/misc/idmouse.c index 502bfe30a077a20120616af74b378c62bc2ec6d9..81fcbf024c65837cb38f03ebed5124dfba1e40c2 100644 --- a/drivers/usb/misc/idmouse.c +++ b/drivers/usb/misc/idmouse.c @@ -360,26 +360,22 @@ static int idmouse_probe(struct usb_interface *interface, dev->interface = interface; /* set up the endpoint information - use only the first bulk-in endpoint */ - endpoint = &iface_desc->endpoint[0].desc; - if (!dev->bulk_in_endpointAddr && usb_endpoint_is_bulk_in(endpoint)) { - /* we found a bulk in endpoint */ - dev->orig_bi_size = usb_endpoint_maxp(endpoint); - dev->bulk_in_size = 0x200; /* works _much_ faster */ - dev->bulk_in_endpointAddr = 
endpoint->bEndpointAddress; - dev->bulk_in_buffer = - kmalloc(IMGSIZE + dev->bulk_in_size, GFP_KERNEL); - - if (!dev->bulk_in_buffer) { - idmouse_delete(dev); - return -ENOMEM; - } + result = usb_find_bulk_in_endpoint(iface_desc, &endpoint); + if (result) { + dev_err(&interface->dev, "Unable to find bulk-in endpoint.\n"); + idmouse_delete(dev); + return result; } - if (!(dev->bulk_in_endpointAddr)) { - dev_err(&interface->dev, "Unable to find bulk-in endpoint.\n"); + dev->orig_bi_size = usb_endpoint_maxp(endpoint); + dev->bulk_in_size = 0x200; /* works _much_ faster */ + dev->bulk_in_endpointAddr = endpoint->bEndpointAddress; + dev->bulk_in_buffer = kmalloc(IMGSIZE + dev->bulk_in_size, GFP_KERNEL); + if (!dev->bulk_in_buffer) { idmouse_delete(dev); - return -ENODEV; + return -ENOMEM; } + /* allow device read, write and ioctl */ dev->present = 1; diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index 37c63cb39714b86ada39d3232565ef1595ed71c6..77569531b78a56d45fb43ee255a0a22e6da2979d 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -756,9 +756,8 @@ static int iowarrior_probe(struct usb_interface *interface, struct usb_device *udev = interface_to_usbdev(interface); struct iowarrior *dev = NULL; struct usb_host_interface *iface_desc; - struct usb_endpoint_descriptor *endpoint; - int i; int retval = -ENOMEM; + int res; /* allocate memory for our device state and initialize it */ dev = kzalloc(sizeof(struct iowarrior), GFP_KERNEL); @@ -781,27 +780,19 @@ static int iowarrior_probe(struct usb_interface *interface, iface_desc = interface->cur_altsetting; dev->product_id = le16_to_cpu(udev->descriptor.idProduct); - /* set up the endpoint information */ - for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { - endpoint = &iface_desc->endpoint[i].desc; - - if (usb_endpoint_is_int_in(endpoint)) - dev->int_in_endpoint = endpoint; - if (usb_endpoint_is_int_out(endpoint)) - /* this one will match for the IOWarrior56 only */ - dev->int_out_endpoint = endpoint; - } - - if (!dev->int_in_endpoint) { + res = usb_find_last_int_in_endpoint(iface_desc, &dev->int_in_endpoint); + if (res) { dev_err(&interface->dev, "no interrupt-in endpoint found\n"); - retval = -ENODEV; + retval = res; goto error; } if (dev->product_id == USB_DEVICE_ID_CODEMERCS_IOW56) { - if (!dev->int_out_endpoint) { + res = usb_find_last_int_out_endpoint(iface_desc, + &dev->int_out_endpoint); + if (res) { dev_err(&interface->dev, "no interrupt-out endpoint found\n"); - retval = -ENODEV; + retval = res; goto error; } } diff --git a/drivers/usb/misc/ldusb.c b/drivers/usb/misc/ldusb.c index 3bc5356832db777424111b74cefbcfe2e77ce4e4..9d9487c66f87a21e4a62b88250cdbc97e1e92313 100644 --- a/drivers/usb/misc/ldusb.c +++ b/drivers/usb/misc/ldusb.c @@ -122,13 +122,13 @@ MODULE_SUPPORTED_DEVICE("LD USB Devices"); * avoid racing conditions and get better performance of the driver. */ static int ring_buffer_size = 128; -module_param(ring_buffer_size, int, 0); +module_param(ring_buffer_size, int, 0000); MODULE_PARM_DESC(ring_buffer_size, "Read ring buffer size in reports"); /* The write_buffer can contain more than one interrupt out transfer. 
*/ static int write_buffer_size = 10; -module_param(write_buffer_size, int, 0); +module_param(write_buffer_size, int, 0000); MODULE_PARM_DESC(write_buffer_size, "Write buffer size in reports"); /* As of kernel version 2.6.4 ehci-hcd uses an @@ -141,30 +141,30 @@ MODULE_PARM_DESC(write_buffer_size, "Write buffer size in reports"); * or set to 1 to use the standard interval from the endpoint descriptors. */ static int min_interrupt_in_interval = 2; -module_param(min_interrupt_in_interval, int, 0); +module_param(min_interrupt_in_interval, int, 0000); MODULE_PARM_DESC(min_interrupt_in_interval, "Minimum interrupt in interval in ms"); static int min_interrupt_out_interval = 2; -module_param(min_interrupt_out_interval, int, 0); +module_param(min_interrupt_out_interval, int, 0000); MODULE_PARM_DESC(min_interrupt_out_interval, "Minimum interrupt out interval in ms"); /* Structure to hold all of our device specific stuff */ struct ld_usb { struct mutex mutex; /* locks this structure */ - struct usb_interface* intf; /* save off the usb interface pointer */ + struct usb_interface *intf; /* save off the usb interface pointer */ int open_count; /* number of times this port has been opened */ - char* ring_buffer; + char *ring_buffer; unsigned int ring_head; unsigned int ring_tail; wait_queue_head_t read_wait; wait_queue_head_t write_wait; - char* interrupt_in_buffer; - struct usb_endpoint_descriptor* interrupt_in_endpoint; - struct urb* interrupt_in_urb; + char *interrupt_in_buffer; + struct usb_endpoint_descriptor *interrupt_in_endpoint; + struct urb *interrupt_in_urb; int interrupt_in_interval; size_t interrupt_in_endpoint_size; int interrupt_in_running; @@ -172,9 +172,9 @@ struct ld_usb { int buffer_overflow; spinlock_t rbsl; - char* interrupt_out_buffer; - struct usb_endpoint_descriptor* interrupt_out_endpoint; - struct urb* interrupt_out_urb; + char *interrupt_out_buffer; + struct usb_endpoint_descriptor *interrupt_out_endpoint; + struct urb *interrupt_out_urb; int interrupt_out_interval; size_t interrupt_out_endpoint_size; int interrupt_out_busy; @@ -244,7 +244,7 @@ static void ld_usb_interrupt_in_callback(struct urb *urb) if (urb->actual_length > 0) { next_ring_head = (dev->ring_head+1) % ring_buffer_size; if (next_ring_head != dev->ring_tail) { - actual_buffer = (size_t*)(dev->ring_buffer + dev->ring_head*(sizeof(size_t)+dev->interrupt_in_endpoint_size)); + actual_buffer = (size_t *)(dev->ring_buffer + dev->ring_head * (sizeof(size_t)+dev->interrupt_in_endpoint_size)); /* actual_buffer gets urb->actual_length + interrupt_in_buffer */ *actual_buffer = urb->actual_length; memcpy(actual_buffer+1, dev->interrupt_in_buffer, urb->actual_length); @@ -483,7 +483,7 @@ static ssize_t ld_usb_read(struct file *file, char __user *buffer, size_t count, } /* actual_buffer contains actual_length + interrupt_in_buffer */ - actual_buffer = (size_t*)(dev->ring_buffer + dev->ring_tail*(sizeof(size_t)+dev->interrupt_in_endpoint_size)); + actual_buffer = (size_t *)(dev->ring_buffer + dev->ring_tail * (sizeof(size_t)+dev->interrupt_in_endpoint_size)); bytes_to_read = min(count, *actual_buffer); if (bytes_to_read < *actual_buffer) dev_warn(&dev->intf->dev, "Read buffer overflow, %zd bytes dropped\n", @@ -561,7 +561,7 @@ static ssize_t ld_usb_write(struct file *file, const char __user *buffer, /* write the data into interrupt_out_buffer from userspace */ bytes_to_write = min(count, write_buffer_size*dev->interrupt_out_endpoint_size); if (bytes_to_write < count) - dev_warn(&dev->intf->dev, "Write buffer overflow, %zd 
bytes dropped\n",count-bytes_to_write); + dev_warn(&dev->intf->dev, "Write buffer overflow, %zd bytes dropped\n", count-bytes_to_write); dev_dbg(&dev->intf->dev, "%s: count = %zd, bytes_to_write = %zd\n", __func__, count, bytes_to_write); @@ -650,10 +650,9 @@ static int ld_usb_probe(struct usb_interface *intf, const struct usb_device_id * struct usb_device *udev = interface_to_usbdev(intf); struct ld_usb *dev = NULL; struct usb_host_interface *iface_desc; - struct usb_endpoint_descriptor *endpoint; char *buffer; - int i; int retval = -ENOMEM; + int res; /* allocate memory for our device state and initialize it */ @@ -681,21 +680,17 @@ static int ld_usb_probe(struct usb_interface *intf, const struct usb_device_id * iface_desc = intf->cur_altsetting; - /* set up the endpoint information */ - for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { - endpoint = &iface_desc->endpoint[i].desc; - - if (usb_endpoint_is_int_in(endpoint)) - dev->interrupt_in_endpoint = endpoint; - - if (usb_endpoint_is_int_out(endpoint)) - dev->interrupt_out_endpoint = endpoint; - } - if (dev->interrupt_in_endpoint == NULL) { + res = usb_find_last_int_in_endpoint(iface_desc, + &dev->interrupt_in_endpoint); + if (res) { dev_err(&intf->dev, "Interrupt in endpoint not found\n"); + retval = res; goto error; } - if (dev->interrupt_out_endpoint == NULL) + + res = usb_find_last_int_out_endpoint(iface_desc, + &dev->interrupt_out_endpoint); + if (res) dev_warn(&intf->dev, "Interrupt out endpoint not found (using control endpoint instead)\n"); dev->interrupt_in_endpoint_size = usb_endpoint_maxp(dev->interrupt_in_endpoint); diff --git a/drivers/usb/misc/legousbtower.c b/drivers/usb/misc/legousbtower.c index 322a042d6e59ab55ec10e9fb8961a3080ec8f6f9..aa3c280fdf8d40662e375466dd3b2648ec037fe1 100644 --- a/drivers/usb/misc/legousbtower.c +++ b/drivers/usb/misc/legousbtower.c @@ -317,9 +317,16 @@ static int tower_open (struct inode *inode, struct file *file) int subminor; int retval = 0; struct usb_interface *interface; - struct tower_reset_reply reset_reply; + struct tower_reset_reply *reset_reply; int result; + reset_reply = kmalloc(sizeof(*reset_reply), GFP_KERNEL); + + if (!reset_reply) { + retval = -ENOMEM; + goto exit; + } + nonseekable_open(inode, file); subminor = iminor(inode); @@ -364,8 +371,8 @@ static int tower_open (struct inode *inode, struct file *file) USB_TYPE_VENDOR | USB_DIR_IN | USB_RECIP_DEVICE, 0, 0, - &reset_reply, - sizeof(reset_reply), + reset_reply, + sizeof(*reset_reply), 1000); if (result < 0) { dev_err(&dev->udev->dev, @@ -406,6 +413,7 @@ static int tower_open (struct inode *inode, struct file *file) mutex_unlock(&dev->lock); exit: + kfree(reset_reply); return retval; } @@ -806,10 +814,7 @@ static int tower_probe (struct usb_interface *interface, const struct usb_device struct device *idev = &interface->dev; struct usb_device *udev = interface_to_usbdev(interface); struct lego_usb_tower *dev = NULL; - struct usb_host_interface *iface_desc; - struct usb_endpoint_descriptor* endpoint; - struct tower_get_version_reply get_version_reply; - int i; + struct tower_get_version_reply *get_version_reply = NULL; int retval = -ENOMEM; int result; @@ -846,25 +851,13 @@ static int tower_probe (struct usb_interface *interface, const struct usb_device dev->interrupt_out_urb = NULL; dev->interrupt_out_busy = 0; - iface_desc = interface->cur_altsetting; - - /* set up the endpoint information */ - for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { - endpoint = &iface_desc->endpoint[i].desc; - - if 
(usb_endpoint_xfer_int(endpoint)) { - if (usb_endpoint_dir_in(endpoint)) - dev->interrupt_in_endpoint = endpoint; - else - dev->interrupt_out_endpoint = endpoint; - } - } - if(dev->interrupt_in_endpoint == NULL) { - dev_err(idev, "interrupt in endpoint not found\n"); - goto error; - } - if (dev->interrupt_out_endpoint == NULL) { - dev_err(idev, "interrupt out endpoint not found\n"); + result = usb_find_common_endpoints_reverse(interface->cur_altsetting, + NULL, NULL, + &dev->interrupt_in_endpoint, + &dev->interrupt_out_endpoint); + if (result) { + dev_err(idev, "interrupt endpoints not found\n"); + retval = result; goto error; } @@ -886,6 +879,13 @@ static int tower_probe (struct usb_interface *interface, const struct usb_device dev->interrupt_in_interval = interrupt_in_interval ? interrupt_in_interval : dev->interrupt_in_endpoint->bInterval; dev->interrupt_out_interval = interrupt_out_interval ? interrupt_out_interval : dev->interrupt_out_endpoint->bInterval; + get_version_reply = kmalloc(sizeof(*get_version_reply), GFP_KERNEL); + + if (!get_version_reply) { + retval = -ENOMEM; + goto error; + } + /* get the firmware version and log it */ result = usb_control_msg (udev, usb_rcvctrlpipe(udev, 0), @@ -893,18 +893,19 @@ static int tower_probe (struct usb_interface *interface, const struct usb_device USB_TYPE_VENDOR | USB_DIR_IN | USB_RECIP_DEVICE, 0, 0, - &get_version_reply, - sizeof(get_version_reply), + get_version_reply, + sizeof(*get_version_reply), 1000); if (result < 0) { dev_err(idev, "LEGO USB Tower get version control request failed\n"); retval = result; goto error; } - dev_info(&interface->dev, "LEGO USB Tower firmware version is %d.%d " - "build %d\n", get_version_reply.major, - get_version_reply.minor, - le16_to_cpu(get_version_reply.build_no)); + dev_info(&interface->dev, + "LEGO USB Tower firmware version is %d.%d build %d\n", + get_version_reply->major, + get_version_reply->minor, + le16_to_cpu(get_version_reply->build_no)); /* we can register the device now, as it is ready */ usb_set_intfdata (interface, dev); @@ -928,6 +929,7 @@ static int tower_probe (struct usb_interface *interface, const struct usb_device return retval; error: + kfree(get_version_reply); tower_delete(dev); return retval; } diff --git a/drivers/usb/misc/lvstest.c b/drivers/usb/misc/lvstest.c index d3d12475326663def2ce1f389f06f040d2a56126..2142132a1f82ee93881f3f938fa5f69352d2648d 100644 --- a/drivers/usb/misc/lvstest.c +++ b/drivers/usb/misc/lvstest.c @@ -193,7 +193,7 @@ static ssize_t u2_timeout_store(struct device *dev, return ret; } - if (val < 0 || val > 127) + if (val > 127) return -EINVAL; ret = lvs_rh_set_port_feature(hdev, lvs->portnum | (val << 8), @@ -222,7 +222,7 @@ static ssize_t u1_timeout_store(struct device *dev, return ret; } - if (val < 0 || val > 127) + if (val > 127) return -EINVAL; ret = lvs_rh_set_port_feature(hdev, lvs->portnum | (val << 8), @@ -367,10 +367,9 @@ static int lvs_rh_probe(struct usb_interface *intf, hdev = interface_to_usbdev(intf); desc = intf->cur_altsetting; - if (desc->desc.bNumEndpoints < 1) - return -ENODEV; - - endpoint = &desc->endpoint[0].desc; + ret = usb_find_int_in_endpoint(desc, &endpoint); + if (ret) + return ret; /* valid only for SS root hub */ if (hdev->descriptor.bDeviceProtocol != USB_HUB_PR_SS || hdev->parent) { @@ -433,6 +432,7 @@ static void lvs_rh_disconnect(struct usb_interface *intf) struct lvs_rh *lvs = usb_get_intfdata(intf); sysfs_remove_group(&intf->dev.kobj, &lvs_attr_group); + usb_poison_urb(lvs->urb); /* used in scheduled work */ 
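+	/*
+	 * Ordering note: poisoning the URB before flushing the work makes
+	 * any resubmission attempt from the work handler fail, so once
+	 * flush_work() returns the URB is idle and safe to free.
+	 */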
flush_work(&lvs->rh_work); usb_free_urb(lvs->urb); } diff --git a/drivers/usb/misc/sisusbvga/sisusb_con.c b/drivers/usb/misc/sisusbvga/sisusb_con.c index 4b5777ec1501fe37a82c3bd3b23b35d8ab96ec79..3c6948af726aad38f9cf3a294ded3c7dfbac04ab 100644 --- a/drivers/usb/misc/sisusbvga/sisusb_con.c +++ b/drivers/usb/misc/sisusbvga/sisusb_con.c @@ -816,7 +816,7 @@ sisusbcon_scroll_area(struct vc_data *c, struct sisusb_usb_data *sisusb, mutex_unlock(&sisusb->lock); - return 1; + return true; } /* Interface routine */ diff --git a/drivers/usb/misc/usblcd.c b/drivers/usb/misc/usblcd.c index 9f48419abc46b97ebdb562fa4a0b606e7223d67e..0f5ad896c7e3161186ea2540994841d6bfaa5b3d 100644 --- a/drivers/usb/misc/usblcd.c +++ b/drivers/usb/misc/usblcd.c @@ -313,16 +313,15 @@ static int lcd_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_lcd *dev = NULL; - struct usb_host_interface *iface_desc; - struct usb_endpoint_descriptor *endpoint; - size_t buffer_size; + struct usb_endpoint_descriptor *bulk_in, *bulk_out; int i; - int retval = -ENOMEM; + int retval; /* allocate memory for our device state and initialize it */ dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) - goto error; + return -ENOMEM; + kref_init(&dev->kref); sema_init(&dev->limit_sem, USB_LCD_CONCURRENT_WRITES); init_usb_anchor(&dev->submitted); @@ -338,33 +337,24 @@ static int lcd_probe(struct usb_interface *interface, /* set up the endpoint information */ /* use only the first bulk-in and bulk-out endpoints */ - iface_desc = interface->cur_altsetting; - for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { - endpoint = &iface_desc->endpoint[i].desc; - - if (!dev->bulk_in_endpointAddr && - usb_endpoint_is_bulk_in(endpoint)) { - /* we found a bulk in endpoint */ - buffer_size = usb_endpoint_maxp(endpoint); - dev->bulk_in_size = buffer_size; - dev->bulk_in_endpointAddr = endpoint->bEndpointAddress; - dev->bulk_in_buffer = kmalloc(buffer_size, GFP_KERNEL); - if (!dev->bulk_in_buffer) - goto error; - } - - if (!dev->bulk_out_endpointAddr && - usb_endpoint_is_bulk_out(endpoint)) { - /* we found a bulk out endpoint */ - dev->bulk_out_endpointAddr = endpoint->bEndpointAddress; - } - } - if (!(dev->bulk_in_endpointAddr && dev->bulk_out_endpointAddr)) { + retval = usb_find_common_endpoints(interface->cur_altsetting, + &bulk_in, &bulk_out, NULL, NULL); + if (retval) { dev_err(&interface->dev, "Could not find both bulk-in and bulk-out endpoints\n"); goto error; } + dev->bulk_in_size = usb_endpoint_maxp(bulk_in); + dev->bulk_in_endpointAddr = bulk_in->bEndpointAddress; + dev->bulk_in_buffer = kmalloc(dev->bulk_in_size, GFP_KERNEL); + if (!dev->bulk_in_buffer) { + retval = -ENOMEM; + goto error; + } + + dev->bulk_out_endpointAddr = bulk_out->bEndpointAddress; + /* save our data pointer in this interface device */ usb_set_intfdata(interface, dev); @@ -390,8 +380,7 @@ static int lcd_probe(struct usb_interface *interface, return 0; error: - if (dev) - kref_put(&dev->kref, lcd_delete); + kref_put(&dev->kref, lcd_delete); return retval; } diff --git a/drivers/usb/misc/usbtest.c b/drivers/usb/misc/usbtest.c index 17c0810682579a7fbba5e3fcef1b377519b8f0d8..eee82ca55b7b383bc507c39af7e64bb66a63c6a8 100644 --- a/drivers/usb/misc/usbtest.c +++ b/drivers/usb/misc/usbtest.c @@ -124,6 +124,20 @@ static struct usb_device *testdev_to_usbdev(struct usbtest_dev *test) /*-------------------------------------------------------------------------*/ +static inline void endpoint_update(int edi, + struct usb_host_endpoint **in, + struct 
usb_host_endpoint **out, + struct usb_host_endpoint *e) +{ + if (edi) { + if (!*in) + *in = e; + } else { + if (!*out) + *out = e; + } +} + static int get_endpoints(struct usbtest_dev *dev, struct usb_interface *intf) { @@ -151,46 +165,26 @@ get_endpoints(struct usbtest_dev *dev, struct usb_interface *intf) */ for (ep = 0; ep < alt->desc.bNumEndpoints; ep++) { struct usb_host_endpoint *e; + int edi; e = alt->endpoint + ep; + edi = usb_endpoint_dir_in(&e->desc); + switch (usb_endpoint_type(&e->desc)) { case USB_ENDPOINT_XFER_BULK: - break; + endpoint_update(edi, &in, &out, e); + continue; case USB_ENDPOINT_XFER_INT: if (dev->info->intr) - goto try_intr; + endpoint_update(edi, &int_in, &int_out, e); + continue; case USB_ENDPOINT_XFER_ISOC: if (dev->info->iso) - goto try_iso; + endpoint_update(edi, &iso_in, &iso_out, e); /* FALLTHROUGH */ default: continue; } - if (usb_endpoint_dir_in(&e->desc)) { - if (!in) - in = e; - } else { - if (!out) - out = e; - } - continue; -try_intr: - if (usb_endpoint_dir_in(&e->desc)) { - if (!int_in) - int_in = e; - } else { - if (!int_out) - int_out = e; - } - continue; -try_iso: - if (usb_endpoint_dir_in(&e->desc)) { - if (!iso_in) - iso_in = e; - } else { - if (!iso_out) - iso_out = e; - } } if ((in && out) || iso_in || iso_out || int_in || int_out) goto found; diff --git a/drivers/usb/misc/uss720.c b/drivers/usb/misc/uss720.c index 07014cad6dbe357bca938561eaf976dbbee902d4..5947373700a197a4e62bb62bbf5940c33271d610 100644 --- a/drivers/usb/misc/uss720.c +++ b/drivers/usb/misc/uss720.c @@ -689,7 +689,7 @@ static int uss720_probe(struct usb_interface *intf, { struct usb_device *usbdev = usb_get_dev(interface_to_usbdev(intf)); struct usb_host_interface *interface; - struct usb_host_endpoint *endpoint; + struct usb_endpoint_descriptor *epd; struct parport_uss720_private *priv; struct parport *pp; unsigned char reg; @@ -745,9 +745,11 @@ static int uss720_probe(struct usb_interface *intf, get_1284_register(pp, 0, ®, GFP_KERNEL); dev_dbg(&intf->dev, "reg: %7ph\n", priv->reg); - endpoint = &interface->endpoint[2]; - dev_dbg(&intf->dev, "epaddr %d interval %d\n", - endpoint->desc.bEndpointAddress, endpoint->desc.bInterval); + i = usb_find_last_int_in_endpoint(interface, &epd); + if (!i) { + dev_dbg(&intf->dev, "epaddr %d interval %d\n", + epd->bEndpointAddress, epd->bInterval); + } parport_announce_port(pp); usb_set_intfdata(intf, pp); diff --git a/drivers/usb/misc/yurex.c b/drivers/usb/misc/yurex.c index 54e53ac4c08fd7c47ebf1d271083d2313b92ade0..58abdf28620a537e37184eec548d72ea57fb5dba 100644 --- a/drivers/usb/misc/yurex.c +++ b/drivers/usb/misc/yurex.c @@ -195,8 +195,8 @@ static int yurex_probe(struct usb_interface *interface, const struct usb_device_ struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *endpoint; int retval = -ENOMEM; - int i; DEFINE_WAIT(wait); + int res; /* allocate memory for our device state and initialize it */ dev = kzalloc(sizeof(*dev), GFP_KERNEL); @@ -212,20 +212,14 @@ static int yurex_probe(struct usb_interface *interface, const struct usb_device_ /* set up the endpoint information */ iface_desc = interface->cur_altsetting; - for (i = 0; i < iface_desc->desc.bNumEndpoints; i++) { - endpoint = &iface_desc->endpoint[i].desc; - - if (usb_endpoint_is_int_in(endpoint)) { - dev->int_in_endpointAddr = endpoint->bEndpointAddress; - break; - } - } - if (!dev->int_in_endpointAddr) { - retval = -ENODEV; + res = usb_find_int_in_endpoint(iface_desc, &endpoint); + if (res) { dev_err(&interface->dev, "Could not find endpoints\n"); + 
retval = res; goto error; } + dev->int_in_endpointAddr = endpoint->bEndpointAddress; /* allocate control URB */ dev->cntl_urb = usb_alloc_urb(0, GFP_KERNEL); diff --git a/drivers/usb/mtu3/mtu3_dr.c b/drivers/usb/mtu3/mtu3_dr.c index 1a8987e7c5b0648c031fb5aece274a4939fc2ead..11a0d3b84c5eaef31217fd0645edf197ed2f29da 100644 --- a/drivers/usb/mtu3/mtu3_dr.c +++ b/drivers/usb/mtu3/mtu3_dr.c @@ -223,25 +223,25 @@ static int ssusb_extcon_register(struct otg_switch_mtk *otg_sx) return 0; otg_sx->vbus_nb.notifier_call = ssusb_vbus_notifier; - ret = extcon_register_notifier(edev, EXTCON_USB, + ret = devm_extcon_register_notifier(ssusb->dev, edev, EXTCON_USB, &otg_sx->vbus_nb); if (ret < 0) dev_err(ssusb->dev, "failed to register notifier for USB\n"); otg_sx->id_nb.notifier_call = ssusb_id_notifier; - ret = extcon_register_notifier(edev, EXTCON_USB_HOST, + ret = devm_extcon_register_notifier(ssusb->dev, edev, EXTCON_USB_HOST, &otg_sx->id_nb); if (ret < 0) dev_err(ssusb->dev, "failed to register notifier for USB-HOST\n"); dev_dbg(ssusb->dev, "EXTCON_USB: %d, EXTCON_USB_HOST: %d\n", - extcon_get_cable_state_(edev, EXTCON_USB), - extcon_get_cable_state_(edev, EXTCON_USB_HOST)); + extcon_get_state(edev, EXTCON_USB), + extcon_get_state(edev, EXTCON_USB_HOST)); /* default as host, switch to device mode if needed */ - if (extcon_get_cable_state_(edev, EXTCON_USB_HOST) == false) + if (extcon_get_state(edev, EXTCON_USB_HOST) == false) ssusb_set_mailbox(otg_sx, MTU3_ID_FLOAT); - if (extcon_get_cable_state_(edev, EXTCON_USB) == true) + if (extcon_get_state(edev, EXTCON_USB) == true) ssusb_set_mailbox(otg_sx, MTU3_VBUS_VALID); return 0; @@ -367,13 +367,6 @@ void ssusb_otg_switch_exit(struct ssusb_mtk *ssusb) cancel_delayed_work(&otg_sx->extcon_reg_dwork); - if (otg_sx->edev) { - extcon_unregister_notifier(otg_sx->edev, - EXTCON_USB, &otg_sx->vbus_nb); - extcon_unregister_notifier(otg_sx->edev, - EXTCON_USB_HOST, &otg_sx->id_nb); - } - if (otg_sx->manual_drd_enabled) ssusb_debugfs_exit(ssusb); } diff --git a/drivers/usb/musb/Kconfig b/drivers/usb/musb/Kconfig index 72a2a504084897da62e91d476768d60ac954c51f..5506a9c03c1f64ab0ff6c6c038c2346ddbf46a10 100644 --- a/drivers/usb/musb/Kconfig +++ b/drivers/usb/musb/Kconfig @@ -160,8 +160,8 @@ config USB_TI_CPPI_DMA Enable DMA transfers when TI CPPI DMA is available. 
config USB_TI_CPPI41_DMA - bool 'TI CPPI 4.1 (AM335x)' - depends on ARCH_OMAP && DMADEVICES + bool 'TI CPPI 4.1' + depends on (ARCH_OMAP || ARCH_DAVINCI_DA8XX) && DMADEVICES select TI_CPPI41 config USB_TUSB_OMAP_DMA diff --git a/drivers/usb/musb/cppi_dma.c b/drivers/usb/musb/cppi_dma.c index c4fabe952ca6a34dd3cb5c1f938e9ecc80c470ba..a13bd362504399a88aba8d8f688d66f0fefecb7f 100644 --- a/drivers/usb/musb/cppi_dma.c +++ b/drivers/usb/musb/cppi_dma.c @@ -582,9 +582,10 @@ cppi_next_tx_segment(struct musb *musb, struct cppi_channel *tx) maxpacket = length; n_bds = 1; } else { - n_bds = length / maxpacket; - if (!length || (length % maxpacket)) - n_bds++; + if (length) + n_bds = DIV_ROUND_UP(length, maxpacket); + else + n_bds = 1; n_bds = min(n_bds, (unsigned) NUM_TXCHAN_BD); length = min(n_bds * maxpacket, length); } @@ -790,9 +791,7 @@ cppi_next_rx_segment(struct musb *musb, struct cppi_channel *rx, int onepacket) n_bds = 0xffff / maxpacket; length = n_bds * maxpacket; } else { - n_bds = length / maxpacket; - if (length % maxpacket) - n_bds++; + n_bds = DIV_ROUND_UP(length, maxpacket); } if (n_bds == 1) onepacket = 1; diff --git a/drivers/usb/musb/da8xx.c b/drivers/usb/musb/da8xx.c index d79c288ccbf2db0fcb08f04ad0678cad343503b4..df88123274ca722dee54fdf311fdcda0044ba517 100644 --- a/drivers/usb/musb/da8xx.c +++ b/drivers/usb/musb/da8xx.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -456,12 +457,41 @@ static inline u8 get_vbus_power(struct device *dev) return current_uA / 1000 / 2; } +#ifdef CONFIG_USB_TI_CPPI41_DMA +static void da8xx_dma_controller_callback(struct dma_controller *c) +{ + struct musb *musb = c->musb; + void __iomem *reg_base = musb->ctrl_base; + + musb_writel(reg_base, DA8XX_USB_END_OF_INTR_REG, 0); +} + +static struct dma_controller * +da8xx_dma_controller_create(struct musb *musb, void __iomem *base) +{ + struct dma_controller *controller; + + controller = cppi41_dma_controller_create(musb, base); + if (IS_ERR_OR_NULL(controller)) + return controller; + + controller->dma_callback = da8xx_dma_controller_callback; + + return controller; +} +#endif + static const struct musb_platform_ops da8xx_ops = { - .quirks = MUSB_INDEXED_EP | MUSB_PRESERVE_SESSION, + .quirks = MUSB_INDEXED_EP | MUSB_PRESERVE_SESSION | + MUSB_DMA_CPPI41 | MUSB_DA8XX, .init = da8xx_musb_init, .exit = da8xx_musb_exit, .fifo_mode = 2, +#ifdef CONFIG_USB_TI_CPPI41_DMA + .dma_init = da8xx_dma_controller_create, + .dma_exit = cppi41_dma_controller_destroy, +#endif .enable = da8xx_musb_enable, .disable = da8xx_musb_disable, @@ -483,6 +513,12 @@ static const struct musb_hdrc_config da8xx_config = { .multipoint = 1, }; +static struct of_dev_auxdata da8xx_auxdata_lookup[] = { + OF_DEV_AUXDATA("ti,da830-cppi41", 0x01e01000, "cppi41-dmaengine", + NULL), + {} +}; + static int da8xx_probe(struct platform_device *pdev) { struct resource musb_resources[2]; @@ -533,6 +569,11 @@ static int da8xx_probe(struct platform_device *pdev) } platform_set_drvdata(pdev, glue); + ret = of_platform_populate(pdev->dev.of_node, NULL, + da8xx_auxdata_lookup, &pdev->dev); + if (ret) + return ret; + memset(musb_resources, 0x00, sizeof(*musb_resources) * ARRAY_SIZE(musb_resources)); diff --git a/drivers/usb/musb/musb_core.c b/drivers/usb/musb/musb_core.c index 0c3664ab705eed549e73d53dea13976066182ee1..870da18f5077cd762ae826c84c84e67ab1a8862e 100644 --- a/drivers/usb/musb/musb_core.c +++ b/drivers/usb/musb/musb_core.c @@ -2332,7 +2332,7 @@ musb_init_controller(struct device *dev, int nIrq, void __iomem 
*ctrl) setup_timer(&musb->otg_timer, musb_otg_timer_func, (unsigned long) musb); /* attach to the IRQ */ - if (request_irq(nIrq, musb->isr, 0, dev_name(dev), musb)) { + if (request_irq(nIrq, musb->isr, IRQF_SHARED, dev_name(dev), musb)) { dev_err(dev, "request_irq %d failed!\n", nIrq); status = -ENODEV; goto fail3; diff --git a/drivers/usb/musb/musb_core.h b/drivers/usb/musb/musb_core.h index 5b708be6d1d1d4293e04cc36bc4403ba1bff17e7..3e98d4268a6478a36348b7144dd38fe49167670b 100644 --- a/drivers/usb/musb/musb_core.h +++ b/drivers/usb/musb/musb_core.h @@ -172,6 +172,7 @@ struct musb_io; */ struct musb_platform_ops { +#define MUSB_DA8XX BIT(8) #define MUSB_PRESERVE_SESSION BIT(7) #define MUSB_DMA_UX500 BIT(6) #define MUSB_DMA_CPPI41 BIT(5) diff --git a/drivers/usb/musb/musb_cppi41.c b/drivers/usb/musb/musb_cppi41.c index 355655f8a3fbc9c4e3d8acd03eefa31a5736fb2f..e7c8b1b8bf2296229b2eef3343811e6cbbc9b317 100644 --- a/drivers/usb/musb/musb_cppi41.c +++ b/drivers/usb/musb/musb_cppi41.c @@ -571,6 +571,10 @@ static int cppi41_dma_channel_abort(struct dma_channel *channel) } } + /* DA8xx Advisory 2.3.27: wait 250 ms before to start the teardown */ + if (musb->io.quirks & MUSB_DA8XX) + mdelay(250); + tdbit = 1 << cppi41_channel->port_num; if (is_tx) tdbit <<= 16; diff --git a/drivers/usb/phy/Kconfig b/drivers/usb/phy/Kconfig index 61cef7511a506af48e3aa2165e0ac992b9cf4d8d..3006f569c068106559b64d15c8acdb0c53e8c10c 100644 --- a/drivers/usb/phy/Kconfig +++ b/drivers/usb/phy/Kconfig @@ -74,13 +74,6 @@ config AM335X_PHY_USB This driver provides PHY support for that phy which part for the AM335x SoC. -config SAMSUNG_USBPHY - tristate - help - Enable this to support Samsung USB phy helper driver for Samsung SoCs. - This driver provides common interface to interact, for Samsung USB 2.0 PHY - driver and later for Samsung USB 3.0 PHY driver. - config TWL6030_USB tristate "TWL6030 USB Transceiver Driver" depends on TWL4030_CORE && OMAP_USB2 && USB_MUSB_OMAP2PLUS diff --git a/drivers/usb/phy/Makefile b/drivers/usb/phy/Makefile index b433e5d89be4bddac3af69d87dbaea1ec67126cf..e7c9ca8cafb01ee9482910c4c7c2fa545b4f7f3b 100644 --- a/drivers/usb/phy/Makefile +++ b/drivers/usb/phy/Makefile @@ -14,7 +14,6 @@ obj-$(CONFIG_TAHVO_USB) += phy-tahvo.o obj-$(CONFIG_AM335X_CONTROL_USB) += phy-am335x-control.o obj-$(CONFIG_AM335X_PHY_USB) += phy-am335x.o obj-$(CONFIG_OMAP_OTG) += phy-omap-otg.o -obj-$(CONFIG_SAMSUNG_USBPHY) += phy-samsung-usb.o obj-$(CONFIG_TWL6030_USB) += phy-twl6030-usb.o obj-$(CONFIG_USB_EHCI_TEGRA) += phy-tegra-usb.o obj-$(CONFIG_USB_GPIO_VBUS) += phy-gpio-vbus-usb.o diff --git a/drivers/usb/phy/phy-fsl-usb.c b/drivers/usb/phy/phy-fsl-usb.c index 392ab422163c236084f73252fe8577f386c1d7c1..cf8f40ae6e017943c9ae97e50fca17d50304d51e 100644 --- a/drivers/usb/phy/phy-fsl-usb.c +++ b/drivers/usb/phy/phy-fsl-usb.c @@ -44,6 +44,13 @@ #include "phy-fsl-usb.h" +#ifdef VERBOSE +#define VDBG(fmt, args...) pr_debug("[%s] " fmt, \ + __func__, ## args) +#else +#define VDBG(stuff...) do {} while (0) +#endif + #define DRIVER_VERSION "Rev. 1.55" #define DRIVER_AUTHOR "Jerry Huang/Li Yang" #define DRIVER_DESC "Freescale USB OTG Transceiver Driver" diff --git a/drivers/usb/serial/aircable.c b/drivers/usb/serial/aircable.c index 80a9845cd93fba16e50ae7797606b7b9b925c887..569c2200ba42a915902385ed61f71bd8e0f124cc 100644 --- a/drivers/usb/serial/aircable.c +++ b/drivers/usb/serial/aircable.c @@ -29,12 +29,6 @@ * is any other control code, I will simply check for the first * one. 
* - * The driver registers himself with the USB-serial core and the USB Core. I had - * to implement a probe function against USB-serial, because other way, the - * driver was attaching himself to both interfaces. I have tried with different - * configurations of usb_serial_driver with out exit, only the probe function - * could handle this correctly. - * * I have taken some info from a Greg Kroah-Hartman article: * http://www.linuxjournal.com/article/6573 * And from Linux Device Driver Kit CD, which is a great work, the authors taken @@ -93,30 +87,17 @@ static int aircable_prepare_write_buffer(struct usb_serial_port *port, return count + HCI_HEADER_LENGTH; } -static int aircable_probe(struct usb_serial *serial, - const struct usb_device_id *id) +static int aircable_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { - struct usb_host_interface *iface_desc = serial->interface-> - cur_altsetting; - struct usb_endpoint_descriptor *endpoint; - int num_bulk_out = 0; - int i; - - for (i = 0; i < iface_desc->desc.bNumEndpoints; i++) { - endpoint = &iface_desc->endpoint[i].desc; - if (usb_endpoint_is_bulk_out(endpoint)) { - dev_dbg(&serial->dev->dev, - "found bulk out on endpoint %d\n", i); - ++num_bulk_out; - } - } - - if (num_bulk_out == 0) { - dev_dbg(&serial->dev->dev, "Invalid interface, discarding\n"); + /* Ignore the first interface, which has no bulk endpoints. */ + if (epds->num_bulk_out == 0) { + dev_dbg(&serial->interface->dev, + "ignoring interface with no bulk-out endpoints\n"); return -ENODEV; } - return 0; + return 1; } static int aircable_process_packet(struct usb_serial_port *port, @@ -164,9 +145,8 @@ static struct usb_serial_driver aircable_device = { .name = "aircable", }, .id_table = id_table, - .num_ports = 1, .bulk_out_size = HCI_COMPLETE_FRAME, - .probe = aircable_probe, + .calc_num_ports = aircable_calc_num_ports, .process_read_urb = aircable_process_read_urb, .prepare_write_buffer = aircable_prepare_write_buffer, .throttle = usb_serial_generic_throttle, diff --git a/drivers/usb/serial/ark3116.c b/drivers/usb/serial/ark3116.c index 2779e59c30f1e3532d036b37c97f799d963c52fb..0adbd38b4eeaef03ab7f4230158bd37cdb918a01 100644 --- a/drivers/usb/serial/ark3116.c +++ b/drivers/usb/serial/ark3116.c @@ -122,19 +122,6 @@ static inline int calc_divisor(int bps) return (12000000 + 2*bps) / (4*bps); } -static int ark3116_attach(struct usb_serial *serial) -{ - /* make sure we have our end-points */ - if (serial->num_bulk_in == 0 || - serial->num_bulk_out == 0 || - serial->num_interrupt_in == 0) { - dev_err(&serial->interface->dev, "missing endpoint\n"); - return -ENODEV; - } - - return 0; -} - static int ark3116_port_probe(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; @@ -671,7 +658,9 @@ static struct usb_serial_driver ark3116_device = { }, .id_table = id_table, .num_ports = 1, - .attach = ark3116_attach, + .num_bulk_in = 1, + .num_bulk_out = 1, + .num_interrupt_in = 1, .port_probe = ark3116_port_probe, .port_remove = ark3116_port_remove, .set_termios = ark3116_set_termios, diff --git a/drivers/usb/serial/cyberjack.c b/drivers/usb/serial/cyberjack.c index 80260b08398b2e55833c65d21e0be70cc43ccf01..47fbd9f0c0c7a8508f23b562f4a220dbd19bc33f 100644 --- a/drivers/usb/serial/cyberjack.c +++ b/drivers/usb/serial/cyberjack.c @@ -50,7 +50,6 @@ #define CYBERJACK_PRODUCT_ID 0x0100 /* Function prototypes */ -static int cyberjack_attach(struct usb_serial *serial); static int cyberjack_port_probe(struct usb_serial_port *port); static int 
cyberjack_port_remove(struct usb_serial_port *port); static int cyberjack_open(struct tty_struct *tty, @@ -78,7 +77,7 @@ static struct usb_serial_driver cyberjack_device = { .description = "Reiner SCT Cyberjack USB card reader", .id_table = id_table, .num_ports = 1, - .attach = cyberjack_attach, + .num_bulk_out = 1, .port_probe = cyberjack_port_probe, .port_remove = cyberjack_port_remove, .open = cyberjack_open, @@ -102,14 +101,6 @@ struct cyberjack_private { short wrsent; /* Data already sent */ }; -static int cyberjack_attach(struct usb_serial *serial) -{ - if (serial->num_bulk_out < serial->num_ports) - return -ENODEV; - - return 0; -} - static int cyberjack_port_probe(struct usb_serial_port *port) { struct cyberjack_private *priv; diff --git a/drivers/usb/serial/digi_acceleport.c b/drivers/usb/serial/digi_acceleport.c index 6537d3ca2797d8573236578e3088f6dbce1ce1b5..2ce39af32cfa614c2b7ccd26d2df1581cfc89c16 100644 --- a/drivers/usb/serial/digi_acceleport.c +++ b/drivers/usb/serial/digi_acceleport.c @@ -273,6 +273,8 @@ static struct usb_serial_driver digi_acceleport_2_device = { .description = "Digi 2 port USB adapter", .id_table = id_table_2, .num_ports = 3, + .num_bulk_in = 4, + .num_bulk_out = 4, .open = digi_open, .close = digi_close, .dtr_rts = digi_dtr_rts, @@ -302,6 +304,8 @@ static struct usb_serial_driver digi_acceleport_4_device = { .description = "Digi 4 port USB adapter", .id_table = id_table_4, .num_ports = 4, + .num_bulk_in = 5, + .num_bulk_out = 5, .open = digi_open, .close = digi_close, .write = digi_write, @@ -1251,27 +1255,8 @@ static int digi_port_init(struct usb_serial_port *port, unsigned port_num) static int digi_startup(struct usb_serial *serial) { - struct device *dev = &serial->interface->dev; struct digi_serial *serial_priv; int ret; - int i; - - /* check whether the device has the expected number of endpoints */ - if (serial->num_port_pointers < serial->type->num_ports + 1) { - dev_err(dev, "OOB endpoints missing\n"); - return -ENODEV; - } - - for (i = 0; i < serial->type->num_ports + 1 ; i++) { - if (!serial->port[i]->read_urb) { - dev_err(dev, "bulk-in endpoint missing\n"); - return -ENODEV; - } - if (!serial->port[i]->write_urb) { - dev_err(dev, "bulk-out endpoint missing\n"); - return -ENODEV; - } - } serial_priv = kzalloc(sizeof(*serial_priv), GFP_KERNEL); if (!serial_priv) diff --git a/drivers/usb/serial/f81534.c b/drivers/usb/serial/f81534.c index 22f23a429a95cb6a1cdeaca445a92035fd3e98e8..3d616a2a9f963a96e73ae865651d40e868aeb4c9 100644 --- a/drivers/usb/serial/f81534.c +++ b/drivers/usb/serial/f81534.c @@ -611,20 +611,30 @@ static int f81534_find_config_idx(struct usb_serial *serial, u8 *index) * The f81534_calc_num_ports() will run to "new style" with checking * F81534_PORT_UNAVAILABLE section. 
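 * (That is, ports whose stored setting byte reads F81534_PORT_UNAVAILABLE
 * are skipped when counting.)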
*/ -static int f81534_calc_num_ports(struct usb_serial *serial) +static int f81534_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { + struct device *dev = &serial->interface->dev; + int size_bulk_in = usb_endpoint_maxp(epds->bulk_in[0]); + int size_bulk_out = usb_endpoint_maxp(epds->bulk_out[0]); u8 setting[F81534_CUSTOM_DATA_SIZE]; u8 setting_idx; u8 num_port = 0; int status; size_t i; + if (size_bulk_out != F81534_WRITE_BUFFER_SIZE || + size_bulk_in != F81534_MAX_RECEIVE_BLOCK_SIZE) { + dev_err(dev, "unsupported endpoint max packet size\n"); + return -ENODEV; + } + /* Check had custom setting */ status = f81534_find_config_idx(serial, &setting_idx); if (status) { dev_err(&serial->interface->dev, "%s: find idx failed: %d\n", __func__, status); - return 0; + return status; } /* @@ -640,7 +650,7 @@ static int f81534_calc_num_ports(struct usb_serial *serial) dev_err(&serial->interface->dev, "%s: get custom data failed: %d\n", __func__, status); - return 0; + return status; } dev_dbg(&serial->interface->dev, @@ -656,7 +666,7 @@ static int f81534_calc_num_ports(struct usb_serial *serial) dev_err(&serial->interface->dev, "%s: read failed: %d\n", __func__, status); - return 0; + return status; } dev_dbg(&serial->interface->dev, "%s: read default config\n", @@ -671,12 +681,24 @@ static int f81534_calc_num_ports(struct usb_serial *serial) ++num_port; } - if (num_port) - return num_port; + if (!num_port) { + dev_warn(&serial->interface->dev, + "no config found, assuming 4 ports\n"); + num_port = 4; /* Nothing found, oldest version IC */ + } + + /* + * Setup bulk-out endpoint multiplexing. All ports share the same + * bulk-out endpoint. + */ + BUILD_BUG_ON(ARRAY_SIZE(epds->bulk_out) < F81534_NUM_PORT); + + for (i = 1; i < num_port; ++i) + epds->bulk_out[i] = epds->bulk_out[0]; - dev_warn(&serial->interface->dev, "%s: Read Failed. default 4 ports\n", - __func__); - return 4; /* Nothing found, oldest version IC */ + epds->num_bulk_out = num_port; + + return num_port; } static void f81534_set_termios(struct tty_struct *tty, @@ -1067,96 +1089,6 @@ static void f81534_write_usb_callback(struct urb *urb) } } -static int f81534_setup_ports(struct usb_serial *serial) -{ - struct usb_serial_port *port; - u8 port0_out_address; - int buffer_size; - size_t i; - - /* - * In our system architecture, we had 2 or 4 serial ports, - * but only get 1 set of bulk in/out endpoints. - * - * The usb-serial subsystem will generate port 0 data, - * but port 1/2/3 will not. It's will generate write URB and buffer - * by following code and use the port0 read URB for read operation. 
- */ - for (i = 1; i < serial->num_ports; ++i) { - port0_out_address = serial->port[0]->bulk_out_endpointAddress; - buffer_size = serial->port[0]->bulk_out_size; - port = serial->port[i]; - - if (kfifo_alloc(&port->write_fifo, PAGE_SIZE, GFP_KERNEL)) - return -ENOMEM; - - port->bulk_out_size = buffer_size; - port->bulk_out_endpointAddress = port0_out_address; - - port->write_urbs[0] = usb_alloc_urb(0, GFP_KERNEL); - if (!port->write_urbs[0]) - return -ENOMEM; - - port->bulk_out_buffers[0] = kzalloc(buffer_size, GFP_KERNEL); - if (!port->bulk_out_buffers[0]) - return -ENOMEM; - - usb_fill_bulk_urb(port->write_urbs[0], serial->dev, - usb_sndbulkpipe(serial->dev, - port0_out_address), - port->bulk_out_buffers[0], buffer_size, - serial->type->write_bulk_callback, port); - - port->write_urb = port->write_urbs[0]; - port->bulk_out_buffer = port->bulk_out_buffers[0]; - } - - return 0; -} - -static int f81534_probe(struct usb_serial *serial, - const struct usb_device_id *id) -{ - struct usb_endpoint_descriptor *endpoint; - struct usb_host_interface *iface_desc; - struct device *dev; - int num_bulk_in = 0; - int num_bulk_out = 0; - int size_bulk_in = 0; - int size_bulk_out = 0; - int i; - - dev = &serial->interface->dev; - iface_desc = serial->interface->cur_altsetting; - - for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { - endpoint = &iface_desc->endpoint[i].desc; - - if (usb_endpoint_is_bulk_in(endpoint)) { - ++num_bulk_in; - size_bulk_in = usb_endpoint_maxp(endpoint); - } - - if (usb_endpoint_is_bulk_out(endpoint)) { - ++num_bulk_out; - size_bulk_out = usb_endpoint_maxp(endpoint); - } - } - - if (num_bulk_in != 1 || num_bulk_out != 1) { - dev_err(dev, "expected endpoints not found\n"); - return -ENODEV; - } - - if (size_bulk_out != F81534_WRITE_BUFFER_SIZE || - size_bulk_in != F81534_MAX_RECEIVE_BLOCK_SIZE) { - dev_err(dev, "unsupported endpoint max packet size\n"); - return -ENODEV; - } - - return 0; -} - static int f81534_attach(struct usb_serial *serial) { struct f81534_serial_private *serial_priv; @@ -1173,10 +1105,6 @@ static int f81534_attach(struct usb_serial *serial) mutex_init(&serial_priv->urb_mutex); - status = f81534_setup_ports(serial); - if (status) - return status; - /* Check had custom setting */ status = f81534_find_config_idx(serial, &serial_priv->setting_idx); if (status) { @@ -1380,12 +1308,13 @@ static struct usb_serial_driver f81534_device = { }, .description = DRIVER_DESC, .id_table = f81534_id_table, + .num_bulk_in = 1, + .num_bulk_out = 1, .open = f81534_open, .close = f81534_close, .write = f81534_write, .tx_empty = f81534_tx_empty, .calc_num_ports = f81534_calc_num_ports, - .probe = f81534_probe, .attach = f81534_attach, .port_probe = f81534_port_probe, .dtr_rts = f81534_dtr_rts, diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index c540de15aad2c7d301543aef202b3560fb2f7399..d38780fa87881a0ef3696e73e61527304271393c 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -873,6 +873,7 @@ static const struct usb_device_id id_table_combined[] = { { USB_DEVICE_AND_INTERFACE_INFO(MICROCHIP_VID, MICROCHIP_USB_BOARD_PID, USB_CLASS_VENDOR_SPEC, USB_SUBCLASS_VENDOR_SPEC, 0x00) }, + { USB_DEVICE_INTERFACE_NUMBER(ACTEL_VID, MICROSEMI_ARROW_SF2PLUS_BOARD_PID, 2) }, { USB_DEVICE(JETI_VID, JETI_SPC1201_PID) }, { USB_DEVICE(MARVELL_VID, MARVELL_SHEEVAPLUG_PID), .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, @@ -1406,6 +1407,9 @@ static int write_latency_timer(struct usb_serial_port *port) int rv; int l = priv->latency; + 
if (priv->chip_type == SIO || priv->chip_type == FT8U232AM) + return -EINVAL; + if (priv->flags & ASYNC_LOW_LATENCY) l = 1; @@ -1422,7 +1426,7 @@ static int write_latency_timer(struct usb_serial_port *port) return rv; } -static int read_latency_timer(struct usb_serial_port *port) +static int _read_latency_timer(struct usb_serial_port *port) { struct ftdi_private *priv = usb_get_serial_port_data(port); struct usb_device *udev = port->serial->dev; @@ -1440,11 +1444,10 @@ static int read_latency_timer(struct usb_serial_port *port) 0, priv->interface, buf, 1, WDR_TIMEOUT); if (rv < 1) { - dev_err(&port->dev, "Unable to read latency timer: %i\n", rv); if (rv >= 0) rv = -EIO; } else { - priv->latency = buf[0]; + rv = buf[0]; } kfree(buf); @@ -1452,6 +1455,25 @@ static int read_latency_timer(struct usb_serial_port *port) return rv; } +static int read_latency_timer(struct usb_serial_port *port) +{ + struct ftdi_private *priv = usb_get_serial_port_data(port); + int rv; + + if (priv->chip_type == SIO || priv->chip_type == FT8U232AM) + return -EINVAL; + + rv = _read_latency_timer(port); + if (rv < 0) { + dev_err(&port->dev, "Unable to read latency timer: %i\n", rv); + return rv; + } + + priv->latency = rv; + + return 0; +} + static int get_serial_info(struct usb_serial_port *port, struct serial_struct __user *retinfo) { @@ -1603,9 +1625,19 @@ static void ftdi_determine_type(struct usb_serial_port *port) priv->baud_base = 12000000 / 16; } else if (version < 0x400) { /* Assume it's an FT8U232AM (or FT8U245AM) */ - /* (It might be a BM because of the iSerialNumber bug, - * but it will still work as an AM device.) */ priv->chip_type = FT8U232AM; + /* + * It might be a BM type because of the iSerialNumber bug. + * If iSerialNumber==0 and the latency timer is readable, + * assume it is BM type. 
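+	 * (AM types do not implement the latency timer register, so the
+	 * read fails on them, while BM and later types answer it.)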
+ */ + if (udev->descriptor.iSerialNumber == 0 && + _read_latency_timer(port) >= 0) { + dev_dbg(&port->dev, + "%s: has latency timer so not an AM type\n", + __func__); + priv->chip_type = FT232BM; + } } else if (version < 0x600) { /* Assume it's an FT232BM (or FT245BM) */ priv->chip_type = FT232BM; @@ -1685,9 +1717,12 @@ static ssize_t latency_timer_store(struct device *dev, { struct usb_serial_port *port = to_usb_serial_port(dev); struct ftdi_private *priv = usb_get_serial_port_data(port); - int v = simple_strtoul(valbuf, NULL, 10); + u8 v; int rv; + if (kstrtou8(valbuf, 10, &v)) + return -EINVAL; + priv->latency = v; rv = write_latency_timer(port); if (rv < 0) @@ -1704,10 +1739,13 @@ static ssize_t store_event_char(struct device *dev, struct usb_serial_port *port = to_usb_serial_port(dev); struct ftdi_private *priv = usb_get_serial_port_data(port); struct usb_device *udev = port->serial->dev; - int v = simple_strtoul(valbuf, NULL, 10); + unsigned int v; int rv; - dev_dbg(&port->dev, "%s: setting event char = %i\n", __func__, v); + if (kstrtouint(valbuf, 0, &v) || v >= 0x200) + return -EINVAL; + + dev_dbg(&port->dev, "%s: setting event char = 0x%03x\n", __func__, v); rv = usb_control_msg(udev, usb_sndctrlpipe(udev, 0), diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index 48ee04c94a7541ad6c5f9023be107246207c56fd..71fb9e59db7125c845a9a1c23c4e33430a142a3e 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -873,6 +873,12 @@ #define FIC_VID 0x1457 #define FIC_NEO1973_DEBUG_PID 0x5118 +/* + * Actel / Microsemi + */ +#define ACTEL_VID 0x1514 +#define MICROSEMI_ARROW_SF2PLUS_BOARD_PID 0x2008 + /* Olimex */ #define OLIMEX_VID 0x15BA #define OLIMEX_ARM_USB_OCD_PID 0x0003 diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index 49ce2be90fa00e5b4305784f2532e34069752072..35cb8c0e584fc19f4550e24eb7b35fed3819bcc4 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -37,13 +37,41 @@ MODULE_PARM_DESC(product, "User specified USB idProduct"); static struct usb_device_id generic_device_ids[2]; /* Initially all zeroes. 
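 * (Filled in from the vendor/product module parameters when the
 * generic driver is registered.)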
*/ -struct usb_serial_driver usb_serial_generic_device = { +static int usb_serial_generic_probe(struct usb_serial *serial, + const struct usb_device_id *id) +{ + struct device *dev = &serial->interface->dev; + + dev_info(dev, "The \"generic\" usb-serial driver is only for testing and one-off prototypes.\n"); + dev_info(dev, "Tell linux-usb@vger.kernel.org to add your device to a proper driver.\n"); + + return 0; +} + +static int usb_serial_generic_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) +{ + struct device *dev = &serial->interface->dev; + int num_ports; + + num_ports = max(epds->num_bulk_in, epds->num_bulk_out); + + if (num_ports == 0) { + dev_err(dev, "device has no bulk endpoints\n"); + return -ENODEV; + } + + return num_ports; +} + +static struct usb_serial_driver usb_serial_generic_device = { .driver = { .owner = THIS_MODULE, .name = "generic", }, .id_table = generic_device_ids, - .num_ports = 1, + .probe = usb_serial_generic_probe, + .calc_num_ports = usb_serial_generic_calc_num_ports, .throttle = usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, .resume = usb_serial_generic_resume, diff --git a/drivers/usb/serial/io_edgeport.c b/drivers/usb/serial/io_edgeport.c index bb7673e80a57aeeba501cd7dc4628394c491f1d0..bdf8bd814a9aaf89d646545cac01da659ab475b6 100644 --- a/drivers/usb/serial/io_edgeport.c +++ b/drivers/usb/serial/io_edgeport.c @@ -1544,11 +1544,6 @@ static void edge_set_termios(struct tty_struct *tty, struct usb_serial_port *port, struct ktermios *old_termios) { struct edgeport_port *edge_port = usb_get_serial_port_data(port); - unsigned int cflag; - - cflag = tty->termios.c_cflag; - dev_dbg(&port->dev, "%s - clfag %08x iflag %08x\n", __func__, tty->termios.c_cflag, tty->termios.c_iflag); - dev_dbg(&port->dev, "%s - old clfag %08x old iflag %08x\n", __func__, old_termios->c_cflag, old_termios->c_iflag); if (edge_port == NULL) return; @@ -2844,14 +2839,9 @@ static int edge_startup(struct usb_serial *serial) bool interrupt_in_found; bool bulk_in_found; bool bulk_out_found; - static __u32 descriptor[3] = { EDGE_COMPATIBILITY_MASK0, - EDGE_COMPATIBILITY_MASK1, - EDGE_COMPATIBILITY_MASK2 }; - - if (serial->num_bulk_in < 1 || serial->num_interrupt_in < 1) { - dev_err(&serial->interface->dev, "missing endpoints\n"); - return -ENODEV; - } + static const __u32 descriptor[3] = { EDGE_COMPATIBILITY_MASK0, + EDGE_COMPATIBILITY_MASK1, + EDGE_COMPATIBILITY_MASK2 }; dev = serial->dev; @@ -3120,6 +3110,9 @@ static struct usb_serial_driver edgeport_2port_device = { .description = "Edgeport 2 port adapter", .id_table = edgeport_2port_id_table, .num_ports = 2, + .num_bulk_in = 1, + .num_bulk_out = 1, + .num_interrupt_in = 1, .open = edge_open, .close = edge_close, .throttle = edge_throttle, @@ -3152,6 +3145,9 @@ static struct usb_serial_driver edgeport_4port_device = { .description = "Edgeport 4 port adapter", .id_table = edgeport_4port_id_table, .num_ports = 4, + .num_bulk_in = 1, + .num_bulk_out = 1, + .num_interrupt_in = 1, .open = edge_open, .close = edge_close, .throttle = edge_throttle, @@ -3184,6 +3180,9 @@ static struct usb_serial_driver edgeport_8port_device = { .description = "Edgeport 8 port adapter", .id_table = edgeport_8port_id_table, .num_ports = 8, + .num_bulk_in = 1, + .num_bulk_out = 1, + .num_interrupt_in = 1, .open = edge_open, .close = edge_close, .throttle = edge_throttle, @@ -3216,6 +3215,9 @@ static struct usb_serial_driver epic_device = { .description = "EPiC device", .id_table = Epic_port_id_table, .num_ports 
= 1, + .num_bulk_in = 1, + .num_bulk_out = 1, + .num_interrupt_in = 1, .open = edge_open, .close = edge_close, .throttle = edge_throttle, diff --git a/drivers/usb/serial/io_ti.c b/drivers/usb/serial/io_ti.c index a76b95d32157871f5e2964b629784a7642da8480..87798e625d6c827ed09b79a011eb46cdd770c864 100644 --- a/drivers/usb/serial/io_ti.c +++ b/drivers/usb/serial/io_ti.c @@ -1933,13 +1933,6 @@ static int edge_open(struct tty_struct *tty, struct usb_serial_port *port) if (edge_serial->num_ports_open == 0) { /* we are the first port to open, post the interrupt urb */ urb = edge_serial->serial->port[0]->interrupt_in_urb; - if (!urb) { - dev_err(&port->dev, - "%s - no interrupt urb present, exiting\n", - __func__); - status = -EINVAL; - goto release_es_lock; - } urb->context = edge_serial; status = usb_submit_urb(urb, GFP_KERNEL); if (status) { @@ -1959,12 +1952,6 @@ static int edge_open(struct tty_struct *tty, struct usb_serial_port *port) /* start up our bulk read urb */ urb = port->read_urb; - if (!urb) { - dev_err(&port->dev, "%s - no read urb present, exiting\n", - __func__); - status = -EINVAL; - goto unlink_int_urb; - } edge_port->ep_read_urb_state = EDGE_READ_URB_RUNNING; urb->context = edge_port; status = usb_submit_urb(urb, GFP_KERNEL); @@ -2385,14 +2372,6 @@ static void edge_set_termios(struct tty_struct *tty, struct usb_serial_port *port, struct ktermios *old_termios) { struct edgeport_port *edge_port = usb_get_serial_port_data(port); - unsigned int cflag; - - cflag = tty->termios.c_cflag; - - dev_dbg(&port->dev, "%s - clfag %08x iflag %08x\n", __func__, - tty->termios.c_cflag, tty->termios.c_iflag); - dev_dbg(&port->dev, "%s - old clfag %08x old iflag %08x\n", __func__, - old_termios->c_cflag, old_termios->c_iflag); if (edge_port == NULL) return; @@ -2544,19 +2523,31 @@ static void edge_heartbeat_work(struct work_struct *work) edge_heartbeat_schedule(serial); } -static int edge_startup(struct usb_serial *serial) +static int edge_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { - struct edgeport_serial *edge_serial; - int status; - u16 product_id; + struct device *dev = &serial->interface->dev; + unsigned char num_ports = serial->type->num_ports; /* Make sure we have the required endpoints when in download mode. 
*/ if (serial->interface->cur_altsetting->desc.bNumEndpoints > 1) { - if (serial->num_bulk_in < serial->num_ports || - serial->num_bulk_out < serial->num_ports) + if (epds->num_bulk_in < num_ports || + epds->num_bulk_out < num_ports || + epds->num_interrupt_in < 1) { + dev_err(dev, "required endpoints missing\n"); return -ENODEV; + } } + return num_ports; +} + +static int edge_startup(struct usb_serial *serial) +{ + struct edgeport_serial *edge_serial; + int status; + u16 product_id; + /* create our private serial structure */ edge_serial = kzalloc(sizeof(struct edgeport_serial), GFP_KERNEL); if (!edge_serial) @@ -2736,11 +2727,13 @@ static struct usb_serial_driver edgeport_1port_device = { .description = "Edgeport TI 1 port adapter", .id_table = edgeport_1port_id_table, .num_ports = 1, + .num_bulk_out = 1, .open = edge_open, .close = edge_close, .throttle = edge_throttle, .unthrottle = edge_unthrottle, .attach = edge_startup, + .calc_num_ports = edge_calc_num_ports, .disconnect = edge_disconnect, .release = edge_release, .port_probe = edge_port_probe, @@ -2773,11 +2766,13 @@ static struct usb_serial_driver edgeport_2port_device = { .description = "Edgeport TI 2 port adapter", .id_table = edgeport_2port_id_table, .num_ports = 2, + .num_bulk_out = 1, .open = edge_open, .close = edge_close, .throttle = edge_throttle, .unthrottle = edge_unthrottle, .attach = edge_startup, + .calc_num_ports = edge_calc_num_ports, .disconnect = edge_disconnect, .release = edge_release, .port_probe = edge_port_probe, diff --git a/drivers/usb/serial/ipaq.c b/drivers/usb/serial/ipaq.c index ec1b8f2c11837fecd39468c1e053880d72da9f94..cde0dcdce9c4d1402b69076c2a6a228b36963833 100644 --- a/drivers/usb/serial/ipaq.c +++ b/drivers/usb/serial/ipaq.c @@ -33,7 +33,8 @@ static int initial_wait; /* Function prototypes for an ipaq */ static int ipaq_open(struct tty_struct *tty, struct usb_serial_port *port); -static int ipaq_calc_num_ports(struct usb_serial *serial); +static int ipaq_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds); static int ipaq_startup(struct usb_serial *serial); static const struct usb_device_id ipaq_id_table[] = { @@ -550,42 +551,38 @@ static int ipaq_open(struct tty_struct *tty, return usb_serial_generic_open(tty, port); } -static int ipaq_calc_num_ports(struct usb_serial *serial) +static int ipaq_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { /* - * some devices have 3 endpoints, the 3rd of which - * must be ignored as it would make the core - * create a second port which oopses when used + * Some of the devices in ipaq_id_table[] are composite, and we + * shouldn't bind to all the interfaces. This test will rule out + * some obviously invalid possibilities. */ - int ipaq_num_ports = 1; - - dev_dbg(&serial->dev->dev, "%s - numberofendpoints: %d\n", __func__, - (int)serial->interface->cur_altsetting->desc.bNumEndpoints); + if (epds->num_bulk_in == 0 || epds->num_bulk_out == 0) + return -ENODEV; /* - * a few devices have 4 endpoints, seemingly Yakuma devices, - * and we need the second pair, so let them have 2 ports - * - * TODO: can we drop port 1 ? + * A few devices have four endpoints, seemingly Yakuma devices, and + * we need the second pair. 
*/ - if (serial->interface->cur_altsetting->desc.bNumEndpoints > 3) { - ipaq_num_ports = 2; + if (epds->num_bulk_in > 1 && epds->num_bulk_out > 1) { + epds->bulk_in[0] = epds->bulk_in[1]; + epds->bulk_out[0] = epds->bulk_out[1]; } - return ipaq_num_ports; -} + /* + * Other devices have 3 endpoints, but we only use the first bulk in + * and out endpoints. + */ + epds->num_bulk_in = 1; + epds->num_bulk_out = 1; + return 1; +} static int ipaq_startup(struct usb_serial *serial) { - /* Some of the devices in ipaq_id_table[] are composite, and we - * shouldn't bind to all the interfaces. This test will rule out - * some obviously invalid possibilities. - */ - if (serial->num_bulk_in < serial->num_ports || - serial->num_bulk_out < serial->num_ports) - return -ENODEV; - if (serial->dev->actconfig->desc.bConfigurationValue != 1) { /* * FIXME: HP iPaq rx3715, possibly others, have 1 config that @@ -597,10 +594,6 @@ static int ipaq_startup(struct usb_serial *serial) return -ENODEV; } - dev_dbg(&serial->dev->dev, - "%s - iPAQ module configured for %d ports\n", __func__, - serial->num_ports); - return usb_reset_configuration(serial->dev); } diff --git a/drivers/usb/serial/iuu_phoenix.c b/drivers/usb/serial/iuu_phoenix.c index 030390f37b0a8e4822a58b9e1e3fa412736ad451..18fc992a245fcd8c700aea392ed2a1fcf9d584ea 100644 --- a/drivers/usb/serial/iuu_phoenix.c +++ b/drivers/usb/serial/iuu_phoenix.c @@ -68,16 +68,6 @@ struct iuu_private { u32 clk; }; -static int iuu_attach(struct usb_serial *serial) -{ - unsigned char num_ports = serial->num_ports; - - if (serial->num_bulk_in < num_ports || serial->num_bulk_out < num_ports) - return -ENODEV; - - return 0; -} - static int iuu_port_probe(struct usb_serial_port *port) { struct iuu_private *priv; @@ -598,9 +588,8 @@ static void read_buf_callback(struct urb *urb) } dev_dbg(&port->dev, "%s - %i chars to write\n", __func__, urb->actual_length); - if (data == NULL) - dev_dbg(&port->dev, "%s - data is NULL !!!\n", __func__); - if (urb->actual_length && data) { + + if (urb->actual_length) { tty_insert_flip_string(&port->port, data, urb->actual_length); tty_flip_buffer_push(&port->port); } @@ -665,10 +654,8 @@ static void iuu_uart_read_callback(struct urb *urb) /* error stop all */ return; } - if (data == NULL) - dev_dbg(&port->dev, "%s - data is NULL !!!\n", __func__); - if (urb->actual_length == 1 && data != NULL) + if (urb->actual_length == 1) len = (int) data[0]; if (urb->actual_length > 1) { @@ -1183,6 +1170,8 @@ static struct usb_serial_driver iuu_device = { }, .id_table = id_table, .num_ports = 1, + .num_bulk_in = 1, + .num_bulk_out = 1, .bulk_in_size = 512, .bulk_out_size = 512, .open = iuu_open, @@ -1193,7 +1182,6 @@ static struct usb_serial_driver iuu_device = { .tiocmset = iuu_tiocmset, .set_termios = iuu_set_termios, .init_termios = iuu_init_termios, - .attach = iuu_attach, .port_probe = iuu_port_probe, .port_remove = iuu_port_remove, }; diff --git a/drivers/usb/serial/keyspan_pda.c b/drivers/usb/serial/keyspan_pda.c index d2dab2a341b827d558eea480858b791344fd38b6..196908dd25a1e8f6591643a0149f4d83c46f3c62 100644 --- a/drivers/usb/serial/keyspan_pda.c +++ b/drivers/usb/serial/keyspan_pda.c @@ -708,19 +708,6 @@ MODULE_FIRMWARE("keyspan_pda/keyspan_pda.fw"); MODULE_FIRMWARE("keyspan_pda/xircom_pgs.fw"); #endif -static int keyspan_pda_attach(struct usb_serial *serial) -{ - unsigned char num_ports = serial->num_ports; - - if (serial->num_bulk_out < num_ports || - serial->num_interrupt_in < num_ports) { - dev_err(&serial->interface->dev, "missing endpoints\n"); - 
return -ENODEV; - } - - return 0; -} - static int keyspan_pda_port_probe(struct usb_serial_port *port) { @@ -784,6 +771,8 @@ static struct usb_serial_driver keyspan_pda_device = { .description = "Keyspan PDA", .id_table = id_table_std, .num_ports = 1, + .num_bulk_out = 1, + .num_interrupt_in = 1, .dtr_rts = keyspan_pda_dtr_rts, .open = keyspan_pda_open, .close = keyspan_pda_close, @@ -798,7 +787,6 @@ static struct usb_serial_driver keyspan_pda_device = { .break_ctl = keyspan_pda_break_ctl, .tiocmget = keyspan_pda_tiocmget, .tiocmset = keyspan_pda_tiocmset, - .attach = keyspan_pda_attach, .port_probe = keyspan_pda_port_probe, .port_remove = keyspan_pda_port_remove, }; diff --git a/drivers/usb/serial/kobil_sct.c b/drivers/usb/serial/kobil_sct.c index 813035f51fe73a4e0de69ce76e6c3329ba4d376c..3024b9b253605160a5f5e7038159bcc47ec4d1ea 100644 --- a/drivers/usb/serial/kobil_sct.c +++ b/drivers/usb/serial/kobil_sct.c @@ -51,7 +51,6 @@ /* Function prototypes */ -static int kobil_attach(struct usb_serial *serial); static int kobil_port_probe(struct usb_serial_port *probe); static int kobil_port_remove(struct usb_serial_port *probe); static int kobil_open(struct tty_struct *tty, struct usb_serial_port *port); @@ -87,7 +86,7 @@ static struct usb_serial_driver kobil_device = { .description = "KOBIL USB smart card terminal", .id_table = id_table, .num_ports = 1, - .attach = kobil_attach, + .num_interrupt_out = 1, .port_probe = kobil_port_probe, .port_remove = kobil_port_remove, .ioctl = kobil_ioctl, @@ -115,16 +114,6 @@ struct kobil_private { }; -static int kobil_attach(struct usb_serial *serial) -{ - if (serial->num_interrupt_out < serial->num_ports) { - dev_err(&serial->interface->dev, "missing interrupt-out endpoint\n"); - return -ENODEV; - } - - return 0; -} - static int kobil_port_probe(struct usb_serial_port *port) { struct usb_serial *serial = port->serial; diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c index f075121c6e32329d136935d2d1d56ae8b49c784f..a453965f9e9a3153ffcbb28d4a6257a4db094cd3 100644 --- a/drivers/usb/serial/mos7720.c +++ b/drivers/usb/serial/mos7720.c @@ -973,11 +973,24 @@ static void mos7720_bulk_out_data_callback(struct urb *urb) tty_port_tty_wakeup(&mos7720_port->port->port); } -static int mos77xx_calc_num_ports(struct usb_serial *serial) +static int mos77xx_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { u16 product = le16_to_cpu(serial->dev->descriptor.idProduct); - if (product == MOSCHIP_DEVICE_ID_7715) + + if (product == MOSCHIP_DEVICE_ID_7715) { + /* + * The 7715 uses the first bulk in/out endpoint pair for the + * parallel port, and the second for the serial port. We swap + * the endpoint descriptors here so that the first and + * only registered port structure uses the serial-port + * endpoints. + */ + swap(epds->bulk_in[0], epds->bulk_in[1]); + swap(epds->bulk_out[0], epds->bulk_out[1]); + return 1; + } return 2; } @@ -1395,7 +1408,7 @@ struct divisor_table_entry { /* Define table of divisors for moschip 7720 hardware * * These assume a 3.6864MHz crystal, the standard /16, and * * MCR.7 = 0.
*/ -static struct divisor_table_entry divisor_table[] = { +static const struct divisor_table_entry divisor_table[] = { { 50, 2304}, { 110, 1047}, /* 2094.545455 => 230450 => .0217 % over */ { 134, 857}, /* 1713.011152 => 230398.5 => .00065% under */ @@ -1675,7 +1688,6 @@ static void mos7720_set_termios(struct tty_struct *tty, struct usb_serial_port *port, struct ktermios *old_termios) { int status; - unsigned int cflag; struct usb_serial *serial; struct moschip_port *mos7720_port; @@ -1691,16 +1703,6 @@ static void mos7720_set_termios(struct tty_struct *tty, return; } - dev_dbg(&port->dev, "setting termios - ASPIRE\n"); - - cflag = tty->termios.c_cflag; - - dev_dbg(&port->dev, "%s - cflag %08x iflag %08x\n", __func__, - tty->termios.c_cflag, RELEVANT_IFLAG(tty->termios.c_iflag)); - - dev_dbg(&port->dev, "%s - old cflag %08x old iflag %08x\n", __func__, - old_termios->c_cflag, RELEVANT_IFLAG(old_termios->c_iflag)); - /* change the port settings to the new ones specified */ change_port_settings(tty, mos7720_port, old_termios); @@ -1900,54 +1902,24 @@ static int mos7720_startup(struct usb_serial *serial) u16 product; int ret_val; - if (serial->num_bulk_in < 2 || serial->num_bulk_out < 2) { - dev_err(&serial->interface->dev, "missing bulk endpoints\n"); - return -ENODEV; - } - product = le16_to_cpu(serial->dev->descriptor.idProduct); dev = serial->dev; - /* - * The 7715 uses the first bulk in/out endpoint pair for the parallel - * port, and the second for the serial port. Because the usbserial core - * assumes both pairs are serial ports, we must engage in a bit of - * subterfuge and swap the pointers for ports 0 and 1 in order to make - * port 0 point to the serial port. However, both moschip devices use a - * single interrupt-in endpoint for both ports (as mentioned a little - * further down), and this endpoint was assigned to port 0. So after - * the swap, we must copy the interrupt endpoint elements from port 1 - * (as newly assigned) to port 0, and null out port 1 pointers. 
- */ - if (product == MOSCHIP_DEVICE_ID_7715) { - struct usb_serial_port *tmp = serial->port[0]; - serial->port[0] = serial->port[1]; - serial->port[1] = tmp; - serial->port[0]->interrupt_in_urb = tmp->interrupt_in_urb; - serial->port[0]->interrupt_in_buffer = tmp->interrupt_in_buffer; - serial->port[0]->interrupt_in_endpointAddress = - tmp->interrupt_in_endpointAddress; - serial->port[1]->interrupt_in_urb = NULL; - serial->port[1]->interrupt_in_buffer = NULL; - - if (serial->port[0]->interrupt_in_urb) { - struct urb *urb = serial->port[0]->interrupt_in_urb; - - urb->complete = mos7715_interrupt_callback; - } - } - /* setting configuration feature to one */ usb_control_msg(serial->dev, usb_sndctrlpipe(serial->dev, 0), (__u8)0x03, 0x00, 0x01, 0x00, NULL, 0x00, 5000); -#ifdef CONFIG_USB_SERIAL_MOS7715_PARPORT if (product == MOSCHIP_DEVICE_ID_7715) { + struct urb *urb = serial->port[0]->interrupt_in_urb; + + urb->complete = mos7715_interrupt_callback; + +#ifdef CONFIG_USB_SERIAL_MOS7715_PARPORT ret_val = mos7715_parport_init(serial); if (ret_val < 0) return ret_val; - } #endif + } /* start the interrupt urb */ ret_val = usb_submit_urb(serial->port[0]->interrupt_in_urb, GFP_KERNEL); if (ret_val) { @@ -2039,6 +2011,9 @@ static struct usb_serial_driver moschip7720_2port_driver = { }, .description = "Moschip 2 port adapter", .id_table = id_table, + .num_bulk_in = 2, + .num_bulk_out = 2, + .num_interrupt_in = 1, .calc_num_ports = mos77xx_calc_num_ports, .open = mos7720_open, .close = mos7720_close, diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index 3821c53fcee9ec2c269ce562ef25d7a67ed34965..e8669aae14b3504d5e8cf2a6db25900edd66e269 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -1868,7 +1868,6 @@ static void mos7840_set_termios(struct tty_struct *tty, struct ktermios *old_termios) { int status; - unsigned int cflag; struct usb_serial *serial; struct moschip_port *mos7840_port; @@ -1890,15 +1889,6 @@ static void mos7840_set_termios(struct tty_struct *tty, return; } - dev_dbg(&port->dev, "%s", "setting termios - \n"); - - cflag = tty->termios.c_cflag; - - dev_dbg(&port->dev, "%s - clfag %08x iflag %08x\n", __func__, - tty->termios.c_cflag, RELEVANT_IFLAG(tty->termios.c_iflag)); - dev_dbg(&port->dev, "%s - old clfag %08x old iflag %08x\n", __func__, - old_termios->c_cflag, RELEVANT_IFLAG(old_termios->c_iflag)); - /* change the port settings to the new ones specified */ mos7840_change_port_settings(tty, mos7840_port, old_termios); @@ -2104,26 +2094,27 @@ static int mos7840_probe(struct usb_serial *serial, return 0; } -static int mos7840_calc_num_ports(struct usb_serial *serial) +static int mos7840_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { int device_type = (unsigned long)usb_get_serial_data(serial); - int mos7840_num_ports; + int num_ports; - mos7840_num_ports = (device_type >> 4) & 0x000F; + num_ports = (device_type >> 4) & 0x000F; - return mos7840_num_ports; -} + /* + * num_ports is currently never zero as device_type is one of + * MOSCHIP_DEVICE_ID_78{1,2,4}0. 
+ */ + if (num_ports == 0) + return -ENODEV; -static int mos7840_attach(struct usb_serial *serial) -{ - if (serial->num_bulk_in < serial->num_ports || - serial->num_bulk_out < serial->num_ports || - serial->num_interrupt_in < 1) { + if (epds->num_bulk_in < num_ports || epds->num_bulk_out < num_ports) { dev_err(&serial->interface->dev, "missing endpoints\n"); return -ENODEV; } - return 0; + return num_ports; } static int mos7840_port_probe(struct usb_serial_port *port) @@ -2384,7 +2375,7 @@ static struct usb_serial_driver moschip7840_4port_device = { }, .description = DRIVER_DESC, .id_table = id_table, - .num_ports = 4, + .num_interrupt_in = 1, .open = mos7840_open, .close = mos7840_close, .write = mos7840_write, @@ -2401,7 +2392,6 @@ static struct usb_serial_driver moschip7840_4port_device = { .tiocmset = mos7840_tiocmset, .tiocmiwait = usb_serial_generic_tiocmiwait, .get_icount = usb_serial_generic_get_icount, - .attach = mos7840_attach, .port_probe = mos7840_port_probe, .port_remove = mos7840_port_remove, .read_bulk_callback = mos7840_bulk_in_callback, diff --git a/drivers/usb/serial/mxuport.c b/drivers/usb/serial/mxuport.c index c88215a0fa3d8d0eaa22be4e873cf59d2525e481..3aef091fe88b6b69540e012cc0842dcdb8e77e59 100644 --- a/drivers/usb/serial/mxuport.c +++ b/drivers/usb/serial/mxuport.c @@ -946,20 +946,39 @@ static void mxuport_set_termios(struct tty_struct *tty, * Determine how many ports this device has dynamically. It will be * called after the probe() callback is called, but before attach(). */ -static int mxuport_calc_num_ports(struct usb_serial *serial) +static int mxuport_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { unsigned long features = (unsigned long)usb_get_serial_data(serial); + int num_ports; + int i; - if (features & MX_UPORT_2_PORT) - return 2; - if (features & MX_UPORT_4_PORT) - return 4; - if (features & MX_UPORT_8_PORT) - return 8; - if (features & MX_UPORT_16_PORT) - return 16; + if (features & MX_UPORT_2_PORT) { + num_ports = 2; + } else if (features & MX_UPORT_4_PORT) { + num_ports = 4; + } else if (features & MX_UPORT_8_PORT) { + num_ports = 8; + } else if (features & MX_UPORT_16_PORT) { + num_ports = 16; + } else { + dev_warn(&serial->interface->dev, + "unknown device, assuming two ports\n"); + num_ports = 2; + } - return 0; + /* + * Setup bulk-out endpoint multiplexing. All ports share the same + * bulk-out endpoint. + */ + BUILD_BUG_ON(ARRAY_SIZE(epds->bulk_out) < 16); + + for (i = 1; i < num_ports; ++i) + epds->bulk_out[i] = epds->bulk_out[0]; + + epds->num_bulk_out = num_ports; + + return num_ports; } /* Get the version of the firmware currently running. 
*/ @@ -1142,102 +1161,11 @@ static int mxuport_port_probe(struct usb_serial_port *port) port->port_number); } -static int mxuport_alloc_write_urb(struct usb_serial *serial, - struct usb_serial_port *port, - struct usb_serial_port *port0, - int j) -{ - struct usb_device *dev = interface_to_usbdev(serial->interface); - - set_bit(j, &port->write_urbs_free); - port->write_urbs[j] = usb_alloc_urb(0, GFP_KERNEL); - if (!port->write_urbs[j]) - return -ENOMEM; - - port->bulk_out_buffers[j] = kmalloc(port0->bulk_out_size, GFP_KERNEL); - if (!port->bulk_out_buffers[j]) - return -ENOMEM; - - usb_fill_bulk_urb(port->write_urbs[j], dev, - usb_sndbulkpipe(dev, port->bulk_out_endpointAddress), - port->bulk_out_buffers[j], - port->bulk_out_size, - serial->type->write_bulk_callback, - port); - return 0; -} - - -static int mxuport_alloc_write_urbs(struct usb_serial *serial, - struct usb_serial_port *port, - struct usb_serial_port *port0) -{ - int j; - int ret; - - for (j = 0; j < ARRAY_SIZE(port->write_urbs); ++j) { - ret = mxuport_alloc_write_urb(serial, port, port0, j); - if (ret) - return ret; - } - return 0; -} - - static int mxuport_attach(struct usb_serial *serial) { struct usb_serial_port *port0 = serial->port[0]; struct usb_serial_port *port1 = serial->port[1]; - struct usb_serial_port *port; int err; - int i; - int j; - - /* - * Throw away all but the first allocated write URBs so we can - * set them up again to fit the multiplexing scheme. - */ - for (i = 1; i < serial->num_bulk_out; ++i) { - port = serial->port[i]; - for (j = 0; j < ARRAY_SIZE(port->write_urbs); ++j) { - usb_free_urb(port->write_urbs[j]); - kfree(port->bulk_out_buffers[j]); - port->write_urbs[j] = NULL; - port->bulk_out_buffers[j] = NULL; - } - port->write_urbs_free = 0; - } - - /* - * All write data is sent over the first bulk out endpoint, - * with an added header to indicate the port. Allocate URBs - * for each port to the first bulk out endpoint. - */ - for (i = 1; i < serial->num_ports; ++i) { - port = serial->port[i]; - port->bulk_out_size = port0->bulk_out_size; - port->bulk_out_endpointAddress = - port0->bulk_out_endpointAddress; - - err = mxuport_alloc_write_urbs(serial, port, port0); - if (err) - return err; - - port->write_urb = port->write_urbs[0]; - port->bulk_out_buffer = port->bulk_out_buffers[0]; - - /* - * Ensure each port has a fifo. The framework only - * allocates a fifo to ports with a bulk out endpoint, - * where as we need one for every port. - */ - if (!kfifo_initialized(&port->write_fifo)) { - err = kfifo_alloc(&port->write_fifo, PAGE_SIZE, - GFP_KERNEL); - if (err) - return err; - } - } /* * All data from the ports is received on the first bulk in @@ -1366,7 +1294,8 @@ static struct usb_serial_driver mxuport_device = { }, .description = "MOXA UPort", .id_table = mxuport_idtable, - .num_ports = 0, + .num_bulk_in = 2, + .num_bulk_out = 1, .probe = mxuport_probe, .port_probe = mxuport_port_probe, .attach = mxuport_attach, diff --git a/drivers/usb/serial/omninet.c b/drivers/usb/serial/omninet.c index dd706953b4660905cc5abf5e85477bfb6bdb4149..efcd7feed6f4d1c882627736de30253ca2858da6 100644 --- a/drivers/usb/serial/omninet.c +++ b/drivers/usb/serial/omninet.c @@ -1,6 +1,8 @@ /* * USB ZyXEL omni.net LCD PLUS driver * + * Copyright (C) 2013,2017 Johan Hovold + * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version * 2 as published by the Free Software Foundation. 
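The mxuport and ipaq hunks above show the key property of the reworked calc_num_ports() callback: a driver can both validate and remap the endpoint descriptors before the core sets up its port structures. A minimal sketch of that idiom, assuming the struct usb_serial_endpoints layout used in these hunks; the driver and its particular checks are hypothetical, not part of the patch:

static int example_calc_num_ports(struct usb_serial *serial,
		struct usb_serial_endpoints *epds)
{
	/* Refuse to bind unless a usable bulk endpoint pair was found. */
	if (epds->num_bulk_in == 0 || epds->num_bulk_out == 0)
		return -ENODEV;

	/* Remap so that port 0 uses the second pair, as ipaq does above. */
	if (epds->num_bulk_in > 1 && epds->num_bulk_out > 1) {
		epds->bulk_in[0] = epds->bulk_in[1];
		epds->bulk_out[0] = epds->bulk_out[1];
	}

	return 1;	/* number of ports to register */
}

Returning a negative errno aborts probe; returning 0 falls back to the driver's static num_ports, as the usb-serial.c hunk further down implements.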
@@ -32,12 +34,10 @@ /* function prototypes */ static void omninet_process_read_urb(struct urb *urb); -static void omninet_write_bulk_callback(struct urb *urb); -static int omninet_write(struct tty_struct *tty, struct usb_serial_port *port, - const unsigned char *buf, int count); -static int omninet_write_room(struct tty_struct *tty); -static void omninet_disconnect(struct usb_serial *serial); -static int omninet_attach(struct usb_serial *serial); +static int omninet_prepare_write_buffer(struct usb_serial_port *port, + void *buf, size_t count); +static int omninet_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds); static int omninet_port_probe(struct usb_serial_port *port); static int omninet_port_remove(struct usb_serial_port *port); @@ -55,15 +55,12 @@ static struct usb_serial_driver zyxel_omninet_device = { }, .description = "ZyXEL - omni.net lcd plus usb", .id_table = id_table, - .num_ports = 1, - .attach = omninet_attach, + .num_bulk_out = 2, + .calc_num_ports = omninet_calc_num_ports, .port_probe = omninet_port_probe, .port_remove = omninet_port_remove, - .write = omninet_write, - .write_room = omninet_write_room, - .write_bulk_callback = omninet_write_bulk_callback, .process_read_urb = omninet_process_read_urb, - .disconnect = omninet_disconnect, + .prepare_write_buffer = omninet_prepare_write_buffer, }; static struct usb_serial_driver * const serial_drivers[] = { @@ -104,15 +101,14 @@ struct omninet_data { __u8 od_outseq; /* Sequence number for bulk_out URBs */ }; -static int omninet_attach(struct usb_serial *serial) +static int omninet_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { - /* The second bulk-out endpoint is used for writing. */ - if (serial->num_bulk_out < 2) { - dev_err(&serial->interface->dev, "missing endpoints\n"); - return -ENODEV; - } + /* We need only the second bulk-out for our single-port device. */ + epds->bulk_out[0] = epds->bulk_out[1]; + epds->num_bulk_out = 1; - return 0; + return 1; } static int omninet_port_probe(struct usb_serial_port *port) @@ -159,96 +155,24 @@ static void omninet_process_read_urb(struct urb *urb) tty_flip_buffer_push(&port->port); } -static int omninet_write(struct tty_struct *tty, struct usb_serial_port *port, - const unsigned char *buf, int count) +static int omninet_prepare_write_buffer(struct usb_serial_port *port, + void *buf, size_t count) { - struct usb_serial *serial = port->serial; - struct usb_serial_port *wport = serial->port[1]; - struct omninet_data *od = usb_get_serial_port_data(port); - struct omninet_header *header = (struct omninet_header *) - wport->write_urb->transfer_buffer; - - int result; - - if (count == 0) { - dev_dbg(&port->dev, "%s - write request of 0 bytes\n", __func__); - return 0; - } - - if (!test_and_clear_bit(0, &port->write_urbs_free)) { - dev_dbg(&port->dev, "%s - already writing\n", __func__); - return 0; - } - - count = (count > OMNINET_PAYLOADSIZE) ? 
OMNINET_PAYLOADSIZE : count; - - memcpy(wport->write_urb->transfer_buffer + OMNINET_HEADERLEN, - buf, count); - - usb_serial_debug_data(&port->dev, __func__, count, - wport->write_urb->transfer_buffer); - - header->oh_seq = od->od_outseq++; - header->oh_len = count; - header->oh_xxx = 0x03; - header->oh_pad = 0x00; - - /* send the data out the bulk port, always 64 bytes */ - wport->write_urb->transfer_buffer_length = OMNINET_BULKOUTSIZE; - - result = usb_submit_urb(wport->write_urb, GFP_ATOMIC); - if (result) { - set_bit(0, &wport->write_urbs_free); - dev_err_console(port, - "%s - failed submitting write urb, error %d\n", - __func__, result); - } else - result = count; - - return result; -} + struct omninet_header *header = buf; + count = min_t(size_t, count, OMNINET_PAYLOADSIZE); -static int omninet_write_room(struct tty_struct *tty) -{ - struct usb_serial_port *port = tty->driver_data; - struct usb_serial *serial = port->serial; - struct usb_serial_port *wport = serial->port[1]; - - int room = 0; /* Default: no room */ - - if (test_bit(0, &wport->write_urbs_free)) - room = wport->bulk_out_size - OMNINET_HEADERLEN; - - dev_dbg(&port->dev, "%s - returns %d\n", __func__, room); - - return room; -} - -static void omninet_write_bulk_callback(struct urb *urb) -{ -/* struct omninet_header *header = (struct omninet_header *) - urb->transfer_buffer; */ - struct usb_serial_port *port = urb->context; - int status = urb->status; - - set_bit(0, &port->write_urbs_free); - if (status) { - dev_dbg(&port->dev, "%s - nonzero write bulk status received: %d\n", - __func__, status); - return; - } + count = kfifo_out_locked(&port->write_fifo, buf + OMNINET_HEADERLEN, + count, &port->lock); - usb_serial_port_softint(port); -} - - -static void omninet_disconnect(struct usb_serial *serial) -{ - struct usb_serial_port *wport = serial->port[1]; + header->oh_seq = od->od_outseq++; + header->oh_len = count; + header->oh_xxx = 0x03; + header->oh_pad = 0x00; - usb_kill_urb(wport->write_urb); + /* always 64 bytes */ + return OMNINET_BULKOUTSIZE; } module_usb_serial_driver(serial_drivers, id_table); diff --git a/drivers/usb/serial/opticon.c b/drivers/usb/serial/opticon.c index 3937b9c3cc691b778cfb7a310a6e5400ee53d095..58657d64678ba06eff913acec8c7584a19502db9 100644 --- a/drivers/usb/serial/opticon.c +++ b/drivers/usb/serial/opticon.c @@ -367,16 +367,6 @@ static int opticon_ioctl(struct tty_struct *tty, return -ENOIOCTLCMD; } -static int opticon_startup(struct usb_serial *serial) -{ - if (!serial->num_bulk_in) { - dev_err(&serial->dev->dev, "no bulk in endpoint\n"); - return -ENODEV; - } - - return 0; -} - static int opticon_port_probe(struct usb_serial_port *port) { struct opticon_private *priv; @@ -408,8 +398,8 @@ static struct usb_serial_driver opticon_device = { }, .id_table = id_table, .num_ports = 1, + .num_bulk_in = 1, .bulk_in_size = 256, - .attach = opticon_startup, .port_probe = opticon_port_probe, .port_remove = opticon_port_remove, .open = opticon_open, diff --git a/drivers/usb/serial/oti6858.c b/drivers/usb/serial/oti6858.c index b8bf52bf7a944d5fddd6976f90a63db3cd6807e5..b11eead469eee85860119eda06db9aecec5e1838 100644 --- a/drivers/usb/serial/oti6858.c +++ b/drivers/usb/serial/oti6858.c @@ -134,7 +134,6 @@ static int oti6858_chars_in_buffer(struct tty_struct *tty); static int oti6858_tiocmget(struct tty_struct *tty); static int oti6858_tiocmset(struct tty_struct *tty, unsigned int set, unsigned int clear); -static int oti6858_attach(struct usb_serial *serial); static int oti6858_port_probe(struct 
usb_serial_port *port); static int oti6858_port_remove(struct usb_serial_port *port); @@ -146,6 +145,9 @@ static struct usb_serial_driver oti6858_device = { }, .id_table = id_table, .num_ports = 1, + .num_bulk_in = 1, + .num_bulk_out = 1, + .num_interrupt_in = 1, .open = oti6858_open, .close = oti6858_close, .write = oti6858_write, @@ -159,7 +161,6 @@ static struct usb_serial_driver oti6858_device = { .write_bulk_callback = oti6858_write_bulk_callback, .write_room = oti6858_write_room, .chars_in_buffer = oti6858_chars_in_buffer, - .attach = oti6858_attach, .port_probe = oti6858_port_probe, .port_remove = oti6858_port_remove, }; @@ -326,20 +327,6 @@ static void send_data(struct work_struct *work) usb_serial_port_softint(port); } -static int oti6858_attach(struct usb_serial *serial) -{ - unsigned char num_ports = serial->num_ports; - - if (serial->num_bulk_in < num_ports || - serial->num_bulk_out < num_ports || - serial->num_interrupt_in < num_ports) { - dev_err(&serial->interface->dev, "missing endpoints\n"); - return -ENODEV; - } - - return 0; -} - static int oti6858_port_probe(struct usb_serial_port *port) { struct oti6858_private *priv; diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c index ca69eb42071bcb1ed5d01706698e2289d5757b03..c9ebefd8f35fdbe5491627f4df89b80c87aeabad 100644 --- a/drivers/usb/serial/pl2303.c +++ b/drivers/usb/serial/pl2303.c @@ -33,9 +33,11 @@ #define PL2303_QUIRK_UART_STATE_IDX0 BIT(0) #define PL2303_QUIRK_LEGACY BIT(1) +#define PL2303_QUIRK_ENDPOINT_HACK BIT(2) static const struct usb_device_id id_table[] = { - { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID) }, + { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID), + .driver_info = PL2303_QUIRK_ENDPOINT_HACK }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_RSAQ2) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_DCU11) }, { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_RSAQ3) }, @@ -48,7 +50,8 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(PL2303_VENDOR_ID, PL2303_PRODUCT_ID_ZTEK) }, { USB_DEVICE(IODATA_VENDOR_ID, IODATA_PRODUCT_ID) }, { USB_DEVICE(IODATA_VENDOR_ID, IODATA_PRODUCT_ID_RSAQ5) }, - { USB_DEVICE(ATEN_VENDOR_ID, ATEN_PRODUCT_ID) }, + { USB_DEVICE(ATEN_VENDOR_ID, ATEN_PRODUCT_ID), + .driver_info = PL2303_QUIRK_ENDPOINT_HACK }, { USB_DEVICE(ATEN_VENDOR_ID, ATEN_PRODUCT_ID2) }, { USB_DEVICE(ATEN_VENDOR_ID2, ATEN_PRODUCT_ID) }, { USB_DEVICE(ELCOM_VENDOR_ID, ELCOM_PRODUCT_ID) }, @@ -68,7 +71,8 @@ static const struct usb_device_id id_table[] = { .driver_info = PL2303_QUIRK_UART_STATE_IDX0 }, { USB_DEVICE(SIEMENS_VENDOR_ID, SIEMENS_PRODUCT_ID_X75), .driver_info = PL2303_QUIRK_UART_STATE_IDX0 }, - { USB_DEVICE(SIEMENS_VENDOR_ID, SIEMENS_PRODUCT_ID_EF81) }, + { USB_DEVICE(SIEMENS_VENDOR_ID, SIEMENS_PRODUCT_ID_EF81), + .driver_info = PL2303_QUIRK_ENDPOINT_HACK }, { USB_DEVICE(BENQ_VENDOR_ID, BENQ_PRODUCT_ID_S81) }, /* Benq/Siemens S81 */ { USB_DEVICE(SYNTECH_VENDOR_ID, SYNTECH_PRODUCT_ID) }, { USB_DEVICE(NOKIA_CA42_VENDOR_ID, NOKIA_CA42_PRODUCT_ID) }, @@ -78,7 +82,8 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(SPEEDDRAGON_VENDOR_ID, SPEEDDRAGON_PRODUCT_ID) }, { USB_DEVICE(DATAPILOT_U2_VENDOR_ID, DATAPILOT_U2_PRODUCT_ID) }, { USB_DEVICE(BELKIN_VENDOR_ID, BELKIN_PRODUCT_ID) }, - { USB_DEVICE(ALCOR_VENDOR_ID, ALCOR_PRODUCT_ID) }, + { USB_DEVICE(ALCOR_VENDOR_ID, ALCOR_PRODUCT_ID), + .driver_info = PL2303_QUIRK_ENDPOINT_HACK }, { USB_DEVICE(WS002IN_VENDOR_ID, WS002IN_PRODUCT_ID) }, { USB_DEVICE(COREGA_VENDOR_ID, COREGA_PRODUCT_ID) }, { 
USB_DEVICE(YCCABLE_VENDOR_ID, YCCABLE_PRODUCT_ID) }, @@ -218,20 +223,68 @@ static int pl2303_probe(struct usb_serial *serial, return 0; } +/* + * Use interrupt endpoint from first interface if available. + * + * This is needed due to the looney way its endpoints are set up. + */ +static int pl2303_endpoint_hack(struct usb_serial *serial, + struct usb_serial_endpoints *epds) +{ + struct usb_interface *interface = serial->interface; + struct usb_device *dev = serial->dev; + struct device *ddev = &interface->dev; + struct usb_host_interface *iface_desc; + struct usb_endpoint_descriptor *endpoint; + unsigned int i; + + if (interface == dev->actconfig->interface[0]) + return 0; + + /* check out the endpoints of the other interface */ + iface_desc = dev->actconfig->interface[0]->cur_altsetting; + + for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { + endpoint = &iface_desc->endpoint[i].desc; + + if (!usb_endpoint_is_int_in(endpoint)) + continue; + + dev_dbg(ddev, "found interrupt in on separate interface\n"); + if (epds->num_interrupt_in < ARRAY_SIZE(epds->interrupt_in)) + epds->interrupt_in[epds->num_interrupt_in++] = endpoint; + } + + return 0; +} + +static int pl2303_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) +{ + unsigned long quirks = (unsigned long)usb_get_serial_data(serial); + struct device *dev = &serial->interface->dev; + int ret; + + if (quirks & PL2303_QUIRK_ENDPOINT_HACK) { + ret = pl2303_endpoint_hack(serial, epds); + if (ret) + return ret; + } + + if (epds->num_interrupt_in < 1) { + dev_err(dev, "required interrupt-in endpoint missing\n"); + return -ENODEV; + } + + return 1; +} + static int pl2303_startup(struct usb_serial *serial) { struct pl2303_serial_private *spriv; - unsigned char num_ports = serial->num_ports; enum pl2303_type type = TYPE_01; unsigned char *buf; - if (serial->num_bulk_in < num_ports || - serial->num_bulk_out < num_ports || - serial->num_interrupt_in < num_ports) { - dev_err(&serial->interface->dev, "missing endpoints\n"); - return -ENODEV; - } - spriv = kzalloc(sizeof(*spriv), GFP_KERNEL); if (!spriv) return -ENOMEM; @@ -938,7 +991,9 @@ static struct usb_serial_driver pl2303_device = { .name = "pl2303", }, .id_table = id_table, - .num_ports = 1, + .num_bulk_in = 1, + .num_bulk_out = 1, + .num_interrupt_in = 0, /* see pl2303_calc_num_ports */ .bulk_in_size = 256, .bulk_out_size = 256, .open = pl2303_open, @@ -954,6 +1009,7 @@ static struct usb_serial_driver pl2303_device = { .process_read_urb = pl2303_process_read_urb, .read_int_callback = pl2303_read_int_callback, .probe = pl2303_probe, + .calc_num_ports = pl2303_calc_num_ports, .attach = pl2303_startup, .release = pl2303_release, .port_probe = pl2303_port_probe, diff --git a/drivers/usb/serial/quatech2.c b/drivers/usb/serial/quatech2.c index fdbb904d153f735b83c007137eaa42d5bf76a6b4..60e17d1444c39fc50f4fbd85a579e967ba823775 100644 --- a/drivers/usb/serial/quatech2.c +++ b/drivers/usb/serial/quatech2.c @@ -246,7 +246,8 @@ static inline int update_mctrl(struct qt2_port_private *port_priv, return status; } -static int qt2_calc_num_ports(struct usb_serial *serial) +static int qt2_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { struct qt2_device_detail d; int i; @@ -600,7 +601,6 @@ static void qt2_process_read_urb(struct urb *urb) escapeflag = true; break; case QT2_CONTROL_ESCAPE: - tty_buffer_request_room(&port->port, 2); tty_insert_flip_string(&port->port, ch, 2); i += 2; escapeflag = true; @@ -615,8 +615,7 @@ static void 
qt2_process_read_urb(struct urb *urb) continue; } - tty_buffer_request_room(&port->port, 1); - tty_insert_flip_string(&port->port, ch, 1); + tty_insert_flip_char(&port->port, *ch, TTY_NORMAL); } tty_flip_buffer_push(&port->port); diff --git a/drivers/usb/serial/sierra.c b/drivers/usb/serial/sierra.c index 465e851b281585b3b7445177868a061cecfd3c15..4c4ac4705ac03f8d850a653fe91e35b78bfe25b6 100644 --- a/drivers/usb/serial/sierra.c +++ b/drivers/usb/serial/sierra.c @@ -85,7 +85,8 @@ static int sierra_vsc_set_nmea(struct usb_device *udev, __u16 enable) USB_CTRL_SET_TIMEOUT); /* int timeout */ } -static int sierra_calc_num_ports(struct usb_serial *serial) +static int sierra_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { int num_ports = 0; u8 ifnum, numendpoints; diff --git a/drivers/usb/serial/spcp8x5.c b/drivers/usb/serial/spcp8x5.c index ddfd787c461cb5367c1f932b3edffb42c0812832..5167b6564c8b4574016aa48d068bb290e3ade06e 100644 --- a/drivers/usb/serial/spcp8x5.c +++ b/drivers/usb/serial/spcp8x5.c @@ -154,19 +154,6 @@ static int spcp8x5_probe(struct usb_serial *serial, return 0; } -static int spcp8x5_attach(struct usb_serial *serial) -{ - unsigned char num_ports = serial->num_ports; - - if (serial->num_bulk_in < num_ports || - serial->num_bulk_out < num_ports) { - dev_err(&serial->interface->dev, "missing endpoints\n"); - return -ENODEV; - } - - return 0; -} - static int spcp8x5_port_probe(struct usb_serial_port *port) { const struct usb_device_id *id = usb_get_serial_data(port->serial); @@ -488,6 +475,8 @@ static struct usb_serial_driver spcp8x5_device = { }, .id_table = id_table, .num_ports = 1, + .num_bulk_in = 1, + .num_bulk_out = 1, .open = spcp8x5_open, .dtr_rts = spcp8x5_dtr_rts, .carrier_raised = spcp8x5_carrier_raised, @@ -496,7 +485,6 @@ static struct usb_serial_driver spcp8x5_device = { .tiocmget = spcp8x5_tiocmget, .tiocmset = spcp8x5_tiocmset, .probe = spcp8x5_probe, - .attach = spcp8x5_attach, .port_probe = spcp8x5_port_probe, .port_remove = spcp8x5_port_remove, }; diff --git a/drivers/usb/serial/symbolserial.c b/drivers/usb/serial/symbolserial.c index 37f3ad15ed06a5f6c1cf9db54ca0beef801d523f..0d1727232d0c1b2168804a557a71a24c2dcce247 100644 --- a/drivers/usb/serial/symbolserial.c +++ b/drivers/usb/serial/symbolserial.c @@ -147,16 +147,6 @@ static void symbol_unthrottle(struct tty_struct *tty) } } -static int symbol_startup(struct usb_serial *serial) -{ - if (!serial->num_interrupt_in) { - dev_err(&serial->dev->dev, "no interrupt-in endpoint\n"); - return -ENODEV; - } - - return 0; -} - static int symbol_port_probe(struct usb_serial_port *port) { struct symbol_private *priv; @@ -188,7 +178,7 @@ static struct usb_serial_driver symbol_device = { }, .id_table = id_table, .num_ports = 1, - .attach = symbol_startup, + .num_interrupt_in = 1, .port_probe = symbol_port_probe, .port_remove = symbol_port_remove, .open = symbol_open, diff --git a/drivers/usb/serial/ti_usb_3410_5052.c b/drivers/usb/serial/ti_usb_3410_5052.c index 3107bf5d1c966bbe35f4b0c84af803fcf0c0308e..8fc3854e5e6967dc32f1a04e5ba95a304f10b683 100644 --- a/drivers/usb/serial/ti_usb_3410_5052.c +++ b/drivers/usb/serial/ti_usb_3410_5052.c @@ -427,6 +427,7 @@ static struct usb_serial_driver ti_1port_device = { .description = "TI USB 3410 1 port adapter", .id_table = ti_id_table_3410, .num_ports = 1, + .num_bulk_out = 1, .attach = ti_startup, .release = ti_release, .port_probe = ti_port_probe, @@ -459,6 +460,7 @@ static struct usb_serial_driver ti_2port_device = { .description = "TI USB 5052 2 port 
adapter", .id_table = ti_id_table_5052, .num_ports = 2, + .num_bulk_out = 1, .attach = ti_startup, .release = ti_release, .port_probe = ti_port_probe, @@ -927,7 +929,6 @@ static void ti_set_termios(struct tty_struct *tty, { struct ti_port *tport = usb_get_serial_port_data(port); struct ti_uart_config *config; - tcflag_t cflag, iflag; int baud; int status; int port_number = port->port_number; @@ -935,13 +936,6 @@ static void ti_set_termios(struct tty_struct *tty, u16 wbaudrate; u16 wflags = 0; - cflag = tty->termios.c_cflag; - iflag = tty->termios.c_iflag; - - dev_dbg(&port->dev, "%s - cflag %08x, iflag %08x\n", __func__, cflag, iflag); - dev_dbg(&port->dev, "%s - old clfag %08x, old iflag %08x\n", __func__, - old_termios->c_cflag, old_termios->c_iflag); - config = kmalloc(sizeof(*config), GFP_KERNEL); if (!config) return; diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index 4a037b4a79cf3168cb45d60d1b6488ac21681570..c7ca95f64edc2b178dec37f852dbb90333efe637 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -38,7 +38,6 @@ #include #include #include -#include "pl2303.h" #define DRIVER_AUTHOR "Greg Kroah-Hartman " #define DRIVER_DESC "USB Serial Driver core" @@ -710,6 +709,39 @@ static const struct tty_port_operations serial_port_ops = { .shutdown = serial_port_shutdown, }; +static void find_endpoints(struct usb_serial *serial, + struct usb_serial_endpoints *epds) +{ + struct device *dev = &serial->interface->dev; + struct usb_host_interface *iface_desc; + struct usb_endpoint_descriptor *epd; + unsigned int i; + + BUILD_BUG_ON(ARRAY_SIZE(epds->bulk_in) < USB_MAXENDPOINTS / 2); + BUILD_BUG_ON(ARRAY_SIZE(epds->bulk_out) < USB_MAXENDPOINTS / 2); + BUILD_BUG_ON(ARRAY_SIZE(epds->interrupt_in) < USB_MAXENDPOINTS / 2); + BUILD_BUG_ON(ARRAY_SIZE(epds->interrupt_out) < USB_MAXENDPOINTS / 2); + + iface_desc = serial->interface->cur_altsetting; + for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { + epd = &iface_desc->endpoint[i].desc; + + if (usb_endpoint_is_bulk_in(epd)) { + dev_dbg(dev, "found bulk in on endpoint %u\n", i); + epds->bulk_in[epds->num_bulk_in++] = epd; + } else if (usb_endpoint_is_bulk_out(epd)) { + dev_dbg(dev, "found bulk out on endpoint %u\n", i); + epds->bulk_out[epds->num_bulk_out++] = epd; + } else if (usb_endpoint_is_int_in(epd)) { + dev_dbg(dev, "found interrupt in on endpoint %u\n", i); + epds->interrupt_in[epds->num_interrupt_in++] = epd; + } else if (usb_endpoint_is_int_out(epd)) { + dev_dbg(dev, "found interrupt out on endpoint %u\n", i); + epds->interrupt_out[epds->num_interrupt_out++] = epd; + } + } +} + static int usb_serial_probe(struct usb_interface *interface, const struct usb_device_id *id) { @@ -717,23 +749,15 @@ static int usb_serial_probe(struct usb_interface *interface, struct usb_device *dev = interface_to_usbdev(interface); struct usb_serial *serial = NULL; struct usb_serial_port *port; - struct usb_host_interface *iface_desc; struct usb_endpoint_descriptor *endpoint; - struct usb_endpoint_descriptor *interrupt_in_endpoint[MAX_NUM_PORTS]; - struct usb_endpoint_descriptor *interrupt_out_endpoint[MAX_NUM_PORTS]; - struct usb_endpoint_descriptor *bulk_in_endpoint[MAX_NUM_PORTS]; - struct usb_endpoint_descriptor *bulk_out_endpoint[MAX_NUM_PORTS]; + struct usb_serial_endpoints *epds; struct usb_serial_driver *type = NULL; int retval; int buffer_size; int i; int j; - int num_interrupt_in = 0; - int num_interrupt_out = 0; - int num_bulk_in = 0; - int num_bulk_out = 0; int num_ports = 0; - int 
max_endpoints; + unsigned char max_endpoints; mutex_lock(&table_lock); type = search_serial_device(interface); @@ -752,8 +776,8 @@ static int usb_serial_probe(struct usb_interface *interface, serial = create_serial(dev, interface, type); if (!serial) { - module_put(type->driver.owner); - return -ENOMEM; + retval = -ENOMEM; + goto err_put_module; } /* if this device type has a probe function, call it */ @@ -765,129 +789,48 @@ static int usb_serial_probe(struct usb_interface *interface, if (retval) { dev_dbg(ddev, "sub driver rejected device\n"); - usb_serial_put(serial); - module_put(type->driver.owner); - return retval; + goto err_put_serial; } } /* descriptor matches, let's find the endpoints needed */ - /* check out the endpoints */ - iface_desc = interface->cur_altsetting; - for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { - endpoint = &iface_desc->endpoint[i].desc; - - if (usb_endpoint_is_bulk_in(endpoint)) { - /* we found a bulk in endpoint */ - dev_dbg(ddev, "found bulk in on endpoint %d\n", i); - if (num_bulk_in < MAX_NUM_PORTS) { - bulk_in_endpoint[num_bulk_in] = endpoint; - ++num_bulk_in; - } - } - - if (usb_endpoint_is_bulk_out(endpoint)) { - /* we found a bulk out endpoint */ - dev_dbg(ddev, "found bulk out on endpoint %d\n", i); - if (num_bulk_out < MAX_NUM_PORTS) { - bulk_out_endpoint[num_bulk_out] = endpoint; - ++num_bulk_out; - } - } - - if (usb_endpoint_is_int_in(endpoint)) { - /* we found a interrupt in endpoint */ - dev_dbg(ddev, "found interrupt in on endpoint %d\n", i); - if (num_interrupt_in < MAX_NUM_PORTS) { - interrupt_in_endpoint[num_interrupt_in] = - endpoint; - ++num_interrupt_in; - } - } - - if (usb_endpoint_is_int_out(endpoint)) { - /* we found an interrupt out endpoint */ - dev_dbg(ddev, "found interrupt out on endpoint %d\n", i); - if (num_interrupt_out < MAX_NUM_PORTS) { - interrupt_out_endpoint[num_interrupt_out] = - endpoint; - ++num_interrupt_out; - } - } + epds = kzalloc(sizeof(*epds), GFP_KERNEL); + if (!epds) { + retval = -ENOMEM; + goto err_put_serial; } -#if IS_ENABLED(CONFIG_USB_SERIAL_PL2303) - /* BEGIN HORRIBLE HACK FOR PL2303 */ - /* this is needed due to the looney way its endpoints are set up */ - if (((le16_to_cpu(dev->descriptor.idVendor) == PL2303_VENDOR_ID) && - (le16_to_cpu(dev->descriptor.idProduct) == PL2303_PRODUCT_ID)) || - ((le16_to_cpu(dev->descriptor.idVendor) == ATEN_VENDOR_ID) && - (le16_to_cpu(dev->descriptor.idProduct) == ATEN_PRODUCT_ID)) || - ((le16_to_cpu(dev->descriptor.idVendor) == ALCOR_VENDOR_ID) && - (le16_to_cpu(dev->descriptor.idProduct) == ALCOR_PRODUCT_ID)) || - ((le16_to_cpu(dev->descriptor.idVendor) == SIEMENS_VENDOR_ID) && - (le16_to_cpu(dev->descriptor.idProduct) == SIEMENS_PRODUCT_ID_EF81))) { - if (interface != dev->actconfig->interface[0]) { - /* check out the endpoints of the other interface*/ - iface_desc = dev->actconfig->interface[0]->cur_altsetting; - for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { - endpoint = &iface_desc->endpoint[i].desc; - if (usb_endpoint_is_int_in(endpoint)) { - /* we found a interrupt in endpoint */ - dev_dbg(ddev, "found interrupt in for Prolific device on separate interface\n"); - if (num_interrupt_in < MAX_NUM_PORTS) { - interrupt_in_endpoint[num_interrupt_in] = endpoint; - ++num_interrupt_in; - } - } - } - } + find_endpoints(serial, epds); - /* Now make sure the PL-2303 is configured correctly. 
- * If not, give up now and hope this hack will work - * properly during a later invocation of usb_serial_probe - */ - if (num_bulk_in == 0 || num_bulk_out == 0) { - dev_info(ddev, "PL-2303 hack: descriptors matched but endpoints did not\n"); - usb_serial_put(serial); - module_put(type->driver.owner); - return -ENODEV; - } - } - /* END HORRIBLE HACK FOR PL2303 */ -#endif - -#ifdef CONFIG_USB_SERIAL_GENERIC - if (type == &usb_serial_generic_device) { - num_ports = num_bulk_out; - if (num_ports == 0) { - dev_err(ddev, "Generic device with no bulk out, not allowed.\n"); - usb_serial_put(serial); - module_put(type->driver.owner); - return -EIO; - } - dev_info(ddev, "The \"generic\" usb-serial driver is only for testing and one-off prototypes.\n"); - dev_info(ddev, "Tell linux-usb@vger.kernel.org to add your device to a proper driver.\n"); + if (epds->num_bulk_in < type->num_bulk_in || + epds->num_bulk_out < type->num_bulk_out || + epds->num_interrupt_in < type->num_interrupt_in || + epds->num_interrupt_out < type->num_interrupt_out) { + dev_err(ddev, "required endpoints missing\n"); + retval = -ENODEV; + goto err_free_epds; } -#endif - if (!num_ports) { - /* if this device type has a calc_num_ports function, call it */ - if (type->calc_num_ports) - num_ports = type->calc_num_ports(serial); - if (!num_ports) - num_ports = type->num_ports; + + if (type->calc_num_ports) { + retval = type->calc_num_ports(serial, epds); + if (retval < 0) + goto err_free_epds; + num_ports = retval; } + if (!num_ports) + num_ports = type->num_ports; + if (num_ports > MAX_NUM_PORTS) { dev_warn(ddev, "too many ports requested: %d\n", num_ports); num_ports = MAX_NUM_PORTS; } - serial->num_ports = num_ports; - serial->num_bulk_in = num_bulk_in; - serial->num_bulk_out = num_bulk_out; - serial->num_interrupt_in = num_interrupt_in; - serial->num_interrupt_out = num_interrupt_out; + serial->num_ports = (unsigned char)num_ports; + serial->num_bulk_in = epds->num_bulk_in; + serial->num_bulk_out = epds->num_bulk_out; + serial->num_interrupt_in = epds->num_interrupt_in; + serial->num_interrupt_out = epds->num_interrupt_out; /* found all that we need */ dev_info(ddev, "%s converter detected\n", type->description); @@ -895,10 +838,10 @@ static int usb_serial_probe(struct usb_interface *interface, /* create our ports, we need as many as the max endpoints */ /* we don't use num_ports here because some devices have more endpoint pairs than ports */ - max_endpoints = max(num_bulk_in, num_bulk_out); - max_endpoints = max(max_endpoints, num_interrupt_in); - max_endpoints = max(max_endpoints, num_interrupt_out); - max_endpoints = max(max_endpoints, (int)serial->num_ports); + max_endpoints = max(epds->num_bulk_in, epds->num_bulk_out); + max_endpoints = max(max_endpoints, epds->num_interrupt_in); + max_endpoints = max(max_endpoints, epds->num_interrupt_out); + max_endpoints = max(max_endpoints, serial->num_ports); serial->num_port_pointers = max_endpoints; dev_dbg(ddev, "setting up %d port structure(s)\n", max_endpoints); @@ -923,8 +866,8 @@ static int usb_serial_probe(struct usb_interface *interface, } /* set up the endpoint information */ - for (i = 0; i < num_bulk_in; ++i) { - endpoint = bulk_in_endpoint[i]; + for (i = 0; i < epds->num_bulk_in; ++i) { + endpoint = epds->bulk_in[i]; port = serial->port[i]; buffer_size = max_t(int, serial->type->bulk_in_size, usb_endpoint_maxp(endpoint)); @@ -952,8 +895,8 @@ static int usb_serial_probe(struct usb_interface *interface, port->bulk_in_buffer = port->bulk_in_buffers[0]; } - for (i = 0; i < 
num_bulk_out; ++i) { - endpoint = bulk_out_endpoint[i]; + for (i = 0; i < epds->num_bulk_out; ++i) { + endpoint = epds->bulk_out[i]; port = serial->port[i]; if (kfifo_alloc(&port->write_fifo, PAGE_SIZE, GFP_KERNEL)) goto probe_error; @@ -985,8 +928,8 @@ static int usb_serial_probe(struct usb_interface *interface, } if (serial->type->read_int_callback) { - for (i = 0; i < num_interrupt_in; ++i) { - endpoint = interrupt_in_endpoint[i]; + for (i = 0; i < epds->num_interrupt_in; ++i) { + endpoint = epds->interrupt_in[i]; port = serial->port[i]; port->interrupt_in_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port->interrupt_in_urb) @@ -1005,13 +948,13 @@ static int usb_serial_probe(struct usb_interface *interface, serial->type->read_int_callback, port, endpoint->bInterval); } - } else if (num_interrupt_in) { + } else if (epds->num_interrupt_in) { dev_dbg(ddev, "The device claims to support interrupt in transfers, but read_int_callback is not defined\n"); } if (serial->type->write_int_callback) { - for (i = 0; i < num_interrupt_out; ++i) { - endpoint = interrupt_out_endpoint[i]; + for (i = 0; i < epds->num_interrupt_out; ++i) { + endpoint = epds->interrupt_out[i]; port = serial->port[i]; port->interrupt_out_urb = usb_alloc_urb(0, GFP_KERNEL); if (!port->interrupt_out_urb) @@ -1031,7 +974,7 @@ static int usb_serial_probe(struct usb_interface *interface, serial->type->write_int_callback, port, endpoint->bInterval); } - } else if (num_interrupt_out) { + } else if (epds->num_interrupt_out) { dev_dbg(ddev, "The device claims to support interrupt out transfers, but write_int_callback is not defined\n"); } @@ -1053,12 +996,6 @@ static int usb_serial_probe(struct usb_interface *interface, serial->attached = 1; } - /* Avoid race with tty_open and serial_install by setting the - * disconnected flag and not clearing it until all ports have been - * registered. 
- */ - serial->disconnected = 1; - if (allocate_minors(serial, num_ports)) { dev_err(ddev, "No more free serial minor numbers\n"); goto probe_error; @@ -1076,18 +1013,23 @@ static int usb_serial_probe(struct usb_interface *interface, dev_err(ddev, "Error registering port device, continuing\n"); } - serial->disconnected = 0; - if (num_ports > 0) usb_serial_console_init(serial->port[0]->minor); exit: + kfree(epds); module_put(type->driver.owner); return 0; probe_error: + retval = -EIO; +err_free_epds: + kfree(epds); +err_put_serial: usb_serial_put(serial); +err_put_module: module_put(type->driver.owner); - return -EIO; + + return retval; } static void usb_serial_disconnect(struct usb_interface *interface) diff --git a/drivers/usb/serial/usb_debug.c b/drivers/usb/serial/usb_debug.c index ca2fa5bbe17e1a9406079a89571e8dd31a083c9a..12f4c5a91e628eaf043a5d103d86a0838164da96 100644 --- a/drivers/usb/serial/usb_debug.c +++ b/drivers/usb/serial/usb_debug.c @@ -17,7 +17,7 @@ #define USB_DEBUG_MAX_PACKET_SIZE 8 #define USB_DEBUG_BRK_SIZE 8 -static char USB_DEBUG_BRK[USB_DEBUG_BRK_SIZE] = { +static const char USB_DEBUG_BRK[USB_DEBUG_BRK_SIZE] = { 0x00, 0xff, 0x01, @@ -32,7 +32,18 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x0525, 0x127a) }, { }, }; -MODULE_DEVICE_TABLE(usb, id_table); + +static const struct usb_device_id dbc_id_table[] = { + { USB_DEVICE(0x1d6b, 0x0004) }, + { }, +}; + +static const struct usb_device_id id_table_combined[] = { + { USB_DEVICE(0x0525, 0x127a) }, + { USB_DEVICE(0x1d6b, 0x0004) }, + { }, +}; +MODULE_DEVICE_TABLE(usb, id_table_combined); /* This HW really does not support a serial break, so one will be * emulated when ever the break state is set to true. @@ -71,9 +82,20 @@ static struct usb_serial_driver debug_device = { .process_read_urb = usb_debug_process_read_urb, }; +static struct usb_serial_driver dbc_device = { + .driver = { + .owner = THIS_MODULE, + .name = "xhci_dbc", + }, + .id_table = dbc_id_table, + .num_ports = 1, + .break_ctl = usb_debug_break_ctl, + .process_read_urb = usb_debug_process_read_urb, +}; + static struct usb_serial_driver * const serial_drivers[] = { - &debug_device, NULL + &debug_device, &dbc_device, NULL }; -module_usb_serial_driver(serial_drivers, id_table); +module_usb_serial_driver(serial_drivers, id_table_combined); MODULE_LICENSE("GPL"); diff --git a/drivers/usb/serial/visor.c b/drivers/usb/serial/visor.c index 337a0be89fcf0767f67731a9b0a50700e1df9318..9f3317a940ef6a8d88767eaa919bb4b61f6f13b6 100644 --- a/drivers/usb/serial/visor.c +++ b/drivers/usb/serial/visor.c @@ -40,11 +40,12 @@ static int visor_open(struct tty_struct *tty, struct usb_serial_port *port); static void visor_close(struct usb_serial_port *port); static int visor_probe(struct usb_serial *serial, const struct usb_device_id *id); -static int visor_calc_num_ports(struct usb_serial *serial); +static int visor_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds); +static int clie_5_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds); static void visor_read_int_callback(struct urb *urb); static int clie_3_5_startup(struct usb_serial *serial); -static int treo_attach(struct usb_serial *serial); -static int clie_5_attach(struct usb_serial *serial); static int palm_os_3_probe(struct usb_serial *serial, const struct usb_device_id *id); static int palm_os_4_probe(struct usb_serial *serial, @@ -174,7 +175,6 @@ static struct usb_serial_driver handspring_device = { .close = visor_close, .throttle = 
usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, - .attach = treo_attach, .probe = visor_probe, .calc_num_ports = visor_calc_num_ports, .read_int_callback = visor_read_int_callback, @@ -189,14 +189,14 @@ static struct usb_serial_driver clie_5_device = { .description = "Sony Clie 5.0", .id_table = clie_id_5_table, .num_ports = 2, + .num_bulk_out = 2, .bulk_out_size = 256, .open = visor_open, .close = visor_close, .throttle = usb_serial_generic_throttle, .unthrottle = usb_serial_generic_unthrottle, - .attach = clie_5_attach, .probe = visor_probe, - .calc_num_ports = visor_calc_num_ports, + .calc_num_ports = clie_5_calc_num_ports, .read_int_callback = visor_read_int_callback, }; @@ -466,16 +466,60 @@ static int visor_probe(struct usb_serial *serial, return retval; } -static int visor_calc_num_ports(struct usb_serial *serial) +static int visor_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) { + unsigned int vid = le16_to_cpu(serial->dev->descriptor.idVendor); int num_ports = (int)(long)(usb_get_serial_data(serial)); if (num_ports) usb_set_serial_data(serial, NULL); + /* + * Only swap the bulk endpoints for the Handspring devices with + * interrupt in endpoints, which for now are the Treo devices. + */ + if (!(vid == HANDSPRING_VENDOR_ID || vid == KYOCERA_VENDOR_ID) || + epds->num_interrupt_in == 0) + goto out; + + if (epds->num_bulk_in < 2 || epds->num_interrupt_in < 2) { + dev_err(&serial->interface->dev, "missing endpoints\n"); + return -ENODEV; + } + + /* + * It appears that Treos and Kyoceras want to use the + * 1st bulk in endpoint to communicate with the 2nd bulk out endpoint, + * so let's swap the 1st and 2nd bulk in and interrupt endpoints. + * Note that swapping the bulk out endpoints would break lots of + * apps that want to communicate on the second port. + */ + swap(epds->bulk_in[0], epds->bulk_in[1]); + swap(epds->interrupt_in[0], epds->interrupt_in[1]); +out: return num_ports; } +static int clie_5_calc_num_ports(struct usb_serial *serial, + struct usb_serial_endpoints *epds) +{ + /* + * TH55 registers 2 ports. + * Communication in from the UX50/TH55 uses the first bulk-in + * endpoint, while communication out to the UX50/TH55 uses the second + * bulk-out endpoint. + */ + + /* + * FIXME: Should we swap the descriptors instead of using the same + * bulk-out endpoint for both ports? + */ + epds->bulk_out[0] = epds->bulk_out[1]; + + return serial->type->num_ports; +} + static int clie_3_5_startup(struct usb_serial *serial) { struct device *dev = &serial->dev->dev; @@ -531,94 +575,6 @@ static int clie_3_5_startup(struct usb_serial *serial) return result; } -static int treo_attach(struct usb_serial *serial) -{ - struct usb_serial_port *swap_port; - - /* Only do this endpoint hack for the Handspring devices with - * interrupt in endpoints, which for now are the Treo devices. */ - if (!((le16_to_cpu(serial->dev->descriptor.idVendor) - == HANDSPRING_VENDOR_ID) || - (le16_to_cpu(serial->dev->descriptor.idVendor) - == KYOCERA_VENDOR_ID)) || - (serial->num_interrupt_in == 0)) - return 0; - - if (serial->num_bulk_in < 2 || serial->num_interrupt_in < 2) { - dev_err(&serial->interface->dev, "missing endpoints\n"); - return -ENODEV; - } - - /* - * It appears that Treos and Kyoceras want to use the - * 1st bulk in endpoint to communicate with the 2nd bulk out endpoint, - * so let's swap the 1st and 2nd bulk in and interrupt endpoints. 
- * Note that swapping the bulk out endpoints would break lots of - * apps that want to communicate on the second port. - */ -#define COPY_PORT(dest, src) \ - do { \ - int i; \ - \ - for (i = 0; i < ARRAY_SIZE(src->read_urbs); ++i) { \ - dest->read_urbs[i] = src->read_urbs[i]; \ - dest->read_urbs[i]->context = dest; \ - dest->bulk_in_buffers[i] = src->bulk_in_buffers[i]; \ - } \ - dest->read_urb = src->read_urb; \ - dest->bulk_in_endpointAddress = src->bulk_in_endpointAddress;\ - dest->bulk_in_buffer = src->bulk_in_buffer; \ - dest->bulk_in_size = src->bulk_in_size; \ - dest->interrupt_in_urb = src->interrupt_in_urb; \ - dest->interrupt_in_urb->context = dest; \ - dest->interrupt_in_endpointAddress = \ - src->interrupt_in_endpointAddress;\ - dest->interrupt_in_buffer = src->interrupt_in_buffer; \ - } while (0); - - swap_port = kmalloc(sizeof(*swap_port), GFP_KERNEL); - if (!swap_port) - return -ENOMEM; - COPY_PORT(swap_port, serial->port[0]); - COPY_PORT(serial->port[0], serial->port[1]); - COPY_PORT(serial->port[1], swap_port); - kfree(swap_port); - - return 0; -} - -static int clie_5_attach(struct usb_serial *serial) -{ - struct usb_serial_port *port; - unsigned int pipe; - int j; - - /* TH55 registers 2 ports. - Communication in from the UX50/TH55 uses bulk_in_endpointAddress - from port 0. Communication out to the UX50/TH55 uses - bulk_out_endpointAddress from port 1 - - Lets do a quick and dirty mapping - */ - - /* some sanity check */ - if (serial->num_bulk_out < 2) { - dev_err(&serial->interface->dev, "missing bulk out endpoints\n"); - return -ENODEV; - } - - /* port 0 now uses the modified endpoint Address */ - port = serial->port[0]; - port->bulk_out_endpointAddress = - serial->port[1]->bulk_out_endpointAddress; - - pipe = usb_sndbulkpipe(serial->dev, port->bulk_out_endpointAddress); - for (j = 0; j < ARRAY_SIZE(port->write_urbs); ++j) - port->write_urbs[j]->pipe = pipe; - - return 0; -} - module_usb_serial_driver(serial_drivers, id_table_combined); MODULE_AUTHOR(DRIVER_AUTHOR); diff --git a/drivers/usb/serial/whiteheat.c b/drivers/usb/serial/whiteheat.c index 5ab65eb1dacc8b5aa2fdf5152df2a381180683c2..55cebc1e6fec1d6a2664e2d796c49e2af728af75 100644 --- a/drivers/usb/serial/whiteheat.c +++ b/drivers/usb/serial/whiteheat.c @@ -80,8 +80,6 @@ static int whiteheat_firmware_download(struct usb_serial *serial, static int whiteheat_firmware_attach(struct usb_serial *serial); /* function prototypes for the Connect Tech WhiteHEAT serial converter */ -static int whiteheat_probe(struct usb_serial *serial, - const struct usb_device_id *id); static int whiteheat_attach(struct usb_serial *serial); static void whiteheat_release(struct usb_serial *serial); static int whiteheat_port_probe(struct usb_serial_port *port); @@ -118,7 +116,8 @@ static struct usb_serial_driver whiteheat_device = { .description = "Connect Tech - WhiteHEAT", .id_table = id_table_std, .num_ports = 4, - .probe = whiteheat_probe, + .num_bulk_in = 5, + .num_bulk_out = 5, .attach = whiteheat_attach, .release = whiteheat_release, .port_probe = whiteheat_port_probe, @@ -221,33 +220,6 @@ static int whiteheat_firmware_attach(struct usb_serial *serial) * Connect Tech's White Heat serial driver functions *****************************************************************************/ -static int whiteheat_probe(struct usb_serial *serial, - const struct usb_device_id *id) -{ - struct usb_host_interface *iface_desc; - struct usb_endpoint_descriptor *endpoint; - size_t num_bulk_in = 0; - size_t num_bulk_out = 0; - size_t min_num_bulk; - 
unsigned int i; - - iface_desc = serial->interface->cur_altsetting; - - for (i = 0; i < iface_desc->desc.bNumEndpoints; i++) { - endpoint = &iface_desc->endpoint[i].desc; - if (usb_endpoint_is_bulk_in(endpoint)) - ++num_bulk_in; - if (usb_endpoint_is_bulk_out(endpoint)) - ++num_bulk_out; - } - - min_num_bulk = COMMAND_PORT + 1; - if (num_bulk_in < min_num_bulk || num_bulk_out < min_num_bulk) - return -ENODEV; - - return 0; -} - static int whiteheat_attach(struct usb_serial *serial) { struct usb_serial_port *command_port; diff --git a/drivers/usb/storage/karma.c b/drivers/usb/storage/karma.c index f9d407f0b50868d2c576f2e57ddae64de0a26aad..b05ba4929f00e7f9487306e7ecebce87b86b295e 100644 --- a/drivers/usb/storage/karma.c +++ b/drivers/usb/storage/karma.c @@ -105,7 +105,7 @@ static struct us_unusual_dev karma_unusual_dev_list[] = { */ static int rio_karma_send_command(char cmd, struct us_data *us) { - int result, partial; + int result; unsigned long timeout; static unsigned char seq = 1; struct karma_data *data = (struct karma_data *) us->extra; @@ -119,12 +119,12 @@ static int rio_karma_send_command(char cmd, struct us_data *us) timeout = jiffies + msecs_to_jiffies(6000); for (;;) { result = usb_stor_bulk_transfer_buf(us, us->send_bulk_pipe, - us->iobuf, RIO_SEND_LEN, &partial); + us->iobuf, RIO_SEND_LEN, NULL); if (result != USB_STOR_XFER_GOOD) goto err; result = usb_stor_bulk_transfer_buf(us, us->recv_bulk_pipe, - data->recv, RIO_RECV_LEN, &partial); + data->recv, RIO_RECV_LEN, NULL); if (result != USB_STOR_XFER_GOOD) goto err; diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 9129f6cb823074a555a90f74611dac3f0164a60d..5a70c33ef0e0645d1c97a59021a6b400776dd473 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -42,7 +42,7 @@ * - a patch that adds the entry for your device, including your * email address right above the entry (plus maybe a brief * explanation of the reason for the entry), - * - a copy of /proc/bus/usb/devices with your device plugged in + * - a copy of /sys/kernel/debug/usb/devices with your device plugged in * running with this patch. * Send your submission to either Phil Dibowitz or * Alan Stern , and don't forget to CC: the @@ -176,7 +176,7 @@ UNUSUAL_DEV( 0x0420, 0x0001, 0x0100, 0x0100, /* * Reported by Andrew Nayenko - * Updated for new firmware by Phillip Potter + * Updated for new firmware by Phillip Potter */ UNUSUAL_DEV( 0x0421, 0x0019, 0x0592, 0x0610, "Nokia", diff --git a/drivers/usb/storage/usb.c b/drivers/usb/storage/usb.c index 615bea08ec0aeb1788155e067ac4f999a737f9a0..06615934fed1cc537694b03d023b613c95aea9f1 100644 --- a/drivers/usb/storage/usb.c +++ b/drivers/usb/storage/usb.c @@ -737,13 +737,11 @@ static void get_protocol(struct us_data *us) /* Get the pipe settings */ static int get_pipes(struct us_data *us) { - struct usb_host_interface *altsetting = - us->pusb_intf->cur_altsetting; - int i; - struct usb_endpoint_descriptor *ep; - struct usb_endpoint_descriptor *ep_in = NULL; - struct usb_endpoint_descriptor *ep_out = NULL; - struct usb_endpoint_descriptor *ep_int = NULL; + struct usb_host_interface *alt = us->pusb_intf->cur_altsetting; + struct usb_endpoint_descriptor *ep_in; + struct usb_endpoint_descriptor *ep_out; + struct usb_endpoint_descriptor *ep_int; + int res; /* * Find the first endpoint of each type we need. @@ -751,28 +749,16 @@ static int get_pipes(struct us_data *us) * An optional interrupt-in is OK (necessary for CBI protocol). * We will ignore any others. 
 */
-	for (i = 0; i < altsetting->desc.bNumEndpoints; i++) {
-		ep = &altsetting->endpoint[i].desc;
-
-		if (usb_endpoint_xfer_bulk(ep)) {
-			if (usb_endpoint_dir_in(ep)) {
-				if (!ep_in)
-					ep_in = ep;
-			} else {
-				if (!ep_out)
-					ep_out = ep;
-			}
-		}
-
-		else if (usb_endpoint_is_int_in(ep)) {
-			if (!ep_int)
-				ep_int = ep;
-		}
+	res = usb_find_common_endpoints(alt, &ep_in, &ep_out, NULL, NULL);
+	if (res) {
+		usb_stor_dbg(us, "bulk endpoints not found\n");
+		return res;
 	}
 
-	if (!ep_in || !ep_out || (us->protocol == USB_PR_CBI && !ep_int)) {
-		usb_stor_dbg(us, "Endpoint sanity check failed! Rejecting dev.\n");
-		return -EIO;
+	res = usb_find_int_in_endpoint(alt, &ep_int);
+	if (res && us->protocol == USB_PR_CBI) {
+		usb_stor_dbg(us, "interrupt endpoint not found\n");
+		return res;
 	}
 
 	/* Calculate and store the pipe values */
diff --git a/drivers/usb/typec/Kconfig b/drivers/usb/typec/Kconfig
new file mode 100644
index 0000000000000000000000000000000000000000..dfcfe459b7cf5605d4a00abc78e402c6e2468445
--- /dev/null
+++ b/drivers/usb/typec/Kconfig
@@ -0,0 +1,22 @@
+
+menu "USB Power Delivery and Type-C drivers"
+
+config TYPEC
+	tristate
+
+config TYPEC_WCOVE
+	tristate "Intel WhiskeyCove PMIC USB Type-C PHY driver"
+	depends on ACPI
+	depends on INTEL_SOC_PMIC
+	depends on INTEL_PMC_IPC
+	depends on BXT_WC_PMIC_OPREGION
+	select TYPEC
+	help
+	  This driver adds support for USB Type-C detection on Intel Broxton
+	  platforms that have the Intel Whiskey Cove PMIC. The driver can detect
+	  the role and cable orientation.
+
+	  To compile this driver as a module, choose M here: the module will be
+	  called typec_wcove.
+
+endmenu
diff --git a/drivers/usb/typec/Makefile b/drivers/usb/typec/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..b9cb862221af39f36b28701576cec2011fccb845
--- /dev/null
+++ b/drivers/usb/typec/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_TYPEC)		+= typec.o
+obj-$(CONFIG_TYPEC_WCOVE)	+= typec_wcove.o
diff --git a/drivers/usb/typec/typec.c b/drivers/usb/typec/typec.c
new file mode 100644
index 0000000000000000000000000000000000000000..89e540bb7ff3a688621223eb7a9fd4252f21404b
--- /dev/null
+++ b/drivers/usb/typec/typec.c
@@ -0,0 +1,1262 @@
+/*
+ * USB Type-C Connector Class
+ *
+ * Copyright (C) 2017, Intel Corporation
+ * Author: Heikki Krogerus
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */ + +#include +#include +#include +#include + +struct typec_mode { + int index; + u32 vdo; + char *desc; + enum typec_port_type roles; + + struct typec_altmode *alt_mode; + + unsigned int active:1; + + char group_name[6]; + struct attribute_group group; + struct attribute *attrs[5]; + struct device_attribute vdo_attr; + struct device_attribute desc_attr; + struct device_attribute active_attr; + struct device_attribute roles_attr; +}; + +struct typec_altmode { + struct device dev; + u16 svid; + int n_modes; + struct typec_mode modes[ALTMODE_MAX_MODES]; + const struct attribute_group *mode_groups[ALTMODE_MAX_MODES]; +}; + +struct typec_plug { + struct device dev; + enum typec_plug_index index; +}; + +struct typec_cable { + struct device dev; + enum typec_plug_type type; + struct usb_pd_identity *identity; + unsigned int active:1; +}; + +struct typec_partner { + struct device dev; + unsigned int usb_pd:1; + struct usb_pd_identity *identity; + enum typec_accessory accessory; +}; + +struct typec_port { + unsigned int id; + struct device dev; + + int prefer_role; + enum typec_data_role data_role; + enum typec_role pwr_role; + enum typec_role vconn_role; + enum typec_pwr_opmode pwr_opmode; + + const struct typec_capability *cap; +}; + +#define to_typec_port(_dev_) container_of(_dev_, struct typec_port, dev) +#define to_typec_plug(_dev_) container_of(_dev_, struct typec_plug, dev) +#define to_typec_cable(_dev_) container_of(_dev_, struct typec_cable, dev) +#define to_typec_partner(_dev_) container_of(_dev_, struct typec_partner, dev) +#define to_altmode(_dev_) container_of(_dev_, struct typec_altmode, dev) + +static const struct device_type typec_partner_dev_type; +static const struct device_type typec_cable_dev_type; +static const struct device_type typec_plug_dev_type; +static const struct device_type typec_port_dev_type; + +#define is_typec_partner(_dev_) (_dev_->type == &typec_partner_dev_type) +#define is_typec_cable(_dev_) (_dev_->type == &typec_cable_dev_type) +#define is_typec_plug(_dev_) (_dev_->type == &typec_plug_dev_type) +#define is_typec_port(_dev_) (_dev_->type == &typec_port_dev_type) + +static DEFINE_IDA(typec_index_ida); +static struct class *typec_class; + +/* Common attributes */ + +static const char * const typec_accessory_modes[] = { + [TYPEC_ACCESSORY_NONE] = "none", + [TYPEC_ACCESSORY_AUDIO] = "analog_audio", + [TYPEC_ACCESSORY_DEBUG] = "debug", +}; + +static struct usb_pd_identity *get_pd_identity(struct device *dev) +{ + if (is_typec_partner(dev)) { + struct typec_partner *partner = to_typec_partner(dev); + + return partner->identity; + } else if (is_typec_cable(dev)) { + struct typec_cable *cable = to_typec_cable(dev); + + return cable->identity; + } + return NULL; +} + +static ssize_t id_header_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct usb_pd_identity *id = get_pd_identity(dev); + + return sprintf(buf, "0x%08x\n", id->id_header); +} +static DEVICE_ATTR_RO(id_header); + +static ssize_t cert_stat_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct usb_pd_identity *id = get_pd_identity(dev); + + return sprintf(buf, "0x%08x\n", id->cert_stat); +} +static DEVICE_ATTR_RO(cert_stat); + +static ssize_t product_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct usb_pd_identity *id = get_pd_identity(dev); + + return sprintf(buf, "0x%08x\n", id->product); +} +static DEVICE_ATTR_RO(product); + +static struct attribute *usb_pd_id_attrs[] = { + &dev_attr_id_header.attr, + 
&dev_attr_cert_stat.attr,
+	&dev_attr_product.attr,
+	NULL
+};
+
+static const struct attribute_group usb_pd_id_group = {
+	.name = "identity",
+	.attrs = usb_pd_id_attrs,
+};
+
+static const struct attribute_group *usb_pd_id_groups[] = {
+	&usb_pd_id_group,
+	NULL,
+};
+
+static void typec_report_identity(struct device *dev)
+{
+	sysfs_notify(&dev->kobj, "identity", "id_header");
+	sysfs_notify(&dev->kobj, "identity", "cert_stat");
+	sysfs_notify(&dev->kobj, "identity", "product");
+}
+
+/* ------------------------------------------------------------------------- */
+/* Alternate Modes */
+
+/**
+ * typec_altmode_update_active - Report Enter/Exit mode
+ * @alt: Handle to the alternate mode
+ * @mode: Mode index
+ * @active: True when the mode has been entered
+ *
+ * If a partner or cable plug executes an Enter/Exit Mode command successfully,
+ * the drivers use this routine to report the updated state of the mode.
+ */
+void typec_altmode_update_active(struct typec_altmode *alt, int mode,
+				 bool active)
+{
+	struct typec_mode *m = &alt->modes[mode];
+	char dir[6];
+
+	if (m->active == active)
+		return;
+
+	m->active = active;
+	snprintf(dir, sizeof(dir), "mode%d", mode);
+	sysfs_notify(&alt->dev.kobj, dir, "active");
+	kobject_uevent(&alt->dev.kobj, KOBJ_CHANGE);
+}
+EXPORT_SYMBOL_GPL(typec_altmode_update_active);
+
+/**
+ * typec_altmode2port - Alternate Mode to USB Type-C port
+ * @alt: The Alternate Mode
+ *
+ * Returns handle to the port that a cable plug or partner with @alt is
+ * connected to.
+ */
+struct typec_port *typec_altmode2port(struct typec_altmode *alt)
+{
+	if (is_typec_plug(alt->dev.parent))
+		return to_typec_port(alt->dev.parent->parent->parent);
+	if (is_typec_partner(alt->dev.parent))
+		return to_typec_port(alt->dev.parent->parent);
+	if (is_typec_port(alt->dev.parent))
+		return to_typec_port(alt->dev.parent);
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(typec_altmode2port);
+
+static ssize_t
+typec_altmode_vdo_show(struct device *dev, struct device_attribute *attr,
+		       char *buf)
+{
+	struct typec_mode *mode = container_of(attr, struct typec_mode,
+					       vdo_attr);
+
+	return sprintf(buf, "0x%08x\n", mode->vdo);
+}
+
+static ssize_t
+typec_altmode_desc_show(struct device *dev, struct device_attribute *attr,
+			char *buf)
+{
+	struct typec_mode *mode = container_of(attr, struct typec_mode,
+					       desc_attr);
+
+	return sprintf(buf, "%s\n", mode->desc ? mode->desc : "");
+}
+
+static ssize_t
+typec_altmode_active_show(struct device *dev, struct device_attribute *attr,
+			  char *buf)
+{
+	struct typec_mode *mode = container_of(attr, struct typec_mode,
+					       active_attr);
+
+	return sprintf(buf, "%s\n", mode->active ?
"yes" : "no"); +} + +static ssize_t +typec_altmode_active_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t size) +{ + struct typec_mode *mode = container_of(attr, struct typec_mode, + active_attr); + struct typec_port *port = typec_altmode2port(mode->alt_mode); + bool activate; + int ret; + + if (!port->cap->activate_mode) + return -EOPNOTSUPP; + + ret = kstrtobool(buf, &activate); + if (ret) + return ret; + + ret = port->cap->activate_mode(port->cap, mode->index, activate); + if (ret) + return ret; + + return size; +} + +static ssize_t +typec_altmode_roles_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct typec_mode *mode = container_of(attr, struct typec_mode, + roles_attr); + ssize_t ret; + + switch (mode->roles) { + case TYPEC_PORT_DFP: + ret = sprintf(buf, "source\n"); + break; + case TYPEC_PORT_UFP: + ret = sprintf(buf, "sink\n"); + break; + case TYPEC_PORT_DRP: + default: + ret = sprintf(buf, "source sink\n"); + break; + } + return ret; +} + +static void typec_init_modes(struct typec_altmode *alt, + struct typec_mode_desc *desc, bool is_port) +{ + int i; + + for (i = 0; i < alt->n_modes; i++, desc++) { + struct typec_mode *mode = &alt->modes[i]; + + /* Not considering the human readable description critical */ + mode->desc = kstrdup(desc->desc, GFP_KERNEL); + if (desc->desc && !mode->desc) + dev_err(&alt->dev, "failed to copy mode%d desc\n", i); + + mode->alt_mode = alt; + mode->vdo = desc->vdo; + mode->roles = desc->roles; + mode->index = desc->index; + sprintf(mode->group_name, "mode%d", desc->index); + + sysfs_attr_init(&mode->vdo_attr.attr); + mode->vdo_attr.attr.name = "vdo"; + mode->vdo_attr.attr.mode = 0444; + mode->vdo_attr.show = typec_altmode_vdo_show; + + sysfs_attr_init(&mode->desc_attr.attr); + mode->desc_attr.attr.name = "description"; + mode->desc_attr.attr.mode = 0444; + mode->desc_attr.show = typec_altmode_desc_show; + + sysfs_attr_init(&mode->active_attr.attr); + mode->active_attr.attr.name = "active"; + mode->active_attr.attr.mode = 0644; + mode->active_attr.show = typec_altmode_active_show; + mode->active_attr.store = typec_altmode_active_store; + + mode->attrs[0] = &mode->vdo_attr.attr; + mode->attrs[1] = &mode->desc_attr.attr; + mode->attrs[2] = &mode->active_attr.attr; + + /* With ports, list the roles that the mode is supported with */ + if (is_port) { + sysfs_attr_init(&mode->roles_attr.attr); + mode->roles_attr.attr.name = "supported_roles"; + mode->roles_attr.attr.mode = 0444; + mode->roles_attr.show = typec_altmode_roles_show; + + mode->attrs[3] = &mode->roles_attr.attr; + } + + mode->group.attrs = mode->attrs; + mode->group.name = mode->group_name; + + alt->mode_groups[i] = &mode->group; + } +} + +static ssize_t svid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct typec_altmode *alt = to_altmode(dev); + + return sprintf(buf, "%04x\n", alt->svid); +} +static DEVICE_ATTR_RO(svid); + +static struct attribute *typec_altmode_attrs[] = { + &dev_attr_svid.attr, + NULL +}; +ATTRIBUTE_GROUPS(typec_altmode); + +static void typec_altmode_release(struct device *dev) +{ + struct typec_altmode *alt = to_altmode(dev); + int i; + + for (i = 0; i < alt->n_modes; i++) + kfree(alt->modes[i].desc); + kfree(alt); +} + +static const struct device_type typec_altmode_dev_type = { + .name = "typec_alternate_mode", + .groups = typec_altmode_groups, + .release = typec_altmode_release, +}; + +static struct typec_altmode * +typec_register_altmode(struct device *parent, struct 
typec_altmode_desc *desc)
+{
+	struct typec_altmode *alt;
+	int ret;
+
+	alt = kzalloc(sizeof(*alt), GFP_KERNEL);
+	if (!alt)
+		return NULL;
+
+	alt->svid = desc->svid;
+	alt->n_modes = desc->n_modes;
+	typec_init_modes(alt, desc->modes, is_typec_port(parent));
+
+	alt->dev.parent = parent;
+	alt->dev.groups = alt->mode_groups;
+	alt->dev.type = &typec_altmode_dev_type;
+	dev_set_name(&alt->dev, "svid-%04x", alt->svid);
+
+	ret = device_register(&alt->dev);
+	if (ret) {
+		dev_err(parent, "failed to register alternate mode (%d)\n",
+			ret);
+		put_device(&alt->dev);
+		return NULL;
+	}
+
+	return alt;
+}
+
+/**
+ * typec_unregister_altmode - Unregister Alternate Mode
+ * @alt: The alternate mode to be unregistered
+ *
+ * Unregister device created with typec_partner_register_altmode(),
+ * typec_plug_register_altmode() or typec_port_register_altmode().
+ */
+void typec_unregister_altmode(struct typec_altmode *alt)
+{
+	if (alt)
+		device_unregister(&alt->dev);
+}
+EXPORT_SYMBOL_GPL(typec_unregister_altmode);
+
+/* ------------------------------------------------------------------------- */
+/* Type-C Partners */
+
+static ssize_t accessory_mode_show(struct device *dev,
+				   struct device_attribute *attr,
+				   char *buf)
+{
+	struct typec_partner *p = to_typec_partner(dev);
+
+	return sprintf(buf, "%s\n", typec_accessory_modes[p->accessory]);
+}
+static DEVICE_ATTR_RO(accessory_mode);
+
+static ssize_t supports_usb_power_delivery_show(struct device *dev,
+						struct device_attribute *attr,
+						char *buf)
+{
+	struct typec_partner *p = to_typec_partner(dev);
+
+	return sprintf(buf, "%s\n", p->usb_pd ? "yes" : "no");
+}
+static DEVICE_ATTR_RO(supports_usb_power_delivery);
+
+static struct attribute *typec_partner_attrs[] = {
+	&dev_attr_accessory_mode.attr,
+	&dev_attr_supports_usb_power_delivery.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(typec_partner);
+
+static void typec_partner_release(struct device *dev)
+{
+	struct typec_partner *partner = to_typec_partner(dev);
+
+	kfree(partner);
+}
+
+static const struct device_type typec_partner_dev_type = {
+	.name = "typec_partner",
+	.groups = typec_partner_groups,
+	.release = typec_partner_release,
+};
+
+/**
+ * typec_partner_set_identity - Report result from Discover Identity command
+ * @partner: The partner with updated identity values
+ *
+ * This routine is used to report that the result of the Discover Identity USB
+ * Power Delivery command has become available.
+ */
+int typec_partner_set_identity(struct typec_partner *partner)
+{
+	if (!partner->identity)
+		return -EINVAL;
+
+	typec_report_identity(&partner->dev);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(typec_partner_set_identity);
+
+/**
+ * typec_partner_register_altmode - Register USB Type-C Partner Alternate Mode
+ * @partner: USB Type-C Partner that supports the alternate mode
+ * @desc: Description of the alternate mode
+ *
+ * This routine is used to register each alternate mode individually that
+ * @partner has listed in response to the Discover SVIDs command. The modes for
+ * an SVID, listed in response to the Discover Modes command, need to be
+ * supplied in an array in @desc.
+ *
+ * Returns handle to the alternate mode on success or NULL on failure.
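+ *
+ * A minimal usage sketch (hypothetical caller code, not part of this patch;
+ * "partner" is the handle from typec_register_partner(), and the SVID, VDO
+ * and index values are made up and would come from the partner's Discover
+ * SVIDs/Modes responses):
+ *
+ *	struct typec_mode_desc mode = { .index = 0, .vdo = 0x00000001 };
+ *	struct typec_altmode_desc adesc = {
+ *		.svid = 0xff01,
+ *		.n_modes = 1,
+ *		.modes = &mode,
+ *	};
+ *	struct typec_altmode *alt;
+ *
+ *	alt = typec_partner_register_altmode(partner, &adesc);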
+ */
+struct typec_altmode *
+typec_partner_register_altmode(struct typec_partner *partner,
+			       struct typec_altmode_desc *desc)
+{
+	return typec_register_altmode(&partner->dev, desc);
+}
+EXPORT_SYMBOL_GPL(typec_partner_register_altmode);
+
+/**
+ * typec_register_partner - Register a USB Type-C Partner
+ * @port: The USB Type-C Port the partner is connected to
+ * @desc: Description of the partner
+ *
+ * Registers a device for the USB Type-C Partner described in @desc.
+ *
+ * Returns handle to the partner on success or NULL on failure.
+ */
+struct typec_partner *typec_register_partner(struct typec_port *port,
+					     struct typec_partner_desc *desc)
+{
+	struct typec_partner *partner;
+	int ret;
+
+	partner = kzalloc(sizeof(*partner), GFP_KERNEL);
+	if (!partner)
+		return NULL;
+
+	partner->usb_pd = desc->usb_pd;
+	partner->accessory = desc->accessory;
+
+	if (desc->identity) {
+		/*
+		 * Creating directory for the identity only if the driver is
+		 * able to provide data to it.
+		 */
+		partner->dev.groups = usb_pd_id_groups;
+		partner->identity = desc->identity;
+	}
+
+	partner->dev.class = typec_class;
+	partner->dev.parent = &port->dev;
+	partner->dev.type = &typec_partner_dev_type;
+	dev_set_name(&partner->dev, "%s-partner", dev_name(&port->dev));
+
+	ret = device_register(&partner->dev);
+	if (ret) {
+		dev_err(&port->dev, "failed to register partner (%d)\n", ret);
+		put_device(&partner->dev);
+		return NULL;
+	}
+
+	return partner;
+}
+EXPORT_SYMBOL_GPL(typec_register_partner);
+
+/**
+ * typec_unregister_partner - Unregister a USB Type-C Partner
+ * @partner: The partner to be unregistered
+ *
+ * Unregister device created with typec_register_partner().
+ */
+void typec_unregister_partner(struct typec_partner *partner)
+{
+	if (partner)
+		device_unregister(&partner->dev);
+}
+EXPORT_SYMBOL_GPL(typec_unregister_partner);
+
+/* ------------------------------------------------------------------------- */
+/* Type-C Cable Plugs */
+
+static void typec_plug_release(struct device *dev)
+{
+	struct typec_plug *plug = to_typec_plug(dev);
+
+	kfree(plug);
+}
+
+static const struct device_type typec_plug_dev_type = {
+	.name = "typec_plug",
+	.release = typec_plug_release,
+};
+
+/**
+ * typec_plug_register_altmode - Register USB Type-C Cable Plug Alternate Mode
+ * @plug: USB Type-C Cable Plug that supports the alternate mode
+ * @desc: Description of the alternate mode
+ *
+ * This routine is used to register each alternate mode individually that @plug
+ * has listed in response to the Discover SVIDs command. The modes for an SVID
+ * that the plug lists in response to the Discover Modes command need to be
+ * supplied in an array in @desc.
+ *
+ * Returns handle to the alternate mode on success or NULL on failure.
+ */
+struct typec_altmode *
+typec_plug_register_altmode(struct typec_plug *plug,
+			    struct typec_altmode_desc *desc)
+{
+	return typec_register_altmode(&plug->dev, desc);
+}
+EXPORT_SYMBOL_GPL(typec_plug_register_altmode);
+
+/**
+ * typec_register_plug - Register a USB Type-C Cable Plug
+ * @cable: USB Type-C Cable with the plug
+ * @desc: Description of the cable plug
+ *
+ * Registers a device for the USB Type-C Cable Plug described in @desc. A USB
+ * Type-C Cable Plug represents a plug with electronics in it that can respond
+ * to USB Power Delivery SOP Prime or SOP Double Prime packets.
+ *
+ * Returns handle to the cable plug on success or NULL on failure.
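+ *
+ * A minimal sketch (hypothetical caller code, not part of this patch;
+ * "cable" is the handle from typec_register_cable(), and TYPEC_PLUG_SOP_P is
+ * assumed to be the plug index enum value for the SOP Prime plug):
+ *
+ *	struct typec_plug_desc pdesc = { .index = TYPEC_PLUG_SOP_P };
+ *	struct typec_plug *plug;
+ *
+ *	plug = typec_register_plug(cable, &pdesc);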
+ */
+struct typec_plug *typec_register_plug(struct typec_cable *cable,
+				       struct typec_plug_desc *desc)
+{
+	struct typec_plug *plug;
+	char name[8];
+	int ret;
+
+	plug = kzalloc(sizeof(*plug), GFP_KERNEL);
+	if (!plug)
+		return NULL;
+
+	sprintf(name, "plug%d", desc->index);
+
+	plug->index = desc->index;
+	plug->dev.class = typec_class;
+	plug->dev.parent = &cable->dev;
+	plug->dev.type = &typec_plug_dev_type;
+	dev_set_name(&plug->dev, "%s-%s", dev_name(cable->dev.parent), name);
+
+	ret = device_register(&plug->dev);
+	if (ret) {
+		dev_err(&cable->dev, "failed to register plug (%d)\n", ret);
+		put_device(&plug->dev);
+		return NULL;
+	}
+
+	return plug;
+}
+EXPORT_SYMBOL_GPL(typec_register_plug);
+
+/**
+ * typec_unregister_plug - Unregister a USB Type-C Cable Plug
+ * @plug: The cable plug to be unregistered
+ *
+ * Unregister device created with typec_register_plug().
+ */
+void typec_unregister_plug(struct typec_plug *plug)
+{
+	if (plug)
+		device_unregister(&plug->dev);
+}
+EXPORT_SYMBOL_GPL(typec_unregister_plug);
+
+/* Type-C Cables */
+
+static ssize_t
+type_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct typec_cable *cable = to_typec_cable(dev);
+
+	return sprintf(buf, "%s\n", cable->active ? "active" : "passive");
+}
+static DEVICE_ATTR_RO(type);
+
+static const char * const typec_plug_types[] = {
+	[USB_PLUG_NONE] = "unknown",
+	[USB_PLUG_TYPE_A] = "type-a",
+	[USB_PLUG_TYPE_B] = "type-b",
+	[USB_PLUG_TYPE_C] = "type-c",
+	[USB_PLUG_CAPTIVE] = "captive",
+};
+
+static ssize_t plug_type_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct typec_cable *cable = to_typec_cable(dev);
+
+	return sprintf(buf, "%s\n", typec_plug_types[cable->type]);
+}
+static DEVICE_ATTR_RO(plug_type);
+
+static struct attribute *typec_cable_attrs[] = {
+	&dev_attr_type.attr,
+	&dev_attr_plug_type.attr,
+	NULL
+};
+ATTRIBUTE_GROUPS(typec_cable);
+
+static void typec_cable_release(struct device *dev)
+{
+	struct typec_cable *cable = to_typec_cable(dev);
+
+	kfree(cable);
+}
+
+static const struct device_type typec_cable_dev_type = {
+	.name = "typec_cable",
+	.groups = typec_cable_groups,
+	.release = typec_cable_release,
+};
+
+/**
+ * typec_cable_set_identity - Report result from Discover Identity command
+ * @cable: The cable with updated identity values
+ *
+ * This routine is used to report that the result of the Discover Identity USB
+ * Power Delivery command has become available.
+ */
+int typec_cable_set_identity(struct typec_cable *cable)
+{
+	if (!cable->identity)
+		return -EINVAL;
+
+	typec_report_identity(&cable->dev);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(typec_cable_set_identity);
+
+/**
+ * typec_register_cable - Register a USB Type-C Cable
+ * @port: The USB Type-C Port the cable is connected to
+ * @desc: Description of the cable
+ *
+ * Registers a device for the USB Type-C Cable described in @desc. The cable
+ * will be the parent of the optional cable plug devices.
+ *
+ * Returns handle to the cable on success or NULL on failure.
+ */
+struct typec_cable *typec_register_cable(struct typec_port *port,
+					 struct typec_cable_desc *desc)
+{
+	struct typec_cable *cable;
+	int ret;
+
+	cable = kzalloc(sizeof(*cable), GFP_KERNEL);
+	if (!cable)
+		return NULL;
+
+	cable->type = desc->type;
+	cable->active = desc->active;
+
+	if (desc->identity) {
+		/*
+		 * Creating directory for the identity only if the driver is
+		 * able to provide data to it.
+ */ + cable->dev.groups = usb_pd_id_groups; + cable->identity = desc->identity; + } + + cable->dev.class = typec_class; + cable->dev.parent = &port->dev; + cable->dev.type = &typec_cable_dev_type; + dev_set_name(&cable->dev, "%s-cable", dev_name(&port->dev)); + + ret = device_register(&cable->dev); + if (ret) { + dev_err(&port->dev, "failed to register cable (%d)\n", ret); + put_device(&cable->dev); + return NULL; + } + + return cable; +} +EXPORT_SYMBOL_GPL(typec_register_cable); + +/** + * typec_unregister_cable - Unregister a USB Type-C Cable + * @cable: The cable to be unregistered + * + * Unregister device created with typec_register_cable(). + */ +void typec_unregister_cable(struct typec_cable *cable) +{ + if (cable) + device_unregister(&cable->dev); +} +EXPORT_SYMBOL_GPL(typec_unregister_cable); + +/* ------------------------------------------------------------------------- */ +/* USB Type-C ports */ + +static const char * const typec_roles[] = { + [TYPEC_SINK] = "sink", + [TYPEC_SOURCE] = "source", +}; + +static const char * const typec_data_roles[] = { + [TYPEC_DEVICE] = "device", + [TYPEC_HOST] = "host", +}; + +static ssize_t +preferred_role_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t size) +{ + struct typec_port *port = to_typec_port(dev); + int role; + int ret; + + if (port->cap->type != TYPEC_PORT_DRP) { + dev_dbg(dev, "Preferred role only supported with DRP ports\n"); + return -EOPNOTSUPP; + } + + if (!port->cap->try_role) { + dev_dbg(dev, "Setting preferred role not supported\n"); + return -EOPNOTSUPP; + } + + role = sysfs_match_string(typec_roles, buf); + if (role < 0) { + if (sysfs_streq(buf, "none")) + role = TYPEC_NO_PREFERRED_ROLE; + else + return -EINVAL; + } + + ret = port->cap->try_role(port->cap, role); + if (ret) + return ret; + + port->prefer_role = role; + return size; +} + +static ssize_t +preferred_role_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct typec_port *port = to_typec_port(dev); + + if (port->cap->type != TYPEC_PORT_DRP) + return 0; + + if (port->prefer_role < 0) + return 0; + + return sprintf(buf, "%s\n", typec_roles[port->prefer_role]); +} +static DEVICE_ATTR_RW(preferred_role); + +static ssize_t data_role_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct typec_port *port = to_typec_port(dev); + int ret; + + if (port->cap->type != TYPEC_PORT_DRP) { + dev_dbg(dev, "data role swap only supported with DRP ports\n"); + return -EOPNOTSUPP; + } + + if (!port->cap->dr_set) { + dev_dbg(dev, "data role swapping not supported\n"); + return -EOPNOTSUPP; + } + + ret = sysfs_match_string(typec_data_roles, buf); + if (ret < 0) + return ret; + + ret = port->cap->dr_set(port->cap, ret); + if (ret) + return ret; + + return size; +} + +static ssize_t data_role_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct typec_port *port = to_typec_port(dev); + + if (port->cap->type == TYPEC_PORT_DRP) + return sprintf(buf, "%s\n", port->data_role == TYPEC_HOST ? 
+ "[host] device" : "host [device]"); + + return sprintf(buf, "[%s]\n", typec_data_roles[port->data_role]); +} +static DEVICE_ATTR_RW(data_role); + +static ssize_t power_role_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct typec_port *port = to_typec_port(dev); + int ret = size; + + if (!port->cap->pd_revision) { + dev_dbg(dev, "USB Power Delivery not supported\n"); + return -EOPNOTSUPP; + } + + if (!port->cap->pr_set) { + dev_dbg(dev, "power role swapping not supported\n"); + return -EOPNOTSUPP; + } + + if (port->pwr_opmode != TYPEC_PWR_MODE_PD) { + dev_dbg(dev, "partner unable to swap power role\n"); + return -EIO; + } + + ret = sysfs_match_string(typec_roles, buf); + if (ret < 0) + return ret; + + ret = port->cap->pr_set(port->cap, ret); + if (ret) + return ret; + + return size; +} + +static ssize_t power_role_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct typec_port *port = to_typec_port(dev); + + if (port->cap->type == TYPEC_PORT_DRP) + return sprintf(buf, "%s\n", port->pwr_role == TYPEC_SOURCE ? + "[source] sink" : "source [sink]"); + + return sprintf(buf, "[%s]\n", typec_roles[port->pwr_role]); +} +static DEVICE_ATTR_RW(power_role); + +static const char * const typec_pwr_opmodes[] = { + [TYPEC_PWR_MODE_USB] = "default", + [TYPEC_PWR_MODE_1_5A] = "1.5A", + [TYPEC_PWR_MODE_3_0A] = "3.0A", + [TYPEC_PWR_MODE_PD] = "usb_power_delivery", +}; + +static ssize_t power_operation_mode_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct typec_port *port = to_typec_port(dev); + + return sprintf(buf, "%s\n", typec_pwr_opmodes[port->pwr_opmode]); +} +static DEVICE_ATTR_RO(power_operation_mode); + +static ssize_t vconn_source_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + struct typec_port *port = to_typec_port(dev); + bool source; + int ret; + + if (!port->cap->pd_revision) { + dev_dbg(dev, "VCONN swap depends on USB Power Delivery\n"); + return -EOPNOTSUPP; + } + + if (!port->cap->vconn_set) { + dev_dbg(dev, "VCONN swapping not supported\n"); + return -EOPNOTSUPP; + } + + ret = kstrtobool(buf, &source); + if (ret) + return ret; + + ret = port->cap->vconn_set(port->cap, (enum typec_role)source); + if (ret) + return ret; + + return size; +} + +static ssize_t vconn_source_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct typec_port *port = to_typec_port(dev); + + return sprintf(buf, "%s\n", + port->vconn_role == TYPEC_SOURCE ? 
"yes" : "no"); +} +static DEVICE_ATTR_RW(vconn_source); + +static ssize_t supported_accessory_modes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct typec_port *port = to_typec_port(dev); + ssize_t ret = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(port->cap->accessory); i++) { + if (port->cap->accessory[i]) + ret += sprintf(buf + ret, "%s ", + typec_accessory_modes[port->cap->accessory[i]]); + } + + if (!ret) + return sprintf(buf, "none\n"); + + buf[ret - 1] = '\n'; + + return ret; +} +static DEVICE_ATTR_RO(supported_accessory_modes); + +static ssize_t usb_typec_revision_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct typec_port *port = to_typec_port(dev); + u16 rev = port->cap->revision; + + return sprintf(buf, "%d.%d\n", (rev >> 8) & 0xff, (rev >> 4) & 0xf); +} +static DEVICE_ATTR_RO(usb_typec_revision); + +static ssize_t usb_power_delivery_revision_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct typec_port *p = to_typec_port(dev); + + return sprintf(buf, "%d\n", (p->cap->pd_revision >> 8) & 0xff); +} +static DEVICE_ATTR_RO(usb_power_delivery_revision); + +static struct attribute *typec_attrs[] = { + &dev_attr_data_role.attr, + &dev_attr_power_operation_mode.attr, + &dev_attr_power_role.attr, + &dev_attr_preferred_role.attr, + &dev_attr_supported_accessory_modes.attr, + &dev_attr_usb_power_delivery_revision.attr, + &dev_attr_usb_typec_revision.attr, + &dev_attr_vconn_source.attr, + NULL, +}; +ATTRIBUTE_GROUPS(typec); + +static int typec_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + int ret; + + ret = add_uevent_var(env, "TYPEC_PORT=%s", dev_name(dev)); + if (ret) + dev_err(dev, "failed to add uevent TYPEC_PORT\n"); + + return ret; +} + +static void typec_release(struct device *dev) +{ + struct typec_port *port = to_typec_port(dev); + + ida_simple_remove(&typec_index_ida, port->id); + kfree(port); +} + +static const struct device_type typec_port_dev_type = { + .name = "typec_port", + .groups = typec_groups, + .uevent = typec_uevent, + .release = typec_release, +}; + +/* --------------------------------------- */ +/* Driver callbacks to report role updates */ + +/** + * typec_set_data_role - Report data role change + * @port: The USB Type-C Port where the role was changed + * @role: The new data role + * + * This routine is used by the port drivers to report data role changes. + */ +void typec_set_data_role(struct typec_port *port, enum typec_data_role role) +{ + if (port->data_role == role) + return; + + port->data_role = role; + sysfs_notify(&port->dev.kobj, NULL, "data_role"); + kobject_uevent(&port->dev.kobj, KOBJ_CHANGE); +} +EXPORT_SYMBOL_GPL(typec_set_data_role); + +/** + * typec_set_pwr_role - Report power role change + * @port: The USB Type-C Port where the role was changed + * @role: The new data role + * + * This routine is used by the port drivers to report power role changes. + */ +void typec_set_pwr_role(struct typec_port *port, enum typec_role role) +{ + if (port->pwr_role == role) + return; + + port->pwr_role = role; + sysfs_notify(&port->dev.kobj, NULL, "power_role"); + kobject_uevent(&port->dev.kobj, KOBJ_CHANGE); +} +EXPORT_SYMBOL_GPL(typec_set_pwr_role); + +/** + * typec_set_pwr_role - Report VCONN source change + * @port: The USB Type-C Port which VCONN role changed + * @role: Source when @port is sourcing VCONN, or Sink when it's not + * + * This routine is used by the port drivers to report if the VCONN source is + * changes. 
+ */
+void typec_set_vconn_role(struct typec_port *port, enum typec_role role)
+{
+	if (port->vconn_role == role)
+		return;
+
+	port->vconn_role = role;
+	sysfs_notify(&port->dev.kobj, NULL, "vconn_source");
+	kobject_uevent(&port->dev.kobj, KOBJ_CHANGE);
+}
+EXPORT_SYMBOL_GPL(typec_set_vconn_role);
+
+/**
+ * typec_set_pwr_opmode - Report changed power operation mode
+ * @port: The USB Type-C Port where the mode was changed
+ * @opmode: New power operation mode
+ *
+ * This routine is used by the port drivers to report a changed power operation
+ * mode in @port. The modes are USB (default), 1.5A and 3.0A as defined in the
+ * USB Type-C specification, and "USB Power Delivery" when the power levels are
+ * negotiated with the methods defined in the USB Power Delivery specification.
+ */
+void typec_set_pwr_opmode(struct typec_port *port,
+			  enum typec_pwr_opmode opmode)
+{
+	if (port->pwr_opmode == opmode)
+		return;
+
+	port->pwr_opmode = opmode;
+	sysfs_notify(&port->dev.kobj, NULL, "power_operation_mode");
+	kobject_uevent(&port->dev.kobj, KOBJ_CHANGE);
+}
+EXPORT_SYMBOL_GPL(typec_set_pwr_opmode);
+
+/* --------------------------------------- */
+
+/**
+ * typec_port_register_altmode - Register USB Type-C Port Alternate Mode
+ * @port: USB Type-C Port that supports the alternate mode
+ * @desc: Description of the alternate mode
+ *
+ * This routine is used to register an alternate mode that @port is capable of
+ * supporting.
+ *
+ * Returns handle to the alternate mode on success or NULL on failure.
+ */
+struct typec_altmode *
+typec_port_register_altmode(struct typec_port *port,
+			    struct typec_altmode_desc *desc)
+{
+	return typec_register_altmode(&port->dev, desc);
+}
+EXPORT_SYMBOL_GPL(typec_port_register_altmode);
+
+/**
+ * typec_register_port - Register a USB Type-C Port
+ * @parent: Parent device
+ * @cap: Description of the port
+ *
+ * Registers a device for the USB Type-C Port described in @cap.
+ *
+ * Returns handle to the port on success or NULL on failure.
+ */
+struct typec_port *typec_register_port(struct device *parent,
+				       const struct typec_capability *cap)
+{
+	struct typec_port *port;
+	int role;
+	int ret;
+	int id;
+
+	port = kzalloc(sizeof(*port), GFP_KERNEL);
+	if (!port)
+		return NULL;
+
+	id = ida_simple_get(&typec_index_ida, 0, 0, GFP_KERNEL);
+	if (id < 0) {
+		kfree(port);
+		return NULL;
+	}
+
+	if (cap->type == TYPEC_PORT_DFP)
+		role = TYPEC_SOURCE;
+	else if (cap->type == TYPEC_PORT_UFP)
+		role = TYPEC_SINK;
+	else
+		role = cap->prefer_role;
+
+	if (role == TYPEC_SOURCE) {
+		port->data_role = TYPEC_HOST;
+		port->pwr_role = TYPEC_SOURCE;
+		port->vconn_role = TYPEC_SOURCE;
+	} else {
+		port->data_role = TYPEC_DEVICE;
+		port->pwr_role = TYPEC_SINK;
+		port->vconn_role = TYPEC_SINK;
+	}
+
+	port->id = id;
+	port->cap = cap;
+	port->prefer_role = cap->prefer_role;
+
+	port->dev.class = typec_class;
+	port->dev.parent = parent;
+	port->dev.fwnode = cap->fwnode;
+	port->dev.type = &typec_port_dev_type;
+	dev_set_name(&port->dev, "port%d", id);
+
+	ret = device_register(&port->dev);
+	if (ret) {
+		dev_err(parent, "failed to register port (%d)\n", ret);
+		put_device(&port->dev);
+		return NULL;
+	}
+
+	return port;
+}
+EXPORT_SYMBOL_GPL(typec_register_port);
+
+/**
+ * typec_unregister_port - Unregister a USB Type-C Port
+ * @port: The port to be unregistered
+ *
+ * Unregister device created with typec_register_port().
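+ *
+ * A short tear-down sketch (hypothetical driver remove path, not part of
+ * this patch; "priv" is a made-up driver private structure, compare
+ * wcove_typec_remove() below):
+ *
+ *	typec_unregister_partner(priv->partner);
+ *	typec_unregister_port(priv->port);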
+ */ +void typec_unregister_port(struct typec_port *port) +{ + if (port) + device_unregister(&port->dev); +} +EXPORT_SYMBOL_GPL(typec_unregister_port); + +static int __init typec_init(void) +{ + typec_class = class_create(THIS_MODULE, "typec"); + return PTR_ERR_OR_ZERO(typec_class); +} +subsys_initcall(typec_init); + +static void __exit typec_exit(void) +{ + class_destroy(typec_class); + ida_destroy(&typec_index_ida); +} +module_exit(typec_exit); + +MODULE_AUTHOR("Heikki Krogerus "); +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("USB Type-C Connector Class"); diff --git a/drivers/usb/typec/typec_wcove.c b/drivers/usb/typec/typec_wcove.c new file mode 100644 index 0000000000000000000000000000000000000000..d5a7b21fa3f198ef43598d170cd61f7494ac4826 --- /dev/null +++ b/drivers/usb/typec/typec_wcove.c @@ -0,0 +1,377 @@ +/** + * typec_wcove.c - WhiskeyCove PMIC USB Type-C PHY driver + * + * Copyright (C) 2017 Intel Corporation + * Author: Heikki Krogerus + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +/* Register offsets */ +#define WCOVE_CHGRIRQ0 0x4e09 +#define WCOVE_PHYCTRL 0x5e07 + +#define USBC_CONTROL1 0x7001 +#define USBC_CONTROL2 0x7002 +#define USBC_CONTROL3 0x7003 +#define USBC_CC1_CTRL 0x7004 +#define USBC_CC2_CTRL 0x7005 +#define USBC_STATUS1 0x7007 +#define USBC_STATUS2 0x7008 +#define USBC_STATUS3 0x7009 +#define USBC_IRQ1 0x7015 +#define USBC_IRQ2 0x7016 +#define USBC_IRQMASK1 0x7017 +#define USBC_IRQMASK2 0x7018 + +/* Register bits */ + +#define USBC_CONTROL1_MODE_DRP(r) (((r) & ~0x7) | 4) + +#define USBC_CONTROL2_UNATT_SNK BIT(0) +#define USBC_CONTROL2_UNATT_SRC BIT(1) +#define USBC_CONTROL2_DIS_ST BIT(2) + +#define USBC_CONTROL3_PD_DIS BIT(1) + +#define USBC_CC_CTRL_VCONN_EN BIT(1) + +#define USBC_STATUS1_DET_ONGOING BIT(6) +#define USBC_STATUS1_RSLT(r) ((r) & 0xf) +#define USBC_RSLT_NOTHING 0 +#define USBC_RSLT_SRC_DEFAULT 1 +#define USBC_RSLT_SRC_1_5A 2 +#define USBC_RSLT_SRC_3_0A 3 +#define USBC_RSLT_SNK 4 +#define USBC_RSLT_DEBUG_ACC 5 +#define USBC_RSLT_AUDIO_ACC 6 +#define USBC_RSLT_UNDEF 15 +#define USBC_STATUS1_ORIENT(r) (((r) >> 4) & 0x3) +#define USBC_ORIENT_NORMAL 1 +#define USBC_ORIENT_REVERSE 2 + +#define USBC_STATUS2_VBUS_REQ BIT(5) + +#define USBC_IRQ1_ADCDONE1 BIT(2) +#define USBC_IRQ1_OVERTEMP BIT(1) +#define USBC_IRQ1_SHORT BIT(0) + +#define USBC_IRQ2_CC_CHANGE BIT(7) +#define USBC_IRQ2_RX_PD BIT(6) +#define USBC_IRQ2_RX_HR BIT(5) +#define USBC_IRQ2_RX_CR BIT(4) +#define USBC_IRQ2_TX_SUCCESS BIT(3) +#define USBC_IRQ2_TX_FAIL BIT(2) + +#define USBC_IRQMASK1_ALL (USBC_IRQ1_ADCDONE1 | USBC_IRQ1_OVERTEMP | \ + USBC_IRQ1_SHORT) + +#define USBC_IRQMASK2_ALL (USBC_IRQ2_CC_CHANGE | USBC_IRQ2_RX_PD | \ + USBC_IRQ2_RX_HR | USBC_IRQ2_RX_CR | \ + USBC_IRQ2_TX_SUCCESS | USBC_IRQ2_TX_FAIL) + +struct wcove_typec { + struct mutex lock; /* device lock */ + struct device *dev; + struct regmap *regmap; + struct typec_port *port; + struct typec_capability cap; + struct typec_partner *partner; +}; + +enum wcove_typec_func { + WCOVE_FUNC_DRIVE_VBUS = 1, + WCOVE_FUNC_ORIENTATION, + WCOVE_FUNC_ROLE, + WCOVE_FUNC_DRIVE_VCONN, +}; + +enum wcove_typec_orientation { + WCOVE_ORIENTATION_NORMAL, + WCOVE_ORIENTATION_REVERSE, +}; + +enum wcove_typec_role { + WCOVE_ROLE_HOST, + WCOVE_ROLE_DEVICE, +}; + +static uuid_le uuid = UUID_LE(0x482383f0, 0x2876, 0x4e49, + 0x86, 0x85, 0xdb, 
0x66, 0x21, 0x1a, 0xf0, 0x37); + +static int wcove_typec_func(struct wcove_typec *wcove, + enum wcove_typec_func func, int param) +{ + union acpi_object *obj; + union acpi_object tmp; + union acpi_object argv4 = ACPI_INIT_DSM_ARGV4(1, &tmp); + + tmp.type = ACPI_TYPE_INTEGER; + tmp.integer.value = param; + + obj = acpi_evaluate_dsm(ACPI_HANDLE(wcove->dev), uuid.b, 1, func, + &argv4); + if (!obj) { + dev_err(wcove->dev, "%s: failed to evaluate _DSM\n", __func__); + return -EIO; + } + + ACPI_FREE(obj); + return 0; +} + +static irqreturn_t wcove_typec_irq(int irq, void *data) +{ + enum typec_role role = TYPEC_SINK; + struct typec_partner_desc partner; + struct wcove_typec *wcove = data; + unsigned int cc1_ctrl; + unsigned int cc2_ctrl; + unsigned int cc_irq1; + unsigned int cc_irq2; + unsigned int status1; + unsigned int status2; + int ret; + + mutex_lock(&wcove->lock); + + ret = regmap_read(wcove->regmap, USBC_IRQ1, &cc_irq1); + if (ret) + goto err; + + ret = regmap_read(wcove->regmap, USBC_IRQ2, &cc_irq2); + if (ret) + goto err; + + ret = regmap_read(wcove->regmap, USBC_STATUS1, &status1); + if (ret) + goto err; + + ret = regmap_read(wcove->regmap, USBC_STATUS2, &status2); + if (ret) + goto err; + + ret = regmap_read(wcove->regmap, USBC_CC1_CTRL, &cc1_ctrl); + if (ret) + goto err; + + ret = regmap_read(wcove->regmap, USBC_CC2_CTRL, &cc2_ctrl); + if (ret) + goto err; + + if (cc_irq1) { + if (cc_irq1 & USBC_IRQ1_OVERTEMP) + dev_err(wcove->dev, "VCONN Switch Over Temperature!\n"); + if (cc_irq1 & USBC_IRQ1_SHORT) + dev_err(wcove->dev, "VCONN Switch Short Circuit!\n"); + ret = regmap_write(wcove->regmap, USBC_IRQ1, cc_irq1); + if (ret) + goto err; + } + + if (cc_irq2) { + ret = regmap_write(wcove->regmap, USBC_IRQ2, cc_irq2); + if (ret) + goto err; + /* + * Ignoring any PD communication interrupts until the PD support + * is available + */ + if (cc_irq2 & ~USBC_IRQ2_CC_CHANGE) { + dev_WARN(wcove->dev, "USB PD handling missing\n"); + goto err; + } + } + + if (status1 & USBC_STATUS1_DET_ONGOING) + goto out; + + if (USBC_STATUS1_RSLT(status1) == USBC_RSLT_NOTHING) { + if (wcove->partner) { + typec_unregister_partner(wcove->partner); + wcove->partner = NULL; + } + + wcove_typec_func(wcove, WCOVE_FUNC_ORIENTATION, + WCOVE_ORIENTATION_NORMAL); + + /* This makes sure the device controller is disconnected */ + wcove_typec_func(wcove, WCOVE_FUNC_ROLE, WCOVE_ROLE_HOST); + + /* Port to default role */ + typec_set_data_role(wcove->port, TYPEC_DEVICE); + typec_set_pwr_role(wcove->port, TYPEC_SINK); + typec_set_pwr_opmode(wcove->port, TYPEC_PWR_MODE_USB); + + goto out; + } + + if (wcove->partner) + goto out; + + switch (USBC_STATUS1_ORIENT(status1)) { + case USBC_ORIENT_NORMAL: + wcove_typec_func(wcove, WCOVE_FUNC_ORIENTATION, + WCOVE_ORIENTATION_NORMAL); + break; + case USBC_ORIENT_REVERSE: + wcove_typec_func(wcove, WCOVE_FUNC_ORIENTATION, + WCOVE_ORIENTATION_REVERSE); + default: + break; + } + + memset(&partner, 0, sizeof(partner)); + + switch (USBC_STATUS1_RSLT(status1)) { + case USBC_RSLT_SRC_DEFAULT: + typec_set_pwr_opmode(wcove->port, TYPEC_PWR_MODE_USB); + break; + case USBC_RSLT_SRC_1_5A: + typec_set_pwr_opmode(wcove->port, TYPEC_PWR_MODE_1_5A); + break; + case USBC_RSLT_SRC_3_0A: + typec_set_pwr_opmode(wcove->port, TYPEC_PWR_MODE_3_0A); + break; + case USBC_RSLT_SNK: + role = TYPEC_SOURCE; + break; + case USBC_RSLT_DEBUG_ACC: + partner.accessory = TYPEC_ACCESSORY_DEBUG; + break; + case USBC_RSLT_AUDIO_ACC: + partner.accessory = TYPEC_ACCESSORY_AUDIO; + break; + default: + dev_WARN(wcove->dev, "%s 
Undefined result\n", __func__);
+		goto err;
+	}
+
+	if (role == TYPEC_SINK) {
+		wcove_typec_func(wcove, WCOVE_FUNC_ROLE, WCOVE_ROLE_DEVICE);
+		typec_set_data_role(wcove->port, TYPEC_DEVICE);
+		typec_set_pwr_role(wcove->port, TYPEC_SINK);
+	} else {
+		wcove_typec_func(wcove, WCOVE_FUNC_ROLE, WCOVE_ROLE_HOST);
+		typec_set_pwr_role(wcove->port, TYPEC_SOURCE);
+		typec_set_data_role(wcove->port, TYPEC_HOST);
+	}
+
+	wcove->partner = typec_register_partner(wcove->port, &partner);
+	if (!wcove->partner)
+		dev_err(wcove->dev, "failed to register partner\n");
+out:
+	/* If either CC pin is requesting VCONN, we turn it on */
+	if ((cc1_ctrl & USBC_CC_CTRL_VCONN_EN) ||
+	    (cc2_ctrl & USBC_CC_CTRL_VCONN_EN))
+		wcove_typec_func(wcove, WCOVE_FUNC_DRIVE_VCONN, true);
+	else
+		wcove_typec_func(wcove, WCOVE_FUNC_DRIVE_VCONN, false);
+
+	/* Relying on the FSM to know when we need to drive VBUS. */
+	wcove_typec_func(wcove, WCOVE_FUNC_DRIVE_VBUS,
+			 !!(status2 & USBC_STATUS2_VBUS_REQ));
+err:
+	/* REVISIT: Clear WhiskeyCove CHGR Type-C interrupt */
+	regmap_write(wcove->regmap, WCOVE_CHGRIRQ0, BIT(5));
+
+	mutex_unlock(&wcove->lock);
+	return IRQ_HANDLED;
+}
+
+static int wcove_typec_probe(struct platform_device *pdev)
+{
+	struct intel_soc_pmic *pmic = dev_get_drvdata(pdev->dev.parent);
+	struct wcove_typec *wcove;
+	unsigned int val;
+	int ret;
+
+	wcove = devm_kzalloc(&pdev->dev, sizeof(*wcove), GFP_KERNEL);
+	if (!wcove)
+		return -ENOMEM;
+
+	mutex_init(&wcove->lock);
+	wcove->dev = &pdev->dev;
+	wcove->regmap = pmic->regmap;
+
+	ret = regmap_irq_get_virq(pmic->irq_chip_data_level2,
+				  platform_get_irq(pdev, 0));
+	if (ret < 0)
+		return ret;
+
+	ret = devm_request_threaded_irq(&pdev->dev, ret, NULL,
+					wcove_typec_irq, IRQF_ONESHOT,
+					"wcove_typec", wcove);
+	if (ret)
+		return ret;
+
+	if (!acpi_check_dsm(ACPI_HANDLE(&pdev->dev), uuid.b, 0, 0x1f)) {
+		dev_err(&pdev->dev, "Missing _DSM functions\n");
+		return -ENODEV;
+	}
+
+	wcove->cap.type = TYPEC_PORT_DRP;
+	wcove->cap.revision = USB_TYPEC_REV_1_1;
+	wcove->cap.prefer_role = TYPEC_NO_PREFERRED_ROLE;
+
+	/* Make sure the PD PHY is disabled until USB PD is available */
+	regmap_read(wcove->regmap, USBC_CONTROL3, &val);
+	regmap_write(wcove->regmap, USBC_CONTROL3, val | USBC_CONTROL3_PD_DIS);
+
+	/* DRP mode without accessory support */
+	regmap_read(wcove->regmap, USBC_CONTROL1, &val);
+	regmap_write(wcove->regmap, USBC_CONTROL1, USBC_CONTROL1_MODE_DRP(val));
+
+	wcove->port = typec_register_port(&pdev->dev, &wcove->cap);
+	if (!wcove->port)
+		return -ENODEV;
+
+	/* Unmask everything */
+	regmap_read(wcove->regmap, USBC_IRQMASK1, &val);
+	regmap_write(wcove->regmap, USBC_IRQMASK1, val & ~USBC_IRQMASK1_ALL);
+	regmap_read(wcove->regmap, USBC_IRQMASK2, &val);
+	regmap_write(wcove->regmap, USBC_IRQMASK2, val & ~USBC_IRQMASK2_ALL);
+
+	platform_set_drvdata(pdev, wcove);
+	return 0;
+}
+
+static int wcove_typec_remove(struct platform_device *pdev)
+{
+	struct wcove_typec *wcove = platform_get_drvdata(pdev);
+	unsigned int val;
+
+	/* Mask everything */
+	regmap_read(wcove->regmap, USBC_IRQMASK1, &val);
+	regmap_write(wcove->regmap, USBC_IRQMASK1, val | USBC_IRQMASK1_ALL);
+	regmap_read(wcove->regmap, USBC_IRQMASK2, &val);
+	regmap_write(wcove->regmap, USBC_IRQMASK2, val | USBC_IRQMASK2_ALL);
+
+	typec_unregister_partner(wcove->partner);
+	typec_unregister_port(wcove->port);
+	return 0;
+}
+
+static struct platform_driver wcove_typec_driver = {
+	.driver = {
+		.name = "bxt_wcove_usbc",
+	},
+	.probe = wcove_typec_probe,
+	.remove = wcove_typec_remove,
+};
+
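+/*
+ * A minimal sketch of another consumer of the connector class added in this
+ * patch (hypothetical driver code, for illustration only; my_dr_set, my_cap
+ * and parent are made-up names): registering a DRP port with a dr_set() hook
+ * lets userspace request data role swaps through the data_role attribute.
+ *
+ *	static int my_dr_set(const struct typec_capability *cap,
+ *			     enum typec_data_role role)
+ *	{
+ *		... start the role swap in hardware here ...
+ *		return 0;
+ *	}
+ *
+ *	static const struct typec_capability my_cap = {
+ *		.type = TYPEC_PORT_DRP,
+ *		.prefer_role = TYPEC_NO_PREFERRED_ROLE,
+ *		.dr_set = my_dr_set,
+ *	};
+ *
+ *	port = typec_register_port(parent, &my_cap);
+ */
+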
+module_platform_driver(wcove_typec_driver); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_LICENSE("GPL v2"); +MODULE_DESCRIPTION("WhiskeyCove PMIC USB Type-C PHY driver"); +MODULE_ALIAS("platform:bxt_wcove_usbc"); diff --git a/drivers/usb/usb-skeleton.c b/drivers/usb/usb-skeleton.c index 5133a0792eb0418391f74e4f56486731dc7e62fc..bb0bd732e29ab7369fb3865a4682a596b5f4459a 100644 --- a/drivers/usb/usb-skeleton.c +++ b/drivers/usb/usb-skeleton.c @@ -491,16 +491,14 @@ static int skel_probe(struct usb_interface *interface, const struct usb_device_id *id) { struct usb_skel *dev; - struct usb_host_interface *iface_desc; - struct usb_endpoint_descriptor *endpoint; - size_t buffer_size; - int i; - int retval = -ENOMEM; + struct usb_endpoint_descriptor *bulk_in, *bulk_out; + int retval; /* allocate memory for our device state and initialize it */ dev = kzalloc(sizeof(*dev), GFP_KERNEL); if (!dev) - goto error; + return -ENOMEM; + kref_init(&dev->kref); sema_init(&dev->limit_sem, WRITES_IN_FLIGHT); mutex_init(&dev->io_mutex); @@ -513,36 +511,29 @@ static int skel_probe(struct usb_interface *interface, /* set up the endpoint information */ /* use only the first bulk-in and bulk-out endpoints */ - iface_desc = interface->cur_altsetting; - for (i = 0; i < iface_desc->desc.bNumEndpoints; ++i) { - endpoint = &iface_desc->endpoint[i].desc; - - if (!dev->bulk_in_endpointAddr && - usb_endpoint_is_bulk_in(endpoint)) { - /* we found a bulk in endpoint */ - buffer_size = usb_endpoint_maxp(endpoint); - dev->bulk_in_size = buffer_size; - dev->bulk_in_endpointAddr = endpoint->bEndpointAddress; - dev->bulk_in_buffer = kmalloc(buffer_size, GFP_KERNEL); - if (!dev->bulk_in_buffer) - goto error; - dev->bulk_in_urb = usb_alloc_urb(0, GFP_KERNEL); - if (!dev->bulk_in_urb) - goto error; - } - - if (!dev->bulk_out_endpointAddr && - usb_endpoint_is_bulk_out(endpoint)) { - /* we found a bulk out endpoint */ - dev->bulk_out_endpointAddr = endpoint->bEndpointAddress; - } - } - if (!(dev->bulk_in_endpointAddr && dev->bulk_out_endpointAddr)) { + retval = usb_find_common_endpoints(interface->cur_altsetting, + &bulk_in, &bulk_out, NULL, NULL); + if (retval) { dev_err(&interface->dev, "Could not find both bulk-in and bulk-out endpoints\n"); goto error; } + dev->bulk_in_size = usb_endpoint_maxp(bulk_in); + dev->bulk_in_endpointAddr = bulk_in->bEndpointAddress; + dev->bulk_in_buffer = kmalloc(dev->bulk_in_size, GFP_KERNEL); + if (!dev->bulk_in_buffer) { + retval = -ENOMEM; + goto error; + } + dev->bulk_in_urb = usb_alloc_urb(0, GFP_KERNEL); + if (!dev->bulk_in_urb) { + retval = -ENOMEM; + goto error; + } + + dev->bulk_out_endpointAddr = bulk_out->bEndpointAddress; + /* save our data pointer in this interface device */ usb_set_intfdata(interface, dev); @@ -563,9 +554,9 @@ static int skel_probe(struct usb_interface *interface, return 0; error: - if (dev) - /* this frees allocated memory */ - kref_put(&dev->kref, skel_delete); + /* this frees allocated memory */ + kref_put(&dev->kref, skel_delete); + return retval; } diff --git a/drivers/usb/usbip/vhci_hcd.c b/drivers/usb/usbip/vhci_hcd.c index e4cb9f0625e8f0b1afdf9fdf591430780d68eac2..5d8b2c261940d9a7785bc64a505f2959f57cb395 100644 --- a/drivers/usb/usbip/vhci_hcd.c +++ b/drivers/usb/usbip/vhci_hcd.c @@ -430,36 +430,8 @@ static int vhci_hub_control(struct usb_hcd *hcd, u16 typeReq, u16 wValue, return retval; } -static struct vhci_device *get_vdev(struct usb_device *udev) +static void vhci_tx_urb(struct urb *urb, struct vhci_device *vdev) { - struct platform_device *pdev; - struct 
usb_hcd *hcd; - struct vhci_hcd *vhci; - int pdev_nr, rhport; - - if (!udev) - return NULL; - - for (pdev_nr = 0; pdev_nr < vhci_num_controllers; pdev_nr++) { - pdev = *(vhci_pdevs + pdev_nr); - if (pdev == NULL) - continue; - hcd = platform_get_drvdata(pdev); - if (hcd == NULL) - continue; - vhci = hcd_to_vhci(hcd); - for (rhport = 0; rhport < VHCI_HC_PORTS; rhport++) { - if (vhci->vdev[rhport].udev == udev) - return &vhci->vdev[rhport]; - } - } - - return NULL; -} - -static void vhci_tx_urb(struct urb *urb) -{ - struct vhci_device *vdev = get_vdev(urb->dev); struct vhci_priv *priv; struct vhci_hcd *vhci; unsigned long flags; @@ -601,7 +573,7 @@ static int vhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, } out: - vhci_tx_urb(urb); + vhci_tx_urb(urb, vdev); spin_unlock_irqrestore(&vhci->lock, flags); return 0; diff --git a/drivers/vfio/vfio_iommu_spapr_tce.c b/drivers/vfio/vfio_iommu_spapr_tce.c index cf3de91fbfe7a522456d00bff9005df9a3662ed4..63112c36ab2de129e41b70e2c9db77e17399a781 100644 --- a/drivers/vfio/vfio_iommu_spapr_tce.c +++ b/drivers/vfio/vfio_iommu_spapr_tce.c @@ -198,6 +198,11 @@ static long tce_iommu_register_pages(struct tce_container *container, return ret; tcemem = kzalloc(sizeof(*tcemem), GFP_KERNEL); + if (!tcemem) { + mm_iommu_put(container->mm, mem); + return -ENOMEM; + } + tcemem->mem = mem; list_add(&tcemem->next, &container->prereg_list); @@ -680,7 +685,7 @@ static void tce_iommu_free_table(struct tce_container *container, unsigned long pages = tbl->it_allocated_size >> PAGE_SHIFT; tce_iommu_userspace_view_free(tbl, container->mm); - tbl->it_ops->free(tbl); + iommu_tce_table_put(tbl); decrement_locked_vm(container->mm, pages); } @@ -1335,8 +1340,16 @@ static int tce_iommu_attach_group(void *iommu_data, if (!table_group->ops || !table_group->ops->take_ownership || !table_group->ops->release_ownership) { + if (container->v2) { + ret = -EPERM; + goto unlock_exit; + } ret = tce_iommu_take_ownership(container, table_group); } else { + if (!container->v2) { + ret = -EPERM; + goto unlock_exit; + } ret = tce_iommu_take_ownership_ddw(container, table_group); if (!tce_groups_attached(container) && !container->tables[0]) container->def_window_pending = true; diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 32d2633092a37edf64ec4b9d2afc9fa4f12ea77d..8549cb111627f0b4bc64956df7c2b1e945bbfdc2 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -246,69 +246,46 @@ static int vfio_iova_put_vfio_pfn(struct vfio_dma *dma, struct vfio_pfn *vpfn) return ret; } -struct vwork { - struct mm_struct *mm; - long npage; - struct work_struct work; -}; - -/* delayed decrement/increment for locked_vm */ -static void vfio_lock_acct_bg(struct work_struct *work) +static int vfio_lock_acct(struct task_struct *task, long npage, bool *lock_cap) { - struct vwork *vwork = container_of(work, struct vwork, work); - struct mm_struct *mm; - - mm = vwork->mm; - down_write(&mm->mmap_sem); - mm->locked_vm += vwork->npage; - up_write(&mm->mmap_sem); - mmput(mm); - kfree(vwork); -} - -static void vfio_lock_acct(struct task_struct *task, long npage) -{ - struct vwork *vwork; struct mm_struct *mm; bool is_current; + int ret; if (!npage) - return; + return 0; is_current = (task->mm == current->mm); mm = is_current ? 
task->mm : get_task_mm(task); if (!mm) - return; /* process exited */ + return -ESRCH; /* process exited */ - if (down_write_trylock(&mm->mmap_sem)) { - mm->locked_vm += npage; - up_write(&mm->mmap_sem); - if (!is_current) - mmput(mm); - return; - } + ret = down_write_killable(&mm->mmap_sem); + if (!ret) { + if (npage > 0) { + if (lock_cap ? !*lock_cap : + !has_capability(task, CAP_IPC_LOCK)) { + unsigned long limit; + + limit = task_rlimit(task, + RLIMIT_MEMLOCK) >> PAGE_SHIFT; + + if (mm->locked_vm + npage > limit) + ret = -ENOMEM; + } + } + + if (!ret) + mm->locked_vm += npage; - if (is_current) { - mm = get_task_mm(task); - if (!mm) - return; + up_write(&mm->mmap_sem); } - /* - * Couldn't get mmap_sem lock, so must setup to update - * mm->locked_vm later. If locked_vm were atomic, we - * wouldn't need this silliness - */ - vwork = kmalloc(sizeof(struct vwork), GFP_KERNEL); - if (WARN_ON(!vwork)) { + if (!is_current) mmput(mm); - return; - } - INIT_WORK(&vwork->work, vfio_lock_acct_bg); - vwork->mm = mm; - vwork->npage = npage; - schedule_work(&vwork->work); + + return ret; } /* @@ -403,10 +380,10 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, * first page and all consecutive pages with the same locking. */ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, - long npage, unsigned long *pfn_base) + long npage, unsigned long *pfn_base, + bool lock_cap, unsigned long limit) { - unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - bool lock_cap = capable(CAP_IPC_LOCK); + unsigned long pfn = 0; long ret, pinned = 0, lock_acct = 0; bool rsvd; dma_addr_t iova = vaddr - dma->vaddr + dma->iova; @@ -442,8 +419,6 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, /* Lock all the consecutive pages from pfn_base */ for (vaddr += PAGE_SIZE, iova += PAGE_SIZE; pinned < npage; pinned++, vaddr += PAGE_SIZE, iova += PAGE_SIZE) { - unsigned long pfn = 0; - ret = vaddr_get_pfn(current->mm, vaddr, dma->prot, &pfn); if (ret) break; @@ -460,14 +435,25 @@ static long vfio_pin_pages_remote(struct vfio_dma *dma, unsigned long vaddr, put_pfn(pfn, dma->prot); pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n", __func__, limit << PAGE_SHIFT); - break; + ret = -ENOMEM; + goto unpin_out; } lock_acct++; } } out: - vfio_lock_acct(current, lock_acct); + ret = vfio_lock_acct(current, lock_acct, &lock_cap); + +unpin_out: + if (ret) { + if (!rsvd) { + for (pfn = *pfn_base ; pinned ; pfn++, pinned--) + put_pfn(pfn, dma->prot); + } + + return ret; + } return pinned; } @@ -488,7 +474,7 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, } if (do_accounting) - vfio_lock_acct(dma->task, locked - unlocked); + vfio_lock_acct(dma->task, locked - unlocked, NULL); return unlocked; } @@ -496,37 +482,26 @@ static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova, static int vfio_pin_page_external(struct vfio_dma *dma, unsigned long vaddr, unsigned long *pfn_base, bool do_accounting) { - unsigned long limit; - bool lock_cap = has_capability(dma->task, CAP_IPC_LOCK); struct mm_struct *mm; int ret; - bool rsvd; mm = get_task_mm(dma->task); if (!mm) return -ENODEV; ret = vaddr_get_pfn(mm, vaddr, dma->prot, pfn_base); - if (ret) - goto pin_page_exit; - - rsvd = is_invalid_reserved_pfn(*pfn_base); - limit = task_rlimit(dma->task, RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (!rsvd && !lock_cap && mm->locked_vm + 1 > limit) { - put_pfn(*pfn_base, dma->prot); - pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK (%ld) exceeded\n", - 
__func__, dma->task->comm, task_pid_nr(dma->task), - limit << PAGE_SHIFT); - ret = -ENOMEM; - goto pin_page_exit; + if (!ret && do_accounting && !is_invalid_reserved_pfn(*pfn_base)) { + ret = vfio_lock_acct(dma->task, 1, NULL); + if (ret) { + put_pfn(*pfn_base, dma->prot); + if (ret == -ENOMEM) + pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK " + "(%ld) exceeded\n", __func__, + dma->task->comm, task_pid_nr(dma->task), + task_rlimit(dma->task, RLIMIT_MEMLOCK)); + } } - if (!rsvd && do_accounting) - vfio_lock_acct(dma->task, 1); - ret = 1; - -pin_page_exit: mmput(mm); return ret; } @@ -543,7 +518,7 @@ static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova, unlocked = vfio_iova_put_vfio_pfn(dma, vpfn); if (do_accounting) - vfio_lock_acct(dma->task, -unlocked); + vfio_lock_acct(dma->task, -unlocked, NULL); return unlocked; } @@ -606,10 +581,8 @@ static int vfio_iommu_type1_pin_pages(void *iommu_data, remote_vaddr = dma->vaddr + iova - dma->iova; ret = vfio_pin_page_external(dma, remote_vaddr, &phys_pfn[i], do_accounting); - if (ret <= 0) { - WARN_ON(!ret); + if (ret) goto pin_unwind; - } ret = vfio_add_to_pfn_list(dma, iova, phys_pfn[i]); if (ret) { @@ -740,7 +713,7 @@ static long vfio_unmap_unpin(struct vfio_iommu *iommu, struct vfio_dma *dma, dma->iommu_mapped = false; if (do_accounting) { - vfio_lock_acct(dma->task, -unlocked); + vfio_lock_acct(dma->task, -unlocked, NULL); return 0; } return unlocked; @@ -951,13 +924,15 @@ static int vfio_pin_map_dma(struct vfio_iommu *iommu, struct vfio_dma *dma, unsigned long vaddr = dma->vaddr; size_t size = map_size; long npage; - unsigned long pfn; + unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + bool lock_cap = capable(CAP_IPC_LOCK); int ret = 0; while (size) { /* Pin a contiguous chunk of memory */ npage = vfio_pin_pages_remote(dma, vaddr + dma->size, - size >> PAGE_SHIFT, &pfn); + size >> PAGE_SHIFT, &pfn, + lock_cap, limit); if (npage <= 0) { WARN_ON(!npage); ret = (int)npage; @@ -1067,6 +1042,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, { struct vfio_domain *d; struct rb_node *n; + unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + bool lock_cap = capable(CAP_IPC_LOCK); int ret; /* Arbitrarily pick the first domain in the list for lookups */ @@ -1113,7 +1090,8 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu, npage = vfio_pin_pages_remote(dma, vaddr, n >> PAGE_SHIFT, - &pfn); + &pfn, lock_cap, + limit); if (npage <= 0) { WARN_ON(!npage); ret = (int)npage; @@ -1382,7 +1360,7 @@ static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu) if (!is_invalid_reserved_pfn(vpfn->pfn)) locked++; } - vfio_lock_acct(dma->task, locked - unlocked); + vfio_lock_acct(dma->task, locked - unlocked, NULL); } } diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 9b519897cc17b8f31236bd51bfa283fbf5ada319..f61f852d6cfd114978036208bf2b0a5c841231d4 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -817,12 +817,9 @@ static int vhost_net_open(struct inode *inode, struct file *f) struct vhost_virtqueue **vqs; int i; - n = kmalloc(sizeof *n, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!n) { - n = vmalloc(sizeof *n); - if (!n) - return -ENOMEM; - } + n = kvmalloc(sizeof *n, GFP_KERNEL | __GFP_REPEAT); + if (!n) + return -ENOMEM; vqs = kmalloc(VHOST_NET_VQ_MAX * sizeof(*vqs), GFP_KERNEL); if (!vqs) { kvfree(n); diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index f0ba362d4c101aa970a12943d1e69944d54058b7..042030e5a0357ad698181f817e6a75ca7c3087e3 100644 --- 
a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -534,18 +534,9 @@ long vhost_dev_set_owner(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_dev_set_owner); -static void *vhost_kvzalloc(unsigned long size) -{ - void *n = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - - if (!n) - n = vzalloc(size); - return n; -} - struct vhost_umem *vhost_dev_reset_owner_prepare(void) { - return vhost_kvzalloc(sizeof(struct vhost_umem)); + return kvzalloc(sizeof(struct vhost_umem), GFP_KERNEL); } EXPORT_SYMBOL_GPL(vhost_dev_reset_owner_prepare); @@ -1276,7 +1267,7 @@ EXPORT_SYMBOL_GPL(vhost_vq_access_ok); static struct vhost_umem *vhost_umem_alloc(void) { - struct vhost_umem *umem = vhost_kvzalloc(sizeof(*umem)); + struct vhost_umem *umem = kvzalloc(sizeof(*umem), GFP_KERNEL); if (!umem) return NULL; @@ -1302,7 +1293,7 @@ static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) return -EOPNOTSUPP; if (mem.nregions > max_mem_regions) return -E2BIG; - newmem = vhost_kvzalloc(size + mem.nregions * sizeof(*m->regions)); + newmem = kvzalloc(size + mem.nregions * sizeof(*m->regions), GFP_KERNEL); if (!newmem) return -ENOMEM; diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index 44eed8eb0725b25e3c9765e19387e7c338ab9bbb..3acef3c5d8edeca06d09e4b5d7e4ddeed0c8936c 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -176,6 +176,11 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, restart_tx = true; } + /* Deliver to monitoring devices all correctly transmitted + * packets. + */ + virtio_transport_deliver_tap_pkt(pkt); + virtio_transport_free_pkt(pkt); } if (added) @@ -383,6 +388,9 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) len = pkt->len; + /* Deliver to monitoring devices all received packets */ + virtio_transport_deliver_tap_pkt(pkt); + /* Only accept correctly addressed packets */ if (le64_to_cpu(pkt->hdr.src_cid) == vsock->guest_cid) virtio_transport_recv_pkt(pkt); @@ -500,12 +508,9 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) /* This struct is large and allocation could fail, fall back to vmalloc * if there is no other way. */ - vsock = kzalloc(sizeof(*vsock), GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!vsock) { - vsock = vmalloc(sizeof(*vsock)); - if (!vsock) - return -ENOMEM; - } + vsock = kvmalloc(sizeof(*vsock), GFP_KERNEL | __GFP_REPEAT); + if (!vsock) + return -ENOMEM; vqs = kmalloc_array(ARRAY_SIZE(vsock->vqs), sizeof(*vqs), GFP_KERNEL); if (!vqs) { diff --git a/drivers/video/Makefile b/drivers/video/Makefile index 9ad3c17d645689b79e56364377a71671eb4aac23..445b2c230b56ade5695c9e074ed9548b1a77caff 100644 --- a/drivers/video/Makefile +++ b/drivers/video/Makefile @@ -2,6 +2,7 @@ obj-$(CONFIG_VGASTATE) += vgastate.o obj-$(CONFIG_HDMI) += hdmi.o obj-$(CONFIG_VT) += console/ +obj-$(CONFIG_FB_STI) += console/ obj-$(CONFIG_LOGO) += logo/ obj-y += backlight/ diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index 5ffa4b4e26c0e874d3f63068456b3d72d8b4beb5..4e1d2ad50ba142950e0048d986ab7e0df2eafece 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -460,6 +460,13 @@ config BACKLIGHT_BD6107 help If you have a Rohm BD6107 say Y to enable the backlight driver. +config BACKLIGHT_ARCXCNN + tristate "Backlight driver for the Arctic Sands ARCxCnnnn family" + depends on I2C + help + If you have an ARCxCnnnn family backlight say Y to enable + the backlight driver. 
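[Editorial aside: several hunks above (vhost_net_open(), vhost_vsock_dev_open(), and the removed vhost_kvzalloc() helper) replace the same open-coded allocation fallback with the new kvmalloc()/kvzalloc() helpers. A minimal before/after sketch, assuming only what those hunks themselves show; old_alloc/new_alloc are illustrative names.]

#include <linux/mm.h>		/* kvmalloc(), kvfree() */
#include <linux/slab.h>		/* kmalloc() */
#include <linux/vmalloc.h>	/* vmalloc() */

/* Old pattern: try kmalloc() quietly, then fall back to vmalloc(). */
static void *old_alloc(size_t size)
{
	void *p = kmalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);

	if (!p)
		p = vmalloc(size);
	return p;
}

/* New pattern: one call; either way the result is freed with kvfree(). */
static void *new_alloc(size_t size)
{
	return kvmalloc(size, GFP_KERNEL | __GFP_REPEAT);
}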
+ endif # BACKLIGHT_CLASS_DEVICE endif # BACKLIGHT_LCD_SUPPORT diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile index 16ec534cff3044209adbae3495c19d97fee3ef73..8905129691e84a2fe42acd625f32b0a01772097c 100644 --- a/drivers/video/backlight/Makefile +++ b/drivers/video/backlight/Makefile @@ -55,3 +55,4 @@ obj-$(CONFIG_BACKLIGHT_SKY81452) += sky81452-backlight.o obj-$(CONFIG_BACKLIGHT_TOSA) += tosa_bl.o obj-$(CONFIG_BACKLIGHT_TPS65217) += tps65217_bl.o obj-$(CONFIG_BACKLIGHT_WM831X) += wm831x_bl.o +obj-$(CONFIG_BACKLIGHT_ARCXCNN) += arcxcnn_bl.o diff --git a/drivers/video/backlight/arcxcnn_bl.c b/drivers/video/backlight/arcxcnn_bl.c new file mode 100644 index 0000000000000000000000000000000000000000..dec790de72ff4a8496c28be145c834353e06cebc --- /dev/null +++ b/drivers/video/backlight/arcxcnn_bl.c @@ -0,0 +1,419 @@ +/* + * Backlight driver for ArcticSand ARC_X_C_0N_0N Devices + * + * Copyright 2016 ArcticSand, Inc. + * Author : Brian Dodge + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, see . + */ + +#include +#include +#include +#include +#include +#include + +enum arcxcnn_chip_id { + ARC2C0608 +}; + +/** + * struct arcxcnn_platform_data + * @name : Backlight driver name (NULL will use default) + * @initial_brightness : initial value of backlight brightness + * @leden : initial LED string enables, upper bit is global on/off + * @led_config_0 : fading speed (period between intensity steps) + * @led_config_1 : misc settings, see datasheet + * @dim_freq : pwm dimming frequency if in pwm mode + * @comp_config : misc config, see datasheet + * @filter_config : RC/PWM filter config, see datasheet + * @trim_config : full scale current trim, see datasheet + */ +struct arcxcnn_platform_data { + const char *name; + u16 initial_brightness; + u8 leden; + u8 led_config_0; + u8 led_config_1; + u8 dim_freq; + u8 comp_config; + u8 filter_config; + u8 trim_config; +}; + +#define ARCXCNN_CMD 0x00 /* Command Register */ +#define ARCXCNN_CMD_STDBY 0x80 /* I2C Standby */ +#define ARCXCNN_CMD_RESET 0x40 /* Reset */ +#define ARCXCNN_CMD_BOOST 0x10 /* Boost */ +#define ARCXCNN_CMD_OVP_MASK 0x0C /* --- Over Voltage Threshold */ +#define ARCXCNN_CMD_OVP_XXV 0x0C /* Over Voltage Threshold */ +#define ARCXCNN_CMD_OVP_20V 0x08 /* 20v Over Voltage Threshold */ +#define ARCXCNN_CMD_OVP_24V 0x04 /* 24v Over Voltage Threshold */ +#define ARCXCNN_CMD_OVP_31V 0x00 /* 31.4v Over Voltage Threshold */ +#define ARCXCNN_CMD_EXT_COMP 0x01 /* part (0) or full (1) ext. 
comp */ + +#define ARCXCNN_CONFIG 0x01 /* Configuration */ +#define ARCXCNN_STATUS1 0x02 /* Status 1 */ +#define ARCXCNN_STATUS2 0x03 /* Status 2 */ +#define ARCXCNN_FADECTRL 0x04 /* Fading Control */ +#define ARCXCNN_ILED_CONFIG 0x05 /* ILED Configuration */ +#define ARCXCNN_ILED_DIM_PWM 0x00 /* config dim mode pwm */ +#define ARCXCNN_ILED_DIM_INT 0x04 /* config dim mode internal */ +#define ARCXCNN_LEDEN 0x06 /* LED Enable Register */ +#define ARCXCNN_LEDEN_ISETEXT 0x80 /* Full-scale current set extern */ +#define ARCXCNN_LEDEN_MASK 0x3F /* LED string enables mask */ +#define ARCXCNN_LEDEN_BITS 0x06 /* Bits of LED string enables */ +#define ARCXCNN_LEDEN_LED1 0x01 +#define ARCXCNN_LEDEN_LED2 0x02 +#define ARCXCNN_LEDEN_LED3 0x04 +#define ARCXCNN_LEDEN_LED4 0x08 +#define ARCXCNN_LEDEN_LED5 0x10 +#define ARCXCNN_LEDEN_LED6 0x20 + +#define ARCXCNN_WLED_ISET_LSB 0x07 /* LED ISET LSB (in upper nibble) */ +#define ARCXCNN_WLED_ISET_LSB_SHIFT 0x04 /* ISET LSB Left Shift */ +#define ARCXCNN_WLED_ISET_MSB 0x08 /* LED ISET MSB (8 bits) */ + +#define ARCXCNN_DIMFREQ 0x09 +#define ARCXCNN_COMP_CONFIG 0x0A +#define ARCXCNN_FILT_CONFIG 0x0B +#define ARCXCNN_IMAXTUNE 0x0C +#define ARCXCNN_ID_MSB 0x1E +#define ARCXCNN_ID_LSB 0x1F + +#define MAX_BRIGHTNESS 4095 +#define INIT_BRIGHT 60 + +struct arcxcnn { + struct i2c_client *client; + struct backlight_device *bl; + struct device *dev; + struct arcxcnn_platform_data *pdata; +}; + +static int arcxcnn_update_field(struct arcxcnn *lp, u8 reg, u8 mask, u8 data) +{ + int ret; + u8 tmp; + + ret = i2c_smbus_read_byte_data(lp->client, reg); + if (ret < 0) { + dev_err(lp->dev, "failed to read 0x%.2x\n", reg); + return ret; + } + + tmp = (u8)ret; + tmp &= ~mask; + tmp |= data & mask; + + return i2c_smbus_write_byte_data(lp->client, reg, tmp); +} + +static int arcxcnn_set_brightness(struct arcxcnn *lp, u32 brightness) +{ + int ret; + u8 val; + + /* lower nibble of brightness goes in upper nibble of LSB register */ + val = (brightness & 0xF) << ARCXCNN_WLED_ISET_LSB_SHIFT; + ret = i2c_smbus_write_byte_data(lp->client, + ARCXCNN_WLED_ISET_LSB, val); + if (ret < 0) + return ret; + + /* remaining 8 bits of brightness go in MSB register */ + val = (brightness >> 4); + return i2c_smbus_write_byte_data(lp->client, + ARCXCNN_WLED_ISET_MSB, val); +} + +static int arcxcnn_bl_update_status(struct backlight_device *bl) +{ + struct arcxcnn *lp = bl_get_data(bl); + u32 brightness = bl->props.brightness; + int ret; + + if (bl->props.state & (BL_CORE_SUSPENDED | BL_CORE_FBBLANK)) + brightness = 0; + + ret = arcxcnn_set_brightness(lp, brightness); + if (ret) + return ret; + + /* set power-on/off/save modes */ + return arcxcnn_update_field(lp, ARCXCNN_CMD, ARCXCNN_CMD_STDBY, + (bl->props.power == 0) ? 0 : ARCXCNN_CMD_STDBY); +} + +static const struct backlight_ops arcxcnn_bl_ops = { + .options = BL_CORE_SUSPENDRESUME, + .update_status = arcxcnn_bl_update_status, +}; + +static int arcxcnn_backlight_register(struct arcxcnn *lp) +{ + struct backlight_properties *props; + const char *name = lp->pdata->name ? 
: "arctic_bl"; + + props = devm_kzalloc(lp->dev, sizeof(*props), GFP_KERNEL); + if (!props) + return -ENOMEM; + + props->type = BACKLIGHT_PLATFORM; + props->max_brightness = MAX_BRIGHTNESS; + + if (lp->pdata->initial_brightness > props->max_brightness) + lp->pdata->initial_brightness = props->max_brightness; + + props->brightness = lp->pdata->initial_brightness; + + lp->bl = devm_backlight_device_register(lp->dev, name, lp->dev, lp, + &arcxcnn_bl_ops, props); + return PTR_ERR_OR_ZERO(lp->bl); +} + +static void arcxcnn_parse_dt(struct arcxcnn *lp) +{ + struct device *dev = lp->dev; + struct device_node *node = dev->of_node; + u32 prog_val, num_entry, entry, sources[ARCXCNN_LEDEN_BITS]; + int ret; + + /* device tree entry isn't required, defaults are OK */ + if (!node) + return; + + ret = of_property_read_string(node, "label", &lp->pdata->name); + if (ret < 0) + lp->pdata->name = NULL; + + ret = of_property_read_u32(node, "default-brightness", &prog_val); + if (ret == 0) + lp->pdata->initial_brightness = prog_val; + + ret = of_property_read_u32(node, "arc,led-config-0", &prog_val); + if (ret == 0) + lp->pdata->led_config_0 = (u8)prog_val; + + ret = of_property_read_u32(node, "arc,led-config-1", &prog_val); + if (ret == 0) + lp->pdata->led_config_1 = (u8)prog_val; + + ret = of_property_read_u32(node, "arc,dim-freq", &prog_val); + if (ret == 0) + lp->pdata->dim_freq = (u8)prog_val; + + ret = of_property_read_u32(node, "arc,comp-config", &prog_val); + if (ret == 0) + lp->pdata->comp_config = (u8)prog_val; + + ret = of_property_read_u32(node, "arc,filter-config", &prog_val); + if (ret == 0) + lp->pdata->filter_config = (u8)prog_val; + + ret = of_property_read_u32(node, "arc,trim-config", &prog_val); + if (ret == 0) + lp->pdata->trim_config = (u8)prog_val; + + ret = of_property_count_u32_elems(node, "led-sources"); + if (ret < 0) { + lp->pdata->leden = ARCXCNN_LEDEN_MASK; /* all on is default */ + } else { + num_entry = ret; + if (num_entry > ARCXCNN_LEDEN_BITS) + num_entry = ARCXCNN_LEDEN_BITS; + + ret = of_property_read_u32_array(node, "led-sources", sources, + num_entry); + if (ret < 0) { + dev_err(dev, "led-sources node is invalid.\n"); + return; + } + + lp->pdata->leden = 0; + + /* for each listed source, set the matching bit in the LED-enable mask */ + for (entry = 0; entry < num_entry; entry++) { + u8 onbit = 1 << sources[entry]; + + lp->pdata->leden |= onbit; + } + } +} + +static int arcxcnn_probe(struct i2c_client *cl, const struct i2c_device_id *id) +{ + struct arcxcnn *lp; + int ret; + + if (!i2c_check_functionality(cl->adapter, I2C_FUNC_SMBUS_BYTE_DATA)) + return -EIO; + + lp = devm_kzalloc(&cl->dev, sizeof(*lp), GFP_KERNEL); + if (!lp) + return -ENOMEM; + + lp->client = cl; + lp->dev = &cl->dev; + lp->pdata = dev_get_platdata(&cl->dev); + + /* reset the device */ + ret = i2c_smbus_write_byte_data(lp->client, + ARCXCNN_CMD, ARCXCNN_CMD_RESET); + if (ret) + goto probe_err; + + if (!lp->pdata) { + lp->pdata = devm_kzalloc(lp->dev, + sizeof(*lp->pdata), GFP_KERNEL); + if (!lp->pdata) + return -ENOMEM; + + /* Setup defaults based on power-on defaults */ + lp->pdata->name = NULL; + lp->pdata->initial_brightness = INIT_BRIGHT; + lp->pdata->leden = ARCXCNN_LEDEN_MASK; + + lp->pdata->led_config_0 = i2c_smbus_read_byte_data( + lp->client, ARCXCNN_FADECTRL); + + lp->pdata->led_config_1 = i2c_smbus_read_byte_data( + lp->client, ARCXCNN_ILED_CONFIG); + /* ensure dim mode is not default pwm */ + lp->pdata->led_config_1 |= ARCXCNN_ILED_DIM_INT; + + lp->pdata->dim_freq = i2c_smbus_read_byte_data( + lp->client, 
ARCXCNN_DIMFREQ); + + lp->pdata->comp_config = i2c_smbus_read_byte_data( + lp->client, ARCXCNN_COMP_CONFIG); + + lp->pdata->filter_config = i2c_smbus_read_byte_data( + lp->client, ARCXCNN_FILT_CONFIG); + + lp->pdata->trim_config = i2c_smbus_read_byte_data( + lp->client, ARCXCNN_IMAXTUNE); + + if (IS_ENABLED(CONFIG_OF)) + arcxcnn_parse_dt(lp); + } + + i2c_set_clientdata(cl, lp); + + /* constrain settings to what is possible */ + if (lp->pdata->initial_brightness > MAX_BRIGHTNESS) + lp->pdata->initial_brightness = MAX_BRIGHTNESS; + + /* set initial brightness */ + ret = arcxcnn_set_brightness(lp, lp->pdata->initial_brightness); + if (ret) + goto probe_err; + + /* set other register values directly */ + ret = i2c_smbus_write_byte_data(lp->client, ARCXCNN_FADECTRL, + lp->pdata->led_config_0); + if (ret) + goto probe_err; + + ret = i2c_smbus_write_byte_data(lp->client, ARCXCNN_ILED_CONFIG, + lp->pdata->led_config_1); + if (ret) + goto probe_err; + + ret = i2c_smbus_write_byte_data(lp->client, ARCXCNN_DIMFREQ, + lp->pdata->dim_freq); + if (ret) + goto probe_err; + + ret = i2c_smbus_write_byte_data(lp->client, ARCXCNN_COMP_CONFIG, + lp->pdata->comp_config); + if (ret) + goto probe_err; + + ret = i2c_smbus_write_byte_data(lp->client, ARCXCNN_FILT_CONFIG, + lp->pdata->filter_config); + if (ret) + goto probe_err; + + ret = i2c_smbus_write_byte_data(lp->client, ARCXCNN_IMAXTUNE, + lp->pdata->trim_config); + if (ret) + goto probe_err; + + /* set initial LED Enables */ + arcxcnn_update_field(lp, ARCXCNN_LEDEN, + ARCXCNN_LEDEN_MASK, lp->pdata->leden); + + ret = arcxcnn_backlight_register(lp); + if (ret) + goto probe_register_err; + + backlight_update_status(lp->bl); + + return 0; + +probe_register_err: + dev_err(lp->dev, + "failed to register backlight.\n"); + +probe_err: + dev_err(lp->dev, + "failure ret: %d\n", ret); + return ret; +} + +static int arcxcnn_remove(struct i2c_client *cl) +{ + struct arcxcnn *lp = i2c_get_clientdata(cl); + + /* disable all strings (ignore errors) */ + i2c_smbus_write_byte_data(lp->client, + ARCXCNN_LEDEN, 0x00); + /* reset the device (ignore errors) */ + i2c_smbus_write_byte_data(lp->client, + ARCXCNN_CMD, ARCXCNN_CMD_RESET); + + lp->bl->props.brightness = 0; + + backlight_update_status(lp->bl); + + return 0; +} + +static const struct of_device_id arcxcnn_dt_ids[] = { + { .compatible = "arc,arc2c0608" }, + { } +}; +MODULE_DEVICE_TABLE(of, arcxcnn_dt_ids); + +static const struct i2c_device_id arcxcnn_ids[] = { + {"arc2c0608", ARC2C0608}, + { } +}; +MODULE_DEVICE_TABLE(i2c, arcxcnn_ids); + +static struct i2c_driver arcxcnn_driver = { + .driver = { + .name = "arcxcnn_bl", + .of_match_table = of_match_ptr(arcxcnn_dt_ids), + }, + .probe = arcxcnn_probe, + .remove = arcxcnn_remove, + .id_table = arcxcnn_ids, +}; +module_i2c_driver(arcxcnn_driver); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Brian Dodge "); +MODULE_DESCRIPTION("ARCXCNN Backlight driver"); diff --git a/drivers/video/backlight/pwm_bl.c b/drivers/video/backlight/pwm_bl.c index d7efcb632f7d9dde08b0a494c455725b05d55af0..002f1ce22bd02032062924b19d645ebc25302c93 100644 --- a/drivers/video/backlight/pwm_bl.c +++ b/drivers/video/backlight/pwm_bl.c @@ -297,14 +297,15 @@ static int pwm_backlight_probe(struct platform_device *pdev) } /* - * If the GPIO is configured as input, change the direction to output - * and set the GPIO as active. 
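[Editorial aside: arcxcnn_update_field() in the driver above is an instance of the common SMBus read-modify-write idiom. A standalone sketch of that idiom, assuming only the i2c_smbus_{read,write}_byte_data() calls the driver itself uses; smbus_update_bits() is a hypothetical helper name, not an existing API.]

#include <linux/i2c.h>

/*
 * Read a byte register, clear the masked bits, OR in the new value,
 * write it back. i2c_smbus_read_byte_data() returns the byte read or
 * a negative errno, so the sign check doubles as error handling.
 */
static int smbus_update_bits(struct i2c_client *client, u8 reg, u8 mask, u8 val)
{
	int ret = i2c_smbus_read_byte_data(client, reg);

	if (ret < 0)
		return ret;

	return i2c_smbus_write_byte_data(client, reg,
					 ((u8)ret & ~mask) | (val & mask));
}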
+ * If the GPIO is not known to be already configured as output, that + * is, if gpiod_get_direction returns either GPIOF_DIR_IN or -EINVAL, + * change the direction to output and set the GPIO as active. * Do not force the GPIO to active when it was already output as it * could cause backlight flickering or we would enable the backlight too * early. Leave the decision of the initial backlight state for later. */ if (pb->enable_gpio && - gpiod_get_direction(pb->enable_gpio) == GPIOF_DIR_IN) + gpiod_get_direction(pb->enable_gpio) != GPIOF_DIR_OUT) gpiod_direction_output(pb->enable_gpio, 1); pb->power_supply = devm_regulator_get(&pdev->dev, "power"); diff --git a/drivers/video/console/Kconfig b/drivers/video/console/Kconfig index 5b71bd905a6066ee53e3615d82b2d889a031d10c..2111d06f8c81a5260235befb029d59901e7bde28 100644 --- a/drivers/video/console/Kconfig +++ b/drivers/video/console/Kconfig @@ -6,7 +6,7 @@ menu "Console display driver support" config VGA_CONSOLE bool "VGA text console" if EXPERT || !X86 - depends on !4xx && !8xx && !SPARC && !M68K && !PARISC && !FRV && \ + depends on !4xx && !PPC_8xx && !SPARC && !M68K && !PARISC && !FRV && \ !SUPERH && !BLACKFIN && !AVR32 && !MN10300 && !CRIS && \ (!ARM || ARCH_FOOTBRIDGE || ARCH_INTEGRATOR || ARCH_NETWINDER) && \ !ARM64 && !ARC && !MICROBLAZE && !OPENRISC diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig index 922e4eaed9c5b5278572a79eb1cfc14e267a8bd5..5c6696bb56da89d1ca86f00f16c3c408303ca5ad 100644 --- a/drivers/video/fbdev/Kconfig +++ b/drivers/video/fbdev/Kconfig @@ -689,8 +689,6 @@ config FB_STI select FB_CFB_FILLRECT select FB_CFB_COPYAREA select FB_CFB_IMAGEBLIT - select STI_CONSOLE - select VT default y ---help--- STI refers to the HP "Standard Text Interface" which is a set of diff --git a/drivers/video/fbdev/acornfb.c b/drivers/video/fbdev/acornfb.c index fb75b7e5a19ab52748e4a1980d56046e15716908..0c325b4da61de876d998463389ef040373be8645 100644 --- a/drivers/video/fbdev/acornfb.c +++ b/drivers/video/fbdev/acornfb.c @@ -101,7 +101,7 @@ extern unsigned int vram_size; /* set by setup.c */ #ifdef HAS_VIDC20 #include -#define MAX_SIZE 2*1024*1024 +#define MAX_SIZE (2*1024*1024) /* VIDC20 has a different set of rules from the VIDC: * hcr : must be multiple of 4 @@ -162,7 +162,7 @@ static void acornfb_set_timing(struct fb_info *info) if (memcmp(¤t_vidc, &vidc, sizeof(vidc))) { current_vidc = vidc; - vidc_writel(VIDC20_CTRL| vidc.control); + vidc_writel(VIDC20_CTRL | vidc.control); vidc_writel(0xd0000000 | vidc.pll_ctl); vidc_writel(0x80000000 | vidc.h_cycle); vidc_writel(0x81000000 | vidc.h_sync_width); @@ -297,7 +297,7 @@ acornfb_setcolreg(u_int regno, u_int red, u_int green, u_int blue, pal.p = 0; vidc_writel(0x10000000); for (i = 0; i < 256; i += 1) { - pal.vidc20.red = current_par.palette[ i & 31].vidc20.red; + pal.vidc20.red = current_par.palette[i & 31].vidc20.red; pal.vidc20.green = current_par.palette[(i >> 1) & 31].vidc20.green; pal.vidc20.blue = current_par.palette[(i >> 2) & 31].vidc20.blue; vidc_writel(pal.p); @@ -1043,8 +1043,7 @@ static int acornfb_probe(struct platform_device *dev) base = dma_alloc_wc(current_par.dev, size, &handle, GFP_KERNEL); if (base == NULL) { - printk(KERN_ERR "acornfb: unable to allocate screen " - "memory\n"); + printk(KERN_ERR "acornfb: unable to allocate screen memory\n"); return -ENOMEM; } @@ -1103,8 +1102,7 @@ static int acornfb_probe(struct platform_device *dev) v_sync = h_sync / (fb_info.var.yres + fb_info.var.upper_margin + fb_info.var.lower_margin + fb_info.var.vsync_len); - 
printk(KERN_INFO "Acornfb: %dkB %cRAM, %s, using %dx%d, " - "%d.%03dkHz, %dHz\n", + printk(KERN_INFO "Acornfb: %dkB %cRAM, %s, using %dx%d, %d.%03dkHz, %dHz\n", fb_info.fix.smem_len / 1024, current_par.using_vram ? 'V' : 'D', VIDC_NAME, fb_info.var.xres, fb_info.var.yres, diff --git a/drivers/video/fbdev/amba-clcd.c b/drivers/video/fbdev/amba-clcd.c index 0fab92c628280359ec79023b545ff893c12a2d6c..ffc2c33c6cef520a9aec6473b1b5fd1af96039fd 100644 --- a/drivers/video/fbdev/amba-clcd.c +++ b/drivers/video/fbdev/amba-clcd.c @@ -881,8 +881,8 @@ static int clcdfb_of_dma_setup(struct clcd_fb *fb) if (err) return err; - framesize = fb->panel->mode.xres * fb->panel->mode.yres * - fb->panel->bpp / 8; + framesize = PAGE_ALIGN(fb->panel->mode.xres * fb->panel->mode.yres * + fb->panel->bpp / 8); fb->fb.screen_base = dma_alloc_coherent(&fb->dev->dev, framesize, &dma, GFP_KERNEL); if (!fb->fb.screen_base) diff --git a/drivers/video/fbdev/arcfb.c b/drivers/video/fbdev/arcfb.c index 1928cb2b5386fad37b55f1e6995cfaef88b979e2..7e87d0d616581dbf2e58045b424198524d28b3f4 100644 --- a/drivers/video/fbdev/arcfb.c +++ b/drivers/video/fbdev/arcfb.c @@ -645,17 +645,17 @@ module_param(nosplash, uint, 0); MODULE_PARM_DESC(nosplash, "Disable doing the splash screen"); module_param(arcfb_enable, uint, 0); MODULE_PARM_DESC(arcfb_enable, "Enable communication with Arc board"); -module_param(dio_addr, ulong, 0); +module_param_hw(dio_addr, ulong, ioport, 0); MODULE_PARM_DESC(dio_addr, "IO address for data, eg: 0x480"); -module_param(cio_addr, ulong, 0); +module_param_hw(cio_addr, ulong, ioport, 0); MODULE_PARM_DESC(cio_addr, "IO address for control, eg: 0x400"); -module_param(c2io_addr, ulong, 0); +module_param_hw(c2io_addr, ulong, ioport, 0); MODULE_PARM_DESC(c2io_addr, "IO address for secondary control, eg: 0x408"); module_param(splashval, ulong, 0); MODULE_PARM_DESC(splashval, "Splash pattern: 0xFF is black, 0x00 is green"); module_param(tuhold, ulong, 0); MODULE_PARM_DESC(tuhold, "Time to hold between strobing data to Arc board"); -module_param(irq, uint, 0); +module_param_hw(irq, uint, irq, 0); MODULE_PARM_DESC(irq, "IRQ for the Arc board"); module_init(arcfb_init); diff --git a/drivers/video/fbdev/aty/radeon_base.c b/drivers/video/fbdev/aty/radeon_base.c index 218339a4edaac46e5b223d07f12f335b8966c66f..6b4c7872b37519f9fe6238d4fa505d76e14d4c92 100644 --- a/drivers/video/fbdev/aty/radeon_base.c +++ b/drivers/video/fbdev/aty/radeon_base.c @@ -2453,8 +2453,8 @@ static int radeonfb_pci_register(struct pci_dev *pdev, err |= sysfs_create_bin_file(&rinfo->pdev->dev.kobj, &edid2_attr); if (err) - pr_warning("%s() Creating sysfs files failed, continuing\n", - __func__); + pr_warn("%s() Creating sysfs files failed, continuing\n", + __func__); /* save current mode regs before we switch into the new one * so we can restore this upon __exit diff --git a/drivers/video/fbdev/core/fbmon.c b/drivers/video/fbdev/core/fbmon.c index 62c0cf79674fec39137fe39552288b26e23773e5..687ebb053438b343d63c77881aeebcdc0ad7f857 100644 --- a/drivers/video/fbdev/core/fbmon.c +++ b/drivers/video/fbdev/core/fbmon.c @@ -1073,9 +1073,9 @@ void fb_edid_add_monspecs(unsigned char *edid, struct fb_monspecs *specs) for (i = specs->modedb_len + num; i < specs->modedb_len + num + svd_n; i++) { int idx = svd[i - specs->modedb_len - num]; if (!idx || idx >= ARRAY_SIZE(cea_modes)) { - pr_warning("Reserved SVD code %d\n", idx); + pr_warn("Reserved SVD code %d\n", idx); } else if (!cea_modes[idx].xres) { - pr_warning("Unimplemented SVD code %d\n", idx); + 
pr_warn("Unimplemented SVD code %d\n", idx); } else { memcpy(&m[i], cea_modes + idx, sizeof(m[i])); pr_debug("Adding SVD #%d: %ux%u@%u\n", idx, diff --git a/drivers/video/fbdev/i810/i810_main.c b/drivers/video/fbdev/i810/i810_main.c index 483ab2592d0c056e9920834303b722105e5eca16..2488baab7c89253b4606faf3b5ae3877939b9201 100644 --- a/drivers/video/fbdev/i810/i810_main.c +++ b/drivers/video/fbdev/i810/i810_main.c @@ -81,7 +81,7 @@ static u32 voffset; static int i810fb_cursor(struct fb_info *info, struct fb_cursor *cursor); static int i810fb_init_pci(struct pci_dev *dev, const struct pci_device_id *entry); -static void __exit i810fb_remove_pci(struct pci_dev *dev); +static void i810fb_remove_pci(struct pci_dev *dev); static int i810fb_resume(struct pci_dev *dev); static int i810fb_suspend(struct pci_dev *dev, pm_message_t state); @@ -128,7 +128,7 @@ static struct pci_driver i810fb_driver = { .name = "i810fb", .id_table = i810fb_pci_tbl, .probe = i810fb_init_pci, - .remove = __exit_p(i810fb_remove_pci), + .remove = i810fb_remove_pci, .suspend = i810fb_suspend, .resume = i810fb_resume, }; @@ -2123,7 +2123,7 @@ static void i810fb_release_resource(struct fb_info *info, } -static void __exit i810fb_remove_pci(struct pci_dev *dev) +static void i810fb_remove_pci(struct pci_dev *dev) { struct fb_info *info = pci_get_drvdata(dev); struct i810fb_par *par = info->par; diff --git a/drivers/video/fbdev/imxfb.c b/drivers/video/fbdev/imxfb.c index 1b0faadb30801921d74231b6fc788fac5b67d9cd..c166e0725be5dab13e9a685a93ea7ea9c23a3351 100644 --- a/drivers/video/fbdev/imxfb.c +++ b/drivers/video/fbdev/imxfb.c @@ -117,6 +117,9 @@ #define IMXFB_LSCR1_DEFAULT 0x00120300 +#define LCDC_LAUSCR 0x80 +#define LAUSCR_AUS_MODE (1<<31) + /* Used fb-mode. Can be set on kernel command line, therefore file-static. */ static const char *fb_mode; @@ -158,6 +161,7 @@ struct imxfb_info { dma_addr_t dbar2; u_int pcr; + u_int lauscr; u_int pwmr; u_int lscr1; u_int dmacr; @@ -422,6 +426,11 @@ static int imxfb_check_var(struct fb_var_screeninfo *var, struct fb_info *info) pcr |= imxfb_mode->pcr & ~(0x3f | (7 << 25)); fbi->pcr = pcr; + /* + * The LCDC AUS Mode Control Register does not exist on imx1. 
+ */ + if (!is_imx1_fb(fbi) && imxfb_mode->aus_mode) + fbi->lauscr = LAUSCR_AUS_MODE; /* * Copy the RGB parameters for this display @@ -638,6 +647,9 @@ static int imxfb_activate_var(struct fb_var_screeninfo *var, struct fb_info *inf if (fbi->dmacr) writel(fbi->dmacr, fbi->regs + LCDC_DMACR); + if (fbi->lauscr) + writel(fbi->lauscr, fbi->regs + LCDC_LAUSCR); + return 0; } @@ -734,6 +746,11 @@ static int imxfb_of_read_mode(struct device *dev, struct device_node *np, imxfb_mode->bpp = bpp; imxfb_mode->pcr = pcr; + /* + * fsl,aus-mode is optional + */ + imxfb_mode->aus_mode = of_property_read_bool(np, "fsl,aus-mode"); + return 0; } diff --git a/drivers/video/fbdev/intelfb/intelfbdrv.c b/drivers/video/fbdev/intelfb/intelfbdrv.c index ff2a5d2023e1f88078c90b56746303783cbd7429..6b444400a86c2252c7f0c165869113f8393ca1a3 100644 --- a/drivers/video/fbdev/intelfb/intelfbdrv.c +++ b/drivers/video/fbdev/intelfb/intelfbdrv.c @@ -934,7 +934,7 @@ static __inline__ int var_to_refresh(const struct fb_var_screeninfo *var) } /*************************************************************** - * Various intialisation functions * + * Various initialisation functions * ***************************************************************/ static void get_initial_mode(struct intelfb_info *dinfo) diff --git a/drivers/video/fbdev/n411.c b/drivers/video/fbdev/n411.c index 053deacad7cc12d3415e7e5d3b6c0b55e91e22ea..a3677313396e21fa78c266ff5df7a972ccc360a1 100644 --- a/drivers/video/fbdev/n411.c +++ b/drivers/video/fbdev/n411.c @@ -193,11 +193,11 @@ module_exit(n411_exit); module_param(nosplash, uint, 0); MODULE_PARM_DESC(nosplash, "Disable doing the splash screen"); -module_param(dio_addr, ulong, 0); +module_param_hw(dio_addr, ulong, ioport, 0); MODULE_PARM_DESC(dio_addr, "IO address for data, eg: 0x480"); -module_param(cio_addr, ulong, 0); +module_param_hw(cio_addr, ulong, ioport, 0); MODULE_PARM_DESC(cio_addr, "IO address for control, eg: 0x400"); -module_param(c2io_addr, ulong, 0); +module_param_hw(c2io_addr, ulong, ioport, 0); MODULE_PARM_DESC(c2io_addr, "IO address for secondary control, eg: 0x408"); module_param(splashval, ulong, 0); MODULE_PARM_DESC(splashval, "Splash pattern: 0x00 is black, 0x01 is white"); diff --git a/drivers/video/fbdev/omap/lcd_mipid.c b/drivers/video/fbdev/omap/lcd_mipid.c index c81f150589e1f8a2c4a9c342d305ef2684f121cb..df9e6ebcfad5f8d37048e3db8901406e96177c1a 100644 --- a/drivers/video/fbdev/omap/lcd_mipid.c +++ b/drivers/video/fbdev/omap/lcd_mipid.c @@ -174,7 +174,7 @@ static void hw_guard_wait(struct mipid_device *md) { unsigned long wait = md->hw_guard_end - jiffies; - if ((long)wait > 0 && wait <= md->hw_guard_wait) { + if ((long)wait > 0 && time_before_eq(wait, md->hw_guard_wait)) { set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(wait); } diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dss.c b/drivers/video/fbdev/omap2/omapfb/dss/dss.c index 47d7f69ad9ad858a75595922f032341d319f442d..48c6500c24e1f4b05a93d4a274c4a366cb8c0d6f 100644 --- a/drivers/video/fbdev/omap2/omapfb/dss/dss.c +++ b/drivers/video/fbdev/omap2/omapfb/dss/dss.c @@ -941,11 +941,13 @@ static int dss_init_features(struct platform_device *pdev) return 0; } +static void dss_uninit_ports(struct platform_device *pdev); + static int dss_init_ports(struct platform_device *pdev) { struct device_node *parent = pdev->dev.of_node; struct device_node *port; - int r; + int r, ret = 0; if (parent == NULL) return 0; @@ -972,17 +974,21 @@ static int dss_init_ports(struct platform_device *pdev) switch (port_type) { case 
OMAP_DISPLAY_TYPE_DPI: - dpi_init_port(pdev, port); + ret = dpi_init_port(pdev, port); break; case OMAP_DISPLAY_TYPE_SDI: - sdi_init_port(pdev, port); + ret = sdi_init_port(pdev, port); break; default: break; } - } while ((port = omapdss_of_get_next_port(parent, port)) != NULL); + } while (!ret && + (port = omapdss_of_get_next_port(parent, port)) != NULL); - return 0; + if (ret) + dss_uninit_ports(pdev); + + return ret; } static void dss_uninit_ports(struct platform_device *pdev) diff --git a/drivers/video/fbdev/pmag-aa-fb.c b/drivers/video/fbdev/pmag-aa-fb.c index ffe2dd482f840b51e83d3180bc327c1aefdc029e..39922f072db4f89ed5212a8e509cb2e2ca86c2d8 100644 --- a/drivers/video/fbdev/pmag-aa-fb.c +++ b/drivers/video/fbdev/pmag-aa-fb.c @@ -247,7 +247,7 @@ static int pmagaafb_probe(struct device *dev) return err; } -static int __exit pmagaafb_remove(struct device *dev) +static int pmagaafb_remove(struct device *dev) { struct tc_dev *tdev = to_tc_dev(dev); struct fb_info *info = dev_get_drvdata(dev); @@ -280,7 +280,7 @@ static struct tc_driver pmagaafb_driver = { .name = "pmagaafb", .bus = &tc_bus_type, .probe = pmagaafb_probe, - .remove = __exit_p(pmagaafb_remove), + .remove = pmagaafb_remove, }, }; diff --git a/drivers/video/fbdev/pmag-ba-fb.c b/drivers/video/fbdev/pmag-ba-fb.c index df02fb4b7fd1c2fe6363ef5173ed516479811da1..1fd02f40708eb79740db54a292ca87e8e24cab9c 100644 --- a/drivers/video/fbdev/pmag-ba-fb.c +++ b/drivers/video/fbdev/pmag-ba-fb.c @@ -235,7 +235,7 @@ static int pmagbafb_probe(struct device *dev) return err; } -static int __exit pmagbafb_remove(struct device *dev) +static int pmagbafb_remove(struct device *dev) { struct tc_dev *tdev = to_tc_dev(dev); struct fb_info *info = dev_get_drvdata(dev); @@ -270,7 +270,7 @@ static struct tc_driver pmagbafb_driver = { .name = "pmagbafb", .bus = &tc_bus_type, .probe = pmagbafb_probe, - .remove = __exit_p(pmagbafb_remove), + .remove = pmagbafb_remove, }, }; diff --git a/drivers/video/fbdev/pmagb-b-fb.c b/drivers/video/fbdev/pmagb-b-fb.c index a7a179a0bb33f2d85beb0301945375dad45e8f79..46e96c4515060f0c58ade6ff892f711ad39aa0a6 100644 --- a/drivers/video/fbdev/pmagb-b-fb.c +++ b/drivers/video/fbdev/pmagb-b-fb.c @@ -353,7 +353,7 @@ static int pmagbbfb_probe(struct device *dev) return err; } -static int __exit pmagbbfb_remove(struct device *dev) +static int pmagbbfb_remove(struct device *dev) { struct tc_dev *tdev = to_tc_dev(dev); struct fb_info *info = dev_get_drvdata(dev); @@ -388,7 +388,7 @@ static struct tc_driver pmagbbfb_driver = { .name = "pmagbbfb", .bus = &tc_bus_type, .probe = pmagbbfb_probe, - .remove = __exit_p(pmagbbfb_remove), + .remove = pmagbbfb_remove, }, }; diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c index ef73f14d7ba00d4b185739d65006cb50ebea7580..b21a89b03fb474a4a1c5b8821ad8861cd55b7e35 100644 --- a/drivers/video/fbdev/pxafb.c +++ b/drivers/video/fbdev/pxafb.c @@ -645,7 +645,7 @@ static void overlay1fb_disable(struct pxafb_layer *ofb) lcd_writel(ofb->fbi, FBR1, ofb->fbi->fdadr[DMA_OV1] | 0x3); if (wait_for_completion_timeout(&ofb->branch_done, 1 * HZ) == 0) - pr_warning("%s: timeout disabling overlay1\n", __func__); + pr_warn("%s: timeout disabling overlay1\n", __func__); lcd_writel(ofb->fbi, LCCR5, lccr5); } @@ -710,7 +710,7 @@ static void overlay2fb_disable(struct pxafb_layer *ofb) lcd_writel(ofb->fbi, FBR4, ofb->fbi->fdadr[DMA_OV2_Cr] | 0x3); if (wait_for_completion_timeout(&ofb->branch_done, 1 * HZ) == 0) - pr_warning("%s: timeout disabling overlay2\n", __func__); + pr_warn("%s: timeout disabling 
overlay2\n", __func__); } static struct pxafb_layer_ops ofb_ops[] = { @@ -1187,8 +1187,7 @@ int pxafb_smart_flush(struct fb_info *info) lcd_writel(fbi, LCCR0, fbi->reg_lccr0 | LCCR0_ENB); if (wait_for_completion_timeout(&fbi->command_done, HZ/2) == 0) { - pr_warning("%s: timeout waiting for command done\n", - __func__); + pr_warn("%s: timeout waiting for command done\n", __func__); ret = -ETIMEDOUT; } diff --git a/drivers/video/fbdev/sm501fb.c b/drivers/video/fbdev/sm501fb.c index d80bc8a3200fa9c8435bc0b606a4bae0f02728f1..67e314fdd9471513b911d07d33c0ff3c13e25a05 100644 --- a/drivers/video/fbdev/sm501fb.c +++ b/drivers/video/fbdev/sm501fb.c @@ -1600,6 +1600,7 @@ static int sm501fb_start(struct sm501fb_info *info, info->fbmem = ioremap(res->start, resource_size(res)); if (info->fbmem == NULL) { dev_err(dev, "cannot remap framebuffer\n"); + ret = -ENXIO; goto err_mem_res; } diff --git a/drivers/video/fbdev/udlfb.c b/drivers/video/fbdev/udlfb.c index e9c2f7ba3c8e6c34382b6b0b58a7d65e41e8292d..6a3c353de7c35468e68f271126dca1c0b447ae82 100644 --- a/drivers/video/fbdev/udlfb.c +++ b/drivers/video/fbdev/udlfb.c @@ -1487,15 +1487,25 @@ static struct device_attribute fb_device_attrs[] = { static int dlfb_select_std_channel(struct dlfb_data *dev) { int ret; - u8 set_def_chn[] = { 0x57, 0xCD, 0xDC, 0xA7, + void *buf; + static const u8 set_def_chn[] = { + 0x57, 0xCD, 0xDC, 0xA7, 0x1C, 0x88, 0x5E, 0x15, 0x60, 0xFE, 0xC6, 0x97, 0x16, 0x3D, 0x47, 0xF2 }; + buf = kmemdup(set_def_chn, sizeof(set_def_chn), GFP_KERNEL); + + if (!buf) + return -ENOMEM; + ret = usb_control_msg(dev->udev, usb_sndctrlpipe(dev->udev, 0), NR_USB_REQUEST_CHANNEL, (USB_DIR_OUT | USB_TYPE_VENDOR), 0, 0, - set_def_chn, sizeof(set_def_chn), USB_CTRL_SET_TIMEOUT); + buf, sizeof(set_def_chn), USB_CTRL_SET_TIMEOUT); + + kfree(buf); + return ret; } diff --git a/drivers/video/fbdev/vermilion/vermilion.c b/drivers/video/fbdev/vermilion/vermilion.c index 1c1e95a0b8faa04006c857158c79ac11bc904d96..ce4c4729a5e8c30ea1222b584e159b4a45c24b48 100644 --- a/drivers/video/fbdev/vermilion/vermilion.c +++ b/drivers/video/fbdev/vermilion/vermilion.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include diff --git a/drivers/video/fbdev/xen-fbfront.c b/drivers/video/fbdev/xen-fbfront.c index 3ee309c50b2d015fb3d463dd88f6b99253beea12..46f63960fa9e6aa2913be8b804c1de24ce1245be 100644 --- a/drivers/video/fbdev/xen-fbfront.c +++ b/drivers/video/fbdev/xen-fbfront.c @@ -18,6 +18,8 @@ * frame buffer. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -380,10 +382,18 @@ static int xenfb_probe(struct xenbus_device *dev, video[KPARAM_MEM] = val; } + video[KPARAM_WIDTH] = xenbus_read_unsigned(dev->otherend, "width", + video[KPARAM_WIDTH]); + video[KPARAM_HEIGHT] = xenbus_read_unsigned(dev->otherend, "height", + video[KPARAM_HEIGHT]); + /* If requested res does not fit in available memory, use default */ fb_size = video[KPARAM_MEM] * 1024 * 1024; if (video[KPARAM_WIDTH] * video[KPARAM_HEIGHT] * XENFB_DEPTH / 8 > fb_size) { + pr_warn("display parameters %d,%d,%d invalid, use defaults\n", + video[KPARAM_MEM], video[KPARAM_WIDTH], + video[KPARAM_HEIGHT]); video[KPARAM_WIDTH] = XENFB_WIDTH; video[KPARAM_HEIGHT] = XENFB_HEIGHT; fb_size = XENFB_DEFAULT_FB_LEN; diff --git a/drivers/video/logo/logo.c b/drivers/video/logo/logo.c index b6bc4a0bda2a120a30cdb6a18f66fff9d7de0862..4d50bfd13e7c9f58f9f8f472627d6b7051e1e922 100644 --- a/drivers/video/logo/logo.c +++ b/drivers/video/logo/logo.c @@ -34,7 +34,7 @@ static int __init fb_logo_late_init(void) return 0; } -late_initcall(fb_logo_late_init); +late_initcall_sync(fb_logo_late_init); /* logo's are marked __initdata. Use __ref to tell * modpost that it is intended that this function uses data diff --git a/drivers/virt/fsl_hypervisor.c b/drivers/virt/fsl_hypervisor.c index 150ce2abf6c8f193b4e3b5c9fa66001cd709f39e..d3eca879a0a85474f858e79b759d8b4920ba9709 100644 --- a/drivers/virt/fsl_hypervisor.c +++ b/drivers/virt/fsl_hypervisor.c @@ -243,11 +243,8 @@ static long ioctl_memcpy(struct fsl_hv_ioctl_memcpy __user *p) sg_list = PTR_ALIGN(sg_list_unaligned, sizeof(struct fh_sg_list)); /* Get the physical addresses of the source buffer */ - down_read(¤t->mm->mmap_sem); - num_pinned = get_user_pages(param.local_vaddr - lb_offset, - num_pages, (param.source == -1) ? 0 : FOLL_WRITE, - pages, NULL); - up_read(¤t->mm->mmap_sem); + num_pinned = get_user_pages_unlocked(param.local_vaddr - lb_offset, + num_pages, pages, (param.source == -1) ? 0 : FOLL_WRITE); if (num_pinned != num_pages) { /* get_user_pages() failed */ diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 34adf9b9c0538815db33f62ed842de49be5222e7..408c174ef0d5c076dc2f51da612b8e0f3dfa8269 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -418,8 +418,7 @@ static int init_vqs(struct virtio_balloon *vb) * optionally stat. */ nvqs = virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_STATS_VQ) ? 
3 : 2; - err = vb->vdev->config->find_vqs(vb->vdev, nvqs, vqs, callbacks, names, - NULL); + err = virtio_find_vqs(vb->vdev, nvqs, vqs, callbacks, names, NULL); if (err) return err; diff --git a/drivers/virtio/virtio_input.c b/drivers/virtio/virtio_input.c index 79f1293cda9327a051feef083871bdbc62038496..3a0468f2ceb08a3c4cd3eac033477b8049ebd89d 100644 --- a/drivers/virtio/virtio_input.c +++ b/drivers/virtio/virtio_input.c @@ -173,8 +173,7 @@ static int virtinput_init_vqs(struct virtio_input *vi) static const char * const names[] = { "events", "status" }; int err; - err = vi->vdev->config->find_vqs(vi->vdev, 2, vqs, cbs, names, - NULL); + err = virtio_find_vqs(vi->vdev, 2, vqs, cbs, names, NULL); if (err) return err; vi->evt = vqs[0]; diff --git a/drivers/virtio/virtio_mmio.c b/drivers/virtio/virtio_mmio.c index 78343b8f9034b35ea7d18e6f8a5b3e3df4bae9e0..74dc7170fd351e02d1732357b0705f66c507f7ce 100644 --- a/drivers/virtio/virtio_mmio.c +++ b/drivers/virtio/virtio_mmio.c @@ -351,7 +351,7 @@ static void vm_del_vqs(struct virtio_device *vdev) static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), - const char *name) + const char *name, bool ctx) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); struct virtio_mmio_vq_info *info; @@ -388,7 +388,7 @@ static struct virtqueue *vm_setup_vq(struct virtio_device *vdev, unsigned index, /* Create the vring */ vq = vring_create_virtqueue(index, num, VIRTIO_MMIO_VRING_ALIGN, vdev, - true, true, vm_notify, callback, name); + true, true, ctx, vm_notify, callback, name); if (!vq) { err = -ENOMEM; goto error_new_virtqueue; @@ -447,6 +447,7 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], + const bool *ctx, struct irq_affinity *desc) { struct virtio_mmio_device *vm_dev = to_virtio_mmio_device(vdev); @@ -459,7 +460,8 @@ static int vm_find_vqs(struct virtio_device *vdev, unsigned nvqs, return err; for (i = 0; i < nvqs; ++i) { - vqs[i] = vm_setup_vq(vdev, i, callbacks[i], names[i]); + vqs[i] = vm_setup_vq(vdev, i, callbacks[i], names[i], + ctx ? 
ctx[i] : false); if (IS_ERR(vqs[i])) { vm_del_vqs(vdev); return PTR_ERR(vqs[i]); diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 698d5d06fa039ca1a27b151a3dcf5d322784e3f0..007a4f3660862e1aa6e9a16207ae54bbbab61d58 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -172,6 +172,7 @@ static int vp_request_msix_vectors(struct virtio_device *vdev, int nvectors, static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), const char *name, + bool ctx, u16 msix_vec) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -183,7 +184,7 @@ static struct virtqueue *vp_setup_vq(struct virtio_device *vdev, unsigned index, if (!info) return ERR_PTR(-ENOMEM); - vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, + vq = vp_dev->setup_vq(vp_dev, info, index, callback, name, ctx, msix_vec); if (IS_ERR(vq)) goto out_info; @@ -274,6 +275,7 @@ void vp_del_vqs(struct virtio_device *vdev) static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], const char * const names[], bool per_vq_vectors, + const bool *ctx, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -315,6 +317,7 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs, else msix_vec = VP_MSIX_VQ_VECTOR; vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i], + ctx ? ctx[i] : false, msix_vec); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); @@ -345,7 +348,7 @@ static int vp_find_vqs_msix(struct virtio_device *vdev, unsigned nvqs, static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[]) + const char * const names[], const bool *ctx) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); int i, err; @@ -367,6 +370,7 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned nvqs, continue; } vqs[i] = vp_setup_vq(vdev, i, callbacks[i], names[i], + ctx ? ctx[i] : false, VIRTIO_MSI_NO_VECTOR); if (IS_ERR(vqs[i])) { err = PTR_ERR(vqs[i]); @@ -383,20 +387,21 @@ static int vp_find_vqs_intx(struct virtio_device *vdev, unsigned nvqs, /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc) + const char * const names[], const bool *ctx, + struct irq_affinity *desc) { int err; /* Try MSI-X with one vector per queue. */ - err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, desc); + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, true, ctx, desc); if (!err) return 0; /* Fallback: MSI-X with one vector for config, one shared for queues. */ - err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, desc); + err = vp_find_vqs_msix(vdev, nvqs, vqs, callbacks, names, false, ctx, desc); if (!err) return 0; /* Finally fall back to regular interrupts. 
*/ - return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names); + return vp_find_vqs_intx(vdev, nvqs, vqs, callbacks, names, ctx); } const char *vp_bus_name(struct virtio_device *vdev) diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index e96334aec1e0d70842d1a9fc53462ab728be87c0..135ee3cf7175881a8259a192ba102978196687c7 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -102,6 +102,7 @@ struct virtio_pci_device { unsigned idx, void (*callback)(struct virtqueue *vq), const char *name, + bool ctx, u16 msix_vec); void (*del_vq)(struct virtio_pci_vq_info *info); @@ -131,7 +132,8 @@ void vp_del_vqs(struct virtio_device *vdev); /* the config->find_vqs() implementation */ int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc); + const char * const names[], const bool *ctx, + struct irq_affinity *desc); const char *vp_bus_name(struct virtio_device *vdev); /* Setup the affinity for a virtqueue: diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c index 4bfa48fb1324660f82ae6272d2e1ecc33522ba3e..2780886e8ba3d393ba4847366983c2ab56a7ed3e 100644 --- a/drivers/virtio/virtio_pci_legacy.c +++ b/drivers/virtio/virtio_pci_legacy.c @@ -116,6 +116,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, unsigned index, void (*callback)(struct virtqueue *vq), const char *name, + bool ctx, u16 msix_vec) { struct virtqueue *vq; @@ -135,7 +136,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, /* create the vring */ vq = vring_create_virtqueue(index, num, VIRTIO_PCI_VRING_ALIGN, &vp_dev->vdev, - true, false, vp_notify, callback, name); + true, false, ctx, + vp_notify, callback, name); if (!vq) return ERR_PTR(-ENOMEM); diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 8978f109d2d79828e5b0c12649debc481dfacd7f..2555d80f6eec4b4a78860b46f453092051b50a24 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -297,6 +297,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, unsigned index, void (*callback)(struct virtqueue *vq), const char *name, + bool ctx, u16 msix_vec) { struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; @@ -328,7 +329,8 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, /* create the vring */ vq = vring_create_virtqueue(index, num, SMP_CACHE_BYTES, &vp_dev->vdev, - true, true, vp_notify, callback, name); + true, true, ctx, + vp_notify, callback, name); if (!vq) return ERR_PTR(-ENOMEM); @@ -387,12 +389,14 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, } static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], vq_callback_t *callbacks[], - const char * const names[], struct irq_affinity *desc) + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char * const names[], const bool *ctx, + struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq; - int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, desc); + int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc); if (rc) return rc; diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 409aeaa49246a0edd7c6da07ca38b58c3f876109..5e1b548828e60745ba87581d2b9bcb6380b092f8 100644 --- a/drivers/virtio/virtio_ring.c 
+++ b/drivers/virtio/virtio_ring.c @@ -263,6 +263,7 @@ static inline int virtqueue_add(struct virtqueue *_vq, unsigned int out_sgs, unsigned int in_sgs, void *data, + void *ctx, gfp_t gfp) { struct vring_virtqueue *vq = to_vvq(_vq); @@ -275,6 +276,7 @@ static inline int virtqueue_add(struct virtqueue *_vq, START_USE(vq); BUG_ON(data == NULL); + BUG_ON(ctx && vq->indirect); if (unlikely(vq->broken)) { END_USE(vq); @@ -389,6 +391,8 @@ static inline int virtqueue_add(struct virtqueue *_vq, vq->desc_state[head].data = data; if (indirect) vq->desc_state[head].indir_desc = desc; + if (ctx) + vq->desc_state[head].indir_desc = ctx; /* Put entry in available array (but don't update avail->idx until they * do sync). */ @@ -461,7 +465,8 @@ int virtqueue_add_sgs(struct virtqueue *_vq, for (sg = sgs[i]; sg; sg = sg_next(sg)) total_sg++; } - return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, data, gfp); + return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, + data, NULL, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_sgs); @@ -483,7 +488,7 @@ int virtqueue_add_outbuf(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 1, 0, data, gfp); + return virtqueue_add(vq, &sg, num, 1, 0, data, NULL, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_outbuf); @@ -505,10 +510,34 @@ int virtqueue_add_inbuf(struct virtqueue *vq, void *data, gfp_t gfp) { - return virtqueue_add(vq, &sg, num, 0, 1, data, gfp); + return virtqueue_add(vq, &sg, num, 0, 1, data, NULL, gfp); } EXPORT_SYMBOL_GPL(virtqueue_add_inbuf); +/** + * virtqueue_add_inbuf_ctx - expose input buffers to other end + * @vq: the struct virtqueue we're talking about. + * @sg: scatterlist (must be well-formed and terminated!) + * @num: the number of entries in @sg writable by other side + * @data: the token identifying the buffer. + * @ctx: extra context for the token + * @gfp: how to do memory allocations (if necessary). + * + * Caller must ensure we don't call this with other virtqueue operations + * at the same time (except where noted). + * + * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO). + */ +int virtqueue_add_inbuf_ctx(struct virtqueue *vq, + struct scatterlist *sg, unsigned int num, + void *data, + void *ctx, + gfp_t gfp) +{ + return virtqueue_add(vq, &sg, num, 0, 1, data, ctx, gfp); +} +EXPORT_SYMBOL_GPL(virtqueue_add_inbuf_ctx); + /** * virtqueue_kick_prepare - first half of split virtqueue_kick call. * @vq: the struct virtqueue @@ -598,7 +627,8 @@ bool virtqueue_kick(struct virtqueue *vq) } EXPORT_SYMBOL_GPL(virtqueue_kick); -static void detach_buf(struct vring_virtqueue *vq, unsigned int head) +static void detach_buf(struct vring_virtqueue *vq, unsigned int head, + void **ctx) { unsigned int i, j; __virtio16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT); @@ -622,10 +652,15 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) /* Plus final descriptor */ vq->vq.num_free++; - /* Free the indirect table, if any, now that it's unmapped. */ - if (vq->desc_state[head].indir_desc) { + if (vq->indirect) { struct vring_desc *indir_desc = vq->desc_state[head].indir_desc; - u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len); + u32 len; + + /* Free the indirect table, if any, now that it's unmapped. 
*/ + if (!indir_desc) + return; + + len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len); BUG_ON(!(vq->vring.desc[head].flags & cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT))); @@ -634,8 +669,10 @@ static void detach_buf(struct vring_virtqueue *vq, unsigned int head) for (j = 0; j < len / sizeof(struct vring_desc); j++) vring_unmap_one(vq, &indir_desc[j]); - kfree(vq->desc_state[head].indir_desc); + kfree(indir_desc); vq->desc_state[head].indir_desc = NULL; + } else if (ctx) { + *ctx = vq->desc_state[head].indir_desc; } } @@ -660,7 +697,8 @@ static inline bool more_used(const struct vring_virtqueue *vq) * Returns NULL if there are no used buffers, or the "data" token * handed to virtqueue_add_*(). */ -void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) +void *virtqueue_get_buf_ctx(struct virtqueue *_vq, unsigned int *len, + void **ctx) { struct vring_virtqueue *vq = to_vvq(_vq); void *ret; @@ -698,7 +736,7 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) /* detach_buf clears data, so grab it now. */ ret = vq->desc_state[i].data; - detach_buf(vq, i); + detach_buf(vq, i, ctx); vq->last_used_idx++; /* If we expect an interrupt for the next entry, tell host * by writing event index and flush out the write before @@ -715,8 +753,13 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) END_USE(vq); return ret; } -EXPORT_SYMBOL_GPL(virtqueue_get_buf); +EXPORT_SYMBOL_GPL(virtqueue_get_buf_ctx); +void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) +{ + return virtqueue_get_buf_ctx(_vq, len, NULL); +} +EXPORT_SYMBOL_GPL(virtqueue_get_buf); /** * virtqueue_disable_cb - disable callbacks * @vq: the struct virtqueue we're talking about. @@ -878,7 +921,7 @@ void *virtqueue_detach_unused_buf(struct virtqueue *_vq) continue; /* detach_buf clears data, so grab it now. */ buf = vq->desc_state[i].data; - detach_buf(vq, i); + detach_buf(vq, i, NULL); vq->avail_idx_shadow--; vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow); END_USE(vq); @@ -916,6 +959,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, struct vring vring, struct virtio_device *vdev, bool weak_barriers, + bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name) @@ -950,7 +994,8 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, vq->last_add_time_valid = false; #endif - vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC); + vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC) && + !context; vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); /* No callback? Tell other side not to bother us. 
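
Taken together, the ring changes above add a second per-buffer pointer that a driver can attach when posting a buffer and recover when the buffer completes. A minimal sketch of a receive path using the new _ctx entry points follows; my_process(), MY_BUF_LEN and the surrounding driver are invented for illustration, and the queue is assumed to have been created with the new context flag set (which, per __vring_new_virtqueue() above, disables indirect descriptors on that ring):

	static int my_post_inbuf(struct virtqueue *vq, void *buf, void *hdr)
	{
		struct scatterlist sg;

		sg_init_one(&sg, buf, MY_BUF_LEN);
		/* hdr rides along with the buffer as the extra context pointer */
		return virtqueue_add_inbuf_ctx(vq, &sg, 1, buf, hdr, GFP_ATOMIC);
	}

	static void my_drain_used(struct virtqueue *vq)
	{
		unsigned int len;
		void *buf, *hdr;

		/* each completion hands back both the data token and its context */
		while ((buf = virtqueue_get_buf_ctx(vq, &len, &hdr)))
			my_process(buf, len, hdr);
	}
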
*/ @@ -1019,6 +1064,7 @@ struct virtqueue *vring_create_virtqueue( struct virtio_device *vdev, bool weak_barriers, bool may_reduce_num, + bool context, bool (*notify)(struct virtqueue *), void (*callback)(struct virtqueue *), const char *name) @@ -1058,7 +1104,7 @@ struct virtqueue *vring_create_virtqueue( queue_size_in_bytes = vring_size(num, vring_align); vring_init(&vring, num, queue, vring_align); - vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, + vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers, context, notify, callback, name); if (!vq) { vring_free_queue(vdev, queue_size_in_bytes, queue, @@ -1079,6 +1125,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, unsigned int vring_align, struct virtio_device *vdev, bool weak_barriers, + bool context, void *pages, bool (*notify)(struct virtqueue *vq), void (*callback)(struct virtqueue *vq), @@ -1086,7 +1133,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index, { struct vring vring; vring_init(&vring, num, pages, vring_align); - return __vring_new_virtqueue(index, vring, vdev, weak_barriers, + return __vring_new_virtqueue(index, vring, vdev, weak_barriers, context, notify, callback, name); } EXPORT_SYMBOL_GPL(vring_new_virtqueue); diff --git a/drivers/vme/vme.c b/drivers/vme/vme.c index 0035cf79760a512563540d58535987bac70aa35c..6a3ead42aba876cef534b8e9acc34b30cf6cef3a 100644 --- a/drivers/vme/vme.c +++ b/drivers/vme/vme.c @@ -76,9 +76,16 @@ static struct vme_bridge *find_bridge(struct vme_resource *resource) } } -/* +/** + * vme_alloc_consistent - Allocate contiguous memory. + * @resource: Pointer to VME resource. + * @size: Size of allocation required. + * @dma: Pointer to variable to store physical address of allocation. + * * Allocate a contiguous block of memory for use by the driver. This is used to * create the buffers for the slave windows. + * + * Return: Virtual address of allocation on success, NULL on failure. */ void *vme_alloc_consistent(struct vme_resource *resource, size_t size, dma_addr_t *dma) @@ -111,8 +118,14 @@ void *vme_alloc_consistent(struct vme_resource *resource, size_t size, } EXPORT_SYMBOL(vme_alloc_consistent); -/* - * Free previously allocated contiguous block of memory. +/** + * vme_free_consistent - Free previously allocated memory. + * @resource: Pointer to VME resource. + * @size: Size of allocation to free. + * @vaddr: Virtual address of allocation. + * @dma: Physical address of allocation. + * + * Free previously allocated block of contiguous memory. */ void vme_free_consistent(struct vme_resource *resource, size_t size, void *vaddr, dma_addr_t dma) @@ -145,6 +158,16 @@ void vme_free_consistent(struct vme_resource *resource, size_t size, } EXPORT_SYMBOL(vme_free_consistent); +/** + * vme_get_size - Helper function returning size of a VME window + * @resource: Pointer to VME slave or master resource. + * + * Determine the size of the VME window provided. This is a helper + * function, wrapping the call to vme_master_get or vme_slave_get + * depending on the type of window resource handed to it. + * + * Return: Size of the window on success, zero on failure. + */ size_t vme_get_size(struct vme_resource *resource) { int enabled, retval; @@ -259,9 +282,16 @@ static u32 vme_get_aspace(int am) return 0; } -/* - * Request a slave image with specific attributes, return some unique - * identifier. +/** + * vme_slave_request - Request a VME slave window resource. + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance.
+ * @address: Required VME address space. + * @cycle: Required VME data transfer cycle type. + * + * Request use of a VME window resource capable of being set for the requested + * address space and data transfer cycle. + * + * Return: Pointer to VME resource on success, NULL on failure. */ struct vme_resource *vme_slave_request(struct vme_dev *vdev, u32 address, u32 cycle) @@ -327,6 +357,23 @@ struct vme_resource *vme_slave_request(struct vme_dev *vdev, u32 address, } EXPORT_SYMBOL(vme_slave_request); +/** + * vme_slave_set - Set VME slave window configuration. + * @resource: Pointer to VME slave resource. + * @enabled: State to which the window should be configured. + * @vme_base: Base address for the window. + * @size: Size of the VME window. + * @buf_base: Base address of buffer used to provide VME slave window storage. + * @aspace: VME address space for the VME window. + * @cycle: VME data transfer cycle type for the VME window. + * + * Set configuration for provided VME slave window. + * + * Return: Zero on success, -EINVAL if operation is not supported on this + * device, if an invalid resource has been provided or invalid + * attributes are provided. Hardware specific errors may also be + * returned. + */ int vme_slave_set(struct vme_resource *resource, int enabled, unsigned long long vme_base, unsigned long long size, dma_addr_t buf_base, u32 aspace, u32 cycle) @@ -362,6 +409,21 @@ int vme_slave_set(struct vme_resource *resource, int enabled, } EXPORT_SYMBOL(vme_slave_set); +/** + * vme_slave_get - Retrieve VME slave window configuration. + * @resource: Pointer to VME slave resource. + * @enabled: Pointer to variable for storing state. + * @vme_base: Pointer to variable for storing window base address. + * @size: Pointer to variable for storing window size. + * @buf_base: Pointer to variable for storing slave buffer base address. + * @aspace: Pointer to variable for storing VME address space. + * @cycle: Pointer to variable for storing VME data transfer cycle type. + * + * Return configuration for provided VME slave window. + * + * Return: Zero on success, -EINVAL if operation is not supported on this + * device or if an invalid resource has been provided. + */ int vme_slave_get(struct vme_resource *resource, int *enabled, unsigned long long *vme_base, unsigned long long *size, dma_addr_t *buf_base, u32 *aspace, u32 *cycle) @@ -386,6 +448,12 @@ int vme_slave_get(struct vme_resource *resource, int *enabled, } EXPORT_SYMBOL(vme_slave_get); +/** + * vme_slave_free - Free VME slave window + * @resource: Pointer to VME slave resource. + * + * Free the provided slave resource so that it may be reallocated. + */ void vme_slave_free(struct vme_resource *resource) { struct vme_slave_resource *slave_image; @@ -415,9 +483,17 @@ void vme_slave_free(struct vme_resource *resource) } EXPORT_SYMBOL(vme_slave_free); -/* - * Request a master image with specific attributes, return some unique - * identifier. +/** + * vme_master_request - Request a VME master window resource. + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * @address: Required VME address space. + * @cycle: Required VME data transfer cycle type. + * @dwidth: Required VME data transfer width. + * + * Request use of a VME window resource capable of being set for the requested + * address space, data transfer cycle and width. + * + * Return: Pointer to VME resource on success, NULL on failure.
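
The slave-window API documented above is typically used in this order: request a window, back it with a coherent buffer, then enable it. A minimal sketch under assumed parameters (A24 space, a 64 KiB window at an arbitrary VME base; error handling trimmed):

	static void *buf;
	static dma_addr_t buf_dma;
	static struct vme_resource *slave;

	static int my_map_slave(struct vme_dev *vdev)
	{
		u32 cycle = VME_SCT | VME_USER | VME_DATA;

		slave = vme_slave_request(vdev, VME_A24, cycle);
		if (!slave)
			return -ENODEV;

		buf = vme_alloc_consistent(slave, 0x10000, &buf_dma);
		if (!buf) {
			vme_slave_free(slave);
			return -ENOMEM;
		}

		/* expose the buffer at VME A24 address 0x10000 */
		return vme_slave_set(slave, 1, 0x10000, 0x10000, buf_dma,
				     VME_A24, cycle);
	}
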
*/ struct vme_resource *vme_master_request(struct vme_dev *vdev, u32 address, u32 cycle, u32 dwidth) @@ -486,6 +562,23 @@ struct vme_resource *vme_master_request(struct vme_dev *vdev, u32 address, } EXPORT_SYMBOL(vme_master_request); +/** + * vme_master_set - Set VME master window configuration. + * @resource: Pointer to VME master resource. + * @enabled: State to which the window should be configured. + * @vme_base: Base address for the window. + * @size: Size of the VME window. + * @aspace: VME address space for the VME window. + * @cycle: VME data transfer cycle type for the VME window. + * @dwidth: VME data transfer width for the VME window. + * + * Set configuration for provided VME master window. + * + * Return: Zero on success, -EINVAL if operation is not supported on this + * device, if an invalid resource has been provided or invalid + * attributes are provided. Hardware specific errors may also be + * returned. + */ int vme_master_set(struct vme_resource *resource, int enabled, unsigned long long vme_base, unsigned long long size, u32 aspace, u32 cycle, u32 dwidth) @@ -522,6 +615,21 @@ int vme_master_set(struct vme_resource *resource, int enabled, } EXPORT_SYMBOL(vme_master_set); +/** + * vme_master_get - Retrieve VME master window configuration. + * @resource: Pointer to VME master resource. + * @enabled: Pointer to variable for storing state. + * @vme_base: Pointer to variable for storing window base address. + * @size: Pointer to variable for storing window size. + * @aspace: Pointer to variable for storing VME address space. + * @cycle: Pointer to variable for storing VME data transfer cycle type. + * @dwidth: Pointer to variable for storing VME data transfer width. + * + * Return configuration for provided VME master window. + * + * Return: Zero on success, -EINVAL if operation is not supported on this + * device or if an invalid resource has been provided. + */ int vme_master_get(struct vme_resource *resource, int *enabled, unsigned long long *vme_base, unsigned long long *size, u32 *aspace, u32 *cycle, u32 *dwidth) @@ -546,8 +654,20 @@ int vme_master_get(struct vme_resource *resource, int *enabled, } EXPORT_SYMBOL(vme_master_get); -/* - * Read data out of VME space into a buffer. +/** + * vme_master_read - Read data from VME space into a buffer. + * @resource: Pointer to VME master resource. + * @buf: Pointer to buffer where data should be transferred. + * @count: Number of bytes to transfer. + * @offset: Offset into VME master window at which to start transfer. + * + * Perform read of count bytes of data from location on VME bus which maps into + * the VME master window at offset to buf. + * + * Return: Number of bytes read, -EINVAL if resource is not a VME master + * resource or read operation is not supported. -EFAULT returned if + * invalid offset is provided. Hardware specific errors may also be + * returned. */ ssize_t vme_master_read(struct vme_resource *resource, void *buf, size_t count, loff_t offset) @@ -583,8 +703,20 @@ ssize_t vme_master_read(struct vme_resource *resource, void *buf, size_t count, } EXPORT_SYMBOL(vme_master_read); -/* - * Write data out to VME space from a buffer. +/** + * vme_master_write - Write data out to VME space from a buffer. + * @resource: Pointer to VME master resource. + * @buf: Pointer to buffer holding data to transfer. + * @count: Number of bytes to transfer. + * @offset: Offset into VME master window at which to start transfer.
+ * + * Perform write of count bytes of data from buf to location on VME bus which + * maps into the VME master window at offset. + * + * Return: Number of bytes written, -EINVAL if resource is not a VME master + * resource or write operation is not supported. -EFAULT returned if + * invalid offset is provided. Hardware specific errors may also be + * returned. */ ssize_t vme_master_write(struct vme_resource *resource, void *buf, size_t count, loff_t offset) @@ -619,8 +751,24 @@ ssize_t vme_master_write(struct vme_resource *resource, } EXPORT_SYMBOL(vme_master_write); -/* - * Perform RMW cycle to provided location. +/** + * vme_master_rmw - Perform read-modify-write cycle. + * @resource: Pointer to VME master resource. + * @mask: Bits to be compared and swapped in operation. + * @compare: Bits to be compared with data read from offset. + * @swap: Bits to be swapped in data read from offset. + * @offset: Offset into VME master window at which to perform operation. + * + * Perform read-modify-write cycle on provided location: + * - Location on VME bus is read. + * - Bits selected by mask are compared with compare. + * - Where a selected bit matches that in compare and is selected in swap, + * the bit is swapped. + * - Result written back to location on VME bus. + * + * Return: Bytes written on success, -EINVAL if resource is not a VME master + * resource or RMW operation is not supported. Hardware specific + * errors may also be returned. + */ unsigned int vme_master_rmw(struct vme_resource *resource, unsigned int mask, unsigned int compare, unsigned int swap, loff_t offset) @@ -644,6 +792,17 @@ unsigned int vme_master_rmw(struct vme_resource *resource, unsigned int mask, } EXPORT_SYMBOL(vme_master_rmw); +/** + * vme_master_mmap - Mmap region of VME master window. + * @resource: Pointer to VME master resource. + * @vma: Pointer to definition of user mapping. + * + * Memory map a region of the VME master window into user space. + * + * Return: Zero on success, -EINVAL if resource is not a VME master + * resource or -EFAULT if map exceeds window size. Other generic mmap + * errors may also be returned. + */ int vme_master_mmap(struct vme_resource *resource, struct vm_area_struct *vma) { struct vme_master_resource *image; @@ -670,6 +829,12 @@ int vme_master_mmap(struct vme_resource *resource, struct vm_area_struct *vma) } EXPORT_SYMBOL(vme_master_mmap); +/** + * vme_master_free - Free VME master window + * @resource: Pointer to VME master resource. + * + * Free the provided master resource so that it may be reallocated. + */ void vme_master_free(struct vme_resource *resource) { struct vme_master_resource *master_image; @@ -699,9 +864,15 @@ void vme_master_free(struct vme_resource *resource) } EXPORT_SYMBOL(vme_master_free); -/* - * Request a DMA controller with specific attributes, return some unique - * identifier. +/** + * vme_dma_request - Request a DMA controller. + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * @route: Required src/destination combination. + * + * Request a VME DMA controller with capability to perform transfers between + * the requested source/destination combination. + * + * Return: Pointer to VME DMA resource on success, NULL on failure. */ struct vme_resource *vme_dma_request(struct vme_dev *vdev, u32 route) { @@ -768,8 +939,15 @@ struct vme_resource *vme_dma_request(struct vme_dev *vdev, u32 route) } EXPORT_SYMBOL(vme_dma_request); -/* - * Start new list +/** + * vme_new_dma_list - Create new VME DMA list.
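
Master windows follow the same request/configure pattern as slave windows, with transfers going through vme_master_read()/vme_master_write() as documented above. A sketch with assumed parameters (a 4 KiB A24/D16 window at an arbitrary base; a real driver would normally keep the window mapped rather than free it per call):

	static int my_read_regs(struct vme_dev *vdev, void *dst, size_t len)
	{
		u32 cycle = VME_SCT | VME_USER | VME_DATA;
		struct vme_resource *win;
		ssize_t n;

		win = vme_master_request(vdev, VME_A24, cycle, VME_D16);
		if (!win)
			return -ENODEV;

		if (vme_master_set(win, 1, 0x200000, 0x1000, VME_A24,
				   cycle, VME_D16)) {
			vme_master_free(win);
			return -EIO;
		}

		n = vme_master_read(win, dst, len, 0); /* offset 0 in window */
		vme_master_free(win);
		return n < 0 ? n : 0;
	}
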
+ * @resource: Pointer to VME DMA resource. + * + * Create a new VME DMA list. It is the responsibility of the user to free + * the list once it is no longer required with vme_dma_list_free(). + * + * Return: Pointer to new VME DMA list, NULL on allocation failure or invalid + * VME DMA resource. */ struct vme_dma_list *vme_new_dma_list(struct vme_resource *resource) { @@ -796,8 +974,16 @@ struct vme_dma_list *vme_new_dma_list(struct vme_resource *resource) } EXPORT_SYMBOL(vme_new_dma_list); -/* - * Create "Pattern" type attributes +/** + * vme_dma_pattern_attribute - Create "Pattern" type VME DMA list attribute. + * @pattern: Value to be used as pattern. + * @type: Type of pattern to be written. + * + * Create VME DMA list attribute for pattern generation. It is the + * responsibility of the user to free used attributes using + * vme_dma_free_attribute(). + * + * Return: Pointer to VME DMA attribute, NULL on failure. */ struct vme_dma_attr *vme_dma_pattern_attribute(u32 pattern, u32 type) { @@ -831,8 +1017,15 @@ struct vme_dma_attr *vme_dma_pattern_attribute(u32 pattern, u32 type) } EXPORT_SYMBOL(vme_dma_pattern_attribute); -/* - * Create "PCI" type attributes +/** + * vme_dma_pci_attribute - Create "PCI" type VME DMA list attribute. + * @address: PCI base address for DMA transfer. + * + * Create VME DMA list attribute pointing to a location on PCI for DMA + * transfers. It is the responsibility of the user to free used attributes + * using vme_dma_free_attribute(). + * + * Return: Pointer to VME DMA attribute, NULL on failure. */ struct vme_dma_attr *vme_dma_pci_attribute(dma_addr_t address) { @@ -869,8 +1062,18 @@ struct vme_dma_attr *vme_dma_pci_attribute(dma_addr_t address) } EXPORT_SYMBOL(vme_dma_pci_attribute); -/* - * Create "VME" type attributes +/** + * vme_dma_vme_attribute - Create "VME" type VME DMA list attribute. + * @address: VME base address for DMA transfer. + * @aspace: VME address space to use for DMA transfer. + * @cycle: VME bus cycle to use for DMA transfer. + * @dwidth: VME data width to use for DMA transfer. + * + * Create VME DMA list attribute pointing to a location on the VME bus for DMA + * transfers. It is the responsibility of the user to free used attributes + * using vme_dma_free_attribute(). + * + * Return: Pointer to VME DMA attribute, NULL on failure. */ struct vme_dma_attr *vme_dma_vme_attribute(unsigned long long address, u32 aspace, u32 cycle, u32 dwidth) @@ -908,8 +1111,12 @@ struct vme_dma_attr *vme_dma_vme_attribute(unsigned long long address, } EXPORT_SYMBOL(vme_dma_vme_attribute); -/* - * Free attribute +/** + * vme_dma_free_attribute - Free DMA list attribute. + * @attributes: Pointer to DMA list attribute. + * + * Free VME DMA list attribute. VME DMA list attributes can be safely freed + * once vme_dma_list_add() has returned. */ void vme_dma_free_attribute(struct vme_dma_attr *attributes) { @@ -918,6 +1125,23 @@ void vme_dma_free_attribute(struct vme_dma_attr *attributes) } EXPORT_SYMBOL(vme_dma_free_attribute); +/** + * vme_dma_list_add - Add entry to a VME DMA list. + * @list: Pointer to VME list. + * @src: Pointer to DMA list attribute to use as source. + * @dest: Pointer to DMA list attribute to use as destination. + * @count: Number of bytes to transfer. + * + * Add an entry to the provided VME DMA list. Entry requires pointers to source + * and destination DMA attributes and a count. + * + * Please note, the attributes supported as sources and destinations for + * transfers are hardware dependent.
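
The DMA list calls documented above compose as: request a controller for a route, build a list from source/destination attributes, then execute it. A sketch moving one page from PCI memory to an assumed VME A32 address (values arbitrary, error paths abbreviated):

	static int my_dma_copy(struct vme_dev *vdev, dma_addr_t src_dma)
	{
		struct vme_resource *chan;
		struct vme_dma_list *list;
		struct vme_dma_attr *src, *dest;
		int ret = -ENOMEM;

		chan = vme_dma_request(vdev, VME_DMA_MEM_TO_VME);
		if (!chan)
			return -ENODEV;

		list = vme_new_dma_list(chan);
		src = vme_dma_pci_attribute(src_dma);
		dest = vme_dma_vme_attribute(0x400000, VME_A32, VME_SCT, VME_D32);
		if (list && src && dest) {
			ret = vme_dma_list_add(list, src, dest, PAGE_SIZE);
			if (!ret)
				ret = vme_dma_list_exec(list);
		}

		/* attributes are safe to free once vme_dma_list_add() returns */
		if (src)
			vme_dma_free_attribute(src);
		if (dest)
			vme_dma_free_attribute(dest);
		if (list)
			vme_dma_list_free(list);
		vme_dma_free(chan);
		return ret;
	}
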
+ * + * Return: Zero on success, -EINVAL if operation is not supported on this + * device or if the linked list has already been submitted for execution. + * Hardware specific errors also possible. + */ int vme_dma_list_add(struct vme_dma_list *list, struct vme_dma_attr *src, struct vme_dma_attr *dest, size_t count) { @@ -942,6 +1166,16 @@ int vme_dma_list_add(struct vme_dma_list *list, struct vme_dma_attr *src, } EXPORT_SYMBOL(vme_dma_list_add); +/** + * vme_dma_list_exec - Queue a VME DMA list for execution. + * @list: Pointer to VME list. + * + * Queue the provided VME DMA list for execution. The call will return once the + * list has been executed. + * + * Return: Zero on success, -EINVAL if operation is not supported on this + * device. Hardware specific errors also possible. + */ int vme_dma_list_exec(struct vme_dma_list *list) { struct vme_bridge *bridge = list->parent->parent; @@ -962,6 +1196,15 @@ int vme_dma_list_exec(struct vme_dma_list *list) } EXPORT_SYMBOL(vme_dma_list_exec); +/** + * vme_dma_list_free - Free a VME DMA list. + * @list: Pointer to VME list. + * + * Free the provided DMA list and all its entries. + * + * Return: Zero on success, -EINVAL on invalid VME resource, -EBUSY if resource + * is still in use. Hardware specific errors also possible. + */ int vme_dma_list_free(struct vme_dma_list *list) { struct vme_bridge *bridge = list->parent->parent; @@ -994,6 +1237,15 @@ int vme_dma_list_free(struct vme_dma_list *list) } EXPORT_SYMBOL(vme_dma_list_free); +/** + * vme_dma_free - Free a VME DMA resource. + * @resource: Pointer to VME DMA resource. + * + * Free the provided DMA resource so that it may be reallocated. + * + * Return: Zero on success, -EINVAL on invalid VME resource, -EBUSY if resource + * is still active. + */ int vme_dma_free(struct vme_resource *resource) { struct vme_dma_resource *ctrlr; @@ -1099,6 +1351,22 @@ void vme_irq_handler(struct vme_bridge *bridge, int level, int statid) } EXPORT_SYMBOL(vme_irq_handler); +/** + * vme_irq_request - Request a specific VME interrupt. + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * @level: Interrupt priority being requested. + * @statid: Interrupt vector being requested. + * @callback: Pointer to callback function called when VME interrupt/vector + * received. + * @priv_data: Generic pointer that will be passed to the callback function. + * + * Request callback to be attached as a handler for VME interrupts with provided + * level and statid. + * + * Return: Zero on success, -EINVAL on invalid vme device, level or if the + * function is not supported, -EBUSY if the level/statid combination is + * already in use. Hardware specific errors also possible. + */ int vme_irq_request(struct vme_dev *vdev, int level, int statid, void (*callback)(int, int, void *), void *priv_data) @@ -1142,6 +1410,14 @@ int vme_irq_request(struct vme_dev *vdev, int level, int statid, } EXPORT_SYMBOL(vme_irq_request); +/** + * vme_irq_free - Free a VME interrupt. + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * @level: Interrupt priority of interrupt being freed. + * @statid: Interrupt vector of interrupt being freed. + * + * Remove previously attached callback from VME interrupt priority/vector. + */ void vme_irq_free(struct vme_dev *vdev, int level, int statid) { struct vme_bridge *bridge; @@ -1177,6 +1453,18 @@ void vme_irq_free(struct vme_dev *vdev, int level, int statid) } EXPORT_SYMBOL(vme_irq_free); +/** + * vme_irq_generate - Generate VME interrupt.
+ * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * @level: Interrupt priority at which to assert the interrupt. + * @statid: Interrupt vector to associate with the interrupt. + * + * Generate a VME interrupt of the provided level and with the provided + * statid. + * + * Return: Zero on success, -EINVAL on invalid vme device, level or if the + * function is not supported. Hardware specific errors also possible. + */ int vme_irq_generate(struct vme_dev *vdev, int level, int statid) { struct vme_bridge *bridge; @@ -1201,8 +1489,15 @@ int vme_irq_generate(struct vme_dev *vdev, int level, int statid) } EXPORT_SYMBOL(vme_irq_generate); -/* - * Request the location monitor, return resource or NULL +/** + * vme_lm_request - Request a VME location monitor + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * + * Allocate a location monitor resource to the driver. A location monitor + * allows the driver to monitor accesses to a contiguous number of + * addresses on the VME bus. + * + * Return: Pointer to a VME resource on success or NULL on failure. */ struct vme_resource *vme_lm_request(struct vme_dev *vdev) { @@ -1218,7 +1513,7 @@ struct vme_resource *vme_lm_request(struct vme_dev *vdev) goto err_bus; } - /* Loop through DMA resources */ + /* Loop through LM resources */ list_for_each(lm_pos, &bridge->lm_resources) { lm = list_entry(lm_pos, struct vme_lm_resource, list); @@ -1264,6 +1559,17 @@ struct vme_resource *vme_lm_request(struct vme_dev *vdev) } EXPORT_SYMBOL(vme_lm_request); +/** + * vme_lm_count - Determine number of VME Addresses monitored + * @resource: Pointer to VME location monitor resource. + * + * The number of contiguous addresses monitored is hardware dependent. + * Return the number of contiguous addresses monitored by the + * location monitor. + * + * Return: Count of addresses monitored or -EINVAL when provided with an + * invalid location monitor resource. + */ int vme_lm_count(struct vme_resource *resource) { struct vme_lm_resource *lm; @@ -1279,6 +1585,20 @@ int vme_lm_count(struct vme_resource *resource) } EXPORT_SYMBOL(vme_lm_count); +/** + * vme_lm_set - Configure location monitor + * @resource: Pointer to VME location monitor resource. + * @lm_base: Base address to monitor. + * @aspace: VME address space to monitor. + * @cycle: VME bus cycle type to monitor. + * + * Set the base address, address space and cycle type of accesses to be + * monitored by the location monitor. + * + * Return: Zero on success, -EINVAL when provided with an invalid location + * monitor resource or function is not supported. Hardware specific + * errors may also be returned. + */ int vme_lm_set(struct vme_resource *resource, unsigned long long lm_base, u32 aspace, u32 cycle) { @@ -1301,6 +1621,20 @@ int vme_lm_set(struct vme_resource *resource, unsigned long long lm_base, } EXPORT_SYMBOL(vme_lm_set); +/** + * vme_lm_get - Retrieve location monitor settings + * @resource: Pointer to VME location monitor resource. + * @lm_base: Pointer used to output the base address monitored. + * @aspace: Pointer used to output the address space monitored. + * @cycle: Pointer used to output the VME bus cycle type monitored. + * + * Retrieve the base address, address space and cycle type of accesses to + * be monitored by the location monitor. + * + * Return: Zero on success, -EINVAL when provided with an invalid location + * monitor resource or function is not supported. Hardware specific + * errors may also be returned. 
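
The interrupt API above (vme_irq_request()/vme_irq_free()/vme_irq_generate()) attaches a handler to a level/vector pair. A sketch using an arbitrary level 3 and vector 0xaa, with a completion as the private data:

	static void my_vme_isr(int level, int statid, void *priv)
	{
		/* runs when the bridge receives level 3 / vector 0xaa */
		complete(priv);
	}

	static int my_hook_irq(struct vme_dev *vdev, struct completion *done)
	{
		return vme_irq_request(vdev, 3, 0xaa, my_vme_isr, done);
	}

	static void my_unhook_irq(struct vme_dev *vdev)
	{
		vme_irq_free(vdev, 3, 0xaa);
	}
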
+ */ int vme_lm_get(struct vme_resource *resource, unsigned long long *lm_base, u32 *aspace, u32 *cycle) { @@ -1323,6 +1657,21 @@ int vme_lm_get(struct vme_resource *resource, unsigned long long *lm_base, } EXPORT_SYMBOL(vme_lm_get); +/** + * vme_lm_attach - Provide callback for location monitor address + * @resource: Pointer to VME location monitor resource. + * @monitor: Offset to which callback should be attached. + * @callback: Pointer to callback function called when triggered. + * @data: Generic pointer that will be passed to the callback function. + * + * Attach a callback to the specified offset into the location monitor's + * monitored addresses. A generic pointer is provided to allow data to be + * passed to the callback when called. + * + * Return: Zero on success, -EINVAL when provided with an invalid location + * monitor resource or function is not supported. Hardware specific + * errors may also be returned. + */ int vme_lm_attach(struct vme_resource *resource, int monitor, void (*callback)(void *), void *data) { @@ -1345,6 +1694,18 @@ int vme_lm_attach(struct vme_resource *resource, int monitor, } EXPORT_SYMBOL(vme_lm_attach); +/** + * vme_lm_detach - Remove callback for location monitor address + * @resource: Pointer to VME location monitor resource. + * @monitor: Offset from which the callback should be removed. + * + * Remove the callback associated with the specified offset into the + * location monitor's monitored addresses. + * + * Return: Zero on success, -EINVAL when provided with an invalid location + * monitor resource or function is not supported. Hardware specific + * errors may also be returned. + */ int vme_lm_detach(struct vme_resource *resource, int monitor) { struct vme_bridge *bridge = find_bridge(resource); @@ -1366,6 +1727,18 @@ int vme_lm_detach(struct vme_resource *resource, int monitor) } EXPORT_SYMBOL(vme_lm_detach); +/** + * vme_lm_free - Free allocated VME location monitor + * @resource: Pointer to VME location monitor resource. + * + * Free allocation of a VME location monitor. + * + * WARNING: This function currently expects that any callbacks that have + * been attached to the location monitor have been removed. + */ void vme_lm_free(struct vme_resource *resource) { struct vme_lm_resource *lm; @@ -1392,6 +1765,16 @@ void vme_lm_free(struct vme_resource *resource) } EXPORT_SYMBOL(vme_lm_free); +/** + * vme_slot_num - Retrieve slot ID + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * + * Retrieve the slot ID associated with the provided VME device. + * + * Return: The slot ID on success, -EINVAL if VME bridge cannot be determined + * or the function is not supported. Hardware specific errors may also + * be returned. + */ int vme_slot_num(struct vme_dev *vdev) { struct vme_bridge *bridge; @@ -1411,6 +1794,15 @@ int vme_slot_num(struct vme_dev *vdev) } EXPORT_SYMBOL(vme_slot_num); +/** + * vme_bus_num - Retrieve bus number + * @vdev: Pointer to VME device struct vme_dev assigned to driver instance. + * + * Retrieve the bus enumeration associated with the provided VME device. + * + * Return: The bus number on success, -EINVAL if VME bridge cannot be + * determined.
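
A location monitor is requested once, pointed at a base address, and then given per-offset callbacks, as documented above. A sketch with assumed values (A16 base 0x6000, callback attached to the first monitored offset):

	static void my_lm_event(void *data)
	{
		atomic_inc(data);	/* count hits on the monitored address */
	}

	static int my_setup_lm(struct vme_dev *vdev, atomic_t *hits)
	{
		struct vme_resource *lm;
		int err;

		lm = vme_lm_request(vdev);
		if (!lm)
			return -ENODEV;

		err = vme_lm_set(lm, 0x6000, VME_A16,
				 VME_SCT | VME_USER | VME_DATA);
		if (!err)
			err = vme_lm_attach(lm, 0, my_lm_event, hits);
		if (err)
			vme_lm_free(lm);
		return err;
	}
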
+ */ int vme_bus_num(struct vme_dev *vdev) { struct vme_bridge *bridge; @@ -1556,6 +1948,15 @@ static int __vme_register_driver(struct vme_driver *drv, unsigned int ndevs) return err; } +/** + * vme_register_driver - Register a VME driver + * @drv: Pointer to VME driver structure to register. + * @ndevs: Maximum number of devices to allow to be enumerated. + * + * Register a VME device driver with the VME subsystem. + * + * Return: Zero on success, error value on registration failure. + */ int vme_register_driver(struct vme_driver *drv, unsigned int ndevs) { int err; @@ -1576,6 +1977,12 @@ int vme_register_driver(struct vme_driver *drv, unsigned int ndevs) } EXPORT_SYMBOL(vme_register_driver); +/** + * vme_unregister_driver - Unregister a VME driver + * @drv: Pointer to VME driver structure to unregister. + * + * Unregister a VME device driver from the VME subsystem. + */ void vme_unregister_driver(struct vme_driver *drv) { struct vme_dev *dev, *dev_tmp; diff --git a/drivers/w1/masters/matrox_w1.c b/drivers/w1/masters/matrox_w1.c index 3749db8b43969253c2089419f980d5553ca140c2..97a676bf5989eba00781b1a2c6387eff4b9d53d8 100644 --- a/drivers/w1/masters/matrox_w1.c +++ b/drivers/w1/masters/matrox_w1.c @@ -36,7 +36,6 @@ #include "../w1.h" #include "../w1_int.h" -#include "../w1_log.h" MODULE_LICENSE("GPL"); MODULE_AUTHOR("Evgeniy Polyakov "); @@ -157,9 +156,6 @@ static int matrox_w1_probe(struct pci_dev *pdev, const struct pci_device_id *ent struct matrox_device *dev; int err; - assert(pdev != NULL); - assert(ent != NULL); - if (pdev->vendor != PCI_VENDOR_ID_MATROX || pdev->device != PCI_DEVICE_ID_MATROX_G400) return -ENODEV; @@ -224,8 +220,6 @@ static void matrox_w1_remove(struct pci_dev *pdev) { struct matrox_device *dev = pci_get_drvdata(pdev); - assert(dev != NULL); - if (dev->found) { w1_remove_master_device(dev->bus_master); iounmap(dev->virt_addr); diff --git a/drivers/w1/slaves/Kconfig b/drivers/w1/slaves/Kconfig index 0ef9f2663dbd1f81b5f08b1b667c06303f7fa46b..fb68465908f21371a31f0656d101996d568cb26c 100644 --- a/drivers/w1/slaves/Kconfig +++ b/drivers/w1/slaves/Kconfig @@ -86,6 +86,12 @@ config W1_SLAVE_DS2433_CRC Each block has 30 bytes of data and a two byte CRC16. Full block writes are only allowed if the CRC is valid. 
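
Stepping back to the vme_register_driver()/vme_unregister_driver() pair documented above, a skeletal registration looks roughly like this (all names invented; the match callback here claims every enumerated device, where a real driver would probe its hardware first):

	static int my_match(struct vme_dev *vdev)
	{
		return 1;
	}

	static int my_probe(struct vme_dev *vdev)
	{
		dev_info(&vdev->dev, "bus %d, slot %d\n",
			 vme_bus_num(vdev), vme_slot_num(vdev));
		return 0;
	}

	static struct vme_driver my_driver = {
		.name	= "my_vme_driver",
		.match	= my_match,
		.probe	= my_probe,
	};

	static int __init my_init(void)
	{
		/* enumerate at most four devices per bus */
		return vme_register_driver(&my_driver, 4);
	}

	static void __exit my_exit(void)
	{
		vme_unregister_driver(&my_driver);
	}
	module_init(my_init);
	module_exit(my_exit);
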
+config W1_SLAVE_DS2438 + tristate "DS2438 Smart Battery Monitor 0x26 family support" + help + Say Y here if you want to use a 1-wire + DS2438 Smart Battery Monitor device support + config W1_SLAVE_DS2760 tristate "Dallas 2760 battery monitor chip (HP iPAQ & others)" help diff --git a/drivers/w1/slaves/Makefile b/drivers/w1/slaves/Makefile index b4a358955ef9b89a2bca762be87f3f2f0d78497d..54c63e4203024d2dcc34dd69f31665f19f01aa9e 100644 --- a/drivers/w1/slaves/Makefile +++ b/drivers/w1/slaves/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_W1_SLAVE_DS2406) += w1_ds2406.o obj-$(CONFIG_W1_SLAVE_DS2423) += w1_ds2423.o obj-$(CONFIG_W1_SLAVE_DS2431) += w1_ds2431.o obj-$(CONFIG_W1_SLAVE_DS2433) += w1_ds2433.o +obj-$(CONFIG_W1_SLAVE_DS2438) += w1_ds2438.o obj-$(CONFIG_W1_SLAVE_DS2760) += w1_ds2760.o obj-$(CONFIG_W1_SLAVE_DS2780) += w1_ds2780.o obj-$(CONFIG_W1_SLAVE_DS2781) += w1_ds2781.o diff --git a/drivers/w1/slaves/w1_ds2438.c b/drivers/w1/slaves/w1_ds2438.c new file mode 100644 index 0000000000000000000000000000000000000000..5ededb4965e143ab774d2705b2ad1e6b0d4fa51b --- /dev/null +++ b/drivers/w1/slaves/w1_ds2438.c @@ -0,0 +1,390 @@ +/* + * 1-Wire implementation for the ds2438 chip + * + * Copyright (c) 2017 Mariusz Bialonczyk + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include +#include +#include +#include +#include + +#include "../w1.h" +#include "../w1_family.h" + +#define W1_DS2438_RETRIES 3 + +/* Memory commands */ +#define W1_DS2438_READ_SCRATCH 0xBE +#define W1_DS2438_WRITE_SCRATCH 0x4E +#define W1_DS2438_COPY_SCRATCH 0x48 +#define W1_DS2438_RECALL_MEMORY 0xB8 +/* Register commands */ +#define W1_DS2438_CONVERT_TEMP 0x44 +#define W1_DS2438_CONVERT_VOLTAGE 0xB4 + +#define DS2438_PAGE_SIZE 8 +#define DS2438_ADC_INPUT_VAD 0 +#define DS2438_ADC_INPUT_VDD 1 +#define DS2438_MAX_CONVERSION_TIME 10 /* ms */ + +/* Page #0 definitions */ +#define DS2438_STATUS_REG 0x00 /* Status/Configuration Register */ +#define DS2438_STATUS_IAD (1 << 0) /* Current A/D Control Bit */ +#define DS2438_STATUS_CA (1 << 1) /* Current Accumulator Configuration */ +#define DS2438_STATUS_EE (1 << 2) /* Current Accumulator Shadow Selector bit */ +#define DS2438_STATUS_AD (1 << 3) /* Voltage A/D Input Select Bit */ +#define DS2438_STATUS_TB (1 << 4) /* Temperature Busy Flag */ +#define DS2438_STATUS_NVB (1 << 5) /* Nonvolatile Memory Busy Flag */ +#define DS2438_STATUS_ADB (1 << 6) /* A/D Converter Busy Flag */ + +#define DS2438_TEMP_LSB 0x01 +#define DS2438_TEMP_MSB 0x02 +#define DS2438_VOLTAGE_LSB 0x03 +#define DS2438_VOLTAGE_MSB 0x04 +#define DS2438_CURRENT_LSB 0x05 +#define DS2438_CURRENT_MSB 0x06 +#define DS2438_THRESHOLD 0x07 + +int w1_ds2438_get_page(struct w1_slave *sl, int pageno, u8 *buf) +{ + unsigned int retries = W1_DS2438_RETRIES; + u8 w1_buf[2]; + u8 crc; + size_t count; + + while (retries--) { + crc = 0; + + if (w1_reset_select_slave(sl)) + continue; + w1_buf[0] = W1_DS2438_RECALL_MEMORY; + w1_buf[1] = 0x00; + w1_write_block(sl->master, w1_buf, 2); + + if (w1_reset_select_slave(sl)) + continue; + w1_buf[0] = W1_DS2438_READ_SCRATCH; + w1_buf[1] = 0x00; + w1_write_block(sl->master, w1_buf, 2); + + count = w1_read_block(sl->master, buf, DS2438_PAGE_SIZE + 1); + if (count == DS2438_PAGE_SIZE + 1) { + crc = w1_calc_crc8(buf, DS2438_PAGE_SIZE); + + /* check for correct CRC */ + if ((u8)buf[DS2438_PAGE_SIZE] == crc) + return 0; + } + } + return -1; +} + +int w1_ds2438_get_temperature(struct w1_slave *sl, int16_t *temperature) +{ + 
unsigned int retries = W1_DS2438_RETRIES; + u8 w1_buf[DS2438_PAGE_SIZE + 1 /*for CRC*/]; + unsigned int tm = DS2438_MAX_CONVERSION_TIME; + unsigned long sleep_rem; + int ret; + + mutex_lock(&sl->master->bus_mutex); + + while (retries--) { + if (w1_reset_select_slave(sl)) + continue; + w1_write_8(sl->master, W1_DS2438_CONVERT_TEMP); + + mutex_unlock(&sl->master->bus_mutex); + sleep_rem = msleep_interruptible(tm); + if (sleep_rem != 0) { + ret = -1; + goto post_unlock; + } + + if (mutex_lock_interruptible(&sl->master->bus_mutex) != 0) { + ret = -1; + goto post_unlock; + } + + break; + } + + if (w1_ds2438_get_page(sl, 0, w1_buf) == 0) { + *temperature = (((int16_t) w1_buf[DS2438_TEMP_MSB]) << 8) | ((uint16_t) w1_buf[DS2438_TEMP_LSB]); + ret = 0; + } else + ret = -1; + + mutex_unlock(&sl->master->bus_mutex); + +post_unlock: + return ret; +} + +int w1_ds2438_change_config_bit(struct w1_slave *sl, u8 mask, u8 value) +{ + unsigned int retries = W1_DS2438_RETRIES; + u8 w1_buf[3]; + u8 status; + int perform_write = 0; + + while (retries--) { + if (w1_reset_select_slave(sl)) + continue; + w1_buf[0] = W1_DS2438_RECALL_MEMORY; + w1_buf[1] = 0x00; + w1_write_block(sl->master, w1_buf, 2); + + if (w1_reset_select_slave(sl)) + continue; + w1_buf[0] = W1_DS2438_READ_SCRATCH; + w1_buf[1] = 0x00; + w1_write_block(sl->master, w1_buf, 2); + + /* reading one byte of result */ + status = w1_read_8(sl->master); + + /* if bit0=1, set a value to a mask for easy compare */ + if (value) + value = mask; + + if ((status & mask) == value) + return 0; /* already set as requested */ + else { + /* changing bit */ + status ^= mask; + perform_write = 1; + } + break; + } + + if (perform_write) { + retries = W1_DS2438_RETRIES; + while (retries--) { + if (w1_reset_select_slave(sl)) + continue; + w1_buf[0] = W1_DS2438_WRITE_SCRATCH; + w1_buf[1] = 0x00; + w1_buf[2] = status; + w1_write_block(sl->master, w1_buf, 3); + + if (w1_reset_select_slave(sl)) + continue; + w1_buf[0] = W1_DS2438_COPY_SCRATCH; + w1_buf[1] = 0x00; + w1_write_block(sl->master, w1_buf, 2); + + return 0; + } + } + return -1; +} + +uint16_t w1_ds2438_get_voltage(struct w1_slave *sl, int adc_input, uint16_t *voltage) +{ + unsigned int retries = W1_DS2438_RETRIES; + u8 w1_buf[DS2438_PAGE_SIZE + 1 /*for CRC*/]; + unsigned int tm = DS2438_MAX_CONVERSION_TIME; + unsigned long sleep_rem; + int ret; + + mutex_lock(&sl->master->bus_mutex); + + if (w1_ds2438_change_config_bit(sl, DS2438_STATUS_AD, adc_input)) { + ret = -1; + goto pre_unlock; + } + + while (retries--) { + if (w1_reset_select_slave(sl)) + continue; + w1_write_8(sl->master, W1_DS2438_CONVERT_VOLTAGE); + + mutex_unlock(&sl->master->bus_mutex); + sleep_rem = msleep_interruptible(tm); + if (sleep_rem != 0) { + ret = -1; + goto post_unlock; + } + + if (mutex_lock_interruptible(&sl->master->bus_mutex) != 0) { + ret = -1; + goto post_unlock; + } + + break; + } + + if (w1_ds2438_get_page(sl, 0, w1_buf) == 0) { + *voltage = (((uint16_t) w1_buf[DS2438_VOLTAGE_MSB]) << 8) | ((uint16_t) w1_buf[DS2438_VOLTAGE_LSB]); + ret = 0; + } else + ret = -1; + +pre_unlock: + mutex_unlock(&sl->master->bus_mutex); + +post_unlock: + return ret; +} + +static ssize_t iad_write(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t off, size_t count) +{ + struct w1_slave *sl = kobj_to_w1_slave(kobj); + int ret; + + if (count != 1 || off != 0) + return -EFAULT; + + mutex_lock(&sl->master->bus_mutex); + + if (w1_ds2438_change_config_bit(sl, DS2438_STATUS_IAD, *buf & 0x01) == 0) + ret = 1; + else + 
ret = -EIO; + + mutex_unlock(&sl->master->bus_mutex); + + return ret; +} + +static ssize_t page0_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t off, size_t count) +{ + struct w1_slave *sl = kobj_to_w1_slave(kobj); + int ret; + u8 w1_buf[DS2438_PAGE_SIZE + 1 /*for CRC*/]; + + if (off != 0) + return 0; + if (!buf) + return -EINVAL; + + mutex_lock(&sl->master->bus_mutex); + + if (w1_ds2438_get_page(sl, 0, w1_buf) == 0) { + memcpy(buf, &w1_buf, DS2438_PAGE_SIZE); + ret = DS2438_PAGE_SIZE; + } else + ret = -EIO; + + mutex_unlock(&sl->master->bus_mutex); + + return ret; +} + +static ssize_t temperature_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t off, size_t count) +{ + struct w1_slave *sl = kobj_to_w1_slave(kobj); + int ret; + ssize_t c = PAGE_SIZE; + int16_t temp; + + if (off != 0) + return 0; + if (!buf) + return -EINVAL; + + if (w1_ds2438_get_temperature(sl, &temp) == 0) { + c -= snprintf(buf + PAGE_SIZE - c, c, "%d\n", temp); + ret = PAGE_SIZE - c; + } else + ret = -EIO; + + return ret; +} + +static ssize_t vad_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t off, size_t count) +{ + struct w1_slave *sl = kobj_to_w1_slave(kobj); + int ret; + ssize_t c = PAGE_SIZE; + uint16_t voltage; + + if (off != 0) + return 0; + if (!buf) + return -EINVAL; + + if (w1_ds2438_get_voltage(sl, DS2438_ADC_INPUT_VAD, &voltage) == 0) { + c -= snprintf(buf + PAGE_SIZE - c, c, "%d\n", voltage); + ret = PAGE_SIZE - c; + } else + ret = -EIO; + + return ret; +} + +static ssize_t vdd_read(struct file *filp, struct kobject *kobj, + struct bin_attribute *bin_attr, char *buf, + loff_t off, size_t count) +{ + struct w1_slave *sl = kobj_to_w1_slave(kobj); + int ret; + ssize_t c = PAGE_SIZE; + uint16_t voltage; + + if (off != 0) + return 0; + if (!buf) + return -EINVAL; + + if (w1_ds2438_get_voltage(sl, DS2438_ADC_INPUT_VDD, &voltage) == 0) { + c -= snprintf(buf + PAGE_SIZE - c, c, "%d\n", voltage); + ret = PAGE_SIZE - c; + } else + ret = -EIO; + + return ret; +} + +static BIN_ATTR(iad, S_IRUGO | S_IWUSR | S_IWGRP, NULL, iad_write, 1); +static BIN_ATTR_RO(page0, DS2438_PAGE_SIZE); +static BIN_ATTR_RO(temperature, 0/* real length varies */); +static BIN_ATTR_RO(vad, 0/* real length varies */); +static BIN_ATTR_RO(vdd, 0/* real length varies */); + +static struct bin_attribute *w1_ds2438_bin_attrs[] = { + &bin_attr_iad, + &bin_attr_page0, + &bin_attr_temperature, + &bin_attr_vad, + &bin_attr_vdd, + NULL, +}; + +static const struct attribute_group w1_ds2438_group = { + .bin_attrs = w1_ds2438_bin_attrs, +}; + +static const struct attribute_group *w1_ds2438_groups[] = { + &w1_ds2438_group, + NULL, +}; + +static struct w1_family_ops w1_ds2438_fops = { + .groups = w1_ds2438_groups, +}; + +static struct w1_family w1_ds2438_family = { + .fid = W1_FAMILY_DS2438, + .fops = &w1_ds2438_fops, +}; +module_w1_family(w1_ds2438_family); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mariusz Bialonczyk "); +MODULE_DESCRIPTION("1-wire driver for Maxim/Dallas DS2438 Smart Battery Monitor"); +MODULE_ALIAS("w1-family-" __stringify(W1_FAMILY_DS2438)); diff --git a/drivers/w1/slaves/w1_ds2760.h b/drivers/w1/slaves/w1_ds2760.h index 58e774141568eb11846fab47ae6e9c822993d673..24168c94eeaef0f8265de85f161fef0914d2eb05 100644 --- a/drivers/w1/slaves/w1_ds2760.h +++ b/drivers/w1/slaves/w1_ds2760.h @@ -24,11 +24,13 @@ #define DS2760_DATA_SIZE 0x40 #define DS2760_PROTECTION_REG 0x00 + #define 
DS2760_STATUS_REG 0x01 - #define DS2760_STATUS_IE (1 << 2) - #define DS2760_STATUS_SWEN (1 << 3) - #define DS2760_STATUS_RNAOP (1 << 4) - #define DS2760_STATUS_PMOD (1 << 5) +#define DS2760_STATUS_IE (1 << 2) +#define DS2760_STATUS_SWEN (1 << 3) +#define DS2760_STATUS_RNAOP (1 << 4) +#define DS2760_STATUS_PMOD (1 << 5) + #define DS2760_EEPROM_REG 0x07 #define DS2760_SPECIAL_FEATURE_REG 0x08 #define DS2760_VOLTAGE_MSB 0x0c diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c index 90a3d9338fd282a5bc1c8a678c4a505544b86f70..8511d1685db9e8fb3c1dca0d2832ae641353ad4c 100644 --- a/drivers/w1/w1.c +++ b/drivers/w1/w1.c @@ -29,7 +29,6 @@ #include #include "w1.h" -#include "w1_log.h" #include "w1_int.h" #include "w1_family.h" #include "w1_netlink.h" diff --git a/drivers/w1/w1_family.h b/drivers/w1/w1_family.h index c4a6b257a367ffc9e4ce0cf108623ca052c4d84e..869a3ff87d2952ef062ca401d63d947ab7621475 100644 --- a/drivers/w1/w1_family.h +++ b/drivers/w1/w1_family.h @@ -29,6 +29,7 @@ #define W1_COUNTER_DS2423 0x1D #define W1_THERM_DS1822 0x22 #define W1_EEPROM_DS2433 0x23 +#define W1_FAMILY_DS2438 0x26 #define W1_THERM_DS18B20 0x28 #define W1_FAMILY_DS2408 0x29 #define W1_EEPROM_DS2431 0x2D diff --git a/drivers/w1/w1_int.c b/drivers/w1/w1_int.c index 2cae7b29bb5fb5b72803b9d82d65d636b130bacc..1072a2e620bb16f7546c4c0d89da27b998a3fed3 100644 --- a/drivers/w1/w1_int.c +++ b/drivers/w1/w1_int.c @@ -22,7 +22,6 @@ #include #include "w1.h" -#include "w1_log.h" #include "w1_netlink.h" #include "w1_int.h" diff --git a/drivers/w1/w1_io.c b/drivers/w1/w1_io.c index de8bebc278967552222d025dc143054af040a8af..1134e6b1eb023b26ea2f978c046d7796be5f69f6 100644 --- a/drivers/w1/w1_io.c +++ b/drivers/w1/w1_io.c @@ -19,7 +19,6 @@ #include #include "w1.h" -#include "w1_log.h" static int w1_delay_parm = 1; module_param_named(delay_coef, w1_delay_parm, int, 0); diff --git a/drivers/w1/w1_log.h b/drivers/w1/w1_log.h deleted file mode 100644 index dd1422b6afbb11a9fb2376621dff049378c4fc42..0000000000000000000000000000000000000000 --- a/drivers/w1/w1_log.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2004 Evgeniy Polyakov - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - */ - -#ifndef __W1_LOG_H -#define __W1_LOG_H - -#define DEBUG - -#ifdef W1_DEBUG -# define assert(expr) do {} while (0) -#else -# define assert(expr) \ - if(unlikely(!(expr))) { \ - pr_err("Assertion failed! %s,%s,%s,line=%d\n", \ - #expr, __FILE__, __func__, __LINE__); \ - } -#endif - -#endif /* __W1_LOG_H */ - diff --git a/drivers/w1/w1_netlink.c b/drivers/w1/w1_netlink.c index 49e520ca79c5395215701d507c8323391cc6ce93..027950f997d12cb91642ebedf8b18e3242b798c1 100644 --- a/drivers/w1/w1_netlink.c +++ b/drivers/w1/w1_netlink.c @@ -18,13 +18,10 @@ #include #include "w1.h" -#include "w1_log.h" #include "w1_netlink.h" #if defined(CONFIG_W1_CON) && (defined(CONFIG_CONNECTOR) || (defined(CONFIG_CONNECTOR_MODULE) && defined(CONFIG_W1_MODULE))) -#define MIN(a, b) (((a) < (b)) ? (a) : (b)) - /* Bundle together everything required to process a request in one memory * allocation. 
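
The DS2438 driver added above exposes its measurements as sysfs bin attributes (iad, page0, temperature, vad, vdd). A userspace sketch reading the temperature file; the device id below is made up, and the 1/256 degC scaling comes from the DS2438 data sheet rather than from this patch:

	#include <stdio.h>

	int main(void)
	{
		/* family 0x26 devices appear under a 26-<serial> directory */
		FILE *f = fopen("/sys/bus/w1/devices/26-000000abcdef/temperature", "r");
		int raw = 0;

		if (!f)
			return 1;
		if (fscanf(f, "%d", &raw) != 1)
			raw = 0;
		fclose(f);
		printf("%.2f C\n", raw / 256.0);	/* signed 1/256 degC units */
		return 0;
	}
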
*/ @@ -598,7 +595,7 @@ static void w1_cn_callback(struct cn_msg *cn, struct netlink_skb_parms *nsp) sizeof(struct w1_netlink_msg) + sizeof(struct w1_netlink_cmd)); } - reply_size = MIN(CONNECTOR_MAX_MSG_SIZE, reply_size); + reply_size = min(CONNECTOR_MAX_MSG_SIZE, reply_size); /* allocate space for the block, a copy of the original message, * one node per cmd to point into the original message, diff --git a/drivers/watchdog/cpu5wdt.c b/drivers/watchdog/cpu5wdt.c index 6d03e8e30f8bf4369cfaa6cecde6cb2a6dccb03c..6c3f78e45c265da4b6f9cfefdb8af16a96b30874 100644 --- a/drivers/watchdog/cpu5wdt.c +++ b/drivers/watchdog/cpu5wdt.c @@ -289,7 +289,7 @@ MODULE_DESCRIPTION("sma cpu5 watchdog driver"); MODULE_SUPPORTED_DEVICE("sma cpu5 watchdog"); MODULE_LICENSE("GPL"); -module_param(port, int, 0); +module_param_hw(port, int, ioport, 0); MODULE_PARM_DESC(port, "base address of watchdog card, default is 0x91"); module_param(verbose, int, 0); diff --git a/drivers/watchdog/eurotechwdt.c b/drivers/watchdog/eurotechwdt.c index 23ee53240c4c1858f7d8aa0f7530d43dcc44f650..38e96712264f92648304ba8282a2e4ba8be90319 100644 --- a/drivers/watchdog/eurotechwdt.c +++ b/drivers/watchdog/eurotechwdt.c @@ -97,9 +97,9 @@ MODULE_PARM_DESC(nowayout, #define WDT_TIMER_CFG 0xf3 -module_param(io, int, 0); +module_param_hw(io, int, ioport, 0); MODULE_PARM_DESC(io, "Eurotech WDT io port (default=0x3f0)"); -module_param(irq, int, 0); +module_param_hw(irq, int, irq, 0); MODULE_PARM_DESC(irq, "Eurotech WDT irq (default=10)"); module_param(ev, charp, 0); MODULE_PARM_DESC(ev, "Eurotech WDT event type (default is `int')"); diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c index 70c7194e2810dff05ddc0535dc5d8d7579ab7e77..67fbe35ce7cfe5438f18b0ffb4ebdbfdc057b8fe 100644 --- a/drivers/watchdog/hpwdt.c +++ b/drivers/watchdog/hpwdt.c @@ -34,7 +34,7 @@ #include #include #include -#include +#include #endif /* CONFIG_HPWDT_NMI_DECODING */ #include #include diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index 3d0abc0d59b4f176d62ce30ebc236471cde051d2..347f0389b0899d4183021254203e0a0938600267 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -106,6 +106,10 @@ struct iTCO_wdt_private { struct pci_dev *pci_dev; /* whether or not the watchdog has been suspended */ bool suspended; + /* no reboot API private data */ + void *no_reboot_priv; + /* no reboot update function pointer */ + int (*update_no_reboot_bit)(void *p, bool set); }; /* module parameters */ @@ -170,46 +174,68 @@ static inline u32 no_reboot_bit(struct iTCO_wdt_private *p) return enable_bit; } -static void iTCO_wdt_set_NO_REBOOT_bit(struct iTCO_wdt_private *p) +static int update_no_reboot_bit_def(void *priv, bool set) { - u32 val32; + return 0; +} - /* Set the NO_REBOOT bit: this disables reboots */ - if (p->iTCO_version >= 2) { - val32 = readl(p->gcs_pmc); - val32 |= no_reboot_bit(p); - writel(val32, p->gcs_pmc); - } else if (p->iTCO_version == 1) { - pci_read_config_dword(p->pci_dev, 0xd4, &val32); +static int update_no_reboot_bit_pci(void *priv, bool set) +{ + struct iTCO_wdt_private *p = priv; + u32 val32 = 0, newval32 = 0; + + pci_read_config_dword(p->pci_dev, 0xd4, &val32); + if (set) val32 |= no_reboot_bit(p); - pci_write_config_dword(p->pci_dev, 0xd4, val32); - } + else + val32 &= ~no_reboot_bit(p); + pci_write_config_dword(p->pci_dev, 0xd4, val32); + pci_read_config_dword(p->pci_dev, 0xd4, &newval32); + + /* make sure the update is successful */ + if (val32 != newval32) + return -EIO; + + return 0; } -static int 
iTCO_wdt_unset_NO_REBOOT_bit(struct iTCO_wdt_private *p) +static int update_no_reboot_bit_mem(void *priv, bool set) { - u32 enable_bit = no_reboot_bit(p); - u32 val32 = 0; + struct iTCO_wdt_private *p = priv; + u32 val32 = 0, newval32 = 0; - /* Unset the NO_REBOOT bit: this enables reboots */ - if (p->iTCO_version >= 2) { - val32 = readl(p->gcs_pmc); - val32 &= ~enable_bit; - writel(val32, p->gcs_pmc); + val32 = readl(p->gcs_pmc); + if (set) + val32 |= no_reboot_bit(p); + else + val32 &= ~no_reboot_bit(p); + writel(val32, p->gcs_pmc); + newval32 = readl(p->gcs_pmc); - val32 = readl(p->gcs_pmc); - } else if (p->iTCO_version == 1) { - pci_read_config_dword(p->pci_dev, 0xd4, &val32); - val32 &= ~enable_bit; - pci_write_config_dword(p->pci_dev, 0xd4, val32); + /* make sure the update is successful */ + if (val32 != newval32) + return -EIO; - pci_read_config_dword(p->pci_dev, 0xd4, &val32); + return 0; +} + +static void iTCO_wdt_no_reboot_bit_setup(struct iTCO_wdt_private *p, + struct itco_wdt_platform_data *pdata) +{ + if (pdata->update_no_reboot_bit) { + p->update_no_reboot_bit = pdata->update_no_reboot_bit; + p->no_reboot_priv = pdata->no_reboot_priv; + return; } - if (val32 & enable_bit) - return -EIO; + if (p->iTCO_version >= 2) + p->update_no_reboot_bit = update_no_reboot_bit_mem; + else if (p->iTCO_version == 1) + p->update_no_reboot_bit = update_no_reboot_bit_pci; + else + p->update_no_reboot_bit = update_no_reboot_bit_def; - return 0; + p->no_reboot_priv = p; } static int iTCO_wdt_start(struct watchdog_device *wd_dev) @@ -222,7 +248,7 @@ static int iTCO_wdt_start(struct watchdog_device *wd_dev) iTCO_vendor_pre_start(p->smi_res, wd_dev->timeout); /* disable chipset's NO_REBOOT bit */ - if (iTCO_wdt_unset_NO_REBOOT_bit(p)) { + if (p->update_no_reboot_bit(p->no_reboot_priv, false)) { spin_unlock(&p->io_lock); pr_err("failed to reset NO_REBOOT flag, reboot disabled by hardware/BIOS\n"); return -EIO; @@ -263,7 +289,7 @@ static int iTCO_wdt_stop(struct watchdog_device *wd_dev) val = inw(TCO1_CNT(p)); /* Set the NO_REBOOT bit to prevent later reboots, just for sure */ - iTCO_wdt_set_NO_REBOOT_bit(p); + p->update_no_reboot_bit(p->no_reboot_priv, true); spin_unlock(&p->io_lock); @@ -428,11 +454,13 @@ static int iTCO_wdt_probe(struct platform_device *pdev) p->iTCO_version = pdata->version; p->pci_dev = to_pci_dev(dev->parent); + iTCO_wdt_no_reboot_bit_setup(p, pdata); + /* * Get the Memory-Mapped GCS or PMC register, we need it for the * NO_REBOOT flag (TCO v2 and v3). 
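
The refactor above turns the NO_REBOOT handling into a callback so that platform code can supply its own accessor through itco_wdt_platform_data. A sketch of what such a provider might look like; my_pmc_update() and the version number are invented, but the callback contract (return 0 on success, nonzero if the readback shows the update did not stick) follows the default helpers above:

	static int my_update_no_reboot_bit(void *priv, bool set)
	{
		struct my_pmc *pmc = priv;

		return my_pmc_update(pmc, MY_PMC_NO_REBOOT, set);
	}

	static struct itco_wdt_platform_data my_pdata = {
		.name			= "my-board iTCO",
		.version		= 4,
		.update_no_reboot_bit	= my_update_no_reboot_bit,
		/* .no_reboot_priv is filled in with the PMC state at probe */
	};
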
*/ - if (p->iTCO_version >= 2) { + if (p->iTCO_version >= 2 && !pdata->update_no_reboot_bit) { p->gcs_pmc_res = platform_get_resource(pdev, IORESOURCE_MEM, ICH_RES_MEM_GCS_PMC); @@ -442,14 +470,14 @@ static int iTCO_wdt_probe(struct platform_device *pdev) } /* Check chipset's NO_REBOOT bit */ - if (iTCO_wdt_unset_NO_REBOOT_bit(p) && + if (p->update_no_reboot_bit(p->no_reboot_priv, false) && iTCO_vendor_check_noreboot_on()) { pr_info("unable to reset NO_REBOOT flag, device disabled by hardware/BIOS\n"); return -ENODEV; /* Cannot reset NO_REBOOT bit */ } /* Set the NO_REBOOT bit to prevent later reboots, just for sure */ - iTCO_wdt_set_NO_REBOOT_bit(p); + p->update_no_reboot_bit(p->no_reboot_priv, true); /* The TCO logic uses the TCO_EN bit in the SMI_EN register */ if (!devm_request_region(dev, p->smi_res->start, diff --git a/drivers/watchdog/pc87413_wdt.c b/drivers/watchdog/pc87413_wdt.c index 9f15dd9435d1a4efe5dd69fbab25363be026fe46..06a892e36a8d889ab1c6936b3209d8cd61a2f8c1 100644 --- a/drivers/watchdog/pc87413_wdt.c +++ b/drivers/watchdog/pc87413_wdt.c @@ -579,7 +579,7 @@ MODULE_AUTHOR("Marcus Junker "); MODULE_DESCRIPTION("PC87413 WDT driver"); MODULE_LICENSE("GPL"); -module_param(io, int, 0); +module_param_hw(io, int, ioport, 0); MODULE_PARM_DESC(io, MODNAME " I/O port (default: " __MODULE_STRING(IO_DEFAULT) ")."); diff --git a/drivers/watchdog/sc1200wdt.c b/drivers/watchdog/sc1200wdt.c index 131193a7acdfd0bb07660095e0ff57c500eed6cb..b34d3d5ba632398e357261bd5e666b9e9efc2157 100644 --- a/drivers/watchdog/sc1200wdt.c +++ b/drivers/watchdog/sc1200wdt.c @@ -88,7 +88,7 @@ MODULE_PARM_DESC(isapnp, "When set to 0 driver ISA PnP support will be disabled"); #endif -module_param(io, int, 0); +module_param_hw(io, int, ioport, 0); MODULE_PARM_DESC(io, "io port"); module_param(timeout, int, 0); MODULE_PARM_DESC(timeout, "range is 0-255 minutes, default is 1"); diff --git a/drivers/watchdog/wdt.c b/drivers/watchdog/wdt.c index e0206b5b7d8955526f9598a6cc012fcdfd090873..e481fbbc4ae706b601dd64e21adcb58a5483c1f0 100644 --- a/drivers/watchdog/wdt.c +++ b/drivers/watchdog/wdt.c @@ -78,9 +78,9 @@ static int irq = 11; static DEFINE_SPINLOCK(wdt_lock); -module_param(io, int, 0); +module_param_hw(io, int, ioport, 0); MODULE_PARM_DESC(io, "WDT io port (default=0x240)"); -module_param(irq, int, 0); +module_param_hw(irq, int, irq, 0); MODULE_PARM_DESC(irq, "WDT irq (default=11)"); /* Support for the Fan Tachometer on the WDT501-P */ diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index a6d4378eb8d9fc8aeea4fe1961f5c0f030ce2060..50dcb68d8070756ef8e4a67af8200c9d5e4c8c20 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -709,6 +709,7 @@ void free_xenballooned_pages(int nr_pages, struct page **pages) } EXPORT_SYMBOL(free_xenballooned_pages); +#ifdef CONFIG_XEN_PV static void __init balloon_add_region(unsigned long start_pfn, unsigned long pages) { @@ -732,19 +733,22 @@ static void __init balloon_add_region(unsigned long start_pfn, balloon_stats.total_pages += extra_pfn_end - start_pfn; } +#endif static int __init balloon_init(void) { - int i; - if (!xen_domain()) return -ENODEV; pr_info("Initialising balloon driver\n"); +#ifdef CONFIG_XEN_PV balloon_stats.current_pages = xen_pv_domain() ? 
min(xen_start_info->nr_pages - xen_released_pages, max_pfn) : get_num_physpages(); +#else + balloon_stats.current_pages = get_num_physpages(); +#endif balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; @@ -761,14 +765,20 @@ static int __init balloon_init(void) register_sysctl_table(xen_root); #endif - /* - * Initialize the balloon with pages from the extra memory - * regions (see arch/x86/xen/setup.c). - */ - for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) - if (xen_extra_mem[i].n_pfns) - balloon_add_region(xen_extra_mem[i].start_pfn, - xen_extra_mem[i].n_pfns); +#ifdef CONFIG_XEN_PV + { + int i; + + /* + * Initialize the balloon with pages from the extra memory + * regions (see arch/x86/xen/setup.c). + */ + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) + if (xen_extra_mem[i].n_pfns) + balloon_add_region(xen_extra_mem[i].start_pfn, + xen_extra_mem[i].n_pfns); + } +#endif return 0; } diff --git a/drivers/xen/efi.c b/drivers/xen/efi.c index 22f71ffd340677cd8c0dae3c06614041e373c1b9..9243a9051078229172daea01e0d621ac089ac050 100644 --- a/drivers/xen/efi.c +++ b/drivers/xen/efi.c @@ -26,6 +26,7 @@ #include #include #include +#include #include @@ -263,3 +264,20 @@ efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, return efi_data(op).status; } EXPORT_SYMBOL_GPL(xen_efi_query_capsule_caps); + +void xen_efi_reset_system(int reset_type, efi_status_t status, + unsigned long data_size, efi_char16_t *data) +{ + switch (reset_type) { + case EFI_RESET_COLD: + case EFI_RESET_WARM: + xen_reboot(SHUTDOWN_reboot); + break; + case EFI_RESET_SHUTDOWN: + xen_reboot(SHUTDOWN_poweroff); + break; + default: + BUG(); + } +} +EXPORT_SYMBOL_GPL(xen_efi_reset_system); diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 6a53577772c92127aa25dac3cd7bc7fe18fe09d9..b52852f38cff9d07e3326e8659113d9fdb2bba39 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -1312,6 +1312,9 @@ static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu) if (!VALID_EVTCHN(evtchn)) return -1; + if (!xen_support_evtchn_rebind()) + return -1; + /* Send future instances of this interrupt to other vcpu. 
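The three watchdog hunks earlier in this chunk (pc87413_wdt, sc1200wdt, wdt.c) convert bare module_param() declarations to module_param_hw(), which records that the parameter names a hardware resource (an ioport or an irq) rather than an ordinary tunable; the point of the annotation is that a locked-down kernel can then refuse user-supplied hardware addresses wholesale. A sketch of the conversion for a hypothetical ISA-style driver:

#include <linux/module.h>
#include <linux/moduleparam.h>

static int io = 0x240;
static int irq = 11;

/* The third argument declares the resource type of the parameter. */
module_param_hw(io, int, ioport, 0);
MODULE_PARM_DESC(io, "demo I/O port (default=0x240)");

module_param_hw(irq, int, irq, 0);
MODULE_PARM_DESC(irq, "demo IRQ line (default=11)");

MODULE_LICENSE("GPL");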
*/ bind_vcpu.port = evtchn; bind_vcpu.vcpu = xen_vcpu_nr(tcpu); @@ -1646,14 +1649,20 @@ void xen_callback_vector(void) int rc; uint64_t callback_via; - callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); - rc = xen_set_callback_via(callback_via); - BUG_ON(rc); - pr_info("Xen HVM callback vector for event delivery is enabled\n"); - /* in the restore case the vector has already been allocated */ - if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) - alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, - xen_hvm_callback_vector); + if (xen_have_vector_callback) { + callback_via = HVM_CALLBACK_VECTOR(HYPERVISOR_CALLBACK_VECTOR); + rc = xen_set_callback_via(callback_via); + if (rc) { + pr_err("Request for Xen HVM callback vector failed\n"); + xen_have_vector_callback = 0; + return; + } + pr_info("Xen HVM callback vector for event delivery is enabled\n"); + /* in the restore case the vector has already been allocated */ + if (!test_bit(HYPERVISOR_CALLBACK_VECTOR, used_vectors)) + alloc_intr_gate(HYPERVISOR_CALLBACK_VECTOR, + xen_hvm_callback_vector); + } } #else void xen_callback_vector(void) {} diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 6890897a6f30edab0c0d87926942463f9b481aa9..10f1ef5826595ae53e927243bcdfa5f66c4121c6 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -87,18 +87,6 @@ struct user_evtchn { bool enabled; }; -static evtchn_port_t *evtchn_alloc_ring(unsigned int size) -{ - evtchn_port_t *ring; - size_t s = size * sizeof(*ring); - - ring = kmalloc(s, GFP_KERNEL); - if (!ring) - ring = vmalloc(s); - - return ring; -} - static void evtchn_free_ring(evtchn_port_t *ring) { kvfree(ring); @@ -334,7 +322,7 @@ static int evtchn_resize_ring(struct per_user_data *u) else new_size = 2 * u->ring_size; - new_ring = evtchn_alloc_ring(new_size); + new_ring = kvmalloc(new_size * sizeof(*new_ring), GFP_KERNEL); if (!new_ring) return -ENOMEM; diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 2a165cc8a43cd6768529ffe48126c172e5b9f7df..1275df83070f2186388cdf09aab3bbf1facf1326 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -90,8 +90,10 @@ static int xen_allocate_irq(struct pci_dev *pdev) static int platform_pci_resume(struct pci_dev *pdev) { int err; - if (!xen_pv_domain()) + + if (xen_have_vector_callback) return 0; + err = xen_set_callback_via(callback_via); if (err) { dev_err(&pdev->dev, "platform_pci_resume failure!\n"); @@ -137,15 +139,7 @@ static int platform_pci_probe(struct pci_dev *pdev, platform_mmio = mmio_addr; platform_mmiolen = mmio_len; - - /* - * Xen HVM guests always use the vector callback mechanism. - * L1 Dom0 in a nested Xen environment is a PV guest inside in an - * HVM environment. It needs the platform-pci driver to get - * notifications from L0 Xen, but it cannot use the vector callback - * as it is not exported by L1 Xen. 
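The evtchn hunk above deletes the driver's own try-kmalloc-then-vmalloc helper in favor of kvmalloc(), which implements exactly that fallback in one call; kvfree(), already used by evtchn_free_ring(), frees either kind of allocation. A before/after sketch with hypothetical names:

#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>

/* Before: the open-coded pattern evtchn_alloc_ring() used. */
static void *ring_alloc_old(size_t bytes)
{
	void *p = kmalloc(bytes, GFP_KERNEL);

	if (!p)
		p = vmalloc(bytes);
	return p;
}

/* After: one call, same contiguous-then-virtual fallback. */
static void *ring_alloc_new(size_t bytes)
{
	return kvmalloc(bytes, GFP_KERNEL);
}

Either result is released with kvfree(), which detects whether the pointer came from the slab or from vmalloc space.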
- */ - if (xen_pv_domain()) { + if (!xen_have_vector_callback) { ret = xen_allocate_irq(pdev); if (ret) { dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret); diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index e8cef1ad0fe31e0139903399d70730c7eafdc399..8dab0d3dc1726b0f8b19e06b26208969fc33a0ea 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -693,8 +693,8 @@ xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma, unsigned long attrs) { #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) - if (__generic_dma_ops(dev)->mmap) - return __generic_dma_ops(dev)->mmap(dev, vma, cpu_addr, + if (xen_get_dma_ops(dev)->mmap) + return xen_get_dma_ops(dev)->mmap(dev, vma, cpu_addr, dma_addr, size, attrs); #endif return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size); @@ -711,7 +711,7 @@ xen_swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt, unsigned long attrs) { #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) - if (__generic_dma_ops(dev)->get_sgtable) { + if (xen_get_dma_ops(dev)->get_sgtable) { #if 0 /* * This check verifies that the page belongs to the current domain and @@ -721,7 +721,7 @@ xen_swiotlb_get_sgtable(struct device *dev, struct sg_table *sgt, unsigned long bfn = PHYS_PFN(dma_to_phys(dev, handle)); BUG_ON (!page_is_ram(bfn)); #endif - return __generic_dma_ops(dev)->get_sgtable(dev, sgt, cpu_addr, + return xen_get_dma_ops(dev)->get_sgtable(dev, sgt, cpu_addr, handle, size, attrs); } #endif diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 328c3987b112dcf84fe40005d936bfbfed601f91..967f069385d0cf1fa9933cdeb0f8804dc0324e06 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -44,14 +44,14 @@ static const struct file_operations capabilities_file_ops = { static int xenfs_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr xenfs_files[] = { + static const struct tree_descr xenfs_files[] = { [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, { "capabilities", &capabilities_file_ops, S_IRUGO }, { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, {""}, }; - static struct tree_descr xenfs_init_files[] = { + static const struct tree_descr xenfs_init_files[] = { [2] = { "xenbus", &xen_xenbus_fops, S_IRUSR|S_IWUSR }, { "capabilities", &capabilities_file_ops, S_IRUGO }, { "privcmd", &xen_privcmd_fops, S_IRUSR|S_IWUSR }, diff --git a/drivers/zorro/zorro-driver.c b/drivers/zorro/zorro-driver.c index eacae1434b73fd7472e2d81b5b86ebc214f3e6af..fa23b7366b98ccbea916e05ad1d810e4c5ad0a50 100644 --- a/drivers/zorro/zorro-driver.c +++ b/drivers/zorro/zorro-driver.c @@ -14,6 +14,8 @@ #include #include +#include "zorro.h" + /** * zorro_match_device - Tell if a Zorro device structure has a matching @@ -161,12 +163,13 @@ static int zorro_uevent(struct device *dev, struct kobj_uevent_env *env) } struct bus_type zorro_bus_type = { - .name = "zorro", - .dev_name = "zorro", - .match = zorro_bus_match, - .uevent = zorro_uevent, - .probe = zorro_device_probe, - .remove = zorro_device_remove, + .name = "zorro", + .dev_name = "zorro", + .dev_groups = zorro_device_attribute_groups, + .match = zorro_bus_match, + .uevent = zorro_uevent, + .probe = zorro_device_probe, + .remove = zorro_device_remove, }; EXPORT_SYMBOL(zorro_bus_type); diff --git a/drivers/zorro/zorro-sysfs.c b/drivers/zorro/zorro-sysfs.c index 9282dbf5abdba6b01e059a45e7fc9895002dbd2d..3d34dba9bb2dd488ffe509792940e51336c1852e 100644 --- a/drivers/zorro/zorro-sysfs.c +++ b/drivers/zorro/zorro-sysfs.c @@ -23,33 
+23,33 @@ /* show configuration fields */ #define zorro_config_attr(name, field, format_string) \ -static ssize_t \ -show_##name(struct device *dev, struct device_attribute *attr, char *buf) \ +static ssize_t name##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ { \ struct zorro_dev *z; \ \ z = to_zorro_dev(dev); \ return sprintf(buf, format_string, z->field); \ } \ -static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL); +static DEVICE_ATTR_RO(name); zorro_config_attr(id, id, "0x%08x\n"); zorro_config_attr(type, rom.er_Type, "0x%02x\n"); zorro_config_attr(slotaddr, slotaddr, "0x%04x\n"); zorro_config_attr(slotsize, slotsize, "0x%04x\n"); -static ssize_t -show_serial(struct device *dev, struct device_attribute *attr, char *buf) +static ssize_t serial_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct zorro_dev *z; z = to_zorro_dev(dev); return sprintf(buf, "0x%08x\n", be32_to_cpu(z->rom.er_SerialNumber)); } +static DEVICE_ATTR_RO(serial); -static DEVICE_ATTR(serial, S_IRUGO, show_serial, NULL); - -static ssize_t zorro_show_resource(struct device *dev, struct device_attribute *attr, char *buf) +static ssize_t resource_show(struct device *dev, struct device_attribute *attr, + char *buf) { struct zorro_dev *z = to_zorro_dev(dev); @@ -58,8 +58,27 @@ static ssize_t zorro_show_resource(struct device *dev, struct device_attribute * (unsigned long)zorro_resource_end(z), zorro_resource_flags(z)); } +static DEVICE_ATTR_RO(resource); + +static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct zorro_dev *z = to_zorro_dev(dev); -static DEVICE_ATTR(resource, S_IRUGO, zorro_show_resource, NULL); + return sprintf(buf, ZORRO_DEVICE_MODALIAS_FMT "\n", z->id); +} +static DEVICE_ATTR_RO(modalias); + +static struct attribute *zorro_device_attrs[] = { + &dev_attr_id.attr, + &dev_attr_type.attr, + &dev_attr_serial.attr, + &dev_attr_slotaddr.attr, + &dev_attr_slotsize.attr, + &dev_attr_resource.attr, + &dev_attr_modalias.attr, + NULL +}; static ssize_t zorro_read_config(struct file *filp, struct kobject *kobj, struct bin_attribute *bin_attr, @@ -88,32 +107,17 @@ static struct bin_attribute zorro_config_attr = { .read = zorro_read_config, }; -static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, - char *buf) -{ - struct zorro_dev *z = to_zorro_dev(dev); - - return sprintf(buf, ZORRO_DEVICE_MODALIAS_FMT "\n", z->id); -} - -static DEVICE_ATTR(modalias, S_IRUGO, modalias_show, NULL); +static struct bin_attribute *zorro_device_bin_attrs[] = { + &zorro_config_attr, + NULL +}; -int zorro_create_sysfs_dev_files(struct zorro_dev *z) -{ - struct device *dev = &z->dev; - int error; - - /* current configuration's attributes */ - if ((error = device_create_file(dev, &dev_attr_id)) || - (error = device_create_file(dev, &dev_attr_type)) || - (error = device_create_file(dev, &dev_attr_serial)) || - (error = device_create_file(dev, &dev_attr_slotaddr)) || - (error = device_create_file(dev, &dev_attr_slotsize)) || - (error = device_create_file(dev, &dev_attr_resource)) || - (error = device_create_file(dev, &dev_attr_modalias)) || - (error = sysfs_create_bin_file(&dev->kobj, &zorro_config_attr))) - return error; - - return 0; -} +static const struct attribute_group zorro_device_attr_group = { + .attrs = zorro_device_attrs, + .bin_attrs = zorro_device_bin_attrs, +}; +const struct attribute_group *zorro_device_attribute_groups[] = { + &zorro_device_attr_group, + NULL +}; diff --git a/drivers/zorro/zorro.c 
b/drivers/zorro/zorro.c index d295d9878dff7594980481843ee9688442cee1f4..cc1b1ac57d61e8b75ffde0ae2ea9c8f3a66f04be 100644 --- a/drivers/zorro/zorro.c +++ b/drivers/zorro/zorro.c @@ -197,9 +197,6 @@ static int __init amiga_zorro_probe(struct platform_device *pdev) put_device(&z->dev); continue; } - error = zorro_create_sysfs_dev_files(z); - if (error) - dev_err(&z->dev, "Error creating sysfs files\n"); } /* Mark all available Zorro II memory */ diff --git a/drivers/zorro/zorro.h b/drivers/zorro/zorro.h index 34119fb4e5601603edbd139563f539b3548963c9..4f805c01cfbc017a01f012263f031a8411ac2486 100644 --- a/drivers/zorro/zorro.h +++ b/drivers/zorro/zorro.h @@ -5,5 +5,4 @@ extern void zorro_name_device(struct zorro_dev *z); static inline void zorro_name_device(struct zorro_dev *dev) { } #endif -extern int zorro_create_sysfs_dev_files(struct zorro_dev *z); - +extern const struct attribute_group *zorro_device_attribute_groups[]; diff --git a/firmware/Makefile b/firmware/Makefile index e297e1b52636dadb9bf3f3acb2102c079e73e0e6..fa3e81c2a97b269f2065a45b8045d4944427c1d1 100644 --- a/firmware/Makefile +++ b/firmware/Makefile @@ -176,7 +176,8 @@ quiet_cmd_fwbin = MK_FW $@ wordsize_deps := $(wildcard include/config/64bit.h include/config/32bit.h \ include/config/ppc32.h include/config/ppc64.h \ include/config/superh32.h include/config/superh64.h \ - include/config/x86_32.h include/config/x86_64.h) + include/config/x86_32.h include/config/x86_64.h \ + firmware/Makefile) $(patsubst %,$(obj)/%.gen.S, $(fw-shipped-y)): %: $(wordsize_deps) $(call cmd,fwbin,$(patsubst %.gen.S,%,$@)) diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index a89f3cfe3c7d7fba5341ee5adac4a3a1ecd8adec..c202930086edb6fd22e0645cdc5264071af28d1f 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -333,10 +333,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, goto err_names; init_rwsem(&v9ses->rename_sem); - rc = bdi_setup_and_register(&v9ses->bdi, "9p"); - if (rc) - goto err_names; - v9ses->uid = INVALID_UID; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; @@ -345,7 +341,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (IS_ERR(v9ses->clnt)) { rc = PTR_ERR(v9ses->clnt); p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); - goto err_bdi; + goto err_names; } v9ses->flags = V9FS_ACCESS_USER; @@ -415,8 +411,6 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, err_clnt: p9_client_destroy(v9ses->clnt); -err_bdi: - bdi_destroy(&v9ses->bdi); err_names: kfree(v9ses->uname); kfree(v9ses->aname); @@ -445,8 +439,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) kfree(v9ses->uname); kfree(v9ses->aname); - bdi_destroy(&v9ses->bdi); - spin_lock(&v9fs_sessionlist_lock); list_del(&v9ses->slist); spin_unlock(&v9fs_sessionlist_lock); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 443d12e020436f2e7af92333ae89d3f068dc614a..76eaf49abd3aea3b56644f3a025ccbc48b89bae8 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -114,7 +114,6 @@ struct v9fs_session_info { kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */ struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ - struct backing_dev_info bdi; struct rw_semaphore rename_sem; }; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index de3ed862919691344e5080c855403d4236512cd8..a0965fb587a5f7321b7f428b018d88307d471163 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -72,10 +72,12 @@ static int v9fs_set_super(struct super_block *s, void *data) * */ -static void 
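The zorro conversion completed above swaps a chain of device_create_file() calls, each with its own error path, for statically described attribute groups hung off bus_type.dev_groups, so the driver core creates and removes the sysfs files together with the device. A condensed sketch of the pattern (demo names and a placeholder value, not the zorro code):

#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/sysfs.h>

/* DEVICE_ATTR_RO(id) expects a show routine named id_show(). */
static ssize_t id_show(struct device *dev, struct device_attribute *attr,
		       char *buf)
{
	return sprintf(buf, "0x%08x\n", 0x1234u);	/* placeholder */
}
static DEVICE_ATTR_RO(id);

static struct attribute *demo_attrs[] = {
	&dev_attr_id.attr,
	NULL
};

static const struct attribute_group demo_group = {
	.attrs = demo_attrs,
	/* .bin_attrs can carry raw files such as zorro's config image */
};

const struct attribute_group *demo_groups[] = {
	&demo_group,
	NULL
};

/* Assigning demo_groups to a bus_type's .dev_groups registers every
 * attribute atomically with device addition, removing the window in
 * which a device existed without its files. */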
+static int v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, int flags, void *data) { + int ret; + sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize_bits = fls(v9ses->maxdata - 1); sb->s_blocksize = 1 << sb->s_blocksize_bits; @@ -85,7 +87,11 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_xattr = v9fs_xattr_handlers; } else sb->s_op = &v9fs_super_ops; - sb->s_bdi = &v9ses->bdi; + + ret = super_setup_bdi(sb); + if (ret) + return ret; + if (v9ses->cache) sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024)/PAGE_SIZE; @@ -99,6 +105,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, #endif save_mount_options(sb, data); + return 0; } /** @@ -138,7 +145,9 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, retval = PTR_ERR(sb); goto clunk_fid; } - v9fs_fill_super(sb, v9ses, flags, data); + retval = v9fs_fill_super(sb, v9ses, flags, data); + if (retval) + goto release_sb; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) sb->s_d_op = &v9fs_cached_dentry_operations; diff --git a/fs/Kconfig b/fs/Kconfig index 83eab52fb3f69a75aa06a9f2a31760a384508f41..b0e42b6a96b97e37b03b7b61e9def73e814f421e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -39,6 +39,7 @@ config FS_DAX depends on MMU depends on !(ARM || MIPS || SPARC) select FS_IOMAP + select DAX help Direct Access (DAX) can be used on memory-backed block devices. If the block device supports DAX and the filesystem supports DAX, diff --git a/fs/affs/affs.h b/fs/affs/affs.h index 2f8bab390d1379731ce889ff2b1beec5ddf91a53..773749be8290cb782324d341865861a2e64d199a 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -7,7 +7,7 @@ #include #include #include -#include +#include "amigaffs.h" #include #include @@ -173,7 +173,7 @@ extern int affs_link(struct dentry *olddentry, struct inode *dir, struct dentry *dentry); extern int affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); -extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry, +extern int affs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); diff --git a/include/linux/amigaffs.h b/fs/affs/amigaffs.h similarity index 100% rename from include/linux/amigaffs.h rename to fs/affs/amigaffs.h diff --git a/fs/affs/dir.c b/fs/affs/dir.c index f1e7294381c5aa48a386a3a7c0db7f4832c568ec..591ecd7f3063731d0eebe1435714e0be3949d455 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -35,7 +35,7 @@ const struct inode_operations affs_dir_inode_operations = { .symlink = affs_symlink, .mkdir = affs_mkdir, .rmdir = affs_rmdir, - .rename = affs_rename, + .rename = affs_rename2, .setattr = affs_notify_change, }; diff --git a/fs/affs/file.c b/fs/affs/file.c index 0deec9cc2362c23fae1e25e88b12ff7769d823e2..196ee7f6fdc40e45235178b55abbc54461fbbb49 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -499,7 +499,7 @@ affs_getemptyblk_ino(struct inode *inode, int block) } static int -affs_do_readpage_ofs(struct page *page, unsigned to) +affs_do_readpage_ofs(struct page *page, unsigned to, int create) { struct inode *inode = page->mapping->host; struct super_block *sb = inode->i_sb; @@ -518,7 +518,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to) boff = tmp % bsize; while (pos < to) { - bh = affs_bread_ino(inode, bidx, 0); + bh = affs_bread_ino(inode, bidx, create); if (IS_ERR(bh)) return PTR_ERR(bh); tmp = min(bsize - boff, to - pos); @@ -620,7 +620,7 @@ affs_readpage_ofs(struct file 
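The v9fs hunks above are the template for the bdi changes that recur in the afs hunks below: the filesystem stops embedding a backing_dev_info in its session or volume object and instead calls super_setup_bdi(), which allocates a per-superblock bdi that generic VFS code tears down, so fill_super now returns an error code and no bdi_destroy() calls remain. A minimal sketch, assuming a hypothetical filesystem with a made-up magic number:

#include <linux/fs.h>
#include <linux/mm.h>

static int demo_fill_super(struct super_block *sb, void *data, int silent)
{
	int ret;

	sb->s_magic = 0x64656d6f;	/* hypothetical */

	/* Allocates sb->s_bdi and registers it; the VFS releases it on
	 * superblock shutdown, so there is nothing to unwind here. */
	ret = super_setup_bdi(sb);
	if (ret)
		return ret;

	sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE;
	return 0;
}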
*file, struct page *page) memset(page_address(page) + to, 0, PAGE_SIZE - to); } - err = affs_do_readpage_ofs(page, to); + err = affs_do_readpage_ofs(page, to, 0); if (!err) SetPageUptodate(page); unlock_page(page); @@ -657,7 +657,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping return 0; /* XXX: inefficient but safe in the face of short writes */ - err = affs_do_readpage_ofs(page, PAGE_SIZE); + err = affs_do_readpage_ofs(page, PAGE_SIZE, 1); if (err) { unlock_page(page); put_page(page); @@ -679,7 +679,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, int written; from = pos & (PAGE_SIZE - 1); - to = pos + len; + to = from + len; /* * XXX: not sure if this can handle short copies (len < copied), but * we don't have to, because the page should always be uptodate here, diff --git a/fs/affs/inode.c b/fs/affs/inode.c index abcc59899229c68165b9bcf0830885ecf707ecac..fd4ef3c40e4054ba9f23dec2198b650ef53e4cae 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -140,6 +140,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &affs_file_operations; break; case ST_SOFTLINK: + inode->i_size = strlen((char *)AFFS_HEAD(bh)->table); inode->i_mode |= S_IFLNK; inode_nohighmem(inode); inode->i_op = &affs_symlink_inode_operations; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 96dd1d09a2735676f51b94ff9390bbc114cf8367..46d3ace6761d7df89dcbbc598991a41561e16518 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -365,6 +365,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) symname++; } *p = 0; + inode->i_size = i + 1; mark_buffer_dirty_inode(bh, inode); affs_brelse(bh); mark_inode_dirty(inode); @@ -393,21 +394,14 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) return affs_add_entry(dir, inode, dentry, ST_LINKFILE); } -int +static int affs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) + struct inode *new_dir, struct dentry *new_dentry) { struct super_block *sb = old_dir->i_sb; struct buffer_head *bh = NULL; int retval; - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - - pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__, - old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry); - retval = affs_check_name(new_dentry->d_name.name, new_dentry->d_name.len, affs_nofilenametruncate(old_dentry)); @@ -447,6 +441,76 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry, return retval; } +static int +affs_xrename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + + struct super_block *sb = old_dir->i_sb; + struct buffer_head *bh_old = NULL; + struct buffer_head *bh_new = NULL; + int retval; + + bh_old = affs_bread(sb, d_inode(old_dentry)->i_ino); + if (!bh_old) + return -EIO; + + bh_new = affs_bread(sb, d_inode(new_dentry)->i_ino); + if (!bh_new) + return -EIO; + + /* Remove old header from its parent directory. */ + affs_lock_dir(old_dir); + retval = affs_remove_hash(old_dir, bh_old); + affs_unlock_dir(old_dir); + if (retval) + goto done; + + /* Remove new header from its parent directory. */ + affs_lock_dir(new_dir); + retval = affs_remove_hash(new_dir, bh_new); + affs_unlock_dir(new_dir); + if (retval) + goto done; + + /* Insert old into the new directory with the new name. 
*/ + affs_copy_name(AFFS_TAIL(sb, bh_old)->name, new_dentry); + affs_fix_checksum(sb, bh_old); + affs_lock_dir(new_dir); + retval = affs_insert_hash(new_dir, bh_old); + affs_unlock_dir(new_dir); + + /* Insert new into the old directory with the old name. */ + affs_copy_name(AFFS_TAIL(sb, bh_new)->name, old_dentry); + affs_fix_checksum(sb, bh_new); + affs_lock_dir(old_dir); + retval = affs_insert_hash(old_dir, bh_new); + affs_unlock_dir(old_dir); +done: + mark_buffer_dirty_inode(bh_old, new_dir); + mark_buffer_dirty_inode(bh_new, old_dir); + affs_brelse(bh_old); + affs_brelse(bh_new); + return retval; +} + +int affs_rename2(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + return -EINVAL; + + pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__, + old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry); + + if (flags & RENAME_EXCHANGE) + return affs_xrename(old_dir, old_dentry, new_dir, new_dentry); + + return affs_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static struct dentry *affs_get_parent(struct dentry *child) { struct inode *parent; @@ -477,11 +541,6 @@ static struct inode *affs_nfs_get_inode(struct super_block *sb, u64 ino, if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation && inode->i_generation != generation) { - iput(inode); - return ERR_PTR(-ESTALE); - } - return inode; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index a6901360fb81d435bf47a85b781a89a1056fd900..393672997cc23d4e5688dc77421ae81b0df95b48 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -318,7 +318,6 @@ struct afs_volume { unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ struct rw_semaphore server_sem; /* lock for accessing current server */ - struct backing_dev_info bdi; }; /* diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 8f76b13d55494bddec9e81203c0734a0f6d811d7..d5990eb160bdf49a3a916c0195d04fbe521ef2b2 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -419,7 +419,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp, call->state = AFS_CALL_COMPLETE; if (ret != -ECONNABORTED) { rxrpc_kernel_abort_call(afs_socket, rxcall, RX_USER_ABORT, - -ret, "KSD"); + ret, "KSD"); } else { abort_code = 0; offset = 0; @@ -478,12 +478,12 @@ static void afs_deliver_to_call(struct afs_call *call) case -ENOTCONN: abort_code = RX_CALL_DEAD; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code, -ret, "KNC"); + abort_code, ret, "KNC"); goto save_error; case -ENOTSUPP: abort_code = RXGEN_OPCODE; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code, -ret, "KIV"); + abort_code, ret, "KIV"); goto save_error; case -ENODATA: case -EBADMSG: @@ -493,7 +493,7 @@ static void afs_deliver_to_call(struct afs_call *call) if (call->state != AFS_CALL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; rxrpc_kernel_abort_call(afs_socket, call->rxcall, - abort_code, EBADMSG, "KUM"); + abort_code, -EBADMSG, "KUM"); goto save_error; } } @@ -754,7 +754,7 @@ void afs_send_empty_reply(struct afs_call *call) case -ENOMEM: _debug("oom"); rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT, ENOMEM, "KOO"); + RX_USER_ABORT, -ENOMEM, "KOO"); default: _leave(" [error]"); return; @@ -792,7 +792,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); 
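affs_rename2() just above shows the canonical shape of a flag-aware ->rename: reject unsupported bits before touching any metadata, route RENAME_EXCHANGE to a dedicated cross-rename helper, and fall through to the plain single-entry rename otherwise. A skeleton of that dispatch, with stand-in helpers:

#include <linux/errno.h>
#include <linux/fs.h>

static int demo_plain_rename(struct inode *old_dir, struct dentry *old_dentry,
			     struct inode *new_dir, struct dentry *new_dentry)
{
	return 0;	/* move one directory entry */
}

static int demo_exchange(struct inode *old_dir, struct dentry *old_dentry,
			 struct inode *new_dir, struct dentry *new_dentry)
{
	return 0;	/* atomically swap the two entries */
}

static int demo_rename2(struct inode *old_dir, struct dentry *old_dentry,
			struct inode *new_dir, struct dentry *new_dentry,
			unsigned int flags)
{
	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
		return -EINVAL;

	if (flags & RENAME_EXCHANGE)
		return demo_exchange(old_dir, old_dentry,
				     new_dir, new_dentry);

	return demo_plain_rename(old_dir, old_dentry,
				 new_dir, new_dentry);
}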
rxrpc_kernel_abort_call(afs_socket, call->rxcall, - RX_USER_ABORT, ENOMEM, "KOO"); + RX_USER_ABORT, -ENOMEM, "KOO"); } _leave(" [error]"); } diff --git a/fs/afs/super.c b/fs/afs/super.c index fbdb022b75a27be11b5699203f8c04e32c499cf6..c79633e5cfd80ce6754650dfb1a26890cf035205 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -319,7 +319,10 @@ static int afs_fill_super(struct super_block *sb, sb->s_blocksize_bits = PAGE_SHIFT; sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; - sb->s_bdi = &as->volume->bdi; + ret = super_setup_bdi(sb); + if (ret) + return ret; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; strlcpy(sb->s_id, as->volume->vlocation->vldb.name, sizeof(sb->s_id)); /* allocate the root inode and dentry */ diff --git a/fs/afs/volume.c b/fs/afs/volume.c index 546f9d01710b5b66706331968edc557306ed848e..db73d6dad02b5f8f549cca6ccacdabaa9d458f63 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -106,11 +106,6 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) volume->cell = params->cell; volume->vid = vlocation->vldb.vid[params->type]; - volume->bdi.ra_pages = VM_MAX_READAHEAD*1024/PAGE_SIZE; - ret = bdi_setup_and_register(&volume->bdi, "afs"); - if (ret) - goto error_bdi; - init_rwsem(&volume->server_sem); /* look up all the applicable server records */ @@ -156,8 +151,6 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) return ERR_PTR(ret); error_discard: - bdi_destroy(&volume->bdi); -error_bdi: up_write(&params->cell->vl_sem); for (loop = volume->nservers - 1; loop >= 0; loop--) @@ -207,7 +200,6 @@ void afs_put_volume(struct afs_volume *volume) for (loop = volume->nservers - 1; loop >= 0; loop--) afs_put_server(volume->servers[loop]); - bdi_destroy(&volume->bdi); kfree(volume); _leave(" [destroyed]"); diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig index 1204d6384d39b9c574d17f863a26db574da9a02c..44727bf18297dfb9434b25bea2af72bfcbb8c2cc 100644 --- a/fs/autofs4/Kconfig +++ b/fs/autofs4/Kconfig @@ -7,7 +7,7 @@ config AUTOFS4_FS automounter (amd), which is a pure user space daemon. To use the automounter you need the user-space tools from - ; you also + ; you also want to answer Y to "NFS file system support", below.
To compile this support as a module, choose M here: the module will be diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index c500e954debba1b60a515697276d79c7bd145cdc..63e7c4760bfb468852fb457fd8c2116492bb2a63 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -58,6 +58,7 @@ static struct dentry *befs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); static struct dentry *befs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); +static struct dentry *befs_get_parent(struct dentry *child); static const struct super_operations befs_sops = { .alloc_inode = befs_alloc_inode, /* allocate a new inode */ @@ -93,6 +94,7 @@ static const struct address_space_operations befs_symlink_aops = { static const struct export_operations befs_export_operations = { .fh_to_dentry = befs_fh_to_dentry, .fh_to_parent = befs_fh_to_parent, + .get_parent = befs_get_parent, }; /* @@ -667,6 +669,19 @@ static struct dentry *befs_fh_to_parent(struct super_block *sb, befs_nfs_get_inode); } +static struct dentry *befs_get_parent(struct dentry *child) +{ + struct inode *parent; + struct befs_inode_info *befs_ino = BEFS_I(d_inode(child)); + + parent = befs_iget(child->d_sb, + (unsigned long)befs_ino->i_parent.start); + if (IS_ERR(parent)) + return ERR_CAST(parent); + + return d_obtain_alias(parent); +} + enum { Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, }; diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index f2deec0a62f0c613349d6f7d94bd6077d2cceb96..25e312cb60716caef70a66f12eb391f8aa234005 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -1,7 +1,7 @@ /* * fs/bfs/inode.c * BFS superblock and inode operations. - * Copyright (C) 1999-2006 Tigran Aivazian + * Copyright (C) 1999-2006 Tigran Aivazian * From fs/minix, Copyright (C) 1991, 1992 Linus Torvalds. * * Made endianness-clean by Andrew Stribblehill , 2005. @@ -19,7 +19,7 @@ #include #include "bfs.h" -MODULE_AUTHOR("Tigran Aivazian "); +MODULE_AUTHOR("Tigran Aivazian "); MODULE_DESCRIPTION("SCO UnixWare BFS filesystem for Linux"); MODULE_LICENSE("GPL"); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index bee1a36bc2ec4e0be1343c11bb07da13bbcb384f..f4718098ac31885b64ebaba7c03af5fe75515092 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -818,7 +818,7 @@ static const struct super_operations s_ops = { static int bm_fill_super(struct super_block *sb, void *data, int silent) { int err; - static struct tree_descr bm_files[] = { + static const struct tree_descr bm_files[] = { [2] = {"status", &bm_status_operations, S_IWUSR|S_IRUGO}, [3] = {"register", &bm_register_operations, S_IWUSR}, /* last one */ {""} diff --git a/fs/block_dev.c b/fs/block_dev.c index 2eca00ec43706bb78955cd221a24593942625d57..519599dddd3692ee373a9eb00d95d5757556ad42 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -103,12 +104,11 @@ void invalidate_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0) - return; - - invalidate_bh_lrus(); - lru_add_drain_all(); /* make sure all lru add caches are flushed */ - invalidate_mapping_pages(mapping, 0, -1); + if (mapping->nrpages) { + invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ + invalidate_mapping_pages(mapping, 0, -1); + } /* 99% of the time, we don't need to flush the cleancache on the bdev. 
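befs_get_parent() above follows the standard export_operations recipe for NFS export: recover the parent's inode number from the child's own metadata, load that inode, and hand it to d_obtain_alias(), which finds or creates a dentry for it and consumes the inode reference even on failure. A sketch, where demo_iget() and the stored parent number are hypothetical:

#include <linux/dcache.h>
#include <linux/err.h>
#include <linux/fs.h>

/* Assumed to exist elsewhere in the filesystem, like befs_iget(). */
struct inode *demo_iget(struct super_block *sb, unsigned long ino);

static struct dentry *demo_get_parent(struct dentry *child)
{
	unsigned long parent_ino = 2;	/* would be read from the child */
	struct inode *parent;

	parent = demo_iget(child->d_sb, parent_ino);
	if (IS_ERR(parent))
		return ERR_CAST(parent);

	/* No iput() needed: d_obtain_alias() owns the reference now. */
	return d_obtain_alias(parent);
}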
* But, for the strange corners, lets be cautious */ @@ -717,120 +717,6 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(bdev_write_page); -/** - * bdev_direct_access() - Get the address for directly-accessibly memory - * @bdev: The device containing the memory - * @dax: control and output parameters for ->direct_access - * - * If a block device is made up of directly addressable memory, this function - * will tell the caller the PFN and the address of the memory. The address - * may be directly dereferenced within the kernel without the need to call - * ioremap(), kmap() or similar. The PFN is suitable for inserting into - * page tables. - * - * Return: negative errno if an error occurs, otherwise the number of bytes - * accessible at this address. - */ -long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) -{ - sector_t sector = dax->sector; - long avail, size = dax->size; - const struct block_device_operations *ops = bdev->bd_disk->fops; - - /* - * The device driver is allowed to sleep, in order to make the - * memory directly accessible. - */ - might_sleep(); - - if (size < 0) - return size; - if (!blk_queue_dax(bdev_get_queue(bdev)) || !ops->direct_access) - return -EOPNOTSUPP; - if ((sector + DIV_ROUND_UP(size, 512)) > - part_nr_sects_read(bdev->bd_part)) - return -ERANGE; - sector += get_start_sect(bdev); - if (sector % (PAGE_SIZE / 512)) - return -EINVAL; - avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn, size); - if (!avail) - return -ERANGE; - if (avail > 0 && avail & ~PAGE_MASK) - return -ENXIO; - return min(avail, size); -} -EXPORT_SYMBOL_GPL(bdev_direct_access); - -/** - * bdev_dax_supported() - Check if the device supports dax for filesystem - * @sb: The superblock of the device - * @blocksize: The block size of the device - * - * This is a library function for filesystems to check if the block device - * can be mounted with dax option. - * - * Return: negative errno if unsupported, 0 if supported. 
- */ -int bdev_dax_supported(struct super_block *sb, int blocksize) -{ - struct blk_dax_ctl dax = { - .sector = 0, - .size = PAGE_SIZE, - }; - int err; - - if (blocksize != PAGE_SIZE) { - vfs_msg(sb, KERN_ERR, "error: unsupported blocksize for dax"); - return -EINVAL; - } - - err = bdev_direct_access(sb->s_bdev, &dax); - if (err < 0) { - switch (err) { - case -EOPNOTSUPP: - vfs_msg(sb, KERN_ERR, - "error: device does not support dax"); - break; - case -EINVAL: - vfs_msg(sb, KERN_ERR, - "error: unaligned partition for dax"); - break; - default: - vfs_msg(sb, KERN_ERR, - "error: dax access failed (%d)", err); - } - return err; - } - - return 0; -} -EXPORT_SYMBOL_GPL(bdev_dax_supported); - -/** - * bdev_dax_capable() - Return if the raw device is capable for dax - * @bdev: The device for raw block device access - */ -bool bdev_dax_capable(struct block_device *bdev) -{ - struct blk_dax_ctl dax = { - .size = PAGE_SIZE, - }; - - if (!IS_ENABLED(CONFIG_FS_DAX)) - return false; - - dax.sector = 0; - if (bdev_direct_access(bdev, &dax) < 0) - return false; - - dax.sector = bdev->bd_part->nr_sects - (PAGE_SIZE / 512); - if (bdev_direct_access(bdev, &dax) < 0) - return false; - - return true; -} - /* * pseudo-fs */ @@ -885,6 +771,8 @@ static void bdev_evict_inode(struct inode *inode) spin_lock(&bdev_lock); list_del_init(&bdev->bd_list); spin_unlock(&bdev_lock); + /* Detach inode from wb early as bdi_put() may free bdi->wb */ + inode_detach_wb(inode); if (bdev->bd_bdi != &noop_backing_dev_info) { bdi_put(bdev->bd_bdi); bdev->bd_bdi = &noop_backing_dev_info; @@ -1451,7 +1339,6 @@ int revalidate_disk(struct gendisk *disk) if (disk->fops->revalidate_disk) ret = disk->fops->revalidate_disk(disk); - blk_integrity_revalidate(disk); bdev = bdget_disk(disk, 0); if (!bdev) return ret; @@ -1556,8 +1443,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); if (!partno) { ret = -ENXIO; @@ -1622,6 +1507,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); } + + if (bdev->bd_bdi == &noop_backing_dev_info) + bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } else { if (bdev->bd_contains == bdev) { ret = 0; @@ -1653,8 +1541,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_queue = NULL; - bdi_put(bdev->bd_bdi); - bdev->bd_bdi = &noop_backing_dev_info; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; @@ -1876,12 +1762,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) kill_bdev(bdev); bdev_write_inode(bdev); - /* - * Detaching bdev inode from its wb in __destroy_inode() - * is too late: the queue which embeds its bdi (along with - * root wb) can be gone as soon as we put_disk() below. 
- */ - inode_detach_wb(bdev->bd_inode); } if (bdev->bd_contains == bdev) { if (disk->fops->release) @@ -2074,7 +1954,6 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct block_device *bdev = I_BDEV(bdev_file_inode(file)); - struct request_queue *q = bdev_get_queue(bdev); struct address_space *mapping; loff_t end = start + len - 1; loff_t isize; @@ -2110,18 +1989,13 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, - GFP_KERNEL, false); + GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: - /* Only punch if the device can do zeroing discard. */ - if (!blk_queue_discard(q) || !q->limits.discard_zeroes_data) - return -EOPNOTSUPP; - error = blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, 0); + error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, + GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; error = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0); break; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 7699e16784d313459181c746d0b8c30d468e23a7..24865da63d8fdfd2a979429a426b42fe8781e0aa 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -26,6 +26,11 @@ #include "delayed-ref.h" #include "locking.h" +enum merge_mode { + MERGE_IDENTICAL_KEYS = 1, + MERGE_IDENTICAL_PARENTS, +}; + /* Just an arbitrary number so we can be sure this happened */ #define BACKREF_FOUND_SHARED 6 @@ -533,7 +538,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * slot==nritems. In that case, go to the next leaf before we continue. */ if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); @@ -577,7 +582,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_next_item(root, path); else ret = btrfs_next_old_item(root, path, time_seq); @@ -629,7 +634,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == (u64)-1) + else if (time_seq == SEQ_LAST) root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -640,7 +645,7 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } path->lowest_level = level; - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, 0, 0); else @@ -809,14 +814,12 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, /* * merge backrefs and adjust counts accordingly * - * mode = 1: merge identical keys, if key is set - * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. - * additionally, we could even add a key range for the blocks we - * looked into to merge even more (-> replace unresolved refs by those - * having a parent). - * mode = 2: merge identical parents + * FIXME: For MERGE_IDENTICAL_KEYS, if we add more keys in __add_prelim_ref + * then we can merge more here. 
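The blkdev_fallocate() rework above stops consulting q->limits.discard_zeroes_data (gone elsewhere in this series) and expresses both zeroing modes through blkdev_issue_zeroout() flags instead of a discard that happens to zero. A sketch of the mapping:

#include <linux/blkdev.h>

static int demo_zero_range(struct block_device *bdev, sector_t start,
			   sector_t nr_sects, bool punch_hole)
{
	if (punch_hole)
		/* May deallocate; fails instead of falling back to
		 * writing zeroes by hand. */
		return blkdev_issue_zeroout(bdev, start, nr_sects,
					    GFP_KERNEL,
					    BLKDEV_ZERO_NOFALLBACK);

	/* Must leave the blocks allocated, but reading them back
	 * returns zeroes afterwards. */
	return blkdev_issue_zeroout(bdev, start, nr_sects, GFP_KERNEL,
				    BLKDEV_ZERO_NOUNMAP);
}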
Additionally, we could even add a key + * range for the blocks we looked into to merge even more (-> replace + * unresolved refs by those having a parent). */ -static void __merge_refs(struct list_head *head, int mode) +static void __merge_refs(struct list_head *head, enum merge_mode mode) { struct __prelim_ref *pos1; @@ -829,7 +832,7 @@ static void __merge_refs(struct list_head *head, int mode) if (!ref_for_same_block(ref1, ref2)) continue; - if (mode == 1) { + if (mode == MERGE_IDENTICAL_KEYS) { if (!ref1->parent && ref2->parent) swap(ref1, ref2); } else { @@ -1196,7 +1199,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * * NOTE: This can return values > 0 * - * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave + * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave * much like trans == NULL case, the difference only lies in it will not * commit root. * The special case is for qgroup to search roots in commit_transaction(). @@ -1243,7 +1246,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } - if (time_seq == (u64)-1) + if (time_seq == SEQ_LAST) path->skip_locking = 1; /* @@ -1273,9 +1276,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != (u64)-1) { + time_seq != SEQ_LAST) { #else - if (trans && time_seq != (u64)-1) { + if (trans && time_seq != SEQ_LAST) { #endif /* * look if there are updates for this ref queued and lock the @@ -1286,7 +1289,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -1374,7 +1377,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, if (ret) goto out; - __merge_refs(&prefs, 1); + __merge_refs(&prefs, MERGE_IDENTICAL_KEYS); ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, extent_item_pos, total_refs, @@ -1382,7 +1385,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, if (ret) goto out; - __merge_refs(&prefs, 2); + __merge_refs(&prefs, MERGE_IDENTICAL_PARENTS); while (!list_empty(&prefs)) { ref = list_first_entry(&prefs, struct __prelim_ref, list); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0c6baaba0651ce10ba5e394ad82a03a917ced4e6..b8622e4d1744de68180f96036ad5ddbc3c195ca8 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -124,6 +124,13 @@ struct btrfs_inode { */ u64 delalloc_bytes; + /* + * Total number of bytes pending delalloc that fall within a file + * range that is either a hole or beyond EOF (and no prealloc extent + * exists in the range). This is always <= delalloc_bytes. + */ + u64 new_delalloc_bytes; + /* * total number of bytes pending defrag, used by stat to check whether * it needs COW. 
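The backref rework above retires two magic values: the (u64)-1 sentinel for time_seq becomes SEQ_LAST, and the bare 1/2 mode argument of __merge_refs() becomes an enum, so every call site now says what it merges. The whole change fits in a few lines (names copied from the patch itself):

#include <linux/types.h>

#define SEQ_LAST	((u64)-1)

enum merge_mode {
	MERGE_IDENTICAL_KEYS = 1,
	MERGE_IDENTICAL_PARENTS,
};

/* before: __merge_refs(&prefs, 1);
 * after:  __merge_refs(&prefs, MERGE_IDENTICAL_KEYS);
 */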
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c7721a6aa3bb5346cbd9103b4ee3e9f4528a8c4e..10e6b282d09d6e8d31b4ac8740d0119ccad3e786 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -44,7 +44,7 @@ struct compressed_bio { /* number of bios pending for this compressed extent */ - atomic_t pending_bios; + refcount_t pending_bios; /* the pages with the compressed data on them */ struct page **compressed_pages; @@ -161,7 +161,7 @@ static void end_compressed_bio_read(struct bio *bio) /* if there are more bios still pending for this compressed * extent, just exit */ - if (!atomic_dec_and_test(&cb->pending_bios)) + if (!refcount_dec_and_test(&cb->pending_bios)) goto out; inode = cb->inode; @@ -274,7 +274,7 @@ static void end_compressed_bio_write(struct bio *bio) /* if there are more bios still pending for this compressed * extent, just exit */ - if (!atomic_dec_and_test(&cb->pending_bios)) + if (!refcount_dec_and_test(&cb->pending_bios)) goto out; /* ok, we're the last bio for this extent, step one is to @@ -342,7 +342,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); if (!cb) return -ENOMEM; - atomic_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; cb->start = start; @@ -363,7 +363,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, bio_set_op_attrs(bio, REQ_OP_WRITE, 0); bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; - atomic_inc(&cb->pending_bios); + refcount_set(&cb->pending_bios, 1); /* create and submit bios for the compressed pages */ bytes_left = compressed_len; @@ -388,7 +388,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, * we inc the count. Otherwise, the cb might get * freed before we're done setting it up */ - atomic_inc(&cb->pending_bios); + refcount_inc(&cb->pending_bios); ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ @@ -607,7 +607,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!cb) goto out; - atomic_set(&cb->pending_bios, 0); + refcount_set(&cb->pending_bios, 0); cb->errors = 0; cb->inode = inode; cb->mirror_num = mirror_num; @@ -656,7 +656,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, bio_set_op_attrs (comp_bio, REQ_OP_READ, 0); comp_bio->bi_private = cb; comp_bio->bi_end_io = end_compressed_bio_read; - atomic_inc(&cb->pending_bios); + refcount_set(&cb->pending_bios, 1); for (pg_index = 0; pg_index < nr_pages; pg_index++) { page = cb->compressed_pages[pg_index]; @@ -685,7 +685,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, * we inc the count. 
Otherwise, the cb might get * freed before we're done setting it up */ - atomic_inc(&cb->pending_bios); + refcount_inc(&cb->pending_bios); if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(inode, comp_bio, diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7dc8844037e03b7cfb738d81af6c22b33ee2b31d..a3a75f1de002295c425f5957de7e9aba269a7682 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -567,7 +567,7 @@ tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, static noinline int tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, int dst_slot, int src_slot, - int nr_items, gfp_t flags) + int nr_items) { struct tree_mod_elem *tm = NULL; struct tree_mod_elem **tm_list = NULL; @@ -578,11 +578,11 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, if (!tree_mod_need_log(fs_info, eb)) return 0; - tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), flags); + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); if (!tm_list) return -ENOMEM; - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) { ret = -ENOMEM; goto free_tms; @@ -596,7 +596,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info, for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, flags); + MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -663,7 +663,7 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, static noinline int tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, struct extent_buffer *old_root, - struct extent_buffer *new_root, gfp_t flags, + struct extent_buffer *new_root, int log_removal) { struct tree_mod_elem *tm = NULL; @@ -678,14 +678,14 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, if (log_removal && btrfs_header_level(old_root) > 0) { nritems = btrfs_header_nritems(old_root); tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), - flags); + GFP_NOFS); if (!tm_list) { ret = -ENOMEM; goto free_tms; } for (i = 0; i < nritems; i++) { tm_list[i] = alloc_tree_mod_elem(old_root, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, flags); + MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); if (!tm_list[i]) { ret = -ENOMEM; goto free_tms; @@ -693,7 +693,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info, } } - tm = kzalloc(sizeof(*tm), flags); + tm = kzalloc(sizeof(*tm), GFP_NOFS); if (!tm) { ret = -ENOMEM; goto free_tms; @@ -873,7 +873,7 @@ tree_mod_log_eb_move(struct btrfs_fs_info *fs_info, struct extent_buffer *dst, { int ret; ret = tree_mod_log_insert_move(fs_info, dst, dst_offset, src_offset, - nr_items, GFP_NOFS); + nr_items); BUG_ON(ret < 0); } @@ -943,7 +943,7 @@ tree_mod_log_set_root_pointer(struct btrfs_root *root, { int ret; ret = tree_mod_log_insert_root(root->fs_info, root->node, - new_root_node, GFP_NOFS, log_removal); + new_root_node, log_removal); BUG_ON(ret < 0); } @@ -5392,13 +5392,10 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN); + tmp_buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); if (!tmp_buf) { - tmp_buf = vmalloc(fs_info->nodesize); - if (!tmp_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } left_path->search_commit_root = 1; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c4115901d9064f68217c3be825f233d7d6872cf6..643c70d2b2e65ab96a93ff4c022756ea7e59d179 100644 --- a/fs/btrfs/ctree.h +++ 
b/fs/btrfs/ctree.h @@ -39,6 +39,7 @@ #include #include #include +#include #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -518,7 +519,7 @@ struct btrfs_caching_control { struct btrfs_work work; struct btrfs_block_group_cache *block_group; u64 progress; - atomic_t count; + refcount_t count; }; /* Once caching_thread() finds this much free space, it will wake up waiters. */ @@ -538,6 +539,14 @@ struct btrfs_io_ctl { unsigned check_crcs:1; }; +/* + * Tree to record all locked full stripes of a RAID5/6 block group + */ +struct btrfs_full_stripe_locks_tree { + struct rb_root root; + struct mutex lock; +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; @@ -648,6 +657,9 @@ struct btrfs_block_group_cache { * Protected by free_space_lock. */ int needs_free_space; + + /* Record locked full stripes for RAID5/6 block group */ + struct btrfs_full_stripe_locks_tree full_stripe_locks_root; }; /* delayed seq elem */ @@ -658,6 +670,8 @@ struct seq_list { #define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } +#define SEQ_LAST ((u64)-1) + enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, @@ -702,6 +716,11 @@ struct btrfs_delayed_root; #define BTRFS_FS_BTREE_ERR 11 #define BTRFS_FS_LOG1_ERR 12 #define BTRFS_FS_LOG2_ERR 13 +/* + * Indicate that a whole-filesystem exclusive operation is running + * (device replace, resize, device add/delete, balance) + */ +#define BTRFS_FS_EXCL_OP 14 struct btrfs_fs_info { u8 fsid[BTRFS_FSID_SIZE]; @@ -810,7 +829,6 @@ struct btrfs_fs_info { struct btrfs_super_block *super_for_commit; struct super_block *sb; struct inode *btree_inode; - struct backing_dev_info bdi; struct mutex tree_log_mutex; struct mutex transaction_kthread_mutex; struct mutex cleaner_mutex; @@ -1067,8 +1085,6 @@ struct btrfs_fs_info { /* device replace state */ struct btrfs_dev_replace dev_replace; - atomic_t mutually_exclusive_operation_running; - struct percpu_counter bio_counter; wait_queue_head_t replace_wait; @@ -1221,7 +1237,7 @@ struct btrfs_root { dev_t anon_dev; spinlock_t root_item_lock; - atomic_t refs; + refcount_t refs; struct mutex delalloc_mutex; spinlock_t delalloc_lock; @@ -3647,6 +3663,12 @@ int btrfs_scrub_cancel_dev(struct btrfs_fs_info *info, struct btrfs_device *dev); int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, struct btrfs_scrub_progress *progress); +static inline void btrfs_init_full_stripe_locks_tree( + struct btrfs_full_stripe_locks_tree *locks_root) +{ + locks_root->root = RB_ROOT; + mutex_init(&locks_root->lock); +} /* dev-replace.c */ void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info); @@ -3671,8 +3693,7 @@ struct reada_control *btrfs_reada_add(struct btrfs_root *root, struct btrfs_key *start, struct btrfs_key *end); int btrfs_reada_wait(void *handle); void btrfs_reada_detach(void *handle); -int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int err); +int btree_readahead_hook(struct extent_buffer *eb, int err); static inline int is_fstree(u64 rootid) { diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 1aff676f0e5b5b6c63efd32eee44958b40968e2a..8ae409b5a61d7186f050780beced1bf72cd572ca 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -52,7 +52,7 @@ static inline void btrfs_init_delayed_node( { delayed_node->root = root; delayed_node->inode_id = inode_id; - atomic_set(&delayed_node->refs, 0); + refcount_set(&delayed_node->refs, 0); 
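From here through the delayed-inode and compression hunks, plain atomic_t reference counts become refcount_t, whose API saturates rather than wraps and WARNs on suspicious transitions such as incrementing from zero, so overflow and use-after-free bugs surface as diagnosable warnings instead of silent corruption. The basic lifecycle, sketched on a hypothetical object:

#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_obj {
	refcount_t refs;
};

static void demo_init(struct demo_obj *o)
{
	refcount_set(&o->refs, 1);	/* the creator's reference */
}

static void demo_get(struct demo_obj *o)
{
	refcount_inc(&o->refs);		/* WARNs if the count was zero */
}

static void demo_put(struct demo_obj *o)
{
	if (refcount_dec_and_test(&o->refs))
		kfree(o);
}

Note how the patch also turns atomic_add(2, ...) on a freshly initialized node into refcount_set(..., 2): with refcount_t, the initial references are stated as a starting value rather than built up from zero, which would otherwise trip the increment-from-zero check. The conversion continues in the hunks that follow.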
delayed_node->ins_root = RB_ROOT; delayed_node->del_root = RB_ROOT; mutex_init(&delayed_node->mutex); @@ -81,7 +81,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = READ_ONCE(btrfs_inode->delayed_node); if (node) { - atomic_inc(&node->refs); + refcount_inc(&node->refs); return node; } @@ -89,14 +89,14 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( node = radix_tree_lookup(&root->delayed_nodes_tree, ino); if (node) { if (btrfs_inode->delayed_node) { - atomic_inc(&node->refs); /* can be accessed */ + refcount_inc(&node->refs); /* can be accessed */ BUG_ON(btrfs_inode->delayed_node != node); spin_unlock(&root->inode_lock); return node; } btrfs_inode->delayed_node = node; /* can be accessed and cached in the inode */ - atomic_add(2, &node->refs); + refcount_add(2, &node->refs); spin_unlock(&root->inode_lock); return node; } @@ -125,7 +125,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( btrfs_init_delayed_node(node, root, ino); /* cached in the btrfs inode and can be accessed */ - atomic_add(2, &node->refs); + refcount_set(&node->refs, 2); ret = radix_tree_preload(GFP_NOFS); if (ret) { @@ -166,7 +166,7 @@ static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root, } else { list_add_tail(&node->n_list, &root->node_list); list_add_tail(&node->p_list, &root->prepare_list); - atomic_inc(&node->refs); /* inserted into list */ + refcount_inc(&node->refs); /* inserted into list */ root->nodes++; set_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags); } @@ -180,7 +180,7 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, spin_lock(&root->lock); if (test_bit(BTRFS_DELAYED_NODE_IN_LIST, &node->flags)) { root->nodes--; - atomic_dec(&node->refs); /* not in the list */ + refcount_dec(&node->refs); /* not in the list */ list_del_init(&node->n_list); if (!list_empty(&node->p_list)) list_del_init(&node->p_list); @@ -201,7 +201,7 @@ static struct btrfs_delayed_node *btrfs_first_delayed_node( p = delayed_root->node_list.next; node = list_entry(p, struct btrfs_delayed_node, n_list); - atomic_inc(&node->refs); + refcount_inc(&node->refs); out: spin_unlock(&delayed_root->lock); @@ -228,7 +228,7 @@ static struct btrfs_delayed_node *btrfs_next_delayed_node( p = node->n_list.next; next = list_entry(p, struct btrfs_delayed_node, n_list); - atomic_inc(&next->refs); + refcount_inc(&next->refs); out: spin_unlock(&delayed_root->lock); @@ -253,11 +253,11 @@ static void __btrfs_release_delayed_node( btrfs_dequeue_delayed_node(delayed_root, delayed_node); mutex_unlock(&delayed_node->mutex); - if (atomic_dec_and_test(&delayed_node->refs)) { + if (refcount_dec_and_test(&delayed_node->refs)) { bool free = false; struct btrfs_root *root = delayed_node->root; spin_lock(&root->inode_lock); - if (atomic_read(&delayed_node->refs) == 0) { + if (refcount_read(&delayed_node->refs) == 0) { radix_tree_delete(&root->delayed_nodes_tree, delayed_node->inode_id); free = true; @@ -286,7 +286,7 @@ static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( p = delayed_root->prepare_list.next; list_del_init(p); node = list_entry(p, struct btrfs_delayed_node, p_list); - atomic_inc(&node->refs); + refcount_inc(&node->refs); out: spin_unlock(&delayed_root->lock); @@ -308,7 +308,7 @@ static struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len) item->ins_or_del = 0; item->bytes_reserved = 0; item->delayed_node = NULL; - atomic_set(&item->refs, 1); + refcount_set(&item->refs, 1); } return item; } @@ -483,7 +483,7 @@ static void 
btrfs_release_delayed_item(struct btrfs_delayed_item *item) { if (item) { __btrfs_remove_delayed_item(item); - if (atomic_dec_and_test(&item->refs)) + if (refcount_dec_and_test(&item->refs)) kfree(item); } } @@ -1600,14 +1600,14 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); while (item) { - atomic_inc(&item->refs); + refcount_inc(&item->refs); list_add_tail(&item->readdir_list, ins_list); item = __btrfs_next_delayed_item(item); } item = __btrfs_first_delayed_deletion_item(delayed_node); while (item) { - atomic_inc(&item->refs); + refcount_inc(&item->refs); list_add_tail(&item->readdir_list, del_list); item = __btrfs_next_delayed_item(item); } @@ -1621,7 +1621,7 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * insert/delete delayed items in this period. So we also needn't * requeue or dequeue this delayed node. */ - atomic_dec(&delayed_node->refs); + refcount_dec(&delayed_node->refs); return true; } @@ -1634,13 +1634,13 @@ void btrfs_readdir_put_delayed_items(struct inode *inode, list_for_each_entry_safe(curr, next, ins_list, readdir_list) { list_del(&curr->readdir_list); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); } list_for_each_entry_safe(curr, next, del_list, readdir_list) { list_del(&curr->readdir_list); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); } @@ -1667,7 +1667,7 @@ int btrfs_should_delete_dir_index(struct list_head *del_list, list_del(&curr->readdir_list); ret = (curr->key.offset == index); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); if (ret) @@ -1705,7 +1705,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, list_del(&curr->readdir_list); if (curr->key.offset < ctx->pos) { - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); continue; } @@ -1722,7 +1722,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, over = !dir_emit(ctx, name, name_len, location.objectid, d_type); - if (atomic_dec_and_test(&curr->refs)) + if (refcount_dec_and_test(&curr->refs)) kfree(curr); if (over) @@ -1963,7 +1963,7 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) inode_id = delayed_nodes[n - 1]->inode_id + 1; for (i = 0; i < n; i++) - atomic_inc(&delayed_nodes[i]->refs); + refcount_inc(&delayed_nodes[i]->refs); spin_unlock(&root->inode_lock); for (i = 0; i < n; i++) { diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index 40327cc3b99a3bdcd517827652969299e1ce2a2b..c4189d4959343219eadf1db68d8aa5dd3f8d6ee4 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -26,7 +26,7 @@ #include #include #include - +#include #include "ctree.h" /* types of the delayed item */ @@ -67,7 +67,7 @@ struct btrfs_delayed_node { struct rb_root del_root; struct mutex mutex; struct btrfs_inode_item inode_item; - atomic_t refs; + refcount_t refs; u64 index_cnt; unsigned long flags; int count; @@ -80,7 +80,7 @@ struct btrfs_delayed_item { struct list_head readdir_list; /* used for readdir items */ u64 bytes_reserved; struct btrfs_delayed_node *delayed_node; - atomic_t refs; + refcount_t refs; int ins_or_del; u32 data_len; char data[0]; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 6eb80952efb3310ae55de9c2e234319abe14465a..be70d90dfee591953d53100b3c872d0efc5f324f 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c 
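The delayed-ref hunks below carry the same conversion through another recurring shape: to sleep on a ref head's mutex, the caller first pins the head with a reference, drops the spinlock protecting the ref tree, takes the mutex, and then revalidates under the spinlock. A condensed sketch of that idiom, with demo types rather than the btrfs ones:

#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct demo_head {
	struct mutex mutex;
	refcount_t refs;
	bool in_tree;		/* cleared once the head has been processed */
};

static void demo_head_put(struct demo_head *head)
{
	if (refcount_dec_and_test(&head->refs))
		kfree(head);
}

/*
 * Called with *tree_lock held; returns with it held.  0 means the mutex
 * is ours and the head is still live, -EAGAIN means it was processed
 * while we slept and the caller must repeat the lookup.
 */
static int demo_head_lock(spinlock_t *tree_lock, struct demo_head *head)
{
	if (mutex_trylock(&head->mutex))
		return 0;

	refcount_inc(&head->refs);	/* pin before dropping the lock */
	spin_unlock(tree_lock);

	mutex_lock(&head->mutex);	/* may sleep */

	spin_lock(tree_lock);
	if (!head->in_tree) {
		mutex_unlock(&head->mutex);
		demo_head_put(head);
		return -EAGAIN;
	}
	demo_head_put(head);
	return 0;
}

The pin is exactly why the counter type matters: without it, the head could be freed while the caller sleeps on the mutex.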
@@ -164,7 +164,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, if (mutex_trylock(&head->mutex)) return 0; - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); @@ -590,7 +590,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = count_mod; @@ -682,7 +682,7 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = 1; @@ -739,7 +739,7 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, seq = atomic64_read(&fs_info->tree_mod_seq); /* first set the basic ref node struct up */ - atomic_set(&ref->refs, 1); + refcount_set(&ref->refs, 1); ref->bytenr = bytenr; ref->num_bytes = num_bytes; ref->ref_mod = 1; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 0e537f98f1a1c63c529c118baed1cac26efa194e..c0264ff01b53cfe9e2fa44a7cdd7e9352012f0ee 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -18,6 +18,8 @@ #ifndef __DELAYED_REF__ #define __DELAYED_REF__ +#include + /* these are the possible values of struct btrfs_delayed_ref_node->action */ #define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */ #define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */ @@ -53,7 +55,7 @@ struct btrfs_delayed_ref_node { u64 seq; /* ref count on this data structure */ - atomic_t refs; + refcount_t refs; /* * how many refs is this entry adding or deleting. 
For @@ -220,8 +222,8 @@ btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) { - WARN_ON(atomic_read(&ref->refs) == 0); - if (atomic_dec_and_test(&ref->refs)) { + WARN_ON(refcount_read(&ref->refs) == 0); + if (refcount_dec_and_test(&ref->refs)) { WARN_ON(ref->in_tree); switch (ref->type) { case BTRFS_TREE_BLOCK_REF_KEY: diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e653921f05d93936581785553a8964124f5df1c0..5fe1ca8abc70577fe28ee6fd69899d11b618d479 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -546,8 +546,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); mutex_unlock(&uuid_mutex); + btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + btrfs_rm_dev_replace_unblocked(fs_info); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return scrub_ret; @@ -665,7 +667,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: srcdev = dev_replace->srcdev; - args->status.progress_1000 = div_u64(dev_replace->cursor_left, + args->status.progress_1000 = div64_u64(dev_replace->cursor_left, div_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } @@ -784,8 +786,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) } btrfs_dev_replace_unlock(dev_replace, 1); - WARN_ON(atomic_xchg( - &fs_info->mutually_exclusive_operation_running, 1)); + WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); task = kthread_run(btrfs_dev_replace_kthread, fs_info, "btrfs-devrepl"); return PTR_ERR_OR_ZERO(task); } @@ -814,7 +815,7 @@ static int btrfs_dev_replace_kthread(void *data) (unsigned int)progress); } btrfs_dev_replace_continue_on_mount(fs_info); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return 0; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index eb1ee7b6f532b74409a6bb50fdb204cb2c8332a1..8685d67185d01bf90bcd2cf6d7cdd168e044c777 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -762,7 +762,7 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, err: if (reads_done && test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(fs_info, eb, ret); + btree_readahead_hook(eb, ret); if (ret) { /* @@ -787,7 +787,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) eb->read_mirror = failed_mirror; atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) - btree_readahead_hook(eb->fs_info, eb, -EIO); + btree_readahead_hook(eb, -EIO); return -EIO; /* we fixed nothing */ } @@ -1340,7 +1340,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, atomic_set(&root->log_writers, 0); atomic_set(&root->log_batch, 0); atomic_set(&root->orphan_inodes, 0); - atomic_set(&root->refs, 1); + refcount_set(&root->refs, 1); atomic_set(&root->will_be_snapshoted, 0); atomic64_set(&root->qgroup_meta_rsv, 0); root->log_transid = 0; @@ -1808,21 +1808,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) return ret; } -static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) -{ - int err; - - err = bdi_setup_and_register(bdi, "btrfs"); - if (err) - return err; - - 
bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; - bdi->congested_fn = btrfs_congested_fn; - bdi->congested_data = info; - bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; - return 0; -} - /* * called by the kthread helper functions to finally call the bio end_io * functions. This is where read checksum verification actually happens @@ -2601,16 +2586,10 @@ int open_ctree(struct super_block *sb, goto fail; } - ret = setup_bdi(fs_info, &fs_info->bdi); - if (ret) { - err = ret; - goto fail_srcu; - } - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; - goto fail_bdi; + goto fail_srcu; } fs_info->dirty_metadata_batch = PAGE_SIZE * (1 + ilog2(nr_cpu_ids)); @@ -2718,7 +2697,6 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = 4096; sb->s_blocksize_bits = blksize_bits(4096); - sb->s_bdi = &fs_info->bdi; btrfs_init_btree_inode(fs_info); @@ -2915,9 +2893,12 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } - fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); - fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, - SZ_4M / PAGE_SIZE); + sb->s_bdi->congested_fn = btrfs_congested_fn; + sb->s_bdi->congested_data = fs_info; + sb->s_bdi->capabilities |= BDI_CAP_CGROUP_WRITEBACK; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages *= btrfs_super_num_devices(disk_super); + sb->s_bdi->ra_pages = max(sb->s_bdi->ra_pages, SZ_4M / PAGE_SIZE); sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); @@ -3285,8 +3266,6 @@ int open_ctree(struct super_block *sb, percpu_counter_destroy(&fs_info->delalloc_bytes); fail_dirty_metadata_bytes: percpu_counter_destroy(&fs_info->dirty_metadata_bytes); -fail_bdi: - bdi_destroy(&fs_info->bdi); fail_srcu: cleanup_srcu_struct(&fs_info->subvol_srcu); fail: @@ -3518,10 +3497,11 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static int write_dev_flush(struct btrfs_device *device, int wait) { + struct request_queue *q = bdev_get_queue(device->bdev); struct bio *bio; int ret = 0; - if (device->nobarriers) + if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) return 0; if (wait) { @@ -4007,7 +3987,6 @@ void close_ctree(struct btrfs_fs_info *fs_info) percpu_counter_destroy(&fs_info->dirty_metadata_bytes); percpu_counter_destroy(&fs_info->delalloc_bytes); percpu_counter_destroy(&fs_info->bio_counter); - bdi_destroy(&fs_info->bdi); cleanup_srcu_struct(&fs_info->subvol_srcu); btrfs_free_stripe_hash_table(fs_info); @@ -4343,7 +4322,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, head = rb_entry(node, struct btrfs_delayed_ref_head, href_node); if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); mutex_lock(&head->mutex); @@ -4615,7 +4594,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) t = list_first_entry(&fs_info->trans_list, struct btrfs_transaction, list); if (t->state >= TRANS_STATE_COMMIT_START) { - atomic_inc(&t->use_count); + refcount_inc(&t->use_count); spin_unlock(&fs_info->trans_lock); btrfs_wait_for_commit(fs_info, t->transid); btrfs_put_transaction(t); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2e0ec29bfd69f04b4232b75754010594bd3d5f95..21f1ceb85b76737a67c1ffbc02cbd725b09fb510 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -101,14 +101,14 @@ struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); */ static inline struct btrfs_root *btrfs_grab_fs_root(struct 
btrfs_root *root) { - if (atomic_inc_not_zero(&root->refs)) + if (refcount_inc_not_zero(&root->refs)) return root; return NULL; } static inline void btrfs_put_fs_root(struct btrfs_root *root) { - if (atomic_dec_and_test(&root->refs)) + if (refcount_dec_and_test(&root->refs)) kfree(root); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index be5477676cc829e4efe89349fc9b7df540fd0dff..e390451c72e6cdb93492e519cea82d5d7b3dfaf9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -131,6 +131,16 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache) if (atomic_dec_and_test(&cache->count)) { WARN_ON(cache->pinned > 0); WARN_ON(cache->reserved > 0); + + /* + * If not empty, someone is still holding mutex of + * full_stripe_lock, which can only be released by caller. + * And it will definitely cause use-after-free when caller + * tries to release full stripe lock. + * + * No better way to resolve, but only to warn. + */ + WARN_ON(!RB_EMPTY_ROOT(&cache->full_stripe_locks_root.root)); kfree(cache->free_space_ctl); kfree(cache); } @@ -316,14 +326,14 @@ get_caching_control(struct btrfs_block_group_cache *cache) } ctl = cache->caching_ctl; - atomic_inc(&ctl->count); + refcount_inc(&ctl->count); spin_unlock(&cache->lock); return ctl; } static void put_caching_control(struct btrfs_caching_control *ctl) { - if (atomic_dec_and_test(&ctl->count)) + if (refcount_dec_and_test(&ctl->count)) kfree(ctl); } @@ -599,7 +609,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, init_waitqueue_head(&caching_ctl->wait); caching_ctl->block_group = cache; caching_ctl->progress = cache->key.objectid; - atomic_set(&caching_ctl->count, 1); + refcount_set(&caching_ctl->count, 1); btrfs_init_work(&caching_ctl->work, btrfs_cache_helper, caching_thread, NULL, NULL); @@ -620,7 +630,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, struct btrfs_caching_control *ctl; ctl = cache->caching_ctl; - atomic_inc(&ctl->count); + refcount_inc(&ctl->count); prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&cache->lock); @@ -707,7 +717,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } down_write(&fs_info->commit_root_sem); - atomic_inc(&caching_ctl->count); + refcount_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); up_write(&fs_info->commit_root_sem); @@ -892,7 +902,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, head = btrfs_find_delayed_ref_head(delayed_refs, bytenr); if (head) { if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -2980,7 +2990,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; ref = &head->node; - atomic_inc(&ref->refs); + refcount_inc(&ref->refs); spin_unlock(&delayed_refs->lock); /* @@ -3003,7 +3013,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, goto again; } out: - assert_qgroups_uptodate(trans); trans->can_flush_pending_bgs = can_flush_pending_bgs; return 0; } @@ -3057,7 +3066,7 @@ static noinline int check_delayed_ref(struct btrfs_root *root, } if (!mutex_trylock(&head->mutex)) { - atomic_inc(&head->node.refs); + refcount_inc(&head->node.refs); spin_unlock(&delayed_refs->lock); btrfs_release_path(path); @@ -3443,7 +3452,8 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, /* * don't bother trying to write stuff out _if_ 
* a) we're not cached, - * b) we're with nospace_cache mount option. + * b) we're with nospace_cache mount option, + * c) we're with v2 space_cache (FREE_SPACE_TREE). */ dcs = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); @@ -9917,6 +9927,7 @@ btrfs_create_block_group_cache(struct btrfs_fs_info *fs_info, btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); mutex_init(&cache->free_space_lock); + btrfs_init_full_stripe_locks_tree(&cache->full_stripe_locks_root); return cache; } @@ -10416,7 +10427,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, &fs_info->caching_block_groups, list) if (ctl->block_group == block_group) { caching_ctl = ctl; - atomic_inc(&caching_ctl->count); + refcount_inc(&caching_ctl->count); break; } } @@ -10850,7 +10861,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, spin_lock(&fs_info->trans_lock); trans = fs_info->running_transaction; if (trans) - atomic_inc(&trans->use_count); + refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); ret = find_free_dev_extent_start(trans, device, minlen, start, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 27fdb250b4467f65a8c6a42d06835f3bb3a36aec..d8da3edf2ac39ebcc0bde0ede7da74f0f81ad9ea 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -68,7 +68,7 @@ void btrfs_leak_debug_check(void) pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", state->start, state->end, state->state, extent_state_in_tree(state), - atomic_read(&state->refs)); + refcount_read(&state->refs)); list_del(&state->leak_list); kmem_cache_free(extent_state_cache, state); } @@ -238,7 +238,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) state->failrec = NULL; RB_CLEAR_NODE(&state->rb_node); btrfs_leak_debug_add(&state->leak_list, &states); - atomic_set(&state->refs, 1); + refcount_set(&state->refs, 1); init_waitqueue_head(&state->wq); trace_alloc_extent_state(state, mask, _RET_IP_); return state; @@ -248,7 +248,7 @@ void free_extent_state(struct extent_state *state) { if (!state) return; - if (atomic_dec_and_test(&state->refs)) { + if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state)); btrfs_leak_debug_del(&state->leak_list); trace_free_extent_state(state, _RET_IP_); @@ -641,7 +641,7 @@ static int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (cached && extent_state_in_tree(cached) && cached->start <= start && cached->end > start) { if (clear) - atomic_dec(&cached->refs); + refcount_dec(&cached->refs); state = cached; goto hit_next; } @@ -793,7 +793,7 @@ static void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (state->state & bits) { start = state->start; - atomic_inc(&state->refs); + refcount_inc(&state->refs); wait_on_state(tree, state); free_extent_state(state); goto again; @@ -834,7 +834,7 @@ static void cache_state_if_flags(struct extent_state *state, if (cached_ptr && !(*cached_ptr)) { if (!flags || (state->state & flags)) { *cached_ptr = state; - atomic_inc(&state->refs); + refcount_inc(&state->refs); } } } @@ -1538,7 +1538,7 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree, if (!found) { *start = state->start; *cached_state = state; - atomic_inc(&state->refs); + refcount_inc(&state->refs); } found++; *end = state->end; @@ -2004,16 +2004,11 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, u64 map_length = 0; u64 sector; struct btrfs_bio *bbio = NULL; - struct btrfs_mapping_tree *map_tree = 
&fs_info->mapping_tree; int ret; ASSERT(!(fs_info->sb->s_flags & MS_RDONLY)); BUG_ON(!mirror_num); - /* we can't repair anything in raid56 yet */ - if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) - return 0; - bio = btrfs_io_bio_alloc(GFP_NOFS, 1); if (!bio) return -EIO; @@ -2026,17 +2021,35 @@ int repair_io_failure(struct btrfs_inode *inode, u64 start, u64 length, * read repair operation. */ btrfs_bio_counter_inc_blocked(fs_info); - ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, - &map_length, &bbio, mirror_num); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; + if (btrfs_is_parity_mirror(fs_info, logical, length, mirror_num)) { + /* + * Note that we don't use BTRFS_MAP_WRITE because it's supposed + * to update all raid stripes, but here we just want to correct + * bad stripe, thus BTRFS_MAP_READ is abused to only get the bad + * stripe's dev and sector. + */ + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, + &map_length, &bbio, 0); + if (ret) { + btrfs_bio_counter_dec(fs_info); + bio_put(bio); + return -EIO; + } + ASSERT(bbio->mirror_num == 1); + } else { + ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, + &map_length, &bbio, mirror_num); + if (ret) { + btrfs_bio_counter_dec(fs_info); + bio_put(bio); + return -EIO; + } + BUG_ON(mirror_num != bbio->mirror_num); } - BUG_ON(mirror_num != bbio->mirror_num); - sector = bbio->stripes[mirror_num-1].physical >> 9; + + sector = bbio->stripes[bbio->mirror_num - 1].physical >> 9; bio->bi_iter.bi_sector = sector; - dev = bbio->stripes[mirror_num-1].dev; + dev = bbio->stripes[bbio->mirror_num - 1].dev; btrfs_put_bbio(bbio); if (!dev || !dev->bdev || !dev->writeable) { btrfs_bio_counter_dec(fs_info); @@ -2859,7 +2872,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, em = *em_cached; if (extent_map_in_tree(em) && start >= em->start && start < extent_map_end(em)) { - atomic_inc(&em->refs); + refcount_inc(&em->refs); return em; } @@ -2870,7 +2883,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); - atomic_inc(&em->refs); + refcount_inc(&em->refs); *em_cached = em; } return em; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3e4fad4a909d110d9283f979ccb9dec9a48c607c..1eafa2f0ede370ae802bb882557b3d4ad5c26340 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -2,6 +2,7 @@ #define __EXTENTIO__ #include +#include #include "ulist.h" /* bits for the extent state */ @@ -14,14 +15,17 @@ #define EXTENT_DEFRAG (1U << 6) #define EXTENT_BOUNDARY (1U << 9) #define EXTENT_NODATASUM (1U << 10) -#define EXTENT_DO_ACCOUNTING (1U << 11) +#define EXTENT_CLEAR_META_RESV (1U << 11) #define EXTENT_FIRST_DELALLOC (1U << 12) #define EXTENT_NEED_WAIT (1U << 13) #define EXTENT_DAMAGED (1U << 14) #define EXTENT_NORESERVE (1U << 15) #define EXTENT_QGROUP_RESERVED (1U << 16) #define EXTENT_CLEAR_DATA_RESV (1U << 17) +#define EXTENT_DELALLOC_NEW (1U << 18) #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) +#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \ + EXTENT_CLEAR_DATA_RESV) #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) /* @@ -143,7 +147,7 @@ struct extent_state { /* ADD NEW ELEMENTS AFTER THIS */ wait_queue_head_t wq; - atomic_t refs; + refcount_t refs; unsigned state; struct io_failure_record *failrec; diff --git a/fs/btrfs/extent_map.c 
b/fs/btrfs/extent_map.c index 26f9ac719d20b4bff1a6b0a456ca45dd1752b4c7..69850155870c067d82768c67f3895a2e7a7c487d 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -55,7 +55,7 @@ struct extent_map *alloc_extent_map(void) em->flags = 0; em->compress_type = BTRFS_COMPRESS_NONE; em->generation = 0; - atomic_set(&em->refs, 1); + refcount_set(&em->refs, 1); INIT_LIST_HEAD(&em->list); return em; } @@ -71,8 +71,8 @@ void free_extent_map(struct extent_map *em) { if (!em) return; - WARN_ON(atomic_read(&em->refs) == 0); - if (atomic_dec_and_test(&em->refs)) { + WARN_ON(refcount_read(&em->refs) == 0); + if (refcount_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) @@ -322,7 +322,7 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree, struct extent_map *em, int modified) { - atomic_inc(&em->refs); + refcount_inc(&em->refs); em->mod_start = em->start; em->mod_len = em->len; @@ -381,7 +381,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, if (strict && !(end > em->start && start < extent_map_end(em))) return NULL; - atomic_inc(&em->refs); + refcount_inc(&em->refs); return em; } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index eb8b8fae036bc3c67ceea03220cdca503626546f..a67b2def54131f10326c71092f80f2cd2d706212 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -2,6 +2,7 @@ #define __EXTENTMAP__ #include +#include #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) @@ -41,7 +42,7 @@ struct extent_map { */ struct map_lookup *map_lookup; }; - atomic_t refs; + refcount_t refs; unsigned int compress_type; struct list_head list; }; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 520cb7230b2d2cb5ca798c0030fa446957799456..da1096eb1a406f648b1bb0c7f3ee1da0e3013646 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1404,6 +1404,47 @@ static noinline int prepare_pages(struct inode *inode, struct page **pages, } +static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, + const u64 start, + const u64 len, + struct extent_state **cached_state) +{ + u64 search_start = start; + const u64 end = start + len - 1; + + while (search_start < end) { + const u64 search_len = end - search_start + 1; + struct extent_map *em; + u64 em_len; + int ret = 0; + + em = btrfs_get_extent(inode, NULL, 0, search_start, + search_len, 0); + if (IS_ERR(em)) + return PTR_ERR(em); + + if (em->block_start != EXTENT_MAP_HOLE) + goto next; + + em_len = em->len; + if (em->start < search_start) + em_len -= search_start - em->start; + if (em_len > search_len) + em_len = search_len; + + ret = set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, + NULL, cached_state, GFP_NOFS); +next: + search_start = extent_map_end(em); + free_extent_map(em); + if (ret) + return ret; + } + return 0; +} + /* * This function locks the extent and properly waits for data=ordered extents * to finish before allowing the pages to be modified if need. 
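btrfs_find_new_delalloc_bytes(), added just above, walks the extent maps of a write range and tags only the holes with EXTENT_DELALLOC_NEW, which feeds the new per-inode new_delalloc_bytes counter used later in this patch for stat(2)'s block count. The only subtle step is clamping each extent map to the search window; here is that intersection arithmetic in isolation, as a self-contained userspace check (simplified scalar parameters instead of struct extent_map):

#include <assert.h>
#include <stdint.h>

/*
 * Clamp an extent map [em_start, em_start + em_len) to the search
 * window starting at search_start with search_len bytes, mirroring
 * the adjustments in btrfs_find_new_delalloc_bytes().
 */
static uint64_t clamped_len(uint64_t em_start, uint64_t em_len,
			    uint64_t search_start, uint64_t search_len)
{
	if (em_start < search_start)
		em_len -= search_start - em_start;
	if (em_len > search_len)
		em_len = search_len;
	return em_len;
}

int main(void)
{
	/* map starts before the window: only the overlap counts */
	assert(clamped_len(0, 16384, 4096, 65536) == 12288);
	/* map longer than the window: clipped to the window */
	assert(clamped_len(4096, 1 << 20, 4096, 8192) == 8192);
	/* map fully inside the window: untouched */
	assert(clamped_len(8192, 4096, 4096, 65536) == 4096);
	return 0;
}

The clamp assumes the map overlaps the window, which holds here because btrfs_get_extent() is asked for a map at search_start and returns one covering it (a hole map if nothing is allocated).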
@@ -1432,8 +1473,11 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, + round_up(pos + write_bytes - start_pos, fs_info->sectorsize) - 1; - if (start_pos < inode->vfs_inode.i_size) { + if (start_pos < inode->vfs_inode.i_size || + (inode->flags & BTRFS_INODE_PREALLOC)) { struct btrfs_ordered_extent *ordered; + unsigned int clear_bits; + lock_extent_bits(&inode->io_tree, start_pos, last_pos, cached_state); ordered = btrfs_lookup_ordered_range(inode, start_pos, @@ -1454,11 +1498,19 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, } if (ordered) btrfs_put_ordered_extent(ordered); - + ret = btrfs_find_new_delalloc_bytes(inode, start_pos, + last_pos - start_pos + 1, + cached_state); + clear_bits = EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG; + if (ret) + clear_bits |= EXTENT_DELALLOC_NEW | EXTENT_LOCKED; clear_extent_bit(&inode->io_tree, start_pos, - last_pos, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - 0, 0, cached_state, GFP_NOFS); + last_pos, clear_bits, + (clear_bits & EXTENT_LOCKED) ? 1 : 0, + 0, cached_state, GFP_NOFS); + if (ret) + return ret; *lockstart = start_pos; *lockend = last_pos; ret = 1; @@ -2342,13 +2394,8 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) int ret = 0; em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, *start, *len, 0); - if (IS_ERR_OR_NULL(em)) { - if (!em) - ret = -ENOMEM; - else - ret = PTR_ERR(em); - return ret; - } + if (IS_ERR(em)) + return PTR_ERR(em); /* Hole or vacuum extent(only exists in no-hole mode) */ if (em->block_start == EXTENT_MAP_HOLE) { @@ -2835,11 +2882,8 @@ static long btrfs_fallocate(struct file *file, int mode, while (1) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, alloc_end - cur_offset, 0); - if (IS_ERR_OR_NULL(em)) { - if (!em) - ret = -ENOMEM; - else - ret = PTR_ERR(em); + if (IS_ERR(em)) { + ret = PTR_ERR(em); break; } last_byte = min(extent_map_end(em), alloc_end); @@ -2856,8 +2900,10 @@ static long btrfs_fallocate(struct file *file, int mode, } ret = btrfs_qgroup_reserve_data(inode, cur_offset, last_byte - cur_offset); - if (ret < 0) + if (ret < 0) { + free_extent_map(em); break; + } } else { /* * Do not need to reserve unwritten extent for this diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index da6841efac26b1be3509ad3e410c34e72b253a65..c5e6180cdb8c9250e3a525e311a42795c4831c32 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -355,7 +355,7 @@ static void io_ctl_map_page(struct btrfs_io_ctl *io_ctl, int clear) io_ctl->orig = io_ctl->cur; io_ctl->size = PAGE_SIZE; if (clear) - memset(io_ctl->cur, 0, PAGE_SIZE); + clear_page(io_ctl->cur); } static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index dd7fb22a955a6bff32be4632053bcd9086efe6e2..fc0bd84067582523b958db030a3439d1088f5832 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -167,8 +167,7 @@ static u8 *alloc_bitmap(u32 bitmap_size) if (mem) return mem; - return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(bitmap_size, GFP_NOFS | __GFP_ZERO, PAGE_KERNEL); } int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5e71f1ea3391b034dc8e6f55f62d82dbe76e9811..17cbe9306fafd9b9e7247bec4308494ed770b28b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ 
-115,6 +115,31 @@ static struct extent_map *create_io_em(struct inode *inode, u64 start, u64 len, u64 ram_bytes, int compress_type, int type); +static void __endio_write_update_ordered(struct inode *inode, + const u64 offset, const u64 bytes, + const bool uptodate); + +/* + * Clean up all submitted ordered extents in the specified range to handle errors + * from the fill_delalloc() callback. + * + * NOTE: caller must ensure that when an error happens, it cannot call + * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING + * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata + * to be released, which we want to happen only when finishing the ordered + * extent (btrfs_finish_ordered_io()). Also note that the caller of the + * fill_delalloc() callback already does proper cleanup for the first page of + * the range, that is, it invokes the callback writepage_end_io_hook() for the + * range of the first page. + */ +static inline void btrfs_cleanup_ordered_extents(struct inode *inode, + const u64 offset, + const u64 bytes) +{ + return __endio_write_update_ordered(inode, offset + PAGE_SIZE, + bytes - PAGE_SIZE, false); +} + static int btrfs_dirty_inode(struct inode *inode); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -547,7 +572,7 @@ static noinline void compress_file_range(struct inode *inode, } if (ret <= 0) { unsigned long clear_flags = EXTENT_DELALLOC | - EXTENT_DEFRAG; + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG; unsigned long page_error_op; clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0; @@ -565,8 +590,10 @@ static noinline void compress_file_range(struct inode *inode, PAGE_SET_WRITEBACK | page_error_op | PAGE_END_WRITEBACK); - btrfs_free_reserved_data_space_noquota(inode, start, - end - start + 1); + if (ret == 0) + btrfs_free_reserved_data_space_noquota(inode, + start, + end - start + 1); goto free_pages_out; } } @@ -852,6 +879,7 @@ static noinline void submit_compressed_extents(struct inode *inode, async_extent->start + async_extent->ram_size - 1, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK | @@ -918,10 +946,13 @@ static noinline int cow_file_range(struct inode *inode, u64 num_bytes; unsigned long ram_size; u64 disk_num_bytes; - u64 cur_alloc_size; + u64 cur_alloc_size = 0; u64 blocksize = fs_info->sectorsize; struct btrfs_key ins; struct extent_map *em; + unsigned clear_bits; + unsigned long page_ops; + bool extent_reserved = false; int ret = 0; if (btrfs_is_free_space_inode(BTRFS_I(inode))) { @@ -944,6 +975,7 @@ static noinline int cow_file_range(struct inode *inode, extent_clear_unlock_delalloc(inode, start, end, delalloc_end, NULL, EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_DEFRAG, PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); @@ -966,14 +998,14 @@ static noinline int cow_file_range(struct inode *inode, start + num_bytes - 1, 0); while (disk_num_bytes > 0) { - unsigned long op; - cur_alloc_size = disk_num_bytes; ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); if (ret < 0) goto out_unlock; + cur_alloc_size = ins.offset; + extent_reserved = true; ram_size = ins.offset; em = create_io_em(inode, start, ins.offset, /* len */ @@ -988,7 +1020,6 @@ static noinline int cow_file_range(struct inode *inode, goto out_reserve; free_extent_map(em); - cur_alloc_size = ins.offset; ret =
btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); if (ret) @@ -998,15 +1029,24 @@ static noinline int cow_file_range(struct inode *inode, BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); + /* + * Only drop cache here, and process as normal. + * + * We must not allow extent_clear_unlock_delalloc() + * at out_unlock label to free meta of this ordered + * extent, as its meta should be freed by + * btrfs_finish_ordered_io(). + * + * So we must continue until @start is increased to + * skip current ordered extent. + */ if (ret) - goto out_drop_extent_cache; + btrfs_drop_extent_cache(BTRFS_I(inode), start, + start + ram_size - 1, 0); } btrfs_dec_block_group_reservations(fs_info, ins.objectid); - if (disk_num_bytes < cur_alloc_size) - break; - /* we're not doing compressed IO, don't unlock the first * page (which the caller expects to stay locked), don't * clear any dirty bits and don't set any writeback bits @@ -1014,18 +1054,30 @@ static noinline int cow_file_range(struct inode *inode, * Do set the Private2 bit so we know this page was properly * setup for writepage */ - op = unlock ? PAGE_UNLOCK : 0; - op |= PAGE_SET_PRIVATE2; + page_ops = unlock ? PAGE_UNLOCK : 0; + page_ops |= PAGE_SET_PRIVATE2; extent_clear_unlock_delalloc(inode, start, start + ram_size - 1, delalloc_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC, - op); - disk_num_bytes -= cur_alloc_size; + page_ops); + if (disk_num_bytes < cur_alloc_size) + disk_num_bytes = 0; + else + disk_num_bytes -= cur_alloc_size; num_bytes -= cur_alloc_size; alloc_hint = ins.objectid + ins.offset; start += cur_alloc_size; + extent_reserved = false; + + /* + * btrfs_reloc_clone_csums() error, since start is increased + * extent_clear_unlock_delalloc() at out_unlock label won't + * free metadata of current ordered extent, we're OK to exit. + */ + if (ret) + goto out_unlock; } out: return ret; @@ -1036,12 +1088,35 @@ static noinline int cow_file_range(struct inode *inode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; + page_ops = PAGE_UNLOCK | PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK | + PAGE_END_WRITEBACK; + /* + * If we reserved an extent for our delalloc range (or a subrange) and + * failed to create the respective ordered extent, then it means that + * when we reserved the extent we decremented the extent's size from + * the data space_info's bytes_may_use counter and incremented the + * space_info's bytes_reserved counter by the same amount. We must make + * sure extent_clear_unlock_delalloc() does not try to decrement again + * the data space_info's bytes_may_use counter, therefore we do not pass + * it the flag EXTENT_CLEAR_DATA_RESV. 
+ */ + if (extent_reserved) { + extent_clear_unlock_delalloc(inode, start, + start + cur_alloc_size, + start + cur_alloc_size, + locked_page, + clear_bits, + page_ops); + start += cur_alloc_size; + if (start >= end) + goto out; + } extent_clear_unlock_delalloc(inode, start, end, delalloc_end, locked_page, - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DELALLOC | EXTENT_DEFRAG, - PAGE_UNLOCK | PAGE_CLEAR_DIRTY | - PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK); + clear_bits | EXTENT_CLEAR_DATA_RESV, + page_ops); goto out; } @@ -1414,15 +1489,14 @@ static noinline int run_delalloc_nocow(struct inode *inode, BUG_ON(ret); /* -ENOMEM */ if (root->root_key.objectid == - BTRFS_DATA_RELOC_TREE_OBJECTID) { + BTRFS_DATA_RELOC_TREE_OBJECTID) + /* + * Error handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler + * from freeing metadata of created ordered extent. + */ ret = btrfs_reloc_clone_csums(inode, cur_offset, num_bytes); - if (ret) { - if (!nolock && nocow) - btrfs_end_write_no_snapshoting(root); - goto error; - } - } extent_clear_unlock_delalloc(inode, cur_offset, cur_offset + num_bytes - 1, end, @@ -1434,6 +1508,14 @@ static noinline int run_delalloc_nocow(struct inode *inode, if (!nolock && nocow) btrfs_end_write_no_snapshoting(root); cur_offset = extent_end; + + /* + * btrfs_reloc_clone_csums() error, now we're OK to call error + * handler, as metadata for created ordered extent will only + * be freed by btrfs_finish_ordered_io(). + */ + if (ret) + goto error; if (cur_offset > end) break; } @@ -1509,6 +1591,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, ret = cow_file_range_async(inode, locked_page, start, end, page_started, nr_written); } + if (ret) + btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } @@ -1693,6 +1777,14 @@ static void btrfs_set_bit_hook(struct inode *inode, btrfs_add_delalloc_inodes(root, inode); spin_unlock(&BTRFS_I(inode)->lock); } + + if (!(state->state & EXTENT_DELALLOC_NEW) && + (*bits & EXTENT_DELALLOC_NEW)) { + spin_lock(&BTRFS_I(inode)->lock); + BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - + state->start; + spin_unlock(&BTRFS_I(inode)->lock); + } } /* @@ -1722,7 +1814,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, if (*bits & EXTENT_FIRST_DELALLOC) { *bits &= ~EXTENT_FIRST_DELALLOC; - } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { + } else if (!(*bits & EXTENT_CLEAR_META_RESV)) { spin_lock(&inode->lock); inode->outstanding_extents -= num_extents; spin_unlock(&inode->lock); @@ -1733,7 +1825,7 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, * don't need to call dellalloc_release_metadata if there is an * error. 
*/ - if (*bits & EXTENT_DO_ACCOUNTING && + if (*bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len); @@ -1741,10 +1833,9 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, if (btrfs_is_testing(fs_info)) return; - if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID - && do_list && !(state->state & EXTENT_NORESERVE) - && (*bits & (EXTENT_DO_ACCOUNTING | - EXTENT_CLEAR_DATA_RESV))) + if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID && + do_list && !(state->state & EXTENT_NORESERVE) && + (*bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota( &inode->vfs_inode, state->start, len); @@ -1759,6 +1850,14 @@ static void btrfs_clear_bit_hook(struct btrfs_inode *inode, btrfs_del_delalloc_inode(root, inode); spin_unlock(&inode->lock); } + + if ((state->state & EXTENT_DELALLOC_NEW) && + (*bits & EXTENT_DELALLOC_NEW)) { + spin_lock(&inode->lock); + ASSERT(inode->new_delalloc_bytes >= len); + inode->new_delalloc_bytes -= len; + spin_unlock(&inode->lock); + } } /* @@ -2791,6 +2890,13 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) u64 logical_len = ordered_extent->len; bool nolock; bool truncated = false; + bool range_locked = false; + bool clear_new_delalloc_bytes = false; + + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags)) + clear_new_delalloc_bytes = true; nolock = btrfs_is_free_space_inode(BTRFS_I(inode)); @@ -2839,6 +2945,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + range_locked = true; lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, &cached_state); @@ -2864,7 +2971,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) if (IS_ERR(trans)) { ret = PTR_ERR(trans); trans = NULL; - goto out_unlock; + goto out; } trans->block_rsv = &fs_info->delalloc_block_rsv; @@ -2896,7 +3003,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); - goto out_unlock; + goto out; } add_pending_csums(trans, inode, &ordered_extent->list); @@ -2905,14 +3012,26 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ret = btrfs_update_inode_fallback(trans, root, inode); if (ret) { /* -ENOMEM or corruption */ btrfs_abort_transaction(trans, ret); - goto out_unlock; + goto out; } ret = 0; -out_unlock: - unlock_extent_cached(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, &cached_state, GFP_NOFS); out: + if (range_locked || clear_new_delalloc_bytes) { + unsigned int clear_bits = 0; + + if (range_locked) + clear_bits |= EXTENT_LOCKED; + if (clear_new_delalloc_bytes) + clear_bits |= EXTENT_DELALLOC_NEW; + clear_extent_bit(&BTRFS_I(inode)->io_tree, + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1, + clear_bits, + (clear_bits & EXTENT_LOCKED) ? 
1 : 0, + 0, &cached_state, GFP_NOFS); + } + if (root != fs_info->tree_root) btrfs_delalloc_release_metadata(BTRFS_I(inode), ordered_extent->len); @@ -4401,9 +4520,17 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (extent_type != BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_num_bytes(leaf, fi); + + trace_btrfs_truncate_show_fi_regular( + BTRFS_I(inode), leaf, fi, + found_key.offset); } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { item_end += btrfs_file_extent_inline_len(leaf, path->slots[0], fi); + + trace_btrfs_truncate_show_fi_inline( + BTRFS_I(inode), leaf, fi, path->slots[0], + found_key.offset); } item_end--; } @@ -4603,13 +4730,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, btrfs_free_path(path); - if (err == 0) { - /* only inline file may have last_size != new_size */ - if (new_size >= fs_info->sectorsize || - new_size > fs_info->max_inline) - ASSERT(last_size == new_size); - } - if (be_nice && bytes_deleted > SZ_32M) { unsigned long updates = trans->delayed_ref_updates; if (updates) { @@ -6735,7 +6855,6 @@ static noinline int uncompress_inline(struct btrfs_path *path, * * This also copies inline extents directly into the page. */ - struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, u64 start, u64 len, @@ -6835,11 +6954,18 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, found_type == BTRFS_FILE_EXTENT_PREALLOC) { extent_end = extent_start + btrfs_file_extent_num_bytes(leaf, item); + + trace_btrfs_get_extent_show_fi_regular(inode, leaf, item, + extent_start); } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { size_t size; size = btrfs_file_extent_inline_len(leaf, path->slots[0], item); extent_end = ALIGN(extent_start + size, fs_info->sectorsize); + + trace_btrfs_get_extent_show_fi_inline(inode, leaf, item, + path->slots[0], + extent_start); } next: if (start >= extent_end) { @@ -7037,19 +7163,17 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, em = btrfs_get_extent(inode, page, pg_offset, start, len, create); if (IS_ERR(em)) return em; - if (em) { - /* - * if our em maps to - * - a hole or - * - a pre-alloc extent, - * there might actually be delalloc bytes behind it. - */ - if (em->block_start != EXTENT_MAP_HOLE && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return em; - else - hole_em = em; - } + /* + * If our em maps to: + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. 
+ */ + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + return em; + else + hole_em = em; /* check to see if we've wrapped (len == -1 or similar) */ end = start + len; @@ -8127,17 +8251,26 @@ static void btrfs_endio_direct_read(struct bio *bio) bio_put(bio); } -static void btrfs_endio_direct_write_update_ordered(struct inode *inode, - const u64 offset, - const u64 bytes, - const int uptodate) +static void __endio_write_update_ordered(struct inode *inode, + const u64 offset, const u64 bytes, + const bool uptodate) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered = NULL; + struct btrfs_workqueue *wq; + btrfs_work_func_t func; u64 ordered_offset = offset; u64 ordered_bytes = bytes; int ret; + if (btrfs_is_free_space_inode(BTRFS_I(inode))) { + wq = fs_info->endio_freespace_worker; + func = btrfs_freespace_write_helper; + } else { + wq = fs_info->endio_write_workers; + func = btrfs_endio_write_helper; + } + again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, @@ -8146,9 +8279,8 @@ static void btrfs_endio_direct_write_update_ordered(struct inode *inode, if (!ret) goto out_test; - btrfs_init_work(&ordered->work, btrfs_endio_write_helper, - finish_ordered_fn, NULL, NULL); - btrfs_queue_work(fs_info->endio_write_workers, &ordered->work); + btrfs_init_work(&ordered->work, func, finish_ordered_fn, NULL, NULL); + btrfs_queue_work(wq, &ordered->work); out_test: /* * our bio might span multiple ordered extents. If we haven't @@ -8166,10 +8298,8 @@ static void btrfs_endio_direct_write(struct bio *bio) struct btrfs_dio_private *dip = bio->bi_private; struct bio *dio_bio = dip->dio_bio; - btrfs_endio_direct_write_update_ordered(dip->inode, - dip->logical_offset, - dip->bytes, - !bio->bi_error); + __endio_write_update_ordered(dip->inode, dip->logical_offset, + dip->bytes, !bio->bi_error); kfree(dip); @@ -8530,10 +8660,10 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, io_bio = NULL; } else { if (write) - btrfs_endio_direct_write_update_ordered(inode, + __endio_write_update_ordered(inode, file_offset, dio_bio->bi_iter.bi_size, - 0); + false); else unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); @@ -8668,11 +8798,11 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) */ if (dio_data.unsubmitted_oe_range_start < dio_data.unsubmitted_oe_range_end) - btrfs_endio_direct_write_update_ordered(inode, + __endio_write_update_ordered(inode, dio_data.unsubmitted_oe_range_start, dio_data.unsubmitted_oe_range_end - dio_data.unsubmitted_oe_range_start, - 0); + false); } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, offset, count - (size_t)ret); @@ -8819,6 +8949,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (!inode_evicting) clear_extent_bit(tree, start, end, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DELALLOC_NEW | EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 0, &cached_state, GFP_NOFS); @@ -8876,8 +9007,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DIRTY | - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, 1, 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | + EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state, GFP_NOFS); __btrfs_releasepage(page, GFP_NOFS); @@ -9248,6 +9379,7 @@ struct inode 
*btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; + ei->new_delalloc_bytes = 0; ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; @@ -9313,6 +9445,7 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(BTRFS_I(inode)->outstanding_extents); WARN_ON(BTRFS_I(inode)->reserved_extents); WARN_ON(BTRFS_I(inode)->delalloc_bytes); + WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); WARN_ON(BTRFS_I(inode)->csum_bytes); WARN_ON(BTRFS_I(inode)->defrag_bytes); @@ -9436,7 +9569,7 @@ static int btrfs_getattr(const struct path *path, struct kstat *stat, stat->dev = BTRFS_I(inode)->root->anon_dev; spin_lock(&BTRFS_I(inode)->lock); - delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; + delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes; spin_unlock(&BTRFS_I(inode)->lock); stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + ALIGN(delalloc_bytes, blocksize)) >> 9; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dabfc7ac48a674db12252722c99509d9462dbe53..e176375f374f917be5c50a46ed1c94444d1949d3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1504,7 +1504,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { mnt_drop_write_file(file); return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } @@ -1619,7 +1619,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, kfree(vol_args); out: mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); mnt_drop_write_file(file); return ret; } @@ -2661,7 +2661,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; mutex_lock(&fs_info->volume_mutex); @@ -2680,7 +2680,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); return ret; } @@ -2708,7 +2708,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) if (vol_args->flags & ~BTRFS_VOL_ARG_V2_FLAGS_SUPPORTED) return -EOPNOTSUPP; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -2721,7 +2721,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) ret = btrfs_rm_device(fs_info, vol_args->name, 0); } mutex_unlock(&fs_info->volume_mutex); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) @@ -2752,7 +2752,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } @@ -2772,7 +2772,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) btrfs_info(fs_info, "disk deleted %s", vol_args->name); 
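The ioctl.c hunks in this area all make one substitution: the dedicated atomic mutually_exclusive_operation_running becomes the BTRFS_FS_EXCL_OP bit in fs_info->flags, claimed with test_and_set_bit() and released with clear_bit(), so resize, device add/remove, replace and balance keep excluding one another without a separate word of state. The shape of the guard, reduced to a sketch (DEMO_FS_EXCL_OP and demo_try_exclusive_op are illustrative names, not btrfs helpers):

#include <linux/bitops.h>
#include <linux/errno.h>

#define DEMO_FS_EXCL_OP	14	/* mirrors the BTRFS_FS_EXCL_OP bit number */

/*
 * test_and_set_bit() is atomic and acts as a full barrier, so at most
 * one exclusive operation can win the bit; everyone else backs off.
 */
static int demo_try_exclusive_op(unsigned long *fs_flags)
{
	if (test_and_set_bit(DEMO_FS_EXCL_OP, fs_flags))
		return -EBUSY;	/* btrfs: BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS */
	return 0;
}

static void demo_end_exclusive_op(unsigned long *fs_flags)
{
	clear_bit(DEMO_FS_EXCL_OP, fs_flags);
}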
kfree(vol_args); out: - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out_drop_write: mnt_drop_write_file(file); @@ -3539,12 +3539,9 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = kmalloc(fs_info->nodesize, GFP_KERNEL | __GFP_NOWARN); - if (!buf) { - buf = vmalloc(fs_info->nodesize); - if (!buf) - return ret; - } + buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); + if (!buf) + return ret; path = btrfs_alloc_path(); if (!path) { @@ -4442,13 +4439,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = -EROFS; goto out; } - if (atomic_xchg( - &fs_info->mutually_exclusive_operation_running, 1)) { + if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); - atomic_set( - &fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: @@ -4643,7 +4638,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) return ret; again: - if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); need_unlock = true; @@ -4689,7 +4684,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) } locked: - BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); + BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); @@ -4745,11 +4740,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) do_balance: /* - * Ownership of bctl and mutually_exclusive_operation_running + * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP * goes to btrfs_balance. bctl is freed in __cancel_balance, * or, if restriper was paused all the way until unmount, in - * free_fs_info. mutually_exclusive_operation_running is - * cleared in __cancel_balance. + * free_fs_info. The flag is cleared in __cancel_balance.
*/ need_unlock = false; @@ -4769,7 +4763,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); if (need_unlock) - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); out: mnt_drop_write_file(file); return ret; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 9a46878ba60fa973562139f32629810b476ed453..7b40e2e7292a41a047b0f8e300304822b7c63e74 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -212,7 +212,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); /* one ref for the tree */ - atomic_set(&entry->refs, 1); + refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); INIT_LIST_HEAD(&entry->root_extent_list); @@ -358,7 +358,7 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, out: if (!ret && cached && entry) { *cached = entry; - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; @@ -425,7 +425,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, out: if (!ret && cached && entry) { *cached = entry; - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); } spin_unlock_irqrestore(&tree->lock, flags); return ret == 0; @@ -456,7 +456,7 @@ void btrfs_get_logged_extents(struct btrfs_inode *inode, if (test_and_set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) continue; list_add(&ordered->log_list, logged_list); - atomic_inc(&ordered->refs); + refcount_inc(&ordered->refs); } spin_unlock_irq(&tree->lock); } @@ -565,7 +565,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); - if (atomic_dec_and_test(&entry->refs)) { + if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->log_list)); ASSERT(list_empty(&entry->trans_list)); ASSERT(list_empty(&entry->root_extent_list)); @@ -623,7 +623,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock(&fs_info->trans_lock); trans = fs_info->running_transaction; if (trans) - atomic_inc(&trans->use_count); + refcount_inc(&trans->use_count); spin_unlock(&fs_info->trans_lock); ASSERT(trans); @@ -690,7 +690,7 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nr, list_move_tail(&ordered->root_extent_list, &root->ordered_extents); - atomic_inc(&ordered->refs); + refcount_inc(&ordered->refs); spin_unlock(&root->ordered_extent_lock); btrfs_init_work(&ordered->flush_work, @@ -870,7 +870,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, if (!offset_in_entry(entry, file_offset)) entry = NULL; if (entry) - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); out: spin_unlock_irq(&tree->lock); return entry; @@ -911,7 +911,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( } out: if (entry) - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); spin_unlock_irq(&tree->lock); return entry; } @@ -948,7 +948,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) goto out; entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - atomic_inc(&entry->refs); + refcount_inc(&entry->refs); out: spin_unlock_irq(&tree->lock); return entry; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 195c93b67fe002861153fb58f6506fa93deb0434..e0c1d5b8d859c95c9e870b8feb735c921a09d8f1 100644 --- 
a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -113,7 +113,7 @@ struct btrfs_ordered_extent { int compress_type; /* reference count */ - atomic_t refs; + refcount_t refs; /* the inode we belong to */ struct inode *inode; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a59801dc2a340bcd3c5a34af9ef7277061967377..deffbeb74a0be7499c42578efc51903c3737f3bf 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -47,50 +47,6 @@ * - check all ioctl parameters */ -/* - * one struct for each qgroup, organized in fs_info->qgroup_tree. - */ -struct btrfs_qgroup { - u64 qgroupid; - - /* - * state - */ - u64 rfer; /* referenced */ - u64 rfer_cmpr; /* referenced compressed */ - u64 excl; /* exclusive */ - u64 excl_cmpr; /* exclusive compressed */ - - /* - * limits - */ - u64 lim_flags; /* which limits are set */ - u64 max_rfer; - u64 max_excl; - u64 rsv_rfer; - u64 rsv_excl; - - /* - * reservation tracking - */ - u64 reserved; - - /* - * lists - */ - struct list_head groups; /* groups this group is member of */ - struct list_head members; /* groups that are members of this group */ - struct list_head dirty; /* dirty groups */ - struct rb_node node; /* tree of qgroups */ - - /* - * temp variables for accounting operations - * Refer to qgroup_shared_accounting() for details. - */ - u64 old_refcnt; - u64 new_refcnt; -}; - static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, int mod) { @@ -1042,9 +998,12 @@ static void report_reserved_underflow(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes) { - btrfs_warn(fs_info, +#ifdef CONFIG_BTRFS_DEBUG + WARN_ON(qgroup->reserved < num_bytes); + btrfs_debug(fs_info, "qgroup %llu reserved space underflow, have: %llu, to free: %llu", qgroup->qgroupid, qgroup->reserved, num_bytes); +#endif qgroup->reserved = 0; } /* @@ -1075,7 +1034,8 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, qgroup->excl += sign * num_bytes; qgroup->excl_cmpr += sign * num_bytes; if (sign > 0) { - if (WARN_ON(qgroup->reserved < num_bytes)) + trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes); + if (qgroup->reserved < num_bytes) report_reserved_underflow(fs_info, qgroup, num_bytes); else qgroup->reserved -= num_bytes; @@ -1100,7 +1060,9 @@ static int __qgroup_excl_accounting(struct btrfs_fs_info *fs_info, WARN_ON(sign < 0 && qgroup->excl < num_bytes); qgroup->excl += sign * num_bytes; if (sign > 0) { - if (WARN_ON(qgroup->reserved < num_bytes)) + trace_qgroup_update_reserve(fs_info, qgroup, + -(s64)num_bytes); + if (qgroup->reserved < num_bytes) report_reserved_underflow(fs_info, qgroup, num_bytes); else @@ -2055,12 +2017,12 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, if (!ret) { /* - * Use (u64)-1 as time_seq to do special search, which + * Use SEQ_LAST as time_seq to do special search, which * doesn't lock tree or delayed_refs and search current * root. It's safe inside commit_transaction(). 
*/ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, (u64)-1, &new_roots); + record->bytenr, SEQ_LAST, &new_roots); if (ret < 0) goto cleanup; if (qgroup_to_skip) @@ -2367,6 +2329,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; int ret = 0; + int retried = 0; struct ulist_node *unode; struct ulist_iterator uiter; @@ -2375,7 +2338,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) if (num_bytes == 0) return 0; - +retry: spin_lock(&fs_info->qgroup_lock); quota_root = fs_info->quota_root; if (!quota_root) @@ -2402,6 +2365,27 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) qg = unode_aux_to_qgroup(unode); if (enforce && !qgroup_check_limits(qg, num_bytes)) { + /* + * Commit the tree and retry, since we may have + * deletions which would free up space. + */ + if (!retried && qg->reserved > 0) { + struct btrfs_trans_handle *trans; + + spin_unlock(&fs_info->qgroup_lock); + ret = btrfs_start_delalloc_inodes(root, 0); + if (ret) + return ret; + btrfs_wait_ordered_extents(root, -1, 0, (u64)-1); + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_commit_transaction(trans); + if (ret) + return ret; + retried++; + goto retry; + } ret = -EDQUOT; goto out; } @@ -2424,6 +2408,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce) qg = unode_aux_to_qgroup(unode); + trace_qgroup_update_reserve(fs_info, qg, num_bytes); qg->reserved += num_bytes; } @@ -2469,7 +2454,8 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, qg = unode_aux_to_qgroup(unode); - if (WARN_ON(qg->reserved < num_bytes)) + trace_qgroup_update_reserve(fs_info, qg, -(s64)num_bytes); + if (qg->reserved < num_bytes) report_reserved_underflow(fs_info, qg, num_bytes); else qg->reserved -= num_bytes; @@ -2487,18 +2473,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->qgroup_lock); } -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) -{ - if (list_empty(&trans->qgroup_ref_list) && !trans->delayed_ref_elem.seq) - return; - btrfs_err(trans->fs_info, - "qgroups not uptodate in trans handle %p: list is%s empty, seq is %#x.%x", - trans, list_empty(&trans->qgroup_ref_list) ? "" : " not", - (u32)(trans->delayed_ref_elem.seq >> 32), - (u32)trans->delayed_ref_elem.seq); - BUG(); -} - /* * returns < 0 on error, 0 when more leaves are to be scanned. * returns 1 when done.
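/*
 * Userspace model of the retry-once logic added to qgroup_reserve() above:
 * when the limit check fails while some space is still reserved, flush and
 * commit once (stubbed out here) and retry before giving up with -EDQUOT.
 * This is an illustrative sketch with simplified fields, not kernel code.
 */
#include <errno.h>
#include <stdio.h>

struct qgroup {
	unsigned long long max_rfer;	/* referenced-space limit */
	unsigned long long rfer;	/* referenced space */
	unsigned long long reserved;	/* outstanding reservations */
};

/* Stand-in for the delalloc flush, ordered-extent wait and commit. */
static int flush_and_commit(struct qgroup *qg)
{
	qg->reserved = 0;
	return 0;
}

static int qgroup_reserve(struct qgroup *qg, unsigned long long num_bytes)
{
	int retried = 0;

retry:
	if (qg->rfer + qg->reserved + num_bytes > qg->max_rfer) {
		if (!retried && qg->reserved > 0) {
			if (flush_and_commit(qg))
				return -1;
			retried++;
			goto retry;	/* the commit may have freed space */
		}
		return -EDQUOT;
	}
	qg->reserved += num_bytes;
	return 0;
}

int main(void)
{
	struct qgroup qg = { .max_rfer = 1024, .rfer = 0, .reserved = 900 };

	/* Fails the first pass, succeeds after the simulated commit. */
	printf("reserve 200 -> %d\n", qgroup_reserve(&qg, 200));
	return 0;
}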
@@ -2886,14 +2860,14 @@ static int __btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len, if (ret < 0) goto out; - if (free) { - btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, - BTRFS_I(inode)->root->objectid, - changeset.bytes_changed); + if (free) trace_op = QGROUP_FREE; - } trace_btrfs_qgroup_release_data(inode, start, len, changeset.bytes_changed, trace_op); + if (free) + btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info, + BTRFS_I(inode)->root->objectid, + changeset.bytes_changed); out: ulist_release(&changeset.range_changed); return ret; @@ -2945,6 +2919,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); + trace_qgroup_meta_reserve(root, (s64)num_bytes); ret = qgroup_reserve(root, num_bytes, enforce); if (ret < 0) return ret; @@ -2964,6 +2939,7 @@ void btrfs_qgroup_free_meta_all(struct btrfs_root *root) reserved = atomic64_xchg(&root->qgroup_meta_rsv, 0); if (reserved == 0) return; + trace_qgroup_meta_reserve(root, -(s64)reserved); btrfs_qgroup_free_refroot(fs_info, root->objectid, reserved); } @@ -2978,6 +2954,7 @@ void btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes) BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); WARN_ON(atomic64_read(&root->qgroup_meta_rsv) < num_bytes); atomic64_sub(num_bytes, &root->qgroup_meta_rsv); + trace_qgroup_meta_reserve(root, -(s64)num_bytes); btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes); } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 26932a8a19930bc48ff14ad3c154a7326ce3a111..fe04d3f295c67f0b97b79095885878237260a274 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -61,6 +61,50 @@ struct btrfs_qgroup_extent_record { struct ulist *old_roots; }; +/* + * one struct for each qgroup, organized in fs_info->qgroup_tree. + */ +struct btrfs_qgroup { + u64 qgroupid; + + /* + * state + */ + u64 rfer; /* referenced */ + u64 rfer_cmpr; /* referenced compressed */ + u64 excl; /* exclusive */ + u64 excl_cmpr; /* exclusive compressed */ + + /* + * limits + */ + u64 lim_flags; /* which limits are set */ + u64 max_rfer; + u64 max_excl; + u64 rsv_rfer; + u64 rsv_excl; + + /* + * reservation tracking + */ + u64 reserved; + + /* + * lists + */ + struct list_head groups; /* groups this group is member of */ + struct list_head members; /* groups that are members of this group */ + struct list_head dirty; /* dirty groups */ + struct rb_node node; /* tree of qgroups */ + + /* + * temp variables for accounting operations + * Refer to qgroup_shared_accounting() for details. + */ + u64 old_refcnt; + u64 new_refcnt; +}; + /* * For qgroup event trace points only */ @@ -186,17 +230,12 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, struct btrfs_qgroup_inherit *inherit); void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes); -/* - * TODO: Add proper trace point for it, as btrfs_qgroup_free() is - * called by everywhere, can't provide good trace for delayed ref case. 
- */ static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes) { - btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes); + btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes); } -void assert_qgroups_uptodate(struct btrfs_trans_handle *trans); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 1571bf26dc077a0575b39977ead28e229d68550f..d8ea0eb76325e9b25d42dfa4a99c63918981aa2f 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -149,7 +149,7 @@ struct btrfs_raid_bio { int generic_bio_cnt; - atomic_t refs; + refcount_t refs; atomic_t stripes_pending; @@ -389,7 +389,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) if (bio_list_empty(&rbio->bio_list)) { if (!list_empty(&rbio->hash_list)) { list_del_init(&rbio->hash_list); - atomic_dec(&rbio->refs); + refcount_dec(&rbio->refs); BUG_ON(!list_empty(&rbio->plug_list)); } } @@ -480,7 +480,7 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) /* bump our ref if we were not in the list before */ if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) - atomic_inc(&rbio->refs); + refcount_inc(&rbio->refs); if (!list_empty(&rbio->stripe_cache)){ list_move(&rbio->stripe_cache, &table->stripe_cache); @@ -689,7 +689,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) test_bit(RBIO_CACHE_BIT, &cur->flags) && !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { list_del_init(&cur->hash_list); - atomic_dec(&cur->refs); + refcount_dec(&cur->refs); steal_rbio(cur, rbio); cache_drop = cur; @@ -738,7 +738,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) } } lockit: - atomic_inc(&rbio->refs); + refcount_inc(&rbio->refs); list_add(&rbio->hash_list, &h->hash_list); out: spin_unlock_irqrestore(&h->lock, flags); @@ -784,7 +784,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) } list_del_init(&rbio->hash_list); - atomic_dec(&rbio->refs); + refcount_dec(&rbio->refs); /* * we use the plug list to hold all the rbios @@ -801,7 +801,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) list_del_init(&rbio->plug_list); list_add(&next->hash_list, &h->hash_list); - atomic_inc(&next->refs); + refcount_inc(&next->refs); spin_unlock(&rbio->bio_list_lock); spin_unlock_irqrestore(&h->lock, flags); @@ -843,8 +843,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio) { int i; - WARN_ON(atomic_read(&rbio->refs) < 0); - if (!atomic_dec_and_test(&rbio->refs)) + if (!refcount_dec_and_test(&rbio->refs)) return; WARN_ON(!list_empty(&rbio->stripe_cache)); @@ -997,7 +996,7 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->stripe_npages = stripe_npages; rbio->faila = -1; rbio->failb = -1; - atomic_set(&rbio->refs, 1); + refcount_set(&rbio->refs, 1); atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); @@ -2118,6 +2117,11 @@ int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio, struct btrfs_raid_bio *rbio; int ret; + if (generic_io) { + ASSERT(bbio->mirror_num == mirror_num); + btrfs_io_bio(bio)->mirror_num = mirror_num; + } + rbio = alloc_rbio(fs_info, bbio, stripe_len); if (IS_ERR(rbio)) { if (generic_io) @@ -2194,6 +2198,8 @@ static void read_rebuild_work(struct btrfs_work *work) /* * The following code is used to scrub/replace the parity stripe * + * Caller must have already 
increased bio_counter when getting @bbio. + * + * Note: We need to make sure that all the pages added into the scrub/replace + * raid bio are correct and will not be changed during the scrub/replace. That + * is, those pages just hold metadata or file data with checksums. @@ -2231,6 +2237,12 @@ raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, ASSERT(rbio->stripe_npages == stripe_nsectors); bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + /* + * We have already increased bio_counter when getting bbio, record it + * so we can free it at rbio_orig_end_io(). + */ + rbio->generic_bio_cnt = 1; + return rbio; } @@ -2673,6 +2685,12 @@ raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio, return NULL; } + /* + * When we get bbio, we have already increased bio_counter, record it + * so we can free it at rbio_orig_end_io(). + */ + rbio->generic_bio_cnt = 1; + return rbio; } diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index e88bca87f5d275c7c25b2ee6279fe4cdc1482413..a17e775a4a89fca1e48214c5abb02756dd1c69d6 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -209,9 +209,9 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info, return; } -int btree_readahead_hook(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb, int err) +int btree_readahead_hook(struct extent_buffer *eb, int err) { + struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; struct reada_extent *re; @@ -235,10 +235,10 @@ int btree_readahead_hook(struct btrfs_fs_info *fs_info, return ret; } -static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev, u64 logical, +static struct reada_zone *reada_find_zone(struct btrfs_device *dev, u64 logical, struct btrfs_bio *bbio) { + struct btrfs_fs_info *fs_info = dev->fs_info; int ret; struct reada_zone *zone; struct btrfs_block_group_cache *cache = NULL; @@ -270,6 +270,12 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, if (!zone) return NULL; + ret = radix_tree_preload(GFP_KERNEL); + if (ret) { + kfree(zone); + return NULL; + } + zone->start = start; zone->end = end; INIT_LIST_HEAD(&zone->list); @@ -299,6 +305,7 @@ static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info, zone = NULL; } spin_unlock(&fs_info->reada_lock); + radix_tree_preload_end(); return zone; } @@ -313,7 +320,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, struct btrfs_bio *bbio = NULL; struct btrfs_device *dev; struct btrfs_device *prev_dev; - u32 blocksize; u64 length; int real_stripes; int nzones = 0; @@ -334,7 +340,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!re) return NULL; - blocksize = fs_info->nodesize; re->logical = logical; re->top = *top; INIT_LIST_HEAD(&re->extctl); @@ -344,10 +349,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, /* * map block */ - length = blocksize; + length = fs_info->nodesize; ret = btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &length, &bbio, 0); - if (ret || !bbio || length < blocksize) + if (ret || !bbio || length < fs_info->nodesize) goto error; if (bbio->num_stripes > BTRFS_MAX_MIRRORS) { @@ -367,7 +372,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, if (!dev->bdev) continue; - zone = reada_find_zone(fs_info, dev, logical, bbio); + zone = reada_find_zone(dev, logical, bbio); if (!zone) continue; @@ -386,6 +391,10 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info
*fs_info, goto error; } + ret = radix_tree_preload(GFP_KERNEL); + if (ret) + goto error; + /* insert extent in reada_tree + all per-device trees, all or nothing */ btrfs_dev_replace_lock(&fs_info->dev_replace, 0); spin_lock(&fs_info->reada_lock); @@ -395,13 +404,16 @@ static struct reada_extent *reada_find_extent(struct btrfs_fs_info *fs_info, re_exist->refcnt++; spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + radix_tree_preload_end(); goto error; } if (ret) { spin_unlock(&fs_info->reada_lock); btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); + radix_tree_preload_end(); goto error; } + radix_tree_preload_end(); prev_dev = NULL; dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing( &fs_info->dev_replace); @@ -639,9 +651,9 @@ static int reada_pick_zone(struct btrfs_device *dev) return 1; } -static int reada_start_machine_dev(struct btrfs_fs_info *fs_info, - struct btrfs_device *dev) +static int reada_start_machine_dev(struct btrfs_device *dev) { + struct btrfs_fs_info *fs_info = dev->fs_info; struct reada_extent *re = NULL; int mirror_num = 0; struct extent_buffer *eb = NULL; @@ -754,8 +766,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) - enqueued += reada_start_machine_dev(fs_info, - device); + enqueued += reada_start_machine_dev(device); } mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index a08224eab8b47111b2fc6f7f51ac33e6ca69814e..7d6bc308bf4308f653cc300c1354602ddd0df911 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -501,8 +501,9 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_root_item *item = &root->root_item; - struct timespec ct = current_fs_time(root->fs_info->sb); + struct timespec ct; + ktime_get_real_ts(&ct); spin_lock(&root->root_item_lock); btrfs_set_root_ctransid(item, trans->transid); btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b0251eb1239fce83226650be88c31122a9f108af..c7b45eb2403d09e94b2538dabcb5a1f0116c55dd 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -64,7 +64,7 @@ struct scrub_ctx; #define SCRUB_MAX_PAGES_PER_BLOCK 16 /* 64k per node/leaf/sector */ struct scrub_recover { - atomic_t refs; + refcount_t refs; struct btrfs_bio *bbio; u64 map_length; }; @@ -112,7 +112,7 @@ struct scrub_block { struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; int page_count; atomic_t outstanding_pages; - atomic_t refs; /* free mem on transition to zero */ + refcount_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; struct scrub_parity *sparity; struct { @@ -140,9 +140,9 @@ struct scrub_parity { int nsectors; - int stripe_len; + u64 stripe_len; - atomic_t refs; + refcount_t refs; struct list_head spages; @@ -202,7 +202,7 @@ struct scrub_ctx { * doesn't free the scrub context before or while the workers are * doing the wakeup() call. 
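/*
 * Userspace sketch of the atomic_t -> refcount_t conversions running through
 * this series.  refcount_t keeps the inc/dec_and_test shape of atomic_t but
 * is allowed to catch misuse such as incrementing a counter that has already
 * dropped to zero.  The functions below only model that contract with C11
 * atomics and assert(); the real refcount_t saturates and warns instead.
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

typedef struct { atomic_int refs; } refcount_t;

static void refcount_set(refcount_t *r, int n)
{
	atomic_store(&r->refs, n);
}

static void refcount_inc(refcount_t *r)
{
	/* Incrementing from zero would resurrect a freed object. */
	assert(atomic_fetch_add(&r->refs, 1) > 0);
}

static bool refcount_dec_and_test(refcount_t *r)
{
	int old = atomic_fetch_sub(&r->refs, 1);

	assert(old > 0);	/* underflow means a use-after-free */
	return old == 1;	/* true exactly once: time to free */
}

int main(void)
{
	refcount_t r;

	refcount_set(&r, 1);		/* one ref for the tree, as above */
	refcount_inc(&r);		/* a lookup takes a second ref */
	if (refcount_dec_and_test(&r))
		return 1;		/* not yet: the tree holds a ref */
	return refcount_dec_and_test(&r) ? 0 : 1;
}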
*/ - atomic_t refs; + refcount_t refs; }; struct scrub_fixup_nodatasum { @@ -240,6 +240,13 @@ struct scrub_warning { struct btrfs_device *dev; }; +struct full_stripe_lock { + struct rb_node node; + u64 logical; + u64 refs; + struct mutex mutex; +}; + static void scrub_pending_bio_inc(struct scrub_ctx *sctx); static void scrub_pending_bio_dec(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); @@ -305,7 +312,7 @@ static void scrub_put_ctx(struct scrub_ctx *sctx); static void scrub_pending_bio_inc(struct scrub_ctx *sctx) { - atomic_inc(&sctx->refs); + refcount_inc(&sctx->refs); atomic_inc(&sctx->bios_in_flight); } @@ -348,6 +355,222 @@ static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) scrub_pause_off(fs_info); } +/* + * Insert new full stripe lock into full stripe locks tree + * + * Return pointer to existing or newly inserted full_stripe_lock structure if + * everything works well. + * Return ERR_PTR(-ENOMEM) if we failed to allocate memory + * + * NOTE: caller must hold full_stripe_locks_root->lock before calling this + * function + */ +static struct full_stripe_lock *insert_full_stripe_lock( + struct btrfs_full_stripe_locks_tree *locks_root, + u64 fstripe_logical) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct full_stripe_lock *entry; + struct full_stripe_lock *ret; + + WARN_ON(!mutex_is_locked(&locks_root->lock)); + + p = &locks_root->root.rb_node; + while (*p) { + parent = *p; + entry = rb_entry(parent, struct full_stripe_lock, node); + if (fstripe_logical < entry->logical) { + p = &(*p)->rb_left; + } else if (fstripe_logical > entry->logical) { + p = &(*p)->rb_right; + } else { + entry->refs++; + return entry; + } + } + + /* Insert new lock */ + ret = kmalloc(sizeof(*ret), GFP_KERNEL); + if (!ret) + return ERR_PTR(-ENOMEM); + ret->logical = fstripe_logical; + ret->refs = 1; + mutex_init(&ret->mutex); + + rb_link_node(&ret->node, parent, p); + rb_insert_color(&ret->node, &locks_root->root); + return ret; +} + +/* + * Search for a full stripe lock of a block group + * + * Return pointer to existing full stripe lock if found + * Return NULL if not found + */ +static struct full_stripe_lock *search_full_stripe_lock( + struct btrfs_full_stripe_locks_tree *locks_root, + u64 fstripe_logical) +{ + struct rb_node *node; + struct full_stripe_lock *entry; + + WARN_ON(!mutex_is_locked(&locks_root->lock)); + + node = locks_root->root.rb_node; + while (node) { + entry = rb_entry(node, struct full_stripe_lock, node); + if (fstripe_logical < entry->logical) + node = node->rb_left; + else if (fstripe_logical > entry->logical) + node = node->rb_right; + else + return entry; + } + return NULL; +} + +/* + * Helper to get full stripe logical from a normal bytenr. + * + * Caller must ensure @cache is a RAID56 block group. + */ +static u64 get_full_stripe_logical(struct btrfs_block_group_cache *cache, + u64 bytenr) +{ + u64 ret; + + /* + * Due to chunk item size limit, full stripe length should not be + * larger than U32_MAX. Just a sanity check here. + */ + WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX); + + /* + * round_down() can only handle power of 2, while RAID56 full + * stripe length can be 64KiB * n, so we need to manually round down. 
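/*
 * Worked example of the manual round-down performed just below in
 * get_full_stripe_logical().  With a 64KiB stripe and 3 data stripes the
 * full stripe length is 192KiB, which is not a power of two, so
 * round_down() cannot be used.  All values here are made up for
 * illustration.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t full_stripe_start(uint64_t bg_start, uint64_t full_stripe_len,
				  uint64_t bytenr)
{
	/* divide, truncate, multiply back: a round-down for any length */
	return (bytenr - bg_start) / full_stripe_len * full_stripe_len +
	       bg_start;
}

int main(void)
{
	uint64_t bg_start = 1ULL << 30;		/* block group logical start */
	uint64_t fsl = 3 * 64 * 1024;		/* 192KiB, not a power of 2 */
	uint64_t bytenr = bg_start + 500 * 1024;

	/* 500KiB into the group rounds down to the full stripe at 384KiB. */
	printf("%" PRIu64 "\n", full_stripe_start(bg_start, fsl, bytenr));
	return 0;
}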
+ */ + ret = div64_u64(bytenr - cache->key.objectid, cache->full_stripe_len) * + cache->full_stripe_len + cache->key.objectid; + return ret; +} + +/* + * Lock a full stripe to prevent concurrent recovery and reads + * + * It's only used for profiles with parities (RAID5/6); for other profiles it + * does nothing. + * + * Return 0 if we locked the full stripe covering @bytenr, with a mutex held. + * The caller must then call unlock_full_stripe() in the same context. + * + * Return <0 if an error is encountered. + */ +static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, + bool *locked_ret) +{ + struct btrfs_block_group_cache *bg_cache; + struct btrfs_full_stripe_locks_tree *locks_root; + struct full_stripe_lock *existing; + u64 fstripe_start; + int ret = 0; + + *locked_ret = false; + bg_cache = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg_cache) { + ASSERT(0); + return -ENOENT; + } + + /* Profiles not based on parity don't need full stripe lock */ + if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) + goto out; + locks_root = &bg_cache->full_stripe_locks_root; + + fstripe_start = get_full_stripe_logical(bg_cache, bytenr); + + /* Now insert the full stripe lock */ + mutex_lock(&locks_root->lock); + existing = insert_full_stripe_lock(locks_root, fstripe_start); + mutex_unlock(&locks_root->lock); + if (IS_ERR(existing)) { + ret = PTR_ERR(existing); + goto out; + } + mutex_lock(&existing->mutex); + *locked_ret = true; +out: + btrfs_put_block_group(bg_cache); + return ret; +} + +/* + * Unlock a full stripe. + * + * NOTE: The caller must ensure it is in the same context as the + * corresponding lock_full_stripe(). + * + * Return 0 if we unlock the full stripe without a problem. + * Return <0 on error. + */ +static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr, + bool locked) +{ + struct btrfs_block_group_cache *bg_cache; + struct btrfs_full_stripe_locks_tree *locks_root; + struct full_stripe_lock *fstripe_lock; + u64 fstripe_start; + bool freeit = false; + int ret = 0; + + /* If we didn't acquire full stripe lock, no need to continue */ + if (!locked) + return 0; + + bg_cache = btrfs_lookup_block_group(fs_info, bytenr); + if (!bg_cache) { + ASSERT(0); + return -ENOENT; + } + if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK)) + goto out; + + locks_root = &bg_cache->full_stripe_locks_root; + fstripe_start = get_full_stripe_logical(bg_cache, bytenr); + + mutex_lock(&locks_root->lock); + fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start); + /* Unpaired unlock_full_stripe() detected */ + if (!fstripe_lock) { + WARN_ON(1); + ret = -ENOENT; + mutex_unlock(&locks_root->lock); + goto out; + } + + if (fstripe_lock->refs == 0) { + WARN_ON(1); + btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow", + fstripe_lock->logical); + } else { + fstripe_lock->refs--; + } + + if (fstripe_lock->refs == 0) { + rb_erase(&fstripe_lock->node, &locks_root->root); + freeit = true; + } + mutex_unlock(&locks_root->lock); + + mutex_unlock(&fstripe_lock->mutex); + if (freeit) + kfree(fstripe_lock); +out: + btrfs_put_block_group(bg_cache); + return ret; +} + /* * used for workers that require transaction commits (i.e., for the * NOCOW case) @@ -356,7 +579,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx) { struct btrfs_fs_info *fs_info = sctx->fs_info; - atomic_inc(&sctx->refs); + refcount_inc(&sctx->refs); /* * increment scrubs_running to prevent cancel requests from * completing as long as a worker is running.
we must also @@ -447,7 +670,7 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) static void scrub_put_ctx(struct scrub_ctx *sctx) { - if (atomic_dec_and_test(&sctx->refs)) + if (refcount_dec_and_test(&sctx->refs)) scrub_free_ctx(sctx); } @@ -462,7 +685,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) sctx = kzalloc(sizeof(*sctx), GFP_KERNEL); if (!sctx) goto nomem; - atomic_set(&sctx->refs, 1); + refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; sctx->pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO; sctx->curr = -1; @@ -857,12 +1080,14 @@ static void scrub_fixup_nodatasum(struct btrfs_work *work) static inline void scrub_get_recover(struct scrub_recover *recover) { - atomic_inc(&recover->refs); + refcount_inc(&recover->refs); } -static inline void scrub_put_recover(struct scrub_recover *recover) +static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, + struct scrub_recover *recover) { - if (atomic_dec_and_test(&recover->refs)) { + if (refcount_dec_and_test(&recover->refs)) { + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(recover->bbio); kfree(recover); } @@ -892,6 +1117,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int mirror_index; int page_num; int success; + bool full_stripe_locked; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -917,6 +1143,24 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) have_csum = sblock_to_check->pagev[0]->have_csum; dev = sblock_to_check->pagev[0]->dev; + /* + * For RAID5/6, a race can happen between the scrub threads of + * different devices: on data corruption, the parity and data scrub + * threads will both try to recover the data. + * Such a race can lead to doubly added csum errors, or even an + * unrecoverable error.
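/*
 * Userspace sketch of the caller contract for the full stripe lock used
 * below: lock_full_stripe() reports through *locked whether it actually took
 * a lock (profiles without parity take none), and the caller must hand that
 * same flag back to unlock_full_stripe() in the same context.  The single
 * static mutex stands in for the per-block-group rbtree of per-full-stripe
 * mutexes; this illustrates the pairing only, not the kernel code.
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t fstripe_mutex = PTHREAD_MUTEX_INITIALIZER;

static int lock_full_stripe(bool is_raid56, bool *locked)
{
	*locked = false;
	if (!is_raid56)
		return 0;	/* nothing to lock for non-parity profiles */
	pthread_mutex_lock(&fstripe_mutex);
	*locked = true;
	return 0;
}

static int unlock_full_stripe(bool locked)
{
	if (!locked)
		return 0;	/* we never took the lock, nothing to drop */
	pthread_mutex_unlock(&fstripe_mutex);
	return 0;
}

int main(void)
{
	bool locked;

	if (lock_full_stripe(true, &locked) < 0)
		return 1;
	puts("recheck/repair the block while the full stripe is held");
	return unlock_full_stripe(locked);
}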
+ */ + ret = lock_full_stripe(fs_info, logical, &full_stripe_locked); + if (ret < 0) { + spin_lock(&sctx->stat_lock); + if (ret == -ENOMEM) + sctx->stat.malloc_errors++; + sctx->stat.read_errors++; + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + return ret; + } + if (sctx->is_dev_replace && !is_metadata && !have_csum) { sblocks_for_recheck = NULL; goto nodatasum_case; @@ -1241,7 +1485,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) sblock->pagev[page_index]->sblock = NULL; recover = sblock->pagev[page_index]->recover; if (recover) { - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); sblock->pagev[page_index]->recover = NULL; } @@ -1251,6 +1495,9 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) kfree(sblocks_for_recheck); } + ret = unlock_full_stripe(fs_info, logical, full_stripe_locked); + if (ret < 0) + return ret; return 0; } @@ -1330,20 +1577,23 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, * with a length of PAGE_SIZE, each returned stripe * represents one mirror */ + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &mapped_length, &bbio, 0, 1); + logical, &mapped_length, &bbio); if (ret || !bbio || mapped_length < sublen) { btrfs_put_bbio(bbio); + btrfs_bio_counter_dec(fs_info); return -EIO; } recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); if (!recover) { btrfs_put_bbio(bbio); + btrfs_bio_counter_dec(fs_info); return -ENOMEM; } - atomic_set(&recover->refs, 1); + refcount_set(&recover->refs, 1); recover->bbio = bbio; recover->map_length = mapped_length; @@ -1365,7 +1615,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; spin_unlock(&sctx->stat_lock); - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); return -ENOMEM; } scrub_page_get(page); @@ -1407,7 +1657,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, scrub_get_recover(recover); page->recover = recover; } - scrub_put_recover(recover); + scrub_put_recover(fs_info, recover); length -= sublen; logical += sublen; page_index++; @@ -1497,14 +1747,18 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, bio_add_page(bio, page->page, PAGE_SIZE, 0); if (!retry_failed_mirror && scrub_is_page_on_raid56(page)) { - if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) + if (scrub_submit_raid56_bio_wait(fs_info, bio, page)) { + page->io_error = 1; sblock->no_io_error_seen = 0; + } } else { bio->bi_iter.bi_sector = page->physical >> 9; bio_set_op_attrs(bio, REQ_OP_READ, 0); - if (btrfsic_submit_bio_wait(bio)) + if (btrfsic_submit_bio_wait(bio)) { + page->io_error = 1; sblock->no_io_error_seen = 0; + } } bio_put(bio); @@ -1634,7 +1888,7 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, if (spage->io_error) { void *mapped_buffer = kmap_atomic(spage->page); - memset(mapped_buffer, 0, PAGE_SIZE); + clear_page(mapped_buffer); flush_dcache_page(spage->page); kunmap_atomic(mapped_buffer); } @@ -1998,12 +2252,12 @@ static int scrub_checksum_super(struct scrub_block *sblock) static void scrub_block_get(struct scrub_block *sblock) { - atomic_inc(&sblock->refs); + refcount_inc(&sblock->refs); } static void scrub_block_put(struct scrub_block *sblock) { - if (atomic_dec_and_test(&sblock->refs)) { + if (refcount_dec_and_test(&sblock->refs)) { int i; if (sblock->sparity) @@ -2187,8 +2441,9 @@ 
static void scrub_missing_raid56_pages(struct scrub_block *sblock) int ret; int i; + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, - &length, &bbio, 0, 1); + &length, &bbio); if (ret || !bbio || !bbio->raid_map) goto bbio_out; @@ -2231,6 +2486,7 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) rbio_out: bio_put(bio); bbio_out: + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(bbio); spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2255,7 +2511,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->refs, 1); + refcount_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; @@ -2385,7 +2641,7 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, unsigned long *bitmap, u64 start, u64 len) { - u32 offset; + u64 offset; int nsectors; int sectorsize = sparity->sctx->fs_info->sectorsize; @@ -2395,8 +2651,8 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, } start -= sparity->logic_start; - start = div_u64_rem(start, sparity->stripe_len, &offset); - offset /= sectorsize; + start = div64_u64_rem(start, sparity->stripe_len, &offset); + offset = div_u64(offset, sectorsize); nsectors = (int)len / sectorsize; if (offset + nsectors <= sparity->nsectors) { @@ -2555,7 +2811,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, /* one ref inside this function, plus one for each page added to * a bio later on */ - atomic_set(&sblock->refs, 1); + refcount_set(&sblock->refs, 1); sblock->sctx = sctx; sblock->no_io_error_seen = 1; sblock->sparity = sparity; @@ -2694,7 +2950,7 @@ static int get_raid56_logic_offset(u64 physical, int num, for (i = 0; i < nr_data_stripes(map); i++) { *offset = last_offset + i * map->stripe_len; - stripe_nr = div_u64(*offset, map->stripe_len); + stripe_nr = div64_u64(*offset, map->stripe_len); stripe_nr = div_u64(stripe_nr, nr_data_stripes(map)); /* Work out the disk rotation on this stripe-set */ @@ -2765,7 +3021,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) struct btrfs_fs_info *fs_info = sctx->fs_info; struct bio *bio; struct btrfs_raid_bio *rbio; - struct scrub_page *spage; struct btrfs_bio *bbio = NULL; u64 length; int ret; @@ -2775,8 +3030,10 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) goto out; length = sparity->logic_end - sparity->logic_start; + + btrfs_bio_counter_inc_blocked(fs_info); ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start, - &length, &bbio, 0, 1); + &length, &bbio); if (ret || !bbio || !bbio->raid_map) goto bbio_out; @@ -2795,9 +3052,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) if (!rbio) goto rbio_out; - list_for_each_entry(spage, &sparity->spages, list) - raid56_add_scrub_pages(rbio, spage->page, spage->logical); - scrub_pending_bio_inc(sctx); raid56_parity_submit_scrub_rbio(rbio); return; @@ -2805,6 +3059,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) rbio_out: bio_put(bio); bbio_out: + btrfs_bio_counter_dec(fs_info); btrfs_put_bbio(bbio); bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); @@ -2822,12 +3077,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors) static void scrub_parity_get(struct scrub_parity *sparity) { - atomic_inc(&sparity->refs); + refcount_inc(&sparity->refs); 
} static void scrub_parity_put(struct scrub_parity *sparity) { - if (!atomic_dec_and_test(&sparity->refs)) + if (!refcount_dec_and_test(&sparity->refs)) return; scrub_parity_check_and_repair(sparity); @@ -2879,7 +3134,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->scrub_dev = sdev; sparity->logic_start = logic_start; sparity->logic_end = logic_end; - atomic_set(&sparity->refs, 1); + refcount_set(&sparity->refs, 1); INIT_LIST_HEAD(&sparity->spages); sparity->dbitmap = sparity->bitmap; sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; @@ -3098,7 +3353,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, physical = map->stripes[num].physical; offset = 0; - nstripes = div_u64(length, map->stripe_len); + nstripes = div64_u64(length, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID0) { offset = map->stripe_len * num; increment = map->stripe_len * map->num_stripes; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a60d5bfb8a49e2bfc3faef10f4ea353a9da5f8b8..fc496a6f842a87d3544e80b3b9e063d57417f788 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5184,13 +5184,19 @@ static int is_extent_unchanged(struct send_ctx *sctx, while (key.offset < ekey->offset + left_len) { ei = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); right_type = btrfs_file_extent_type(eb, ei); - if (right_type != BTRFS_FILE_EXTENT_REG) { + if (right_type != BTRFS_FILE_EXTENT_REG && + right_type != BTRFS_FILE_EXTENT_INLINE) { ret = 0; goto out; } right_disknr = btrfs_file_extent_disk_bytenr(eb, ei); - right_len = btrfs_file_extent_num_bytes(eb, ei); + if (right_type == BTRFS_FILE_EXTENT_INLINE) { + right_len = btrfs_file_extent_inline_len(eb, slot, ei); + right_len = PAGE_ALIGN(right_len); + } else { + right_len = btrfs_file_extent_num_bytes(eb, ei); + } right_offset = btrfs_file_extent_offset(eb, ei); right_gen = btrfs_file_extent_generation(eb, ei); @@ -5204,6 +5210,19 @@ static int is_extent_unchanged(struct send_ctx *sctx, goto out; } + /* + * We just wanted to check, when we have an inline extent, whether + * what follows it is a regular extent (i.e. to apply the condition + * above to inline extents too). This should normally not happen, but + * it's possible for example when we have an inline compressed extent + * representing data with a size matching the page size (currently the + * same as the sector size). + */ + if (right_type == BTRFS_FILE_EXTENT_INLINE) { + ret = 0; + goto out; + } + left_offset_fixed = left_offset; if (key.offset < ekey->offset) { /* Fix the right offset for 2a and 7.
*/ @@ -6360,22 +6379,16 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) sctx->clone_roots_cnt = arg->clone_sources_count; sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = kmalloc(sctx->send_max_size, GFP_KERNEL | __GFP_NOWARN); + sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); if (!sctx->send_buf) { - sctx->send_buf = vmalloc(sctx->send_max_size); - if (!sctx->send_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } - sctx->read_buf = kmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL | __GFP_NOWARN); + sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL); if (!sctx->read_buf) { - sctx->read_buf = vmalloc(BTRFS_SEND_READ_SIZE); - if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } sctx->pending_dir_moves = RB_ROOT; @@ -6396,13 +6409,10 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); if (arg->clone_sources_count) { - clone_sources_tmp = kmalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN); + clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); if (!clone_sources_tmp) { - clone_sources_tmp = vmalloc(alloc_size); - if (!clone_sources_tmp) { - ret = -ENOMEM; - goto out; - } + ret = -ENOMEM; + goto out; } ret = copy_from_user(clone_sources_tmp, arg->clone_sources, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9530a333d302c0c13c3e8463814b642a023afc23..4f1cdd5058f12b9970b5894828498624a28c77b5 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1136,6 +1136,13 @@ static int btrfs_fill_super(struct super_block *sb, #endif sb->s_flags |= MS_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; + + err = super_setup_bdi(sb); + if (err) { + btrfs_err(fs_info, "super_setup_bdi failed"); + return err; + } + err = open_ctree(sb, fs_devices, (char *)data); if (err) { btrfs_err(fs_info, "open_ctree failed"); @@ -1788,8 +1795,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) } if (fs_info->fs_devices->missing_devices > - fs_info->num_tolerated_disk_barrier_failures && - !(*flags & MS_RDONLY)) { + fs_info->num_tolerated_disk_barrier_failures) { btrfs_warn(fs_info, "too many missing devices, writeable remount is not allowed"); ret = -EACCES; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ea272432c9305177a1059db321f8aae6f89a09f3..b18ab8f327a53dd10b09bee6169f34ecff5eb809 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -237,7 +237,6 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) { memset(trans, 0, sizeof(*trans)); trans->transid = 1; - INIT_LIST_HEAD(&trans->qgroup_ref_list); trans->type = __TRANS_DUMMY; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 61b807de3e164e38877230cdfd1e9a545573f1be..2168654c90a1e6cab355d8270b23db68de0253c3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -60,8 +60,8 @@ static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = { void btrfs_put_transaction(struct btrfs_transaction *transaction) { - WARN_ON(atomic_read(&transaction->use_count) == 0); - if (atomic_dec_and_test(&transaction->use_count)) { + WARN_ON(refcount_read(&transaction->use_count) == 0); + if (refcount_dec_and_test(&transaction->use_count)) { BUG_ON(!list_empty(&transaction->list)); WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root)); if (transaction->delayed_refs.pending_csums) @@ -207,7 +207,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, 
spin_unlock(&fs_info->trans_lock); return -EBUSY; } - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); atomic_inc(&cur_trans->num_writers); extwriter_counter_inc(cur_trans, type); spin_unlock(&fs_info->trans_lock); @@ -257,7 +257,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, * One for this trans handle, one so it will live on until we * commit the transaction. */ - atomic_set(&cur_trans->use_count, 2); + refcount_set(&cur_trans->use_count, 2); atomic_set(&cur_trans->pending_ordered, 0); cur_trans->flags = 0; cur_trans->start_time = get_seconds(); @@ -432,7 +432,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) spin_lock(&fs_info->trans_lock); cur_trans = fs_info->running_transaction; if (cur_trans && is_transaction_blocked(cur_trans)) { - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); spin_unlock(&fs_info->trans_lock); wait_event(fs_info->transaction_wait, @@ -572,7 +572,6 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, h->type = type; h->can_flush_pending_bgs = true; - INIT_LIST_HEAD(&h->qgroup_ref_list); INIT_LIST_HEAD(&h->new_bgs); smp_mb(); @@ -744,7 +743,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) list_for_each_entry(t, &fs_info->trans_list, list) { if (t->transid == transid) { cur_trans = t; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); ret = 0; break; } @@ -773,7 +772,7 @@ int btrfs_wait_for_commit(struct btrfs_fs_info *fs_info, u64 transid) if (t->state == TRANS_STATE_COMPLETED) break; cur_trans = t; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); break; } } @@ -917,7 +916,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, wake_up_process(info->transaction_kthread); err = -EIO; } - assert_qgroups_uptodate(trans); kmem_cache_free(btrfs_trans_handle_cachep, trans); if (must_run_delayed_refs) { @@ -1839,7 +1837,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, /* take transaction reference */ cur_trans = trans->transaction; - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); btrfs_end_transaction(trans); @@ -2015,7 +2013,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) spin_lock(&fs_info->trans_lock); if (cur_trans->state >= TRANS_STATE_COMMIT_START) { spin_unlock(&fs_info->trans_lock); - atomic_inc(&cur_trans->use_count); + refcount_inc(&cur_trans->use_count); ret = btrfs_end_transaction(trans); wait_for_commit(cur_trans); @@ -2035,7 +2033,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) prev_trans = list_entry(cur_trans->list.prev, struct btrfs_transaction, list); if (prev_trans->state != TRANS_STATE_COMPLETED) { - atomic_inc(&prev_trans->use_count); + refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); wait_for_commit(prev_trans); @@ -2130,13 +2128,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - /* Reocrd old roots for later qgroup accounting */ - ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); - if (ret) { - mutex_unlock(&fs_info->reloc_mutex); - goto scrub_continue; - } - /* * make sure none of the code above managed to slip in a * delayed item @@ -2178,6 +2169,24 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) */ btrfs_free_log_root_tree(trans, fs_info); + /* + * commit_fs_roots() can call btrfs_save_ino_cache(), which generates + * new delayed refs. 
Must handle them or qgroup can be wrong. + */ + ret = btrfs_run_delayed_refs(trans, fs_info, (unsigned long)-1); + if (ret) { + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&fs_info->reloc_mutex); + goto scrub_continue; + } + + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); + if (ret) { + mutex_unlock(&fs_info->tree_log_mutex); + mutex_unlock(&fs_info->reloc_mutex); + goto scrub_continue; + } + /* * Since fs roots are all committed, we can get a quite accurate * new_roots. So let's do quota accounting. @@ -2223,7 +2232,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) switch_commit_roots(cur_trans, fs_info); - assert_qgroups_uptodate(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); update_super_roots(fs_info); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 5dfb5590fff654a077fd95704682293bfc948a8f..c55e44560103b48e2a917701899c611732d8a013 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -18,6 +18,8 @@ #ifndef __BTRFS_TRANSACTION__ #define __BTRFS_TRANSACTION__ + +#include #include "btrfs_inode.h" #include "delayed-ref.h" #include "ctree.h" @@ -49,7 +51,7 @@ struct btrfs_transaction { * transaction can end */ atomic_t num_writers; - atomic_t use_count; + refcount_t use_count; atomic_t pending_ordered; unsigned long flags; @@ -125,8 +127,6 @@ struct btrfs_trans_handle { unsigned int type; struct btrfs_root *root; struct btrfs_fs_info *fs_info; - struct seq_list delayed_ref_elem; - struct list_head qgroup_ref_list; struct list_head new_bgs; }; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a59674c3e69efb76d27d6705b41ca76d94e82e15..ccfe9fe7754a8d4d80fd3e5b1f0a1d2f2118e4e6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4196,7 +4196,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, if (em->generation <= test_gen) continue; /* Need a ref to keep it from getting evicted from cache */ - atomic_inc(&em->refs); + refcount_inc(&em->refs); set_bit(EXTENT_FLAG_LOGGING, &em->flags); list_add_tail(&em->list, &extents); num++; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index ab8a66d852f91cb04206361a551b8c57760c9c40..017b67daa3bbf375919019e5c089b94f97d3e2a5 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -139,6 +139,11 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); +static int __btrfs_map_block(struct btrfs_fs_info *fs_info, + enum btrfs_map_op op, + u64 logical, u64 *length, + struct btrfs_bio **bbio_ret, + int mirror_num, int need_raid_map); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -1008,14 +1013,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, q = bdev_get_queue(bdev); if (blk_queue_discard(q)) device->can_discard = 1; + if (!blk_queue_nonrot(q)) + fs_devices->rotating = 1; device->bdev = bdev; device->in_fs_metadata = 0; device->mode = flags; - if (!blk_queue_nonrot(bdev_get_queue(bdev))) - fs_devices->rotating = 1; - fs_devices->open_devices++; if (device->writeable && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -2417,7 +2421,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path fs_info->free_chunk_space += device->total_bytes; spin_unlock(&fs_info->free_chunk_lock); - if 
(!blk_queue_nonrot(bdev_get_queue(bdev))) + if (!blk_queue_nonrot(q)) fs_info->fs_devices->rotating = 1; tmp = btrfs_super_total_bytes(fs_info->super_copy); @@ -2795,10 +2799,38 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, return ret; } +static struct extent_map *get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct extent_map_tree *em_tree; + struct extent_map *em; + + em_tree = &fs_info->mapping_tree.map_tree; + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, length); + read_unlock(&em_tree->lock); + + if (!em) { + btrfs_crit(fs_info, "unable to find logical %llu length %llu", + logical, length); + return ERR_PTR(-EINVAL); + } + + if (em->start > logical || em->start + em->len < logical) { + btrfs_crit(fs_info, + "found a bad mapping, wanted %llu-%llu, found %llu-%llu", + logical, length, em->start, em->start + em->len); + free_extent_map(em); + return ERR_PTR(-EINVAL); + } + + /* callers are responsible for dropping em's ref. */ + return em; +} + int btrfs_remove_chunk(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 chunk_offset) { - struct extent_map_tree *em_tree; struct extent_map *em; struct map_lookup *map; u64 dev_extent_len = 0; @@ -2806,23 +2838,15 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em_tree = &fs_info->mapping_tree.map_tree; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - - if (!em || em->start > chunk_offset || - em->start + em->len < chunk_offset) { + em = get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(em)) { /* * This is a logic error, but we don't want to just rely on the * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. */ ASSERT(0); - if (em) - free_extent_map(em); - return -EINVAL; + return PTR_ERR(em); } map = em->map_lookup; mutex_lock(&fs_info->chunk_mutex); @@ -3736,7 +3760,7 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) if (ret) btrfs_handle_fs_error(fs_info, ret, NULL); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } /* Non-zero return value signifies invalidity */ @@ -3755,6 +3779,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs) { struct btrfs_fs_info *fs_info = bctl->fs_info; + u64 meta_target, data_target; u64 allowed; int mixed = 0; int ret; @@ -3851,11 +3876,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } while (read_seqretry(&fs_info->profiles_lock, seq)); - if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < - btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { + /* if we're not converting, the target field is uninitialized */ + meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ? + bctl->meta.target : fs_info->avail_metadata_alloc_bits; + data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ? 
+ bctl->data.target : fs_info->avail_data_alloc_bits; + if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) < + btrfs_get_num_tolerated_disk_barrier_failures(data_target)) { btrfs_warn(fs_info, "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", - bctl->meta.target, bctl->data.target); + meta_target, data_target); } if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { @@ -3910,7 +3940,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, __cancel_balance(fs_info); else { kfree(bctl); - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); } return ret; } @@ -4000,7 +4030,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); - WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + WARN_ON(test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); @@ -4785,7 +4815,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, stripe_size = div_u64(stripe_size, dev_stripes); /* align to BTRFS_STRIPE_LEN */ - stripe_size = div_u64(stripe_size, raid_stripe_len); + stripe_size = div64_u64(stripe_size, raid_stripe_len); stripe_size *= raid_stripe_len; map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -4833,7 +4863,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ret = add_extent_mapping(em_tree, em, 0); if (!ret) { list_add_tail(&em->list, &trans->transaction->pending_chunks); - atomic_inc(&em->refs); + refcount_inc(&em->refs); } write_unlock(&em_tree->lock); if (ret) { @@ -4888,7 +4918,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_device *device; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; - struct extent_map_tree *em_tree; struct extent_map *em; struct map_lookup *map; size_t item_size; @@ -4897,24 +4926,9 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, int i = 0; int ret = 0; - em_tree = &fs_info->mapping_tree.map_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, chunk_size); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_crit(fs_info, "unable to find logical %Lu len %Lu", - chunk_offset, chunk_size); - return -EINVAL; - } - - if (em->start != chunk_offset || em->len != chunk_size) { - btrfs_crit(fs_info, - "found a bad mapping, wanted %Lu-%Lu, found %Lu-%Lu", - chunk_offset, chunk_size, em->start, em->len); - free_extent_map(em); - return -EINVAL; - } + em = get_chunk_map(fs_info, chunk_offset, chunk_size); + if (IS_ERR(em)) + return PTR_ERR(em); map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); @@ -5055,15 +5069,12 @@ int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct extent_map *em; struct map_lookup *map; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; int readonly = 0; int miss_ndevs = 0; int i; - read_lock(&map_tree->map_tree.lock); - em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); - read_unlock(&map_tree->map_tree.lock); - if (!em) + em = get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(em)) return 1; map = em->map_lookup; @@ -5117,34 +5128,19 @@ void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; struct extent_map 
*em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; int ret; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - - /* - * We could return errors for these cases, but that could get ugly and - * we'd probably do the same thing which is just not do anything else - * and exit, so return 1 so the callers don't try to use other copies. - */ - if (!em) { - btrfs_crit(fs_info, "No mapping for %Lu-%Lu", logical, - logical+len); - return 1; - } - - if (em->start > logical || em->start + em->len < logical) { - btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got %Lu-%Lu", - logical, logical+len, em->start, - em->start + em->len); - free_extent_map(em); + em = get_chunk_map(fs_info, logical, len); + if (IS_ERR(em)) + /* + * We could return errors for these cases, but that could get + * ugly and we'd probably do the same thing which is just not do + * anything else and exit, so return 1 so the callers don't try + * to use other copies. + */ return 1; - } map = em->map_lookup; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) @@ -5160,7 +5156,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) free_extent_map(em); btrfs_dev_replace_lock(&fs_info->dev_replace, 0); - if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) + if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) && + fs_info->dev_replace.tgtdev) ret++; btrfs_dev_replace_unlock(&fs_info->dev_replace, 0); @@ -5173,15 +5170,11 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, { struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; unsigned long len = fs_info->sectorsize; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - BUG_ON(!em); + em = get_chunk_map(fs_info, logical, len); + WARN_ON(IS_ERR(em)); - BUG_ON(em->start > logical || em->start + em->len < logical); map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); @@ -5189,20 +5182,16 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, return len; } -int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len, int mirror_num) { struct extent_map *em; struct map_lookup *map; - struct extent_map_tree *em_tree = &map_tree->map_tree; int ret = 0; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, len); - read_unlock(&em_tree->lock); - BUG_ON(!em); + em = get_chunk_map(fs_info, logical, len); + WARN_ON(IS_ERR(em)); - BUG_ON(em->start > logical || em->start + em->len < logical); map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; @@ -5295,25 +5284,353 @@ static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes) GFP_NOFS|__GFP_NOFAIL); atomic_set(&bbio->error, 0); - atomic_set(&bbio->refs, 1); + refcount_set(&bbio->refs, 1); return bbio; } void btrfs_get_bbio(struct btrfs_bio *bbio) { - WARN_ON(!atomic_read(&bbio->refs)); - atomic_inc(&bbio->refs); + WARN_ON(!refcount_read(&bbio->refs)); + refcount_inc(&bbio->refs); } void btrfs_put_bbio(struct btrfs_bio *bbio) { if (!bbio) return; - if (atomic_dec_and_test(&bbio->refs)) + if (refcount_dec_and_test(&bbio->refs)) kfree(bbio); } +/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? 
*/ +/* + * Please note that, discard won't be sent to target device of device + * replace. + */ +static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, + struct btrfs_bio **bbio_ret) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_bio *bbio; + u64 offset; + u64 stripe_nr; + u64 stripe_nr_end; + u64 stripe_end_offset; + u64 stripe_cnt; + u64 stripe_len; + u64 stripe_offset; + u64 num_stripes; + u32 stripe_index; + u32 factor = 0; + u32 sub_stripes = 0; + u64 stripes_per_dev = 0; + u32 remaining_stripes = 0; + u32 last_stripe = 0; + int ret = 0; + int i; + + /* discard always return a bbio */ + ASSERT(bbio_ret); + + em = get_chunk_map(fs_info, logical, length); + if (IS_ERR(em)) + return PTR_ERR(em); + + map = em->map_lookup; + /* we don't discard raid56 yet */ + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EOPNOTSUPP; + goto out; + } + + offset = logical - em->start; + length = min_t(u64, em->len - offset, length); + + stripe_len = map->stripe_len; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + stripe_nr = div64_u64(offset, stripe_len); + + /* stripe_offset is the offset of this block in its stripe */ + stripe_offset = offset - stripe_nr * stripe_len; + + stripe_nr_end = round_up(offset + length, map->stripe_len); + stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len); + stripe_cnt = stripe_nr_end - stripe_nr; + stripe_end_offset = stripe_nr_end * map->stripe_len - + (offset + length); + /* + * after this, stripe_nr is the number of stripes on this + * device we have to walk to find the data, and stripe_index is + * the number of our device in the stripe array + */ + num_stripes = 1; + stripe_index = 0; + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID0) + sub_stripes = 1; + else + sub_stripes = map->sub_stripes; + + factor = map->num_stripes / sub_stripes; + num_stripes = min_t(u64, map->num_stripes, + sub_stripes * stripe_cnt); + stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); + stripe_index *= sub_stripes; + stripes_per_dev = div_u64_rem(stripe_cnt, factor, + &remaining_stripes); + div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); + last_stripe *= sub_stripes; + } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + num_stripes = map->num_stripes; + } else { + stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, + &stripe_index); + } + + bbio = alloc_btrfs_bio(num_stripes, 0); + if (!bbio) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + bbio->stripes[i].dev = map->stripes[stripe_index].dev; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)) { + bbio->stripes[i].length = stripes_per_dev * + map->stripe_len; + + if (i / sub_stripes < remaining_stripes) + bbio->stripes[i].length += + map->stripe_len; + + /* + * Special for the first stripe and + * the last stripe: + * + * |-------|...|-------| + * |----------| + * off end_off + */ + if (i < sub_stripes) + bbio->stripes[i].length -= + stripe_offset; + + if (stripe_index >= last_stripe && + stripe_index <= (last_stripe + + sub_stripes - 1)) + bbio->stripes[i].length -= + stripe_end_offset; + + if (i == sub_stripes - 1) + stripe_offset = 0; + } else { + bbio->stripes[i].length = length; + } + + stripe_index++; + if (stripe_index == 
map->num_stripes) { + stripe_index = 0; + stripe_nr++; + } + } + + *bbio_ret = bbio; + bbio->map_type = map->type; + bbio->num_stripes = num_stripes; +out: + free_extent_map(em); + return ret; +} + +/* + * In dev-replace case, for repair case (that's the only case where the mirror + * is selected explicitly when calling btrfs_map_block), blocks left of the + * left cursor can also be read from the target drive. + * + * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the + * array of stripes. + * For READ, it also needs to be supported using the same mirror number. + * + * If the requested block is not left of the left cursor, EIO is returned. This + * can happen because btrfs_num_copies() returns one more in the dev-replace + * case. + */ +static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, + u64 srcdev_devid, int *mirror_num, + u64 *physical) +{ + struct btrfs_bio *bbio = NULL; + int num_stripes; + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + int i; + int ret = 0; + + ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, + logical, &length, &bbio, 0, 0); + if (ret) { + ASSERT(bbio == NULL); + return ret; + } + + num_stripes = bbio->num_stripes; + if (*mirror_num > num_stripes) { + /* + * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror, + * that means that the requested area is not left of the left + * cursor + */ + btrfs_put_bbio(bbio); + return -EIO; + } + + /* + * process the rest of the function using the mirror_num of the source + * drive. Therefore look it up first. At the end, patch the device + * pointer to the one of the target drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid != srcdev_devid) + continue; + + /* + * In case of DUP, in order to keep it simple, only add the + * mirror with the lowest physical address + */ + if (found && + physical_of_found <= bbio->stripes[i].physical) + continue; + + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + + btrfs_put_bbio(bbio); + + ASSERT(found); + if (!found) + return -EIO; + + *mirror_num = index_srcdev + 1; + *physical = physical_of_found; + return ret; +} + +static void handle_ops_on_dev_replace(enum btrfs_map_op op, + struct btrfs_bio **bbio_ret, + struct btrfs_dev_replace *dev_replace, + int *num_stripes_ret, int *max_errors_ret) +{ + struct btrfs_bio *bbio = *bbio_ret; + u64 srcdev_devid = dev_replace->srcdev->devid; + int tgtdev_indexes = 0; + int num_stripes = *num_stripes_ret; + int max_errors = *max_errors_ret; + int i; + + if (op == BTRFS_MAP_WRITE) { + int index_where_to_add; + + /* + * duplicate the write operations while the dev replace + * procedure is running. Since the copying of the old disk to + * the new disk takes place at run time while the filesystem is + * mounted writable, the regular write operations to the old + * disk have to be duplicated to go to the new disk as well. + * + * Note that device->missing is handled by the caller, and that + * the write to the old disk is already set up in the stripes + * array. 
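+ * + * Worked example (illustrative only): with num_stripes == 2 and only + * stripes[0] living on the source device, the loop below appends a copy + * of stripes[0] that points at dev_replace->tgtdev, records + * bbio->tgtdev_map[0] = 2, and leaves num_stripes == 3 with max_errors + * incremented by one.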
+ */ + index_where_to_add = num_stripes; + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* write to new disk, too */ + struct btrfs_bio_stripe *new = + bbio->stripes + index_where_to_add; + struct btrfs_bio_stripe *old = + bbio->stripes + i; + + new->physical = old->physical; + new->length = old->length; + new->dev = dev_replace->tgtdev; + bbio->tgtdev_map[i] = index_where_to_add; + index_where_to_add++; + max_errors++; + tgtdev_indexes++; + } + } + num_stripes = index_where_to_add; + } else if (op == BTRFS_MAP_GET_READ_MIRRORS) { + int index_srcdev = 0; + int found = 0; + u64 physical_of_found = 0; + + /* + * During the dev-replace procedure, the target drive can also + * be used to read data in case it is needed to repair a corrupt + * block elsewhere. This is possible if the requested area is + * left of the left cursor. In this area, the target drive is a + * full copy of the source drive. + */ + for (i = 0; i < num_stripes; i++) { + if (bbio->stripes[i].dev->devid == srcdev_devid) { + /* + * In case of DUP, in order to keep it simple, + * only add the mirror with the lowest physical + * address + */ + if (found && + physical_of_found <= + bbio->stripes[i].physical) + continue; + index_srcdev = i; + found = 1; + physical_of_found = bbio->stripes[i].physical; + } + } + if (found) { + struct btrfs_bio_stripe *tgtdev_stripe = + bbio->stripes + num_stripes; + + tgtdev_stripe->physical = physical_of_found; + tgtdev_stripe->length = + bbio->stripes[index_srcdev].length; + tgtdev_stripe->dev = dev_replace->tgtdev; + bbio->tgtdev_map[index_srcdev] = num_stripes; + + tgtdev_indexes++; + num_stripes++; + } + } + + *num_stripes_ret = num_stripes; + *max_errors_ret = max_errors; + bbio->num_tgtdevs = tgtdev_indexes; + *bbio_ret = bbio; +} + +static bool need_full_stripe(enum btrfs_map_op op) +{ + return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS); +} + static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, @@ -5322,14 +5639,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, { struct extent_map *em; struct map_lookup *map; - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct extent_map_tree *em_tree = &map_tree->map_tree; u64 offset; u64 stripe_offset; - u64 stripe_end_offset; u64 stripe_nr; - u64 stripe_nr_orig; - u64 stripe_nr_end; u64 stripe_len; u32 stripe_index; int i; @@ -5345,23 +5657,13 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, u64 physical_to_patch_in_first_stripe = 0; u64 raid56_full_stripe_start = (u64)-1; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, *length); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_crit(fs_info, "unable to find logical %llu len %llu", - logical, *length); - return -EINVAL; - } + if (op == BTRFS_MAP_DISCARD) + return __btrfs_map_block_for_discard(fs_info, logical, + *length, bbio_ret); - if (em->start > logical || em->start + em->len < logical) { - btrfs_crit(fs_info, - "found a bad mapping, wanted %Lu, found %Lu-%Lu", - logical, em->start, em->start + em->len); - free_extent_map(em); - return -EINVAL; - } + em = get_chunk_map(fs_info, logical, *length); + if (IS_ERR(em)) + return PTR_ERR(em); map = em->map_lookup; offset = logical - em->start; @@ -5400,14 +5702,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, raid56_full_stripe_start *= full_stripe_len; } - if (op == BTRFS_MAP_DISCARD) { - /* we don't discard raid56 yet */ - if (map->type & 
BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = -EOPNOTSUPP; - goto out; - } - *length = min_t(u64, em->len - offset, *length); - } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { u64 max_len; /* For writes to RAID[56], allow a full stripeset across all disks. For other RAID types and for RAID[56] reads, just allow a single @@ -5438,105 +5733,28 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, btrfs_dev_replace_set_lock_blocking(dev_replace); if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 && - op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS && dev_replace->tgtdev != NULL) { - /* - * in dev-replace case, for repair case (that's the only - * case where the mirror is selected explicitly when - * calling btrfs_map_block), blocks left of the left cursor - * can also be read from the target drive. - * For REQ_GET_READ_MIRRORS, the target drive is added as - * the last one to the array of stripes. For READ, it also - * needs to be supported using the same mirror number. - * If the requested block is not left of the left cursor, - * EIO is returned. This can happen because btrfs_num_copies() - * returns one more in the dev-replace case. - */ - u64 tmp_length = *length; - struct btrfs_bio *tmp_bbio = NULL; - int tmp_num_stripes; - u64 srcdev_devid = dev_replace->srcdev->devid; - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; - - ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS, - logical, &tmp_length, &tmp_bbio, 0, 0); - if (ret) { - WARN_ON(tmp_bbio != NULL); - goto out; - } - - tmp_num_stripes = tmp_bbio->num_stripes; - if (mirror_num > tmp_num_stripes) { - /* - * BTRFS_MAP_GET_READ_MIRRORS does not contain this - * mirror, that means that the requested area - * is not left of the left cursor - */ - ret = -EIO; - btrfs_put_bbio(tmp_bbio); - goto out; - } - - /* - * process the rest of the function using the mirror_num - * of the source drive. Therefore look it up first. - * At the end, patch the device pointer to the one of the - * target drive. 
- */ - for (i = 0; i < tmp_num_stripes; i++) { - if (tmp_bbio->stripes[i].dev->devid != srcdev_devid) - continue; - - /* - * In case of DUP, in order to keep it simple, only add - * the mirror with the lowest physical address - */ - if (found && - physical_of_found <= tmp_bbio->stripes[i].physical) - continue; - - index_srcdev = i; - found = 1; - physical_of_found = tmp_bbio->stripes[i].physical; - } - - btrfs_put_bbio(tmp_bbio); - - if (!found) { - WARN_ON(1); - ret = -EIO; + !need_full_stripe(op) && dev_replace->tgtdev != NULL) { + ret = get_extra_mirror_from_replace(fs_info, logical, *length, + dev_replace->srcdev->devid, + &mirror_num, + &physical_to_patch_in_first_stripe); + if (ret) goto out; - } - - mirror_num = index_srcdev + 1; - patch_the_first_stripe_for_dev_replace = 1; - physical_to_patch_in_first_stripe = physical_of_found; + else + patch_the_first_stripe_for_dev_replace = 1; } else if (mirror_num > map->num_stripes) { mirror_num = 0; } num_stripes = 1; stripe_index = 0; - stripe_nr_orig = stripe_nr; - stripe_nr_end = ALIGN(offset + *length, map->stripe_len); - stripe_nr_end = div_u64(stripe_nr_end, map->stripe_len); - stripe_end_offset = stripe_nr_end * map->stripe_len - - (offset + *length); - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - if (op == BTRFS_MAP_DISCARD) - num_stripes = min_t(u64, map->num_stripes, - stripe_nr_end - stripe_nr_orig); stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); - if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS) + if (op != BTRFS_MAP_WRITE && op != BTRFS_MAP_GET_READ_MIRRORS) mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || - op == BTRFS_MAP_GET_READ_MIRRORS) + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) num_stripes = map->num_stripes; else if (mirror_num) stripe_index = mirror_num - 1; @@ -5549,8 +5767,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD || - op == BTRFS_MAP_GET_READ_MIRRORS) { + if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) { num_stripes = map->num_stripes; } else if (mirror_num) { stripe_index = mirror_num - 1; @@ -5566,10 +5783,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) num_stripes = map->sub_stripes; - else if (op == BTRFS_MAP_DISCARD) - num_stripes = min_t(u64, map->sub_stripes * - (stripe_nr_end - stripe_nr_orig), - map->num_stripes); else if (mirror_num) stripe_index += mirror_num - 1; else { @@ -5587,7 +5800,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ - stripe_nr = div_u64(raid56_full_stripe_start, + stripe_nr = div64_u64(raid56_full_stripe_start, stripe_len * nr_data_stripes(map)); /* RAID[56] write or recovery. 
Return all stripes */ @@ -5612,8 +5825,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* We distribute the parity blocks across stripes */ div_u64_rem(stripe_nr + stripe_index, map->num_stripes, &stripe_index); - if ((op != BTRFS_MAP_WRITE && op != BTRFS_MAP_DISCARD && - op != BTRFS_MAP_GET_READ_MIRRORS) && mirror_num <= 1) + if ((op != BTRFS_MAP_WRITE && + op != BTRFS_MAP_GET_READ_MIRRORS) && + mirror_num <= 1) mirror_num = 1; } } else { @@ -5635,8 +5849,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } num_alloc_stripes = num_stripes; - if (dev_replace_is_ongoing) { - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) { + if (op == BTRFS_MAP_WRITE) num_alloc_stripes <<= 1; if (op == BTRFS_MAP_GET_READ_MIRRORS) num_alloc_stripes++; @@ -5648,14 +5862,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, ret = -ENOMEM; goto out; } - if (dev_replace_is_ongoing) + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); /* build raid_map */ - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && - need_raid_map && - ((op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) || - mirror_num > 1)) { + if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map && + (need_full_stripe(op) || mirror_num > 1)) { u64 tmp; unsigned rot; @@ -5679,173 +5891,27 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, RAID6_Q_STRIPE; } - if (op == BTRFS_MAP_DISCARD) { - u32 factor = 0; - u32 sub_stripes = 0; - u64 stripes_per_dev = 0; - u32 remaining_stripes = 0; - u32 last_stripe = 0; - if (map->type & - (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - if (map->type & BTRFS_BLOCK_GROUP_RAID0) - sub_stripes = 1; - else - sub_stripes = map->sub_stripes; - - factor = map->num_stripes / sub_stripes; - stripes_per_dev = div_u64_rem(stripe_nr_end - - stripe_nr_orig, - factor, - &remaining_stripes); - div_u64_rem(stripe_nr_end - 1, factor, &last_stripe); - last_stripe *= sub_stripes; - } - - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + stripe_nr * map->stripe_len; - bbio->stripes[i].dev = map->stripes[stripe_index].dev; - - if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | - BTRFS_BLOCK_GROUP_RAID10)) { - bbio->stripes[i].length = stripes_per_dev * - map->stripe_len; - - if (i / sub_stripes < remaining_stripes) - bbio->stripes[i].length += - map->stripe_len; - - /* - * Special for the first stripe and - * the last stripe: - * - * |-------|...|-------| - * |----------| - * off end_off - */ - if (i < sub_stripes) - bbio->stripes[i].length -= - stripe_offset; - - if (stripe_index >= last_stripe && - stripe_index <= (last_stripe + - sub_stripes - 1)) - bbio->stripes[i].length -= - stripe_end_offset; - - if (i == sub_stripes - 1) - stripe_offset = 0; - } else - bbio->stripes[i].length = *length; - - stripe_index++; - if (stripe_index == map->num_stripes) { - /* This could only happen for RAID0/10 */ - stripe_index = 0; - stripe_nr++; - } - } - } else { - for (i = 0; i < num_stripes; i++) { - bbio->stripes[i].physical = - map->stripes[stripe_index].physical + - stripe_offset + - stripe_nr * map->stripe_len; - bbio->stripes[i].dev = - map->stripes[stripe_index].dev; - stripe_index++; - } + for (i = 0; i < num_stripes; i++) { + bbio->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + + stripe_nr * map->stripe_len; + 
bbio->stripes[i].dev = + map->stripes[stripe_index].dev; + stripe_index++; } - if (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS) + if (need_full_stripe(op)) max_errors = btrfs_chunk_max_errors(map); if (bbio->raid_map) sort_parity_stripes(bbio, num_stripes); - tgtdev_indexes = 0; - if (dev_replace_is_ongoing && - (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_DISCARD) && - dev_replace->tgtdev != NULL) { - int index_where_to_add; - u64 srcdev_devid = dev_replace->srcdev->devid; - - /* - * duplicate the write operations while the dev replace - * procedure is running. Since the copying of the old disk - * to the new disk takes place at run time while the - * filesystem is mounted writable, the regular write - * operations to the old disk have to be duplicated to go - * to the new disk as well. - * Note that device->missing is handled by the caller, and - * that the write to the old disk is already set up in the - * stripes array. - */ - index_where_to_add = num_stripes; - for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { - /* write to new disk, too */ - struct btrfs_bio_stripe *new = - bbio->stripes + index_where_to_add; - struct btrfs_bio_stripe *old = - bbio->stripes + i; - - new->physical = old->physical; - new->length = old->length; - new->dev = dev_replace->tgtdev; - bbio->tgtdev_map[i] = index_where_to_add; - index_where_to_add++; - max_errors++; - tgtdev_indexes++; - } - } - num_stripes = index_where_to_add; - } else if (dev_replace_is_ongoing && - op == BTRFS_MAP_GET_READ_MIRRORS && - dev_replace->tgtdev != NULL) { - u64 srcdev_devid = dev_replace->srcdev->devid; - int index_srcdev = 0; - int found = 0; - u64 physical_of_found = 0; - - /* - * During the dev-replace procedure, the target drive can - * also be used to read data in case it is needed to repair - * a corrupt block elsewhere. This is possible if the - * requested area is left of the left cursor. In this area, - * the target drive is a full copy of the source drive. 
- */ - for (i = 0; i < num_stripes; i++) { - if (bbio->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it - * simple, only add the mirror with the - * lowest physical address - */ - if (found && - physical_of_found <= - bbio->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = bbio->stripes[i].physical; - } - } - if (found) { - struct btrfs_bio_stripe *tgtdev_stripe = - bbio->stripes + num_stripes; - - tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bbio->stripes[index_srcdev].length; - tgtdev_stripe->dev = dev_replace->tgtdev; - bbio->tgtdev_map[index_srcdev] = num_stripes; - - tgtdev_indexes++; - num_stripes++; - } + if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && + need_full_stripe(op)) { + handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes, + &max_errors); } *bbio_ret = bbio; @@ -5853,7 +5919,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, bbio->num_stripes = num_stripes; bbio->max_errors = max_errors; bbio->mirror_num = mirror_num; - bbio->num_tgtdevs = tgtdev_indexes; /* * this is the case that REQ_READ && dev_replace_is_ongoing && @@ -5886,19 +5951,15 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, /* For Scrub/replace */ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num, - int need_raid_map) + struct btrfs_bio **bbio_ret) { - return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, - mirror_num, need_raid_map); + return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); } int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len) { - struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; - struct extent_map_tree *em_tree = &map_tree->map_tree; struct extent_map *em; struct map_lookup *map; u64 *buf; @@ -5908,24 +5969,11 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 rmap_len; int i, j, nr = 0; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_start, 1); - read_unlock(&em_tree->lock); - - if (!em) { - btrfs_err(fs_info, "couldn't find em for chunk %Lu", - chunk_start); + em = get_chunk_map(fs_info, chunk_start, 1); + if (IS_ERR(em)) return -EIO; - } - if (em->start != chunk_start) { - btrfs_err(fs_info, "bad chunk start, em=%Lu, wanted=%Lu", - em->start, chunk_start); - free_extent_map(em); - return -EIO; - } map = em->map_lookup; - length = em->len; rmap_len = map->stripe_len; @@ -5949,7 +5997,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, continue; stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div_u64(stripe_nr, map->stripe_len); + stripe_nr = div64_u64(stripe_nr, map->stripe_len); if (map->type & BTRFS_BLOCK_GROUP_RAID10) { stripe_nr = stripe_nr * map->num_stripes + i; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 59be81206dd7b949683ad95c80fa561c022ad812..c7d0fbc915cabdee7cfec437e18f795ace77abbb 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -123,7 +123,6 @@ struct btrfs_device { struct list_head resized_list; /* for sending down flush barriers */ - int nobarriers; struct bio *flush_bio; struct completion flush_wait; @@ -298,7 +297,7 @@ struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); struct btrfs_bio { - atomic_t refs; + refcount_t refs; atomic_t stripes_pending; struct btrfs_fs_info 
*fs_info; u64 map_type; /* get from map_lookup->type */ @@ -400,8 +399,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_bio **bbio_ret, int mirror_num); int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, - struct btrfs_bio **bbio_ret, int mirror_num, - int need_raid_map); + struct btrfs_bio **bbio_ret); int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, u64 physical, u64 devid, u64 **logical, int *naddrs, int *stripe_len); @@ -475,7 +473,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, struct btrfs_device *tgtdev); void btrfs_scratch_superblocks(struct block_device *bdev, const char *device_path); -int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, +int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len, int mirror_num); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct btrfs_mapping_tree *map_tree, diff --git a/fs/buffer.c b/fs/buffer.c index 9196f2a270daac4d8d4612be27a0bd8288657335..161be58c5cb0f738754b79d87eda879aa3bb9553 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -49,7 +49,6 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags, struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -1830,7 +1829,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); nr_underway++; } bh = next; @@ -1884,7 +1883,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, 0, wbc); + submit_bh_wbc(REQ_OP_WRITE, write_flags, bh, wbc); nr_underway++; } bh = next; @@ -2379,8 +2378,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) goto out; err = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND, - &page, &fsdata); + AOP_FLAG_CONT_EXPAND, &page, &fsdata); if (err) goto out; @@ -2415,9 +2413,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = PAGE_SIZE - zerofrom; - err = pagecache_write_begin(file, mapping, curpos, len, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); + err = pagecache_write_begin(file, mapping, curpos, len, 0, + &page, &fsdata); if (err) goto out; zero_user(page, zerofrom, len); @@ -2449,9 +2446,8 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, } len = offset - zerofrom; - err = pagecache_write_begin(file, mapping, curpos, len, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); + err = pagecache_write_begin(file, mapping, curpos, len, 0, + &page, &fsdata); if (err) goto out; zero_user(page, zerofrom, len); @@ -3095,7 +3091,7 @@ void guard_bio_eod(int op, struct bio *bio) } static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags, struct writeback_control *wbc) + struct writeback_control *wbc) { struct bio *bio; @@ -3130,7 +3126,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, bio->bi_end_io = 
end_bio_bh_io_sync; bio->bi_private = bh; - bio->bi_flags |= bio_flags; /* Take care of bh's that straddle the end of the device */ guard_bio_eod(op, bio); @@ -3145,16 +3140,9 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh, return 0; } -int _submit_bh(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags) +int submit_bh(int op, int op_flags, struct buffer_head *bh) { - return submit_bh_wbc(op, op_flags, bh, bio_flags, NULL); -} -EXPORT_SYMBOL_GPL(_submit_bh); - -int submit_bh(int op, int op_flags, struct buffer_head *bh) -{ - return submit_bh_wbc(op, op_flags, bh, 0, NULL); + return submit_bh_wbc(op, op_flags, bh, NULL); } EXPORT_SYMBOL(submit_bh); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1a3e1b40799a086037fe529e1b0ff02e302adb43..1e71e6ca5ddfb09466bee7d8721d064a4a846176 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -578,7 +578,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) writeback_stat = atomic_long_inc_return(&fsc->writeback_count); if (writeback_stat > CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb)) - set_bdi_congested(&fsc->backing_dev_info, BLK_RW_ASYNC); + set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); err = ceph_osdc_writepages(osdc, ceph_vino(inode), @@ -670,8 +670,12 @@ static void writepages_finish(struct ceph_osd_request *req) bool remove_page; dout("writepages_finish %p rc %d\n", inode, rc); - if (rc < 0) + if (rc < 0) { mapping_set_error(mapping, rc); + ceph_set_error_write(ci); + } else { + ceph_clear_error_write(ci); + } /* * We lost the cache cap, need to truncate the page before @@ -700,12 +704,9 @@ static void writepages_finish(struct ceph_osd_request *req) if (atomic_long_dec_return(&fsc->writeback_count) < CONGESTION_OFF_THRESH( fsc->mount_options->congestion_kb)) - clear_bdi_congested(&fsc->backing_dev_info, + clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); - if (rc < 0) - SetPageError(page); - ceph_put_snap_context(page_snap_context(page)); page->private = 0; ClearPagePrivate(page); @@ -979,7 +980,7 @@ static int ceph_writepages_start(struct address_space *mapping, if (atomic_long_inc_return(&fsc->writeback_count) > CONGESTION_ON_THRESH( fsc->mount_options->congestion_kb)) { - set_bdi_congested(&fsc->backing_dev_info, + set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } @@ -1892,6 +1893,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); wr_req->r_mtime = ci->vfs_inode.i_mtime; + wr_req->r_abort_on_full = true; err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); if (!err) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 68c78be19d5b78ad181d15b6caf5cc075b6f9f99..a3ebb632294e7b90a315508c9b75671d2ce8ff6a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1015,6 +1015,7 @@ static int send_cap_msg(struct cap_msg_args *arg) void *p; size_t extra_len; struct timespec zerotime = {0}; + struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" @@ -1076,8 +1077,12 @@ static int send_cap_msg(struct cap_msg_args *arg) ceph_encode_64(&p, arg->inline_data ? 
0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); - /* osd_epoch_barrier (version 5) */ - ceph_encode_32(&p, 0); + /* + * osd_epoch_barrier (version 5) + * The epoch_barrier is protected osdc->lock, so READ_ONCE here in + * case it was recently changed + */ + ceph_encode_32(&p, READ_ONCE(osdc->epoch_barrier)); /* oldest_flush_tid (version 6) */ ceph_encode_64(&p, arg->oldest_flush_tid); @@ -1389,7 +1394,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci, first_tid = cf->tid + 1; capsnap = container_of(cf, struct ceph_cap_snap, cap_flush); - atomic_inc(&capsnap->nref); + refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); dout("__flush_snaps %p capsnap %p tid %llu %s\n", @@ -2202,7 +2207,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty)); - atomic_inc(&capsnap->nref); + refcount_inc(&capsnap->nref); spin_unlock(&ci->i_ceph_lock); ret = __send_flush_snap(inode, session, capsnap, cap->mseq, @@ -3633,13 +3638,19 @@ void ceph_handle_caps(struct ceph_mds_session *session, p += inline_len; } + if (le16_to_cpu(msg->hdr.version) >= 5) { + struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; + u32 epoch_barrier; + + ceph_decode_32_safe(&p, end, epoch_barrier, bad); + ceph_osdc_update_epoch_barrier(osdc, epoch_barrier); + } + if (le16_to_cpu(msg->hdr.version) >= 8) { u64 flush_tid; u32 caller_uid, caller_gid; - u32 osd_epoch_barrier; u32 pool_ns_len; - /* version >= 5 */ - ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad); + /* version >= 6 */ ceph_decode_64_safe(&p, end, flush_tid, bad); /* version >= 7 */ diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index f2ae393e2c31a2b3dbca7a5f81eeb5b81afa5e1b..4e2d112c982f4b12852ba7b8bad58327783bc3db 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -22,20 +22,19 @@ static int mdsmap_show(struct seq_file *s, void *p) { int i; struct ceph_fs_client *fsc = s->private; + struct ceph_mdsmap *mdsmap; if (fsc->mdsc == NULL || fsc->mdsc->mdsmap == NULL) return 0; - seq_printf(s, "epoch %d\n", fsc->mdsc->mdsmap->m_epoch); - seq_printf(s, "root %d\n", fsc->mdsc->mdsmap->m_root); - seq_printf(s, "session_timeout %d\n", - fsc->mdsc->mdsmap->m_session_timeout); - seq_printf(s, "session_autoclose %d\n", - fsc->mdsc->mdsmap->m_session_autoclose); - for (i = 0; i < fsc->mdsc->mdsmap->m_max_mds; i++) { - struct ceph_entity_addr *addr = - &fsc->mdsc->mdsmap->m_info[i].addr; - int state = fsc->mdsc->mdsmap->m_info[i].state; - + mdsmap = fsc->mdsc->mdsmap; + seq_printf(s, "epoch %d\n", mdsmap->m_epoch); + seq_printf(s, "root %d\n", mdsmap->m_root); + seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds); + seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout); + seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose); + for (i = 0; i < mdsmap->m_num_mds; i++) { + struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr; + int state = mdsmap->m_info[i].state; seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, ceph_pr_addr(&addr->in_addr), ceph_mds_state_name(state)); @@ -251,7 +250,7 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) goto out; snprintf(name, sizeof(name), "../../bdi/%s", - dev_name(fsc->backing_dev_info.dev)); + dev_name(fsc->sb->s_bdi->dev)); fsc->debugfs_bdi = debugfs_create_symlink("bdi", fsc->client->debugfs_dir, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 3e9ad501addfe92f171a40dffb93c65209819cbe..e071d23f61481bf23fc4407d72ea75c9d1ec2430 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -294,7 +294,7 @@ static 
int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_mds_client *mdsc = fsc->mdsc; int i; int err; - u32 ftype; + unsigned frag = -1; struct ceph_mds_reply_info_parsed *rinfo; dout("readdir %p file %p pos %llx\n", inode, file, ctx->pos); @@ -341,7 +341,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* do we have the correct frag content buffered? */ if (need_send_readdir(fi, ctx->pos)) { struct ceph_mds_request *req; - unsigned frag; int op = ceph_snap(inode) == CEPH_SNAPDIR ? CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR; @@ -352,8 +351,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) } if (is_hash_order(ctx->pos)) { - frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), - NULL, NULL); + /* fragtree isn't always accurate. choose frag + * based on previous reply when possible. */ + if (frag == (unsigned)-1) + frag = ceph_choose_frag(ci, fpos_hash(ctx->pos), + NULL, NULL); } else { frag = fpos_frag(ctx->pos); } @@ -378,7 +380,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ceph_mdsc_put_request(req); return -ENOMEM; } + } else if (is_hash_order(ctx->pos)) { + req->r_args.readdir.offset_hash = + cpu_to_le32(fpos_hash(ctx->pos)); } + req->r_dir_release_cnt = fi->dir_release_count; req->r_dir_ordered_cnt = fi->dir_ordered_count; req->r_readdir_cache_idx = fi->readdir_cache_idx; @@ -476,6 +482,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i; struct ceph_vino vino; ino_t ino; + u32 ftype; BUG_ON(rde->offset < ctx->pos); @@ -498,15 +505,17 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ctx->pos++; } + ceph_mdsc_put_request(fi->last_readdir); + fi->last_readdir = NULL; + if (fi->next_offset > 2) { - ceph_mdsc_put_request(fi->last_readdir); - fi->last_readdir = NULL; + frag = fi->frag; goto more; } /* more frags? 
*/ if (!ceph_frag_is_rightmost(fi->frag)) { - unsigned frag = ceph_frag_next(fi->frag); + frag = ceph_frag_next(fi->frag); if (is_hash_order(ctx->pos)) { loff_t new_pos = ceph_make_fpos(ceph_frag_value(frag), fi->next_offset, true); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 26cc95421cca6e62ef10bfd32b18cf1e526b7c51..3fdde0b283c9b86f7fa6aaa9585593699be29fca 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -13,6 +13,38 @@ #include "mds_client.h" #include "cache.h" +static __le32 ceph_flags_sys2wire(u32 flags) +{ + u32 wire_flags = 0; + + switch (flags & O_ACCMODE) { + case O_RDONLY: + wire_flags |= CEPH_O_RDONLY; + break; + case O_WRONLY: + wire_flags |= CEPH_O_WRONLY; + break; + case O_RDWR: + wire_flags |= CEPH_O_RDWR; + break; + } + +#define ceph_sys2wire(a) if (flags & a) { wire_flags |= CEPH_##a; flags &= ~a; } + + ceph_sys2wire(O_CREAT); + ceph_sys2wire(O_EXCL); + ceph_sys2wire(O_TRUNC); + ceph_sys2wire(O_DIRECTORY); + ceph_sys2wire(O_NOFOLLOW); + +#undef ceph_sys2wire + + if (flags) + dout("unused open flags: %x", flags); + + return cpu_to_le32(wire_flags); +} + /* * Ceph file operations * @@ -74,12 +106,9 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes, align = (unsigned long)(it->iov->iov_base + it->iov_offset) & (PAGE_SIZE - 1); npages = calc_pages_for(align, nbytes); - pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL); - if (!pages) { - pages = vmalloc(sizeof(*pages) * npages); - if (!pages) - return ERR_PTR(-ENOMEM); - } + pages = kvmalloc(sizeof(*pages) * npages, GFP_KERNEL); + if (!pages) + return ERR_PTR(-ENOMEM); for (idx = 0; idx < npages; ) { size_t start; @@ -123,7 +152,7 @@ prepare_open_request(struct super_block *sb, int flags, int create_mode) if (IS_ERR(req)) goto out; req->r_fmode = ceph_flags_to_mode(flags); - req->r_args.open.flags = cpu_to_le32(flags); + req->r_args.open.flags = ceph_flags_sys2wire(flags); req->r_args.open.mode = cpu_to_le32(create_mode); out: return req; @@ -192,7 +221,7 @@ int ceph_renew_caps(struct inode *inode) spin_lock(&ci->i_ceph_lock); wanted = __ceph_caps_file_wanted(ci); if (__ceph_is_any_real_caps(ci) && - (!(wanted & CEPH_CAP_ANY_WR) == 0 || ci->i_auth_cap)) { + (!(wanted & CEPH_CAP_ANY_WR) || ci->i_auth_cap)) { int issued = __ceph_caps_issued(ci, NULL); spin_unlock(&ci->i_ceph_lock); dout("renew caps %p want %s issued %s updating mds_wanted\n", @@ -781,6 +810,7 @@ static void ceph_aio_retry_work(struct work_struct *work) req->r_callback = ceph_aio_complete_req; req->r_inode = inode; req->r_priv = aio_req; + req->r_abort_on_full = true; ret = ceph_osdc_start_request(req->r_osdc, req, false); out: @@ -1088,19 +1118,22 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, out: ceph_osdc_put_request(req); - if (ret == 0) { - pos += len; - written += len; - - if (pos > i_size_read(inode)) { - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) - ceph_check_caps(ceph_inode(inode), - CHECK_CAPS_AUTHONLY, - NULL); - } - } else + if (ret != 0) { + ceph_set_error_write(ci); break; + } + + ceph_clear_error_write(ci); + pos += len; + written += len; + if (pos > i_size_read(inode)) { + check_caps = ceph_inode_set_size(inode, pos); + if (check_caps) + ceph_check_caps(ceph_inode(inode), + CHECK_CAPS_AUTHONLY, + NULL); + } + } if (ret != -EOLDSNAPC && written > 0) { @@ -1306,6 +1339,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) } retry_snap: + /* FIXME: not complete since it doesn't account for being at quota */ if (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL)) 
{ err = -ENOSPC; goto out; @@ -1327,7 +1361,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) inode, ceph_vinop(inode), pos, count, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || - (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { + (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC) || + (ci->i_ceph_flags & CEPH_I_ERROR_WRITE)) { struct ceph_snap_context *snapc; struct iov_iter data; inode_unlock(inode); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d449e1c03cbd791922148ad00c3d6d0f0e1599ce..dcce79b844064447af8e542fe34188a4f58e22d9 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1482,10 +1482,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) return readdir_prepopulate_inodes_only(req, session); - if (rinfo->hash_order && req->r_path2) { - last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, - req->r_path2, strlen(req->r_path2)); - last_hash = ceph_frag_value(last_hash); + if (rinfo->hash_order) { + if (req->r_path2) { + last_hash = ceph_str_hash(ci->i_dir_layout.dl_dir_hash, + req->r_path2, + strlen(req->r_path2)); + last_hash = ceph_frag_value(last_hash); + } else if (rinfo->offset_hash) { + /* mds understands offset_hash */ + WARN_ON_ONCE(req->r_readdir_offset != 2); + last_hash = le32_to_cpu(rhead->args.readdir.offset_hash); + } } if (rinfo->dir_dir && @@ -1510,7 +1517,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, } if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2 && - !(rinfo->hash_order && req->r_path2)) { + !(rinfo->hash_order && last_hash)) { /* note dir version at start of readdir so we can tell * if any dentries get dropped */ req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); @@ -2071,11 +2078,6 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); - if (ia_valid & ATTR_MODE) { - err = posix_acl_chmod(inode, attr->ia_mode); - if (err) - goto out_put; - } if (mask) { req->r_inode = inode; @@ -2088,14 +2090,12 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err, ceph_cap_string(dirtied), mask); - ceph_mdsc_put_request(req); - if (mask & CEPH_SETATTR_SIZE) - __ceph_do_pending_vmtruncate(inode); - ceph_free_cap_flush(prealloc_cf); - return err; -out_put: ceph_mdsc_put_request(req); ceph_free_cap_flush(prealloc_cf); + + if (err >= 0 && (mask & CEPH_SETATTR_SIZE)) + __ceph_do_pending_vmtruncate(inode); + return err; } @@ -2114,7 +2114,12 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (err != 0) return err; - return __ceph_setattr(inode, attr); + err = __ceph_setattr(inode, attr); + + if (err >= 0 && (attr->ia_valid & ATTR_MODE)) + err = posix_acl_chmod(inode, attr->ia_mode); + + return err; } /* diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c681762d76e66be1edf7004b1de8c13568ec6022..f38e56fa97129d646285fa2b433e8130c7b85312 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -189,6 +189,7 @@ static int parse_reply_info_dir(void **p, void *end, info->dir_end = !!(flags & CEPH_READDIR_FRAG_END); info->dir_complete = !!(flags & CEPH_READDIR_FRAG_COMPLETE); info->hash_order = !!(flags & CEPH_READDIR_HASH_ORDER); + info->offset_hash = !!(flags & CEPH_READDIR_OFFSET_HASH); } if (num == 0) goto done; @@ -378,9 +379,9 @@ const char *ceph_session_state_name(int s) static struct 
ceph_mds_session *get_session(struct ceph_mds_session *s) { - if (atomic_inc_not_zero(&s->s_ref)) { + if (refcount_inc_not_zero(&s->s_ref)) { dout("mdsc get_session %p %d -> %d\n", s, - atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref)); + refcount_read(&s->s_ref)-1, refcount_read(&s->s_ref)); return s; } else { dout("mdsc get_session %p 0 -- FAIL", s); @@ -391,8 +392,8 @@ static struct ceph_mds_session *get_session(struct ceph_mds_session *s) void ceph_put_mds_session(struct ceph_mds_session *s) { dout("mdsc put_session %p %d -> %d\n", s, - atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1); - if (atomic_dec_and_test(&s->s_ref)) { + refcount_read(&s->s_ref), refcount_read(&s->s_ref)-1); + if (refcount_dec_and_test(&s->s_ref)) { if (s->s_auth.authorizer) ceph_auth_destroy_authorizer(s->s_auth.authorizer); kfree(s); @@ -411,7 +412,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc, return NULL; session = mdsc->sessions[mds]; dout("lookup_mds_session %p %d\n", session, - atomic_read(&session->s_ref)); + refcount_read(&session->s_ref)); get_session(session); return session; } @@ -441,7 +442,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, { struct ceph_mds_session *s; - if (mds >= mdsc->mdsmap->m_max_mds) + if (mds >= mdsc->mdsmap->m_num_mds) return ERR_PTR(-EINVAL); s = kzalloc(sizeof(*s), GFP_NOFS); @@ -466,7 +467,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, INIT_LIST_HEAD(&s->s_caps); s->s_nr_caps = 0; s->s_trim_caps = 0; - atomic_set(&s->s_ref, 1); + refcount_set(&s->s_ref, 1); INIT_LIST_HEAD(&s->s_waiting); INIT_LIST_HEAD(&s->s_unsafe); s->s_num_cap_releases = 0; @@ -494,7 +495,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, } mdsc->sessions[mds] = s; atomic_inc(&mdsc->num_sessions); - atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ + refcount_inc(&s->s_ref); /* one ref to sessions[], one to caller */ ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); @@ -1004,7 +1005,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc, struct ceph_mds_session *ts; int i, mds = session->s_mds; - if (mds >= mdsc->mdsmap->m_max_mds) + if (mds >= mdsc->mdsmap->m_num_mds) return; mi = &mdsc->mdsmap->m_info[mds]; @@ -1551,9 +1552,15 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_msg *msg = NULL; struct ceph_mds_cap_release *head; struct ceph_mds_cap_item *item; + struct ceph_osd_client *osdc = &mdsc->fsc->client->osdc; struct ceph_cap *cap; LIST_HEAD(tmp_list); int num_cap_releases; + __le32 barrier, *cap_barrier; + + down_read(&osdc->lock); + barrier = cpu_to_le32(osdc->epoch_barrier); + up_read(&osdc->lock); spin_lock(&session->s_cap_lock); again: @@ -1571,7 +1578,11 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc, head = msg->front.iov_base; head->num = cpu_to_le32(0); msg->front.iov_len = sizeof(*head); + + msg->hdr.version = cpu_to_le16(2); + msg->hdr.compat_version = cpu_to_le16(1); } + cap = list_first_entry(&tmp_list, struct ceph_cap, session_caps); list_del(&cap->session_caps); @@ -1589,6 +1600,11 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc, ceph_put_cap(mdsc, cap); if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + // Append cap_barrier field + cap_barrier = msg->front.iov_base + msg->front.iov_len; + *cap_barrier = barrier; + msg->front.iov_len += sizeof(*cap_barrier); + msg->hdr.front_len = 
cpu_to_le32(msg->front.iov_len); dout("send_cap_releases mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); @@ -1604,6 +1620,11 @@ void ceph_send_cap_releases(struct ceph_mds_client *mdsc, spin_unlock(&session->s_cap_lock); if (msg) { + // Append cap_barrier field + cap_barrier = msg->front.iov_base + msg->front.iov_len; + *cap_barrier = barrier; + msg->front.iov_len += sizeof(*cap_barrier); + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); dout("send_cap_releases mds%d %p\n", session->s_mds, msg); ceph_con_send(&session->s_con, msg); @@ -1666,6 +1687,7 @@ struct ceph_mds_request * ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) { struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS); + struct timespec ts; if (!req) return ERR_PTR(-ENOMEM); @@ -1684,7 +1706,8 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) init_completion(&req->r_safe_completion); INIT_LIST_HEAD(&req->r_unsafe_item); - req->r_stamp = current_fs_time(mdsc->fsc->sb); + ktime_get_real_ts(&ts); + req->r_stamp = timespec_trunc(ts, mdsc->fsc->sb->s_time_gran); req->r_op = op; req->r_direct_mode = mode; @@ -1991,7 +2014,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, if (req->r_pagelist) { struct ceph_pagelist *pagelist = req->r_pagelist; - atomic_inc(&pagelist->refcnt); + refcount_inc(&pagelist->refcnt); ceph_msg_data_add_pagelist(msg, pagelist); msg->hdr.data_len = cpu_to_le32(pagelist->length); } else { @@ -2638,8 +2661,10 @@ static void handle_session(struct ceph_mds_session *session, seq = le64_to_cpu(h->seq); mutex_lock(&mdsc->mutex); - if (op == CEPH_SESSION_CLOSE) + if (op == CEPH_SESSION_CLOSE) { + get_session(session); __unregister_session(mdsc, session); + } /* FIXME: this ttl calculation is generous */ session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose; mutex_unlock(&mdsc->mutex); @@ -2728,6 +2753,8 @@ static void handle_session(struct ceph_mds_session *session, kick_requests(mdsc, mds); mutex_unlock(&mdsc->mutex); } + if (op == CEPH_SESSION_CLOSE) + ceph_put_mds_session(session); return; bad: @@ -3107,7 +3134,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, dout("check_new_map new %u old %u\n", newmap->m_epoch, oldmap->m_epoch); - for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) { if (mdsc->sessions[i] == NULL) continue; s = mdsc->sessions[i]; @@ -3121,15 +3148,33 @@ static void check_new_map(struct ceph_mds_client *mdsc, ceph_mdsmap_is_laggy(newmap, i) ? 
" (laggy)" : "", ceph_session_state_name(s->s_state)); - if (i >= newmap->m_max_mds || + if (i >= newmap->m_num_mds || memcmp(ceph_mdsmap_get_addr(oldmap, i), ceph_mdsmap_get_addr(newmap, i), sizeof(struct ceph_entity_addr))) { if (s->s_state == CEPH_MDS_SESSION_OPENING) { /* the session never opened, just close it * out now */ + get_session(s); + __unregister_session(mdsc, s); __wake_requests(mdsc, &s->s_waiting); + ceph_put_mds_session(s); + } else if (i >= newmap->m_num_mds) { + /* force close session for stopped mds */ + get_session(s); __unregister_session(mdsc, s); + __wake_requests(mdsc, &s->s_waiting); + kick_requests(mdsc, i); + mutex_unlock(&mdsc->mutex); + + mutex_lock(&s->s_mutex); + cleanup_session_requests(mdsc, s); + remove_session_caps(s); + mutex_unlock(&s->s_mutex); + + ceph_put_mds_session(s); + + mutex_lock(&mdsc->mutex); } else { /* just close it */ mutex_unlock(&mdsc->mutex); @@ -3167,7 +3212,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, } } - for (i = 0; i < newmap->m_max_mds && i < mdsc->max_sessions; i++) { + for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) { s = mdsc->sessions[i]; if (!s) continue; @@ -3881,7 +3926,7 @@ static struct ceph_connection *con_get(struct ceph_connection *con) struct ceph_mds_session *s = con->private; if (get_session(s)) { - dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref)); + dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref)); return con; } dout("mdsc con_get %p FAIL\n", s); @@ -3892,7 +3937,7 @@ static void con_put(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; - dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref) - 1); + dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1); ceph_put_mds_session(s); } diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ac0475a2daa749d3d689956cc45a2913c955ca8f..db57ae98ed345e280358a2ac65e75ac6c71e7c2b 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -82,9 +83,10 @@ struct ceph_mds_reply_info_parsed { struct ceph_mds_reply_dirfrag *dir_dir; size_t dir_buf_size; int dir_nr; - bool dir_complete; bool dir_end; + bool dir_complete; bool hash_order; + bool offset_hash; struct ceph_mds_reply_dir_entry *dir_entries; }; @@ -104,10 +106,13 @@ struct ceph_mds_reply_info_parsed { /* * cap releases are batched and sent to the MDS en masse. + * + * Account for per-message overhead of mds_cap_release header + * and __le32 for osd epoch barrier trailing field. 
*/ -#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - \ +#define CEPH_CAPS_PER_RELEASE ((PAGE_SIZE - sizeof(u32) - \ sizeof(struct ceph_mds_cap_release)) / \ - sizeof(struct ceph_mds_cap_item)) + sizeof(struct ceph_mds_cap_item)) /* @@ -156,7 +161,7 @@ struct ceph_mds_session { unsigned long s_renew_requested; /* last time we sent a renew req */ u64 s_renew_seq; - atomic_t s_ref; + refcount_t s_ref; struct list_head s_waiting; /* waiting requests */ struct list_head s_unsafe; /* unsafe requests */ }; @@ -373,7 +378,7 @@ __ceph_lookup_mds_session(struct ceph_mds_client *, int mds); static inline struct ceph_mds_session * ceph_get_mds_session(struct ceph_mds_session *s) { - atomic_inc(&s->s_ref); + refcount_inc(&s->s_ref); return s; } diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 5454e2327a5f77874d63344802709697ceb64de1..1a748cf88535bc80b8060be0cae2d4a92f85599f 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -22,11 +22,11 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) int i; /* special case for one mds */ - if (1 == m->m_max_mds && m->m_info[0].state > 0) + if (1 == m->m_num_mds && m->m_info[0].state > 0) return 0; /* count */ - for (i = 0; i < m->m_max_mds; i++) + for (i = 0; i < m->m_num_mds; i++) if (m->m_info[i].state > 0) n++; if (n == 0) @@ -135,8 +135,9 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_session_autoclose = ceph_decode_32(p); m->m_max_file_size = ceph_decode_64(p); m->m_max_mds = ceph_decode_32(p); + m->m_num_mds = m->m_max_mds; - m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS); + m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS); if (m->m_info == NULL) goto nomem; @@ -207,9 +208,20 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ceph_pr_addr(&addr.in_addr), ceph_mds_state_name(state)); - if (mds < 0 || mds >= m->m_max_mds || state <= 0) + if (mds < 0 || state <= 0) continue; + if (mds >= m->m_num_mds) { + int new_num = max(mds + 1, m->m_num_mds * 2); + void *new_m_info = krealloc(m->m_info, + new_num * sizeof(*m->m_info), + GFP_NOFS | __GFP_ZERO); + if (!new_m_info) + goto nomem; + m->m_info = new_m_info; + m->m_num_mds = new_num; + } + info = &m->m_info[mds]; info->global_id = global_id; info->state = state; @@ -229,6 +241,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->export_targets = NULL; } } + if (m->m_num_mds > m->m_max_mds) { + /* find max up mds */ + for (i = m->m_num_mds; i >= m->m_max_mds; i--) { + if (i == 0 || m->m_info[i-1].state > 0) + break; + } + m->m_num_mds = i; + } /* pg_pools */ ceph_decode_32_safe(p, end, n, bad); @@ -270,12 +290,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) for (i = 0; i < n; i++) { s32 mds = ceph_decode_32(p); - if (mds >= 0 && mds < m->m_max_mds) { + if (mds >= 0 && mds < m->m_num_mds) { if (m->m_info[mds].laggy) num_laggy++; } } m->m_num_laggy = num_laggy; + + if (n > m->m_num_mds) { + void *new_m_info = krealloc(m->m_info, + n * sizeof(*m->m_info), + GFP_NOFS | __GFP_ZERO); + if (!new_m_info) + goto nomem; + m->m_info = new_m_info; + } + m->m_num_mds = n; } /* inc */ @@ -341,7 +371,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m) { int i; - for (i = 0; i < m->m_max_mds; i++) + for (i = 0; i < m->m_num_mds; i++) kfree(m->m_info[i].export_targets); kfree(m->m_info); kfree(m->m_data_pg_pools); @@ -357,7 +387,7 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) return false; if (m->m_num_laggy > 0) return false; - for (i = 0; i < m->m_max_mds; i++) { + for (i = 0; i < m->m_num_mds; i++) 
{ if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) nr_active++; } diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 8f8b41c2ef0f7d472afad0e1abcb4801ac11b221..dab5d6732345b218e12f9bf7d6fea1c5dac38d55 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -519,7 +519,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) capsnap->need_flush ? "" : "no_flush"); ihold(inode); - atomic_set(&capsnap->nref, 1); + refcount_set(&capsnap->nref, 1); INIT_LIST_HEAD(&capsnap->ci_item); capsnap->follows = old_snapc->seq; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 0ec8d0114e57ba80fdc46b1acdc9b7de7373e276..8d7918ce694a9c0d980641ab66b7be6d0aaa33a5 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -544,10 +544,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, struct ceph_options *opt) { struct ceph_fs_client *fsc; - const u64 supported_features = - CEPH_FEATURE_FLOCK | CEPH_FEATURE_DIRLAYOUTHASH | - CEPH_FEATURE_MDSENC | CEPH_FEATURE_MDS_INLINE_DATA; - const u64 required_features = 0; int page_count; size_t size; int err = -ENOMEM; @@ -556,8 +552,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, if (!fsc) return ERR_PTR(-ENOMEM); - fsc->client = ceph_create_client(opt, fsc, supported_features, - required_features); + fsc->client = ceph_create_client(opt, fsc); if (IS_ERR(fsc->client)) { err = PTR_ERR(fsc->client); goto fail; @@ -579,10 +574,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, atomic_long_set(&fsc->writeback_count, 0); - err = bdi_init(&fsc->backing_dev_info); - if (err < 0) - goto fail_client; - err = -ENOMEM; /* * The number of concurrent works can be high but they don't need @@ -590,7 +581,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, */ fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1); if (fsc->wb_wq == NULL) - goto fail_bdi; + goto fail_client; fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1); if (fsc->pg_inv_wq == NULL) goto fail_wb_wq; @@ -624,8 +615,6 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, destroy_workqueue(fsc->pg_inv_wq); fail_wb_wq: destroy_workqueue(fsc->wb_wq); -fail_bdi: - bdi_destroy(&fsc->backing_dev_info); fail_client: ceph_destroy_client(fsc->client); fail: @@ -643,8 +632,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) destroy_workqueue(fsc->pg_inv_wq); destroy_workqueue(fsc->trunc_wq); - bdi_destroy(&fsc->backing_dev_info); - mempool_destroy(fsc->wb_pagevec_pool); destroy_mount_options(fsc->mount_options); @@ -937,33 +924,32 @@ static int ceph_compare_super(struct super_block *sb, void *data) */ static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); -static int ceph_register_bdi(struct super_block *sb, - struct ceph_fs_client *fsc) +static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) { int err; + err = super_setup_bdi_name(sb, "ceph-%ld", + atomic_long_inc_return(&bdi_seq)); + if (err) + return err; + /* set ra_pages based on rasize mount option? 
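(for instance, a hypothetical rasize=8M mount with 4 KiB pages yields ra_pages = 2048)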
*/ if (fsc->mount_options->rasize >= PAGE_SIZE) - fsc->backing_dev_info.ra_pages = + sb->s_bdi->ra_pages = (fsc->mount_options->rasize + PAGE_SIZE - 1) >> PAGE_SHIFT; else - fsc->backing_dev_info.ra_pages = - VM_MAX_READAHEAD * 1024 / PAGE_SIZE; + sb->s_bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_SIZE; if (fsc->mount_options->rsize > fsc->mount_options->rasize && fsc->mount_options->rsize >= PAGE_SIZE) - fsc->backing_dev_info.io_pages = + sb->s_bdi->io_pages = (fsc->mount_options->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; else if (fsc->mount_options->rsize == 0) - fsc->backing_dev_info.io_pages = ULONG_MAX; + sb->s_bdi->io_pages = ULONG_MAX; - err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", - atomic_long_inc_return(&bdi_seq)); - if (!err) - sb->s_bdi = &fsc->backing_dev_info; - return err; + return 0; } static struct dentry *ceph_mount(struct file_system_type *fs_type, @@ -1018,7 +1004,7 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type, dout("get_sb got existing client %p\n", fsc); } else { dout("get_sb using new client %p\n", fsc); - err = ceph_register_bdi(sb, fsc); + err = ceph_setup_bdi(sb, fsc); if (err < 0) { res = ERR_PTR(err); goto out_splat; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index fe6b9cfc4013e63c8b6f6ea3b9b3bd1d4eb89364..a973acd8beaff67b5b0e67b1217c9bb7419c9982 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -14,6 +14,7 @@ #include #include #include +#include #include @@ -92,8 +93,6 @@ struct ceph_fs_client { struct workqueue_struct *trunc_wq; atomic_long_t writeback_count; - struct backing_dev_info backing_dev_info; - #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_dentry_lru, *debugfs_caps; struct dentry *debugfs_congestion_kb; @@ -162,7 +161,7 @@ struct ceph_cap_flush { * data before flushing the snapped state (tracked here) back to the MDS. */ struct ceph_cap_snap { - atomic_t nref; + refcount_t nref; struct list_head ci_item; struct ceph_cap_flush cap_flush; @@ -191,7 +190,7 @@ struct ceph_cap_snap { static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) { - if (atomic_dec_and_test(&capsnap->nref)) { + if (refcount_dec_and_test(&capsnap->nref)) { if (capsnap->xattr_blob) ceph_buffer_put(capsnap->xattr_blob); kfree(capsnap); @@ -473,6 +472,32 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */ #define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ #define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ +#define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ + +/* + * We set the ERROR_WRITE bit when we start seeing write errors on an inode + * and then clear it when they start succeeding. Note that we do a lockless + * check first, and only take the lock if it looks like it needs to be changed. + * The write submission code just takes this as a hint, so we're not too + * worried if a few slip through in either direction. 
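The atomic_t to refcount_t conversions in this series (s_ref above, capsnap->nref below) are about overflow semantics: refcount_inc() saturates and warns instead of wrapping to zero, which would otherwise let a leaked increment turn into a use-after-free on a later put. A hedged sketch of the resulting get/put idiom, with placeholder names:

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
	refcount_t nref;
};

static struct obj *obj_get(struct obj *o)
{
	refcount_inc(&o->nref);	/* saturates rather than wraps */
	return o;
}

static void obj_put(struct obj *o)
{
	if (refcount_dec_and_test(&o->nref))	/* true only for the final put */
		kfree(o);
}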
+ */
+static inline void ceph_set_error_write(struct ceph_inode_info *ci)
+{
+	if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE)) {
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_ceph_flags |= CEPH_I_ERROR_WRITE;
+		spin_unlock(&ci->i_ceph_lock);
+	}
+}
+
+static inline void ceph_clear_error_write(struct ceph_inode_info *ci)
+{
+	if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ERROR_WRITE) {
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_ceph_flags &= ~CEPH_I_ERROR_WRITE;
+		spin_unlock(&ci->i_ceph_lock);
+	}
+}
 
 static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
					    long long release_count,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index febc28f9e2c27648e1621531e90cefc482ed2e5d..75267cdd5dfd822b3290221b269cb0fde64a8e83 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -392,6 +392,7 @@ static int __set_xattr(struct ceph_inode_info *ci,
 	if (update_xattr) {
 		int err = 0;
+
 		if (xattr && (flags & XATTR_CREATE))
 			err = -EEXIST;
 		else if (!xattr && (flags & XATTR_REPLACE))
@@ -399,12 +400,14 @@ static int __set_xattr(struct ceph_inode_info *ci,
 		if (err) {
 			kfree(name);
 			kfree(val);
+			kfree(*newxattr);
 			return err;
 		}
 		if (update_xattr < 0) {
 			if (xattr)
 				__remove_xattr(ci, xattr);
 			kfree(name);
+			kfree(*newxattr);
 			return 0;
 		}
 	}
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 44a240c4bb658149c2d2b5b527af35fbf1a95d49..fb8507f521b265ba8feba385154cc908eb812f44 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -471,6 +471,85 @@ int cdev_add(struct cdev *p, dev_t dev, unsigned count)
 	return 0;
 }
 
+/**
+ * cdev_set_parent() - set the parent kobject for a char device
+ * @p: the cdev structure
+ * @kobj: the kobject to take a reference to
+ *
+ * cdev_set_parent() sets a parent kobject which will be referenced
+ * appropriately so the parent is not freed before the cdev. This
+ * should be called before cdev_add.
+ */
+void cdev_set_parent(struct cdev *p, struct kobject *kobj)
+{
+	WARN_ON(!kobj->state_initialized);
+	p->kobj.parent = kobj;
+}
+
+/**
+ * cdev_device_add() - add a char device and its corresponding
+ *	struct device, linking them together
+ * @dev: the device structure
+ * @cdev: the cdev structure
+ *
+ * cdev_device_add() adds the char device represented by @cdev to the system,
+ * just as cdev_add does. It then adds @dev to the system using device_add.
+ * The dev_t for the char device will be taken from the struct device which
+ * needs to be initialized first. This helper function correctly takes a
+ * reference to the parent device so the parent will not get released until
+ * all references to the cdev are released.
+ *
+ * This helper uses dev->devt for the device number. If it is not set
+ * it will not add the cdev and it will be equivalent to device_add.
+ *
+ * This function should be used whenever the struct cdev and the
+ * struct device are members of the same structure whose lifetime is
+ * managed by the struct device.
+ *
+ * NOTE: Callers must assume that userspace was able to open the cdev and
+ * can call cdev fops callbacks at any time, even if this function fails.
+ */
+int cdev_device_add(struct cdev *cdev, struct device *dev)
+{
+	int rc = 0;
+
+	if (dev->devt) {
+		cdev_set_parent(cdev, &dev->kobj);
+
+		rc = cdev_add(cdev, dev->devt, 1);
+		if (rc)
+			return rc;
+	}
+
+	rc = device_add(dev);
+	if (rc)
+		cdev_del(cdev);
+
+	return rc;
+}
+
+/**
+ * cdev_device_del() - inverse of cdev_device_add
+ * @dev: the device structure
+ * @cdev: the cdev structure
+ *
+ * cdev_device_del() is a helper function to call cdev_del and device_del.
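To make the intended pairing of these two helpers concrete, here is a minimal sketch (not part of this patch) of the embedded-cdev arrangement they are written for: the cdev and the device are members of one structure whose lifetime the device's reference count manages. The foo names are placeholders:

#include <linux/cdev.h>
#include <linux/device.h>
#include <linux/fs.h>

struct foo_dev {
	struct cdev cdev;
	struct device dev;
};

static int foo_register(struct foo_dev *foo, dev_t devt,
			const struct file_operations *fops)
{
	device_initialize(&foo->dev);	/* devt must be set before adding */
	foo->dev.devt = devt;
	cdev_init(&foo->cdev, fops);
	/* adds the cdev with the device's kobject as parent, then the device */
	return cdev_device_add(&foo->cdev, &foo->dev);
}

static void foo_unregister(struct foo_dev *foo)
{
	cdev_device_del(&foo->cdev, &foo->dev);
	put_device(&foo->dev);		/* drop the initial reference */
}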
+ * It should be used whenever cdev_device_add is used. + * + * If dev->devt is not set it will not remove the cdev and will be equivalent + * to device_del. + * + * NOTE: This guarantees that associated sysfs callbacks are not running + * or runnable, however any cdevs already open will remain and their fops + * will still be callable even after this function returns. + */ +void cdev_device_del(struct cdev *cdev, struct device *dev) +{ + device_del(dev); + if (dev->devt) + cdev_del(cdev); +} + static void cdev_unmap(dev_t dev, unsigned count) { kobj_unmap(cdev_map, dev, count); @@ -482,6 +561,10 @@ static void cdev_unmap(dev_t dev, unsigned count) * * cdev_del() removes @p from the system, possibly freeing the structure * itself. + * + * NOTE: This guarantees that cdev device will no longer be able to be + * opened, however any cdevs already open will remain and their fops will + * still be callable even after cdev_del returns. */ void cdev_del(struct cdev *p) { @@ -570,5 +653,8 @@ EXPORT_SYMBOL(cdev_init); EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); +EXPORT_SYMBOL(cdev_set_parent); +EXPORT_SYMBOL(cdev_device_add); +EXPORT_SYMBOL(cdev_device_del); EXPORT_SYMBOL(__register_chrdev); EXPORT_SYMBOL(__unregister_chrdev); diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 07ed81cf1552e6ae97be753de0720741e3f80068..cbd216b572390ca76e481aabf9a311e4b7749d7c 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -68,7 +68,6 @@ struct cifs_sb_info { umode_t mnt_dir_mode; unsigned int mnt_cifs_flags; char *mountdata; /* options received at mount time or via DFS refs */ - struct backing_dev_info bdi; struct delayed_work prune_tlinks; struct rcu_head rcu; char *prepath; diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 02b071bf3732ac29808183974636e2ac9ca56ce6..a0b3e7d1be484fd3f026258577e18c165f738f0e 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -83,6 +83,9 @@ convert_sfm_char(const __u16 src_char, char *target) case SFM_COLON: *target = ':'; break; + case SFM_DOUBLEQUOTE: + *target = '"'; + break; case SFM_ASTERISK: *target = '*'; break; @@ -418,6 +421,9 @@ static __le16 convert_to_sfm_char(char src_char, bool end_of_string) case ':': dest_char = cpu_to_le16(SFM_COLON); break; + case '"': + dest_char = cpu_to_le16(SFM_DOUBLEQUOTE); + break; case '*': dest_char = cpu_to_le16(SFM_ASTERISK); break; diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index 3d7298cc0aeb357899dfcd9216d5b76f17cc64d9..8a79a34e66b8bee8cf9575eda76c77faae30862e 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -57,6 +57,7 @@ * not conflict (although almost does) with the mapping above. 
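The remapping below follows the SFM (Services for Mac) convention: characters that are legal in POSIX file names but reserved on Windows travel as code points in the 0xF0XX private-use range and are mapped back on the read path. Illustration only, covering the newly handled double quote plus one pre-existing case:

#include <linux/types.h>
#include <asm/byteorder.h>

static __le16 sfm_encode(char c)
{
	switch (c) {
	case '"':
		return cpu_to_le16(0xF020);	/* SFM_DOUBLEQUOTE */
	case '*':
		return cpu_to_le16(0xF021);	/* SFM_ASTERISK */
	default:
		return cpu_to_le16(c);		/* not a reserved character */
	}
}

static char sfm_decode(__u16 wire)
{
	switch (wire) {
	case 0xF020:
		return '"';
	case 0xF021:
		return '*';
	default:
		return 0;	/* 0: caller keeps the code point as-is */
	}
}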
*/ +#define SFM_DOUBLEQUOTE ((__u16) 0xF020) #define SFM_ASTERISK ((__u16) 0xF021) #define SFM_QUESTION ((__u16) 0xF025) #define SFM_COLON ((__u16) 0xF022) @@ -64,8 +65,8 @@ #define SFM_LESSTHAN ((__u16) 0xF023) #define SFM_PIPE ((__u16) 0xF027) #define SFM_SLASH ((__u16) 0xF026) -#define SFM_PERIOD ((__u16) 0xF028) -#define SFM_SPACE ((__u16) 0xF029) +#define SFM_SPACE ((__u16) 0xF028) +#define SFM_PERIOD ((__u16) 0xF029) /* * Mapping mechanism to use when one of the seven reserved characters is diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 058ac9b36f0470a4d04478377cdc12a041df0db4..68abbb0db60898cc417ecfa03a8c11e5fe136c93 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -478,6 +478,7 @@ find_timestamp(struct cifs_ses *ses) unsigned char *blobptr; unsigned char *blobend; struct ntlmssp2_name *attrptr; + struct timespec ts; if (!ses->auth_key.len || !ses->auth_key.response) return 0; @@ -502,7 +503,8 @@ find_timestamp(struct cifs_ses *ses) blobptr += attrsize; /* advance attr value */ } - return cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + ktime_get_real_ts(&ts); + return cpu_to_le64(cifs_UnixTimeToNT(ts)); } static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index dd3f5fabfdf6a35e25f9dae28020dcc9bba65007..9a1667e0e8d604ededf2a7ceb27ffcebf24b0ab9 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include "cifsfs.h" @@ -87,6 +88,7 @@ extern mempool_t *cifs_req_poolp; extern mempool_t *cifs_mid_poolp; struct workqueue_struct *cifsiod_wq; +struct workqueue_struct *cifsoplockd_wq; __u32 cifs_lock_secret; /* @@ -138,7 +140,12 @@ cifs_read_super(struct super_block *sb) sb->s_magic = CIFS_MAGIC_NUMBER; sb->s_op = &cifs_super_ops; sb->s_xattr = cifs_xattr_handlers; - sb->s_bdi = &cifs_sb->bdi; + rc = super_setup_bdi(sb); + if (rc) + goto out_no_root; + /* tune readahead according to rsize */ + sb->s_bdi->ra_pages = cifs_sb->rsize / PAGE_SIZE; + sb->s_blocksize = CIFS_MAX_MSGSIZE; sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ inode = cifs_root_iget(sb); @@ -1369,9 +1376,16 @@ init_cifs(void) goto out_clean_proc; } + cifsoplockd_wq = alloc_workqueue("cifsoplockd", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!cifsoplockd_wq) { + rc = -ENOMEM; + goto out_destroy_cifsiod_wq; + } + rc = cifs_fscache_register(); if (rc) - goto out_destroy_wq; + goto out_destroy_cifsoplockd_wq; rc = cifs_init_inodecache(); if (rc) @@ -1419,7 +1433,9 @@ init_cifs(void) cifs_destroy_inodecache(); out_unreg_fscache: cifs_fscache_unregister(); -out_destroy_wq: +out_destroy_cifsoplockd_wq: + destroy_workqueue(cifsoplockd_wq); +out_destroy_cifsiod_wq: destroy_workqueue(cifsiod_wq); out_clean_proc: cifs_proc_clean(); @@ -1442,6 +1458,7 @@ exit_cifs(void) cifs_destroy_mids(); cifs_destroy_inodecache(); cifs_fscache_unregister(); + destroy_workqueue(cifsoplockd_wq); destroy_workqueue(cifsiod_wq); cifs_proc_clean(); } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 37f5a41cc50cc523cd76c790100398d4db9e80ca..8be55be70faf6c278de7b7c623501385bf21acaf 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1115,6 +1115,23 @@ struct cifs_io_parms { struct cifs_tcon *tcon; }; +struct cifs_aio_ctx { + struct kref refcount; + struct list_head list; + struct mutex aio_mutex; + struct completion done; + struct iov_iter iter; + struct kiocb *iocb; + struct cifsFileInfo *cfile; + struct bio_vec *bv; + loff_t pos; + unsigned int 
npages; + ssize_t rc; + unsigned int len; + unsigned int total_len; + bool should_dirty; +}; + struct cifs_readdata; /* asynchronous read support */ @@ -1124,6 +1141,7 @@ struct cifs_readdata { struct completion done; struct cifsFileInfo *cfile; struct address_space *mapping; + struct cifs_aio_ctx *ctx; __u64 offset; unsigned int bytes; unsigned int got_bytes; @@ -1154,6 +1172,7 @@ struct cifs_writedata { enum writeback_sync_modes sync_mode; struct work_struct work; struct cifsFileInfo *cfile; + struct cifs_aio_ctx *ctx; __u64 offset; pid_t pid; unsigned int bytes; @@ -1683,6 +1702,7 @@ void cifs_oplock_break(struct work_struct *work); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; +extern struct workqueue_struct *cifsoplockd_wq; extern __u32 cifs_lock_secret; extern mempool_t *cifs_mid_poolp; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 97e5d236d26559806ca8bc278f7c248e0579ef77..e49958c3f8bbded4265b71e47a87d736a34676da 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -535,4 +535,7 @@ int __cifs_calc_signature(struct smb_rqst *rqst, struct shash_desc *shash); enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, enum securityEnum); +struct cifs_aio_ctx *cifs_aio_ctx_alloc(void); +void cifs_aio_ctx_release(struct kref *refcount); +int setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw); #endif /* _CIFSPROTO_H */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 5d21f00ae341b4ec002eb33dd79c7a16a1e1dfa4..4c01b3f9abf02efb70648e030998220bab13bd55 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -478,14 +478,14 @@ decode_lanman_negprot_rsp(struct TCP_Server_Info *server, NEGOTIATE_RSP *pSMBr) * this requirement. */ int val, seconds, remain, result; - struct timespec ts, utc; - utc = CURRENT_TIME; + struct timespec ts; + unsigned long utc = ktime_get_real_seconds(); ts = cnvrtDosUnixTm(rsp->SrvTime.Date, rsp->SrvTime.Time, 0); cifs_dbg(FYI, "SrvTime %d sec since 1970 (utc: %d) diff: %d\n", - (int)ts.tv_sec, (int)utc.tv_sec, - (int)(utc.tv_sec - ts.tv_sec)); - val = (int)(utc.tv_sec - ts.tv_sec); + (int)ts.tv_sec, (int)utc, + (int)(utc - ts.tv_sec)); + val = (int)(utc - ts.tv_sec); seconds = abs(val); result = (seconds / MIN_TZ_ADJ) * MIN_TZ_ADJ; remain = seconds % MIN_TZ_ADJ; @@ -718,6 +718,9 @@ CIFSSMBEcho(struct TCP_Server_Info *server) if (rc) return rc; + if (server->capabilities & CAP_UNICODE) + smb->hdr.Flags2 |= SMBFLG2_UNICODE; + /* set up echo request */ smb->hdr.Tid = 0xffff; smb->hdr.WordCount = 1; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index d82467cfb0e2df5ef7e655bc601f4f17f3b890b1..9365c0cf77adbadee2313776f1a9dc7b5468f0bb 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -1945,9 +1946,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } if (!got_ip) { + int len; + const char *slash; + /* No ip= option specified? Try to get it from UNC */ - if (!cifs_convert_address(dstaddr, &vol->UNC[2], - strlen(&vol->UNC[2]))) { + /* Use the address part of the UNC. 
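The cifs_aio_ctx introduced above ties one userspace aio request to its many network subrequests through a reference count: the submitter holds one reference, every queued subrequest takes another, and whichever path drops the last one frees the context; a synchronous caller blocks on the completion, while a true aio caller returns -EIOCBQUEUED and is finished via iocb->ki_complete(). Reduced to a hedged sketch with generic names and no error handling:

#include <linux/completion.h>
#include <linux/kref.h>
#include <linux/slab.h>

struct aio_ctx {
	struct kref refcount;
	struct completion done;
};

static void ctx_release(struct kref *kref)
{
	kfree(container_of(kref, struct aio_ctx, refcount));
}

static void subrequest_queue(struct aio_ctx *ctx)
{
	kref_get(&ctx->refcount);	/* each in-flight request pins the ctx */
	/* ... hand the request to the transport ... */
}

static void subrequest_done(struct aio_ctx *ctx)
{
	complete(&ctx->done);		/* or iocb->ki_complete() for aio */
	kref_put(&ctx->refcount, ctx_release);	/* may free the ctx */
}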
*/ + slash = strchr(&vol->UNC[2], '\\'); + len = slash - &vol->UNC[2]; + if (!cifs_convert_address(dstaddr, &vol->UNC[2], len)) { pr_err("Unable to determine destination address.\n"); goto cifs_parse_mount_err; } @@ -2912,16 +2918,14 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) { struct cifs_sb_info *old = CIFS_SB(sb); struct cifs_sb_info *new = mnt_data->cifs_sb; + bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; + bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; - if (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) { - if (!(new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH)) - return 0; - /* The prepath should be null terminated strings */ - if (strcmp(new->prepath, old->prepath)) - return 0; - + if (old_set && new_set && !strcmp(new->prepath, old->prepath)) return 1; - } + else if (!old_set && !new_set) + return 1; + return 0; } @@ -3692,10 +3696,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) int referral_walks_count = 0; #endif - rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs"); - if (rc) - return rc; - #ifdef CONFIG_CIFS_DFS_UPCALL try_mount_again: /* cleanup activities if we're chasing a referral */ @@ -3723,7 +3723,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) server = cifs_get_tcp_session(volume_info); if (IS_ERR(server)) { rc = PTR_ERR(server); - bdi_destroy(&cifs_sb->bdi); goto out; } if ((volume_info->max_credits < 20) || @@ -3780,9 +3779,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) cifs_sb->wsize = server->ops->negotiate_wsize(tcon, volume_info); cifs_sb->rsize = server->ops->negotiate_rsize(tcon, volume_info); - /* tune readahead according to rsize */ - cifs_sb->bdi.ra_pages = cifs_sb->rsize / PAGE_SIZE; - remote_path_check: #ifdef CONFIG_CIFS_DFS_UPCALL /* @@ -3899,7 +3895,6 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info) cifs_put_smb_ses(ses); else cifs_put_tcp_session(server, 0); - bdi_destroy(&cifs_sb->bdi); } out: @@ -4102,7 +4097,6 @@ cifs_umount(struct cifs_sb_info *cifs_sb) } spin_unlock(&cifs_sb->tlink_tree_lock); - bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb->mountdata); kfree(cifs_sb->prepath); call_rcu(&cifs_sb->rcu, delayed_free); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 21d4045357397de647b8fe0282ed1a436b7e3a7f..6ef78ad838e6e3e8b4cf24e72de24fcba8e4313c 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2458,11 +2458,14 @@ cifs_uncached_writedata_release(struct kref *refcount) struct cifs_writedata *wdata = container_of(refcount, struct cifs_writedata, refcount); + kref_put(&wdata->ctx->refcount, cifs_aio_ctx_release); for (i = 0; i < wdata->nr_pages; i++) put_page(wdata->pages[i]); cifs_writedata_release(refcount); } +static void collect_uncached_write_data(struct cifs_aio_ctx *ctx); + static void cifs_uncached_writev_complete(struct work_struct *work) { @@ -2478,7 +2481,8 @@ cifs_uncached_writev_complete(struct work_struct *work) spin_unlock(&inode->i_lock); complete(&wdata->done); - + collect_uncached_write_data(wdata->ctx); + /* the below call can possibly free the last ref to aio ctx */ kref_put(&wdata->refcount, cifs_uncached_writedata_release); } @@ -2527,7 +2531,8 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from, static int cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, struct cifsFileInfo *open_file, - struct cifs_sb_info *cifs_sb, struct list_head *wdata_list) + struct cifs_sb_info *cifs_sb, struct list_head *wdata_list, 
+ struct cifs_aio_ctx *ctx) { int rc = 0; size_t cur_len; @@ -2595,6 +2600,8 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, wdata->pagesz = PAGE_SIZE; wdata->tailsz = cur_len - ((nr_pages - 1) * PAGE_SIZE); wdata->credits = credits; + wdata->ctx = ctx; + kref_get(&ctx->refcount); if (!wdata->cfile->invalidHandle || !(rc = cifs_reopen_file(wdata->cfile, false))) @@ -2620,81 +2627,61 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, return rc; } -ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) +static void collect_uncached_write_data(struct cifs_aio_ctx *ctx) { - struct file *file = iocb->ki_filp; - ssize_t total_written = 0; - struct cifsFileInfo *open_file; + struct cifs_writedata *wdata, *tmp; struct cifs_tcon *tcon; struct cifs_sb_info *cifs_sb; - struct cifs_writedata *wdata, *tmp; - struct list_head wdata_list; - struct iov_iter saved_from = *from; + struct dentry *dentry = ctx->cfile->dentry; + unsigned int i; int rc; - /* - * BB - optimize the way when signing is disabled. We can drop this - * extra memory-to-memory copying and use iovec buffers for constructing - * write request. - */ - - rc = generic_write_checks(iocb, from); - if (rc <= 0) - return rc; - - INIT_LIST_HEAD(&wdata_list); - cifs_sb = CIFS_FILE_SB(file); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_writev) - return -ENOSYS; + tcon = tlink_tcon(ctx->cfile->tlink); + cifs_sb = CIFS_SB(dentry->d_sb); - rc = cifs_write_from_iter(iocb->ki_pos, iov_iter_count(from), from, - open_file, cifs_sb, &wdata_list); + mutex_lock(&ctx->aio_mutex); - /* - * If at least one write was successfully sent, then discard any rc - * value from the later writes. If the other write succeeds, then - * we'll end up returning whatever was written. If it fails, then - * we'll get a new rc value from that. - */ - if (!list_empty(&wdata_list)) - rc = 0; + if (list_empty(&ctx->list)) { + mutex_unlock(&ctx->aio_mutex); + return; + } + rc = ctx->rc; /* * Wait for and collect replies for any successful sends in order of - * increasing offset. Once an error is hit or we get a fatal signal - * while waiting, then return without waiting for any more replies. + * increasing offset. Once an error is hit, then return without waiting + * for any more replies. */ restart_loop: - list_for_each_entry_safe(wdata, tmp, &wdata_list, list) { + list_for_each_entry_safe(wdata, tmp, &ctx->list, list) { if (!rc) { - /* FIXME: freezable too? 
*/ - rc = wait_for_completion_killable(&wdata->done); - if (rc) - rc = -EINTR; - else if (wdata->result) + if (!try_wait_for_completion(&wdata->done)) { + mutex_unlock(&ctx->aio_mutex); + return; + } + + if (wdata->result) rc = wdata->result; else - total_written += wdata->bytes; + ctx->total_len += wdata->bytes; /* resend call if it's a retryable error */ if (rc == -EAGAIN) { struct list_head tmp_list; - struct iov_iter tmp_from = saved_from; + struct iov_iter tmp_from = ctx->iter; INIT_LIST_HEAD(&tmp_list); list_del_init(&wdata->list); iov_iter_advance(&tmp_from, - wdata->offset - iocb->ki_pos); + wdata->offset - ctx->pos); rc = cifs_write_from_iter(wdata->offset, wdata->bytes, &tmp_from, - open_file, cifs_sb, &tmp_list); + ctx->cfile, cifs_sb, &tmp_list, + ctx); - list_splice(&tmp_list, &wdata_list); + list_splice(&tmp_list, &ctx->list); kref_put(&wdata->refcount, cifs_uncached_writedata_release); @@ -2705,12 +2692,111 @@ ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) kref_put(&wdata->refcount, cifs_uncached_writedata_release); } + for (i = 0; i < ctx->npages; i++) + put_page(ctx->bv[i].bv_page); + + cifs_stats_bytes_written(tcon, ctx->total_len); + set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(dentry->d_inode)->flags); + + ctx->rc = (rc == 0) ? ctx->total_len : rc; + + mutex_unlock(&ctx->aio_mutex); + + if (ctx->iocb && ctx->iocb->ki_complete) + ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0); + else + complete(&ctx->done); +} + +ssize_t cifs_user_writev(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + ssize_t total_written = 0; + struct cifsFileInfo *cfile; + struct cifs_tcon *tcon; + struct cifs_sb_info *cifs_sb; + struct cifs_aio_ctx *ctx; + struct iov_iter saved_from = *from; + int rc; + + /* + * BB - optimize the way when signing is disabled. We can drop this + * extra memory-to-memory copying and use iovec buffers for constructing + * write request. + */ + + rc = generic_write_checks(iocb, from); + if (rc <= 0) + return rc; + + cifs_sb = CIFS_FILE_SB(file); + cfile = file->private_data; + tcon = tlink_tcon(cfile->tlink); + + if (!tcon->ses->server->ops->async_writev) + return -ENOSYS; + + ctx = cifs_aio_ctx_alloc(); + if (!ctx) + return -ENOMEM; + + ctx->cfile = cifsFileInfo_get(cfile); + + if (!is_sync_kiocb(iocb)) + ctx->iocb = iocb; + + ctx->pos = iocb->ki_pos; + + rc = setup_aio_ctx_iter(ctx, from, WRITE); + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } + + /* grab a lock here due to read response handlers can access ctx */ + mutex_lock(&ctx->aio_mutex); + + rc = cifs_write_from_iter(iocb->ki_pos, ctx->len, &saved_from, + cfile, cifs_sb, &ctx->list, ctx); + + /* + * If at least one write was successfully sent, then discard any rc + * value from the later writes. If the other write succeeds, then + * we'll end up returning whatever was written. If it fails, then + * we'll get a new rc value from that. 
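The loop being rewritten here used to sleep in wait_for_completion_killable() for each write in turn, which an asynchronous submitter cannot afford. In the new model every request's completion handler calls the collector, and the collector only advances while the next request in offset order has already finished; try_wait_for_completion() probes without blocking, and the read path further below gets the same shape. A self-contained sketch of the pattern with placeholder types:

#include <linux/completion.h>
#include <linux/list.h>
#include <linux/mutex.h>

struct req {
	struct list_head list;
	struct completion done;
	int result;
};

/* Called from every request's completion handler; never blocks. */
static void collect(struct list_head *reqs, struct mutex *lock)
{
	struct req *r, *tmp;

	mutex_lock(lock);
	list_for_each_entry_safe(r, tmp, reqs, list) {
		if (!try_wait_for_completion(&r->done)) {
			/* not done yet: a later completion re-enters collect() */
			mutex_unlock(lock);
			return;
		}
		list_del(&r->list);	/* consume r->result here */
	}
	mutex_unlock(lock);
	/* every request finished: wake the waiter or call ki_complete() */
}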
+ */ + if (!list_empty(&ctx->list)) + rc = 0; + + mutex_unlock(&ctx->aio_mutex); + + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } + + if (!is_sync_kiocb(iocb)) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -EIOCBQUEUED; + } + + rc = wait_for_completion_killable(&ctx->done); + if (rc) { + mutex_lock(&ctx->aio_mutex); + ctx->rc = rc = -EINTR; + total_written = ctx->total_len; + mutex_unlock(&ctx->aio_mutex); + } else { + rc = ctx->rc; + total_written = ctx->total_len; + } + + kref_put(&ctx->refcount, cifs_aio_ctx_release); + if (unlikely(!total_written)) return rc; iocb->ki_pos += total_written; - set_bit(CIFS_INO_INVALID_MAPPING, &CIFS_I(file_inode(file))->flags); - cifs_stats_bytes_written(tcon, total_written); return total_written; } @@ -2859,6 +2945,7 @@ cifs_uncached_readdata_release(struct kref *refcount) struct cifs_readdata, refcount); unsigned int i; + kref_put(&rdata->ctx->refcount, cifs_aio_ctx_release); for (i = 0; i < rdata->nr_pages; i++) { put_page(rdata->pages[i]); rdata->pages[i] = NULL; @@ -2900,6 +2987,8 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter) return remaining ? -EFAULT : 0; } +static void collect_uncached_read_data(struct cifs_aio_ctx *ctx); + static void cifs_uncached_readv_complete(struct work_struct *work) { @@ -2907,6 +2996,8 @@ cifs_uncached_readv_complete(struct work_struct *work) struct cifs_readdata, work); complete(&rdata->done); + collect_uncached_read_data(rdata->ctx); + /* the below call can possibly free the last ref to aio ctx */ kref_put(&rdata->refcount, cifs_uncached_readdata_release); } @@ -2973,7 +3064,8 @@ cifs_uncached_copy_into_pages(struct TCP_Server_Info *server, static int cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, - struct cifs_sb_info *cifs_sb, struct list_head *rdata_list) + struct cifs_sb_info *cifs_sb, struct list_head *rdata_list, + struct cifs_aio_ctx *ctx) { struct cifs_readdata *rdata; unsigned int npages, rsize, credits; @@ -3020,6 +3112,8 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, rdata->read_into_pages = cifs_uncached_read_into_pages; rdata->copy_into_pages = cifs_uncached_copy_into_pages; rdata->credits = credits; + rdata->ctx = ctx; + kref_get(&ctx->refcount); if (!rdata->cfile->invalidHandle || !(rc = cifs_reopen_file(rdata->cfile, true))) @@ -3042,50 +3136,37 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, return rc; } -ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +static void +collect_uncached_read_data(struct cifs_aio_ctx *ctx) { - struct file *file = iocb->ki_filp; - ssize_t rc; - size_t len; - ssize_t total_read = 0; - loff_t offset = iocb->ki_pos; + struct cifs_readdata *rdata, *tmp; + struct iov_iter *to = &ctx->iter; struct cifs_sb_info *cifs_sb; struct cifs_tcon *tcon; - struct cifsFileInfo *open_file; - struct cifs_readdata *rdata, *tmp; - struct list_head rdata_list; - - len = iov_iter_count(to); - if (!len) - return 0; - - INIT_LIST_HEAD(&rdata_list); - cifs_sb = CIFS_FILE_SB(file); - open_file = file->private_data; - tcon = tlink_tcon(open_file->tlink); - - if (!tcon->ses->server->ops->async_readv) - return -ENOSYS; + unsigned int i; + int rc; - if ((file->f_flags & O_ACCMODE) == O_WRONLY) - cifs_dbg(FYI, "attempting read on write only file instance\n"); + tcon = tlink_tcon(ctx->cfile->tlink); + cifs_sb = CIFS_SB(ctx->cfile->dentry->d_sb); - rc = cifs_send_async_read(offset, len, open_file, cifs_sb, 
&rdata_list); + mutex_lock(&ctx->aio_mutex); - /* if at least one read request send succeeded, then reset rc */ - if (!list_empty(&rdata_list)) - rc = 0; + if (list_empty(&ctx->list)) { + mutex_unlock(&ctx->aio_mutex); + return; + } - len = iov_iter_count(to); + rc = ctx->rc; /* the loop below should proceed in the order of increasing offsets */ again: - list_for_each_entry_safe(rdata, tmp, &rdata_list, list) { + list_for_each_entry_safe(rdata, tmp, &ctx->list, list) { if (!rc) { - /* FIXME: freezable sleep too? */ - rc = wait_for_completion_killable(&rdata->done); - if (rc) - rc = -EINTR; - else if (rdata->result == -EAGAIN) { + if (!try_wait_for_completion(&rdata->done)) { + mutex_unlock(&ctx->aio_mutex); + return; + } + + if (rdata->result == -EAGAIN) { /* resend call if it's a retryable error */ struct list_head tmp_list; unsigned int got_bytes = rdata->got_bytes; @@ -3111,9 +3192,9 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) rdata->offset + got_bytes, rdata->bytes - got_bytes, rdata->cfile, cifs_sb, - &tmp_list); + &tmp_list, ctx); - list_splice(&tmp_list, &rdata_list); + list_splice(&tmp_list, &ctx->list); kref_put(&rdata->refcount, cifs_uncached_readdata_release); @@ -3131,14 +3212,110 @@ ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) kref_put(&rdata->refcount, cifs_uncached_readdata_release); } - total_read = len - iov_iter_count(to); + for (i = 0; i < ctx->npages; i++) { + if (ctx->should_dirty) + set_page_dirty(ctx->bv[i].bv_page); + put_page(ctx->bv[i].bv_page); + } + + ctx->total_len = ctx->len - iov_iter_count(to); - cifs_stats_bytes_read(tcon, total_read); + cifs_stats_bytes_read(tcon, ctx->total_len); /* mask nodata case */ if (rc == -ENODATA) rc = 0; + ctx->rc = (rc == 0) ? ctx->total_len : rc; + + mutex_unlock(&ctx->aio_mutex); + + if (ctx->iocb && ctx->iocb->ki_complete) + ctx->iocb->ki_complete(ctx->iocb, ctx->rc, 0); + else + complete(&ctx->done); +} + +ssize_t cifs_user_readv(struct kiocb *iocb, struct iov_iter *to) +{ + struct file *file = iocb->ki_filp; + ssize_t rc; + size_t len; + ssize_t total_read = 0; + loff_t offset = iocb->ki_pos; + struct cifs_sb_info *cifs_sb; + struct cifs_tcon *tcon; + struct cifsFileInfo *cfile; + struct cifs_aio_ctx *ctx; + + len = iov_iter_count(to); + if (!len) + return 0; + + cifs_sb = CIFS_FILE_SB(file); + cfile = file->private_data; + tcon = tlink_tcon(cfile->tlink); + + if (!tcon->ses->server->ops->async_readv) + return -ENOSYS; + + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + cifs_dbg(FYI, "attempting read on write only file instance\n"); + + ctx = cifs_aio_ctx_alloc(); + if (!ctx) + return -ENOMEM; + + ctx->cfile = cifsFileInfo_get(cfile); + + if (!is_sync_kiocb(iocb)) + ctx->iocb = iocb; + + if (to->type & ITER_IOVEC) + ctx->should_dirty = true; + + rc = setup_aio_ctx_iter(ctx, to, READ); + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } + + len = ctx->len; + + /* grab a lock here due to read response handlers can access ctx */ + mutex_lock(&ctx->aio_mutex); + + rc = cifs_send_async_read(offset, len, cfile, cifs_sb, &ctx->list, ctx); + + /* if at least one read request send succeeded, then reset rc */ + if (!list_empty(&ctx->list)) + rc = 0; + + mutex_unlock(&ctx->aio_mutex); + + if (rc) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return rc; + } + + if (!is_sync_kiocb(iocb)) { + kref_put(&ctx->refcount, cifs_aio_ctx_release); + return -EIOCBQUEUED; + } + + rc = wait_for_completion_killable(&ctx->done); + if (rc) { + mutex_lock(&ctx->aio_mutex); + 
ctx->rc = rc = -EINTR; + total_read = ctx->total_len; + mutex_unlock(&ctx->aio_mutex); + } else { + rc = ctx->rc; + total_read = ctx->total_len; + } + + kref_put(&ctx->refcount, cifs_aio_ctx_release); + if (total_read) { iocb->ki_pos += total_read; return total_read; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index b261db34103ce929ed863e34753511ac94ec0355..c3b2fa0b2ec8a6ceed83a74b65bd60cc1e1a7cee 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -322,9 +322,9 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) fattr->cf_mode = S_IFDIR | S_IXUGO | S_IRWXU; fattr->cf_uid = cifs_sb->mnt_uid; fattr->cf_gid = cifs_sb->mnt_gid; - fattr->cf_atime = CURRENT_TIME; - fattr->cf_ctime = CURRENT_TIME; - fattr->cf_mtime = CURRENT_TIME; + ktime_get_real_ts(&fattr->cf_mtime); + fattr->cf_mtime = timespec_trunc(fattr->cf_mtime, sb->s_time_gran); + fattr->cf_atime = fattr->cf_ctime = fattr->cf_mtime; fattr->cf_nlink = 2; fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; } @@ -586,9 +586,10 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, /* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ static void cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, - struct cifs_sb_info *cifs_sb, bool adjust_tz, + struct super_block *sb, bool adjust_tz, bool symlink) { + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); memset(fattr, 0, sizeof(*fattr)); @@ -598,8 +599,10 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, if (info->LastAccessTime) fattr->cf_atime = cifs_NTtimeToUnix(info->LastAccessTime); - else - fattr->cf_atime = CURRENT_TIME; + else { + ktime_get_real_ts(&fattr->cf_atime); + fattr->cf_atime = timespec_trunc(fattr->cf_atime, sb->s_time_gran); + } fattr->cf_ctime = cifs_NTtimeToUnix(info->ChangeTime); fattr->cf_mtime = cifs_NTtimeToUnix(info->LastWriteTime); @@ -659,7 +662,6 @@ cifs_get_file_info(struct file *filp) FILE_ALL_INFO find_data; struct cifs_fattr fattr; struct inode *inode = file_inode(filp); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsFileInfo *cfile = filp->private_data; struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); struct TCP_Server_Info *server = tcon->ses->server; @@ -671,7 +673,7 @@ cifs_get_file_info(struct file *filp) rc = server->ops->query_file_info(xid, tcon, &cfile->fid, &find_data); switch (rc) { case 0: - cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false, + cifs_all_info_to_fattr(&fattr, &find_data, inode->i_sb, false, false); break; case -EREMOTE: @@ -753,7 +755,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } if (!rc) { - cifs_all_info_to_fattr(&fattr, data, cifs_sb, adjust_tz, + cifs_all_info_to_fattr(&fattr, data, sb, adjust_tz, symlink); } else if (rc == -EREMOTE) { cifs_create_dfs_fattr(&fattr, sb); @@ -1363,9 +1365,9 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) cifs_inode = CIFS_I(inode); cifs_inode->time = 0; /* will force revalidate to get info when needed */ - inode->i_ctime = current_fs_time(sb); + inode->i_ctime = current_time(inode); } - dir->i_ctime = dir->i_mtime = current_fs_time(sb); + dir->i_ctime = dir->i_mtime = current_time(dir); cifs_inode = CIFS_I(dir); CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ unlink_out: @@ -1633,7 +1635,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) cifsInode->time = 0; d_inode(direntry)->i_ctime = inode->i_ctime = inode->i_mtime = - current_fs_time(inode->i_sb); + 
current_time(inode); rmdir_exit: kfree(full_path); @@ -1806,7 +1808,7 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, CIFS_I(source_dir)->time = CIFS_I(target_dir)->time = 0; source_dir->i_ctime = source_dir->i_mtime = target_dir->i_ctime = - target_dir->i_mtime = current_fs_time(source_dir->i_sb); + target_dir->i_mtime = current_time(source_dir); cifs_rename_exit: kfree(info_buf_source); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 265c45fe4ea5e5246f5304480c2b4b66bf9d6c73..76fb0917dc8c37405b4a481831ee6a695d1e265c 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -74,7 +74,8 @@ static long cifs_ioctl_copychunk(unsigned int xid, struct file *dst_file, rc = cifs_file_copychunk_range(xid, src_file.file, 0, dst_file, 0, src_inode->i_size, 0); - + if (rc > 0) + rc = 0; out_fput: fdput(src_file); out_drop_write: @@ -208,10 +209,14 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EOPNOTSUPP; break; case CIFS_IOC_GET_MNT_INFO: + if (pSMBFile == NULL) + break; tcon = tlink_tcon(pSMBFile->tlink); rc = smb_mnt_get_fsinfo(xid, tcon, (void __user *)arg); break; case CIFS_ENUMERATE_SNAPSHOTS: + if (pSMBFile == NULL) + break; if (arg == 0) { rc = -EINVAL; goto cifs_ioc_exit; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index d3fb11529ed96054ac753a9e26898942bb708d56..b08531977daa4084f774c75de33204b2b6fa0902 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -167,13 +168,11 @@ cifs_buf_get(void) /* clear the first few header bytes */ /* for most paths, more is cleared in header_assemble */ - if (ret_buf) { - memset(ret_buf, 0, buf_size + 3); - atomic_inc(&bufAllocCount); + memset(ret_buf, 0, buf_size + 3); + atomic_inc(&bufAllocCount); #ifdef CONFIG_CIFS_STATS2 - atomic_inc(&totBufAllocCount); + atomic_inc(&totBufAllocCount); #endif /* CONFIG_CIFS_STATS2 */ - } return ret_buf; } @@ -201,15 +200,13 @@ cifs_small_buf_get(void) albeit slightly larger than necessary and maxbuffersize defaults to this and can not be bigger */ ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS); - if (ret_buf) { /* No need to clear memory here, cleared in header assemble */ /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ - atomic_inc(&smBufAllocCount); + atomic_inc(&smBufAllocCount); #ifdef CONFIG_CIFS_STATS2 - atomic_inc(&totSmBufAllocCount); + atomic_inc(&totSmBufAllocCount); #endif /* CONFIG_CIFS_STATS2 */ - } return ret_buf; } @@ -492,7 +489,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &pCifsInode->flags); - queue_work(cifsiod_wq, + queue_work(cifsoplockd_wq, &netfile->oplock_break); netfile->oplock_break_cancelled = false; @@ -745,3 +742,122 @@ parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size, } return rc; } + +struct cifs_aio_ctx * +cifs_aio_ctx_alloc(void) +{ + struct cifs_aio_ctx *ctx; + + ctx = kzalloc(sizeof(struct cifs_aio_ctx), GFP_KERNEL); + if (!ctx) + return NULL; + + INIT_LIST_HEAD(&ctx->list); + mutex_init(&ctx->aio_mutex); + init_completion(&ctx->done); + kref_init(&ctx->refcount); + return ctx; +} + +void +cifs_aio_ctx_release(struct kref *refcount) +{ + struct cifs_aio_ctx *ctx = container_of(refcount, + struct cifs_aio_ctx, refcount); + + cifsFileInfo_put(ctx->cfile); + kvfree(ctx->bv); + kfree(ctx); +} + +#define CIFS_AIO_KMALLOC_LIMIT (1024 * 1024) + +int +setup_aio_ctx_iter(struct cifs_aio_ctx *ctx, struct iov_iter *iter, int rw) 
+{
+	ssize_t rc;
+	unsigned int cur_npages;
+	unsigned int npages = 0;
+	unsigned int i;
+	size_t len;
+	size_t count = iov_iter_count(iter);
+	unsigned int saved_len;
+	size_t start;
+	unsigned int max_pages = iov_iter_npages(iter, INT_MAX);
+	struct page **pages = NULL;
+	struct bio_vec *bv = NULL;
+
+	if (iter->type & ITER_KVEC) {
+		memcpy(&ctx->iter, iter, sizeof(struct iov_iter));
+		ctx->len = count;
+		iov_iter_advance(iter, count);
+		return 0;
+	}
+
+	if (max_pages * sizeof(struct bio_vec) <= CIFS_AIO_KMALLOC_LIMIT)
+		bv = kmalloc_array(max_pages, sizeof(struct bio_vec),
+				   GFP_KERNEL);
+
+	if (!bv) {
+		bv = vmalloc(max_pages * sizeof(struct bio_vec));
+		if (!bv)
+			return -ENOMEM;
+	}
+
+	if (max_pages * sizeof(struct page *) <= CIFS_AIO_KMALLOC_LIMIT)
+		pages = kmalloc_array(max_pages, sizeof(struct page *),
+				      GFP_KERNEL);
+
+	if (!pages) {
+		pages = vmalloc(max_pages * sizeof(struct page *));
+		if (!pages) {
+			kvfree(bv);
+			return -ENOMEM;
+		}
+	}
+
+	saved_len = count;
+
+	while (count && npages < max_pages) {
+		rc = iov_iter_get_pages(iter, pages, count, max_pages, &start);
+		if (rc < 0) {
+			cifs_dbg(VFS, "couldn't get user pages (rc=%zd)\n", rc);
+			break;
+		}
+
+		if (rc > count) {
+			cifs_dbg(VFS, "get pages rc=%zd more than %zu\n", rc,
+				 count);
+			break;
+		}
+
+		iov_iter_advance(iter, rc);
+		count -= rc;
+		rc += start;
+		cur_npages = DIV_ROUND_UP(rc, PAGE_SIZE);
+
+		if (npages + cur_npages > max_pages) {
+			cifs_dbg(VFS, "out of vec array capacity (%u vs %u)\n",
+				 npages + cur_npages, max_pages);
+			break;
+		}
+
+		for (i = 0; i < cur_npages; i++) {
+			len = rc > PAGE_SIZE ? PAGE_SIZE : rc;
+			bv[npages + i].bv_page = pages[i];
+			bv[npages + i].bv_offset = start;
+			bv[npages + i].bv_len = len - start;
+			rc -= len;
+			start = 0;
+		}
+
+		npages += cur_npages;
+	}
+
+	kvfree(pages);
+	ctx->bv = bv;
+	ctx->len = saved_len - count;
+	ctx->npages = npages;
+	iov_iter_bvec(&ctx->iter, ITER_BVEC | rw, ctx->bv, npages, ctx->len);
+	return 0;
+}
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index abae6dd2c6b998816db830f40934c86ea8c33fff..cc88f4f0325ef4003bd248c36ca96865ff1b4ddd 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -980,10 +980,10 @@ struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
 		cifs_dbg(VFS, "illegal hours %d\n", st->Hours);
 	days = sd->Day;
 	month = sd->Month;
-	if ((days > 31) || (month > 12)) {
+	if (days < 1 || days > 31 || month < 1 || month > 12) {
 		cifs_dbg(VFS, "illegal date, month %d day: %d\n", month, days);
-		if (month > 12)
-			month = 12;
+		days = clamp(days, 1, 31);
+		month = clamp(month, 1, 12);
 	}
 	month -= 1;
 	days += total_days_of_prev_months[month];
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index cc93ba4da9b592468f36d37b80777e88b8e400e6..27bc360c7ffd7e1081f907c5f080dc4ba439fbfc 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -1015,6 +1015,15 @@ cifs_dir_needs_close(struct cifsFileInfo *cfile)
 	return !cfile->srch_inf.endOfSearch && !cfile->invalidHandle;
 }
 
+static bool
+cifs_can_echo(struct TCP_Server_Info *server)
+{
+	if (server->tcpStatus == CifsGood)
+		return true;
+
+	return false;
+}
+
 struct smb_version_operations smb1_operations = {
 	.send_cancel = send_nt_cancel,
 	.compare_fids = cifs_compare_fids,
@@ -1049,6 +1058,7 @@ struct smb_version_operations smb1_operations = {
 	.get_dfs_refer = CIFSGetDFSRefer,
 	.qfs_tcon = cifs_qfs_tcon,
 	.is_path_accessible = cifs_is_path_accessible,
+	.can_echo = cifs_can_echo,
 	.query_path_info = cifs_query_path_info,
 	.query_file_info = cifs_query_file_info,
 	.get_srv_inum = cifs_get_srv_inum,
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 1a04b3a5beb164cb22c166f5c7ac03b65e70d00b..7b08a1446a7fbd08f2f245fa2dbb75261eb3f5b2 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -499,7 +499,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
 		else
 			cfile->oplock_break_cancelled = true;
 
-		queue_work(cifsiod_wq, &cfile->oplock_break);
+		queue_work(cifsoplockd_wq, &cfile->oplock_break);
 		kfree(lw);
 		return true;
 	}
@@ -643,7 +643,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 					CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2,
 					&cinode->flags);
 				spin_unlock(&cfile->file_info_lock);
-				queue_work(cifsiod_wq, &cfile->oplock_break);
+				queue_work(cifsoplockd_wq,
+					   &cfile->oplock_break);
 
 				spin_unlock(&tcon->open_file_lock);
 				spin_unlock(&cifs_tcp_ses_lock);
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 152e37f2ad9213462a2ae439296a6edb49d653f9..c58691834eb2b74fa34f3fe2661ed3e211c4d22e 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -942,6 +942,7 @@ smb3_enum_snapshots(const unsigned int xid, struct cifs_tcon *tcon,
 	}
 	if (snapshot_in.snapshot_array_size < sizeof(struct smb_snapshot_array)) {
 		rc = -ERANGE;
+		kfree(retbuf);
 		return rc;
 	}
 
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 02da648041fcd58d6e37431a60d596a5bfd4ca06..48ff7703b91912c79b8412539919c3ce6fe76ac6 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -33,6 +33,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include "smb2pdu.h"
@@ -632,8 +633,12 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon)
 	}
 
 	if (rsplen != sizeof(struct validate_negotiate_info_rsp)) {
-		cifs_dbg(VFS, "invalid size of protocol negotiate response\n");
-		return -EIO;
+		cifs_dbg(VFS, "invalid protocol negotiate response size: %d\n",
+			 rsplen);
+
+		/* relax check since Mac returns max bufsize allowed on ioctl */
+		if (rsplen > CIFSMaxBufSize)
+			return -EIO;
 	}
 
 	/* check validate negotiate info response matches what we got earlier */
@@ -1853,8 +1858,12 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 	 * than one credit. Windows typically sets this smaller, but for some
 	 * ioctls it may be useful to allow server to send more. No point
 	 * limiting what the server can send as long as fits in one credit
+	 * Unfortunately, we can not handle more than CIFS_MAX_MSG_SIZE
+	 * (by default, note that it can be overridden to make the max larger)
+	 * in responses, except for read responses, which can be bigger.
+	 * We may want to bump this limit up.
 	 */
-	req->MaxOutputResponse = cpu_to_le32(0xFF00); /* < 64K uses 1 credit */
+	req->MaxOutputResponse = cpu_to_le32(CIFSMaxBufSize);
 
 	if (is_fsctl)
 		req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 506b67fc93d9611ea859cf738126fb2d67e52e16..c69ec96e92acb8613db1585d855fa54dce426a5d 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -538,23 +538,19 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr,
 	}
 
 	temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS);
-	if (temp == NULL)
-		return temp;
-	else {
-		memset(temp, 0, sizeof(struct mid_q_entry));
-		temp->mid = le64_to_cpu(shdr->MessageId);
-		temp->pid = current->pid;
-		temp->command = shdr->Command;	/* Always LE */
-		temp->when_alloc = jiffies;
-		temp->server = server;
-
-		/*
-		 * The default is for the mid to be synchronous, so the
-		 * default callback just wakes up the current task.
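This restructuring, and the matching one in fs/cifs/transport.c below, leans on a documented mempool guarantee: mempool_alloc() with a gfp mask that allows direct reclaim (GFP_NOFS qualifies) sleeps until an element becomes available and never returns NULL from process context, so the removed NULL branch was dead code. Only atomic callers still need the check, as in this sketch:

#include <linux/gfp.h>
#include <linux/mempool.h>
#include <linux/types.h>

static void *grab_mid(mempool_t *pool, bool atomic)
{
	if (!atomic)
		/* may sleep; guaranteed to succeed from process context */
		return mempool_alloc(pool, GFP_NOFS);

	/* atomic context: allocation may fail and must be checked */
	return mempool_alloc(pool, GFP_ATOMIC);
}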
- */ - temp->callback = cifs_wake_up_task; - temp->callback_data = current; - } + memset(temp, 0, sizeof(struct mid_q_entry)); + temp->mid = le64_to_cpu(shdr->MessageId); + temp->pid = current->pid; + temp->command = shdr->Command; /* Always LE */ + temp->when_alloc = jiffies; + temp->server = server; + + /* + * The default is for the mid to be synchronous, so the + * default callback just wakes up the current task. + */ + temp->callback = cifs_wake_up_task; + temp->callback_data = current; atomic_inc(&midCount); temp->mid_state = MID_REQUEST_ALLOCATED; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index f6e13a977fc8e907e3fa89fc0a5d14a4ab0eccd5..4d64b5b8fc9c6fae67f7f3e0ec74284548ff7c32 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -55,26 +55,22 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) } temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); - if (temp == NULL) - return temp; - else { - memset(temp, 0, sizeof(struct mid_q_entry)); - temp->mid = get_mid(smb_buffer); - temp->pid = current->pid; - temp->command = cpu_to_le16(smb_buffer->Command); - cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command); + memset(temp, 0, sizeof(struct mid_q_entry)); + temp->mid = get_mid(smb_buffer); + temp->pid = current->pid; + temp->command = cpu_to_le16(smb_buffer->Command); + cifs_dbg(FYI, "For smb_command %d\n", smb_buffer->Command); /* do_gettimeofday(&temp->when_sent);*/ /* easier to use jiffies */ - /* when mid allocated can be before when sent */ - temp->when_alloc = jiffies; - temp->server = server; + /* when mid allocated can be before when sent */ + temp->when_alloc = jiffies; + temp->server = server; - /* - * The default is for the mid to be synchronous, so the - * default callback just wakes up the current task. - */ - temp->callback = cifs_wake_up_task; - temp->callback_data = current; - } + /* + * The default is for the mid to be synchronous, so the + * default callback just wakes up the current task. 
+ */ + temp->callback = cifs_wake_up_task; + temp->callback_data = current; atomic_inc(&midCount); temp->mid_state = MID_REQUEST_ALLOCATED; diff --git a/fs/coda/inode.c b/fs/coda/inode.c index 2dea594da19968288586138a35c22a81151914e7..6058df380cc00ed0d4be8cb81a0a87369cb9aac3 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -183,10 +183,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) goto unlock_out; } - error = bdi_setup_and_register(&vc->bdi, "coda"); - if (error) - goto unlock_out; - vc->vc_sb = sb; mutex_unlock(&vc->vc_mutex); @@ -197,7 +193,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = CODA_SUPER_MAGIC; sb->s_op = &coda_super_operations; sb->s_d_op = &coda_dentry_operations; - sb->s_bdi = &vc->bdi; + + error = super_setup_bdi(sb); + if (error) + goto error; /* get root fid from Venus: this needs the root inode */ error = venus_rootfid(sb, &fid); @@ -228,7 +227,6 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) error: mutex_lock(&vc->vc_mutex); - bdi_destroy(&vc->bdi); vc->vc_sb = NULL; sb->s_fs_info = NULL; unlock_out: @@ -240,7 +238,6 @@ static void coda_put_super(struct super_block *sb) { struct venus_comm *vcp = coda_vcp(sb); mutex_lock(&vcp->vc_mutex); - bdi_destroy(&vcp->bdi); vcp->vc_sb = NULL; sb->s_fs_info = NULL; mutex_unlock(&vcp->vc_mutex); diff --git a/fs/compat.c b/fs/compat.c index c61b506f5bc94ba0dcac0d9a28a7ab1070ac3a66..190b38b39d9ed74efa5816e437232f86ee7070d4 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -15,555 +15,14 @@ * published by the Free Software Foundation. */ -#include -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include - #include -#include -#include #include "internal.h" -/* - * Not all architectures have sys_utime, so implement this in terms - * of sys_utimes. - */ -COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, - struct compat_utimbuf __user *, t) -{ - struct timespec tv[2]; - - if (t) { - if (get_user(tv[0].tv_sec, &t->actime) || - get_user(tv[1].tv_sec, &t->modtime)) - return -EFAULT; - tv[0].tv_nsec = 0; - tv[1].tv_nsec = 0; - } - return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); -} - -COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) -{ - struct timespec tv[2]; - - if (t) { - if (compat_get_timespec(&tv[0], &t[0]) || - compat_get_timespec(&tv[1], &t[1])) - return -EFAULT; - - if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) - return 0; - } - return do_utimes(dfd, filename, t ? tv : NULL, flags); -} - -COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) -{ - struct timespec tv[2]; - - if (t) { - if (get_user(tv[0].tv_sec, &t[0].tv_sec) || - get_user(tv[0].tv_nsec, &t[0].tv_usec) || - get_user(tv[1].tv_sec, &t[1].tv_sec) || - get_user(tv[1].tv_nsec, &t[1].tv_usec)) - return -EFAULT; - if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 || - tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0) - return -EINVAL; - tv[0].tv_nsec *= 1000; - tv[1].tv_nsec *= 1000; - } - return do_utimes(dfd, filename, t ? 
tv : NULL, 0); -} - -COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t) -{ - return compat_sys_futimesat(AT_FDCWD, filename, t); -} - -static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) -{ - struct compat_stat tmp; - - if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) - return -EOVERFLOW; - - memset(&tmp, 0, sizeof(tmp)); - tmp.st_dev = old_encode_dev(stat->dev); - tmp.st_ino = stat->ino; - if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) - return -EOVERFLOW; - tmp.st_mode = stat->mode; - tmp.st_nlink = stat->nlink; - if (tmp.st_nlink != stat->nlink) - return -EOVERFLOW; - SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); - SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); - tmp.st_rdev = old_encode_dev(stat->rdev); - if ((u64) stat->size > MAX_NON_LFS) - return -EOVERFLOW; - tmp.st_size = stat->size; - tmp.st_atime = stat->atime.tv_sec; - tmp.st_atime_nsec = stat->atime.tv_nsec; - tmp.st_mtime = stat->mtime.tv_sec; - tmp.st_mtime_nsec = stat->mtime.tv_nsec; - tmp.st_ctime = stat->ctime.tv_sec; - tmp.st_ctime_nsec = stat->ctime.tv_nsec; - tmp.st_blocks = stat->blocks; - tmp.st_blksize = stat->blksize; - return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; -} - -COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename, - struct compat_stat __user *, statbuf) -{ - struct kstat stat; - int error; - - error = vfs_stat(filename, &stat); - if (error) - return error; - return cp_compat_stat(&stat, statbuf); -} - -COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename, - struct compat_stat __user *, statbuf) -{ - struct kstat stat; - int error; - - error = vfs_lstat(filename, &stat); - if (error) - return error; - return cp_compat_stat(&stat, statbuf); -} - -#ifndef __ARCH_WANT_STAT64 -COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd, - const char __user *, filename, - struct compat_stat __user *, statbuf, int, flag) -{ - struct kstat stat; - int error; - - error = vfs_fstatat(dfd, filename, &stat, flag); - if (error) - return error; - return cp_compat_stat(&stat, statbuf); -} -#endif - -COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd, - struct compat_stat __user *, statbuf) -{ - struct kstat stat; - int error = vfs_fstat(fd, &stat); - - if (!error) - error = cp_compat_stat(&stat, statbuf); - return error; -} - -static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf) -{ - - if (sizeof ubuf->f_blocks == 4) { - if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | - kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) - return -EOVERFLOW; - /* f_files and f_ffree may be -1; it's okay - * to stuff that into 32 bits */ - if (kbuf->f_files != 0xffffffffffffffffULL - && (kbuf->f_files & 0xffffffff00000000ULL)) - return -EOVERFLOW; - if (kbuf->f_ffree != 0xffffffffffffffffULL - && (kbuf->f_ffree & 0xffffffff00000000ULL)) - return -EOVERFLOW; - } - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || - __put_user(kbuf->f_type, &ubuf->f_type) || - __put_user(kbuf->f_bsize, &ubuf->f_bsize) || - __put_user(kbuf->f_blocks, &ubuf->f_blocks) || - __put_user(kbuf->f_bfree, &ubuf->f_bfree) || - __put_user(kbuf->f_bavail, &ubuf->f_bavail) || - __put_user(kbuf->f_files, &ubuf->f_files) || - __put_user(kbuf->f_ffree, &ubuf->f_ffree) || - __put_user(kbuf->f_namelen, &ubuf->f_namelen) || - __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || - __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - 
__put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(kbuf->f_flags, &ubuf->f_flags) || - __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) - return -EFAULT; - return 0; -} - -/* - * The following statfs calls are copies of code from fs/statfs.c and - * should be checked against those from time to time - */ -COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf) -{ - struct kstatfs tmp; - int error = user_statfs(pathname, &tmp); - if (!error) - error = put_compat_statfs(buf, &tmp); - return error; -} - -COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf) -{ - struct kstatfs tmp; - int error = fd_statfs(fd, &tmp); - if (!error) - error = put_compat_statfs(buf, &tmp); - return error; -} - -static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) -{ - if (sizeof(ubuf->f_bsize) == 4) { - if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | - kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) - return -EOVERFLOW; - /* f_files and f_ffree may be -1; it's okay - * to stuff that into 32 bits */ - if (kbuf->f_files != 0xffffffffffffffffULL - && (kbuf->f_files & 0xffffffff00000000ULL)) - return -EOVERFLOW; - if (kbuf->f_ffree != 0xffffffffffffffffULL - && (kbuf->f_ffree & 0xffffffff00000000ULL)) - return -EOVERFLOW; - } - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || - __put_user(kbuf->f_type, &ubuf->f_type) || - __put_user(kbuf->f_bsize, &ubuf->f_bsize) || - __put_user(kbuf->f_blocks, &ubuf->f_blocks) || - __put_user(kbuf->f_bfree, &ubuf->f_bfree) || - __put_user(kbuf->f_bavail, &ubuf->f_bavail) || - __put_user(kbuf->f_files, &ubuf->f_files) || - __put_user(kbuf->f_ffree, &ubuf->f_ffree) || - __put_user(kbuf->f_namelen, &ubuf->f_namelen) || - __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || - __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || - __put_user(kbuf->f_frsize, &ubuf->f_frsize) || - __put_user(kbuf->f_flags, &ubuf->f_flags) || - __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) - return -EFAULT; - return 0; -} - -COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf) -{ - struct kstatfs tmp; - int error; - - if (sz != sizeof(*buf)) - return -EINVAL; - - error = user_statfs(pathname, &tmp); - if (!error) - error = put_compat_statfs64(buf, &tmp); - return error; -} - -COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf) -{ - struct kstatfs tmp; - int error; - - if (sz != sizeof(*buf)) - return -EINVAL; - - error = fd_statfs(fd, &tmp); - if (!error) - error = put_compat_statfs64(buf, &tmp); - return error; -} - -/* - * This is a copy of sys_ustat, just dealing with a structure layout. - * Given how simple this syscall is that apporach is more maintainable - * than the various conversion hacks. 
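One rule in the conversion helpers being deleted here is easy to lose: a 64-bit count overflows a 32-bit compat field only when it is a real value, while the all-ones "unknown" sentinel in f_files/f_ffree must pass through so userspace sees (u32)-1 again. The check in isolation, as a sketch:

#include <linux/types.h>

/* true if a 64-bit statfs count is representable in a 32-bit compat field */
static int compat_count_fits(u64 v)
{
	if (v == 0xffffffffffffffffULL)
		return 1;	/* -1 means "undefined" and is passed through */
	return (v & 0xffffffff00000000ULL) == 0;
}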
- */ -COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u) -{ - struct compat_ustat tmp; - struct kstatfs sbuf; - int err = vfs_ustat(new_decode_dev(dev), &sbuf); - if (err) - return err; - - memset(&tmp, 0, sizeof(struct compat_ustat)); - tmp.f_tfree = sbuf.f_bfree; - tmp.f_tinode = sbuf.f_ffree; - if (copy_to_user(u, &tmp, sizeof(struct compat_ustat))) - return -EFAULT; - return 0; -} - -static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) -{ - if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || - __get_user(kfl->l_type, &ufl->l_type) || - __get_user(kfl->l_whence, &ufl->l_whence) || - __get_user(kfl->l_start, &ufl->l_start) || - __get_user(kfl->l_len, &ufl->l_len) || - __get_user(kfl->l_pid, &ufl->l_pid)) - return -EFAULT; - return 0; -} - -static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) -{ - if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || - __put_user(kfl->l_type, &ufl->l_type) || - __put_user(kfl->l_whence, &ufl->l_whence) || - __put_user(kfl->l_start, &ufl->l_start) || - __put_user(kfl->l_len, &ufl->l_len) || - __put_user(kfl->l_pid, &ufl->l_pid)) - return -EFAULT; - return 0; -} - -#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64 -static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) -{ - if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || - __get_user(kfl->l_type, &ufl->l_type) || - __get_user(kfl->l_whence, &ufl->l_whence) || - __get_user(kfl->l_start, &ufl->l_start) || - __get_user(kfl->l_len, &ufl->l_len) || - __get_user(kfl->l_pid, &ufl->l_pid)) - return -EFAULT; - return 0; -} -#endif - -#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64 -static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) -{ - if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || - __put_user(kfl->l_type, &ufl->l_type) || - __put_user(kfl->l_whence, &ufl->l_whence) || - __put_user(kfl->l_start, &ufl->l_start) || - __put_user(kfl->l_len, &ufl->l_len) || - __put_user(kfl->l_pid, &ufl->l_pid)) - return -EFAULT; - return 0; -} -#endif - -static unsigned int -convert_fcntl_cmd(unsigned int cmd) -{ - switch (cmd) { - case F_GETLK64: - return F_GETLK; - case F_SETLK64: - return F_SETLK; - case F_SETLKW64: - return F_SETLKW; - } - - return cmd; -} - -COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, - compat_ulong_t, arg) -{ - mm_segment_t old_fs; - struct flock f; - long ret; - unsigned int conv_cmd; - - switch (cmd) { - case F_GETLK: - case F_SETLK: - case F_SETLKW: - ret = get_compat_flock(&f, compat_ptr(arg)); - if (ret != 0) - break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = sys_fcntl(fd, cmd, (unsigned long)&f); - set_fs(old_fs); - if (cmd == F_GETLK && ret == 0) { - /* GETLK was successful and we need to return the data... - * but it needs to fit in the compat structure. - * l_start shouldn't be too big, unless the original - * start + end is greater than COMPAT_OFF_T_MAX, in which - * case the app was asking for trouble, so we return - * -EOVERFLOW in that case. 
- * l_len could be too big, in which case we just truncate it, - * and only allow the app to see that part of the conflicting - * lock that might make sense to it anyway - */ - - if (f.l_start > COMPAT_OFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > COMPAT_OFF_T_MAX) - f.l_len = COMPAT_OFF_T_MAX; - if (ret == 0) - ret = put_compat_flock(&f, compat_ptr(arg)); - } - break; - - case F_GETLK64: - case F_SETLK64: - case F_SETLKW64: - case F_OFD_GETLK: - case F_OFD_SETLK: - case F_OFD_SETLKW: - ret = get_compat_flock64(&f, compat_ptr(arg)); - if (ret != 0) - break; - old_fs = get_fs(); - set_fs(KERNEL_DS); - conv_cmd = convert_fcntl_cmd(cmd); - ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); - set_fs(old_fs); - if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { - /* need to return lock information - see above for commentary */ - if (f.l_start > COMPAT_LOFF_T_MAX) - ret = -EOVERFLOW; - if (f.l_len > COMPAT_LOFF_T_MAX) - f.l_len = COMPAT_LOFF_T_MAX; - if (ret == 0) - ret = put_compat_flock64(&f, compat_ptr(arg)); - } - break; - - default: - ret = sys_fcntl(fd, cmd, arg); - break; - } - return ret; -} - -COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, - compat_ulong_t, arg) -{ - switch (cmd) { - case F_GETLK64: - case F_SETLK64: - case F_SETLKW64: - case F_OFD_GETLK: - case F_OFD_SETLK: - case F_OFD_SETLKW: - return -EINVAL; - } - return compat_sys_fcntl64(fd, cmd, arg); -} - -/* A write operation does a read from user space and vice versa */ -#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) - -ssize_t compat_rw_copy_check_uvector(int type, - const struct compat_iovec __user *uvector, unsigned long nr_segs, - unsigned long fast_segs, struct iovec *fast_pointer, - struct iovec **ret_pointer) -{ - compat_ssize_t tot_len; - struct iovec *iov = *ret_pointer = fast_pointer; - ssize_t ret = 0; - int seg; - - /* - * SuS says "The readv() function *may* fail if the iovcnt argument - * was less than or equal to 0, or greater than {IOV_MAX}. Linux has - * traditionally returned zero for zero segments, so... - */ - if (nr_segs == 0) - goto out; - - ret = -EINVAL; - if (nr_segs > UIO_MAXIOV) - goto out; - if (nr_segs > fast_segs) { - ret = -ENOMEM; - iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); - if (iov == NULL) - goto out; - } - *ret_pointer = iov; - - ret = -EFAULT; - if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) - goto out; - - /* - * Single unix specification: - * We should -EINVAL if an element length is not >= 0 and fitting an - * ssize_t. - * - * In Linux, the total length is limited to MAX_RW_COUNT, there is - * no overflow possibility. - */ - tot_len = 0; - ret = -EINVAL; - for (seg = 0; seg < nr_segs; seg++) { - compat_uptr_t buf; - compat_ssize_t len; - - if (__get_user(len, &uvector->iov_len) || - __get_user(buf, &uvector->iov_base)) { - ret = -EFAULT; - goto out; - } - if (len < 0) /* size_t not fitting in compat_ssize_t .. 
*/ - goto out; - if (type >= 0 && - !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { - ret = -EFAULT; - goto out; - } - if (len > MAX_RW_COUNT - tot_len) - len = MAX_RW_COUNT - tot_len; - tot_len += len; - iov->iov_base = compat_ptr(buf); - iov->iov_len = (compat_size_t) len; - uvector++; - iov++; - } - ret = tot_len; - -out: - return ret; -} - struct compat_ncp_mount_data { compat_int_t version; compat_uint_t ncp_fd; @@ -744,653 +203,3 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, out: return retval; } - -struct compat_old_linux_dirent { - compat_ulong_t d_ino; - compat_ulong_t d_offset; - unsigned short d_namlen; - char d_name[1]; -}; - -struct compat_readdir_callback { - struct dir_context ctx; - struct compat_old_linux_dirent __user *dirent; - int result; -}; - -static int compat_fillonedir(struct dir_context *ctx, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int d_type) -{ - struct compat_readdir_callback *buf = - container_of(ctx, struct compat_readdir_callback, ctx); - struct compat_old_linux_dirent __user *dirent; - compat_ulong_t d_ino; - - if (buf->result) - return -EINVAL; - d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { - buf->result = -EOVERFLOW; - return -EOVERFLOW; - } - buf->result++; - dirent = buf->dirent; - if (!access_ok(VERIFY_WRITE, dirent, - (unsigned long)(dirent->d_name + namlen + 1) - - (unsigned long)dirent)) - goto efault; - if ( __put_user(d_ino, &dirent->d_ino) || - __put_user(offset, &dirent->d_offset) || - __put_user(namlen, &dirent->d_namlen) || - __copy_to_user(dirent->d_name, name, namlen) || - __put_user(0, dirent->d_name + namlen)) - goto efault; - return 0; -efault: - buf->result = -EFAULT; - return -EFAULT; -} - -COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, - struct compat_old_linux_dirent __user *, dirent, unsigned int, count) -{ - int error; - struct fd f = fdget_pos(fd); - struct compat_readdir_callback buf = { - .ctx.actor = compat_fillonedir, - .dirent = dirent - }; - - if (!f.file) - return -EBADF; - - error = iterate_dir(f.file, &buf.ctx); - if (buf.result) - error = buf.result; - - fdput_pos(f); - return error; -} - -struct compat_linux_dirent { - compat_ulong_t d_ino; - compat_ulong_t d_off; - unsigned short d_reclen; - char d_name[1]; -}; - -struct compat_getdents_callback { - struct dir_context ctx; - struct compat_linux_dirent __user *current_dir; - struct compat_linux_dirent __user *previous; - int count; - int error; -}; - -static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, - loff_t offset, u64 ino, unsigned int d_type) -{ - struct compat_linux_dirent __user * dirent; - struct compat_getdents_callback *buf = - container_of(ctx, struct compat_getdents_callback, ctx); - compat_ulong_t d_ino; - int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + - namlen + 2, sizeof(compat_long_t)); - - buf->error = -EINVAL; /* only used if we fail.. 
*/ - if (reclen > buf->count) - return -EINVAL; - d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { - buf->error = -EOVERFLOW; - return -EOVERFLOW; - } - dirent = buf->previous; - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user(offset, &dirent->d_off)) - goto efault; - } - dirent = buf->current_dir; - if (__put_user(d_ino, &dirent->d_ino)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - if (__put_user(d_type, (char __user *) dirent + reclen - 1)) - goto efault; - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; - buf->count -= reclen; - return 0; -efault: - buf->error = -EFAULT; - return -EFAULT; -} - -COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, - struct compat_linux_dirent __user *, dirent, unsigned int, count) -{ - struct fd f; - struct compat_linux_dirent __user * lastdirent; - struct compat_getdents_callback buf = { - .ctx.actor = compat_filldir, - .current_dir = dirent, - .count = count - }; - int error; - - if (!access_ok(VERIFY_WRITE, dirent, count)) - return -EFAULT; - - f = fdget_pos(fd); - if (!f.file) - return -EBADF; - - error = iterate_dir(f.file, &buf.ctx); - if (error >= 0) - error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { - if (put_user(buf.ctx.pos, &lastdirent->d_off)) - error = -EFAULT; - else - error = count - buf.count; - } - fdput_pos(f); - return error; -} - -#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64 - -struct compat_getdents_callback64 { - struct dir_context ctx; - struct linux_dirent64 __user *current_dir; - struct linux_dirent64 __user *previous; - int count; - int error; -}; - -static int compat_filldir64(struct dir_context *ctx, const char *name, - int namlen, loff_t offset, u64 ino, - unsigned int d_type) -{ - struct linux_dirent64 __user *dirent; - struct compat_getdents_callback64 *buf = - container_of(ctx, struct compat_getdents_callback64, ctx); - int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, - sizeof(u64)); - u64 off; - - buf->error = -EINVAL; /* only used if we fail.. 
*/ - if (reclen > buf->count) - return -EINVAL; - dirent = buf->previous; - - if (dirent) { - if (signal_pending(current)) - return -EINTR; - if (__put_user_unaligned(offset, &dirent->d_off)) - goto efault; - } - dirent = buf->current_dir; - if (__put_user_unaligned(ino, &dirent->d_ino)) - goto efault; - off = 0; - if (__put_user_unaligned(off, &dirent->d_off)) - goto efault; - if (__put_user(reclen, &dirent->d_reclen)) - goto efault; - if (__put_user(d_type, &dirent->d_type)) - goto efault; - if (copy_to_user(dirent->d_name, name, namlen)) - goto efault; - if (__put_user(0, dirent->d_name + namlen)) - goto efault; - buf->previous = dirent; - dirent = (void __user *)dirent + reclen; - buf->current_dir = dirent; - buf->count -= reclen; - return 0; -efault: - buf->error = -EFAULT; - return -EFAULT; -} - -COMPAT_SYSCALL_DEFINE3(getdents64, unsigned int, fd, - struct linux_dirent64 __user *, dirent, unsigned int, count) -{ - struct fd f; - struct linux_dirent64 __user * lastdirent; - struct compat_getdents_callback64 buf = { - .ctx.actor = compat_filldir64, - .current_dir = dirent, - .count = count - }; - int error; - - if (!access_ok(VERIFY_WRITE, dirent, count)) - return -EFAULT; - - f = fdget_pos(fd); - if (!f.file) - return -EBADF; - - error = iterate_dir(f.file, &buf.ctx); - if (error >= 0) - error = buf.error; - lastdirent = buf.previous; - if (lastdirent) { - typeof(lastdirent->d_off) d_off = buf.ctx.pos; - if (__put_user_unaligned(d_off, &lastdirent->d_off)) - error = -EFAULT; - else - error = count - buf.count; - } - fdput_pos(f); - return error; -} -#endif /* __ARCH_WANT_COMPAT_SYS_GETDENTS64 */ - -/* - * Exactly like fs/open.c:sys_open(), except that it doesn't set the - * O_LARGEFILE flag. - */ -COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) -{ - return do_sys_open(AT_FDCWD, filename, flags, mode); -} - -/* - * Exactly like fs/open.c:sys_openat(), except that it doesn't set the - * O_LARGEFILE flag. - */ -COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) -{ - return do_sys_open(dfd, filename, flags, mode); -} - -#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) - -static int poll_select_copy_remaining(struct timespec *end_time, void __user *p, - int timeval, int ret) -{ - struct timespec ts; - - if (!p) - return ret; - - if (current->personality & STICKY_TIMEOUTS) - goto sticky; - - /* No update for zero timeout */ - if (!end_time->tv_sec && !end_time->tv_nsec) - return ret; - - ktime_get_ts(&ts); - ts = timespec_sub(*end_time, ts); - if (ts.tv_sec < 0) - ts.tv_sec = ts.tv_nsec = 0; - - if (timeval) { - struct compat_timeval rtv; - - rtv.tv_sec = ts.tv_sec; - rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; - - if (!copy_to_user(p, &rtv, sizeof(rtv))) - return ret; - } else { - struct compat_timespec rts; - - rts.tv_sec = ts.tv_sec; - rts.tv_nsec = ts.tv_nsec; - - if (!copy_to_user(p, &rts, sizeof(rts))) - return ret; - } - /* - * If an application puts its timeval in read-only memory, we - * don't want the Linux-specific update to the timeval to - * cause a fault after the select has completed - * successfully. However, because we're not updating the - * timeval, we can't restart the system call. - */ - -sticky: - if (ret == -ERESTARTNOHAND) - ret = -EINTR; - return ret; -} - -/* - * Ooo, nasty. We need here to frob 32-bit unsigned longs to - * 64-bit unsigned longs. 
- */ -static -int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, - unsigned long *fdset) -{ - nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); - if (ufdset) { - unsigned long odd; - - if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) - return -EFAULT; - - odd = nr & 1UL; - nr &= ~1UL; - while (nr) { - unsigned long h, l; - if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) - return -EFAULT; - ufdset += 2; - *fdset++ = h << 32 | l; - nr -= 2; - } - if (odd && __get_user(*fdset, ufdset)) - return -EFAULT; - } else { - /* Tricky, must clear full unsigned long in the - * kernel fdset at the end, this makes sure that - * actually happens. - */ - memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); - } - return 0; -} - -static -int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, - unsigned long *fdset) -{ - unsigned long odd; - nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); - - if (!ufdset) - return 0; - - odd = nr & 1UL; - nr &= ~1UL; - while (nr) { - unsigned long h, l; - l = *fdset++; - h = l >> 32; - if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) - return -EFAULT; - ufdset += 2; - nr -= 2; - } - if (odd && __put_user(*fdset, ufdset)) - return -EFAULT; - return 0; -} - - -/* - * This is a virtual copy of sys_select from fs/select.c and probably - * should be compared to it from time to time - */ - -/* - * We can actually return ERESTARTSYS instead of EINTR, but I'd - * like to be certain this leads to no problems. So I return - * EINTR just for safety. - * - * Update: ERESTARTSYS breaks at least the xview clock binary, so - * I'm trying ERESTARTNOHAND which restart only when you want to. - */ -int compat_core_sys_select(int n, compat_ulong_t __user *inp, - compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct timespec *end_time) -{ - fd_set_bits fds; - void *bits; - int size, max_fds, ret = -EINVAL; - struct fdtable *fdt; - long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; - - if (n < 0) - goto out_nofds; - - /* max_fds can increase, so grab it once to avoid race */ - rcu_read_lock(); - fdt = files_fdtable(current->files); - max_fds = fdt->max_fds; - rcu_read_unlock(); - if (n > max_fds) - n = max_fds; - - /* - * We need 6 bitmaps (in/out/ex for both incoming and outgoing), - * since we used fdset we need to allocate memory in units of - * long-words. 
- */ - size = FDS_BYTES(n); - bits = stack_fds; - if (size > sizeof(stack_fds) / 6) { - bits = kmalloc(6 * size, GFP_KERNEL); - ret = -ENOMEM; - if (!bits) - goto out_nofds; - } - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); - - if ((ret = compat_get_fd_set(n, inp, fds.in)) || - (ret = compat_get_fd_set(n, outp, fds.out)) || - (ret = compat_get_fd_set(n, exp, fds.ex))) - goto out; - zero_fd_set(n, fds.res_in); - zero_fd_set(n, fds.res_out); - zero_fd_set(n, fds.res_ex); - - ret = do_select(n, &fds, end_time); - - if (ret < 0) - goto out; - if (!ret) { - ret = -ERESTARTNOHAND; - if (signal_pending(current)) - goto out; - ret = 0; - } - - if (compat_set_fd_set(n, inp, fds.res_in) || - compat_set_fd_set(n, outp, fds.res_out) || - compat_set_fd_set(n, exp, fds.res_ex)) - ret = -EFAULT; -out: - if (bits != stack_fds) - kfree(bits); -out_nofds: - return ret; -} - -COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, - compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timeval __user *, tvp) -{ - struct timespec end_time, *to = NULL; - struct compat_timeval tv; - int ret; - - if (tvp) { - if (copy_from_user(&tv, tvp, sizeof(tv))) - return -EFAULT; - - to = &end_time; - if (poll_select_set_timeout(to, - tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), - (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) - return -EINVAL; - } - - ret = compat_core_sys_select(n, inp, outp, exp, to); - ret = poll_select_copy_remaining(&end_time, tvp, 1, ret); - - return ret; -} - -struct compat_sel_arg_struct { - compat_ulong_t n; - compat_uptr_t inp; - compat_uptr_t outp; - compat_uptr_t exp; - compat_uptr_t tvp; -}; - -COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) -{ - struct compat_sel_arg_struct a; - - if (copy_from_user(&a, arg, sizeof(a))) - return -EFAULT; - return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), - compat_ptr(a.exp), compat_ptr(a.tvp)); -} - -static long do_compat_pselect(int n, compat_ulong_t __user *inp, - compat_ulong_t __user *outp, compat_ulong_t __user *exp, - struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, - compat_size_t sigsetsize) -{ - compat_sigset_t ss32; - sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; - int ret; - - if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) - return -EFAULT; - - to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) - return -EINVAL; - } - - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) - return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - ret = compat_core_sys_select(n, inp, outp, exp, to); - ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); - - if (ret == -ERESTARTNOHAND) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. 
- */ - if (sigmask) { - memcpy(&current->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - return ret; -} - -COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, - compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, - struct compat_timespec __user *, tsp, void __user *, sig) -{ - compat_size_t sigsetsize = 0; - compat_uptr_t up = 0; - - if (sig) { - if (!access_ok(VERIFY_READ, sig, - sizeof(compat_uptr_t)+sizeof(compat_size_t)) || - __get_user(up, (compat_uptr_t __user *)sig) || - __get_user(sigsetsize, - (compat_size_t __user *)(sig+sizeof(up)))) - return -EFAULT; - } - return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), - sigsetsize); -} - -COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, - unsigned int, nfds, struct compat_timespec __user *, tsp, - const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) -{ - compat_sigset_t ss32; - sigset_t ksigmask, sigsaved; - struct compat_timespec ts; - struct timespec end_time, *to = NULL; - int ret; - - if (tsp) { - if (copy_from_user(&ts, tsp, sizeof(ts))) - return -EFAULT; - - to = &end_time; - if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) - return -EINVAL; - } - - if (sigmask) { - if (sigsetsize != sizeof(compat_sigset_t)) - return -EINVAL; - if (copy_from_user(&ss32, sigmask, sizeof(ss32))) - return -EFAULT; - sigset_from_compat(&ksigmask, &ss32); - - sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); - sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); - } - - ret = do_sys_poll(ufds, nfds, to); - - /* We can restart this syscall, usually */ - if (ret == -EINTR) { - /* - * Don't restore the signal mask yet. Let do_signal() deliver - * the signal on the way back to userspace, before the signal - * mask is restored. - */ - if (sigmask) { - memcpy(&current->saved_sigmask, &sigsaved, - sizeof(sigsaved)); - set_restore_sigmask(); - } - ret = -ERESTARTNOHAND; - } else if (sigmask) - sigprocmask(SIG_SETMASK, &sigsaved, NULL); - - ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); - - return ret; -} - -#ifdef CONFIG_FHANDLE -/* - * Exactly like fs/open.c:sys_open_by_handle_at(), except that it - * doesn't set the O_LARGEFILE flag. - */ -COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, - struct file_handle __user *, handle, int, flags) -{ - return do_handle_open(mountdirfd, handle, flags); -} -#endif diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 11d087b2b28edd79bf03b367d45bfd49bd210352..6116d5275a3ecc7f95d3e3009cca261f8b7c075d 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -833,7 +833,7 @@ static int compat_ioctl_preallocate(struct file *file, */ #define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) -#define COMPATIBLE_IOCTL(cmd) XFORM(cmd), +#define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd), /* ioctl should not be warned about even if it's not implemented.
Valid reasons to use this: - It is implemented with ->compat_ioctl on some device, but programs diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 37b49894c762344841117b2a0042c5e9ec8b7140..d1bb02b1ee589ecd68876ff240bf57e756a70466 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -159,6 +159,8 @@ static int fname_decrypt(struct inode *inode, static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + /** * digest_encode() - * @@ -230,11 +232,14 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); + u32 olen = fscrypt_fname_encrypted_size(inode, ilen); + const u32 max_encoded_len = + max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); crypto_str->len = olen; - if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + olen = max(olen, max_encoded_len); + /* * Allocated buffer can hold one more character to null-terminate the * string @@ -266,6 +271,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); * * The caller must have allocated sufficient memory for the @oname string. * + * If the key is available, we'll decrypt the disk name; otherwise, we'll encode + * it for presentation. Short names are directly base64-encoded, while long + * names are encoded in fscrypt_digested_name format. + * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, @@ -274,7 +283,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); - char buf[24]; + struct fscrypt_digested_name digested_name; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -289,20 +298,24 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, if (inode->i_crypt_info) return fname_decrypt(inode, iname, oname); - if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { oname->len = digest_encode(iname->name, iname->len, oname->name); return 0; } if (hash) { - memcpy(buf, &hash, 4); - memcpy(buf + 4, &minor_hash, 4); + digested_name.hash = hash; + digested_name.minor_hash = minor_hash; } else { - memset(buf, 0, 8); + digested_name.hash = 0; + digested_name.minor_hash = 0; } - memcpy(buf + 8, iname->name + iname->len - 16, 16); + memcpy(digested_name.digest, + FSCRYPT_FNAME_DIGEST(iname->name, iname->len), + FSCRYPT_FNAME_DIGEST_SIZE); oname->name[0] = '_'; - oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + oname->len = 1 + digest_encode((const char *)&digested_name, + sizeof(digested_name), oname->name + 1); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -336,10 +349,35 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); +/** + * fscrypt_setup_filename() - prepare to search a possibly encrypted directory + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. 
+ * @fname: the filename information to be filled in + * + * Given a user-provided filename @iname, this function sets @fname->disk_name + * to the name that would be stored in the on-disk directory entry, if possible. + * If the directory is unencrypted this is simply @iname. Else, if we have the + * directory's encryption key, then @iname is the plaintext, so we encrypt it to + * get the disk_name. + * + * Else, for keyless @lookup operations, @iname is the presented ciphertext, so + * we decode it to get either the ciphertext disk_name (for short names) or the + * fscrypt_digested_name (for long names). Non-@lookup operations will be + * impossible in this case, so we fail them with ENOKEY. + * + * If successful, fscrypt_free_filename() must be called later to clean up. + * + * Return: 0 on success, -errno on failure + */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - int ret = 0, bigname = 0; + int ret; + int digested; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; @@ -373,25 +411,37 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, * We don't have the key and we are doing a lookup; decode the * user-supplied name */ - if (iname->name[0] == '_') - bigname = 1; - if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) - return -ENOENT; + if (iname->name[0] == '_') { + if (iname->len != + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))) + return -ENOENT; + digested = 1; + } else { + if (iname->len > + BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) + return -ENOENT; + digested = 0; + } - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + fname->crypto_buf.name = + kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE, + sizeof(struct fscrypt_digested_name)), + GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = digest_decode(iname->name + bigname, iname->len - bigname, + ret = digest_decode(iname->name + digested, iname->len - digested, fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; - if (bigname) { - memcpy(&fname->hash, fname->crypto_buf.name, 4); - memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); + if (digested) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + fname->hash = n->hash; + fname->minor_hash = n->minor_hash; } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index e39696e644942a80d110035e2d49570840823af7..1e1f8a361b75479f3051cb924207f633cfbe8487 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -13,8 +13,6 @@ #include -#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 - /* Encryption parameters */ #define FS_XTS_TWEAK_SIZE 16 #define FS_AES_128_ECB_KEY_SIZE 16 @@ -22,10 +20,6 @@ #define FS_AES_256_CBC_KEY_SIZE 32 #define FS_AES_256_CTS_KEY_SIZE 32 #define FS_AES_256_XTS_KEY_SIZE 64 -#define FS_MAX_KEY_SIZE 64 - -#define FS_KEY_DESC_PREFIX "fscrypt:" -#define FS_KEY_DESC_PREFIX_SIZE 8 #define FS_KEY_DERIVATION_NONCE_SIZE 16 @@ -51,13 +45,6 @@ struct fscrypt_context { #define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 -/* This is passed in from userspace into the kernel keyring */ -struct fscrypt_key { - u32 mode; - u8 raw[FS_MAX_KEY_SIZE]; - u32 size; -} __packed; - /* * A pointer to this structure is stored in the file system's in-core * representation of an inode. 
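The fname.c hunks above replace the old hard-coded length bounds (43 for raw base64-encoded names, 33 for "_"-prefixed digested names) with expressions built from BASE64_CHARS(). The arithmetic can be checked with the small standalone sketch below; the 24-byte fscrypt_digested_name layout is mirrored here purely for illustration (the real definition lives in the fscrypt headers and uses __le32 fields):

/* check_fname_lens.c -- editorial sketch, not part of this patch.
 * Shows that BASE64_CHARS() reproduces the removed 43/33 constants. */
#include <stdio.h>
#include <stdint.h>

#define BASE64_CHARS(nbytes)	(((nbytes) * 4 + 2) / 3)	/* DIV_ROUND_UP(n*4, 3) */
#define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE	32

struct fscrypt_digested_name {	/* 24 bytes: hash, minor_hash, 16-byte digest */
	uint32_t hash;
	uint32_t minor_hash;
	uint8_t digest[16];
};

int main(void)
{
	/* longest base64 encoding of an undigested (<= 32-byte) ciphertext name */
	printf("undigested max: %d chars\n",
	       BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE));	/* prints 43 */
	/* '_' prefix plus base64 encoding of the 24-byte digested name */
	printf("digested: %zu chars\n",
	       1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name)));	/* prints 33 */
	return 0;
}

Compiled and run, this prints 43 and 33, matching the magic numbers the patch removes from fscrypt_setup_filename().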
diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 8cdfddce2b34868f0cfe3f71da55d64187172a38..179e578b875b1abbc58fbb2d3fd634b3608243ad 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -183,9 +183,6 @@ int fscrypt_get_encryption_info(struct inode *inode) if (res) return res; - if (!inode->i_sb->s_cop->get_context) - return -EOPNOTSUPP; - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (!fscrypt_dummy_context_enabled(inode) || diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 4908906d54d562263093cd5245fcb14e36d18b8e..210976e7a269ff0760cf81f6b1975632f9876862 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -34,9 +34,6 @@ static int create_encryption_context_from_policy(struct inode *inode, { struct fscrypt_context ctx; - if (!inode->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); @@ -87,8 +84,6 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) ret = -ENOTDIR; - else if (!inode->i_sb->s_cop->empty_dir) - ret = -EOPNOTSUPP; else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else @@ -118,8 +113,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) struct fscrypt_policy policy; int res; - if (!inode->i_sb->s_cop->get_context || - !inode->i_sb->s_cop->is_encrypted(inode)) + if (!inode->i_sb->s_cop->is_encrypted(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); @@ -143,27 +137,61 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) } EXPORT_SYMBOL(fscrypt_ioctl_get_policy); +/** + * fscrypt_has_permitted_context() - is a file's encryption policy permitted + * within its directory? + * + * @parent: inode for parent directory + * @child: inode for file being looked up, opened, or linked into @parent + * + * Filesystems must call this before permitting access to an inode in a + * situation where the parent directory is encrypted (either before allowing + * ->lookup() to succeed, or for a regular file before allowing it to be opened) + * and before any operation that involves linking an inode into an encrypted + * directory, including link, rename, and cross rename. It enforces the + * constraint that within a given encrypted directory tree, all files use the + * same encryption policy. The pre-access check is needed to detect potentially + * malicious offline violations of this constraint, while the link and rename + * checks are needed to prevent online violations of this constraint. + * + * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail + * the filesystem operation with EPERM. 
+ */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { - struct fscrypt_info *parent_ci, *child_ci; + const struct fscrypt_operations *cops = parent->i_sb->s_cop; + const struct fscrypt_info *parent_ci, *child_ci; + struct fscrypt_context parent_ctx, child_ctx; int res; - if ((parent == NULL) || (child == NULL)) { - printk(KERN_ERR "parent %p child %p\n", parent, child); - BUG_ON(1); - } - /* No restrictions on file types which are never encrypted */ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && !S_ISLNK(child->i_mode)) return 1; - /* no restrictions if the parent directory is not encrypted */ - if (!parent->i_sb->s_cop->is_encrypted(parent)) + /* No restrictions if the parent directory is unencrypted */ + if (!cops->is_encrypted(parent)) return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!parent->i_sb->s_cop->is_encrypted(child)) + + /* Encrypted directories must not contain unencrypted files */ + if (!cops->is_encrypted(child)) return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the fscrypt_info structs if the keys are + * available, otherwise retrieve and compare the fscrypt_contexts. + * + * Note that the fscrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a fscrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". + */ + res = fscrypt_get_encryption_info(parent); if (res) return 0; @@ -172,17 +200,32 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 0; parent_ci = parent->i_crypt_info; child_ci = child->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) + + if (parent_ci && child_ci) { + return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == + child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags); + } + + res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) return 0; - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); + res = cops->get_context(child, &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) + return 0; + + return memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode) && + (parent_ctx.flags == child_ctx.flags); } EXPORT_SYMBOL(fscrypt_has_permitted_context); @@ -202,9 +245,6 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, struct fscrypt_info *ci; int res; - if (!parent->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - res = fscrypt_get_encryption_info(parent); if (res < 0) return res; diff --git a/fs/dax.c b/fs/dax.c index 
85abd741253d4b89a42c6c5f2b4dd7bcf456e843..c22eaf162f95c1456563b31a8362da9e531c9185 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -55,32 +55,6 @@ static int __init init_dax_wait_table(void) } fs_initcall(init_dax_wait_table); -static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) -{ - struct request_queue *q = bdev->bd_queue; - long rc = -EIO; - - dax->addr = ERR_PTR(-EIO); - if (blk_queue_enter(q, true) != 0) - return rc; - - rc = bdev_direct_access(bdev, dax); - if (rc < 0) { - dax->addr = ERR_PTR(rc); - blk_queue_exit(q); - return rc; - } - return rc; -} - -static void dax_unmap_atomic(struct block_device *bdev, - const struct blk_dax_ctl *dax) -{ - if (IS_ERR(dax->addr)) - return; - blk_queue_exit(bdev->bd_queue); -} - static int dax_is_pmd_entry(void *entry) { return (unsigned long)entry & RADIX_DAX_PMD; @@ -101,26 +75,6 @@ static int dax_is_empty_entry(void *entry) return (unsigned long)entry & RADIX_DAX_EMPTY; } -struct page *read_dax_sector(struct block_device *bdev, sector_t n) -{ - struct page *page = alloc_pages(GFP_KERNEL, 0); - struct blk_dax_ctl dax = { - .size = PAGE_SIZE, - .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), - }; - long rc; - - if (!page) - return ERR_PTR(-ENOMEM); - - rc = dax_map_atomic(bdev, &dax); - if (rc < 0) - return ERR_PTR(rc); - memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); - dax_unmap_atomic(bdev, &dax); - return page; -} - /* * DAX radix tree locking */ @@ -506,35 +460,6 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) return ret; } -/* - * Invalidate exceptional DAX entry if easily possible. This handles DAX - * entries for invalidate_inode_pages() so we evict the entry only if we can - * do so without blocking. - */ -int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - int ret = 0; - void *entry, **slot; - struct radix_tree_root *page_tree = &mapping->page_tree; - - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(page_tree, index, NULL, &slot); - if (!entry || !radix_tree_exceptional_entry(entry) || - slot_locked(mapping, slot)) - goto out; - if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || - radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) - goto out; - radix_tree_delete(page_tree, index); - mapping->nrexceptional--; - ret = 1; -out: - spin_unlock_irq(&mapping->tree_lock); - if (ret) - dax_wake_mapping_entry_waiter(mapping, index, entry, true); - return ret; -} - /* * Invalidate exceptional DAX entry if it is clean. */ @@ -555,21 +480,25 @@ int dax_invalidate_mapping_entry_sync(struct address_space *mapping, static int dax_load_hole(struct address_space *mapping, void **entry, struct vm_fault *vmf) { + struct inode *inode = mapping->host; struct page *page; int ret; /* Hole page already exists? Return it... 
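 * (the slot then holds a plain struct page rather than an exceptional DAX entry)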
*/ if (!radix_tree_exceptional_entry(*entry)) { page = *entry; - goto out; + goto finish_fault; } /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) - return VM_FAULT_OOM; - out: + if (!page) { + ret = VM_FAULT_OOM; + goto out; + } + +finish_fault: vmf->page = page; ret = finish_fault(vmf); vmf->page = NULL; @@ -577,26 +506,37 @@ static int dax_load_hole(struct address_space *mapping, void **entry, if (!ret) { /* Grab reference for PTE that is now referencing the page */ get_page(page); - return VM_FAULT_NOPAGE; + ret = VM_FAULT_NOPAGE; } +out: + trace_dax_load_hole(inode, vmf, ret); return ret; } -static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, - struct page *to, unsigned long vaddr) +static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, + sector_t sector, size_t size, struct page *to, + unsigned long vaddr) { - struct blk_dax_ctl dax = { - .sector = sector, - .size = size, - }; - void *vto; - - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); + void *vto, *kaddr; + pgoff_t pgoff; + pfn_t pfn; + long rc; + int id; + + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (rc) + return rc; + + id = dax_read_lock(); + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } vto = kmap_atomic(to); - copy_user_page(vto, (void __force *)dax.addr, vaddr, to); + copy_user_page(vto, (void __force *)kaddr, vaddr, to); kunmap_atomic(vto); - dax_unmap_atomic(bdev, &dax); + dax_read_unlock(id); return 0; } @@ -764,12 +704,16 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, } static int dax_writeback_one(struct block_device *bdev, - struct address_space *mapping, pgoff_t index, void *entry) + struct dax_device *dax_dev, struct address_space *mapping, + pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; - struct blk_dax_ctl dax; - void *entry2, **slot; - int ret = 0; + void *entry2, **slot, *kaddr; + long ret = 0, id; + sector_t sector; + pgoff_t pgoff; + size_t size; + pfn_t pfn; /* * A page got tagged dirty in DAX mapping? Something is seriously @@ -818,26 +762,29 @@ static int dax_writeback_one(struct block_device *bdev, * 'entry'. This allows us to flush for PMD_SIZE and not have to * worry about partial PMD writebacks. */ - dax.sector = dax_radix_sector(entry); - dax.size = PAGE_SIZE << dax_radix_order(entry); + sector = dax_radix_sector(entry); + size = PAGE_SIZE << dax_radix_order(entry); + + id = dax_read_lock(); + ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (ret) + goto dax_unlock; /* - * We cannot hold tree_lock while calling dax_map_atomic() because it - * eventually calls cond_resched(). + * dax_direct_access() may sleep, so cannot hold tree_lock over + * its invocation. 
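+ * The dax_read_lock() taken above keeps the dax_device alive for as long
+ * as the kaddr/pfn pair returned by dax_direct_access() is in use.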
*/ - ret = dax_map_atomic(bdev, &dax); - if (ret < 0) { - put_locked_mapping_entry(mapping, index, entry); - return ret; - } + ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn); + if (ret < 0) + goto dax_unlock; - if (WARN_ON_ONCE(ret < dax.size)) { + if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) { ret = -EIO; - goto unmap; + goto dax_unlock; } - dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); - wb_cache_pmem(dax.addr, dax.size); + dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); + wb_cache_pmem(kaddr, size); /* * After we have flushed the cache, we can clear the dirty tag. There * cannot be new dirty data in the pfn after the flush has completed as @@ -847,8 +794,9 @@ static int dax_writeback_one(struct block_device *bdev, spin_lock_irq(&mapping->tree_lock); radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); - unmap: - dax_unmap_atomic(bdev, &dax); + trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); + dax_unlock: + dax_read_unlock(id); put_locked_mapping_entry(mapping, index, entry); return ret; @@ -869,6 +817,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, struct inode *inode = mapping->host; pgoff_t start_index, end_index; pgoff_t indices[PAGEVEC_SIZE]; + struct dax_device *dax_dev; struct pagevec pvec; bool done = false; int i, ret = 0; @@ -879,9 +828,15 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) return 0; + dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + if (!dax_dev) + return -EIO; + start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; + trace_dax_writeback_range(inode, start_index, end_index); + tag_pages_for_writeback(mapping, start_index, end_index); pagevec_init(&pvec, 0); @@ -899,38 +854,50 @@ int dax_writeback_mapping_range(struct address_space *mapping, break; } - ret = dax_writeback_one(bdev, mapping, indices[i], - pvec.pages[i]); + ret = dax_writeback_one(bdev, dax_dev, mapping, + indices[i], pvec.pages[i]); if (ret < 0) - return ret; + goto out; } } - return 0; +out: + put_dax(dax_dev); + trace_dax_writeback_range_done(inode, start_index, end_index); + return (ret < 0 ? 
ret : 0); } EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); static int dax_insert_mapping(struct address_space *mapping, - struct block_device *bdev, sector_t sector, size_t size, - void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) + struct block_device *bdev, struct dax_device *dax_dev, + sector_t sector, size_t size, void **entryp, + struct vm_area_struct *vma, struct vm_fault *vmf) { unsigned long vaddr = vmf->address; - struct blk_dax_ctl dax = { - .sector = sector, - .size = size, - }; - void *ret; void *entry = *entryp; + void *ret, *kaddr; + pgoff_t pgoff; + int id, rc; + pfn_t pfn; + + rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (rc) + return rc; - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - dax_unmap_atomic(bdev, &dax); + id = dax_read_lock(); + rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } + dax_read_unlock(id); - ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0); + ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); *entryp = ret; - return vm_insert_mixed(vma, vaddr, dax.pfn); + trace_dax_insert_mapping(mapping->host, vmf, ret); + return vm_insert_mixed(vma, vaddr, pfn); } /** @@ -941,6 +908,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) { struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; + struct inode *inode = mapping->host; void *entry, **slot; pgoff_t index = vmf->pgoff; @@ -950,6 +918,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) if (entry) put_unlocked_mapping_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); + trace_dax_pfn_mkwrite_no_entry(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); @@ -962,6 +931,7 @@ int dax_pfn_mkwrite(struct vm_fault *vmf) */ finish_mkwrite_fault(vmf); put_locked_mapping_entry(mapping, index, entry); + trace_dax_pfn_mkwrite(inode, vmf, VM_FAULT_NOPAGE); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -979,24 +949,34 @@ static bool dax_range_is_aligned(struct block_device *bdev, return true; } -int __dax_zero_page_range(struct block_device *bdev, sector_t sector, - unsigned int offset, unsigned int length) +int __dax_zero_page_range(struct block_device *bdev, + struct dax_device *dax_dev, sector_t sector, + unsigned int offset, unsigned int size) { - struct blk_dax_ctl dax = { - .sector = sector, - .size = PAGE_SIZE, - }; - - if (dax_range_is_aligned(bdev, offset, length)) { - sector_t start_sector = dax.sector + (offset >> 9); + if (dax_range_is_aligned(bdev, offset, size)) { + sector_t start_sector = sector + (offset >> 9); return blkdev_issue_zeroout(bdev, start_sector, - length >> 9, GFP_NOFS, true); + size >> 9, GFP_NOFS, 0); } else { - if (dax_map_atomic(bdev, &dax) < 0) - return PTR_ERR(dax.addr); - clear_pmem(dax.addr + offset, length); - dax_unmap_atomic(bdev, &dax); + pgoff_t pgoff; + long rc, id; + void *kaddr; + pfn_t pfn; + + rc = bdev_dax_pgoff(bdev, sector, PAGE_SIZE, &pgoff); + if (rc) + return rc; + + id = dax_read_lock(); + rc = dax_direct_access(dax_dev, pgoff, 1, &kaddr, + &pfn); + if (rc < 0) { + dax_read_unlock(id); + return rc; + } + clear_pmem(kaddr + offset, size); + dax_read_unlock(id); } return 0; } @@ -1011,9 +991,12 @@ static loff_t dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) { + struct block_device *bdev = iomap->bdev; + struct 
dax_device *dax_dev = iomap->dax_dev; struct iov_iter *iter = data; loff_t end = pos + length, done = 0; ssize_t ret = 0; + int id; if (iov_iter_rw(iter) == READ) { end = min(end, i_size_read(inode)); @@ -1032,40 +1015,48 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * into page tables. We have to tear down these mappings so that data * written by write(2) is visible in mmap. */ - if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) { + if (iomap->flags & IOMAP_F_NEW) { invalidate_inode_pages2_range(inode->i_mapping, pos >> PAGE_SHIFT, (end - 1) >> PAGE_SHIFT); } + id = dax_read_lock(); while (pos < end) { unsigned offset = pos & (PAGE_SIZE - 1); - struct blk_dax_ctl dax = { 0 }; + const size_t size = ALIGN(length + offset, PAGE_SIZE); + const sector_t sector = dax_iomap_sector(iomap, pos); ssize_t map_len; + pgoff_t pgoff; + void *kaddr; + pfn_t pfn; if (fatal_signal_pending(current)) { ret = -EINTR; break; } - dax.sector = dax_iomap_sector(iomap, pos); - dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; - map_len = dax_map_atomic(iomap->bdev, &dax); + ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); + if (ret) + break; + + map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), + &kaddr, &pfn); if (map_len < 0) { ret = map_len; break; } - dax.addr += offset; + map_len = PFN_PHYS(map_len); + kaddr += offset; map_len -= offset; if (map_len > end - pos) map_len = end - pos; if (iov_iter_rw(iter) == WRITE) - map_len = copy_from_iter_pmem(dax.addr, map_len, iter); + map_len = copy_from_iter_pmem(kaddr, map_len, iter); else - map_len = copy_to_iter(dax.addr, map_len, iter); - dax_unmap_atomic(iomap->bdev, &dax); + map_len = copy_to_iter(kaddr, map_len, iter); if (map_len <= 0) { ret = map_len ? map_len : -EFAULT; break; @@ -1075,6 +1066,7 @@ dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, length -= map_len; done += map_len; } + dax_read_unlock(id); return done ? done : ret; } @@ -1142,34 +1134,39 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, int vmf_ret = 0; void *entry; + trace_dax_pte_fault(inode, vmf, vmf_ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test. */ - if (pos >= i_size_read(inode)) - return VM_FAULT_SIGBUS; + if (pos >= i_size_read(inode)) { + vmf_ret = VM_FAULT_SIGBUS; + goto out; + } if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; + entry = grab_mapping_entry(mapping, vmf->pgoff, 0); + if (IS_ERR(entry)) { + vmf_ret = dax_fault_return(PTR_ERR(entry)); + goto out; + } + /* * Note that we don't bother to use iomap_apply here: DAX required * the file system block size to be equal the page size, which means * that we never have to deal with more than a single extent here. */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); - if (error) - return dax_fault_return(error); - if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { - vmf_ret = dax_fault_return(-EIO); /* fs corruption? */ - goto finish_iomap; + if (error) { + vmf_ret = dax_fault_return(error); + goto unlock_entry; } - - entry = grab_mapping_entry(mapping, vmf->pgoff, 0); - if (IS_ERR(entry)) { - vmf_ret = dax_fault_return(PTR_ERR(entry)); - goto finish_iomap; + if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { + error = -EIO; /* fs corruption? 
*/ + goto error_finish_iomap; } sector = dax_iomap_sector(&iomap, pos); @@ -1181,8 +1178,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, clear_user_highpage(vmf->cow_page, vaddr); break; case IOMAP_MAPPED: - error = copy_user_dax(iomap.bdev, sector, PAGE_SIZE, - vmf->cow_page, vaddr); + error = copy_user_dax(iomap.bdev, iomap.dax_dev, + sector, PAGE_SIZE, vmf->cow_page, vaddr); break; default: WARN_ON_ONCE(1); @@ -1191,13 +1188,13 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, } if (error) - goto error_unlock_entry; + goto error_finish_iomap; __SetPageUptodate(vmf->cow_page); vmf_ret = finish_fault(vmf); if (!vmf_ret) vmf_ret = VM_FAULT_DONE_COW; - goto unlock_entry; + goto finish_iomap; } switch (iomap.type) { @@ -1207,8 +1204,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, mem_cgroup_count_vm_event(vmf->vma->vm_mm, PGMAJFAULT); major = VM_FAULT_MAJOR; } - error = dax_insert_mapping(mapping, iomap.bdev, sector, - PAGE_SIZE, &entry, vmf->vma, vmf); + error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, + sector, PAGE_SIZE, &entry, vmf->vma, vmf); /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error == -EBUSY) error = 0; @@ -1217,7 +1214,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { vmf_ret = dax_load_hole(mapping, &entry, vmf); - goto unlock_entry; + goto finish_iomap; } /*FALLTHRU*/ default: @@ -1226,10 +1223,8 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, break; } - error_unlock_entry: + error_finish_iomap: vmf_ret = dax_fault_return(error) | major; - unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); finish_iomap: if (ops->iomap_end) { int copied = PAGE_SIZE; @@ -1244,6 +1239,10 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, */ ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); } + unlock_entry: + put_locked_mapping_entry(mapping, vmf->pgoff, entry); + out: + trace_dax_pte_fault_done(inode, vmf, vmf_ret); return vmf_ret; } @@ -1258,41 +1257,48 @@ static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, loff_t pos, void **entryp) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; + const sector_t sector = dax_iomap_sector(iomap, pos); + struct dax_device *dax_dev = iomap->dax_dev; struct block_device *bdev = iomap->bdev; struct inode *inode = mapping->host; - struct blk_dax_ctl dax = { - .sector = dax_iomap_sector(iomap, pos), - .size = PMD_SIZE, - }; - long length = dax_map_atomic(bdev, &dax); - void *ret = NULL; - - if (length < 0) /* dax_map_atomic() failed */ + const size_t size = PMD_SIZE; + void *ret = NULL, *kaddr; + long length = 0; + pgoff_t pgoff; + pfn_t pfn; + int id; + + if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0) goto fallback; - if (length < PMD_SIZE) - goto unmap_fallback; - if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) - goto unmap_fallback; - if (!pfn_t_devmap(dax.pfn)) - goto unmap_fallback; - dax_unmap_atomic(bdev, &dax); - - ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, + id = dax_read_lock(); + length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); + if (length < 0) + goto unlock_fallback; + length = PFN_PHYS(length); + + if (length < size) + goto unlock_fallback; + if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR) + goto unlock_fallback; + if (!pfn_t_devmap(pfn)) + goto unlock_fallback; + dax_read_unlock(id); + + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, sector, RADIX_DAX_PMD); if (IS_ERR(ret)) goto fallback; *entryp 
= ret; - trace_dax_pmd_insert_mapping(inode, vmf, length, dax.pfn, ret); + trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret); return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, - dax.pfn, vmf->flags & FAULT_FLAG_WRITE); + pfn, vmf->flags & FAULT_FLAG_WRITE); - unmap_fallback: - dax_unmap_atomic(bdev, &dax); +unlock_fallback: + dax_read_unlock(id); fallback: - trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, - dax.pfn, ret); + trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret); return VM_FAULT_FALLBACK; } @@ -1381,6 +1387,16 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, if ((pgoff | PG_PMD_COLOUR) > max_pgoff) goto fallback; + /* + * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX + * PMD or a HZP entry. If it can't (because a 4k page is already in + * the tree, for instance), it will return -EEXIST and we just fall + * back to 4k entries. + */ + entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); + if (IS_ERR(entry)) + goto fallback; + /* * Note that we don't use iomap_apply here. We aren't doing I/O, only * setting up a mapping, so really we're using iomap_begin() as a way @@ -1389,21 +1405,11 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pos = (loff_t)pgoff << PAGE_SHIFT; error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); if (error) - goto fallback; + goto unlock_entry; if (iomap.offset + iomap.length < pos + PMD_SIZE) goto finish_iomap; - /* - * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX - * PMD or a HZP entry. If it can't (because a 4k page is already in - * the tree, for instance), it will return -EEXIST and we just fall - * back to 4k entries. - */ - entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); - if (IS_ERR(entry)) - goto finish_iomap; - switch (iomap.type) { case IOMAP_MAPPED: result = dax_pmd_insert_mapping(vmf, &iomap, pos, &entry); @@ -1411,7 +1417,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (WARN_ON_ONCE(write)) - goto unlock_entry; + break; result = dax_pmd_load_hole(vmf, &iomap, &entry); break; default: @@ -1419,8 +1425,6 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, break; } - unlock_entry: - put_locked_mapping_entry(mapping, pgoff, entry); finish_iomap: if (ops->iomap_end) { int copied = PMD_SIZE; @@ -1436,6 +1440,8 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, &iomap); } + unlock_entry: + put_locked_mapping_entry(mapping, pgoff, entry); fallback: if (result == VM_FAULT_FALLBACK) { split_huge_pmd(vma, vmf->pmd, vmf->address); diff --git a/fs/dcache.c b/fs/dcache.c index 95d71eda81420a506c5a1f4a4f5283b31310d5e4..cddf39777835d0d27a71862b244cec37528b195b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -419,6 +419,8 @@ static void dentry_lru_add(struct dentry *dentry) { if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) d_lru_add(dentry); + else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) + dentry->d_flags |= DCACHE_REFERENCED; } /** @@ -779,8 +781,6 @@ void dput(struct dentry *dentry) goto kill_it; } - if (!(dentry->d_flags & DCACHE_REFERENCED)) - dentry->d_flags |= DCACHE_REFERENCED; dentry_lru_add(dentry); dentry->d_lockref.count--; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 7fd4ec4bb21406b1d427f24ee7789c54b62c335b..e892ae7d89f8bc078848d8752affa1598a7e1e11 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -199,7 +199,7 @@ static const struct dentry_operations 
debugfs_dops = { static int debug_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr debug_files[] = {{""}}; + static const struct tree_descr debug_files[] = {{""}}; struct debugfs_fs_info *fsi; int err; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 95c1c8d3453922be63c7c0d13e2bd8acd18d7e12..9c351bf757b20e037f39aeadf0fa0ed12f963db6 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -349,7 +349,6 @@ struct ecryptfs_mount_crypt_stat { struct ecryptfs_sb_info { struct super_block *wsi_sb; struct ecryptfs_mount_crypt_stat mount_crypt_stat; - struct backing_dev_info bdi; }; /* file private data. */ diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 151872dcc1f402ef47273b6da96a5d15a3f251ba..9014479d01600be356b87c284c675ada55b6a723 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -519,12 +519,11 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } - rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs"); + rc = super_setup_bdi(s); if (rc) goto out1; ecryptfs_set_superblock_private(s, sbi); - s->s_bdi = &sbi->bdi; /* ->kill_sb() will take care of sbi after that point */ sbi = NULL; @@ -633,7 +632,6 @@ static void ecryptfs_kill_block_super(struct super_block *sb) if (!sb_info) return; ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); - bdi_destroy(&sb_info->bdi); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 341251421ced00ab1be854c4145cf408cfc93c2f..5420767c9b686a7a9db30bbf932e85071c83c9f6 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -42,6 +42,7 @@ #include #include #include +#include /* * LOCKING: @@ -224,6 +225,11 @@ struct eventpoll { /* used to optimize loop detection check */ int visited; struct list_head visited_list_link; + +#ifdef CONFIG_NET_RX_BUSY_POLL + /* used to track busy poll napi_id */ + unsigned int napi_id; +#endif }; /* Wait structure used by the poll hooks */ @@ -384,6 +390,77 @@ static inline int ep_events_available(struct eventpoll *ep) return !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR; } +#ifdef CONFIG_NET_RX_BUSY_POLL +static bool ep_busy_loop_end(void *p, unsigned long start_time) +{ + struct eventpoll *ep = p; + + return ep_events_available(ep) || busy_loop_timeout(start_time); +} +#endif /* CONFIG_NET_RX_BUSY_POLL */ + +/* + * Busy poll if globally on and supporting sockets found && no events, + * busy loop will return if need_resched or ep_events_available. + * + * we must do our busy polling with irqs enabled + */ +static void ep_busy_loop(struct eventpoll *ep, int nonblock) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int napi_id = READ_ONCE(ep->napi_id); + + if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) + napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep); +#endif +} + +static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + if (ep->napi_id) + ep->napi_id = 0; +#endif +} + +/* + * Set epoll busy poll NAPI ID from sk. 
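+ * Both ep_poll_callback() and ep_insert() call this helper, so ep->napi_id
+ * ends up tracking the most recently seen busy-pollable socket.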
+ */ +static inline void ep_set_busy_poll_napi_id(struct epitem *epi) +{ +#ifdef CONFIG_NET_RX_BUSY_POLL + struct eventpoll *ep; + unsigned int napi_id; + struct socket *sock; + struct sock *sk; + int err; + + if (!net_busy_loop_on()) + return; + + sock = sock_from_file(epi->ffd.file, &err); + if (!sock) + return; + + sk = sock->sk; + if (!sk) + return; + + napi_id = READ_ONCE(sk->sk_napi_id); + ep = epi->ep; + + /* Non-NAPI IDs can be rejected + * or + * Nothing to do if we already have this ID + */ + if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id) + return; + + /* record NAPI ID for use in next busy poll */ + ep->napi_id = napi_id; +#endif +} + /** * ep_call_nested - Perform a bound (possibly) nested call, by checking * that the recursion limit is not exceeded, and that @@ -1022,6 +1099,8 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k spin_lock_irqsave(&ep->lock, flags); + ep_set_busy_poll_napi_id(epi); + /* * If the event mask does not contain any poll(2) event, we consider the * descriptor to be disabled. This condition is likely the effect of the @@ -1363,6 +1442,9 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, /* We have to drop the new item inside our item list to keep track of it */ spin_lock_irqsave(&ep->lock, flags); + /* record NAPI ID of new item if present */ + ep_set_busy_poll_napi_id(epi); + /* If the file is already "ready" we drop it inside the ready list */ if ((revents & event->events) && !ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1637,9 +1719,20 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, } fetch_events: + + if (!ep_events_available(ep)) + ep_busy_loop(ep, timed_out); + spin_lock_irqsave(&ep->lock, flags); if (!ep_events_available(ep)) { + /* + * Busy poll timed out. Drop NAPI ID for now, we can add + * it back in when we have moved a socket with a valid NAPI + * ID onto the ready list. + */ + ep_reset_busy_poll_napi_id(ep); + /* * We don't have any available event to return to the caller. 
* We need to sleep here, and we will be wake up by diff --git a/fs/exec.c b/fs/exec.c index 65145a3df065192345c66ebc311d464fe09a6f29..72934df6847150ba50dfbadad78fe10e01d2eadd 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1320,6 +1320,7 @@ void setup_new_exec(struct linux_binprm * bprm) else set_dumpable(current->mm, suid_dumpable); + arch_setup_new_exec(); perf_event_exec(); __set_task_comm(current, kbasename(bprm->filename), true); diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 42f9a0a0c4caf09722bc69fa278d509a7b7f16b3..8eeb694332fe811e820ed342ebbecc25bb678a1f 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -405,8 +405,7 @@ int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de, int err; lock_page(page); - err = exofs_write_begin(NULL, page->mapping, pos, len, - AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + err = exofs_write_begin(NULL, page->mapping, pos, len, 0, &page, NULL); if (err) EXOFS_ERR("exofs_set_link: exofs_write_begin FAILED => %d\n", err); diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 2e86086bc9403efe99a25ab0df439db0c714eb4e..5dc392404559b033445a38a0f25bfacfd41a5c9c 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -64,7 +64,6 @@ struct exofs_dev { * our extension to the in-memory superblock */ struct exofs_sb_info { - struct backing_dev_info bdi; /* register our bdi with VFS */ struct exofs_sb_stats s_ess; /* Written often, pre-allocate*/ int s_timeout; /* timeout for OSD operations */ uint64_t s_nextid; /* highest object ID used */ diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 1076a4233b3962e34df904f55f9923dbdec2b48f..819624cfc8da4c7bad401ee26611d69b7ff38f98 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -464,7 +464,6 @@ static void exofs_put_super(struct super_block *sb) sbi->one_comp.obj.partition); exofs_sysfs_sb_del(sbi); - bdi_destroy(&sbi->bdi); exofs_free_sbi(sbi); sb->s_fs_info = NULL; } @@ -809,8 +808,12 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) __sbi_read_stats(sbi); /* set up operation vectors */ - sbi->bdi.ra_pages = __ra_pages(&sbi->layout); - sb->s_bdi = &sbi->bdi; + ret = super_setup_bdi(sb); + if (ret) { + EXOFS_DBGMSG("Failed to super_setup_bdi\n"); + goto free_sbi; + } + sb->s_bdi->ra_pages = __ra_pages(&sbi->layout); sb->s_fs_info = sbi; sb->s_op = &exofs_sops; sb->s_export_op = &exofs_export_ops; @@ -836,14 +839,6 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) goto free_sbi; } - ret = bdi_setup_and_register(&sbi->bdi, "exofs"); - if (ret) { - EXOFS_DBGMSG("Failed to bdi_setup_and_register\n"); - dput(sb->s_root); - sb->s_root = NULL; - goto free_sbi; - } - exofs_sysfs_dbg_print(); _exofs_print_device("Mounting", opts->dev_name, ore_comp_dev(&sbi->oc, 0), diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 5e64de9c5093fa5d3a3e361305f8c648284c4e24..03f5ce1d3dbe1c334958c1de072ff88ce1cdeb8a 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -779,7 +779,6 @@ extern void ext2_evict_inode(struct inode *); extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); extern int ext2_setattr (struct dentry *, struct iattr *); extern void ext2_set_inode_flags(struct inode *inode); -extern void ext2_get_inode_flags(struct ext2_inode_info *); extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); @@ -796,7 +795,8 @@ void ext2_error(struct super_block *, const char *, const char *, ...); extern __printf(3, 4) void ext2_msg(struct super_block *, const char *, const char *, ...); extern void 
ext2_update_dynamic_rev (struct super_block *sb); -extern void ext2_write_super (struct super_block *); +extern void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, + int wait); /* * Inodes and files operations diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 128cce5406455716172a88a50cbeddd6edf046f2..26d77f9f8c12e69f48bc6cc1c4f47bb6fab27ebd 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -799,6 +799,7 @@ int ext2_get_block(struct inode *inode, sector_t iblock, static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { + struct block_device *bdev; unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long max_blocks = (length + (1 << blkbits) - 1) >> blkbits; @@ -812,8 +813,13 @@ static int ext2_iomap_begin(struct inode *inode, loff_t offset, loff_t length, return ret; iomap->flags = 0; - iomap->bdev = inode->i_sb->s_bdev; + bdev = inode->i_sb->s_bdev; + iomap->bdev = bdev; iomap->offset = (u64)first_block << blkbits; + if (blk_queue_dax(bdev->bd_queue)) + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + else + iomap->dax_dev = NULL; if (ret == 0) { iomap->type = IOMAP_HOLE; @@ -835,6 +841,7 @@ static int ext2_iomap_end(struct inode *inode, loff_t offset, loff_t length, ssize_t written, unsigned flags, struct iomap *iomap) { + put_dax(iomap->dax_dev); if (iomap->type == IOMAP_MAPPED && written < length && (flags & IOMAP_WRITE)) @@ -1384,25 +1391,6 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_DAX; } -/* Propagate flags from i_flags to EXT2_I(inode)->i_flags */ -void ext2_get_inode_flags(struct ext2_inode_info *ei) -{ - unsigned int flags = ei->vfs_inode.i_flags; - - ei->i_flags &= ~(EXT2_SYNC_FL|EXT2_APPEND_FL| - EXT2_IMMUTABLE_FL|EXT2_NOATIME_FL|EXT2_DIRSYNC_FL); - if (flags & S_SYNC) - ei->i_flags |= EXT2_SYNC_FL; - if (flags & S_APPEND) - ei->i_flags |= EXT2_APPEND_FL; - if (flags & S_IMMUTABLE) - ei->i_flags |= EXT2_IMMUTABLE_FL; - if (flags & S_NOATIME) - ei->i_flags |= EXT2_NOATIME_FL; - if (flags & S_DIRSYNC) - ei->i_flags |= EXT2_DIRSYNC_FL; -} - struct inode *ext2_iget (struct super_block *sb, unsigned long ino) { struct ext2_inode_info *ei; @@ -1563,7 +1551,6 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) if (ei->i_state & EXT2_STATE_NEW) memset(raw_inode, 0, EXT2_SB(sb)->s_inode_size); - ext2_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); if (!(test_opt(sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(uid)); @@ -1615,7 +1602,7 @@ static int __ext2_write_inode(struct inode *inode, int do_sync) EXT2_SET_RO_COMPAT_FEATURE(sb, EXT2_FEATURE_RO_COMPAT_LARGE_FILE); spin_unlock(&EXT2_SB(sb)->s_lock); - ext2_write_super(sb); + ext2_sync_super(sb, EXT2_SB(sb)->s_es, 1); } } } diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 191e02b28ce8ff0329646313a7e8445b36ef7b93..087f122cca42063a796367364e09874581fba90d 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -29,7 +29,6 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case EXT2_IOC_GETFLAGS: - ext2_get_inode_flags(ei); flags = ei->i_flags & EXT2_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); case EXT2_IOC_SETFLAGS: { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 9e25a71fe1a27e584ab3754f0a70b26e94f9a44c..9c2028b50e5c389ed6f86a89d9bf94960fcfc8cb 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -32,12 +32,12 @@ #include #include #include +#include 
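/*
 * Illustrative sketch, not part of the patch: the ext2_iomap_begin() and
 * ext2_iomap_end() hunks above pair dax_get_by_host() with put_dax() so
 * that the DAX device stays pinned for the duration of each iomap
 * operation. Pulled out of the diff, the pattern looks roughly like this;
 * "myfs" is a hypothetical filesystem used only for illustration.
 */
#include <linux/blkdev.h>
#include <linux/dax.h>
#include <linux/iomap.h>

static int myfs_iomap_begin(struct inode *inode, loff_t offset,
			    loff_t length, unsigned flags, struct iomap *iomap)
{
	struct block_device *bdev = inode->i_sb->s_bdev;

	iomap->bdev = bdev;
	/* Take a reference on the DAX device, if the queue supports DAX. */
	if (blk_queue_dax(bdev->bd_queue))
		iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
	else
		iomap->dax_dev = NULL;
	/* ... fill in iomap->type, ->offset and ->length as usual ... */
	return 0;
}

static int myfs_iomap_end(struct inode *inode, loff_t offset, loff_t length,
			  ssize_t written, unsigned flags, struct iomap *iomap)
{
	/* put_dax() accepts NULL, so the hole/non-DAX case needs no branch. */
	put_dax(iomap->dax_dev);
	return 0;
}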
#include "ext2.h" #include "xattr.h" #include "acl.h" -static void ext2_sync_super(struct super_block *sb, - struct ext2_super_block *es, int wait); +static void ext2_write_super(struct super_block *sb); static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext2_sync_fs(struct super_block *sb, int wait); @@ -123,13 +123,29 @@ void ext2_update_dynamic_rev(struct super_block *sb) */ } +#ifdef CONFIG_QUOTA +static int ext2_quota_off(struct super_block *sb, int type); + +static void ext2_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + ext2_quota_off(sb, type); +} +#else +static inline void ext2_quota_off_umount(struct super_block *sb) +{ +} +#endif + static void ext2_put_super (struct super_block * sb) { int db_count; int i; struct ext2_sb_info *sbi = EXT2_SB(sb); - dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + ext2_quota_off_umount(sb); if (sbi->s_mb_cache) { ext2_xattr_destroy_cache(sbi->s_mb_cache); @@ -314,10 +330,23 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root) #ifdef CONFIG_QUOTA static ssize_t ext2_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); static ssize_t ext2_quota_write(struct super_block *sb, int type, const char *data, size_t len, loff_t off); +static int ext2_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path); static struct dquot **ext2_get_dquots(struct inode *inode) { return EXT2_I(inode)->i_dquot; } + +static const struct quotactl_ops ext2_quotactl_ops = { + .quota_on = ext2_quota_on, + .quota_off = ext2_quota_off, + .quota_sync = dquot_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, + .get_nextdqblk = dquot_get_next_dqblk, +}; #endif static const struct super_operations ext2_sops = { @@ -1117,7 +1146,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; - sb->s_qcop = &dquot_quotactl_ops; + sb->s_qcop = &ext2_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif @@ -1194,8 +1223,8 @@ static void ext2_clear_super_error(struct super_block *sb) } } -static void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, - int wait) +void ext2_sync_super(struct super_block *sb, struct ext2_super_block *es, + int wait) { ext2_clear_super_error(sb); spin_lock(&EXT2_SB(sb)->s_lock); @@ -1270,7 +1299,7 @@ static int ext2_unfreeze(struct super_block *sb) return 0; } -void ext2_write_super(struct super_block *sb) +static void ext2_write_super(struct super_block *sb) { if (!(sb->s_flags & MS_RDONLY)) ext2_sync_fs(sb, 1); @@ -1548,6 +1577,51 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, return len - towrite; } +static int ext2_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path) +{ + int err; + struct inode *inode; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + inode_lock(inode); + EXT2_I(inode)->i_flags |= EXT2_NOATIME_FL | EXT2_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); + + return 0; +} + +static int ext2_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = 
sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + goto out; + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + EXT2_I(inode)->i_flags &= ~(EXT2_NOATIME_FL | EXT2_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); +out_put: + iput(inode); + return err; +out: + return dquot_quota_off(sb, type); +} + #endif static struct file_system_type ext2_fs_type = { diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 354103f3490c3cba719b64a8d0f297200fde67e6..d9beca1653c597bcc9f48835e766332124f07cc3 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -4,11 +4,11 @@ obj-$(CONFIG_EXT4_FS) += ext4.o -ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ - xattr_trusted.o inline.o readpage.o sysfs.o +ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ + extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ + indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ + mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ + super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index fb69ee2388dba0b83b4f29c57b37ec5cb0aefb34..8e8046104f4d8a945f2eae5e3bca7cb214d3cbb8 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2356,17 +2356,16 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); -int ext4_insert_dentry(struct inode *dir, - struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - struct ext4_filename *fname); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!ext4_has_feature_dir_index(inode->i_sb)) ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } -static unsigned char ext4_filetype_table[] = { +static const unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; @@ -2477,7 +2476,6 @@ extern int ext4_truncate(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); -extern void ext4_get_inode_flags(struct ext4_inode_info *); extern int ext4_alloc_da_blocks(struct inode *inode); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); @@ -3051,7 +3049,7 @@ extern int ext4_handle_dirty_dirent_node(handle_t *handle, struct inode *inode, struct buffer_head *bh); #define S_SHIFT 12 -static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { +static const unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index cefa9835f275d9b062ae9b13765ea743edb53f64..831fd6beebf01bfa65c1c4a3ae505ed84bd35526 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -257,6 +257,7 @@ static int ext4_dax_huge_fault(struct 
vm_fault *vmf, enum page_entry_size pe_size) { int result; + handle_t *handle = NULL; struct inode *inode = file_inode(vmf->vma->vm_file); struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -264,12 +265,24 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf, if (write) { sb_start_pagefault(sb); file_update_time(vmf->vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, + EXT4_DATA_TRANS_BLOCKS(sb)); + } else { + down_read(&EXT4_I(inode)->i_mmap_sem); } - down_read(&EXT4_I(inode)->i_mmap_sem); - result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); - up_read(&EXT4_I(inode)->i_mmap_sem); - if (write) + if (!IS_ERR(handle)) + result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops); + else + result = VM_FAULT_SIGBUS; + if (write) { + if (!IS_ERR(handle)) + ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); + } else { + up_read(&EXT4_I(inode)->i_mmap_sem); + } return result; } diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c new file mode 100644 index 0000000000000000000000000000000000000000..b194360988371638272871cdd95ab650e58d295c --- /dev/null +++ b/fs/ext4/fsmap.c @@ -0,0 +1,722 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "ext4.h" +#include +#include "fsmap.h" +#include "mballoc.h" +#include +#include +#include + +/* Convert an ext4_fsmap to an fsmap. */ +void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, + struct ext4_fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = src->fmr_physical << sb->s_blocksize_bits; + dest->fmr_owner = src->fmr_owner; + dest->fmr_offset = 0; + dest->fmr_length = src->fmr_length << sb->s_blocksize_bits; + dest->fmr_reserved[0] = 0; + dest->fmr_reserved[1] = 0; + dest->fmr_reserved[2] = 0; +} + +/* Convert an fsmap to an ext4_fsmap. 
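+ * Userspace byte values are shifted down to filesystem-block units here
+ * (note the >> sb->s_blocksize_bits); fmr_offset is not copied because
+ * ext4 never reports per-file extents.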
*/ +void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest, + struct fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = src->fmr_physical >> sb->s_blocksize_bits; + dest->fmr_owner = src->fmr_owner; + dest->fmr_length = src->fmr_length >> sb->s_blocksize_bits; +} + +/* getfsmap query state */ +struct ext4_getfsmap_info { + struct ext4_fsmap_head *gfi_head; + ext4_fsmap_format_t gfi_formatter; /* formatting fn */ + void *gfi_format_arg;/* format buffer */ + ext4_fsblk_t gfi_next_fsblk; /* next fsblock we expect */ + u32 gfi_dev; /* device id */ + ext4_group_t gfi_agno; /* bg number, if applicable */ + struct ext4_fsmap gfi_low; /* low rmap key */ + struct ext4_fsmap gfi_high; /* high rmap key */ + struct ext4_fsmap gfi_lastfree; /* free ext at end of last bg */ + struct list_head gfi_meta_list; /* fixed metadata list */ + bool gfi_last; /* last extent? */ +}; + +/* Associate a device with a getfsmap handler. */ +struct ext4_getfsmap_dev { + int (*gfd_fn)(struct super_block *sb, + struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info); + u32 gfd_dev; +}; + +/* Compare two getfsmap device handlers. */ +static int ext4_getfsmap_dev_compare(const void *p1, const void *p2) +{ + const struct ext4_getfsmap_dev *d1 = p1; + const struct ext4_getfsmap_dev *d2 = p2; + + return d1->gfd_dev - d2->gfd_dev; +} + +/* Compare a record against our starting point */ +static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info, + struct ext4_fsmap *rec) +{ + return rec->fmr_physical < info->gfi_low.fmr_physical; +} + +/* + * Format a reverse mapping for getfsmap, having translated rm_startblock + * into the appropriate daddr units. + */ +static int ext4_getfsmap_helper(struct super_block *sb, + struct ext4_getfsmap_info *info, + struct ext4_fsmap *rec) +{ + struct ext4_fsmap fmr; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t rec_fsblk = rec->fmr_physical; + ext4_group_t agno; + ext4_grpblk_t cno; + int error; + + if (fatal_signal_pending(current)) + return -EINTR; + + /* + * Filter out records that start before our startpoint, if the + * caller requested that. + */ + if (ext4_getfsmap_rec_before_low_key(info, rec)) { + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; + } + + /* Are we just counting mappings? */ + if (info->gfi_head->fmh_count == 0) { + if (rec_fsblk > info->gfi_next_fsblk) + info->gfi_head->fmh_entries++; + + if (info->gfi_last) + return EXT4_QUERY_RANGE_CONTINUE; + + info->gfi_head->fmh_entries++; + + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; + } + + /* + * If the record starts past the last physical block we saw, + * then we've found a gap. Report the gap as being owned by + * whatever the caller specified is the missing owner. 
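+ * In the ext4 implementation that owner is always EXT4_FMR_OWN_UNKNOWN.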
+ */ + if (rec_fsblk > info->gfi_next_fsblk) { + if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) + return EXT4_QUERY_RANGE_ABORT; + + ext4_get_group_no_and_offset(sb, info->gfi_next_fsblk, + &agno, &cno); + trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, + EXT4_C2B(sbi, cno), + rec_fsblk - info->gfi_next_fsblk, + EXT4_FMR_OWN_UNKNOWN); + + fmr.fmr_device = info->gfi_dev; + fmr.fmr_physical = info->gfi_next_fsblk; + fmr.fmr_owner = EXT4_FMR_OWN_UNKNOWN; + fmr.fmr_length = rec_fsblk - info->gfi_next_fsblk; + fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; + error = info->gfi_formatter(&fmr, info->gfi_format_arg); + if (error) + return error; + info->gfi_head->fmh_entries++; + } + + if (info->gfi_last) + goto out; + + /* Fill out the extent we found */ + if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) + return EXT4_QUERY_RANGE_ABORT; + + ext4_get_group_no_and_offset(sb, rec_fsblk, &agno, &cno); + trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno), + rec->fmr_length, rec->fmr_owner); + + fmr.fmr_device = info->gfi_dev; + fmr.fmr_physical = rec_fsblk; + fmr.fmr_owner = rec->fmr_owner; + fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; + fmr.fmr_length = rec->fmr_length; + error = info->gfi_formatter(&fmr, info->gfi_format_arg); + if (error) + return error; + info->gfi_head->fmh_entries++; + +out: + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; +} + +static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr) +{ + return fmr->fmr_physical + fmr->fmr_length; +} + +/* Transform a blockgroup's free record into a fsmap */ +static int ext4_getfsmap_datadev_helper(struct super_block *sb, + ext4_group_t agno, ext4_grpblk_t start, + ext4_grpblk_t len, void *priv) +{ + struct ext4_fsmap irec; + struct ext4_getfsmap_info *info = priv; + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb; + ext4_fsblk_t fslen; + int error; + + fsb = (EXT4_C2B(sbi, start) + ext4_group_first_block_no(sb, agno)); + fslen = EXT4_C2B(sbi, len); + + /* If the retained free extent record is set... */ + if (info->gfi_lastfree.fmr_owner) { + /* ...and abuts this one, lengthen it and return. */ + if (ext4_fsmap_next_pblk(&info->gfi_lastfree) == fsb) { + info->gfi_lastfree.fmr_length += fslen; + return 0; + } + + /* + * There's a gap between the two free extents; emit the + * retained extent prior to merging the meta_list. + */ + error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); + if (error) + return error; + info->gfi_lastfree.fmr_owner = 0; + } + + /* Merge in any relevant extents from the meta_list */ + list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { + if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) { + list_del(&p->fmr_list); + kfree(p); + } else if (p->fmr_physical < fsb) { + error = ext4_getfsmap_helper(sb, info, p); + if (error) + return error; + + list_del(&p->fmr_list); + kfree(p); + } + } + + irec.fmr_device = 0; + irec.fmr_physical = fsb; + irec.fmr_length = fslen; + irec.fmr_owner = EXT4_FMR_OWN_FREE; + irec.fmr_flags = 0; + + /* If this is a free extent at the end of a bg, buffer it. */ + if (ext4_fsmap_next_pblk(&irec) == + ext4_group_first_block_no(sb, agno + 1)) { + info->gfi_lastfree = irec; + return 0; + } + + /* Otherwise, emit it */ + return ext4_getfsmap_helper(sb, info, &irec); +} + +/* Execute a getfsmap query against the log device. 
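+ * Only an external journal device ever reaches this handler, and it is
+ * reported as a single fabricated record starting at j_blk_offset and
+ * j_maxlen blocks long.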
*/ +static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + struct ext4_fsmap irec; + + /* Set up search keys */ + info->gfi_low = keys[0]; + info->gfi_low.fmr_length = 0; + + memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); + + trace_ext4_fsmap_low_key(sb, info->gfi_dev, 0, + info->gfi_low.fmr_physical, + info->gfi_low.fmr_length, + info->gfi_low.fmr_owner); + + trace_ext4_fsmap_high_key(sb, info->gfi_dev, 0, + info->gfi_high.fmr_physical, + info->gfi_high.fmr_length, + info->gfi_high.fmr_owner); + + if (keys[0].fmr_physical > 0) + return 0; + + /* Fabricate an rmap entry for the external log device. */ + irec.fmr_physical = journal->j_blk_offset; + irec.fmr_length = journal->j_maxlen; + irec.fmr_owner = EXT4_FMR_OWN_LOG; + irec.fmr_flags = 0; + + return ext4_getfsmap_helper(sb, info, &irec); +} + +/* Helper to fill out an ext4_fsmap. */ +static inline int ext4_getfsmap_fill(struct list_head *meta_list, + ext4_fsblk_t fsb, ext4_fsblk_t len, + uint64_t owner) +{ + struct ext4_fsmap *fsm; + + fsm = kmalloc(sizeof(*fsm), GFP_NOFS); + if (!fsm) + return -ENOMEM; + fsm->fmr_device = 0; + fsm->fmr_flags = 0; + fsm->fmr_physical = fsb; + fsm->fmr_owner = owner; + fsm->fmr_length = len; + list_add_tail(&fsm->fmr_list, meta_list); + + return 0; +} + +/* + * This function returns the number of file system metadata blocks at + * the beginning of a block group, including the reserved gdt blocks. + */ +static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, + ext4_group_t agno, + struct list_head *meta_list) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb = ext4_group_first_block_no(sb, agno); + ext4_fsblk_t len; + unsigned long first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + unsigned long metagroup = agno / EXT4_DESC_PER_BLOCK(sb); + int error; + + /* Record the superblock. */ + if (ext4_bg_has_super(sb, agno)) { + error = ext4_getfsmap_fill(meta_list, fsb, 1, EXT4_FMR_OWN_FS); + if (error) + return error; + fsb++; + } + + /* Record the group descriptors. */ + len = ext4_bg_num_gdb(sb, agno); + if (!len) + return 0; + error = ext4_getfsmap_fill(meta_list, fsb, len, + EXT4_FMR_OWN_GDT); + if (error) + return error; + fsb += len; + + /* Reserved GDT blocks */ + if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) { + len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + error = ext4_getfsmap_fill(meta_list, fsb, len, + EXT4_FMR_OWN_RESV_GDT); + if (error) + return error; + } + + return 0; +} + +/* Compare two fsmap items. */ +static int ext4_getfsmap_compare(void *priv, + struct list_head *a, + struct list_head *b) +{ + struct ext4_fsmap *fa; + struct ext4_fsmap *fb; + + fa = container_of(a, struct ext4_fsmap, fmr_list); + fb = container_of(b, struct ext4_fsmap, fmr_list); + if (fa->fmr_physical < fb->fmr_physical) + return -1; + else if (fa->fmr_physical > fb->fmr_physical) + return 1; + return 0; +} + +/* Merge adjacent extents of fixed metadata. 
*/ +static void ext4_getfsmap_merge_fixed_metadata(struct list_head *meta_list) +{ + struct ext4_fsmap *p; + struct ext4_fsmap *prev = NULL; + struct ext4_fsmap *tmp; + + list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { + if (!prev) { + prev = p; + continue; + } + + if (prev->fmr_owner == p->fmr_owner && + prev->fmr_physical + prev->fmr_length == p->fmr_physical) { + prev->fmr_length += p->fmr_length; + list_del(&p->fmr_list); + kfree(p); + } else + prev = p; + } +} + +/* Free a list of fixed metadata. */ +static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list) +{ + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + + list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { + list_del(&p->fmr_list); + kfree(p); + } +} + +/* Find all the fixed metadata in the filesystem. */ +int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, + struct list_head *meta_list) +{ + struct ext4_group_desc *gdp; + ext4_group_t agno; + int error; + + INIT_LIST_HEAD(meta_list); + + /* Collect everything. */ + for (agno = 0; agno < EXT4_SB(sb)->s_groups_count; agno++) { + gdp = ext4_get_group_desc(sb, agno, NULL); + if (!gdp) { + error = -EFSCORRUPTED; + goto err; + } + + /* Superblock & GDT */ + error = ext4_getfsmap_find_sb(sb, agno, meta_list); + if (error) + goto err; + + /* Block bitmap */ + error = ext4_getfsmap_fill(meta_list, + ext4_block_bitmap(sb, gdp), 1, + EXT4_FMR_OWN_BLKBM); + if (error) + goto err; + + /* Inode bitmap */ + error = ext4_getfsmap_fill(meta_list, + ext4_inode_bitmap(sb, gdp), 1, + EXT4_FMR_OWN_INOBM); + if (error) + goto err; + + /* Inodes */ + error = ext4_getfsmap_fill(meta_list, + ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group, + EXT4_FMR_OWN_INODES); + if (error) + goto err; + } + + /* Sort the list */ + list_sort(NULL, meta_list, ext4_getfsmap_compare); + + /* Merge adjacent extents */ + ext4_getfsmap_merge_fixed_metadata(meta_list); + + return 0; +err: + ext4_getfsmap_free_fixed_metadata(meta_list); + return error; +} + +/* Execute a getfsmap query against the buddy bitmaps */ +static int ext4_getfsmap_datadev(struct super_block *sb, + struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start_fsb; + ext4_fsblk_t end_fsb; + ext4_fsblk_t eofs; + ext4_group_t start_ag; + ext4_group_t end_ag; + ext4_grpblk_t first_cluster; + ext4_grpblk_t last_cluster; + int error = 0; + + eofs = ext4_blocks_count(sbi->s_es); + if (keys[0].fmr_physical >= eofs) + return 0; + if (keys[1].fmr_physical >= eofs) + keys[1].fmr_physical = eofs - 1; + start_fsb = keys[0].fmr_physical; + end_fsb = keys[1].fmr_physical; + + /* Determine first and last group to examine based on start and end */ + ext4_get_group_no_and_offset(sb, start_fsb, &start_ag, &first_cluster); + ext4_get_group_no_and_offset(sb, end_fsb, &end_ag, &last_cluster); + + /* + * Convert the fsmap low/high keys to bg based keys. Initialize + * low to the fsmap low key and max out the high key to the end + * of the bg. + */ + info->gfi_low = keys[0]; + info->gfi_low.fmr_physical = EXT4_C2B(sbi, first_cluster); + info->gfi_low.fmr_length = 0; + + memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); + + /* Assemble a list of all the fixed-location metadata. 
*/ + error = ext4_getfsmap_find_fixed_metadata(sb, &info->gfi_meta_list); + if (error) + goto err; + + /* Query each bg */ + for (info->gfi_agno = start_ag; + info->gfi_agno <= end_ag; + info->gfi_agno++) { + /* + * Set the bg high key from the fsmap high key if this + * is the last bg that we're querying. + */ + if (info->gfi_agno == end_ag) { + info->gfi_high = keys[1]; + info->gfi_high.fmr_physical = EXT4_C2B(sbi, + last_cluster); + info->gfi_high.fmr_length = 0; + } + + trace_ext4_fsmap_low_key(sb, info->gfi_dev, info->gfi_agno, + info->gfi_low.fmr_physical, + info->gfi_low.fmr_length, + info->gfi_low.fmr_owner); + + trace_ext4_fsmap_high_key(sb, info->gfi_dev, info->gfi_agno, + info->gfi_high.fmr_physical, + info->gfi_high.fmr_length, + info->gfi_high.fmr_owner); + + error = ext4_mballoc_query_range(sb, info->gfi_agno, + EXT4_B2C(sbi, info->gfi_low.fmr_physical), + EXT4_B2C(sbi, info->gfi_high.fmr_physical), + ext4_getfsmap_datadev_helper, info); + if (error) + goto err; + + /* + * Set the bg low key to the start of the bg prior to + * moving on to the next bg. + */ + if (info->gfi_agno == start_ag) + memset(&info->gfi_low, 0, sizeof(info->gfi_low)); + } + + /* Do we have a retained free extent? */ + if (info->gfi_lastfree.fmr_owner) { + error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); + if (error) + goto err; + } + + /* Report any gaps at the end of the bg */ + info->gfi_last = true; + error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info); + if (error) + goto err; + +err: + ext4_getfsmap_free_fixed_metadata(&info->gfi_meta_list); + return error; +} + +/* Do we recognize the device? */ +static bool ext4_getfsmap_is_valid_device(struct super_block *sb, + struct ext4_fsmap *fm) +{ + if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || + fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) + return true; + if (EXT4_SB(sb)->journal_bdev && + fm->fmr_device == new_encode_dev(EXT4_SB(sb)->journal_bdev->bd_dev)) + return true; + return false; +} + +/* Ensure that the low key is less than the high key. */ +static bool ext4_getfsmap_check_keys(struct ext4_fsmap *low_key, + struct ext4_fsmap *high_key) +{ + if (low_key->fmr_device > high_key->fmr_device) + return false; + if (low_key->fmr_device < high_key->fmr_device) + return true; + + if (low_key->fmr_physical > high_key->fmr_physical) + return false; + if (low_key->fmr_physical < high_key->fmr_physical) + return true; + + if (low_key->fmr_owner > high_key->fmr_owner) + return false; + if (low_key->fmr_owner < high_key->fmr_owner) + return true; + + return false; +} + +#define EXT4_GETFSMAP_DEVS 2 +/* + * Get filesystem's extents as described in head, and format for + * output. Calls formatter to fill the user's buffer until all + * extents are mapped, until the passed-in head->fmh_count slots have + * been filled, or until the formatter short-circuits the loop, if it + * is tracking filled-in extents on its own. + * + * Key to Confusion + * ---------------- + * There are multiple levels of keys and counters at work here: + * _fsmap_head.fmh_keys -- low and high fsmap keys passed in; + * these reflect fs-wide block addrs. + * dkeys -- fmh_keys used to query each device; + * these are fmh_keys but w/ the low key + * bumped up by fmr_length. + * _getfsmap_info.gfi_next_fsblk-- next fs block we expect to see; this + * is how we detect gaps in the fsmap + * records and report them. + * _getfsmap_info.gfi_low/high -- per-bg low/high keys computed from + * dkeys; used to query the free space. 
+ */ +int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, + ext4_fsmap_format_t formatter, void *arg) +{ + struct ext4_fsmap dkeys[2]; /* per-dev keys */ + struct ext4_getfsmap_dev handlers[EXT4_GETFSMAP_DEVS]; + struct ext4_getfsmap_info info = {0}; + int i; + int error = 0; + + if (head->fmh_iflags & ~FMH_IF_VALID) + return -EINVAL; + if (!ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[0]) || + !ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[1])) + return -EINVAL; + + head->fmh_entries = 0; + + /* Set up our device handlers. */ + memset(handlers, 0, sizeof(handlers)); + handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); + handlers[0].gfd_fn = ext4_getfsmap_datadev; + if (EXT4_SB(sb)->journal_bdev) { + handlers[1].gfd_dev = new_encode_dev( + EXT4_SB(sb)->journal_bdev->bd_dev); + handlers[1].gfd_fn = ext4_getfsmap_logdev; + } + + sort(handlers, EXT4_GETFSMAP_DEVS, sizeof(struct ext4_getfsmap_dev), + ext4_getfsmap_dev_compare, NULL); + + /* + * To continue where we left off, we allow userspace to use the + * last mapping from a previous call as the low key of the next. + * This is identified by a non-zero length in the low key. We + * have to increment the low key in this scenario to ensure we + * don't return the same mapping again, and instead return the + * very next mapping. + * + * Bump the physical offset as there can be no other mapping for + * the same physical block range. + */ + dkeys[0] = head->fmh_keys[0]; + dkeys[0].fmr_physical += dkeys[0].fmr_length; + dkeys[0].fmr_owner = 0; + dkeys[0].fmr_length = 0; + memset(&dkeys[1], 0xFF, sizeof(struct ext4_fsmap)); + + if (!ext4_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) + return -EINVAL; + + info.gfi_next_fsblk = head->fmh_keys[0].fmr_physical + + head->fmh_keys[0].fmr_length; + info.gfi_formatter = formatter; + info.gfi_format_arg = arg; + info.gfi_head = head; + + /* For each device we support... */ + for (i = 0; i < EXT4_GETFSMAP_DEVS; i++) { + /* Is this device within the range the user asked for? */ + if (!handlers[i].gfd_fn) + continue; + if (head->fmh_keys[0].fmr_device > handlers[i].gfd_dev) + continue; + if (head->fmh_keys[1].fmr_device < handlers[i].gfd_dev) + break; + + /* + * If this device number matches the high key, we have + * to pass the high key to the handler to limit the + * query results. If the device number exceeds the + * low key, zero out the low key so that we get + * everything from the beginning. + */ + if (handlers[i].gfd_dev == head->fmh_keys[1].fmr_device) + dkeys[1] = head->fmh_keys[1]; + if (handlers[i].gfd_dev > head->fmh_keys[0].fmr_device) + memset(&dkeys[0], 0, sizeof(struct ext4_fsmap)); + + info.gfi_dev = handlers[i].gfd_dev; + info.gfi_last = false; + info.gfi_agno = -1; + error = handlers[i].gfd_fn(sb, dkeys, &info); + if (error) + break; + info.gfi_next_fsblk = 0; + } + + head->fmh_oflags = FMH_OF_DEV_T; + return error; +} diff --git a/fs/ext4/fsmap.h b/fs/ext4/fsmap.h new file mode 100644 index 0000000000000000000000000000000000000000..9a2cd367cc66bd0b6ee97cade916cbc801051a7a --- /dev/null +++ b/fs/ext4/fsmap.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __EXT4_FSMAP_H__ +#define __EXT4_FSMAP_H__ + +struct fsmap; + +/* internal fsmap representation */ +struct ext4_fsmap { + struct list_head fmr_list; + dev_t fmr_device; /* device id */ + uint32_t fmr_flags; /* mapping flags */ + uint64_t fmr_physical; /* device offset of segment */ + uint64_t fmr_owner; /* owner id */ + uint64_t fmr_length; /* length of segment, blocks */ +}; + +struct ext4_fsmap_head { + uint32_t fmh_iflags; /* control flags */ + uint32_t fmh_oflags; /* output flags */ + unsigned int fmh_count; /* # of entries in array incl. input */ + unsigned int fmh_entries; /* # of entries filled in (output). */ + + struct ext4_fsmap fmh_keys[2]; /* low and high keys */ +}; + +void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, + struct ext4_fsmap *src); +void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest, + struct fsmap *src); + +/* fsmap to userspace formatter - copy to user & advance pointer */ +typedef int (*ext4_fsmap_format_t)(struct ext4_fsmap *, void *); + +int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, + ext4_fsmap_format_t formatter, void *arg); + +#define EXT4_QUERY_RANGE_ABORT 1 +#define EXT4_QUERY_RANGE_CONTINUE 0 + +/* fmr_owner special values for FS_IOC_GETFSMAP; some share w/ XFS */ +#define EXT4_FMR_OWN_FREE FMR_OWN_FREE /* free space */ +#define EXT4_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */ +#define EXT4_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */ +#define EXT4_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */ +#define EXT4_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */ +#define EXT4_FMR_OWN_GDT FMR_OWNER('f', 1) /* group descriptors */ +#define EXT4_FMR_OWN_RESV_GDT FMR_OWNER('f', 2) /* reserved gdt blocks */ +#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* inode bitmap */ +#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* block bitmap */ + +#endif /* __EXT4_FSMAP_H__ */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 17bc043308f33bb28cdb4d63ea7393b0dfc310e1..98ac2f1f23b3d74ec29ac31eeb276da4f1de5605 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1098,6 +1098,17 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, if (err) goto fail_drop; + /* + * Since the encryption xattr will always be unique, create it first so + * that it's less likely to end up in an external xattr block and + * prevent its deduplication. 
+ */ + if (encrypt) { + err = fscrypt_inherit_context(dir, inode, handle, true); + if (err) + goto fail_free_drop; + } + err = ext4_init_acl(handle, inode, dir); if (err) goto fail_free_drop; @@ -1119,12 +1130,6 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ei->i_datasync_tid = handle->h_transaction->t_tid; } - if (encrypt) { - err = fscrypt_inherit_context(dir, inode, handle, true); - if (err) - goto fail_free_drop; - } - err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 375fb1c05d49ce87a287213720ca0ebf5e0deef1..d5dea4c293ef46be022c538d0465e60284b20fe4 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1034,7 +1034,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(dir, inode, de, inline_size, fname); + ext4_insert_dentry(inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b9ffa9f4191f4cb3a122c215894ef76689fd9604..5834c4d76be80969125a9c2d56309e990ba07e84 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1643,6 +1643,7 @@ struct mpage_da_data { */ struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ + unsigned int do_map:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, @@ -2179,6 +2180,9 @@ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, /* First block in the extent? */ if (map->m_len == 0) { + /* We cannot map unless handle is started... */ + if (!mpd->do_map) + return false; map->m_lblk = lblk; map->m_len = 1; map->m_flags = bh->b_state & BH_FLAGS; @@ -2231,6 +2235,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, /* Found extent to map? */ if (mpd->map.m_len) return 0; + /* Buffer needs mapping and handle is not started? */ + if (!mpd->do_map) + return 0; /* Everything mapped so far and we hit EOF */ break; } @@ -2747,6 +2754,29 @@ static int ext4_writepages(struct address_space *mapping, tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); done = false; blk_start_plug(&plug); + + /* + * First writeback pages that don't need mapping - we can avoid + * starting a transaction unnecessarily and also avoid being blocked + * in the block layer on device congestion while having transaction + * started. 
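+ * With mpd.do_map == 0, mpage_add_bh_to_extent() and
+ * mpage_process_page_bufs() refuse buffers that still need mapping, so
+ * this first pass only submits pages whose buffers are already mapped.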
+ */ + mpd.do_map = 0; + mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd.io_submit.io_end) { + ret = -ENOMEM; + goto unplug; + } + ret = mpage_prepare_extent_to_map(&mpd); + /* Submit prepared bio */ + ext4_io_submit(&mpd.io_submit); + ext4_put_io_end_defer(mpd.io_submit.io_end); + mpd.io_submit.io_end = NULL; + /* Unlock pages we didn't use */ + mpage_release_unused_pages(&mpd, false); + if (ret < 0) + goto unplug; + while (!done && mpd.first_page <= mpd.last_page) { /* For each extent of pages we use new io_end */ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); @@ -2775,8 +2805,10 @@ static int ext4_writepages(struct address_space *mapping, wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd.io_submit.io_end); + mpd.io_submit.io_end = NULL; break; } + mpd.do_map = 1; trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); ret = mpage_prepare_extent_to_map(&mpd); @@ -2807,6 +2839,7 @@ static int ext4_writepages(struct address_space *mapping, if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; + mpd.do_map = 0; } /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); @@ -2824,6 +2857,7 @@ static int ext4_writepages(struct address_space *mapping, ext4_journal_stop(handle); } else ext4_put_io_end(mpd.io_submit.io_end); + mpd.io_submit.io_end = NULL; if (ret == -ENOSPC && sbi->s_journal) { /* @@ -2839,6 +2873,7 @@ static int ext4_writepages(struct address_space *mapping, if (ret) break; } +unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; @@ -3305,6 +3340,7 @@ static int ext4_releasepage(struct page *page, gfp_t wait) static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, unsigned flags, struct iomap *iomap) { + struct block_device *bdev; unsigned int blkbits = inode->i_blkbits; unsigned long first_block = offset >> blkbits; unsigned long last_block = (offset + length - 1) >> blkbits; @@ -3373,7 +3409,12 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, } iomap->flags = 0; - iomap->bdev = inode->i_sb->s_bdev; + bdev = inode->i_sb->s_bdev; + iomap->bdev = bdev; + if (blk_queue_dax(bdev->bd_queue)) + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + else + iomap->dax_dev = NULL; iomap->offset = first_block << blkbits; if (ret == 0) { @@ -3406,6 +3447,7 @@ static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, int blkbits = inode->i_blkbits; bool truncate = false; + put_dax(iomap->dax_dev); if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) return 0; @@ -4502,31 +4544,6 @@ void ext4_set_inode_flags(struct inode *inode) S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); } -/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ -void ext4_get_inode_flags(struct ext4_inode_info *ei) -{ - unsigned int vfs_fl; - unsigned long old_fl, new_fl; - - do { - vfs_fl = ei->vfs_inode.i_flags; - old_fl = ei->i_flags; - new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL| - EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL| - EXT4_DIRSYNC_FL); - if (vfs_fl & S_SYNC) - new_fl |= EXT4_SYNC_FL; - if (vfs_fl & S_APPEND) - new_fl |= EXT4_APPEND_FL; - if (vfs_fl & S_IMMUTABLE) - new_fl |= EXT4_IMMUTABLE_FL; - if (vfs_fl & S_NOATIME) - new_fl |= EXT4_NOATIME_FL; - if (vfs_fl & S_DIRSYNC) - new_fl |= EXT4_DIRSYNC_FL; - } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl); -} - static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, struct ext4_inode_info *ei) { 
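/*
 * Illustrative sketch, not part of the patch: the FS_IOC_GETFSMAP ioctl
 * wired up in fs/ext4/ioctl.c below is consumed from userspace roughly as
 * follows. The continuation convention matches the "Key to Confusion"
 * comment in ext4_getfsmap() above: feed the last record returned back in
 * as the low key and call again until a record carries FMR_OF_LAST. Error
 * handling is abbreviated; dump_fsmap() and the 32-record batch size are
 * assumptions made for the example.
 */
#include <linux/fsmap.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>

static int dump_fsmap(int fd)
{
	struct fsmap_head *head;
	struct fsmap *rec;
	unsigned int i;

	/* Header plus room for 32 records per call. */
	head = calloc(1, sizeof(*head) + 32 * sizeof(struct fsmap));
	if (!head)
		return -1;
	head->fmh_count = 32;
	/* All-ones high key: report every mapping on every device. */
	memset(&head->fmh_keys[1], 0xFF, sizeof(struct fsmap));

	for (;;) {
		if (ioctl(fd, FS_IOC_GETFSMAP, head) < 0 ||
		    head->fmh_entries == 0)
			break;
		for (i = 0; i < head->fmh_entries; i++) {
			rec = &head->fmh_recs[i];
			printf("dev %u: phys %llu len %llu owner 0x%llx\n",
			       rec->fmr_device,
			       (unsigned long long)rec->fmr_physical,
			       (unsigned long long)rec->fmr_length,
			       (unsigned long long)rec->fmr_owner);
		}
		rec = &head->fmh_recs[head->fmh_entries - 1];
		if (rec->fmr_flags & FMR_OF_LAST)
			break;
		/* Continue the query from the last mapping we saw. */
		head->fmh_keys[0] = *rec;
	}
	free(head);
	return 0;
}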
@@ -4963,7 +4980,6 @@ static int ext4_do_update_inode(handle_t *handle, if (ext4_test_inode_state(inode, EXT4_STATE_NEW)) memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); - ext4_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); @@ -5874,6 +5890,11 @@ int ext4_page_mkwrite(struct vm_fault *vmf) file_update_time(vma->vm_file); down_read(&EXT4_I(inode)->i_mmap_sem); + + ret = ext4_convert_inline_data(inode); + if (ret) + goto out_ret; + /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index a4273ddb992232e1587e33fd7f3a23ea926aaa7e..0c21e22acd74f37ec21f61d6d990fad33ce767db 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -19,6 +19,9 @@ #include #include "ext4_jbd2.h" #include "ext4.h" +#include +#include "fsmap.h" +#include /** * Swap memory between @a and @b for @len bytes. @@ -443,7 +446,7 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) return iflags; } -int ext4_shutdown(struct super_block *sb, unsigned long arg) +static int ext4_shutdown(struct super_block *sb, unsigned long arg) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u32 flags; @@ -489,6 +492,90 @@ int ext4_shutdown(struct super_block *sb, unsigned long arg) return 0; } +struct getfsmap_info { + struct super_block *gi_sb; + struct fsmap_head __user *gi_data; + unsigned int gi_idx; + __u32 gi_last_flags; +}; + +static int ext4_getfsmap_format(struct ext4_fsmap *xfm, void *priv) +{ + struct getfsmap_info *info = priv; + struct fsmap fm; + + trace_ext4_getfsmap_mapping(info->gi_sb, xfm); + + info->gi_last_flags = xfm->fmr_flags; + ext4_fsmap_from_internal(info->gi_sb, &fm, xfm); + if (copy_to_user(&info->gi_data->fmh_recs[info->gi_idx++], &fm, + sizeof(struct fsmap))) + return -EFAULT; + + return 0; +} + +static int ext4_ioc_getfsmap(struct super_block *sb, + struct fsmap_head __user *arg) +{ + struct getfsmap_info info = {0}; + struct ext4_fsmap_head xhead = {0}; + struct fsmap_head head; + bool aborted = false; + int error; + + if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) + return -EFAULT; + if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || + memchr_inv(head.fmh_keys[0].fmr_reserved, 0, + sizeof(head.fmh_keys[0].fmr_reserved)) || + memchr_inv(head.fmh_keys[1].fmr_reserved, 0, + sizeof(head.fmh_keys[1].fmr_reserved))) + return -EINVAL; + /* + * ext4 doesn't report file extents at all, so the only valid + * file offsets are the magic ones (all zeroes or all ones). 
+ */ + if (head.fmh_keys[0].fmr_offset || + (head.fmh_keys[1].fmr_offset != 0 && + head.fmh_keys[1].fmr_offset != -1ULL)) + return -EINVAL; + + xhead.fmh_iflags = head.fmh_iflags; + xhead.fmh_count = head.fmh_count; + ext4_fsmap_to_internal(sb, &xhead.fmh_keys[0], &head.fmh_keys[0]); + ext4_fsmap_to_internal(sb, &xhead.fmh_keys[1], &head.fmh_keys[1]); + + trace_ext4_getfsmap_low_key(sb, &xhead.fmh_keys[0]); + trace_ext4_getfsmap_high_key(sb, &xhead.fmh_keys[1]); + + info.gi_sb = sb; + info.gi_data = arg; + error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info); + if (error == EXT4_QUERY_RANGE_ABORT) { + error = 0; + aborted = true; + } else if (error) + return error; + + /* If we didn't abort, set the "last" flag in the last fmx */ + if (!aborted && info.gi_idx) { + info.gi_last_flags |= FMR_OF_LAST; + if (copy_to_user(&info.gi_data->fmh_recs[info.gi_idx - 1].fmr_flags, + &info.gi_last_flags, + sizeof(info.gi_last_flags))) + return -EFAULT; + } + + /* copy back header */ + head.fmh_entries = xhead.fmh_entries; + head.fmh_oflags = xhead.fmh_oflags; + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) + return -EFAULT; + + return 0; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -499,8 +586,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { + case FS_IOC_GETFSMAP: + return ext4_ioc_getfsmap(sb, (void __user *)arg); case EXT4_IOC_GETFLAGS: - ext4_get_inode_flags(ei); flags = ei->i_flags & EXT4_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); case EXT4_IOC_SETFLAGS: { @@ -888,7 +976,6 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct fsxattr fa; memset(&fa, 0, sizeof(struct fsxattr)); - ext4_get_inode_flags(ei); fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); if (ext4_has_feature_project(inode->i_sb)) { @@ -1009,6 +1096,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_GET_ENCRYPTION_PWSALT: case EXT4_IOC_GET_ENCRYPTION_POLICY: case EXT4_IOC_SHUTDOWN: + case FS_IOC_GETFSMAP: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 354dc1a894c29bb86dcafcd1b2a5d8a53ca7119d..5083bce20ac4dc2243ace1bba19ef506b90d9c29 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -357,7 +357,7 @@ static struct kmem_cache *ext4_free_data_cachep; #define NR_GRPINFO_CACHES 8 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; -static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { +static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", "ext4_groupinfo_64k", "ext4_groupinfo_128k" @@ -2393,7 +2393,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) return 0; size = roundup_pow_of_two(sizeof(*sbi->s_group_info) * size); - new_groupinfo = ext4_kvzalloc(size, GFP_KERNEL); + new_groupinfo = kvzalloc(size, GFP_KERNEL); if (!new_groupinfo) { ext4_msg(sb, KERN_ERR, "can't allocate buddy meta group"); return -ENOMEM; @@ -5277,3 +5277,52 @@ int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range) range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; return ret; } + +/* Iterate all the free extents in the group. 
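+ * The group lock is dropped around each formatter invocation, so the
+ * callback may sleep, but the buddy bitmap can change while it runs.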
*/ +int +ext4_mballoc_query_range( + struct super_block *sb, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t end, + ext4_mballoc_query_range_fn formatter, + void *priv) +{ + void *bitmap; + ext4_grpblk_t next; + struct ext4_buddy e4b; + int error; + + error = ext4_mb_load_buddy(sb, group, &e4b); + if (error) + return error; + bitmap = e4b.bd_bitmap; + + ext4_lock_group(sb, group); + + start = (e4b.bd_info->bb_first_free > start) ? + e4b.bd_info->bb_first_free : start; + if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) + end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; + + while (start <= end) { + start = mb_find_next_zero_bit(bitmap, end + 1, start); + if (start > end) + break; + next = mb_find_next_bit(bitmap, end + 1, start); + + ext4_unlock_group(sb, group); + error = formatter(sb, group, start, next - start, priv); + if (error) + goto out_unload; + ext4_lock_group(sb, group); + + start = next + 1; + } + + ext4_unlock_group(sb, group); +out_unload: + ext4_mb_unload_buddy(&e4b); + + return error; +} diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 1aba469f82209fe40d602579a17964346749e024..2bed62084a8c1592e638f311a1a9574057a7e6a6 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -199,4 +199,21 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, return ext4_group_first_block_no(sb, fex->fe_group) + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } + +typedef int (*ext4_mballoc_query_range_fn)( + struct super_block *sb, + ext4_group_t agno, + ext4_grpblk_t start, + ext4_grpblk_t len, + void *priv); + +int +ext4_mballoc_query_range( + struct super_block *sb, + ext4_group_t agno, + ext4_grpblk_t start, + ext4_grpblk_t end, + ext4_mballoc_query_range_fn formatter, + void *priv); + #endif diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 07e5e140577176e13db84089eaa048713ddb4797..b81f7d46f344d482d4a7ab15a5cc1ad3654b3259 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1237,37 +1237,24 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) } /* - * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. + * Test whether a directory entry matches the filename being searched for. * - * `len <= EXT4_NAME_LEN' is guaranteed by caller. - * `de != NULL' is guaranteed by caller. + * Return: %true if the directory entry matches, otherwise %false. */ -static inline int ext4_match(struct ext4_filename *fname, - struct ext4_dir_entry_2 *de) +static inline bool ext4_match(const struct ext4_filename *fname, + const struct ext4_dir_entry_2 *de) { - const void *name = fname_name(fname); - u32 len = fname_len(fname); + struct fscrypt_name f; if (!de->inode) - return 0; + return false; + f.usr_fname = fname->usr_fname; + f.disk_name = fname->disk_name; #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (unlikely(!name)) { - if (fname->usr_fname->name[0] == '_') { - int ret; - if (de->name_len < 16) - return 0; - ret = memcmp(de->name + de->name_len - 16, - fname->crypto_buf.name + 8, 16); - return (ret == 0) ? 1 : 0; - } - name = fname->crypto_buf.name; - len = fname->crypto_buf.len; - } + f.crypto_buf = fname->crypto_buf; #endif - if (de->name_len != len) - return 0; - return (memcmp(de->name, name, len) == 0) ? 
1 : 0; + return fscrypt_match_name(&f, de->name, de->name_len); } /* @@ -1281,48 +1268,31 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct ext4_dir_entry_2 * de; char * dlimit; int de_len; - int res; de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ - if ((char *) de + de->name_len <= dlimit) { - res = ext4_match(fname, de); - if (res < 0) { - res = -1; - goto return_result; - } - if (res > 0) { - /* found a match - just to be sure, do - * a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, - bh->b_data, - bh->b_size, offset)) { - res = -1; - goto return_result; - } - *res_dir = de; - res = 1; - goto return_result; - } - + if ((char *) de + de->name_len <= dlimit && + ext4_match(fname, de)) { + /* found a match - just to be sure, do + * a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, + bh->b_size, offset)) + return -1; + *res_dir = de; + return 1; } /* prevent looping on a bad block */ de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); - if (de_len <= 0) { - res = -1; - goto return_result; - } + if (de_len <= 0) + return -1; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } - - res = 0; -return_result: - return res; + return 0; } static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, @@ -1616,16 +1586,9 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - int nokey = ext4_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - if (nokey) { - iput(inode); - return ERR_PTR(-ENOKEY); - } ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu", - (unsigned long) dir->i_ino, - (unsigned long) inode->i_ino); + dir->i_ino, inode->i_ino); iput(inode); return ERR_PTR(-EPERM); } @@ -1833,24 +1796,15 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, int nlen, rlen; unsigned int offset = 0; char *top; - int res; de = (struct ext4_dir_entry_2 *)buf; top = buf + buf_size - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, - buf, buf_size, offset)) { - res = -EFSCORRUPTED; - goto return_result; - } - /* Provide crypto context and crypto buffer to ext4 match */ - res = ext4_match(fname, de); - if (res < 0) - goto return_result; - if (res > 0) { - res = -EEXIST; - goto return_result; - } + buf, buf_size, offset)) + return -EFSCORRUPTED; + if (ext4_match(fname, de)) + return -EEXIST; nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? 
rlen - nlen : rlen) >= reclen) @@ -1858,22 +1812,17 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } - if ((char *) de > top) - res = -ENOSPC; - else { - *dest_de = de; - res = 0; - } -return_result: - return res; + return -ENOSPC; + + *dest_de = de; + return 0; } -int ext4_insert_dentry(struct inode *dir, - struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - struct ext4_filename *fname) +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname) { int nlen, rlen; @@ -1892,7 +1841,6 @@ int ext4_insert_dentry(struct inode *dir, ext4_set_de_type(inode->i_sb, de, inode->i_mode); de->name_len = fname_len(fname); memcpy(de->name, fname_name(fname), fname_len(fname)); - return 0; } /* @@ -1928,11 +1876,8 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, return err; } - /* By now the buffer is marked for journaling. Due to crypto operations, - * the following function call may fail */ - err = ext4_insert_dentry(dir, inode, de, blocksize, fname); - if (err < 0) - return err; + /* By now the buffer is marked for journaling */ + ext4_insert_dentry(inode, de, blocksize, fname); /* * XXX shouldn't update any times until successful diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 208241b06662fbccfd741340ba167e68a711418e..1a82138ba7391ac8fd960c0294715d13051fb232 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -297,8 +297,17 @@ static void ext4_end_bio(struct bio *bio) { ext4_io_end_t *io_end = bio->bi_private; sector_t bi_sector = bio->bi_iter.bi_sector; + char b[BDEVNAME_SIZE]; - BUG_ON(!io_end); + if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n", + bdevname(bio->bi_bdev, b), + (long long) bio->bi_iter.bi_sector, + (unsigned) bio_sectors(bio), + bio->bi_error)) { + ext4_finish_bio(bio); + bio_put(bio); + return; + } bio->bi_end_io = NULL; if (bio->bi_error) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a9448db1cf7e87c2bd41daac6fbab7729bc87ecc..0b177da9ea8271f4a46208937d3c43fd20c0e9b7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -49,6 +50,7 @@ #include "xattr.h" #include "acl.h" #include "mballoc.h" +#include "fsmap.h" #define CREATE_TRACE_POINTS #include @@ -839,6 +841,28 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi) } } +#ifdef CONFIG_QUOTA +static int ext4_quota_off(struct super_block *sb, int type); + +static inline void ext4_quota_off_umount(struct super_block *sb) +{ + int type; + + if (ext4_has_feature_quota(sb)) { + dquot_disable(sb, -1, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + } else { + /* Use our quota_off function to clear inode flags etc. 
*/ + for (type = 0; type < EXT4_MAXQUOTAS; type++) + ext4_quota_off(sb, type); + } +} +#else +static inline void ext4_quota_off_umount(struct super_block *sb) +{ +} +#endif + static void ext4_put_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -847,7 +871,7 @@ static void ext4_put_super(struct super_block *sb) int i, err; ext4_unregister_li_request(sb); - dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + ext4_quota_off_umount(sb); flush_workqueue(sbi->rsv_conversion_wq); destroy_workqueue(sbi->rsv_conversion_wq); @@ -1208,7 +1232,7 @@ static const struct fscrypt_operations ext4_cryptops = { #endif #ifdef CONFIG_QUOTA -static char *quotatypes[] = INITQFNAMES; +static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); @@ -1218,7 +1242,6 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); -static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -1422,7 +1445,8 @@ static ext4_fsblk_t get_sb_block(void **data) } #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) -static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" +static const char deprecated_msg[] = + "Mount option \"%s\" will be removed by %s\n" "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; #ifdef CONFIG_QUOTA @@ -2132,7 +2156,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) return 0; size = roundup_pow_of_two(size * sizeof(struct flex_groups)); - new_groups = ext4_kvzalloc(size, GFP_KERNEL); + new_groups = kvzalloc(size, GFP_KERNEL); if (!new_groups) { ext4_msg(sb, KERN_ERR, "not enough memory for %d flex groups", size / (int) sizeof(struct flex_groups)); @@ -3866,7 +3890,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } } - sbi->s_group_desc = ext4_kvmalloc(db_count * + sbi->s_group_desc = kvmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); if (sbi->s_group_desc == NULL) { @@ -3877,6 +3901,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) bgl_lock_init(sbi->s_blockgroup_lock); + /* Pre-read the descriptors into the buffer cache */ + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logical_sb_block, i); + sb_breadahead(sb, block); + } + for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); sbi->s_group_desc[i] = sb_bread_unmovable(sb, block); @@ -4629,7 +4659,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (sync) { unlock_buffer(sbh); error = __sync_dirty_buffer(sbh, - test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC); + REQ_SYNC | (test_opt(sb, BARRIER) ? 
REQ_FUA : 0)); if (error) return error; @@ -5344,11 +5374,33 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, if (err) return err; } + lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_QUOTA); err = dquot_quota_on(sb, type, format_id, path); - if (err) + if (err) { lockdep_set_quota_inode(path->dentry->d_inode, I_DATA_SEM_NORMAL); + } else { + struct inode *inode = d_inode(path->dentry); + handle_t *handle; + + /* + * Set inode flags to prevent userspace from messing with quota + * files. If this fails, we return success anyway since quotas + * are already enabled and this is not a hard failure. + */ + inode_lock(inode); + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); + if (IS_ERR(handle)) + goto unlock_inode; + EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + unlock_inode: + inode_unlock(inode); + } return err; } @@ -5422,24 +5474,39 @@ static int ext4_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; handle_t *handle; + int err; /* Force all delayed allocation blocks to be allocated. * Caller already holds s_umount sem */ if (test_opt(sb, DELALLOC)) sync_filesystem(sb); - if (!inode) + if (!inode || !igrab(inode)) goto out; - /* Update modification times of quota files when userspace can - * start looking at them */ + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + /* + * Update modification times of quota files when userspace can + * start looking at them. If we fail, we return success anyway since + * this is not a hard failure and quotas are already disabled. + */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) - goto out; + goto out_unlock; + EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); - +out_unlock: + inode_unlock(inode); +out_put: + iput(inode); + return err; out: return dquot_quota_off(sb, type); } diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 42145be5c6b4dea4258b35586693ae336f047d25..d74dc5f81a0483b2bfeab1ab1216870ce8d25506 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -34,7 +34,7 @@ typedef enum { ptr_ext4_super_block_offset, } attr_ptr_t; -static const char *proc_dirname = "fs/ext4"; +static const char proc_dirname[] = "fs/ext4"; static struct proc_dir_entry *ext4_proc_root; struct ext4_attr { @@ -375,7 +375,7 @@ static const struct file_operations ext4_seq_##name##_fops = { \ PROC_FILE_SHOW_DEFN(es_shrinker_info); PROC_FILE_SHOW_DEFN(options); -static struct ext4_proc_files { +static const struct ext4_proc_files { const char *name; const struct file_operations *fops; } proc_files[] = { @@ -388,7 +388,7 @@ static struct ext4_proc_files { int ext4_register_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_proc_files *p; + const struct ext4_proc_files *p; int err; sbi->s_kobj.kset = &ext4_kset; @@ -412,7 +412,7 @@ int ext4_register_sysfs(struct super_block *sb) void ext4_unregister_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_proc_files *p; + const struct ext4_proc_files *p; if (sbi->s_proc) { for (p = proc_files; p->name; p++) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 
996e7900d4c8ea2d16f65f47e3f1082552b2fc66..8fb7ce14e6ebe3a4ba1f2bd2da36b314819d24a5 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -78,10 +78,8 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct mb_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); -static int ext4_xattr_list(struct dentry *dentry, char *buffer, - size_t buffer_size); -static const struct xattr_handler *ext4_xattr_handler_map[] = { +static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, @@ -163,20 +161,9 @@ ext4_xattr_handler(int name_index) return handler; } -/* - * Inode operation listxattr() - * - * d_inode(dentry)->i_mutex: don't care - */ -ssize_t -ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - return ext4_xattr_list(dentry, buffer, size); -} - static int -ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, - void *value_start) +ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, + void *value_start) { struct ext4_xattr_entry *e = entry; @@ -230,8 +217,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) return -EFSCORRUPTED; if (!ext4_xattr_block_csum_verify(inode, bh)) return -EFSBADCRC; - error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, - bh->b_data); + error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, + bh->b_data); if (!error) set_buffer_verified(bh); return error; @@ -246,7 +233,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, if (end - (void *)header < sizeof(*header) + sizeof(u32) || (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) goto errout; - error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); + error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); errout: if (error) __ext4_error_inode(inode, function, line, 0, @@ -257,20 +244,9 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, #define xattr_check_inode(inode, header, end) \ __xattr_check_inode((inode), (header), (end), __func__, __LINE__) -static inline int -ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) -{ - size_t value_size = le32_to_cpu(entry->e_value_size); - - if (entry->e_value_block != 0 || value_size > size || - le16_to_cpu(entry->e_value_offs) + value_size > size) - return -EFSCORRUPTED; - return 0; -} - static int ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, - const char *name, size_t size, int sorted) + const char *name, int sorted) { struct ext4_xattr_entry *entry; size_t name_len; @@ -290,8 +266,6 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, break; } *pentry = entry; - if (!cmp && ext4_xattr_check_entry(entry, size)) - return -EFSCORRUPTED; return cmp ? 
-ENODATA : 0; } @@ -319,7 +293,6 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(inode, bh)) { -bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EFSCORRUPTED; @@ -327,9 +300,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, } ext4_xattr_cache_insert(ext4_mb_cache, bh); entry = BFIRST(bh); - error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); - if (error == -EFSCORRUPTED) - goto bad_block; + error = ext4_xattr_find_entry(&entry, name_index, name, 1); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); @@ -366,13 +337,12 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - entry = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; error = xattr_check_inode(inode, header, end); if (error) goto cleanup; - error = ext4_xattr_find_entry(&entry, name_index, name, - end - (void *)entry, 0); + entry = IFIRST(header); + error = ext4_xattr_find_entry(&entry, name_index, name, 0); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); @@ -519,7 +489,9 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) } /* - * ext4_xattr_list() + * Inode operation listxattr() + * + * d_inode(dentry)->i_rwsem: don't care * * Copy a list of attribute names into the buffer * provided, or compute the buffer size required. @@ -528,8 +500,8 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) * Returns a negative error number on failure, or the number of bytes * used / required on success. */ -static int -ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +ssize_t +ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret, ret2; @@ -804,7 +776,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, bs->s.end = bs->bh->b_data + bs->bh->b_size; bs->s.here = bs->s.first; error = ext4_xattr_find_entry(&bs->s.here, i->name_index, - i->name, bs->bh->b_size, 1); + i->name, 1); if (error && error != -ENODATA) goto cleanup; bs->s.not_found = error; @@ -1076,8 +1048,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return error; /* Find the named attribute. 
*/ error = ext4_xattr_find_entry(&is->s.here, i->name_index, - i->name, is->s.end - - (void *)is->s.base, 0); + i->name, 0); if (error && error != -ENODATA) return error; is->s.not_found = error; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0339daf4ca02fac4090fc59783bbfa745bccc9eb..ea9c317b5916ee51610069c2c58cdb504f95969d 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -275,10 +275,11 @@ static int f2fs_write_meta_pages(struct address_space *mapping, get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, META); + /* if locked failed, cp will flush dirty pages instead */ + if (!mutex_trylock(&sbi->cp_mutex)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - mutex_lock(&sbi->cp_mutex); + trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); written = sync_meta_pages(sbi, META, wbc->nr_to_write); mutex_unlock(&sbi->cp_mutex); @@ -567,7 +568,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) if (ni.blk_addr != NULL_ADDR) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x), run fsck to fix.", + "%s: orphan failed (ino=%x) by kernel, retry mount.", __func__, ino); return -EIO; } @@ -677,7 +678,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); - if (crc_offset >= blk_size) { + if (crc_offset > (blk_size - sizeof(__le32))) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc_offset: %zu", crc_offset); return -EINVAL; @@ -816,7 +817,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + if (!f2fs_is_volatile_file(inode)) + list_add_tail(&F2FS_I(inode)->dirty_list, + &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } @@ -941,6 +944,19 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) return 0; } +static void __prepare_cp_block(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; + + next_free_nid(sbi, &last_nid); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = cpu_to_le32(last_nid); +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -964,21 +980,26 @@ static int block_operations(struct f2fs_sb_info *sbi) err = sync_dirty_inodes(sbi, DIR_INODE); if (err) goto out; + cond_resched(); goto retry_flush_dents; } + /* + * POR: we should ensure that there are no dirty node pages + * until finishing nat/sit flush. inode->i_blocks can be updated. + */ + down_write(&sbi->node_change); + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); err = f2fs_sync_inode_meta(sbi); if (err) goto out; + cond_resched(); goto retry_flush_dents; } - /* - * POR: we should ensure that there are no dirty node pages - * until finishing nat/sit flush. 
- */ retry_flush_nodes: down_write(&sbi->node_write); @@ -986,11 +1007,20 @@ static int block_operations(struct f2fs_sb_info *sbi) up_write(&sbi->node_write); err = sync_node_pages(sbi, &wbc); if (err) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); goto out; } + cond_resched(); goto retry_flush_nodes; } + + /* + * sbi->node_change is used only for AIO write_begin path which produces + * dirty node blocks and some checkpoint values by block allocation. + */ + __prepare_cp_block(sbi); + up_write(&sbi->node_change); out: blk_finish_plug(&plug); return err; @@ -1024,16 +1054,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_lock(&sbi->cp_lock); - if (cpc->reason == CP_UMOUNT && ckpt->cp_pack_total_block_count > + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) disable_nat_bits(sbi, false); - if (cpc->reason == CP_UMOUNT) + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + + if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - if (cpc->reason == CP_FASTBOOT) + if (cpc->reason & CP_FASTBOOT) __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); else __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); @@ -1057,7 +1091,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; - nid_t last_nid = nm_i->next_scan_nid; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; @@ -1074,14 +1107,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) return -EIO; } - next_free_nid(sbi, &last_nid); - /* * modify checkpoint * version number is already updated */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); - ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = @@ -1100,10 +1130,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); } - ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); - ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); - ckpt->next_free_nid = cpu_to_le32(last_nid); - /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); spin_lock(&sbi->cp_lock); @@ -1143,7 +1169,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* write nat bits */ if (enabled_nat_bits(sbi, cpc)) { __u64 cp_ver = cur_cp_version(ckpt); - unsigned int i; block_t blk; cp_ver |= ((__u64)crc32 << 32); @@ -1250,8 +1275,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && - (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || - (cpc->reason == CP_DISCARD && !sbi->discard_blks))) + ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || + ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) goto out; if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1273,7 +1298,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_merged_bios(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { if 
(!exist_trim_candidates(sbi, cpc)) { unblock_operations(sbi); goto out; @@ -1311,7 +1336,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); - if (cpc->reason == CP_RECOVERY) + if (cpc->reason & CP_RECOVERY) f2fs_msg(sbi->sb, KERN_NOTICE, "checkpoint: version = %llx", ckpt_ver); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1602b4bccae61e8ac9fcff3d4810b953294773e2..7c0f6bdf817d4370b74b36de9096fad73c75fbd4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -309,7 +309,7 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = REQ_META | REQ_PRIO; + io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC; if (!test_opt(sbi, NOBARRIER)) io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA; } @@ -341,7 +341,7 @@ void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) /* * Fill the locked page with data located in the block address. - * Return unlocked page. + * A caller needs to unlock the page on failure. */ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { @@ -362,6 +362,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); + + if (!is_read_io(fio->op)) + inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); return 0; } @@ -787,6 +790,21 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) return err; } +static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (lock) + down_read(&sbi->node_change); + else + up_read(&sbi->node_change); + } else { + if (lock) + f2fs_lock_op(sbi); + else + f2fs_unlock_op(sbi); + } +} + /* * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with * f2fs_map_blocks structure. 
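The __do_map_lock() helper added above concentrates the choice of lock in one place: the AIO preallocation path (F2FS_GET_BLOCK_PRE_AIO) only needs to keep checkpoint from flushing node changes underneath it, so it takes sbi->node_change as a reader, while every other create path still takes the heavier f2fs_lock_op(). A rough userspace sketch of the same dispatch shape, using illustrative names rather than the real f2fs API:

	#include <pthread.h>
	#include <stdbool.h>

	enum map_flag { MAP_DEFAULT, MAP_PRE_AIO };

	struct sb_info {
		pthread_rwlock_t node_change;	/* read side taken by pre-AIO callers */
		pthread_rwlock_t cp_rwsem;	/* read side taken by f2fs_lock_op() */
	};

	static void do_map_lock(struct sb_info *sbi, enum map_flag flag, bool lock)
	{
		/* pick the lighter lock for AIO preallocation */
		pthread_rwlock_t *l = (flag == MAP_PRE_AIO) ? &sbi->node_change
							    : &sbi->cp_rwsem;

		if (lock)
			pthread_rwlock_rdlock(l);
		else
			pthread_rwlock_unlock(l);
	}

Callers pair do_map_lock(sbi, flag, true) with a matching do_map_lock(sbi, flag, false) on every exit path, which is exactly how the f2fs_map_blocks() hunks below use __do_map_lock().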
@@ -829,7 +847,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, next_dnode: if (create) - f2fs_lock_op(sbi); + __do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -939,7 +957,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, f2fs_put_dnode(&dn); if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -948,7 +966,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, f2fs_put_dnode(&dn); unlock_out: if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } out: @@ -1151,9 +1169,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping, for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { - prefetchw(&page->flags); if (pages) { page = list_last_entry(pages, struct page, lru); + + prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, @@ -1283,17 +1302,83 @@ static int f2fs_read_data_pages(struct file *file, return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); } +static int encrypt_one_page(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + gfp_t gfp_flags = GFP_NOFS; + + if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + return 0; + + /* wait for GCed encrypted page writeback */ + f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr); + +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + PAGE_SIZE, 0, fio->page->index, gfp_flags); + if (!IS_ERR(fio->encrypted_page)) + return 0; + + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_bios(fio->sbi); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); +} + +static inline bool need_inplace_update(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) + return false; + if (is_cold_data(fio->page)) + return false; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return false; + + return need_inplace_update_policy(inode, fio); +} + +static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) +{ + if (fio->old_blkaddr == NEW_ADDR) + return false; + if (fio->old_blkaddr == NULL_ADDR) + return false; + return true; +} + int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; + struct extent_info ei = {0,0,0}; + bool ipu_force = false; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && + f2fs_lookup_extent_cache(inode, page->index, &ei)) { + fio->old_blkaddr = ei.blk + page->index - ei.fofs; + + if (valid_ipu_blkaddr(fio)) { + ipu_force = true; + fio->need_lock = false; + goto got_it; + } + } + + if (fio->need_lock) + f2fs_lock_op(fio->sbi); + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) - return err; + goto out; fio->old_blkaddr = dn.data_blkaddr; @@ -1302,31 +1387,10 @@ int do_write_data_page(struct f2fs_io_info *fio) ClearPageUptodate(page); goto out_writepage; } - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - gfp_t gfp_flags = GFP_NOFS; - - /* wait for GCed encrypted page writeback */ - 
f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->old_blkaddr); -retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - PAGE_SIZE, 0, - fio->page->index, - gfp_flags); - if (IS_ERR(fio->encrypted_page)) { - err = PTR_ERR(fio->encrypted_page); - if (err == -ENOMEM) { - /* flush pending ios and wait for a while */ - f2fs_flush_merged_bios(F2FS_I_SB(inode)); - congestion_wait(BLK_RW_ASYNC, HZ/50); - gfp_flags |= __GFP_NOFAIL; - err = 0; - goto retry_encrypt; - } - goto out_writepage; - } - } +got_it: + err = encrypt_one_page(fio); + if (err) + goto out_writepage; set_page_writeback(page); @@ -1334,22 +1398,27 @@ int do_write_data_page(struct f2fs_io_info *fio) * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(fio->old_blkaddr != NEW_ADDR && - !is_cold_data(page) && - !IS_ATOMIC_WRITTEN_PAGE(page) && - need_inplace_update(inode))) { - rewrite_data_page(fio); + if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + f2fs_put_dnode(&dn); + if (fio->need_lock) + f2fs_unlock_op(fio->sbi); + err = rewrite_data_page(fio); + trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); - trace_f2fs_do_write_data_page(page, IPU); - } else { - write_data_page(&dn, fio); - trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + return err; } + + /* LFS mode write path */ + write_data_page(&dn, fio); + trace_f2fs_do_write_data_page(page, OPU); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); out_writepage: f2fs_put_dnode(&dn); +out: + if (fio->need_lock) + f2fs_unlock_op(fio->sbi); return err; } @@ -1370,9 +1439,11 @@ static int __write_data_page(struct page *page, bool *submitted, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, .submitted = false, + .need_lock = true, }; trace_f2fs_writepage(page, DATA); @@ -1408,6 +1479,7 @@ static int __write_data_page(struct page *page, bool *submitted, /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { + fio.need_lock = false; err = do_write_data_page(&fio); goto done; } @@ -1416,6 +1488,8 @@ static int __write_data_page(struct page *page, bool *submitted, need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; + else + set_inode_flag(inode, FI_HOT_DATA); err = -EAGAIN; if (f2fs_has_inline_data(inode)) { @@ -1423,12 +1497,12 @@ static int __write_data_page(struct page *page, bool *submitted, if (!err) goto out; } - f2fs_lock_op(sbi); + if (err == -EAGAIN) err = do_write_data_page(&fio); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - f2fs_unlock_op(sbi); + done: if (err && err != -ENOENT) goto redirty_out; @@ -1441,12 +1515,14 @@ static int __write_data_page(struct page *page, bool *submitted, if (wbc->for_reclaim) { f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index, DATA, WRITE); + clear_inode_flag(inode, FI_HOT_DATA); remove_dirty_inode(inode); submitted = NULL; } unlock_page(page); - f2fs_balance_fs(sbi, need_balance_fs); + if (!S_ISDIR(inode->i_mode)) + f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, DATA, WRITE); @@ -1495,6 +1571,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pagevec_init(&pvec, 0); 
+ if (get_dirty_pages(mapping->host) <= + SM_I(F2FS_M_SB(mapping))->min_hot_blocks) + set_inode_flag(mapping->host, FI_HOT_DATA); + else + clear_inode_flag(mapping->host, FI_HOT_DATA); + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1580,8 +1662,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, last_idx = page->index; } - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { + /* give priority to WB_SYNC threads */ + if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) || + --wbc->nr_to_write <= 0) && + wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } @@ -1637,9 +1721,18 @@ static int f2fs_write_data_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, DATA); + /* to avoid splitting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req); + else if (atomic_read(&sbi->wb_sync_req)) + goto skip_write; + blk_start_plug(&plug); ret = f2fs_write_cache_pages(mapping, wbc); blk_finish_plug(&plug); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. @@ -1687,7 +1780,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); locked = true; } restart: @@ -1723,7 +1816,8 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + true); locked = true; goto restart; } @@ -1737,7 +1831,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_unlock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); return err; } @@ -1951,7 +2045,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) - return; + return drop_inmem_page(inode, page); set_page_private(page, 0); ClearPagePrivate(page); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ee2d0a485fc3478fc5f93b5b85c6dad0431e8ea0..87f449845f5f9a8010ee13c40548fb6620802872 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -51,15 +51,26 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); + si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); - if (SM_I(sbi) && SM_I(sbi)->fcc_info) - si->nr_flush = - atomic_read(&SM_I(sbi)->fcc_info->submit_flush); - if (SM_I(sbi) && SM_I(sbi)->dcc_info) - si->nr_discard = - atomic_read(&SM_I(sbi)->dcc_info->submit_discard); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) { + si->nr_flushed = + atomic_read(&SM_I(sbi)->fcc_info->issued_flush); + si->nr_flushing = + atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + } + if (SM_I(sbi) && SM_I(sbi)->dcc_info) { + si->nr_discarded = + atomic_read(&SM_I(sbi)->dcc_info->issued_discard); + si->nr_discarding = + atomic_read(&SM_I(sbi)->dcc_info->issing_discard);
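+		/*
+		 * Each command type keeps a pair of counters: issued_*
+		 * counts commands that have completed and issing_* counts
+		 * commands still in flight. A sketch of the accounting
+		 * around a single command (illustrative only, not the
+		 * exact f2fs call sites):
+		 *
+		 *	atomic_inc(&dcc->issing_discard);	// on submission
+		 *	...
+		 *	atomic_dec(&dcc->issing_discard);	// on completion
+		 *	atomic_inc(&dcc->issued_discard);
+		 */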
+ si->nr_discard_cmd = + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; + } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -86,6 +97,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) @@ -99,8 +111,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); si->curseg[i] = curseg->segno; - si->cursec[i] = curseg->segno / sbi->segs_per_sec; - si->curzone[i] = si->cursec[i] / sbi->secs_per_zone; + si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); + si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); } for (i = 0; i < 2; i++) { @@ -124,10 +136,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + blks_per_sec = BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); dist = abs(vblocks - hblks_per_sec); bimodal += dist * dist; @@ -156,7 +168,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi) if (si->base_mem) goto get_cache; - si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + /* build stat */ + si->base_mem = sizeof(struct f2fs_stat_info); + + /* build superblock */ + si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; @@ -208,8 +224,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build merge flush thread */ if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); - if (SM_I(sbi)->dcc_info) + if (SM_I(sbi)->dcc_info) { si->cache_mem += sizeof(struct discard_cmd_control); + si->cache_mem += sizeof(struct discard_cmd) * + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + } /* free nids */ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + @@ -330,11 +349,16 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: %4d, Discard: %4d)\n", + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_wb_cp_data, si->nr_wb_data, - si->nr_flush, si->nr_discard); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt); + si->nr_flushing, si->nr_flushed, + si->nr_discarding, si->nr_discarded, + si->nr_discard_cmd, si->undiscard_blks); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + "volatile IO: %4d (Max. 
%4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->vw_cnt, si->max_vw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", @@ -347,8 +371,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", - si->free_nids, si->alloc_nids); + seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", + si->free_nids, si->avail_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); @@ -434,7 +458,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inplace_count, 0); atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); + atomic_set(&sbi->max_vw_cnt, 0); mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8d5c62b07b283f53e90ded2366c8bb9375409fa2..94756f55a97e7d86052cdb49ce79fcc131738cfd 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,7 +94,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; @@ -111,8 +111,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct fscrypt_str de_name = FSTR_INIT(NULL, 0); - struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -130,17 +128,9 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, continue; } - /* encrypted case */ - de_name.name = d->filename[bit_pos]; - de_name.len = le16_to_cpu(de->name_len); - - /* show encrypted name */ - if (fname->hash) { - if (de->hash_code == cpu_to_le32(fname->hash)) - goto found; - } else if (de_name.len == name->len && - de->hash_code == namehash && - !memcmp(de_name.name, name->name, name->len)) + if (de->hash_code == namehash && + fscrypt_match_name(fname, d->filename[bit_pos], + le16_to_cpu(de->name_len))) goto found; if (max_slots && max_len > *max_slots) @@ -170,12 +160,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; - f2fs_hash_t namehash; - - if(fname->hash) - namehash = cpu_to_le32(fname->hash); - else - namehash = f2fs_dentry_hash(&name); + f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -207,13 +192,9 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_put_page(dentry_page, 0); } - /* This is to increase the speed of f2fs_create */ - if (!de && room) { - F2FS_I(dir)->task = current; - if (F2FS_I(dir)->chash != namehash) { - F2FS_I(dir)->chash = namehash; - F2FS_I(dir)->clevel = level; - } + if (!de && room && F2FS_I(dir)->chash != namehash) { + F2FS_I(dir)->chash = namehash; + F2FS_I(dir)->clevel = level; } return de; @@ -254,6 +235,9 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, break; } out: + /* This is to increase the speed of f2fs_create */ + if (!de) + F2FS_I(dir)->task = current; return de; } @@ -337,24 +321,6 
@@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name) -{ - struct page *page; - - if (file_enc_name(to)) - return 0; - - page = get_node_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(page)) - return PTR_ERR(page); - - init_dent_inode(name, page); - f2fs_put_page(page, 1); - - return 0; -} - void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { @@ -384,7 +350,7 @@ static int make_empty_dir(struct inode *inode, dentry_blk = kmap_atomic(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); kunmap_atomic(dentry_blk); @@ -438,8 +404,11 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (new_name) + if (new_name) { init_dent_inode(new_name, page); + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); + } /* * This file should be checkpointed during fsync. @@ -542,7 +511,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, level = 0; slots = GET_DENTRY_SLOTS(new_name->len); - dentry_hash = f2fs_dentry_hash(new_name); + dentry_hash = f2fs_dentry_hash(new_name, NULL); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -599,11 +568,9 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -911,7 +878,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_blk = kmap(dentry_page); - make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c6934f014e0f1d9a458a6489422612a6484fe198..2f98d70397013317c393257c3a957f4fa12e1c00 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -18,6 +18,179 @@ #include "node.h" #include +static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, + unsigned int ofs) +{ + if (cached_re) { + if (cached_re->ofs <= ofs && + cached_re->ofs + cached_re->len > ofs) { + return cached_re; + } + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, + unsigned int ofs) +{ + struct rb_node *node = root->rb_node; + struct rb_entry *re; + + while (node) { + re = rb_entry(node, struct rb_entry, rb_node); + + if (ofs < re->ofs) + node = node->rb_left; + else if (ofs >= re->ofs + re->len) + node = node->rb_right; + else + return re; + } + return NULL; +} + +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs) +{ + struct rb_entry *re; + + re = __lookup_rb_tree_fast(cached_re, ofs); + if (!re) + return __lookup_rb_tree_slow(root, ofs); + + return re; +} + +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs) +{ + struct rb_node **p = &root->rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (ofs < re->ofs) + 
p = &(*p)->rb_left; + else if (ofs >= re->ofs + re->len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } + + return p; +} + +/* + * lookup the rb entry at position @ofs in the rb-tree, + * if hit, return the entry, otherwise, return NULL + * @prev_entry: entry before @ofs + * @next_entry: entry after @ofs + * @insert_p: insert point for a new entry at @ofs + * in order to simplify the insertion afterwards. + * tree must stay unchanged between lookup and insertion. + */ +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, + unsigned int ofs, + struct rb_entry **prev_entry, + struct rb_entry **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent, + bool force) +{ + struct rb_node **pnode = &root->rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct rb_entry *re = cached_re; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + if (re) { + if (re->ofs <= ofs && re->ofs + re->len > ofs) + goto lookup_neighbors; + } + + while (*pnode) { + parent = *pnode; + re = rb_entry(*pnode, struct rb_entry, rb_node); + + if (ofs < re->ofs) + pnode = &(*pnode)->rb_left; + else if (ofs >= re->ofs + re->len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + re = rb_entry(parent, struct rb_entry, rb_node); + tmp_node = parent; + if (parent && ofs > re->ofs) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + + tmp_node = parent; + if (parent && ofs < re->ofs) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + return NULL; + +lookup_neighbors: + if (ofs == re->ofs || force) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&re->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + if (ofs == re->ofs + re->len - 1 || force) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&re->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + return re; +} + +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first(root), *next; + struct rb_entry *cur_re, *next_re; + + if (!cur) + return true; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_re = rb_entry(cur, struct rb_entry, rb_node); + next_re = rb_entry(next, struct rb_entry, rb_node); + + if (cur_re->ofs + cur_re->len > next_re->ofs) { + f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_re->ofs, cur_re->len, + next_re->ofs, next_re->len); + return false; + } + + cur = next; + } +#endif + return true; +} + static struct kmem_cache *extent_tree_slab; static struct kmem_cache *extent_node_slab; @@ -102,36 +275,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, unsigned int fofs) -{ - struct rb_node *node = et->root.rb_node; - struct extent_node *en = et->cached_en; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) { - stat_inc_cached_node_hit(sbi); - return en; - } - } - - while (node) { - en = rb_entry(node, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) { - node = node->rb_left; - } else if (fofs >=
en->ei.fofs + en->ei.len) { - node = node->rb_right; - } else { - stat_inc_rbtree_node_hit(sbi); - return en; - } - } - return NULL; -} - static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei) { @@ -237,17 +380,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = __lookup_extent_tree(sbi, et, pgofs); - if (en) { - *ei = en->ei; - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; - } - spin_unlock(&sbi->extent_lock); - ret = true; + en = (struct extent_node *)__lookup_rb_tree(&et->root, + (struct rb_entry *)et->cached_en, pgofs); + if (!en) + goto out; + + if (en == et->cached_en) + stat_inc_cached_node_hit(sbi); + else + stat_inc_rbtree_node_hit(sbi); + + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; } + spin_unlock(&sbi->extent_lock); + ret = true; out: stat_inc_total_hit(sbi); read_unlock(&et->lock); @@ -256,83 +406,6 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, return ret; } - -/* - * lookup extent at @fofs, if hit, return the extent - * if not, return NULL and - * @prev_ex: extent before fofs - * @next_ex: extent after fofs - * @insert_p: insert point for new extent at fofs - * in order to simpfy the insertion after. - * tree must stay unchanged between lookup and insertion. - */ -static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, - unsigned int fofs, - struct extent_node **prev_ex, - struct extent_node **next_ex, - struct rb_node ***insert_p, - struct rb_node **insert_parent) -{ - struct rb_node **pnode = &et->root.rb_node; - struct rb_node *parent = NULL, *tmp_node; - struct extent_node *en = et->cached_en; - - *insert_p = NULL; - *insert_parent = NULL; - *prev_ex = NULL; - *next_ex = NULL; - - if (RB_EMPTY_ROOT(&et->root)) - return NULL; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - goto lookup_neighbors; - } - - while (*pnode) { - parent = *pnode; - en = rb_entry(*pnode, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) - pnode = &(*pnode)->rb_left; - else if (fofs >= en->ei.fofs + en->ei.len) - pnode = &(*pnode)->rb_right; - else - goto lookup_neighbors; - } - - *insert_p = pnode; - *insert_parent = parent; - - en = rb_entry(parent, struct extent_node, rb_node); - tmp_node = parent; - if (parent && fofs > en->ei.fofs) - tmp_node = rb_next(parent); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - - tmp_node = parent; - if (parent && fofs < en->ei.fofs) - tmp_node = rb_prev(parent); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - return NULL; - -lookup_neighbors: - if (fofs == en->ei.fofs) { - /* lookup prev node for merging backward later */ - tmp_node = rb_prev(&en->rb_node); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - if (fofs == en->ei.fofs + en->ei.len - 1) { - /* lookup next node for merging frontward later */ - tmp_node = rb_next(&en->rb_node); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - return en; -} - static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, @@ -387,17 +460,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, goto do_insert; } - while 
(*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) - p = &(*p)->rb_left; - else if (ei->fofs >= en->ei.fofs + en->ei.len) - p = &(*p)->rb_right; - else - f2fs_bug_on(sbi, 1); - } + p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); do_insert: en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) @@ -447,8 +510,11 @@ static void f2fs_update_extent_tree_range(struct inode *inode, __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ - en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, - &insert_p, &insert_parent); + en = (struct extent_node *)__lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false); if (!en) en = next_en; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0a6e115562f62edca5b60ee4c833e889a904c202..2185c7a040a12cf8b6e190df97b58bc7eecc6299 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -50,6 +50,7 @@ enum { FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, + FAULT_TRUNCATE, FAULT_IO, FAULT_CHECKPOINT, FAULT_MAX, @@ -62,7 +63,7 @@ struct f2fs_fault_info { }; extern char *fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif /* @@ -88,9 +89,9 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_ADAPTIVE 0x00020000 #define F2FS_MOUNT_LFS 0x00040000 -#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option) +#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -124,22 +125,20 @@ enum { SIT_BITMAP }; -enum { - CP_UMOUNT, - CP_FASTBOOT, - CP_SYNC, - CP_RECOVERY, - CP_DISCARD, -}; +#define CP_UMOUNT 0x00000001 +#define CP_FASTBOOT 0x00000002 +#define CP_SYNC 0x00000004 +#define CP_RECOVERY 0x00000008 +#define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 #define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ - (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) + (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) -#define MAX_DISCARD_BLOCKS(sbi) \ - ((1 << (sbi)->log_blocks_per_seg) * (sbi)->segs_per_sec) -#define DISCARD_ISSUE_RATE 8 +#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) +#define DISCARD_ISSUE_RATE 8 #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -181,37 +180,63 @@ struct inode_entry { struct inode *inode; /* vfs inode pointer */ }; -/* for the list of blockaddresses to be discarded */ +/* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ - block_t blkaddr; /* block address to be discarded */ - int len; /* # of consecutive blocks of the discard */ + block_t start_blkaddr; /* start blockaddr of current segment */ + unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* max discard pend list number */ +#define 
MAX_PLIST_NUM 512
+#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \
+ (MAX_PLIST_NUM - 1) : (blk_num - 1))
+
 enum {
 D_PREP,
 D_SUBMIT,
 D_DONE,
 };
 
+struct discard_info {
+ block_t lstart; /* logical start address */
+ block_t len; /* length */
+ block_t start; /* actual start address in dev */
+};
+
 struct discard_cmd {
+ struct rb_node rb_node; /* rb node located in rb-tree */
+ union {
+ struct {
+ block_t lstart; /* logical start address */
+ block_t len; /* length */
+ block_t start; /* actual start address in dev */
+ };
+ struct discard_info di; /* discard info */
+
+ };
 struct list_head list; /* command list */
 struct completion wait; /* completion */
- block_t lstart; /* logical start address */
- block_t len; /* length */
- struct bio *bio; /* bio */
- int state; /* state */
+ struct block_device *bdev; /* bdev */
+ unsigned short ref; /* reference count */
+ unsigned char state; /* state */
+ int error; /* bio error */
 };
 
 struct discard_cmd_control {
 struct task_struct *f2fs_issue_discard; /* discard thread */
- struct list_head discard_entry_list; /* 4KB discard entry list */
- int nr_discards; /* # of discards in the list */
- struct list_head discard_cmd_list; /* discard cmd list */
+ struct list_head entry_list; /* 4KB discard entry list */
+ struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */
+ struct list_head wait_list; /* store on-flushing entries */
 wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */
 struct mutex cmd_lock;
- int max_discards; /* max. discards to be issued */
- atomic_t submit_discard; /* # of issued discard */
+ unsigned int nr_discards; /* # of discards in the list */
+ unsigned int max_discards; /* max. discards to be issued */
+ unsigned int undiscard_blks; /* # of undiscard blocks */
+ atomic_t issued_discard; /* # of issued discard */
+ atomic_t issing_discard; /* # of issing discard */
+ atomic_t discard_cmd_cnt; /* # of cached cmd count */
+ struct rb_root root; /* root of discard rb-tree */
 };
 
 /* for the list of fsync inodes, used only during recovery */
@@ -222,13 +247,13 @@ struct fsync_inode_entry {
 block_t last_dentry; /* block address locating the last dentry */
 };
 
-#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats))
-#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits))
+#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats))
+#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits))
 
-#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne)
-#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid)
-#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se)
-#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno)
+#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne)
+#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid)
+#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se)
+#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno)
 
 #define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl))
 #define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl))
@@ -270,11 +295,14 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
 #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3)
 #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4)
 #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5)
-#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6)
+#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32)
 #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7)
-#define 
F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ + struct f2fs_defragment) #define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ struct f2fs_move_range) +#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ + struct f2fs_flush_device) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -311,6 +339,11 @@ struct f2fs_move_range { u64 len; /* size to move */ }; +struct f2fs_flush_device { + u32 dev_num; /* device number to flush */ + u32 segments; /* # of segments to flush */ +}; + /* * For INODE and NODE manager */ @@ -323,26 +356,24 @@ struct f2fs_dentry_ptr { int max; }; -static inline void make_dentry_ptr(struct inode *inode, - struct f2fs_dentry_ptr *d, void *src, int type) +static inline void make_dentry_ptr_block(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) { d->inode = inode; + d->max = NR_DENTRY_IN_BLOCK; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; +} - if (type == 1) { - struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; - - d->max = NR_DENTRY_IN_BLOCK; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } else { - struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; - - d->max = NR_INLINE_DENTRY; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } +static inline void make_dentry_ptr_inline(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t) +{ + d->inode = inode; + d->max = NR_INLINE_DENTRY; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; } /* @@ -374,16 +405,30 @@ enum { /* number of extent info in extent cache we try to shrink */ #define EXTENT_CACHE_SHRINK_NUMBER 128 +struct rb_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ +}; + struct extent_info { unsigned int fofs; /* start offset in a file */ - u32 blk; /* start block address of the extent */ unsigned int len; /* length of the extent */ + u32 blk; /* start block address of the extent */ }; struct extent_node { - struct rb_node rb_node; /* rb node located in rb-tree */ + struct rb_node rb_node; + union { + struct { + unsigned int fofs; + unsigned int len; + u32 blk; + }; + struct extent_info ei; /* extent info */ + + }; struct list_head list; /* node in global extent list of sbi */ - struct extent_info ei; /* extent info */ struct extent_tree *et; /* extent tree pointer */ }; @@ -500,6 +545,24 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->len = len; } +static inline bool __is_discard_mergeable(struct discard_info *back, + struct discard_info *front) +{ + return back->lstart + back->len == front->lstart; +} + +static inline bool __is_discard_back_mergeable(struct discard_info *cur, + struct discard_info *back) +{ + return __is_discard_mergeable(back, cur); +} + +static inline bool __is_discard_front_mergeable(struct discard_info *cur, + struct discard_info *front) +{ + return __is_discard_mergeable(cur, front); +} + static inline bool __is_extent_mergeable(struct extent_info *back, struct extent_info *front) { @@ -562,7 +625,6 @@ struct f2fs_nm_info { unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; unsigned char *nat_block_bitmap; unsigned short *free_nid_count; /* free nid 
count of NAT block */
- spinlock_t free_nid_lock; /* protect updating of nid count */
 
 /* for checkpoint */
 char *nat_bitmap; /* NAT bitmap pointer */
@@ -641,7 +703,8 @@ struct flush_cmd {
 struct flush_cmd_control {
 struct task_struct *f2fs_issue_flush; /* flush thread */
 wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */
- atomic_t submit_flush; /* # of issued flushes */
+ atomic_t issued_flush; /* # of issued flushes */
+ atomic_t issing_flush; /* # of issing flushes */
 struct llist_head issue_list; /* list for command issue */
 struct llist_node *dispatch_list; /* list for command dispatch */
 };
@@ -672,6 +735,7 @@ struct f2fs_sm_info {
 unsigned int ipu_policy; /* in-place-update policy */
 unsigned int min_ipu_util; /* in-place-update threshold */
 unsigned int min_fsync_blocks; /* threshold for fsync */
+ unsigned int min_hot_blocks; /* threshold for hot block allocation */
 
 /* for flush command control */
 struct flush_cmd_control *fcc_info;
@@ -722,6 +786,7 @@ enum page_type {
 META_FLUSH,
 INMEM, /* the below types are used by tracepoints only. */
 INMEM_DROP,
+ INMEM_INVALIDATE,
 INMEM_REVOKE,
 IPU,
 OPU,
@@ -737,9 +802,10 @@ struct f2fs_io_info {
 struct page *page; /* page to be written */
 struct page *encrypted_page; /* encrypted page */
 bool submitted; /* indicate IO submission */
+ bool need_lock; /* indicate we need to lock cp_rwsem */
 };
 
-#define is_read_io(rw) (rw == READ)
+#define is_read_io(rw) ((rw) == READ)
 struct f2fs_bio_info {
 struct f2fs_sb_info *sbi; /* f2fs superblock */
 struct bio *bio; /* bios to merge */
@@ -827,6 +893,7 @@ struct f2fs_sb_info {
 struct mutex cp_mutex; /* checkpoint procedure lock */
 struct rw_semaphore cp_rwsem; /* blocking FS operations */
 struct rw_semaphore node_write; /* locking node writes */
+ struct rw_semaphore node_change; /* locking node change */
 wait_queue_head_t cp_wait;
 unsigned long last_time[MAX_TIME]; /* to store time in jiffies */
 long interval_time[MAX_TIME]; /* to store thresholds */
@@ -879,6 +946,9 @@ struct f2fs_sb_info {
 /* # of allocated blocks */
 struct percpu_counter alloc_valid_block_count;
 
+ /* writeback control */
+ atomic_t wb_sync_req; /* count # of WB_SYNC threads */
+
 /* valid inode count */
 struct percpu_counter total_valid_inode_count;
 
@@ -912,11 +982,12 @@ struct f2fs_sb_info {
 atomic_t inline_inode; /* # of inline_data inodes */
 atomic_t inline_dir; /* # of inline_dentry inodes */
 atomic_t aw_cnt; /* # of atomic writes */
+ atomic_t vw_cnt; /* # of volatile writes */
 atomic_t max_aw_cnt; /* max # of atomic writes */
+ atomic_t max_vw_cnt; /* max # of volatile writes */
 int bg_gc; /* background gc calls */
 unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */
 #endif
- unsigned int last_victim[2]; /* last victim segment # */
 spinlock_t stat_lock; /* lock for stat operations */
 
 /* For sysfs support */
@@ -971,8 +1042,8 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type)
 * and the return value is in kbytes. s is of struct f2fs_sb_info.
 */
 #define BD_PART_WRITTEN(s) \
-(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \
- s->sectors_written_start) >> 1)
+(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \
+ (s)->sectors_written_start) >> 1)
 
 static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
 {
@@ -1193,7 +1264,7 @@ static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi,
 {
 bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG);
 
- return (cpc) ? (cpc->reason == CP_UMOUNT) && set : set;
+ return (cpc) ? 
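/*
 * [Editor's note: illustrative.] CP_UMOUNT and friends are bit flags now
 * (see the #defines replacing the old enum earlier in this header), so a
 * checkpoint can carry several reasons at once and tests must use '&'
 * instead of '=='. For example:
 *
 *	cpc.reason = CP_UMOUNT | CP_TRIMMED;
 *	if (cpc.reason & CP_UMOUNT)	// true
 *	if (cpc.reason == CP_UMOUNT)	// false, would miss combined reasons
 */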
(cpc->reason & CP_UMOUNT) && set : set; } static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) @@ -1229,7 +1300,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi) static inline bool __remain_node_summaries(int reason) { - return (reason == CP_UMOUNT || reason == CP_FASTBOOT); + return (reason & (CP_UMOUNT | CP_FASTBOOT)); } static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) @@ -1707,6 +1778,7 @@ enum { FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_HOT_DATA, /* indicate file is hot */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -1869,12 +1941,6 @@ static inline int f2fs_has_inline_data(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DATA); } -static inline void f2fs_clear_inline_inode(struct inode *inode) -{ - clear_inode_flag(inode, FI_INLINE_DATA); - clear_inode_flag(inode, FI_DATA_EXIST); -} - static inline int f2fs_exist_data(struct inode *inode) { return is_inode_flag_set(inode, FI_DATA_EXIST); @@ -2005,36 +2071,10 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, return kmalloc(size, flags); } -static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kmalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags, PAGE_KERNEL); - return ret; -} - -static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) -{ - void *ret; - - ret = kzalloc(size, flags | __GFP_NOWARN); - if (!ret) - ret = __vmalloc(size, flags | __GFP_ZERO, PAGE_KERNEL); - return ret; -} - #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) -/* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ - ((pgofs < ADDRS_PER_INODE(inode)) ? 
ADDRS_PER_INODE(inode) : \ - (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) - /* * file.c */ @@ -2096,8 +2136,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, struct page **page); void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode); -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct qstr *name, f2fs_hash_t name_hash, unsigned int bit_pos); @@ -2133,7 +2171,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname); /* * node.c @@ -2184,6 +2223,7 @@ void destroy_node_manager_caches(void); */ void register_inmem_page(struct inode *inode, struct page *page); void drop_inmem_pages(struct inode *inode); +void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); @@ -2193,7 +2233,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); @@ -2205,7 +2245,7 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); void write_meta_page(struct f2fs_sb_info *sbi, struct page *page); void write_node_page(unsigned int nid, struct f2fs_io_info *fio); void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); -void rewrite_data_page(struct f2fs_io_info *fio); +int rewrite_data_page(struct f2fs_io_info *fio); void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr); @@ -2310,7 +2350,8 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, int start_gc_thread(struct f2fs_sb_info *sbi); void stop_gc_thread(struct f2fs_sb_info *sbi); block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, + unsigned int segno); void build_gc_manager(struct f2fs_sb_info *sbi); /* @@ -2334,11 +2375,15 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; + int nats, dirty_nats, sits, dirty_sits; + int free_nids, avail_nids, alloc_nids; int total_count, utilization; - int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard; + int bg_gc, nr_wb_cp_data, nr_wb_data; + int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_discard_cmd; + unsigned int undiscard_blks; int inline_xattr, inline_inode, 
inline_dir, append, update, orphans; - int aw_cnt, max_aw_cnt; + int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2421,11 +2466,22 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) +#define stat_inc_volatile_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_dec_volatile_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_update_max_volatile_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ - (si)->tot_segs++; \ - if (type == SUM_TYPE_DATA) { \ + si->tot_segs++; \ + if ((type) == SUM_TYPE_DATA) { \ si->data_segs++; \ si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ } else { \ @@ -2435,14 +2491,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } while (0) #define stat_inc_tot_blk_count(si, blks) \ - (si->tot_blks += (blks)) + ((si)->tot_blks += (blks)) #define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ - si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) \ @@ -2450,7 +2506,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ - si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_node_blks += ((gc_type) == BG_GC) ? 
(blks) : 0; \ } while (0) int f2fs_build_stats(struct f2fs_sb_info *sbi); @@ -2458,32 +2514,35 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi); int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else -#define stat_inc_cp_count(si) -#define stat_inc_bg_cp_count(si) -#define stat_inc_call_count(si) -#define stat_inc_bggc_count(si) -#define stat_inc_dirty_inode(sbi, type) -#define stat_dec_dirty_inode(sbi, type) -#define stat_inc_total_hit(sb) -#define stat_inc_rbtree_node_hit(sb) -#define stat_inc_largest_node_hit(sbi) -#define stat_inc_cached_node_hit(sbi) -#define stat_inc_inline_xattr(inode) -#define stat_dec_inline_xattr(inode) -#define stat_inc_inline_inode(inode) -#define stat_dec_inline_inode(inode) -#define stat_inc_inline_dir(inode) -#define stat_dec_inline_dir(inode) -#define stat_inc_atomic_write(inode) -#define stat_dec_atomic_write(inode) -#define stat_update_max_atomic_write(inode) -#define stat_inc_seg_type(sbi, curseg) -#define stat_inc_block_count(sbi, curseg) -#define stat_inc_inplace_blocks(sbi) -#define stat_inc_seg_count(sbi, type, gc_type) -#define stat_inc_tot_blk_count(si, blks) -#define stat_inc_data_blk_count(sbi, blks, gc_type) -#define stat_inc_node_blk_count(sbi, blks, gc_type) +#define stat_inc_cp_count(si) do { } while (0) +#define stat_inc_bg_cp_count(si) do { } while (0) +#define stat_inc_call_count(si) do { } while (0) +#define stat_inc_bggc_count(si) do { } while (0) +#define stat_inc_dirty_inode(sbi, type) do { } while (0) +#define stat_dec_dirty_inode(sbi, type) do { } while (0) +#define stat_inc_total_hit(sb) do { } while (0) +#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_largest_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_inline_xattr(inode) do { } while (0) +#define stat_dec_inline_xattr(inode) do { } while (0) +#define stat_inc_inline_inode(inode) do { } while (0) +#define stat_dec_inline_inode(inode) do { } while (0) +#define stat_inc_inline_dir(inode) do { } while (0) +#define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_atomic_write(inode) do { } while (0) +#define stat_dec_atomic_write(inode) do { } while (0) +#define stat_update_max_atomic_write(inode) do { } while (0) +#define stat_inc_volatile_write(inode) do { } while (0) +#define stat_dec_volatile_write(inode) do { } while (0) +#define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_seg_type(sbi, curseg) do { } while (0) +#define stat_inc_block_count(sbi, curseg) do { } while (0) +#define stat_inc_inplace_blocks(sbi) do { } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_tot_blk_count(si, blks) do { } while (0) +#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) +#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } @@ -2509,7 +2568,7 @@ extern struct kmem_cache *inode_entry_slab; bool f2fs_may_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); void read_inline_data(struct page *page, struct page *ipage); -bool truncate_inline_inode(struct page *ipage, u64 from); +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); int f2fs_read_inline_data(struct inode *inode, struct page *page); int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int 
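/*
 * [Editor's note: illustrative.] The no-op stubs above are spelled
 * "do { } while (0)" instead of expanding to nothing so that every use
 * still parses as exactly one statement, e.g. in a hypothetical caller:
 *
 *	if (cond)
 *		stat_inc_cp_count(si);
 *	else
 *		do_something_else();
 *
 * With an empty expansion the 'if' branch would reduce to a bare ';',
 * which trips empty-body warnings and reads ambiguously.
 */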
f2fs_convert_inline_inode(struct inode *inode); @@ -2544,6 +2603,18 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs); +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs, + struct rb_entry **prev_entry, struct rb_entry **next_entry, + struct rb_node ***insert_p, struct rb_node **insert_parent, + bool force); +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); void f2fs_drop_extent_tree(struct inode *inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5f7317875a6726b9ea968f81cbf401b2c8966f8f..61af721329fa28563cd7ed6cbe9e3a72a459c69c 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -116,11 +116,6 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) if (!dentry) return 0; - if (update_dent_inode(inode, inode, &dentry->d_name)) { - dput(dentry); - return 0; - } - *pino = parent_ino(dentry); dput(dentry); return 1; @@ -528,7 +523,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - return 0; + return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, offset, PAGE_SIZE - offset); @@ -566,9 +561,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { - truncate_inline_inode(ipage, from); - if (from == 0) - clear_inode_flag(inode, FI_DATA_EXIST); + truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); truncate_page = true; goto out; @@ -617,6 +610,12 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { + f2fs_show_injection_info(FAULT_TRUNCATE); + return -EIO; + } +#endif /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -1012,11 +1011,11 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; @@ -1188,8 +1187,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - if (offset + len > new_size) - new_size = offset + len; new_size = max_t(loff_t, new_size, offset + len); } else { if (off_start) { @@ -1257,8 +1254,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) int ret = 0; new_size = i_size_read(inode) + len; - if (new_size > inode->i_sb->s_maxbytes) - return -EFBIG; + ret = inode_newsize_ok(inode, new_size); + if (ret) + return ret; if (offset >= i_size_read(inode)) return -EINVAL; @@ -1428,6 +1426,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) drop_inmem_pages(inode); if 
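/*
 * [Editor's note: illustrative sketch of VFS behaviour, stated from
 * memory.] Switching f2fs_insert_range() to inode_newsize_ok() buys more
 * than the old open-coded s_maxbytes test: the helper also enforces the
 * caller's RLIMIT_FSIZE, roughly:
 *
 *	if (new_size > rlimit(RLIMIT_FSIZE))
 *		return -EFBIG;		// after raising SIGXFSZ
 *	if (new_size > inode->i_sb->s_maxbytes)
 *		return -EFBIG;
 */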
(f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); clear_inode_flag(inode, FI_DROP_CACHE); @@ -1474,10 +1473,10 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) if (ret) return ret; - flags = f2fs_mask_flags(inode->i_mode, flags); - inode_lock(inode); + flags = f2fs_mask_flags(inode->i_mode, flags); + oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { @@ -1491,10 +1490,11 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - inode_unlock(inode); inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); + + inode_unlock(inode); out: mnt_drop_write_file(filp); return ret; @@ -1515,6 +1515,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1529,20 +1532,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; set_inode_flag(inode, FI_ATOMIC_FILE); + set_inode_flag(inode, FI_HOT_DATA); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) - goto out; + goto inc_stat; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); -out: + goto out; + } + +inc_stat: stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); +out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1592,6 +1600,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1605,6 +1616,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (ret) goto out; + stat_inc_volatile_write(inode); + stat_update_max_volatile_write(inode); + set_inode_flag(inode, FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: @@ -1660,6 +1674,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } @@ -1841,7 +1856,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync, true); + ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); out: mnt_drop_write_file(filp); return ret; @@ -1879,13 +1894,12 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; - unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; block_t blk_end = 0; bool fragmented = false; int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update(inode)) + if (need_inplace_update_policy(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -1943,7 +1957,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, map.m_lblk = pg_start; map.m_len = pg_end - pg_start; - sec_num = (map.m_len + 
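/*
 * [Editor's note: illustrative.] The sec_num expression below is the
 * usual round-up division idiom; the kernel's DIV_ROUND_UP() macro states
 * the same computation more directly (equivalent sketch, not in this
 * patch):
 *
 *	sec_num = DIV_ROUND_UP(map.m_len, BLKS_PER_SEC(sbi));
 */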
pages_per_sec - 1) / pages_per_sec; + sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); /* * make sure there are enough free section for LFS allocation, this can @@ -2020,42 +2034,40 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) return -EINVAL; - err = mnt_want_write_file(filp); - if (err) - return err; - - if (f2fs_readonly(sbi->sb)) { - err = -EROFS; - goto out; - } + if (f2fs_readonly(sbi->sb)) + return -EROFS; if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, - sizeof(range))) { - err = -EFAULT; - goto out; - } + sizeof(range))) + return -EFAULT; /* verify alignment of offset & size */ - if (range.start & (F2FS_BLKSIZE - 1) || - range.len & (F2FS_BLKSIZE - 1)) { - err = -EINVAL; - goto out; - } + if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + sbi->max_file_blocks)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; err = f2fs_defragment_range(sbi, filp, &range); + mnt_drop_write_file(filp); + f2fs_update_time(sbi, REQ_TIME); if (err < 0) - goto out; + return err; if (copy_to_user((struct f2fs_defragment __user *)arg, &range, sizeof(range))) - err = -EFAULT; -out: - mnt_drop_write_file(filp); - return err; + return -EFAULT; + + return 0; } static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, @@ -2189,6 +2201,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) range.pos_out, range.len); mnt_drop_write_file(filp); + if (err) + goto err_out; if (copy_to_user((struct f2fs_move_range __user *)arg, &range, sizeof(range))) @@ -2198,6 +2212,69 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) return err; } +static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct sit_info *sm = SIT_I(sbi); + unsigned int start_segno = 0, end_segno = 0; + unsigned int dev_start_segno = 0, dev_end_segno = 0; + struct f2fs_flush_device range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, + sizeof(range))) + return -EFAULT; + + if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || + sbi->segs_per_sec != 1) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Can't flush %u in %d for segs_per_sec %u != 1\n", + range.dev_num, sbi->s_ndevs, + sbi->segs_per_sec); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (range.dev_num != 0) + dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk); + dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk); + + start_segno = sm->last_victim[FLUSH_DEVICE]; + if (start_segno < dev_start_segno || start_segno >= dev_end_segno) + start_segno = dev_start_segno; + end_segno = min(start_segno + range.segments, dev_end_segno); + + while (start_segno < end_segno) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + sm->last_victim[GC_CB] = end_segno + 1; + sm->last_victim[GC_GREEDY] = end_segno + 1; + sm->last_victim[ALLOC_NEXT] = end_segno + 1; + ret = f2fs_gc(sbi, true, true, start_segno); + if (ret == -EAGAIN) + ret = 0; + else if (ret < 0) + break; + start_segno++; + } +out: + 
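/*
 * [Editor's note: illustrative userspace sketch, assuming the new uapi
 * structures are mirrored in a header the tool includes.] One round of
 * F2FS_IOC_FLUSH_DEVICE migrates at most 'segments' segments, so a tool
 * drains a device by calling it in a loop:
 *
 *	struct f2fs_flush_device range = {
 *		.dev_num  = 1,		// second device of the multi-dev fs
 *		.segments = 512,
 *	};
 *	if (ioctl(fd, F2FS_IOC_FLUSH_DEVICE, &range) < 0)
 *		perror("F2FS_IOC_FLUSH_DEVICE");
 */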
mnt_drop_write_file(filp); + return ret; +} + + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -2235,6 +2312,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: return f2fs_ioc_move_range(filp, arg); + case F2FS_IOC_FLUSH_DEVICE: + return f2fs_ioc_flush_device(filp, arg); default: return -ENOTTY; } @@ -2302,8 +2381,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GARBAGE_COLLECT: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: - break; case F2FS_IOC_MOVE_RANGE: + case F2FS_IOC_FLUSH_DEVICE: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 418fd988164677623cf0ad305d34f7855fb608dd..026522107ca3f843dee496b0f1ab5359c10eeb6a 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -84,7 +84,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -172,7 +172,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - p->offset = sbi->last_victim[p->gc_mode]; + /* let's select beginning hot/small space first */ + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + p->offset = 0; + else + p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, @@ -182,7 +186,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, if (p->alloc_mode == SSR) return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return sbi->blocks_per_seg * p->ofs_unit; + return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -207,7 +211,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) continue; clear_bit(secno, dirty_i->victim_secmap); - return secno * sbi->segs_per_sec; + return GET_SEG_FROM_SEC(sbi, secno); } return NULL_SEGNO; } @@ -215,8 +219,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SECNO(sbi, segno); - unsigned int start = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; @@ -225,7 +229,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) for (i = 0; i < sbi->segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); mtime = div_u64(mtime, sbi->segs_per_sec); vblocks = div_u64(vblocks, sbi->segs_per_sec); @@ -248,7 +252,7 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, unsigned int segno) { unsigned int valid_blocks = - get_valid_blocks(sbi, segno, sbi->segs_per_sec); + get_valid_blocks(sbi, segno, true); return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? 
valid_blocks * 2 : valid_blocks; @@ -291,6 +295,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned int *result, int gc_type, int type, char alloc_mode) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); @@ -304,10 +309,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = get_max_cost(sbi, &p); + if (*result != NULL_SEGNO) { + if (IS_DATASEG(get_seg_entry(sbi, *result)->type) && + get_valid_blocks(sbi, *result, false) && + !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + p.min_segno = *result; + goto out; + } + if (p.max_search == 0) goto out; - last_victim = sbi->last_victim[p.gc_mode]; + last_victim = sm->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -320,9 +333,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); if (segno >= last_segment) { - if (sbi->last_victim[p.gc_mode]) { - last_segment = sbi->last_victim[p.gc_mode]; - sbi->last_victim[p.gc_mode] = 0; + if (sm->last_victim[p.gc_mode]) { + last_segment = + sm->last_victim[p.gc_mode]; + sm->last_victim[p.gc_mode] = 0; p.offset = 0; continue; } @@ -339,7 +353,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - secno = GET_SECNO(sbi, segno); + secno = GET_SEC_FROM_SEG(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; @@ -357,17 +371,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } next: if (nsearched >= p.max_search) { - if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) - sbi->last_victim[p.gc_mode] = last_victim + 1; + if (!sm->last_victim[p.gc_mode] && segno <= last_victim) + sm->last_victim[p.gc_mode] = last_victim + 1; else - sbi->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); break; } } if (p.min_segno != NULL_SEGNO) { got_it: if (p.alloc_mode == LFS) { - secno = GET_SECNO(sbi, p.min_segno); + secno = GET_SEC_FROM_SEG(sbi, p.min_segno); if (gc_type == FG_GC) sbi->cur_victim_sec = secno; else @@ -550,8 +565,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, get_node_info(sbi, nid, dni); if (sum->version != dni->version) { - f2fs_put_page(node_page, 1); - return false; + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: valid data with mismatched node version.", + __func__); + set_sbi_flag(sbi, SBI_NEED_FSCK); } *nofs = ofs_of_node(node_page); @@ -697,8 +714,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .need_lock = true, }; bool is_dirty = PageDirty(page); int err; @@ -890,7 +909,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); - if (get_valid_blocks(sbi, segno, 1) == 0 || + if (get_valid_blocks(sbi, segno, false) == 0 || !PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) goto next; @@ -905,7 +924,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, * - mutex_lock(sentry_lock) - change_curseg() * - lock_page(sum_page) */ - if (type == SUM_TYPE_NODE) gc_node_segment(sbi, sum->entries, segno, gc_type); else @@ -924,7 +942,7 @@ static int 
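/*
 * [Editor's note: illustrative.] The *result preselection added to
 * get_victim_by_default() below, together with the new 'segno' parameter
 * of f2fs_gc(), lets a caller pin the victim instead of running the usual
 * policy search (hypothetical calls):
 *
 *	ret = f2fs_gc(sbi, true, true, segno);       // migrate this segment
 *	ret = f2fs_gc(sbi, sync, true, NULL_SEGNO);  // normal victim search
 *
 * NULL_SEGNO keeps the old behaviour; a concrete segno is honoured when
 * it is a dirty data segment whose section is not already being cleaned.
 */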
do_garbage_collect(struct f2fs_sb_info *sbi, blk_finish_plug(&plug); if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) + get_valid_blocks(sbi, start_segno, true) == 0) sec_freed = 1; stat_inc_call_count(sbi->stat_info); @@ -932,13 +950,14 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, + bool background, unsigned int segno) { - unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; int ret = -EINVAL; struct cp_control cpc; + unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(GFP_NOFS), @@ -959,9 +978,11 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; + if (prefree_segments(sbi)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } if (has_not_enough_free_secs(sbi, 0, 0)) gc_type = FG_GC; } @@ -981,13 +1002,17 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + segno = NULL_SEGNO; goto gc_more; + } if (gc_type == FG_GC) ret = write_checkpoint(sbi, &cpc); } stop: + SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); @@ -999,7 +1024,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) void build_gc_manager(struct f2fs_sb_info *sbi) { - u64 main_count, resv_count, ovp_count, blocks_per_sec; + u64 main_count, resv_count, ovp_count; DIRTY_I(sbi)->v_ops = &default_v_ops; @@ -1007,8 +1032,12 @@ void build_gc_manager(struct f2fs_sb_info *sbi) main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; - blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec; - sbi->fggc_threshold = div64_u64((main_count - ovp_count) * blocks_per_sec, - (main_count - resv_count)); + sbi->fggc_threshold = div64_u64((main_count - ovp_count) * + BLKS_PER_SEC(sbi), (main_count - resv_count)); + + /* give warm/cold data area from slower device */ + if (sbi->s_ndevs && sbi->segs_per_sec == 1) + SIT_I(sbi)->last_victim[ALLOC_NEXT] = + GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 71b7206c431ea00f937e5b1e59b5f90999091d2a..eb2e031ea887bed43229f63e9923986bc14f3955 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -79,6 +80,10 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) const unsigned char *name = name_info->name; size_t len = name_info->len; + /* encrypted bigname case */ + if (fname && !fname->disk_name.name) + return cpu_to_le32(fname->hash); + if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 
e32a9e5279686035829dba5d45396e22cf10438f..e4c527c4e7d0b382907b0c8fbf95d0f15f4bbc09 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -63,19 +63,21 @@ void read_inline_data(struct page *page, struct page *ipage) SetPageUptodate(page); } -bool truncate_inline_inode(struct page *ipage, u64 from) +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; if (from >= MAX_INLINE_DATA) - return false; + return; addr = inline_data_addr(ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); memset(addr + from, 0, MAX_INLINE_DATA - from); set_page_dirty(ipage); - return true; + + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); } int f2fs_read_inline_data(struct inode *inode, struct page *page) @@ -135,6 +137,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); fio.old_blkaddr = dn->data_blkaddr; + set_inode_flag(dn->inode, FI_HOT_DATA); write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) { @@ -146,11 +149,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_inode(dn->inode_page, 0); + truncate_inline_inode(dn->inode, dn->inode_page, 0); clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); - f2fs_clear_inline_inode(dn->inode); + clear_inode_flag(dn->inode, FI_INLINE_DATA); f2fs_put_dnode(dn); return 0; } @@ -267,9 +270,8 @@ bool recover_inline_data(struct inode *inode, struct page *npage) if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - if (!truncate_inline_inode(ipage, 0)) - return false; - f2fs_clear_inline_inode(inode); + truncate_inline_inode(inode, ipage, 0); + clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -296,11 +298,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return NULL; } - namehash = f2fs_dentry_hash(&name); + namehash = f2fs_dentry_hash(&name, fname); inline_dentry = inline_data_addr(ipage); - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(NULL, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -319,7 +321,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, dentry_blk = inline_data_addr(ipage); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + make_dentry_ptr_inline(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); @@ -380,7 +382,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -400,7 +402,7 @@ static int f2fs_add_inline_entries(struct inode *dir, unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(NULL, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -455,7 +457,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, } memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); - truncate_inline_inode(ipage, 0); + 
truncate_inline_inode(dir, ipage, 0);
 
 unlock_page(ipage);
 
@@ -527,14 +529,12 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name,
 err = PTR_ERR(page);
 goto fail;
 }
- if (f2fs_encrypted_inode(dir))
- file_set_enc_name(inode);
 }
 
 f2fs_wait_on_page_writeback(ipage, NODE, true);
 
- name_hash = f2fs_dentry_hash(new_name);
- make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2);
+ name_hash = f2fs_dentry_hash(new_name, NULL);
+ make_dentry_ptr_inline(NULL, &d, dentry_blk);
 f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos);
 
 set_page_dirty(ipage);
@@ -623,7 +623,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx,
 inline_dentry = inline_data_addr(ipage);
 
- make_dentry_ptr(inode, &d, (void *)inline_dentry, 2);
+ make_dentry_ptr_inline(inode, &d, inline_dentry);
 
 err = f2fs_fill_dentries(ctx, &d, 0, fstr);
 if (!err)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 24bb8213d974b710b43f286b1089ccff7946a50e..518f49643092582ae256c425616e942ed2392c40 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -316,7 +316,6 @@ int update_inode_page(struct inode *inode)
 } else if (err != -ENOENT) {
 f2fs_stop_checkpoint(sbi, false);
 }
- f2fs_inode_synced(inode);
 return 0;
 }
 ret = update_inode(inode, node_page);
@@ -339,7 +338,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 * We need to balance fs here to prevent from producing dirty node pages
 * during the urgent cleaning time when runing out of free sections.
 */
- if (update_inode_page(inode) && wbc && wbc->nr_to_write)
+ update_inode_page(inode);
+ if (wbc && wbc->nr_to_write)
 f2fs_balance_fs(sbi, true);
 return 0;
 }
@@ -372,13 +372,6 @@ void f2fs_evict_inode(struct inode *inode)
 if (inode->i_nlink || is_bad_inode(inode))
 goto no_delete;
 
-#ifdef CONFIG_F2FS_FAULT_INJECTION
- if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
- f2fs_show_injection_info(FAULT_EVICT_INODE);
- goto no_delete;
- }
-#endif
-
 remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
 remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
@@ -389,6 +382,12 @@ void f2fs_evict_inode(struct inode *inode)
 if (F2FS_HAS_BLOCKS(inode))
 err = f2fs_truncate(inode);
 
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+ if (time_to_inject(sbi, FAULT_EVICT_INODE)) {
+ f2fs_show_injection_info(FAULT_EVICT_INODE);
+ err = -EIO;
+ }
+#endif
 if (!err) {
 f2fs_lock_op(sbi);
 err = remove_inode_page(inode);
@@ -411,7 +410,10 @@ void f2fs_evict_inode(struct inode *inode)
 stat_dec_inline_dir(inode);
 stat_dec_inline_inode(inode);
 
- invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino);
+ /* ino == 0, if f2fs_new_inode() failed */
+ if (inode->i_ino)
+ invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino,
+ inode->i_ino);
 if (xnid)
 invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
 if (inode->i_nlink) {
@@ -448,6 +450,7 @@ void handle_failed_inode(struct inode *inode)
 * in a panic when flushing dirty inodes in gdirty_list.
 */
 update_inode_page(inode);
+ f2fs_inode_synced(inode);
 
 /* don't make bad inode, since it becomes a regular file. 
*/ unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 98f00a3a7f5016a42741169736f1182efe83a3b0..c31b40e5f9cf6dce167f3c24381df1e31c63acf2 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -148,8 +148,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -163,6 +161,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -324,9 +324,10 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - bool nokey = f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - err = nokey ? -ENOKEY : -EPERM; + f2fs_msg(inode->i_sb, KERN_WARNING, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; goto err_out; } return d_splice_alias(inode, dentry); @@ -423,8 +424,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -487,6 +486,8 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, } kfree(sd); + + f2fs_balance_fs(sbi, true); return err; out: handle_failed_inode(inode); @@ -508,8 +509,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - f2fs_balance_fs(sbi, true); - set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); @@ -524,6 +523,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out_fail: @@ -554,8 +555,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -569,6 +568,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -595,8 +596,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -622,6 +621,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, /* link_count was changed by d_tmpfile as well. 
*/ f2fs_unlock_op(sbi); unlock_new_inode(inode); + + f2fs_balance_fs(sbi, true); return 0; release_out: @@ -720,13 +721,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - err = update_dent_inode(old_inode, new_inode, - &new_dentry->d_name); - if (err) { - release_orphan_inode(sbi); - goto put_out_dir; - } - f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = current_time(new_inode); @@ -779,8 +773,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, down_write(&F2FS_I(old_inode)->i_sem); file_lost_pino(old_inode); - if (new_inode && file_enc_name(new_inode)) - file_set_enc_name(old_inode); up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); @@ -908,8 +900,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, old_nlink = old_dir_entry ? -1 : 1; new_nlink = -old_nlink; err = -EMLINK; - if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || - (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX)) goto out_new_dir; } @@ -917,18 +909,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); - if (err) - goto out_unlock; - if (file_enc_name(new_inode)) - file_set_enc_name(old_inode); - - err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name); - if (err) - goto out_undo; - if (file_enc_name(old_inode)) - file_set_enc_name(new_inode); - /* update ".." directory entry info of old dentry */ if (old_dir_entry) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); @@ -972,14 +952,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); return 0; -out_undo: - /* - * Still we may fail to recover name info of f2fs_inode here - * Drop it, once its name is set as encrypted - */ - update_dent_inode(old_inode, old_inode, &old_dentry->d_name); -out_unlock: - f2fs_unlock_op(sbi); out_new_dir: if (new_dir_entry) { f2fs_dentry_kunmap(new_inode, new_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 481aa8dc79f46f4c156cf67cca665e8160e36e6a..4547c5c5cd989373235531bb21193531bd05e720 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -22,7 +22,7 @@ #include "trace.h" #include -#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) +#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -63,8 +63,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) int i; for (i = 0; i <= UPDATE_INO; i++) - mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_SHIFT; + mem_size += sbi->im[i].ino_num * + sizeof(struct ino_entry); + mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * @@ -177,18 +178,12 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) + struct nat_entry_set *set, struct nat_entry *ne) { - nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); - struct nat_entry_set *head; - - head = radix_tree_lookup(&nm_i->nat_set_root, set); - if (head) { - 
list_move_tail(&ne->list, &nm_i->nat_entries); - set_nat_flag(ne, IS_DIRTY, false); - head->entry_cnt--; - nm_i->dirty_nat_cnt--; - } + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; + nm_i->dirty_nat_cnt--; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -381,6 +376,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) struct page *page = NULL; struct f2fs_nat_entry ne; struct nat_entry *e; + pgoff_t index; int i; ni->nid = nid; @@ -406,17 +402,21 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); } up_read(&curseg->journal_rwsem); - if (i >= 0) + if (i >= 0) { + up_read(&nm_i->nat_tree_lock); goto cache; + } /* Fill node_info from nat page */ - page = get_current_nat_page(sbi, start_nid); + index = current_nat_addr(sbi, nid); + up_read(&nm_i->nat_tree_lock); + + page = get_meta_page(sbi, index); nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - up_read(&nm_i->nat_tree_lock); /* cache nat entry */ down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); @@ -1463,6 +1463,9 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_wait_on_page_writeback(page, NODE, true); BUG_ON(PageWriteback(page)); + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + if (!atomic || page == last_page) { set_fsync_mark(page, 1); if (IS_INODE(page)) { @@ -1766,40 +1769,67 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct free_nid *i; + struct free_nid *i, *e; struct nat_entry *ne; - int err; + int err = -EINVAL; + bool ret = false; /* 0 nid should not be used */ if (unlikely(nid == 0)) return false; - if (build) { - /* do not add allocated nids */ - ne = __lookup_nat_cache(nm_i, nid); - if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || - nat_get_blkaddr(ne) != NULL_ADDR)) - return false; - } - i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = NID_NEW; - if (radix_tree_preload(GFP_NOFS)) { - kmem_cache_free(free_nid_slab, i); - return true; - } + if (radix_tree_preload(GFP_NOFS)) + goto err; spin_lock(&nm_i->nid_list_lock); + + if (build) { + /* + * Thread A Thread B + * - f2fs_create + * - f2fs_new_inode + * - alloc_nid + * - __insert_nid_to_list(ALLOC_NID_LIST) + * - f2fs_balance_fs_bg + * - build_free_nids + * - __build_free_nids + * - scan_nat_page + * - add_free_nid + * - __lookup_nat_cache + * - f2fs_add_link + * - init_inode_metadata + * - new_inode_page + * - new_node_page + * - set_node_addr + * - alloc_nid_done + * - __remove_nid_from_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(FREE_NID_LIST) + */ + ne = __lookup_nat_cache(nm_i, nid); + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + goto err_out; + + e = __lookup_free_nid_list(nm_i, nid); + if (e) { + if (e->state == NID_NEW) + ret = true; + goto err_out; + } + } + ret = true; err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); +err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); - if (err) { +err: + if (err) kmem_cache_free(free_nid_slab, i); - return true; - } - return true; + return ret; } static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) @@ -1821,7 +1851,7 @@ static void 
remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) } static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, - bool set, bool build, bool locked) + bool set, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -1835,14 +1865,10 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, else __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - if (!locked) - spin_lock(&nm_i->free_nid_lock); if (set) nm_i->free_nid_count[nat_ofs]++; else if (!build) nm_i->free_nid_count[nat_ofs]--; - if (!locked) - spin_unlock(&nm_i->free_nid_lock); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1871,7 +1897,9 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) freed = add_free_nid(sbi, start_nid, true); - update_free_nid_bitmap(sbi, start_nid, freed, true, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, freed, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -1927,6 +1955,9 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) int i = 0; nid_t nid = nm_i->next_scan_nid; + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + /* Enough entries */ if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; @@ -2026,7 +2057,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; - update_free_nid_bitmap(sbi, *nid, false, false, false); + update_free_nid_bitmap(sbi, *nid, false, false); spin_unlock(&nm_i->nid_list_lock); return true; @@ -2082,7 +2113,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false, false); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&nm_i->nid_list_lock); @@ -2407,16 +2438,16 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, } raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); - __clear_nat_cache_dirty(NM_I(sbi), ne); + __clear_nat_cache_dirty(NM_I(sbi), set, ne); if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false, false); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } else { spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, nid, false, false, false); + update_free_nid_bitmap(sbi, nid, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2428,10 +2459,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } - f2fs_bug_on(sbi, set->entry_cnt); - - radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - kmem_cache_free(nat_entry_set_slab, set); + /* Allow dirty nats by node block allocation in write_begin */ + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } } /* @@ -2476,8 +2508,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) __flush_nat_entry_set(sbi, set, cpc); up_write(&nm_i->nat_tree_lock); - - f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); + /* Allow dirty nats by node block allocation in write_begin */ } static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) @@ -2541,10 +2572,10 @@ inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) nid = i * NAT_ENTRY_PER_BLOCK; last_nid = (i + 1) * 
NAT_ENTRY_PER_BLOCK; - spin_lock(&nm_i->free_nid_lock); + spin_lock(&NM_I(sbi)->nid_list_lock); for (; nid < last_nid; nid++) - update_free_nid_bitmap(sbi, nid, true, true, true); - spin_unlock(&nm_i->free_nid_lock); + update_free_nid_bitmap(sbi, nid, true, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } for (i = 0; i < nm_i->nat_blocks; i++) { @@ -2621,23 +2652,20 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); - nm_i->free_nid_bitmap = f2fs_kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); if (!nm_i->free_nid_bitmap) return -ENOMEM; - nm_i->nat_block_bitmap = f2fs_kvzalloc(nm_i->nat_blocks / 8, + nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, GFP_KERNEL); if (!nm_i->nat_block_bitmap) return -ENOMEM; - nm_i->free_nid_count = f2fs_kvzalloc(nm_i->nat_blocks * + nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * sizeof(unsigned short), GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; - - spin_lock_init(&nm_i->free_nid_lock); - return 0; } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 2f9603fa85a59842281b3443e4984351b2692af8..558048e33cf9a6c1920f03673a98409c34325f4c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -9,10 +9,10 @@ * published by the Free Software Foundation. */ /* start node id of a node block dedicated to the given node id */ -#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) +#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) /* node block offset on the NAT area dedicated to the given start node id */ -#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) +#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 8 @@ -62,16 +62,16 @@ struct nat_entry { struct node_info ni; /* in-memory node information */ }; -#define nat_get_nid(nat) (nat->ni.nid) -#define nat_set_nid(nat, n) (nat->ni.nid = n) -#define nat_get_blkaddr(nat) (nat->ni.blk_addr) -#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b) -#define nat_get_ino(nat) (nat->ni.ino) -#define nat_set_ino(nat, i) (nat->ni.ino = i) -#define nat_get_version(nat) (nat->ni.version) -#define nat_set_version(nat, v) (nat->ni.version = v) +#define nat_get_nid(nat) ((nat)->ni.nid) +#define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) +#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) +#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) +#define nat_get_ino(nat) ((nat)->ni.ino) +#define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) +#define nat_get_version(nat) ((nat)->ni.version) +#define nat_set_version(nat, v) ((nat)->ni.version = (v)) -#define inc_node_version(version) (++version) +#define inc_node_version(version) (++(version)) static inline void copy_node_info(struct node_info *dst, struct node_info *src) @@ -200,13 +200,16 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) struct f2fs_nm_info *nm_i = NM_I(sbi); pgoff_t block_off; pgoff_t block_addr; - int seg_off; + /* + * block_off = segment_off * 512 + off_in_segment + * OLD = (segment_off * 512) * 2 + off_in_segment + * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment + */ block_off = NAT_BLOCK_OFFSET(start); - seg_off = block_off >> sbi->log_blocks_per_seg; block_addr = (pgoff_t)(nm_i->nat_blkaddr + - (seg_off << sbi->log_blocks_per_seg << 1) + + (block_off << 1) - (block_off & (sbi->blocks_per_seg - 1))); 
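	/*
	 * A worked example of the identity above, with illustrative numbers
	 * (not taken from the patch): assume 512 blocks per segment and
	 * block_off = 1300, so segment_off = 2 and off_in_segment = 276.
	 *   OLD: (2 * 512) * 2 + 276 = 2324
	 *   NEW: 2 * 1300 - 276      = 2324
	 * (block_off << 1) - (block_off & (sbi->blocks_per_seg - 1)) is the
	 * NEW form, which drops the intermediate seg_off variable entirely.
	 */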
if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d025aa83fb5bb344fcb5799f080d033895e584aa..907d6b7dde6a7f636ea7027d461cfd029b8b1ae5 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -198,7 +198,8 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + bool check_only) { struct curseg_info *curseg; struct page *page = NULL; @@ -225,7 +226,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { - if (IS_INODE(page) && is_dent_dnode(page)) { + if (!check_only && + IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; @@ -569,7 +571,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) mutex_lock(&sbi->cp_mutex); /* step #1: find fsynced inode numbers */ - err = find_fsync_dnodes(sbi, &inode_list); + err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) goto out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 29ef7088c5582a480b6a1f7965fbbcca4f07e24e..96845854e7ee808c2df5227ddf68a614fcc779ba 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -250,6 +250,36 @@ void drop_inmem_pages(struct inode *inode) stat_dec_atomic_write(inode); } +void drop_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct list_head *head = &fi->inmem_pages; + struct inmem_pages *cur = NULL; + + f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + + mutex_lock(&fi->inmem_lock); + list_for_each_entry(cur, head, list) { + if (cur->page == page) + break; + } + + f2fs_bug_on(sbi, !cur || cur->page != page); + list_del(&cur->list); + mutex_unlock(&fi->inmem_lock); + + dec_page_count(sbi, F2FS_INMEM_PAGES); + kmem_cache_free(inmem_entry_slab, cur); + + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); + + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); +} + static int __commit_inmem_pages(struct inode *inode, struct list_head *revoke_list) { @@ -261,7 +291,6 @@ static int __commit_inmem_pages(struct inode *inode, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, - .encrypted_page = NULL, }; pgoff_t last_idx = ULONG_MAX; int err = 0; @@ -281,6 +310,9 @@ static int __commit_inmem_pages(struct inode *inode, } fio.page = page; + fio.old_blkaddr = NULL_ADDR; + fio.encrypted_page = NULL; + fio.need_lock = false; err = do_write_data_page(&fio); if (err) { unlock_page(page); @@ -358,11 +390,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) } #endif - if (!need) - return; - /* balance_fs_bg is able to be pending */ - if (excess_cached_nats(sbi)) + if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi); /* @@ -371,7 +400,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false, false); + f2fs_gc(sbi, false, false, NULL_SEGNO); } } @@ -390,7 +419,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else build_free_nids(sbi, false, false); - if (!is_idle(sbi)) + if (!is_idle(sbi) && !excess_dirty_nats(sbi)) return; /* checkpoint is the only way to shrink partial cached entries
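	 * (dirty NAT entries are written back only by flush_nat_entries()
	 * from write_checkpoint(), which is why the excess_dirty_nats()
	 * test above now lets this path proceed even on a busy device)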
*/ @@ -411,32 +440,34 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } -static int __submit_flush_wait(struct block_device *bdev) +static int __submit_flush_wait(struct f2fs_sb_info *sbi, + struct block_device *bdev) { struct bio *bio = f2fs_bio_alloc(0); int ret; - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; bio->bi_bdev = bdev; ret = submit_bio_wait(bio); bio_put(bio); + + trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE), ret); return ret; } static int submit_flush_wait(struct f2fs_sb_info *sbi) { - int ret = __submit_flush_wait(sbi->sb->s_bdev); + int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); int i; - if (sbi->s_ndevs && !ret) { - for (i = 1; i < sbi->s_ndevs; i++) { - trace_f2fs_issue_flush(FDEV(i).bdev, - test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); - ret = __submit_flush_wait(FDEV(i).bdev); - if (ret) - break; - } + if (!sbi->s_ndevs || ret) + return ret; + + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; } return ret; } @@ -458,6 +489,8 @@ static int issue_flush_thread(void *data) fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; @@ -475,25 +508,29 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; + int ret; if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) - return submit_flush_wait(sbi); - - if (!atomic_read(&fcc->submit_flush)) { - int ret; + if (!test_opt(sbi, FLUSH_MERGE)) { + ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + return ret; + } - atomic_inc(&fcc->submit_flush); + if (!atomic_read(&fcc->issing_flush)) { + atomic_inc(&fcc->issing_flush); ret = submit_flush_wait(sbi); - atomic_dec(&fcc->submit_flush); + atomic_dec(&fcc->issing_flush); + + atomic_inc(&fcc->issued_flush); return ret; } init_completion(&cmd.wait); - atomic_inc(&fcc->submit_flush); + atomic_inc(&fcc->issing_flush); llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) @@ -501,10 +538,10 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + atomic_dec(&fcc->issing_flush); } else { llist_del_all(&fcc->issue_list); - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issing_flush, 0); } return cmd.ret; @@ -524,7 +561,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issued_flush, 0); + atomic_set(&fcc->issing_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; @@ -597,8 +635,8 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) - clear_bit(GET_SECNO(sbi, segno), + if (get_valid_blocks(sbi, segno, true) == 0) + clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); } } @@ -618,7 +656,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); - valid_blocks = get_valid_blocks(sbi, segno, 0); + 
valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == 0) { __locate_dirty_segment(sbi, segno, PRE); @@ -633,162 +671,407 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static void __add_discard_cmd(struct f2fs_sb_info *sbi, - struct bio *bio, block_t lstart, block_t len) +static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *cmd_list = &(dcc->discard_cmd_list); + struct list_head *pend_list; struct discard_cmd *dc; + f2fs_bug_on(sbi, !len); + + pend_list = &dcc->pend_list[plist_idx(len)]; + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); INIT_LIST_HEAD(&dc->list); - dc->bio = bio; - bio->bi_private = dc; + dc->bdev = bdev; dc->lstart = lstart; + dc->start = start; dc->len = len; + dc->ref = 0; dc->state = D_PREP; + dc->error = 0; init_completion(&dc->wait); + list_add_tail(&dc->list, pend_list); + atomic_inc(&dcc->discard_cmd_cnt); + dcc->undiscard_blks += len; - mutex_lock(&dcc->cmd_lock); - list_add_tail(&dc->list, cmd_list); - mutex_unlock(&dcc->cmd_lock); + return dc; } -static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) +static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node *parent, struct rb_node **p) { - int err = dc->bio->bi_error; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; - if (dc->state == D_DONE) - atomic_dec(&(SM_I(sbi)->dcc_info->submit_discard)); + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); - if (err == -EOPNOTSUPP) - err = 0; + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color(&dc->rb_node, &dcc->root); + + return dc; +} + +static void __detach_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + if (dc->state == D_DONE) + atomic_dec(&dcc->issing_discard); - if (err) - f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", err); - bio_put(dc->bio); list_del(&dc->list); + rb_erase(&dc->rb_node, &dcc->root); + dcc->undiscard_blks -= dc->len; + kmem_cache_free(discard_cmd_slab, dc); + + atomic_dec(&dcc->discard_cmd_cnt); } -/* This should be covered by global mutex, &sit_i->sentry_lock */ -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->discard_cmd_list); - struct discard_cmd *dc, *tmp; - struct blk_plug plug; - mutex_lock(&dcc->cmd_lock); + if (dc->error == -EOPNOTSUPP) + dc->error = 0; - blk_start_plug(&plug); + if (dc->error) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard failed, ret: %d", dc->error); + __detach_discard_cmd(dcc, dc); +} - list_for_each_entry_safe(dc, tmp, wait_list, list) { +static void f2fs_submit_discard_endio(struct bio *bio) +{ + struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - if (blkaddr == NULL_ADDR) { - if (dc->state == D_PREP) { - dc->state = D_SUBMIT; - submit_bio(dc->bio); - atomic_inc(&dcc->submit_discard); - } - continue; + dc->error = bio->bi_error; + dc->state = D_DONE; + complete(&dc->wait); + bio_put(bio); +} + +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ +static void 
__submit_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct bio *bio = NULL; + + if (dc->state != D_PREP) + return; + + trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); + + dc->error = __blkdev_issue_discard(dc->bdev, + SECTOR_FROM_BLOCK(dc->start), + SECTOR_FROM_BLOCK(dc->len), + GFP_NOFS, 0, &bio); + if (!dc->error) { + /* should keep before submission to avoid D_DONE right away */ + dc->state = D_SUBMIT; + atomic_inc(&dcc->issued_discard); + atomic_inc(&dcc->issing_discard); + if (bio) { + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + bio->bi_opf |= REQ_SYNC; + submit_bio(bio); + list_move_tail(&dc->list, &dcc->wait_list); } + } else { + __remove_discard_cmd(sbi, dc); + } +} - if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { - if (dc->state == D_SUBMIT) - wait_for_completion_io(&dc->wait); - else - __remove_discard_cmd(sbi, dc); +static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node **p = &dcc->root.rb_node; + struct rb_node *parent = NULL; + struct discard_cmd *dc = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); +do_insert: + dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); + if (!dc) + return NULL; + + return dc; +} + +static void __relocate_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); +} + +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_info di = dc->di; + bool modified = false; + + if (dc->state == D_DONE || dc->len == 1) { + __remove_discard_cmd(sbi, dc); + return; + } + + dcc->undiscard_blks -= di.len; + + if (blkaddr > di.lstart) { + dc->len = blkaddr - dc->lstart; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + modified = true; + } + + if (blkaddr < di.lstart + di.len - 1) { + if (modified) { + __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + di.start + blkaddr + 1 - di.lstart, + di.lstart + di.len - 1 - blkaddr, + NULL, NULL); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + } else { + dc->lstart++; + dc->len--; + dc->start++; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); } } - blk_finish_plug(&plug); +} - /* this comes from f2fs_put_super */ - if (blkaddr == NULL_ADDR) { - list_for_each_entry_safe(dc, tmp, wait_list, list) { - wait_for_completion_io(&dc->wait); - __remove_discard_cmd(sbi, dc); +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct discard_cmd *dc; + struct discard_info di = {0}; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + block_t end = lstart + len; + + mutex_lock(&dcc->cmd_lock); + + dc = (struct discard_cmd 
*)__lookup_rb_tree_ret(&dcc->root, + NULL, lstart, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (dc) + prev_dc = dc; + + if (!prev_dc) { + di.lstart = lstart; + di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = min(di.len, len); + di.start = start; + } + + while (1) { + struct rb_node *node; + bool merged = false; + struct discard_cmd *tdc = NULL; + + if (prev_dc) { + di.lstart = prev_dc->lstart + prev_dc->len; + if (di.lstart < lstart) + di.lstart = lstart; + if (di.lstart >= end) + break; + + if (!next_dc || next_dc->lstart > end) + di.len = end - di.lstart; + else + di.len = next_dc->lstart - di.lstart; + di.start = start + di.lstart - lstart; + } + + if (!di.len) + goto next; + + if (prev_dc && prev_dc->state == D_PREP && + prev_dc->bdev == bdev && + __is_discard_back_mergeable(&di, &prev_dc->di)) { + prev_dc->di.len += di.len; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, prev_dc); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + di = prev_dc->di; + tdc = prev_dc; + merged = true; + } + + if (next_dc && next_dc->state == D_PREP && + next_dc->bdev == bdev && + __is_discard_front_mergeable(&di, &next_dc->di)) { + next_dc->di.lstart = di.lstart; + next_dc->di.len += di.len; + next_dc->di.start = di.start; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, next_dc); + if (tdc) + __remove_discard_cmd(sbi, tdc); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + merged = true; + } + + if (!merged) { + __insert_discard_tree(sbi, bdev, di.lstart, di.start, + di.len, NULL, NULL); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); } + next: + prev_dc = next_dc; + if (!prev_dc) + break; + + node = rb_next(&prev_dc->rb_node); + next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); } + mutex_unlock(&dcc->cmd_lock); } -static void f2fs_submit_discard_endio(struct bio *bio) +static int __queue_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { - struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; + block_t lblkstart = blkstart; - complete(&dc->wait); - dc->state = D_DONE; + trace_f2fs_queue_discard(bdev, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + return 0; } -static int issue_discard_thread(void *data) +static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) { - struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - wait_queue_head_t *q = &dcc->discard_wait_queue; - struct list_head *cmd_list = &dcc->discard_cmd_list; + struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int iter = 0; -repeat: - if (kthread_should_stop()) - return 0; + int i, iter = 0; + mutex_lock(&dcc->cmd_lock); blk_start_plug(&plug); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + + if (!issue_cond || is_idle(sbi)) + __submit_discard_cmd(sbi, dc); + if (issue_cond && iter++ > DISCARD_ISSUE_RATE) + goto out; + } + } +out: + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + +static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct 
list_head *wait_list = &(dcc->wait_list); + struct discard_cmd *dc, *tmp; mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, cmd_list, list) { - if (dc->state == D_PREP) { - dc->state = D_SUBMIT; - submit_bio(dc->bio); - atomic_inc(&dcc->submit_discard); - if (iter++ > DISCARD_ISSUE_RATE) - break; - } else if (dc->state == D_DONE) { + list_for_each_entry_safe(dc, tmp, wait_list, list) { + if (!wait_cond || dc->state == D_DONE) { + if (dc->ref) + continue; + wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); } } mutex_unlock(&dcc->cmd_lock); +} - blk_finish_plug(&plug); +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + bool need_wait = false; - iter = 0; - congestion_wait(BLK_RW_SYNC, HZ/50); + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); + if (dc) { + if (dc->state == D_PREP) { + __punch_discard_cmd(sbi, dc, blkaddr); + } else { + dc->ref++; + need_wait = true; + } + } + mutex_unlock(&dcc->cmd_lock); - wait_event_interruptible(*q, - kthread_should_stop() || !list_empty(&dcc->discard_cmd_list)); - goto repeat; + if (need_wait) { + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); + mutex_unlock(&dcc->cmd_lock); + } } - -/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, - struct block_device *bdev, block_t blkstart, block_t blklen) +/* This comes from f2fs_put_super */ +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { - struct bio *bio = NULL; - block_t lblkstart = blkstart; - int err; + __issue_discard_cmd(sbi, false); + __wait_discard_cmd(sbi, false); +} - trace_f2fs_issue_discard(bdev, blkstart, blklen); +static int issue_discard_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; - if (sbi->s_ndevs) { - int devi = f2fs_target_device_index(sbi, blkstart); + __issue_discard_cmd(sbi, true); + __wait_discard_cmd(sbi, true); - blkstart -= FDEV(devi).start_blk; - } - err = __blkdev_issue_discard(bdev, - SECTOR_FROM_BLOCK(blkstart), - SECTOR_FROM_BLOCK(blklen), - GFP_NOFS, 0, &bio); - if (!err && bio) { - bio->bi_end_io = f2fs_submit_discard_endio; - bio->bi_opf |= REQ_SYNC; + congestion_wait(BLK_RW_SYNC, HZ/50); - __add_discard_cmd(sbi, bio, lblkstart, blklen); - wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); - } - return err; + wait_event_interruptible(*q, kthread_should_stop() || + atomic_read(&dcc->discard_cmd_cnt)); + goto repeat; } #ifdef CONFIG_BLK_DEV_ZONED @@ -796,6 +1079,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t sector, nr_sects; + block_t lblkstart = blkstart; int devi = 0; if (sbi->s_ndevs) { @@ -813,7 +1097,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: 
sector = SECTOR_FROM_BLOCK(blkstart); @@ -845,7 +1129,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -888,32 +1172,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return err; } -static void __add_discard_entry(struct f2fs_sb_info *sbi, - struct cp_control *cpc, struct seg_entry *se, - unsigned int start, unsigned int end) -{ - struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list; - struct discard_entry *new, *last; - - if (!list_empty(head)) { - last = list_last_entry(head, struct discard_entry, list); - if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len && - last->len < MAX_DISCARD_BLOCKS(sbi)) { - last->len += end - start; - goto done; - } - } - - new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); - INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; - new->len = end - start; - list_add_tail(&new->list, head); -done: - SM_I(sbi)->dcc_info->nr_discards += end - start; -} - static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { @@ -925,7 +1183,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); + struct discard_entry *de = NULL; + struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) @@ -957,14 +1217,24 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (check_only) return true; - __add_discard_entry(sbi, cpc, se, start, end); + if (!de) { + de = f2fs_kmem_cache_alloc(discard_entry_slab, + GFP_F2FS_ZERO); + de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); + list_add_tail(&de->list, head); + } + + for (i = start; i < end; i++) + __set_bit_le(i, (void *)de->discard_map); + + SM_I(sbi)->dcc_info->nr_discards += end - start; } return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -990,13 +1260,13 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -1026,10 +1296,10 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) continue; } next: - secno = GET_SECNO(sbi, start); - start_segno = secno * sbi->segs_per_sec; + secno = GET_SEC_FROM_SEG(sbi, start); + start_segno = GET_SEG_FROM_SEC(sbi, 
secno); if (!IS_CURSEC(sbi, secno) && - !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), sbi->segs_per_sec << sbi->log_blocks_per_seg); @@ -1043,22 +1313,46 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (force && entry->len < cpc->trim_minlen) - goto skip; - f2fs_issue_discard(sbi, entry->blkaddr, entry->len); - cpc->trimmed += entry->len; + unsigned int cur_pos = 0, next_pos, len, total_len = 0; + bool is_valid = test_bit_le(0, entry->discard_map); + +find_next: + if (is_valid) { + next_pos = find_next_zero_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + len = next_pos - cur_pos; + + if (force && len < cpc->trim_minlen) + goto skip; + + f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, + len); + cpc->trimmed += len; + total_len += len; + } else { + next_pos = find_next_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + } skip: + cur_pos = next_pos; + is_valid = !is_valid; + + if (cur_pos < sbi->blocks_per_seg) + goto find_next; + list_del(&entry->list); - SM_I(sbi)->dcc_info->nr_discards -= entry->len; + SM_I(sbi)->dcc_info->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } + + wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct discard_cmd_control *dcc; - int err = 0; + int err = 0, i; if (SM_I(sbi)->dcc_info) { dcc = SM_I(sbi)->dcc_info; @@ -1069,12 +1363,18 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return -ENOMEM; - INIT_LIST_HEAD(&dcc->discard_entry_list); - INIT_LIST_HEAD(&dcc->discard_cmd_list); + INIT_LIST_HEAD(&dcc->entry_list); + for (i = 0; i < MAX_PLIST_NUM; i++) + INIT_LIST_HEAD(&dcc->pend_list[i]); + INIT_LIST_HEAD(&dcc->wait_list); mutex_init(&dcc->cmd_lock); - atomic_set(&dcc->submit_discard, 0); + atomic_set(&dcc->issued_discard, 0); + atomic_set(&dcc->issing_discard, 0); + atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; - dcc->max_discards = 0; + dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->undiscard_blks = 0; + dcc->root = RB_ROOT; init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; @@ -1091,20 +1391,22 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) return err; } -static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - if (dcc && dcc->f2fs_issue_discard) { + if (!dcc) + return; + + if (dcc->f2fs_issue_discard) { struct task_struct *discard_thread = dcc->f2fs_issue_discard; dcc->f2fs_issue_discard = NULL; kthread_stop(discard_thread); } - if (free) { - kfree(dcc); - SM_I(sbi)->dcc_info = NULL; - } + + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -1345,6 +1647,17 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } +static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno = curseg->segno + 1; + struct free_segmap_info *free_i = FREE_I(sbi); + + if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) + return !test_bit(segno, 
free_i->free_segmap); + return 0; +} + /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG @@ -1355,8 +1668,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; - unsigned int hint = *newseg / sbi->segs_per_sec; - unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); + unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); + unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); unsigned int left_start = hint; bool init = true; int go_left = 0; @@ -1366,8 +1679,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - (hint + 1) * sbi->segs_per_sec, *newseg + 1); - if (segno < (hint + 1) * sbi->segs_per_sec) + GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); + if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } find_other_zone: @@ -1398,8 +1711,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, secno = left_start; skip_left: hint = secno; - segno = secno * sbi->segs_per_sec; - zoneno = secno / sbi->secs_per_zone; + segno = GET_SEG_FROM_SEC(sbi, secno); + zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) @@ -1443,7 +1756,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) struct summary_footer *sum_footer; curseg->segno = curseg->next_segno; - curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); + curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; @@ -1456,6 +1769,20 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) __set_sit_entry_type(sbi, type, curseg->segno, modified); } +static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) +{ + /* if segs_per_sec is large than 1, we need to keep original policy. */ + if (sbi->segs_per_sec != 1) + return CURSEG_I(sbi, type)->segno; + + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + return 0; + + if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) + return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + return CURSEG_I(sbi, type)->segno; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. @@ -1474,6 +1801,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) if (test_opt(sbi, NOHEAP)) dir = ALLOC_RIGHT; + segno = __get_next_segno(sbi, type); get_new_segment(sbi, &segno, new_sec, dir); curseg->next_segno = segno; reset_curseg(sbi, type, 1); @@ -1549,12 +1877,15 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + unsigned segno = NULL_SEGNO; int i, cnt; bool reversed = false; /* need_SSR() already forces to do this */ - if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR)) + if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + curseg->next_segno = segno; return 1; + } /* For node segments, let's do SSR more intensively */ if (IS_NODESEG(type)) { @@ -1578,9 +1909,10 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) for (; cnt-- > 0; reversed ? 
i-- : i++) { if (i == type) continue; - if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, i, SSR)) + if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + curseg->next_segno = segno; return 1; + } } return 0; } @@ -1592,17 +1924,21 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) static void allocate_segment_by_default(struct f2fs_sb_info *sbi, int type, bool force) { + struct curseg_info *curseg = CURSEG_I(sbi, type); + if (force) new_curseg(sbi, type, true); else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); + else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) + new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type, true); else new_curseg(sbi, type, false); - stat_inc_seg_type(sbi, CURSEG_I(sbi, type)); + stat_inc_seg_type(sbi, curseg); } void allocate_new_segments(struct f2fs_sb_info *sbi) @@ -1734,18 +2070,16 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type) if (p_type == DATA) { struct inode *inode = page->mapping->host; - if (S_ISDIR(inode->i_mode)) - return CURSEG_HOT_DATA; - else if (is_cold_data(page) || file_is_cold(inode)) + if (is_cold_data(page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - else - return CURSEG_WARM_DATA; + if (is_inode_flag_set(inode, FI_HOT_DATA)) + return CURSEG_HOT_DATA; + return CURSEG_WARM_DATA; } else { if (IS_DNODE(page)) return is_cold_node(page) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; - else - return CURSEG_COLD_NODE; + return CURSEG_COLD_NODE; } } @@ -1788,15 +2122,14 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); /* - * SIT information should be updated before segment allocation, - * since SSR needs latest valid block information. + * SIT information should be updated after segment allocation, + * since we need to keep dirty segments precisely under SSR. 
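	 * (refresh_sit_entry() below updates valid-block counts and the
	 * dirty segmap that the SSR victim search scans, so it now runs
	 * only after the next current segment has been chosen)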
*/ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); - mutex_unlock(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) @@ -1868,11 +2201,11 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) f2fs_update_data_blkaddr(dn, fio->new_blkaddr); } -void rewrite_data_page(struct f2fs_io_info *fio) +int rewrite_data_page(struct f2fs_io_info *fio) { fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - f2fs_submit_page_mbio(fio); + return f2fs_submit_page_bio(fio); } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -2437,7 +2770,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) se = get_seg_entry(sbi, segno); /* add discard candidates */ - if (cpc->reason != CP_DISCARD) { + if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc, false); } @@ -2473,7 +2806,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) @@ -2501,13 +2834,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -2540,7 +2873,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -2573,7 +2906,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time); - sit_i->mounted_time = CURRENT_TIME_SEC.tv_sec; + sit_i->mounted_time = ktime_get_real_seconds(); mutex_init(&sit_i->sentry_lock); return 0; } @@ -2591,12 +2924,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -2672,10 +3005,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) /* build discard map only one time */ if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + 
se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2699,10 +3039,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2746,7 +3091,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) continue; if (valid_blocks > sbi->blocks_per_seg) { @@ -2764,7 +3109,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -2786,7 +3131,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } @@ -2852,6 +3197,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; @@ -2988,7 +3334,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; destroy_flush_cmd_control(sbi, true); - destroy_discard_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5e8ad4280a5016d293817115a31a94daaf8d2d49..010f336a75730e5920d67a38d04b64cebc59bf57 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -21,78 +21,84 @@ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ /* L: Logical segment # in volume, R: Relative segment # in main area */ -#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) -#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) +#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) +#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) -#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) +#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) #define IS_CURSEG(sbi, seg) \ - ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, 
CURSEG_WARM_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ - ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - sbi->segs_per_sec)) \ + (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + (sbi)->segs_per_sec)) \ #define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) #define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) -#define MAIN_SECS(sbi) (sbi->total_sections) +#define MAIN_SECS(sbi) ((sbi)->total_sections) #define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) -#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) -#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \ - sbi->log_blocks_per_seg)) +#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ + (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ - (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) + (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) #define NEXT_FREE_BLKADDR(sbi, curseg) \ - (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) + (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ + ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? 
\ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) -#define GET_SECNO(sbi, segno) \ - ((segno) / sbi->segs_per_sec) -#define GET_ZONENO_FROM_SEGNO(sbi, segno) \ - ((segno / sbi->segs_per_sec) / sbi->secs_per_zone) +#define BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define GET_SEC_FROM_SEG(sbi, segno) \ + ((segno) / (sbi)->segs_per_sec) +#define GET_SEG_FROM_SEC(sbi, secno) \ + ((secno) * (sbi)->segs_per_sec) +#define GET_ZONE_FROM_SEC(sbi, secno) \ + ((secno) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEG(sbi, segno) \ + GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) #define GET_SUM_BLOCK(sbi, segno) \ - ((sbi->sm_info->ssa_blkaddr) + segno) + ((sbi)->sm_info->ssa_blkaddr + (segno)) #define GET_SUM_TYPE(footer) ((footer)->entry_type) -#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ - (segno % sit_i->sents_per_block) + ((segno) % (sit_i)->sents_per_block) #define SIT_BLOCK_OFFSET(segno) \ - (segno / SIT_ENTRY_PER_BLOCK) + ((segno) / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(segno) \ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ @@ -103,7 +109,7 @@ #define SECTOR_FROM_BLOCK(blk_addr) \ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ - (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) + ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -132,7 +138,10 @@ enum { */ enum { GC_CB = 0, - GC_GREEDY + GC_GREEDY, + ALLOC_NEXT, + FLUSH_DEVICE, + MAX_GC_POLICY, }; /* @@ -227,6 +236,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ + + unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { @@ -303,17 +314,17 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - return &sit_i->sec_entries[GET_SECNO(sbi, segno)]; + return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno, int section) + unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. 
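	 * (per-segment counts come from get_seg_entry() and per-section
	 * counts from get_sec_entry(); sec_entries is only allocated when
	 * segs_per_sec > 1, which matches the check below)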
*/ - if (section > 1) + if (use_section && sbi->segs_per_sec > 1) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; @@ -358,8 +369,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -379,7 +390,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) @@ -390,8 +402,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -412,7 +424,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; @@ -477,12 +490,12 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) static inline int overprovision_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec; + return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi)); } static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec; + return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } static inline bool need_SSR(struct f2fs_sb_info *sbi) @@ -495,7 +508,7 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) return false; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + 1); + 2 * reserved_sections(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, @@ -540,6 +553,7 @@ static inline int utilization(struct f2fs_sb_info *sbi) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 +#define DEF_MIN_HOT_BLOCKS 16 enum { F2FS_IPU_FORCE, @@ -547,17 +561,15 @@ enum { F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, + F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update(struct inode *inode) +static inline bool need_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; - /* IPU can be done only for the user data */ - if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) - return false; - if (test_opt(sbi, LFS)) return false; @@ -572,6 +584,15 @@ static inline bool need_inplace_update(struct inode *inode) utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; + /* + * IPU for rewrite async 
pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) + return true; + /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && is_inode_flag_set(inode, FI_NEED_IPU)) @@ -691,8 +712,9 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); - return sit_i->elapsed_time + CURRENT_TIME_SEC.tv_sec - - sit_i->mounted_time; + time64_t now = ktime_get_real_seconds(); + + return sit_i->elapsed_time + now - sit_i->mounted_time; } static inline void set_summary(struct f2fs_summary *sum, nid_t nid, @@ -719,7 +741,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >= + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= sbi->fggc_threshold) return true; return false; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 96fe8ed7310001c666c5f370ea735b5aecdc2518..83355ec4a92cdeb86d4be8d4630e01e7b0c96a28 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", [FAULT_IO] = "IO error", [FAULT_CHECKPOINT] = "checkpoint error", }; @@ -82,6 +83,7 @@ enum { Opt_discard, Opt_nodiscard, Opt_noheap, + Opt_heap, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, @@ -116,6 +118,7 @@ static match_table_t f2fs_tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, + {Opt_heap, "heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, @@ -293,6 +296,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); @@ -318,6 +322,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_hot_blocks), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), @@ -436,6 +441,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noheap: set_opt(sbi, NOHEAP); break; + case Opt_heap: + clear_opt(sbi, NOHEAP); + break; #ifdef CONFIG_F2FS_FS_XATTR case Opt_user_xattr: set_opt(sbi, XATTR_USER); @@ -787,7 +795,14 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bio(sbi, NULL_ADDR); + f2fs_wait_discard_bios(sbi); + + if (!sbi->discard_blks) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + write_checkpoint(sbi, &cpc); + } /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); @@ -913,7 +928,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISCARD)) seq_puts(seq, ",discard"); 
if (test_opt(sbi, NOHEAP)) - seq_puts(seq, ",no_heap_alloc"); + seq_puts(seq, ",no_heap"); + else + seq_puts(seq, ",heap"); #ifdef CONFIG_F2FS_FS_XATTR if (test_opt(sbi, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -986,7 +1003,7 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) if ((i % 10) == 0) seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u", se->type, - get_valid_blocks(sbi, i, 1)); + get_valid_blocks(sbi, i, false)); if ((i % 10) == 9 || i == (total_segs - 1)) seq_putc(seq, '\n'); else @@ -1012,7 +1029,7 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u|", se->type, - get_valid_blocks(sbi, i, 1)); + get_valid_blocks(sbi, i, false)); for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) seq_printf(seq, " %.2x", se->cur_valid_map[j]); seq_putc(seq, '\n'); @@ -1046,6 +1063,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, NOHEAP); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_mounted_blkzoned(sbi->sb)) { @@ -1307,7 +1325,7 @@ static int __f2fs_commit_super(struct buffer_head *bh, unlock_buffer(bh); /* it's rare case, we can do fua all the time */ - return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA); + return __sync_dirty_buffer(bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA); } static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, @@ -1483,6 +1501,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + le32_to_cpu(raw_super->segment_count)); + return 1; + } + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; @@ -1555,6 +1580,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + atomic_set(&sbi->wb_sync_req, 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); @@ -1917,6 +1944,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->gc_mutex); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); + init_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); @@ -2022,6 +2050,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) f2fs_join_shrinker(sbi); + err = f2fs_build_stats(sbi); + if (err) + goto free_nm; + /* if there are any orphan nodes, free them */ err = recover_orphan_inodes(sbi); if (err) @@ -2046,10 +2078,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_root_inode; } - err = f2fs_build_stats(sbi); - if (err) - goto free_root_inode; - if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -2143,7 +2171,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } - f2fs_destroy_stats(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2161,6 +2188,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); + f2fs_destroy_stats(sbi); free_nm: destroy_node_manager(sbi); free_sm: diff --git
a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 73b4e1d1912a7ff7370c1c6f2d649b770ed70acc..bccbbf2616d2b2cd3e5c3380b809f545dc1802a9 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -59,7 +59,7 @@ void f2fs_trace_pid(struct page *page) pid_t pid = task_pid_nr(current); void *p; - page->private = pid; + set_page_private(page, (unsigned long)pid); if (radix_tree_preload(GFP_NOFS)) return; @@ -138,7 +138,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, radix_tree_for_each_slot(slot, &pids, &iter, first_index) { results[ret] = iter.index; - if (++ret == PIDVEC_SIZE) + if (++ret == max_items) break; } return ret; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 7298a4488f7f59cfc68587c4fbc3b1ac6293f9aa..832c5110abab51208e77751056436e7ba7e7ff16 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -250,15 +250,13 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; - unsigned int inline_size = 0; + unsigned int inline_size = inline_xattr_size(inode); int err = 0; - inline_size = inline_xattr_size(inode); - if (!size && !inline_size) return -ENODATA; - txattr_addr = kzalloc(inline_size + size + sizeof(__u32), + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -328,13 +326,14 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; - size_t size = PAGE_SIZE, inline_size = 0; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = VALID_XATTR_BLOCK_SIZE; + unsigned int inline_size = inline_xattr_size(inode); void *txattr_addr; int err; - inline_size = inline_xattr_size(inode); - - txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, + GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -358,19 +357,19 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, } /* read from xattr node block */ - if (F2FS_I(inode)->i_xattr_nid) { + if (xnid) { struct page *xpage; void *xattr_addr; /* The inode already has an extended attribute block. 
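
/*
 * [Editor's aside -- the gang_lookup_pids() fix above in miniature.
 * This toy version replaces the kernel's radix-tree walk with a plain
 * array so it builds standalone. Bounding the loop by the fixed
 * PIDVEC_SIZE instead of the caller's max_items could overfill a
 * smaller results buffer; the patched code honours the parameter.]
 */
#include <assert.h>
#include <stddef.h>

#define PIDVEC_SIZE 64

/* copy up to max_items ids into results; return the number copied */
static size_t demo_gang_lookup(int *results, size_t max_items,
                               const int *src, size_t nr_src)
{
        size_t ret = 0, i;

        for (i = 0; i < nr_src; i++) {
                results[ret] = src[i];
                if (++ret == max_items) /* was: == PIDVEC_SIZE */
                        break;
        }
        return ret;
}

int main(void)
{
        const int pids[] = { 10, 20, 30, 40, 50 };
        int out[2]; /* the caller only has room for two entries */

        assert(demo_gang_lookup(out, 2, pids, 5) == 2);
        assert(out[1] == 20);
        return 0;
}
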
*/ - xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = get_node_page(sbi, xnid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); goto fail; } xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + memcpy(txattr_addr + inline_size, xattr_addr, size); f2fs_put_page(xpage, 1); } @@ -392,14 +391,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *txattr_addr, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - size_t inline_size = 0; + size_t inline_size = inline_xattr_size(inode); void *xattr_addr; struct page *xpage; nid_t new_nid = 0; int err; - inline_size = inline_xattr_size(inode); - if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) return -ENOSPC; @@ -454,7 +451,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, MAX_XATTR_BLOCK_SIZE); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); @@ -546,7 +543,9 @@ static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, const void *value, size_t size) { void *pval = entry->e_name + entry->e_name_len; - return (entry->e_value_size == size) && !memcmp(pval, value, size); + + return (le16_to_cpu(entry->e_value_size) == size) && + !memcmp(pval, value, size); } static int __f2fs_setxattr(struct inode *inode, int index, diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index d5a94928c11695cde054f6143cb604d0bf850fc0..dbcd1d16e66982e07233e66fe400e95597a6c2df 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -58,10 +58,10 @@ struct f2fs_xattr_entry { #define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) -#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) +#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND) #define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ - entry->e_name_len + le16_to_cpu(entry->e_value_size))) + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))) #define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ ENTRY_SIZE(entry))) @@ -72,8 +72,8 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) -#define MAX_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) -#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - sizeof(__u32)) +#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define XATTR_PADDING_SIZE (sizeof(__u32)) #define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ VALID_XATTR_BLOCK_SIZE) diff --git a/fs/fcntl.c b/fs/fcntl.c index be8fbe289087e61222103ec5675bb27857c502e0..f4e7267d117fb83b69f6f5be2305fb8b375077c8 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -420,6 +421,162 @@ SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, } #endif +#ifdef CONFIG_COMPAT +static int get_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) +{ + if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || + __get_user(kfl->l_type, &ufl->l_type) || + __get_user(kfl->l_whence, &ufl->l_whence) || + __get_user(kfl->l_start, &ufl->l_start) || + __get_user(kfl->l_len, &ufl->l_len) || + __get_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} + +static int put_compat_flock(struct flock *kfl, struct compat_flock __user *ufl) 
+{ + if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || + __put_user(kfl->l_type, &ufl->l_type) || + __put_user(kfl->l_whence, &ufl->l_whence) || + __put_user(kfl->l_start, &ufl->l_start) || + __put_user(kfl->l_len, &ufl->l_len) || + __put_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} + +#ifndef HAVE_ARCH_GET_COMPAT_FLOCK64 +static int get_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +{ + if (!access_ok(VERIFY_READ, ufl, sizeof(*ufl)) || + __get_user(kfl->l_type, &ufl->l_type) || + __get_user(kfl->l_whence, &ufl->l_whence) || + __get_user(kfl->l_start, &ufl->l_start) || + __get_user(kfl->l_len, &ufl->l_len) || + __get_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} +#endif + +#ifndef HAVE_ARCH_PUT_COMPAT_FLOCK64 +static int put_compat_flock64(struct flock *kfl, struct compat_flock64 __user *ufl) +{ + if (!access_ok(VERIFY_WRITE, ufl, sizeof(*ufl)) || + __put_user(kfl->l_type, &ufl->l_type) || + __put_user(kfl->l_whence, &ufl->l_whence) || + __put_user(kfl->l_start, &ufl->l_start) || + __put_user(kfl->l_len, &ufl->l_len) || + __put_user(kfl->l_pid, &ufl->l_pid)) + return -EFAULT; + return 0; +} +#endif + +static unsigned int +convert_fcntl_cmd(unsigned int cmd) +{ + switch (cmd) { + case F_GETLK64: + return F_GETLK; + case F_SETLK64: + return F_SETLK; + case F_SETLKW64: + return F_SETLKW; + } + + return cmd; +} + +COMPAT_SYSCALL_DEFINE3(fcntl64, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) +{ + mm_segment_t old_fs; + struct flock f; + long ret; + unsigned int conv_cmd; + + switch (cmd) { + case F_GETLK: + case F_SETLK: + case F_SETLKW: + ret = get_compat_flock(&f, compat_ptr(arg)); + if (ret != 0) + break; + old_fs = get_fs(); + set_fs(KERNEL_DS); + ret = sys_fcntl(fd, cmd, (unsigned long)&f); + set_fs(old_fs); + if (cmd == F_GETLK && ret == 0) { + /* GETLK was successful and we need to return the data... + * but it needs to fit in the compat structure. + * l_start shouldn't be too big, unless the original + * start + end is greater than COMPAT_OFF_T_MAX, in which + * case the app was asking for trouble, so we return + * -EOVERFLOW in that case. 
+ * l_len could be too big, in which case we just truncate it, + * and only allow the app to see that part of the conflicting + * lock that might make sense to it anyway + */ + + if (f.l_start > COMPAT_OFF_T_MAX) + ret = -EOVERFLOW; + if (f.l_len > COMPAT_OFF_T_MAX) + f.l_len = COMPAT_OFF_T_MAX; + if (ret == 0) + ret = put_compat_flock(&f, compat_ptr(arg)); + } + break; + + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: + ret = get_compat_flock64(&f, compat_ptr(arg)); + if (ret != 0) + break; + old_fs = get_fs(); + set_fs(KERNEL_DS); + conv_cmd = convert_fcntl_cmd(cmd); + ret = sys_fcntl(fd, conv_cmd, (unsigned long)&f); + set_fs(old_fs); + if ((conv_cmd == F_GETLK || conv_cmd == F_OFD_GETLK) && ret == 0) { + /* need to return lock information - see above for commentary */ + if (f.l_start > COMPAT_LOFF_T_MAX) + ret = -EOVERFLOW; + if (f.l_len > COMPAT_LOFF_T_MAX) + f.l_len = COMPAT_LOFF_T_MAX; + if (ret == 0) + ret = put_compat_flock64(&f, compat_ptr(arg)); + } + break; + + default: + ret = sys_fcntl(fd, cmd, arg); + break; + } + return ret; +} + +COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, + compat_ulong_t, arg) +{ + switch (cmd) { + case F_GETLK64: + case F_SETLK64: + case F_SETLKW64: + case F_OFD_GETLK: + case F_OFD_SETLK: + case F_OFD_SETLKW: + return -EINVAL; + } + return compat_sys_fcntl64(fd, cmd, arg); +} +#endif + /* Table to convert sigio signal codes into poll band bitmaps */ static const long band_table[NSIGPOLL] = { @@ -742,16 +899,10 @@ static int __init fcntl_init(void) * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY * is defined as O_NONBLOCK on some platforms and not on others. */ - BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( - O_RDONLY | O_WRONLY | O_RDWR | - O_CREAT | O_EXCL | O_NOCTTY | - O_TRUNC | O_APPEND | /* O_NONBLOCK | */ - __O_SYNC | O_DSYNC | FASYNC | - O_DIRECT | O_LARGEFILE | O_DIRECTORY | - O_NOFOLLOW | O_NOATIME | O_CLOEXEC | - __FMODE_EXEC | O_PATH | __O_TMPFILE | - __FMODE_NONOTIFY - )); + BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != + HWEIGHT32( + (VALID_OPEN_FLAGS & ~(O_NONBLOCK | O_NDELAY)) | + __FMODE_EXEC | __FMODE_NONOTIFY)); fasync_cache = kmem_cache_create("fasync_cache", sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL); diff --git a/fs/fhandle.c b/fs/fhandle.c index 5559168d5637310d2712eab97af1117a786cfba1..58a61f55e0d09f868bb1bac6a0b4d6e5042822ad 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "internal.h" #include "mount.h" @@ -264,3 +265,15 @@ SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, ret = do_handle_open(mountdirfd, handle, flags); return ret; } + +#ifdef CONFIG_COMPAT +/* + * Exactly like fs/open.c:sys_open_by_handle_at(), except that it + * doesn't set the O_LARGEFILE flag. 
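
/*
 * [Editor's aside -- a standalone model of the clamping policy in the
 * compat fcntl code above; the DEMO_* constants stand in for the real
 * COMPAT_OFF_T_MAX and EOVERFLOW. The rule: a lock start the 32-bit
 * caller cannot represent is an error, while an oversized length is
 * truncated so the caller still sees the nearest part of the
 * conflicting lock.]
 */
#include <assert.h>
#include <stdint.h>

#define DEMO_COMPAT_OFF_T_MAX 0x7fffffff
#define DEMO_EOVERFLOW        75

static int demo_clamp_to_compat(int64_t *start, int64_t *len)
{
        if (*start > DEMO_COMPAT_OFF_T_MAX)
                return -DEMO_EOVERFLOW; /* cannot report where it begins */
        if (*len > DEMO_COMPAT_OFF_T_MAX)
                *len = DEMO_COMPAT_OFF_T_MAX; /* truncate what is visible */
        return 0;
}

int main(void)
{
        int64_t start = 0, len = INT64_C(1) << 40;

        assert(demo_clamp_to_compat(&start, &len) == 0);
        assert(len == DEMO_COMPAT_OFF_T_MAX);

        start = INT64_C(1) << 40;
        assert(demo_clamp_to_compat(&start, &len) == -DEMO_EOVERFLOW);
        return 0;
}
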
+ */ +COMPAT_SYSCALL_DEFINE3(open_by_handle_at, int, mountdirfd, + struct file_handle __user *, handle, int, flags) +{ + return do_handle_open(mountdirfd, handle, flags); +} +#endif diff --git a/fs/file.c b/fs/file.c index ad6f094f2eff2f90314d237fa3f3cb16c5b79b66..1c2972e3a405455e292520032342276cc621d369 100644 --- a/fs/file.c +++ b/fs/file.c @@ -42,7 +42,7 @@ static void *alloc_fdmem(size_t size) if (data != NULL) return data; } - return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); } static void __free_fdtable(struct fdtable *fdt) diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 6e22748b0704c509edad6a0e93d0a6626601137b..b9ea99c5b5b31948882d37113d520ab9cdcf9564 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -292,7 +292,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) { - struct tree_descr empty_descr = {""}; + static const struct tree_descr empty_descr = {""}; struct fuse_conn *fc; int err; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b681b43c766e11daf45814fc694b4b988f218a43..c16d00e5326459c6608268f9ce3f68f30467eac6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -20,6 +20,7 @@ #include #include #include +#include MODULE_ALIAS_MISCDEV(FUSE_MINOR); MODULE_ALIAS("devname:fuse"); @@ -45,7 +46,7 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, INIT_LIST_HEAD(&req->list); INIT_LIST_HEAD(&req->intr_entry); init_waitqueue_head(&req->waitq); - atomic_set(&req->count, 1); + refcount_set(&req->count, 1); req->pages = pages; req->page_descs = page_descs; req->max_pages = npages; @@ -102,21 +103,20 @@ void fuse_request_free(struct fuse_req *req) void __fuse_get_request(struct fuse_req *req) { - atomic_inc(&req->count); + refcount_inc(&req->count); } /* Must be called with > 1 refcount */ static void __fuse_put_request(struct fuse_req *req) { - BUG_ON(atomic_read(&req->count) < 2); - atomic_dec(&req->count); + refcount_dec(&req->count); } -static void fuse_req_init_context(struct fuse_req *req) +static void fuse_req_init_context(struct fuse_conn *fc, struct fuse_req *req) { req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid()); req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid()); - req->in.h.pid = current->pid; + req->in.h.pid = pid_nr_ns(task_pid(current), fc->pid_ns); } void fuse_set_initialized(struct fuse_conn *fc) @@ -163,7 +163,7 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, goto out; } - fuse_req_init_context(req); + fuse_req_init_context(fc, req); __set_bit(FR_WAITING, &req->flags); if (for_background) __set_bit(FR_BACKGROUND, &req->flags); @@ -256,7 +256,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, if (!req) req = get_reserved_req(fc, file); - fuse_req_init_context(req); + fuse_req_init_context(fc, req); __set_bit(FR_WAITING, &req->flags); __clear_bit(FR_BACKGROUND, &req->flags); return req; @@ -264,7 +264,7 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { - if (atomic_dec_and_test(&req->count)) { + if (refcount_dec_and_test(&req->count)) { if (test_bit(FR_BACKGROUND, &req->flags)) { /* * We get here in the unlikely case that a background @@ -382,9 +382,9 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&fc->blocked_waitq); if (fc->num_background == fc->congestion_threshold && - 
fc->connected && fc->bdi_initialized) { - clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); - clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + fc->connected && fc->sb) { + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + clear_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } fc->num_background--; fc->active_background--; @@ -573,10 +573,9 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; - if (fc->num_background == fc->congestion_threshold && - fc->bdi_initialized) { - set_bdi_congested(&fc->bdi, BLK_RW_SYNC); - set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); + if (fc->num_background == fc->congestion_threshold && fc->sb) { + set_bdi_congested(fc->sb->s_bdi, BLK_RW_SYNC); + set_bdi_congested(fc->sb->s_bdi, BLK_RW_ASYNC); } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); @@ -1223,6 +1222,9 @@ static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_in *in; unsigned reqsize; + if (task_active_pid_ns(current) != fc->pid_ns) + return -EIO; + restart: spin_lock(&fiq->waitq.lock); err = -EAGAIN; @@ -1821,6 +1823,9 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_req *req; struct fuse_out_header oh; + if (task_active_pid_ns(current) != fc->pid_ns) + return -EIO; + if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ec238fb5a584b1c3cc5bc545806575160f78cde4..3ee4fdc3da9ec359ad847afa36354240329a2da6 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -58,7 +58,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) } INIT_LIST_HEAD(&ff->write_entry); - atomic_set(&ff->count, 1); + refcount_set(&ff->count, 1); RB_CLEAR_NODE(&ff->polled_node); init_waitqueue_head(&ff->poll_wait); @@ -77,7 +77,7 @@ void fuse_file_free(struct fuse_file *ff) static struct fuse_file *fuse_file_get(struct fuse_file *ff) { - atomic_inc(&ff->count); + refcount_inc(&ff->count); return ff; } @@ -88,7 +88,7 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) static void fuse_file_put(struct fuse_file *ff, bool sync) { - if (atomic_dec_and_test(&ff->count)) { + if (refcount_dec_and_test(&ff->count)) { struct fuse_req *req = ff->reserved_req; if (ff->fc->no_open) { @@ -293,7 +293,7 @@ static int fuse_release(struct inode *inode, struct file *file) void fuse_sync_release(struct fuse_file *ff, int flags) { - WARN_ON(atomic_read(&ff->count) != 1); + WARN_ON(refcount_read(&ff->count) > 1); fuse_prepare_release(ff, flags, FUSE_RELEASE); /* * iput(NULL) is a no-op and since the refcount is 1 and everything's @@ -2083,7 +2083,8 @@ static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma) return generic_file_mmap(file, vma); } -static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, +static int convert_fuse_file_lock(struct fuse_conn *fc, + const struct fuse_file_lock *ffl, struct file_lock *fl) { switch (ffl->type) { @@ -2098,7 +2099,14 @@ static int convert_fuse_file_lock(const struct fuse_file_lock *ffl, fl->fl_start = ffl->start; fl->fl_end = ffl->end; - fl->fl_pid = ffl->pid; + + /* + * Convert pid into the caller's pid namespace. If the pid + * does not map into the namespace fl_pid will get set to 0. 
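
/*
 * [Editor's aside -- on the atomic_t to refcount_t conversions in the
 * fuse hunks above. refcount_t keeps the familiar inc/dec_and_test
 * pattern but saturates instead of wrapping on overflow or underflow,
 * so a refcounting bug degrades into a leak plus a warning rather than
 * a use-after-free. A userspace model of the usage pattern only, using
 * C11 atomics and eliding the saturation checks:]
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

struct demo_req {
        atomic_uint count;
};

static void demo_get(struct demo_req *req)
{
        atomic_fetch_add(&req->count, 1);       /* ~ refcount_inc() */
}

static bool demo_put(struct demo_req *req)
{
        /* ~ refcount_dec_and_test(): true when the last reference drops */
        return atomic_fetch_sub(&req->count, 1) == 1;
}

int main(void)
{
        struct demo_req req;

        atomic_init(&req.count, 1);             /* ~ refcount_set(.., 1) */
        demo_get(&req);
        assert(!demo_put(&req));                /* one reference remains */
        assert(demo_put(&req));                 /* last ref: free it now */
        return 0;
}
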
+ */ + rcu_read_lock(); + fl->fl_pid = pid_vnr(find_pid_ns(ffl->pid, fc->pid_ns)); + rcu_read_unlock(); break; default: @@ -2147,7 +2155,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) args.out.args[0].value = &outarg; err = fuse_simple_request(fc, &args); if (!err) - err = convert_fuse_file_lock(&outarg.lk, fl); + err = convert_fuse_file_lock(fc, &outarg.lk, fl); return err; } @@ -2159,7 +2167,8 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) FUSE_ARGS(args); struct fuse_lk_in inarg; int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK; - pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0; + struct pid *pid = fl->fl_type != F_UNLCK ? task_tgid(current) : NULL; + pid_t pid_nr = pid_nr_ns(pid, fc->pid_ns); int err; if (fl->fl_lmops && fl->fl_lmops->lm_grant) { @@ -2168,10 +2177,13 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) } /* Unlock on close is handled by the flush method */ - if (fl->fl_flags & FL_CLOSE) + if ((fl->fl_flags & FL_CLOSE_POSIX) == FL_CLOSE_POSIX) return 0; - fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg); + if (pid && pid_nr == 0) + return -EOVERFLOW; + + fuse_lk_fill(&args, file, fl, opcode, pid_nr, flock, &inarg); err = fuse_simple_request(fc, &args); /* locking is restartable */ diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 32ac2c9b09c0302c99337c374263a6fa428b5cf4..1bd7ffdad593977013c1ddd233b2a91093471471 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -24,6 +24,8 @@ #include #include #include +#include +#include /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -137,7 +139,7 @@ struct fuse_file { u64 nodeid; /** Refcount */ - atomic_t count; + refcount_t count; /** FOPEN_* flags returned by open */ u32 open_flags; @@ -306,7 +308,7 @@ struct fuse_req { struct list_head intr_entry; /** refcount */ - atomic_t count; + refcount_t count; /** Unique ID for the interrupt request */ u64 intr_unique; @@ -448,7 +450,7 @@ struct fuse_conn { spinlock_t lock; /** Refcount */ - atomic_t count; + refcount_t count; /** Number of fuse_dev's */ atomic_t dev_count; @@ -461,6 +463,9 @@ struct fuse_conn { /** The group id for this mount */ kgid_t group_id; + /** The pid namespace for this mount */ + struct pid_namespace *pid_ns; + /** Maximum read size */ unsigned max_read; @@ -527,9 +532,6 @@ struct fuse_conn { /** Filesystem supports NFS exporting. 
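
/*
 * [Editor's aside -- on the FL_CLOSE_POSIX test in fuse_setlk() above.
 * FL_CLOSE_POSIX is, to the best of my reading, a multi-bit mask
 * (FL_POSIX | FL_CLOSE), so the code compares the masked value against
 * the whole mask; a bare (flags & mask) would also fire when only one
 * of the two bits is set. Toy demonstration with invented bit values:]
 */
#include <assert.h>

#define DEMO_FL_POSIX 0x1
#define DEMO_FL_FLOCK 0x2
#define DEMO_FL_CLOSE 0x4
#define DEMO_FL_CLOSE_POSIX (DEMO_FL_POSIX | DEMO_FL_CLOSE)

static int demo_posix_unlock_on_close(unsigned int flags)
{
        return (flags & DEMO_FL_CLOSE_POSIX) == DEMO_FL_CLOSE_POSIX;
}

int main(void)
{
        assert(demo_posix_unlock_on_close(DEMO_FL_POSIX | DEMO_FL_CLOSE));
        /* a flock-style unlock-on-close must not match */
        assert(!demo_posix_unlock_on_close(DEMO_FL_FLOCK | DEMO_FL_CLOSE));
        return 0;
}
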
Only set in INIT */ unsigned export_support:1; - /** Set if bdi is valid */ - unsigned bdi_initialized:1; - /** write-back cache policy (default is write-through) */ unsigned writeback_cache:1; @@ -631,9 +633,6 @@ struct fuse_conn { /** Negotiated minor version */ unsigned minor; - /** Backing dev info */ - struct backing_dev_info bdi; - /** Entry on the fuse_conn_list */ struct list_head entry; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 6fe6a88ecb4afd9eaaad1b0c75fe961108efc64a..5a1b58f8fef4d2437842b73d4c1c12b2e87a32c9 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -21,6 +21,7 @@ #include #include #include +#include MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -386,12 +387,6 @@ static void fuse_send_destroy(struct fuse_conn *fc) } } -static void fuse_bdi_destroy(struct fuse_conn *fc) -{ - if (fc->bdi_initialized) - bdi_destroy(&fc->bdi); -} - static void fuse_put_super(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); @@ -403,7 +398,6 @@ static void fuse_put_super(struct super_block *sb) list_del(&fc->entry); fuse_ctl_remove_conn(fc); mutex_unlock(&fuse_mutex); - fuse_bdi_destroy(fc); fuse_conn_put(fc); } @@ -608,7 +602,7 @@ void fuse_conn_init(struct fuse_conn *fc) memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); init_rwsem(&fc->killsb); - atomic_set(&fc->count, 1); + refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); init_waitqueue_head(&fc->reserved_req_waitq); @@ -626,14 +620,16 @@ void fuse_conn_init(struct fuse_conn *fc) fc->connected = 1; fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); + fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(fuse_conn_init); void fuse_conn_put(struct fuse_conn *fc) { - if (atomic_dec_and_test(&fc->count)) { + if (refcount_dec_and_test(&fc->count)) { if (fc->destroy_req) fuse_request_free(fc->destroy_req); + put_pid_ns(fc->pid_ns); fc->release(fc); } } @@ -641,7 +637,7 @@ EXPORT_SYMBOL_GPL(fuse_conn_put); struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) { - atomic_inc(&fc->count); + refcount_inc(&fc->count); return fc; } EXPORT_SYMBOL_GPL(fuse_conn_get); @@ -928,7 +924,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->no_flock = 1; } - fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); + fc->sb->s_bdi->ra_pages = + min(fc->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 
4096 : arg->max_write; fc->max_write = max_t(unsigned, 4096, fc->max_write); @@ -944,7 +941,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) arg->major = FUSE_KERNEL_VERSION; arg->minor = FUSE_KERNEL_MINOR_VERSION; - arg->max_readahead = fc->bdi.ra_pages * PAGE_SIZE; + arg->max_readahead = fc->sb->s_bdi->ra_pages * PAGE_SIZE; arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | @@ -976,27 +973,18 @@ static void fuse_free_conn(struct fuse_conn *fc) static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { int err; + char *suffix = ""; - fc->bdi.name = "fuse"; - fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; - /* fuse does it's own writeback accounting */ - fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; - - err = bdi_init(&fc->bdi); + if (sb->s_bdev) + suffix = "-fuseblk"; + err = super_setup_bdi_name(sb, "%u:%u%s", MAJOR(fc->dev), + MINOR(fc->dev), suffix); if (err) return err; - fc->bdi_initialized = 1; - - if (sb->s_bdev) { - err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", - MAJOR(fc->dev), MINOR(fc->dev)); - } else { - err = bdi_register_dev(&fc->bdi, fc->dev); - } - - if (err) - return err; + sb->s_bdi->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_SIZE; + /* fuse does its own writeback accounting */ + sb->s_bdi->capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT; /* * For a single fuse filesystem use max 1% of dirty + @@ -1010,7 +998,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) * * /sys/class/bdi//max_ratio */ - bdi_set_max_ratio(&fc->bdi, 1); + bdi_set_max_ratio(sb->s_bdi, 1); return 0; } @@ -1113,8 +1101,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (err) goto err_dev_free; - sb->s_bdi = &fc->bdi; - /* Handle umasking inside the fuse code */ if (sb->s_flags & MS_POSIXACL) fc->dont_mask = 1; @@ -1182,7 +1168,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) err_dev_free: fuse_dev_free(fud); err_put_conn: - fuse_bdi_destroy(fc); fuse_conn_put(fc); err_fput: fput(file); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 01b97c012c6e9635540eaf2126cd5358726c1082..4d810be532ddcb6d9d18b234383c5167b0780e32 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -38,11 +38,6 @@ struct metapath { __u16 mp_list[GFS2_MAX_META_HEIGHT]; }; -struct strip_mine { - int sm_first; - unsigned int sm_height; -}; - /** * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page * @ip: the inode @@ -252,6 +247,19 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp) return 1; } +/** + * metaptr1 - Return the first possible metadata pointer in a metapath buffer + * @height: The metadata height (0 = dinode) + * @mp: The metapath + */ +static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp) +{ + struct buffer_head *bh = mp->mp_bh[height]; + if (height == 0) + return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode))); + return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header))); +} + /** * metapointer - Return pointer to start of metadata in a buffer * @height: The metadata height (0 = dinode) @@ -264,10 +272,8 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp) static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) { - struct buffer_head *bh = mp->mp_bh[height]; - unsigned int
head_size = (height > 0) ? - sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); - return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height]; + __be64 *p = metaptr1(height, mp); + return p + mp->mp_list[height]; } static void gfs2_metapath_ra(struct gfs2_glock *gl, @@ -295,6 +301,23 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl, } } +/** + * lookup_mp_height - helper function for lookup_metapath + * @ip: the inode + * @mp: the metapath + * @h: the height which needs looking up + */ +static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h) +{ + __be64 *ptr = metapointer(h, mp); + u64 dblock = be64_to_cpu(*ptr); + + if (!dblock) + return h + 1; + + return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]); +} + /** * lookup_metapath - Walk the metadata tree to a specific point * @ip: The inode @@ -316,17 +339,10 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) { unsigned int end_of_metadata = ip->i_height - 1; unsigned int x; - __be64 *ptr; - u64 dblock; int ret; for (x = 0; x < end_of_metadata; x++) { - ptr = metapointer(x, mp); - dblock = be64_to_cpu(*ptr); - if (!dblock) - return x + 1; - - ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]); + ret = lookup_mp_height(ip, mp, x); if (ret) return ret; } @@ -334,6 +350,35 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp) return ip->i_height; } +/** + * fillup_metapath - fill up buffers for the metadata path to a specific height + * @ip: The inode + * @mp: The metapath + * @h: The height to which it should be mapped + * + * Similar to lookup_metapath, but does lookups for a range of heights + * + * Returns: error or height of metadata tree + */ + +static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h) +{ + unsigned int start_h = h - 1; + int ret; + + if (h) { + /* find the first buffer we need to look up. 
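
/*
 * [Editor's aside -- what a gfs2 "metapath" encodes, since the helpers
 * above all walk one. mp_list[h] is the pointer slot to follow at
 * height h of the metadata tree, i.e. the digits of the logical block
 * number in base sd_inptrs. A sketch with an invented fanout (gfs2
 * computes the real one from the block size; find_metapath() itself is
 * untouched by this patch):]
 */
#include <assert.h>
#include <stdint.h>

#define DEMO_HEIGHT 3  /* levels of indirection */
#define DEMO_INPTRS 16 /* pointers per indirect block (invented) */

static void demo_find_metapath(uint64_t lblock, unsigned int *mp_list)
{
        int h;

        for (h = DEMO_HEIGHT - 1; h >= 0; h--) {
                mp_list[h] = lblock % DEMO_INPTRS;
                lblock /= DEMO_INPTRS;
        }
}

int main(void)
{
        unsigned int mp_list[DEMO_HEIGHT];

        /* block 257 = 1*256 + 0*16 + 1 -> slots 1, 0, 1 from the top */
        demo_find_metapath(257, mp_list);
        assert(mp_list[0] == 1 && mp_list[1] == 0 && mp_list[2] == 1);
        return 0;
}
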
*/ + while (start_h > 0 && mp->mp_bh[start_h] == NULL) + start_h--; + for (; start_h < h; start_h++) { + ret = lookup_mp_height(ip, mp, start_h); + if (ret) + return ret; + } + } + return ip->i_height; +} + static inline void release_metapath(struct metapath *mp) { int i; @@ -422,6 +467,13 @@ enum alloc_state { /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ }; +static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt) +{ + if (hgt) + return sdp->sd_inptrs; + return sdp->sd_diptrs; +} + /** * gfs2_bmap_alloc - Build a metadata tree of the requested height * @inode: The GFS2 inode @@ -620,7 +672,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, BUG_ON(maxlen == 0); - memset(mp.mp_bh, 0, sizeof(mp.mp_bh)); + memset(&mp, 0, sizeof(mp)); bmap_lock(ip, create); clear_buffer_mapped(bh_map); clear_buffer_new(bh_map); @@ -701,252 +753,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi return ret; } -/** - * do_strip - Look for a layer a particular layer of the file and strip it off - * @ip: the inode - * @dibh: the dinode buffer - * @bh: A buffer of pointers - * @top: The first pointer in the buffer - * @bottom: One more than the last pointer - * @height: the height this buffer is at - * @sm: a pointer to a struct strip_mine - * - * Returns: errno - */ - -static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, - struct buffer_head *bh, __be64 *top, __be64 *bottom, - unsigned int height, struct strip_mine *sm) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_rgrp_list rlist; - struct gfs2_trans *tr; - u64 bn, bstart; - u32 blen, btotal; - __be64 *p; - unsigned int rg_blocks = 0; - int metadata; - unsigned int revokes = 0; - int x; - int error; - int jblocks_rqsted; - - error = gfs2_rindex_update(sdp); - if (error) - return error; - - if (!*top) - sm->sm_first = 0; - - if (height != sm->sm_height) - return 0; - - if (sm->sm_first) { - top++; - sm->sm_first = 0; - } - - metadata = (height != ip->i_height - 1); - if (metadata) - revokes = (height) ? 
sdp->sd_inptrs : sdp->sd_diptrs; - else if (ip->i_depth) - revokes = sdp->sd_inptrs; - - memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - bstart = 0; - blen = 0; - - for (p = top; p < bottom; p++) { - if (!*p) - continue; - - bn = be64_to_cpu(*p); - - if (bstart + blen == bn) - blen++; - else { - if (bstart) - gfs2_rlist_add(ip, &rlist, bstart); - - bstart = bn; - blen = 1; - } - } - - if (bstart) - gfs2_rlist_add(ip, &rlist, bstart); - else - goto out; /* Nothing to do */ - - gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE); - - for (x = 0; x < rlist.rl_rgrps; x++) { - struct gfs2_rgrpd *rgd; - rgd = rlist.rl_ghs[x].gh_gl->gl_object; - rg_blocks += rgd->rd_length; - } - - error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs); - if (error) - goto out_rlist; - - if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */ - gfs2_rs_deltree(&ip->i_res); - -restart: - jblocks_rqsted = rg_blocks + RES_DINODE + - RES_INDIRECT + RES_STATFS + RES_QUOTA + - gfs2_struct2blk(sdp, revokes, sizeof(u64)); - if (jblocks_rqsted > atomic_read(&sdp->sd_log_thresh2)) - jblocks_rqsted = atomic_read(&sdp->sd_log_thresh2); - error = gfs2_trans_begin(sdp, jblocks_rqsted, revokes); - if (error) - goto out_rg_gunlock; - - tr = current->journal_info; - down_write(&ip->i_rw_mutex); - - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_trans_add_meta(ip->i_gl, bh); - - bstart = 0; - blen = 0; - btotal = 0; - - for (p = top; p < bottom; p++) { - if (!*p) - continue; - - /* check for max reasonable journal transaction blocks */ - if (tr->tr_num_buf_new + RES_STATFS + - RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) { - if (rg_blocks >= tr->tr_num_buf_new) - rg_blocks -= tr->tr_num_buf_new; - else - rg_blocks = 0; - break; - } - - bn = be64_to_cpu(*p); - - if (bstart + blen == bn) - blen++; - else { - if (bstart) { - __gfs2_free_blocks(ip, bstart, blen, metadata); - btotal += blen; - } - - bstart = bn; - blen = 1; - } - - *p = 0; - gfs2_add_inode_blocks(&ip->i_inode, -1); - } - if (p == bottom) - rg_blocks = 0; - - if (bstart) { - __gfs2_free_blocks(ip, bstart, blen, metadata); - btotal += blen; - } - - gfs2_statfs_change(sdp, 0, +btotal, 0); - gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, - ip->i_inode.i_gid); - - ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); - - gfs2_dinode_out(ip, dibh->b_data); - - up_write(&ip->i_rw_mutex); - - gfs2_trans_end(sdp); - - if (rg_blocks) - goto restart; - -out_rg_gunlock: - gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs); -out_rlist: - gfs2_rlist_free(&rlist); -out: - return error; -} - -/** - * recursive_scan - recursively scan through the end of a file - * @ip: the inode - * @dibh: the dinode buffer - * @mp: the path through the metadata to the point to start - * @height: the height the recursion is at - * @block: the indirect block to look at - * @first: 1 if this is the first block - * @sm: data opaque to this function to pass to @bc - * - * When this is first called @height and @block should be zero and - * @first should be 1. 
- * - * Returns: errno - */ - -static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh, - struct metapath *mp, unsigned int height, - u64 block, int first, struct strip_mine *sm) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct buffer_head *bh = NULL; - __be64 *top, *bottom; - u64 bn; - int error; - int mh_size = sizeof(struct gfs2_meta_header); - - if (!height) { - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - return error; - dibh = bh; - - top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0]; - bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs; - } else { - error = gfs2_meta_indirect_buffer(ip, height, block, &bh); - if (error) - return error; - - top = (__be64 *)(bh->b_data + mh_size) + - (first ? mp->mp_list[height] : 0); - - bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs; - } - - error = do_strip(ip, dibh, bh, top, bottom, height, sm); - if (error) - goto out; - - if (height < ip->i_height - 1) { - - gfs2_metapath_ra(ip->i_gl, bh, top); - - for (; top < bottom; top++, first = 0) { - if (!*top) - continue; - - bn = be64_to_cpu(*top); - - error = recursive_scan(ip, dibh, mp, height + 1, bn, - first, sm); - if (error) - break; - } - } -out: - brelse(bh); - return error; -} - - /** * gfs2_block_truncate_page - Deal with zeroing out data for truncate * @@ -1106,41 +912,406 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) return error; } -static int trunc_dealloc(struct gfs2_inode *ip, u64 size) +/** + * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein + * @ip: inode + * @rd_gh: holder of resource group glock + * @mp: current metapath fully populated with buffers + * @btotal: place to keep count of total blocks freed + * @hgt: height we're processing + * @preserve1: true if this is the first call to this function for this height + * + * We sweep a metadata buffer (provided by the metapath) for blocks we need to + * free, and free them all. However, we do it one rgrp at a time. If this + * block has references to multiple rgrps, we break it into individual + * transactions. This allows other processes to use the rgrps while we're + * focused on a single one, for better concurrency / performance. + * At every transaction boundary, we rewrite the inode into the journal. + * That way the bitmaps are kept consistent with the inode and we can recover + * if we're interrupted by power-outages. + * + * Returns: 0, or return code if an error occurred. + * *btotal has the total number of blocks freed + */ +static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh, + const struct metapath *mp, u32 *btotal, int hgt, + bool preserve1) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int height = ip->i_height; - u64 lblock; - struct metapath mp; - int error; + struct gfs2_rgrpd *rgd; + struct gfs2_trans *tr; + struct buffer_head *bh = mp->mp_bh[hgt]; + __be64 *top, *bottom, *p; + int blks_outside_rgrp; + u64 bn, bstart, isize_blks; + s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */ + int meta = ((hgt != ip->i_height - 1) ? 1 : 0); + int ret = 0; + bool buf_in_tr = false; /* buffer was added to transaction */ + + if (gfs2_metatype_check(sdp, bh, + (hgt ?
GFS2_METATYPE_IN : GFS2_METATYPE_DI))) + return -EIO; + +more_rgrps: + blks_outside_rgrp = 0; + bstart = 0; + blen = 0; + top = metapointer(hgt, mp); /* first ptr from metapath */ + /* If we're keeping some data at the truncation point, we've got to + preserve the metadata tree by adding 1 to the starting metapath. */ + if (preserve1) + top++; + + bottom = (__be64 *)(bh->b_data + bh->b_size); + + for (p = top; p < bottom; p++) { + if (!*p) + continue; + bn = be64_to_cpu(*p); + if (gfs2_holder_initialized(rd_gh)) { + rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object; + gfs2_assert_withdraw(sdp, + gfs2_glock_is_locked_by_me(rd_gh->gh_gl)); + } else { + rgd = gfs2_blk2rgrpd(sdp, bn, false); + ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + 0, rd_gh); + if (ret) + goto out; + + /* Must be done with the rgrp glock held: */ + if (gfs2_rs_active(&ip->i_res) && + rgd == ip->i_res.rs_rbm.rgd) + gfs2_rs_deltree(&ip->i_res); + } + + if (!rgrp_contains_block(rgd, bn)) { + blks_outside_rgrp++; + continue; + } + + /* The size of our transactions will be unknown until we + actually process all the metadata blocks that relate to + the rgrp. So we estimate. We know it can't be more than + the dinode's i_blocks and we don't want to exceed the + journal flush threshold, sd_log_thresh2. */ + if (current->journal_info == NULL) { + unsigned int jblocks_rqsted, revokes; + + jblocks_rqsted = rgd->rd_length + RES_DINODE + + RES_INDIRECT; + isize_blks = gfs2_get_inode_blocks(&ip->i_inode); + if (isize_blks > atomic_read(&sdp->sd_log_thresh2)) + jblocks_rqsted += + atomic_read(&sdp->sd_log_thresh2); + else + jblocks_rqsted += isize_blks; + revokes = jblocks_rqsted; + if (meta) + revokes += hptrs(sdp, hgt); + else if (ip->i_depth) + revokes += sdp->sd_inptrs; + ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes); + if (ret) + goto out_unlock; + down_write(&ip->i_rw_mutex); + } + /* check if we will exceed the transaction blocks requested */ + tr = current->journal_info; + if (tr->tr_num_buf_new + RES_STATFS + + RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) { + /* We set blks_outside_rgrp to ensure the loop will + be repeated for the same rgrp, but with a new + transaction. */ + blks_outside_rgrp++; + /* This next part is tricky. If the buffer was added + to the transaction, we've already set some block + pointers to 0, so we better follow through and free + them, or we will introduce corruption (so break). + This may be impossible, or at least rare, but I + decided to cover the case regardless. + + If the buffer was not added to the transaction + (this call), doing so would exceed our transaction + size, so we need to end the transaction and start a + new one (so goto). */ + + if (buf_in_tr) + break; + goto out_unlock; + } + + gfs2_trans_add_meta(ip->i_gl, bh); + buf_in_tr = true; + *p = 0; + if (bstart + blen == bn) { + blen++; + continue; + } + if (bstart) { + __gfs2_free_blocks(ip, bstart, (u32)blen, meta); + (*btotal) += blen; + gfs2_add_inode_blocks(&ip->i_inode, -blen); + } + bstart = bn; + blen = 1; + } + if (bstart) { + __gfs2_free_blocks(ip, bstart, (u32)blen, meta); + (*btotal) += blen; + gfs2_add_inode_blocks(&ip->i_inode, -blen); + } +out_unlock: + if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks + outside the rgrp we just processed, + do it all over again. */ + if (current->journal_info) { + struct buffer_head *dibh = mp->mp_bh[0]; + + /* Every transaction boundary, we rewrite the dinode + to keep its di_blocks current in case of failure. 
*/ + ip->i_inode.i_mtime = ip->i_inode.i_ctime = + current_time(&ip->i_inode); + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + } + gfs2_glock_dq_uninit(rd_gh); + cond_resched(); + goto more_rgrps; + } +out: + return ret; +} + +/** + * find_nonnull_ptr - find a non-null pointer given a metapath and height + * assumes the metapath is valid (with buffers) out to height h + * @mp: starting metapath + * @h: desired height to search + * + * Returns: true if a non-null pointer was found in the metapath buffer + * false if all remaining pointers are NULL in the buffer + */ +static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp, + unsigned int h) +{ + __be64 *ptr; + unsigned int ptrs = hptrs(sdp, h) - 1; + + while (true) { + ptr = metapointer(h, mp); + if (*ptr) /* if we have a non-null pointer */ + return true; + + if (mp->mp_list[h] < ptrs) + mp->mp_list[h]++; + else + return false; /* no more pointers in this buffer */ + } +} + +enum dealloc_states { + DEALLOC_MP_FULL = 0, /* Strip a metapath with all buffers read in */ + DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */ + DEALLOC_FILL_MP = 2, /* Fill in the metapath to the given height. */ + DEALLOC_DONE = 3, /* process complete */ +}; - if (!size) +/** + * trunc_dealloc - truncate a file down to a desired size + * @ip: inode to truncate + * @newsize: The desired size of the file + * + * This function truncates a file to newsize. It works from the + * bottom up, and from the right to the left. In other words, it strips off + * the highest layer (data) before stripping any of the metadata. Doing it + * this way is best in case the operation is interrupted by power failure, etc. + * The dinode is rewritten in every transaction to guarantee integrity. + */ +static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct metapath mp; + struct buffer_head *dibh, *bh; + struct gfs2_holder rd_gh; + u64 lblock; + __u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */ + unsigned int strip_h = ip->i_height - 1; + u32 btotal = 0; + int ret, state; + int mp_h; /* metapath buffers are read in to this height */ + sector_t last_ra = 0; + u64 prev_bnr = 0; + bool preserve1; /* need to preserve the first meta pointer? 
*/ + + if (!newsize) lblock = 0; else - lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; + lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift; + memset(&mp, 0, sizeof(mp)); find_metapath(sdp, lblock, &mp, ip->i_height); - error = gfs2_rindex_update(sdp); - if (error) - return error; - error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); - if (error) - return error; + memcpy(&nbof, &mp.mp_list, sizeof(nbof)); + + ret = gfs2_meta_inode_buffer(ip, &dibh); + if (ret) + return ret; - while (height--) { - struct strip_mine sm; - sm.sm_first = !!size; - sm.sm_height = height; + mp.mp_bh[0] = dibh; + ret = lookup_metapath(ip, &mp); + if (ret == ip->i_height) + state = DEALLOC_MP_FULL; /* We have a complete metapath */ + else + state = DEALLOC_FILL_MP; /* deal with partial metapath */ - error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm); - if (error) + ret = gfs2_rindex_update(sdp); + if (ret) + goto out_metapath; + + ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (ret) + goto out_metapath; + gfs2_holder_mark_uninitialized(&rd_gh); + + mp_h = strip_h; + + while (state != DEALLOC_DONE) { + switch (state) { + /* Truncate a full metapath at the given strip height. + * Note that strip_h == mp_h in order to be in this state. */ + case DEALLOC_MP_FULL: + if (mp_h > 0) { /* issue read-ahead on metadata */ + __be64 *top; + + bh = mp.mp_bh[mp_h - 1]; + if (bh->b_blocknr != last_ra) { + last_ra = bh->b_blocknr; + top = metaptr1(mp_h - 1, &mp); + gfs2_metapath_ra(ip->i_gl, bh, top); + } + } + /* If we're truncating to a non-zero size and the mp is + at the beginning of file for the strip height, we + need to preserve the first metadata pointer. */ + preserve1 = (newsize && + (mp.mp_list[mp_h] == nbof[mp_h])); + bh = mp.mp_bh[mp_h]; + gfs2_assert_withdraw(sdp, bh); + if (gfs2_assert_withdraw(sdp, + prev_bnr != bh->b_blocknr)) { + printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, " + "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n", + sdp->sd_fsname, + (unsigned long long)ip->i_no_addr, + prev_bnr, ip->i_height, strip_h, mp_h); + } + prev_bnr = bh->b_blocknr; + ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal, + mp_h, preserve1); + /* If we hit an error or just swept dinode buffer, + just exit. */ + if (ret || !mp_h) { + state = DEALLOC_DONE; + break; + } + state = DEALLOC_MP_LOWER; + break; + + /* lower the metapath strip height */ + case DEALLOC_MP_LOWER: + /* We're done with the current buffer, so release it, + unless it's the dinode buffer. Then back up to the + previous pointer. */ + if (mp_h) { + brelse(mp.mp_bh[mp_h]); + mp.mp_bh[mp_h] = NULL; + } + /* If we can't get any lower in height, we've stripped + off all we can. Next step is to back up and start + stripping the previous level of metadata. */ + if (mp_h == 0) { + strip_h--; + memcpy(&mp.mp_list, &nbof, sizeof(nbof)); + mp_h = strip_h; + state = DEALLOC_FILL_MP; + break; + } + mp.mp_list[mp_h] = 0; + mp_h--; /* search one metadata height down */ + if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1) + break; /* loop around in the same state */ + mp.mp_list[mp_h]++; + /* Here we've found a part of the metapath that is not + * allocated. We need to search at that height for the + * next non-null pointer. */ + if (find_nonnull_ptr(sdp, &mp, mp_h)) { + state = DEALLOC_FILL_MP; + mp_h++; + } + /* No more non-null pointers at this height. Back up + to the previous height and try again. */ + break; /* loop around in the same state */ + + /* Fill the metapath with buffers to the given height. 
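
/*
 * [Editor's aside -- the trunc_dealloc() control flow above, reduced to
 * a runnable skeleton. This is structural only: the real states also
 * issue read-ahead, respect transaction size limits and preserve the
 * first pointer at the truncation point. The point of the state
 * machine is that each pass does bounded work, so the on-disk state
 * stays recoverable between transactions.]
 */
#include <stdio.h>

enum demo_state { MP_FULL, MP_LOWER, FILL_MP, DONE };

int main(void)
{
        enum demo_state state = FILL_MP;
        int branches = 3; /* pretend three metadata branches need sweeping */

        while (state != DONE) {
                switch (state) {
                case FILL_MP:   /* read buffers up to the strip height */
                        state = MP_FULL;
                        break;
                case MP_FULL:   /* sweep one fully populated metapath */
                        printf("sweep branch %d\n", branches);
                        state = MP_LOWER;
                        break;
                case MP_LOWER:  /* step to the next branch, or finish */
                        state = (--branches > 0) ? FILL_MP : DONE;
                        break;
                case DONE:
                        break;
                }
        }
        return 0;
}
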
*/ + case DEALLOC_FILL_MP: + /* Fill the buffers out to the current height. */ + ret = fillup_metapath(ip, &mp, mp_h); + if (ret < 0) + goto out; + + /* If buffers found for the entire strip height */ + if ((ret == ip->i_height) && (mp_h == strip_h)) { + state = DEALLOC_MP_FULL; + break; + } + if (ret < ip->i_height) /* We have a partial height */ + mp_h = ret - 1; + + /* If we find a non-null block pointer, crawl a bit + higher up in the metapath and try again, otherwise + we need to look lower for a new starting point. */ + if (find_nonnull_ptr(sdp, &mp, mp_h)) + mp_h++; + else + state = DEALLOC_MP_LOWER; break; + } } - gfs2_quota_unhold(ip); + if (btotal) { + if (current->journal_info == NULL) { + ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + + RES_QUOTA, 0); + if (ret) + goto out; + down_write(&ip->i_rw_mutex); + } + gfs2_statfs_change(sdp, 0, +btotal, 0); + gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid, + ip->i_inode.i_gid); + ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode); + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + } - return error; +out: + if (gfs2_holder_initialized(&rd_gh)) + gfs2_glock_dq_uninit(&rd_gh); + if (current->journal_info) { + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); + cond_resched(); + } + gfs2_quota_unhold(ip); +out_metapath: + release_metapath(&mp); + return ret; } static int trunc_end(struct gfs2_inode *ip) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 6fe2a59c6a9a5e8ba14e7ff4f4fefd729686bf58..c2062a108d19602544276f596de75b1f593a1e61 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -911,11 +911,15 @@ static long __gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); + struct gfs2_sbd *sdp = GFS2_SB(inode); struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; int ret; - if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip)) + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + /* fallocate is needed by gfs2_grow to reserve space in the rindex */ + if (gfs2_is_jdata(ip) && inode != sdp->sd_rindex) return -EOPNOTSUPP; inode_lock(inode); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ec0848fcca02d8960ba671b9b0be5a56ed3fd7ed..959a19ced4d5f83fccf7e82c88a5855f3d1d26cc 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -73,7 +73,7 @@ static DEFINE_SPINLOCK(lru_lock); static struct rhashtable_params ht_parms = { .nelem_hint = GFS2_GL_HASH_SIZE * 3 / 4, - .key_len = sizeof(struct lm_lockname), + .key_len = offsetofend(struct lm_lockname, ln_type), .key_offset = offsetof(struct gfs2_glock, gl_name), .head_offset = offsetof(struct gfs2_glock, gl_node), }; @@ -449,6 +449,9 @@ __acquires(&gl->gl_lockref.lock) unsigned int lck_flags = (unsigned int)(gh ? 
gh->gh_flags : 0); int ret; + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) && + target != LM_ST_UNLOCKED) + return; lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | LM_FLAG_PRIORITY); GLOCK_BUG_ON(gl, gl->gl_state == target); @@ -484,7 +487,8 @@ __acquires(&gl->gl_lockref.lock) } else if (ret) { pr_err("lm_lock ret %d\n", ret); - GLOCK_BUG_ON(gl, 1); + GLOCK_BUG_ON(gl, !test_bit(SDF_SHUTDOWN, + &sdp->sd_flags)); } } else { /* lock_nolock */ finish_xmote(gl, target); @@ -653,10 +657,10 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type, .ln_sbd = sdp }; - struct gfs2_glock *gl, *tmp = NULL; + struct gfs2_glock *gl, *tmp; struct address_space *mapping; struct kmem_cache *cachep; - int ret, tries = 0; + int ret = 0; rcu_read_lock(); gl = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); @@ -721,35 +725,32 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, } again: - ret = rhashtable_lookup_insert_fast(&gl_hash_table, &gl->gl_node, - ht_parms); - if (ret == 0) { + rcu_read_lock(); + tmp = rhashtable_lookup_get_insert_fast(&gl_hash_table, &gl->gl_node, + ht_parms); + if (!tmp) { *glp = gl; - return 0; + goto out; } - - if (ret == -EEXIST) { - ret = 0; - rcu_read_lock(); - tmp = rhashtable_lookup_fast(&gl_hash_table, &name, ht_parms); - if (tmp == NULL || !lockref_get_not_dead(&tmp->gl_lockref)) { - if (++tries < 100) { - rcu_read_unlock(); - cond_resched(); - goto again; - } - tmp = NULL; - ret = -ENOMEM; - } - rcu_read_unlock(); - } else { - WARN_ON_ONCE(ret); + if (IS_ERR(tmp)) { + ret = PTR_ERR(tmp); + goto out_free; } + if (lockref_get_not_dead(&tmp->gl_lockref)) { + *glp = tmp; + goto out_free; + } + rcu_read_unlock(); + cond_resched(); + goto again; + +out_free: kfree(gl->gl_lksb.sb_lvbptr); kmem_cache_free(cachep, gl); atomic_dec(&sdp->sd_glock_disposal); - *glp = tmp; +out: + rcu_read_unlock(); return ret; } @@ -1918,10 +1919,10 @@ static const struct seq_operations gfs2_sbstats_seq_ops = { #define GFS2_SEQ_GOODSIZE min(PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER, 65536UL) -static int gfs2_glocks_open(struct inode *inode, struct file *file) +static int __gfs2_glocks_open(struct inode *inode, struct file *file, + const struct seq_operations *ops) { - int ret = seq_open_private(file, &gfs2_glock_seq_ops, - sizeof(struct gfs2_glock_iter)); + int ret = seq_open_private(file, ops, sizeof(struct gfs2_glock_iter)); if (ret == 0) { struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; @@ -1932,11 +1933,16 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file) if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL); + rhashtable_walk_enter(&gl_hash_table, &gi->hti); } return ret; } +static int gfs2_glocks_open(struct inode *inode, struct file *file) +{ + return __gfs2_glocks_open(inode, file, &gfs2_glock_seq_ops); +} + static int gfs2_glocks_release(struct inode *inode, struct file *file) { struct seq_file *seq = file->private_data; @@ -1949,20 +1955,7 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file) static int gfs2_glstats_open(struct inode *inode, struct file *file) { - int ret = seq_open_private(file, &gfs2_glstats_seq_ops, - sizeof(struct gfs2_glock_iter)); - if (ret == 0) { - struct seq_file *seq = file->private_data; - struct gfs2_glock_iter *gi = seq->private; - gi->sdp = inode->i_private; - gi->last_pos = 0; - seq->buf = 
kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); - if (seq->buf) - seq->size = GFS2_SEQ_GOODSIZE; - gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti, GFP_KERNEL); - } - return ret; + return __gfs2_glocks_open(inode, file, &gfs2_glstats_seq_ops); } static int gfs2_sbstats_open(struct inode *inode, struct file *file) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 511e1ed7e2ded7b0a9dc9882cffe0b66c37c96a2..b7cf65d13561fedf98a72dfc66b36e70d84100cd 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -203,11 +203,15 @@ enum { DFL_DLM_RECOVERY = 6, }; +/* + * We are using struct lm_lockname as an rhashtable key. Avoid holes within + * the struct; padding at the end is fine. + */ struct lm_lockname { - struct gfs2_sbd *ln_sbd; u64 ln_number; + struct gfs2_sbd *ln_sbd; unsigned int ln_type; -} __packed __aligned(sizeof(int)); +}; #define lm_name_equal(name1, name2) \ (((name1)->ln_number == (name2)->ln_number) && \ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e279c3ce27be3cd9f3048557bd90e06f21df9e85..9f605ea4810c68b4ec8e7092a1b1668e0659aa27 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -202,8 +202,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, fail_refresh: ip->i_iopen_gh.gh_flags |= GL_NOCACHE; ip->i_iopen_gh.gh_gl->gl_object = NULL; - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_put: if (io_gl) gfs2_glock_put(io_gl); @@ -667,6 +666,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, ip->i_height = 0; ip->i_depth = 0; ip->i_entries = 0; + ip->i_no_addr = 0; /* Temporarily zero until real addr is assigned */ switch(mode & S_IFMT) { case S_IFREG: diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b108e7ba81af73568b71ec0b163bf68fbbde8948..ed67548b286ccc58d84732d00af5ca281772bd39 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -1222,12 +1223,7 @@ static int set_gfs2_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; - - /* - * We set the bdi here to the queue backing, file systems can - * overwrite this in ->fill_super() - */ - s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info; + s->s_bdi = bdi_get(s->s_bdev->bd_bdi); return 0; } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 86ccc01593937d93b7e5901ec26ed30bd5a4ab32..83c9909ff14a6bf7b741dd29cb316a346ba23c94 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -483,13 +483,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) } } -static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) -{ - u64 first = rgd->rd_data0; - u64 last = first + rgd->rd_data; - return first <= block && block < last; -} - /** * gfs2_blk2rgrpd - Find resource group for a given data/meta block number * @sdp: The GFS2 superblock diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 66b51cf66dfa3ca7f29a5ad7879d3485e956c462..e90478e2f5453be647d6386bb17e5dce52cccbba 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -83,5 +83,12 @@ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs) return rs && !RB_EMPTY_NODE(&rs->rs_node); } +static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) +{ + u64 first = rgd->rd_data0; + u64 last = first + rgd->rd_data; + return first <= block && block < last; +} + extern void check_and_update_goal(struct gfs2_inode *ip); #endif /* __RGRP_DOT_H__ */ diff --git 
a/fs/gfs2/super.c b/fs/gfs2/super.c index 361796a84fce4f444f563592aa05f4e25642aa61..29b0473f6e74132c465cc2e398552d977be98e8d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -793,7 +793,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) if (!(flags & (I_DIRTY_DATASYNC|I_DIRTY_SYNC))) return; - + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return; if (!gfs2_glock_is_locked_by_me(ip->i_gl)) { ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); if (ret) { @@ -1538,8 +1539,7 @@ static void gfs2_evict_inode(struct inode *inode) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&ip->i_iopen_gh); goto out; } @@ -1617,7 +1617,7 @@ static void gfs2_evict_inode(struct inode *inode) if (gfs2_holder_initialized(&ip->i_iopen_gh)) { if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_glock_dq(&ip->i_iopen_gh); } gfs2_holder_uninit(&ip->i_iopen_gh); } @@ -1639,8 +1639,7 @@ static void gfs2_evict_inode(struct inode *inode) if (gfs2_holder_initialized(&ip->i_iopen_gh)) { ip->i_iopen_gh.gh_gl->gl_object = NULL; ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&ip->i_iopen_gh); } } diff --git a/fs/hfs/extent.c b/fs/hfs/extent.c index e33a0d36a93ebea7183909ff45f523302943b694..5d01826545809de7de454e2615553fc6a637ce70 100644 --- a/fs/hfs/extent.c +++ b/fs/hfs/extent.c @@ -485,8 +485,8 @@ void hfs_file_truncate(struct inode *inode) /* XXX: Can use generic_cont_expand? */ size = inode->i_size - 1; - res = pagecache_write_begin(NULL, mapping, size+1, 0, - AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + res = pagecache_write_begin(NULL, mapping, size+1, 0, 0, + &page, &fsdata); if (!res) { res = pagecache_write_end(NULL, mapping, size+1, 0, 0, page, fsdata); diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index feca524ce2a5c7d6034bf0a528679fe923381507..a3eb640b4f8f8474130a893100303a0211d9037c 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -545,9 +545,8 @@ void hfsplus_file_truncate(struct inode *inode) void *fsdata; loff_t size = inode->i_size; - res = pagecache_write_begin(NULL, mapping, size, 0, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); + res = pagecache_write_begin(NULL, mapping, size, 0, 0, + &page, &fsdata); if (res) return; res = pagecache_write_end(NULL, mapping, size, diff --git a/fs/inode.c b/fs/inode.c index 88110fd0b282e49246dc9cd93a1d6e173d951d7b..db5914783a7130d77725502cb4182c05ff7775c2 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -119,7 +119,7 @@ static int no_open(struct inode *inode, struct file *file) } /** - * inode_init_always - perform inode structure intialisation + * inode_init_always - perform inode structure initialisation * @sb: superblock inode belongs to * @inode: inode to initialise * @@ -371,9 +371,6 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_lru); address_space_init_once(&inode->i_data); i_size_ordered_init(inode); -#ifdef CONFIG_FSNOTIFY - INIT_HLIST_HEAD(&inode->i_fsnotify_marks); -#endif } EXPORT_SYMBOL(inode_init_once); @@ -405,6 +402,8 @@ static void inode_lru_list_add(struct inode *inode) { if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru)) this_cpu_inc(nr_unused); + else + inode->i_state |= I_REFERENCED; } /* @@ -1492,7 
+1491,6 @@ static void iput_final(struct inode *inode) drop = generic_drop_inode(inode); if (!drop && (sb->s_flags & MS_ACTIVE)) { - inode->i_state |= I_REFERENCED; inode_add_lru(inode); spin_unlock(&inode->i_lock); return; diff --git a/fs/internal.h b/fs/internal.h index 11c6d89dce9c9ae2e1bc1d04614a732ccc0a952e..9676fe11c093538d284ec5c2c59e9a2a9e37c1f6 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -108,8 +108,6 @@ extern struct file *do_filp_open(int dfd, struct filename *pathname, extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, const char *, const struct open_flags *); -extern long do_handle_open(int mountdirfd, - struct file_handle __user *ufh, int open_flag); extern int open_check_o_direct(struct file *f); extern int vfs_open(const struct path *, struct file *, const struct cred *); extern struct file *filp_clone_open(struct file *); @@ -128,8 +126,6 @@ static inline bool atime_needs_update_rcu(const struct path *path, return __atime_needs_update(path, inode, true); } -extern bool atime_needs_update_rcu(const struct path *, struct inode *); - /* * fs-writeback.c */ diff --git a/fs/iomap.c b/fs/iomap.c index 141c3cd55a8b2d974f431d7710fbe4de58f78355..4b10892967a5ae9a4ece5d8571e47c01c72f74da 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -158,12 +158,6 @@ iomap_write_actor(struct inode *inode, loff_t pos, loff_t length, void *data, ssize_t written = 0; unsigned int flags = AOP_FLAG_NOFS; - /* - * Copies from kernel address space cannot fail (NFSD is a big user). - */ - if (!iter_is_iovec(i)) - flags |= AOP_FLAG_UNINTERRUPTIBLE; - do { struct page *page; unsigned long offset; /* Offset into pagecache page */ @@ -291,8 +285,7 @@ iomap_dirty_actor(struct inode *inode, loff_t pos, loff_t length, void *data, return PTR_ERR(rpage); status = iomap_write_begin(inode, pos, bytes, - AOP_FLAG_NOFS | AOP_FLAG_UNINTERRUPTIBLE, - &page, iomap); + AOP_FLAG_NOFS, &page, iomap); put_page(rpage); if (unlikely(status)) return status; @@ -343,8 +336,8 @@ static int iomap_zero(struct inode *inode, loff_t pos, unsigned offset, struct page *page; int status; - status = iomap_write_begin(inode, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE | AOP_FLAG_NOFS, &page, iomap); + status = iomap_write_begin(inode, pos, bytes, AOP_FLAG_NOFS, &page, + iomap); if (status) return status; @@ -360,7 +353,8 @@ static int iomap_dax_zero(loff_t pos, unsigned offset, unsigned bytes, sector_t sector = iomap->blkno + (((pos & ~(PAGE_SIZE - 1)) - iomap->offset) >> 9); - return __dax_zero_page_range(iomap->bdev, sector, offset, bytes); + return __dax_zero_page_range(iomap->bdev, iomap->dax_dev, sector, + offset, bytes); } static loff_t @@ -887,16 +881,14 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, flags |= IOMAP_WRITE; } - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, start, end); - if (ret) - goto out_free_dio; + ret = filemap_write_and_wait_range(mapping, start, end); + if (ret) + goto out_free_dio; - ret = invalidate_inode_pages2_range(mapping, - start >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } + ret = invalidate_inode_pages2_range(mapping, + start >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; inode_dio_begin(inode); @@ -911,6 +903,9 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, break; } pos += ret; + + if (iov_iter_rw(iter) == READ && pos >= dio->i_size) + break; } while ((count = iov_iter_count(iter)) > 0); blk_finish_plug(&plug); @@ -951,7 +946,7 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, * 
one is a pretty crazy thing to do, so we don't support it 100%. If * this invalidation fails, tough, the write still worked... */ - if (iov_iter_rw(iter) == WRITE && mapping->nrpages) { + if (iov_iter_rw(iter) == WRITE) { int err = invalidate_inode_pages2_range(mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); WARN_ON_ONCE(err); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 5adc2fb62b0fab89899e5d0acba1e8019a73c766..ebad34266bcfbbf4fb5d491b2f2699583f7116e1 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -43,6 +43,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -205,6 +206,14 @@ static int kjournald2(void *arg) journal->j_task = current; wake_up(&journal->j_wait_done_commit); + /* + * Make sure that no allocations from this kernel thread will ever + * recurse to the fs layer because we are responsible for the + * transaction commit and any fs involvement might get stuck waiting for + * the transaction commit. + */ + memalloc_nofs_save(); + /* * And now, wait forever for commit wakeup events. */ @@ -691,8 +700,21 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) { int err = 0; - jbd2_might_wait_for_commit(journal); read_lock(&journal->j_state_lock); +#ifdef CONFIG_PROVE_LOCKING + /* + * Some callers make sure the transaction is already committing and in that + * case we cannot block on open handles anymore. So don't warn in that + * case. + */ + if (tid_gt(tid, journal->j_commit_sequence) && + (!journal->j_committing_transaction || + journal->j_committing_transaction->t_tid != tid)) { + read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); + read_lock(&journal->j_state_lock); + } +#endif #ifdef CONFIG_JBD2_DEBUG if (!tid_geq(journal->j_commit_request, tid)) { printk(KERN_ERR @@ -913,7 +935,8 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. */ - ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, + REQ_SYNC | REQ_FUA); if (ret) goto out; @@ -1314,7 +1337,7 @@ static int journal_reset(journal_t *journal) jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, - REQ_FUA); + REQ_SYNC | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return jbd2_journal_start_thread(journal); @@ -1454,7 +1477,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, REQ_FUA); + jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); @@ -1721,7 +1744,7 @@ int jbd2_journal_destroy(journal_t *journal) write_unlock(&journal->j_state_lock); jbd2_mark_journal_empty(journal, - REQ_PREFLUSH | REQ_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; @@ -1980,7 +2003,7 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value.
*/ - jbd2_mark_journal_empty(journal, REQ_FUA); + jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); @@ -2026,7 +2049,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal, REQ_FUA); + jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } @@ -2340,7 +2363,7 @@ static int jbd2_journal_init_journal_head_cache(void) jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ - SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, + SLAB_TEMPORARY | SLAB_TYPESAFE_BY_RCU, NULL); /* ctor */ retval = 0; if (!jbd2_journal_head_cache) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 5e659ee08d6ae84046b9b8d59f41641e9ef28b22..9ee4832b6f8b3664430e31bcf3775b412e1b81d8 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -29,6 +29,7 @@ #include #include #include +#include #include @@ -388,6 +389,11 @@ static int start_this_handle(journal_t *journal, handle_t *handle, rwsem_acquire_read(&journal->j_trans_commit_map, 0, 0, _THIS_IP_); jbd2_journal_free_transaction(new_transaction); + /* + * Ensure that no allocations done while the transaction is open are + * going to recurse back to the fs layer. + */ + handle->saved_alloc_context = memalloc_nofs_save(); return 0; } @@ -466,6 +472,7 @@ handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int rsv_blocks, trace_jbd2_handle_start(journal->j_fs_dev->bd_dev, handle->h_transaction->t_tid, type, line_no, nblocks); + return handle; } EXPORT_SYMBOL(jbd2__journal_start); @@ -1760,6 +1767,11 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_rsv_handle) jbd2_journal_free_reserved(handle->h_rsv_handle); free_and_exit: + /* + * Scope of the GFP_NOFS context is over here and so we can restore the + * original alloc context. 
+ */ + memalloc_nofs_restore(handle->saved_alloc_context); jbd2_free_handle(handle); return err; } diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 06a71dbd4833e3bdf4ea5277bc50e15ee40c45ad..389ea53ea487538061ff3b6da78e27df2f894ac1 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1366,7 +1366,7 @@ int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, jffs2_add_ino_cache(c, f->inocache); } if (!f->inocache) { - JFFS2_ERROR("requestied to read an nonexistent ino %u\n", ino); + JFFS2_ERROR("requested to read a nonexistent ino %u\n", ino); return -ENOENT; } diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index fc89f94367842583d7dd9f9d8d93707e5bd1a50c..5c5ac5b3aec33baf1da1af89f762d62275a59916 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -64,7 +64,6 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case JFS_IOC_GETFLAGS: - jfs_get_inode_flags(jfs_inode); flags = jfs_inode->mode2 & JFS_FL_USER_VISIBLE; flags = jfs_map_ext2(flags, 0); return put_user(flags, (int __user *) arg); @@ -98,7 +97,6 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) /* Lock against other parallel changes of flags */ inode_lock(inode); - jfs_get_inode_flags(jfs_inode); oldflags = jfs_inode->mode2; /* diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 6aca224a5d6847ac29104f41053f4b1de9f9de8f..f36ef68905a74d830e75e7566c1145c8f55fc0cf 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -3148,7 +3148,6 @@ static void copy_to_dinode(struct dinode * dip, struct inode *ip) else dip->di_gid = cpu_to_le32(from_kgid(&init_user_ns, jfs_ip->saved_gid)); - jfs_get_inode_flags(jfs_ip); /* * mode2 is only needed for storing the higher order bits. * Trust i_mode for the lower order ones diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c index 375dd257a34fe343b1b25a0176b4dd863aeae0c4..5e9b7bb3aabf9d5827d367d35ee1bd4740af90e3 100644 --- a/fs/jfs/jfs_inode.c +++ b/fs/jfs/jfs_inode.c @@ -45,24 +45,6 @@ void jfs_set_inode_flags(struct inode *inode) S_DIRSYNC | S_SYNC); } -void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip) -{ - unsigned int flags = jfs_ip->vfs_inode.i_flags; - - jfs_ip->mode2 &= ~(JFS_IMMUTABLE_FL | JFS_APPEND_FL | JFS_NOATIME_FL | - JFS_DIRSYNC_FL | JFS_SYNC_FL); - if (flags & S_IMMUTABLE) - jfs_ip->mode2 |= JFS_IMMUTABLE_FL; - if (flags & S_APPEND) - jfs_ip->mode2 |= JFS_APPEND_FL; - if (flags & S_NOATIME) - jfs_ip->mode2 |= JFS_NOATIME_FL; - if (flags & S_DIRSYNC) - jfs_ip->mode2 |= JFS_DIRSYNC_FL; - if (flags & S_SYNC) - jfs_ip->mode2 |= JFS_SYNC_FL; -} - /* * NAME: ialloc() * diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 9271cfe4a1490ca88785a7973ecd22fd88000e73..7b0b3a40788ffd0a834df3651a24b66551b71f99 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -33,7 +33,6 @@ extern void jfs_truncate(struct inode *); extern void jfs_truncate_nolock(struct inode *, loff_t); extern void jfs_free_zero_link(struct inode *); extern struct dentry *jfs_get_parent(struct dentry *dentry); -extern void jfs_get_inode_flags(struct jfs_inode_info *); extern struct dentry *jfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type); extern struct dentry *jfs_fh_to_parent(struct super_block *sb, struct fid *fid, diff --git a/fs/jfs/super.c b/fs/jfs/super.c index c64c2574a0aad7eecf4922a6a005341c26b3fa24..e8aad7d87b8c938aad4a51db500684950df73944 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -45,6 +45,7 @@ #include "jfs_acl.h" #include "jfs_debug.h" #include 
"jfs_xattr.h" +#include "jfs_dinode.h" MODULE_DESCRIPTION("The Journaled Filesystem (JFS)"); MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM"); @@ -181,6 +182,35 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +#ifdef CONFIG_QUOTA +static int jfs_quota_off(struct super_block *sb, int type); +static int jfs_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path); + +static void jfs_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + jfs_quota_off(sb, type); +} + +static const struct quotactl_ops jfs_quotactl_ops = { + .quota_on = jfs_quota_on, + .quota_off = jfs_quota_off, + .quota_sync = dquot_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, + .get_nextdqblk = dquot_get_next_dqblk, +}; +#else +static inline void jfs_quota_off_umount(struct super_block *sb) +{ +} +#endif + static void jfs_put_super(struct super_block *sb) { struct jfs_sb_info *sbi = JFS_SBI(sb); @@ -188,7 +218,7 @@ static void jfs_put_super(struct super_block *sb) jfs_info("In jfs_put_super"); - dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + jfs_quota_off_umount(sb); rc = jfs_umount(sb); if (rc) @@ -536,7 +566,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_xattr = jfs_xattr_handlers; #ifdef CONFIG_QUOTA sb->dq_op = &dquot_operations; - sb->s_qcop = &dquot_quotactl_ops; + sb->s_qcop = &jfs_quotactl_ops; sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif @@ -840,6 +870,51 @@ static struct dquot **jfs_get_dquots(struct inode *inode) { return JFS_IP(inode)->i_dquot; } + +static int jfs_quota_on(struct super_block *sb, int type, int format_id, + const struct path *path) +{ + int err; + struct inode *inode; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + inode_lock(inode); + JFS_IP(inode)->mode2 |= JFS_NOATIME_FL | JFS_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); + + return 0; +} + +static int jfs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + goto out; + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + JFS_IP(inode)->mode2 &= ~(JFS_NOATIME_FL | JFS_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + mark_inode_dirty(inode); +out_put: + iput(inode); + return err; +out: + return dquot_quota_off(sb, type); +} #endif static const struct super_operations jfs_super_operations = { diff --git a/fs/libfs.c b/fs/libfs.c index a8b62e5d43a972d3b02867e6d9c2136b23987ecd..a04395334bb156ae04853639650dc9751fb55753 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -507,7 +507,7 @@ EXPORT_SYMBOL(simple_write_end); * to pass it an appropriate max_reserved value to avoid collisions. 
*/ int simple_fill_super(struct super_block *s, unsigned long magic, - struct tree_descr *files) + const struct tree_descr *files) { struct inode *inode; struct dentry *root; diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 41e491b8e5d7e40164e0e51d90534c9d8660a94e..27d577dbe51a48f21950a63758e76783fed67eca 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -69,6 +69,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) if (host->h_rpcclnt == NULL && nlm_bind_host(host) == NULL) goto out_nobind; + host->h_nlmclnt_ops = nlm_init->nlmclnt_ops; return host; out_nobind: nlmclnt_release_host(host); diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 112952037933b79fa36dbea67579c576ee19be58..066ac313ae5c0fff15188d3123079acd9a7180f3 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -150,17 +150,22 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req) * @host: address of a valid nlm_host context representing the NLM server * @cmd: fcntl-style file lock operation to perform * @fl: address of arguments for the lock operation + * @data: address of data to be sent to callback operations * */ -int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) +int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data) { struct nlm_rqst *call; int status; + const struct nlmclnt_operations *nlmclnt_ops = host->h_nlmclnt_ops; call = nlm_alloc_call(host); if (call == NULL) return -ENOMEM; + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_alloc_call) + nlmclnt_ops->nlmclnt_alloc_call(data); + nlmclnt_locks_init_private(fl, host); if (!fl->fl_u.nfs_fl.owner) { /* lockowner allocation has failed */ @@ -169,6 +174,7 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) } /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); + call->a_callback_data = data; if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { if (fl->fl_type != F_UNLCK) { @@ -214,8 +220,12 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) void nlmclnt_release_call(struct nlm_rqst *call) { + const struct nlmclnt_operations *nlmclnt_ops = call->a_host->h_nlmclnt_ops; + if (!atomic_dec_and_test(&call->a_count)) return; + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_release_call) + nlmclnt_ops->nlmclnt_release_call(call->a_callback_data); nlmclnt_release_host(call->a_host); nlmclnt_release_lockargs(call); kfree(call); @@ -687,6 +697,19 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) return status; } +static void nlmclnt_unlock_prepare(struct rpc_task *task, void *data) +{ + struct nlm_rqst *req = data; + const struct nlmclnt_operations *nlmclnt_ops = req->a_host->h_nlmclnt_ops; + bool defer_call = false; + + if (nlmclnt_ops && nlmclnt_ops->nlmclnt_unlock_prepare) + defer_call = nlmclnt_ops->nlmclnt_unlock_prepare(task, req->a_callback_data); + + if (!defer_call) + rpc_call_start(task); +} + static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) { struct nlm_rqst *req = data; @@ -720,6 +743,7 @@ static void nlmclnt_unlock_callback(struct rpc_task *task, void *data) } static const struct rpc_call_ops nlmclnt_unlock_ops = { + .rpc_call_prepare = nlmclnt_unlock_prepare, .rpc_call_done = nlmclnt_unlock_callback, .rpc_release = nlmclnt_rpc_release, }; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e7c8b9c76e48573a381d0196d0dc2166045a7b3e..5d481e8a1b5d0c732207c4a583027ec084d7a0db 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -132,6 +132,8 @@ lockd(void *vrqstp) { int err = 0; struct svc_rqst *rqstp = vrqstp; 
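The nlmclnt_operations table wired up in clntlock.c and clntproc.c above lets lockd's consumer hook the lifetime of each NLM request (the NFS client hands its table in through nlmclnt_init()'s init data, as a later fs/nfs/client.c hunk shows). Read back from the call sites, the table has roughly this shape; this is a sketch inferred from the diff, not the authoritative header definition:

	struct nlmclnt_operations {
		/* called once per nlm_rqst from nlmclnt_proc(), to take a
		 * reference on the caller's private data */
		void (*nlmclnt_alloc_call)(void *data);
		/* return true to take over rpc_call_start() for the async
		 * unlock RPC; false lets nlmclnt_unlock_prepare() start it */
		bool (*nlmclnt_unlock_prepare)(struct rpc_task *task, void *data);
		/* drops that reference when the last nlm_rqst ref goes away */
		void (*nlmclnt_release_call)(void *data);
	};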
+ struct net *net = &init_net; + struct lockd_net *ln = net_generic(net, lockd_net_id); /* try_to_freeze() is called from svc_recv() */ set_freezable(); @@ -176,6 +178,8 @@ lockd(void *vrqstp) if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); + cancel_delayed_work_sync(&ln->grace_period_end); + locks_end_grace(&ln->lockd_manager); return 0; } @@ -270,8 +274,6 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) if (ln->nlmsvc_users) { if (--ln->nlmsvc_users == 0) { nlm_shutdown_hosts_net(net); - cancel_delayed_work_sync(&ln->grace_period_end); - locks_end_grace(&ln->lockd_manager); svc_shutdown_net(serv, net); dprintk("lockd_down_net: per-net data destroyed; net=%p\n", net); } diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 5581e020644bda687f0b4bd09645a058a18d4537..3507c80d1d4b962b579d501201f4cdd060e6248b 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -870,15 +870,15 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status) if (!(block = nlmsvc_find_block(cookie))) return; - if (block) { - if (status == nlm_lck_denied_grace_period) { - /* Try again in a couple of seconds */ - nlmsvc_insert_block(block, 10 * HZ); - } else { - /* Lock is now held by client, or has been rejected. - * In both cases, the block should be removed. */ - nlmsvc_unlink_block(block); - } + if (status == nlm_lck_denied_grace_period) { + /* Try again in a couple of seconds */ + nlmsvc_insert_block(block, 10 * HZ); + } else { + /* + * Lock is now held by client, or has been rejected. + * In both cases, the block should be removed. + */ + nlmsvc_unlink_block(block); } nlmsvc_release_block(block); } diff --git a/fs/locks.c b/fs/locks.c index 26811321d39b8d404046100ac3dcc4b5e04f5cc7..af2031a1fcff14fcfb05e108b9d49bfc4a74685f 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2504,7 +2504,7 @@ locks_remove_flock(struct file *filp, struct file_lock_context *flctx) .fl_owner = filp, .fl_pid = current->tgid, .fl_file = filp, - .fl_flags = FL_FLOCK, + .fl_flags = FL_FLOCK | FL_CLOSE, .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; diff --git a/fs/mount.h b/fs/mount.h index 2826543a131d1217f306404e264a27a2ee163ec6..bf1fda6eed8f36beae7e678a0be8e78cd9317201 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -59,7 +59,7 @@ struct mount { struct mountpoint *mnt_mp; /* where is it mounted */ struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ #ifdef CONFIG_FSNOTIFY - struct hlist_head mnt_fsnotify_marks; + struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; #endif int mnt_id; /* mount identifier */ diff --git a/fs/namei.c b/fs/namei.c index 19dcf62133cc95162d364f7ef43c17d280ee6448..6571a5f5112ed82894fdbec1c9b6dfaf557d93b1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -340,22 +340,14 @@ int generic_permission(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) { /* DACs are overridable for directories */ - if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) - return 0; if (!(mask & MAY_WRITE)) if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; - return -EACCES; - } - /* - * Read/write DACs are always overridable. - * Executable DACs are overridable when there is - * at least one exec bit set. - */ - if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) return 0; + return -EACCES; + } /* * Searching includes executable on directories, else just read. 
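The generic_permission() hunk above reorders the directory DAC checks so the weaker capability is consulted first: a non-write access is satisfied by CAP_DAC_READ_SEARCH before CAP_DAC_OVERRIDE is considered at all, presumably so a plain lookup by a privileged task no longer has to exercise (and be audited for) the stronger capability. Condensed, the new directory branch behaves like this sketch, where cap() is a hypothetical stand-in for capable_wrt_inode_uidgid():

	static int dir_dac_check(struct inode *inode, int mask)
	{
		if (!(mask & MAY_WRITE))
			if (cap(inode, CAP_DAC_READ_SEARCH))	/* weaker capability first */
				return 0;
		if (cap(inode, CAP_DAC_OVERRIDE))		/* full override as fallback */
			return 0;
		return -EACCES;
	}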
@@ -364,6 +356,14 @@ int generic_permission(struct inode *inode, int mask) if (mask == MAY_READ) if (capable_wrt_inode_uidgid(inode, CAP_DAC_READ_SEARCH)) return 0; + /* + * Read/write DACs are always overridable. + * Executable DACs are overridable when there is + * at least one exec bit set. + */ + if (!(mask & MAY_EXEC) || (inode->i_mode & S_IXUGO)) + if (capable_wrt_inode_uidgid(inode, CAP_DAC_OVERRIDE)) + return 0; return -EACCES; } @@ -2142,7 +2142,6 @@ static int link_path_walk(const char *name, struct nameidata *nd) static const char *path_init(struct nameidata *nd, unsigned flags) { - int retval = 0; const char *s = nd->name->name; if (!*s) @@ -2154,13 +2153,8 @@ static const char *path_init(struct nameidata *nd, unsigned flags) if (flags & LOOKUP_ROOT) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; - if (*s) { - if (!d_can_lookup(root)) - return ERR_PTR(-ENOTDIR); - retval = inode_permission(inode, MAY_EXEC); - if (retval) - return ERR_PTR(retval); - } + if (*s && unlikely(!d_can_lookup(root))) + return ERR_PTR(-ENOTDIR); nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { @@ -2258,6 +2252,35 @@ static inline int lookup_last(struct nameidata *nd) return walk_component(nd, 0); } +static int handle_lookup_down(struct nameidata *nd) +{ + struct path path = nd->path; + struct inode *inode = nd->inode; + unsigned seq = nd->seq; + int err; + + if (nd->flags & LOOKUP_RCU) { + /* + * don't bother with unlazy_walk on failure - we are + * at the very beginning of walk, so we lose nothing + * if we simply redo everything in non-RCU mode + */ + if (unlikely(!__follow_mount_rcu(nd, &path, &inode, &seq))) + return -ECHILD; + } else { + dget(path.dentry); + err = follow_managed(&path, nd); + if (unlikely(err < 0)) + return err; + inode = d_backing_inode(path.dentry); + seq = 0; + } + path_to_nameidata(&path, nd); + nd->inode = inode; + nd->seq = seq; + return 0; +} + /* Returns 0 and nd will be valid on success; Returns error, otherwise.
*/ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { @@ -2266,6 +2289,15 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path if (IS_ERR(s)) return PTR_ERR(s); + + if (unlikely(flags & LOOKUP_DOWN)) { + err = handle_lookup_down(nd); + if (unlikely(err < 0)) { + terminate_walk(nd); + return err; + } + } + while (!(err = link_path_walk(s, nd)) && ((err = lookup_last(nd)) > 0)) { s = trailing_symlink(nd); @@ -4766,7 +4798,7 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) struct page *page; void *fsdata; int err; - unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; + unsigned int flags = 0; if (nofs) flags |= AOP_FLAG_NOFS; diff --git a/fs/namespace.c b/fs/namespace.c index cc1375eff88c75abdf14aeb22a55c3d8b802895f..8bd3e4d448b9f07a8aa148c8d08c092f45471a7c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -236,9 +236,6 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); -#ifdef CONFIG_FSNOTIFY - INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); -#endif init_fs_pin(&mnt->mnt_umount, drop_mountpoint); } return mnt; @@ -3465,8 +3462,9 @@ static void mntns_put(struct ns_common *ns) static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) { struct fs_struct *fs = current->fs; - struct mnt_namespace *mnt_ns = to_mnt_ns(ns); + struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; struct path root; + int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(current_user_ns(), CAP_SYS_CHROOT) || @@ -3477,15 +3475,18 @@ static int mntns_install(struct nsproxy *nsproxy, struct ns_common *ns) return -EINVAL; get_mnt_ns(mnt_ns); - put_mnt_ns(nsproxy->mnt_ns); + old_mnt_ns = nsproxy->mnt_ns; nsproxy->mnt_ns = mnt_ns; /* Find the root */ - root.mnt = &mnt_ns->root->mnt; - root.dentry = mnt_ns->root->mnt.mnt_root; - path_get(&root); - while(d_mountpoint(root.dentry) && follow_down_one(&root)) - ; + err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt, + "/", LOOKUP_DOWN, &root); + if (err) { + /* revert to old namespace */ + nsproxy->mnt_ns = old_mnt_ns; + put_mnt_ns(mnt_ns); + return err; + } /* Update the pwd and root */ set_fs_pwd(fs, &root); diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index d5606099712a4cb2059c4dac9036aba35e9c3751..6d0f14c8609971378ee75104380873a6da67a4be 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -554,12 +554,11 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) sb->s_magic = NCP_SUPER_MAGIC; sb->s_op = &ncp_sops; sb->s_d_op = &ncp_dentry_operations; - sb->s_bdi = &server->bdi; server = NCP_SBP(sb); memset(server, 0, sizeof(*server)); - error = bdi_setup_and_register(&server->bdi, "ncpfs"); + error = super_setup_bdi(sb); if (error) goto out_fput; @@ -568,7 +567,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) if (data.info_fd != -1) { struct socket *info_sock = sockfd_lookup(data.info_fd, &error); if (!info_sock) - goto out_bdi; + goto out_fput; server->info_sock = info_sock; error = -EBADFD; if (info_sock->type != SOCK_STREAM) @@ -746,8 +745,6 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) out_fput2: if (server->info_sock) sockfd_put(server->info_sock); -out_bdi: - bdi_destroy(&server->bdi); out_fput: sockfd_put(sock); out: @@ -788,7 +785,6 @@ static void ncp_put_super(struct super_block *sb) kill_pid(server->m.wdog_pid, 
SIGTERM, 1); put_pid(server->m.wdog_pid); - bdi_destroy(&server->bdi); kfree(server->priv.data); kfree(server->auth.object_name); vfree(server->rxbuf); diff --git a/fs/ncpfs/ncp_fs_sb.h b/fs/ncpfs/ncp_fs_sb.h index 55e26fd8088694308aeee3ceff1d11b91d6b1fb5..366fd63cc506fc3e61be93877349ef6032cf5734 100644 --- a/fs/ncpfs/ncp_fs_sb.h +++ b/fs/ncpfs/ncp_fs_sb.h @@ -143,7 +143,6 @@ struct ncp_server { size_t len; __u8 data[128]; } unexpected_packet; - struct backing_dev_info bdi; }; extern void ncp_tcp_rcv_proc(struct work_struct *work); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index f31fd0dd92c61d22fc477b6da0ccc62a7eb0481d..69d02cf8cf370678609175d6e5626d0dedf02c3a 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -123,11 +123,6 @@ config PNFS_BLOCK depends on NFS_V4_1 && BLK_DEV_DM default NFS_V4 -config PNFS_OBJLAYOUT - tristate - depends on NFS_V4_1 && SCSI_OSD_ULD - default NFS_V4 - config PNFS_FLEXFILE_LAYOUT tristate depends on NFS_V4_1 && NFS_V3 diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 6abdda209642e70d917308282d82ec2cfa0f353b..98f4e5728a67c87c13cc8ca52d00e72e7bb3bc2c 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -31,6 +31,5 @@ nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ -obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/ diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 773774531aff5fc081610706ea39756b0e5a5c25..73a1f928226c05706bab82a37468973bff7b185c 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -76,7 +76,10 @@ nfs4_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); /* * Listen for a request on the socket */ @@ -85,6 +88,8 @@ nfs4_callback_svc(void *vrqstp) continue; svc_process(rqstp); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -103,9 +108,10 @@ nfs41_callback_svc(void *vrqstp) set_freezable(); - while (!kthread_should_stop()) { - if (try_to_freeze()) - continue; + while (!kthread_freezable_should_stop(NULL)) { + + if (signal_pending(current)) + flush_signals(current); prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); spin_lock_bh(&serv->sv_cb_lock); @@ -121,11 +127,13 @@ nfs41_callback_svc(void *vrqstp) error); } else { spin_unlock_bh(&serv->sv_cb_lock); - schedule(); + if (!kthread_should_stop()) + schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } - flush_signals(current); } + svc_exit_thread(rqstp); + module_put_and_exit(0); return 0; } @@ -221,14 +229,14 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, static struct svc_serv_ops nfs40_cb_sv_ops = { .svo_function = nfs4_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; #if defined(CONFIG_NFS_V4_1) static struct svc_serv_ops nfs41_cb_sv_ops = { .svo_function = nfs41_callback_svc, .svo_enqueue_xprt = svc_xprt_do_enqueue, - .svo_setup = svc_set_num_threads, + .svo_setup = svc_set_num_threads_sync, .svo_module = THIS_MODULE, }; @@ -280,7 +288,7 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion) printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n", cb_info->users); - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); + serv = 
svc_create_pooled(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops); if (!serv) { printk(KERN_ERR "nfs_callback_create_svc: create service failed\n"); return ERR_PTR(-ENOMEM); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index f073a6d2c6a51a4ec91cf38783519ca957bdece4..52479f180ea1497af4f8a250db4d8c222ce8bfaa 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -131,10 +131,11 @@ static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp, if (!inode) continue; if (!nfs_sb_active(inode->i_sb)) { - rcu_read_lock(); + rcu_read_unlock(); spin_unlock(&clp->cl_lock); iput(inode); spin_lock(&clp->cl_lock); + rcu_read_lock(); goto restart; } return inode; @@ -170,10 +171,11 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp, if (!inode) continue; if (!nfs_sb_active(inode->i_sb)) { - rcu_read_lock(); + rcu_read_unlock(); spin_unlock(&clp->cl_lock); iput(inode); spin_lock(&clp->cl_lock); + rcu_read_lock(); goto restart; } return inode; @@ -317,31 +319,18 @@ static u32 initiate_bulk_draining(struct nfs_client *clp, static u32 do_callback_layoutrecall(struct nfs_client *clp, struct cb_layoutrecallargs *args) { - u32 res; - - dprintk("%s enter, type=%i\n", __func__, args->cbl_recall_type); if (args->cbl_recall_type == RETURN_FILE) - res = initiate_file_draining(clp, args); - else - res = initiate_bulk_draining(clp, args); - dprintk("%s returning %i\n", __func__, res); - return res; - + return initiate_file_draining(clp, args); + return initiate_bulk_draining(clp, args); } __be32 nfs4_callback_layoutrecall(struct cb_layoutrecallargs *args, void *dummy, struct cb_process_state *cps) { - u32 res; - - dprintk("%s: -->\n", __func__); + u32 res = NFS4ERR_OP_NOT_IN_SESSION; if (cps->clp) res = do_callback_layoutrecall(cps->clp, args); - else - res = NFS4ERR_OP_NOT_IN_SESSION; - - dprintk("%s: exit with status = %d\n", __func__, res); return cpu_to_be32(res); } @@ -364,8 +353,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, struct nfs_client *clp = cps->clp; struct nfs_server *server = NULL; - dprintk("%s: -->\n", __func__); - if (!clp) { res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION); goto out; @@ -384,8 +371,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, goto found; } rcu_read_unlock(); - dprintk("%s: layout type %u not found\n", - __func__, dev->cbd_layout_type); continue; } @@ -395,8 +380,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, out: kfree(args->devs); - dprintk("%s: exit with status = %u\n", - __func__, be32_to_cpu(res)); return res; } @@ -417,16 +400,11 @@ static __be32 validate_seqid(const struct nfs4_slot_table *tbl, const struct nfs4_slot *slot, const struct cb_sequenceargs * args) { - dprintk("%s enter. 
slotid %u seqid %u, slot table seqid: %u\n", - __func__, args->csa_slotid, args->csa_sequenceid, slot->seq_nr); - if (args->csa_slotid > tbl->server_highest_slotid) return htonl(NFS4ERR_BADSLOT); /* Replay */ if (args->csa_sequenceid == slot->seq_nr) { - dprintk("%s seqid %u is a replay\n", - __func__, args->csa_sequenceid); if (nfs4_test_locked_slot(tbl, slot->slot_nr)) return htonl(NFS4ERR_DELAY); /* Signal process_op to set this error on next op */ @@ -480,15 +458,6 @@ static bool referring_call_exists(struct nfs_client *clp, for (j = 0; j < rclist->rcl_nrefcalls; j++) { ref = &rclist->rcl_refcalls[j]; - - dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u " - "slotid %u\n", __func__, - ((u32 *)&rclist->rcl_sessionid.data)[0], - ((u32 *)&rclist->rcl_sessionid.data)[1], - ((u32 *)&rclist->rcl_sessionid.data)[2], - ((u32 *)&rclist->rcl_sessionid.data)[3], - ref->rc_sequenceid, ref->rc_slotid); - status = nfs4_slot_wait_on_seqid(tbl, ref->rc_slotid, ref->rc_sequenceid, HZ >> 1) < 0; if (status) @@ -593,8 +562,6 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, res->csr_status = status; trace_nfs4_cb_sequence(args, res, status); - dprintk("%s: exit with status = %d res->csr_status %d\n", __func__, - ntohl(status), ntohl(res->csr_status)); return status; } diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index d051fc3583a9097d46d8159860ea6c22a7ec116a..c14758e08d738eec44bf08c79acee71366ce0e70 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -171,8 +171,6 @@ static __be32 decode_compound_hdr_arg(struct xdr_stream *xdr, struct cb_compound return htonl(NFS4ERR_MINOR_VERS_MISMATCH); } hdr->nops = ntohl(*p); - dprintk("%s: minorversion %d nops %d\n", __func__, - hdr->minorversion, hdr->nops); return 0; } @@ -192,11 +190,8 @@ static __be32 decode_getattr_args(struct svc_rqst *rqstp, struct xdr_stream *xdr status = decode_fh(xdr, &args->fh); if (unlikely(status != 0)) - goto out; - status = decode_bitmap(xdr, args->bitmap); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return status; + return decode_bitmap(xdr, args->bitmap); } static __be32 decode_recall_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_recallargs *args) @@ -206,17 +201,12 @@ status = decode_delegation_stateid(xdr, &args->stateid); if (unlikely(status != 0)) - goto out; + return status; p = read_buf(xdr, 4); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_RESOURCE); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_RESOURCE); args->truncate = ntohl(*p); - status = decode_fh(xdr, &args->fh); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return decode_fh(xdr, &args->fh); } #if defined(CONFIG_NFS_V4_1) @@ -235,10 +225,8 @@ static __be32 decode_layoutrecall_args(struct svc_rqst *rqstp, uint32_t iomode; p = read_buf(xdr, 4 * sizeof(uint32_t)); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); args->cbl_layout_type = ntohl(*p++); /* Despite the spec's xdr, iomode really belongs in the FILE switch, * as it is unused for other recall types. */ iomode = ntohl(*p++); args->cbl_layoutchanged = ntohl(*p++); args->cbl_recall_type = ntohl(*p++); if (args->cbl_recall_type == RETURN_FILE) { args->cbl_range.iomode = iomode; status = decode_fh(xdr, &args->cbl_fh); if (unlikely(status != 0)) - goto out; + return status; p = read_buf(xdr, 2 * sizeof(uint64_t)); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_BADXDR); -
goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_range.offset); p = xdr_decode_hyper(p, &args->cbl_range.length); - status = decode_layout_stateid(xdr, &args->cbl_stateid); - if (unlikely(status != 0)) - goto out; + return decode_layout_stateid(xdr, &args->cbl_stateid); } else if (args->cbl_recall_type == RETURN_FSID) { p = read_buf(xdr, 2 * sizeof(uint64_t)); - if (unlikely(p == NULL)) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } + if (unlikely(p == NULL)) + return htonl(NFS4ERR_BADXDR); p = xdr_decode_hyper(p, &args->cbl_fsid.major); p = xdr_decode_hyper(p, &args->cbl_fsid.minor); - } else if (args->cbl_recall_type != RETURN_ALL) { - status = htonl(NFS4ERR_BADXDR); - goto out; - } - dprintk("%s: ltype 0x%x iomode %d changed %d recall_type %d\n", - __func__, - args->cbl_layout_type, iomode, - args->cbl_layoutchanged, args->cbl_recall_type); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + } else if (args->cbl_recall_type != RETURN_ALL) + return htonl(NFS4ERR_BADXDR); + return 0; } static @@ -437,12 +411,11 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, status = decode_sessionid(xdr, &args->csa_sessionid); if (status) - goto out; + return status; - status = htonl(NFS4ERR_RESOURCE); p = read_buf(xdr, 5 * sizeof(uint32_t)); if (unlikely(p == NULL)) - goto out; + return htonl(NFS4ERR_RESOURCE); args->csa_addr = svc_addr(rqstp); args->csa_sequenceid = ntohl(*p++); @@ -456,7 +429,7 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, sizeof(*args->csa_rclists), GFP_KERNEL); if (unlikely(args->csa_rclists == NULL)) - goto out; + return htonl(NFS4ERR_RESOURCE); for (i = 0; i < args->csa_nrclists; i++) { status = decode_rc_list(xdr, &args->csa_rclists[i]); @@ -466,27 +439,13 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp, } } } - status = 0; - - dprintk("%s: sessionid %x:%x:%x:%x sequenceid %u slotid %u " - "highestslotid %u cachethis %d nrclists %u\n", - __func__, - ((u32 *)&args->csa_sessionid)[0], - ((u32 *)&args->csa_sessionid)[1], - ((u32 *)&args->csa_sessionid)[2], - ((u32 *)&args->csa_sessionid)[3], - args->csa_sequenceid, args->csa_slotid, - args->csa_highestslotid, args->csa_cachethis, - args->csa_nrclists); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return 0; out_free: for (i = 0; i < args->csa_nrclists; i++) kfree(args->csa_rclists[i].rcl_refcalls); kfree(args->csa_rclists); - goto out; + return status; } static __be32 decode_recallany_args(struct svc_rqst *rqstp, @@ -557,11 +516,8 @@ static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream status = decode_fh(xdr, &args->cbnl_fh); if (unlikely(status != 0)) - goto out; - status = decode_lockowner(xdr, args); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return status; + return decode_lockowner(xdr, args); } #endif /* CONFIG_NFS_V4_1 */ @@ -707,7 +663,6 @@ static __be32 encode_getattr_res(struct svc_rqst *rqstp, struct xdr_stream *xdr, status = encode_attr_mtime(xdr, res->bitmap, &res->mtime); *savep = htonl((unsigned int)((char *)xdr->p - (char *)(savep+1))); out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); return status; } @@ -734,11 +689,11 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, __be32 status = res->csr_status; if (unlikely(status != 0)) - goto out; + return status; status = encode_sessionid(xdr, &res->csr_sessionid); 
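The callback_xdr.c hunks above and below all apply the same mechanical cleanup: once the exit-path dprintk() is deleted, the shared out: label no longer earns its keep, so each decoder returns the moment it knows the answer. Distilled from decode_getattr_args(), the shape of the change is:

	/* before: single exit point, kept alive only by the debug print */
	status = decode_fh(xdr, &args->fh);
	if (unlikely(status != 0))
		goto out;
	status = decode_bitmap(xdr, args->bitmap);
out:
	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
	return status;

	/* after: return errors where they occur, tail-call the last decoder */
	status = decode_fh(xdr, &args->fh);
	if (unlikely(status != 0))
		return status;
	return decode_bitmap(xdr, args->bitmap);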
if (status) - goto out; + return status; p = xdr_reserve_space(xdr, 4 * sizeof(uint32_t)); if (unlikely(p == NULL)) @@ -748,9 +703,7 @@ static __be32 encode_cb_sequence_res(struct svc_rqst *rqstp, *p++ = htonl(res->csr_slotid); *p++ = htonl(res->csr_highestslotid); *p++ = htonl(res->csr_target_highestslotid); -out: - dprintk("%s: exit with status = %d\n", __func__, ntohl(status)); - return status; + return 0; } static __be32 @@ -871,14 +824,10 @@ static __be32 process_op(int nop, struct svc_rqst *rqstp, long maxlen; __be32 res; - dprintk("%s: start\n", __func__); status = decode_op_hdr(xdr_in, &op_nr); if (unlikely(status)) return status; - dprintk("%s: minorversion=%d nop=%d op_nr=%u\n", - __func__, cps->minorversion, nop, op_nr); - switch (cps->minorversion) { case 0: status = preprocess_nfs4_op(op_nr, &op); @@ -917,7 +866,6 @@ static __be32 process_op(int nop, struct svc_rqst *rqstp, return res; if (op->encode_res != NULL && status == 0) status = op->encode_res(rqstp, xdr_out, resp); - dprintk("%s: done, status = %d\n", __func__, ntohl(status)); return status; } @@ -937,8 +885,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r }; unsigned int nops = 0; - dprintk("%s: start\n", __func__); - xdr_init_decode(&xdr_in, &rqstp->rq_arg, rqstp->rq_arg.head[0].iov_base); p = (__be32*)((char *)rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len); @@ -977,7 +923,6 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r *hdr_res.nops = htonl(nops); nfs4_cb_free_slot(&cps); nfs_put_client(cps.clp); - dprintk("%s: done, status = %u\n", __func__, ntohl(status)); return rpc_success; out_invalidcred: diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 390ada8741bcbfd2e4aaecb3f759ec0707003674..ee5ddbd36088e66d21b2900fddb9c7ded0d17f9a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -218,6 +218,7 @@ static void nfs_cb_idr_remove_locked(struct nfs_client *clp) static void pnfs_init_server(struct nfs_server *server) { rpc_init_wait_queue(&server->roc_rpcwaitq, "pNFS ROC"); + rpc_init_wait_queue(&server->uoc_rpcwaitq, "NFS UOC"); } #else @@ -240,8 +241,6 @@ static void pnfs_init_server(struct nfs_server *server) */ void nfs_free_client(struct nfs_client *clp) { - dprintk("--> nfs_free_client(%u)\n", clp->rpc_ops->version); - nfs_fscache_release_client_cookie(clp); /* -EIO all pending I/O */ @@ -256,8 +255,6 @@ void nfs_free_client(struct nfs_client *clp) kfree(clp->cl_hostname); kfree(clp->cl_acceptor); kfree(clp); - - dprintk("<-- nfs_free_client()\n"); } EXPORT_SYMBOL_GPL(nfs_free_client); @@ -271,7 +268,6 @@ void nfs_put_client(struct nfs_client *clp) if (!clp) return; - dprintk("--> nfs_put_client({%d})\n", atomic_read(&clp->cl_count)); nn = net_generic(clp->cl_net, nfs_net_id); if (atomic_dec_and_lock(&clp->cl_count, &nn->nfs_client_lock)) { @@ -382,9 +378,6 @@ nfs_found_client(const struct nfs_client_initdata *cl_init, } smp_rmb(); - - dprintk("<-- %s found nfs_client %p for %s\n", - __func__, clp, cl_init->hostname ?: ""); return clp; } @@ -403,9 +396,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) return NULL; } - dprintk("--> nfs_get_client(%s,v%u)\n", - cl_init->hostname, rpc_ops->version); - /* see if the client already exists */ do { spin_lock(&nn->nfs_client_lock); @@ -430,8 +420,6 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) new = rpc_ops->alloc_client(cl_init); } while (!IS_ERR(new)); - dprintk("<-- nfs_get_client() Failed to find %s (%ld)\n", - 
cl_init->hostname, PTR_ERR(new)); return new; } EXPORT_SYMBOL_GPL(nfs_get_client); @@ -558,6 +546,7 @@ static int nfs_start_lockd(struct nfs_server *server) .noresvport = server->flags & NFS_MOUNT_NORESVPORT ? 1 : 0, .net = clp->cl_net, + .nlmclnt_ops = clp->cl_nfs_mod->rpc_ops->nlmclnt_ops, }; if (nlm_init.nfs_version > 3) @@ -624,27 +613,21 @@ struct nfs_client *nfs_init_client(struct nfs_client *clp, { int error; - if (clp->cl_cons_state == NFS_CS_READY) { - /* the client is already initialised */ - dprintk("<-- nfs_init_client() = 0 [already %p]\n", clp); + /* the client is already initialised */ + if (clp->cl_cons_state == NFS_CS_READY) return clp; - } /* * Create a client RPC handle for doing FSSTAT with UNIX auth only * - RFC 2623, sec 2.3.2 */ error = nfs_create_rpc_client(clp, cl_init, RPC_AUTH_UNIX); - if (error < 0) - goto error; - nfs_mark_client_ready(clp, NFS_CS_READY); + nfs_mark_client_ready(clp, error == 0 ? NFS_CS_READY : error); + if (error < 0) { + nfs_put_client(clp); + clp = ERR_PTR(error); + } return clp; - -error: - nfs_mark_client_ready(clp, error); - nfs_put_client(clp); - dprintk("<-- nfs_init_client() = xerror %d\n", error); - return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_init_client); @@ -668,8 +651,6 @@ static int nfs_init_server(struct nfs_server *server, struct nfs_client *clp; int error; - dprintk("--> nfs_init_server()\n"); - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, data->timeo, data->retrans); if (data->flags & NFS_MOUNT_NORESVPORT) @@ -677,10 +658,8 @@ static int nfs_init_server(struct nfs_server *server, /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); - if (IS_ERR(clp)) { - dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); + if (IS_ERR(clp)) return PTR_ERR(clp); - } server->nfs_client = clp; @@ -725,13 +704,11 @@ static int nfs_init_server(struct nfs_server *server, server->mountd_protocol = data->mount_server.protocol; server->namelen = data->namlen; - dprintk("<-- nfs_init_server() = 0 [new %p]\n", clp); return 0; error: server->nfs_client = NULL; nfs_put_client(clp); - dprintk("<-- nfs_init_server() = xerror %d\n", error); return error; } @@ -761,9 +738,6 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_SIZE - 1) >> PAGE_SHIFT; - server->backing_dev_info.name = "nfs"; - server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; - if (server->wsize > max_rpc_payload) server->wsize = max_rpc_payload; if (server->wsize > NFS_MAX_FILE_IO_SIZE) @@ -801,12 +775,10 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs struct nfs_client *clp = server->nfs_client; int error; - dprintk("--> nfs_probe_fsinfo()\n"); - if (clp->rpc_ops->set_capabilities != NULL) { error = clp->rpc_ops->set_capabilities(server, mntfh); if (error < 0) - goto out_error; + return error; } fsinfo.fattr = fattr; @@ -814,7 +786,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype)); error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo); if (error < 0) - goto out_error; + return error; nfs_server_set_fsinfo(server, &fsinfo); @@ -829,12 +801,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs server->namelen = pathinfo.max_namelen; } - dprintk("<-- nfs_probe_fsinfo() = 0\n"); return 0; - -out_error: - dprintk("nfs_probe_fsinfo: error = %d\n", -error); - return error; 
} EXPORT_SYMBOL_GPL(nfs_probe_fsinfo); @@ -917,12 +884,6 @@ struct nfs_server *nfs_alloc_server(void) return NULL; } - if (bdi_init(&server->backing_dev_info)) { - nfs_free_iostats(server->io_stats); - kfree(server); - return NULL; - } - ida_init(&server->openowner_id); ida_init(&server->lockowner_id); pnfs_init_server(server); @@ -936,8 +897,6 @@ EXPORT_SYMBOL_GPL(nfs_alloc_server); */ void nfs_free_server(struct nfs_server *server) { - dprintk("--> nfs_free_server()\n"); - nfs_server_remove_lists(server); if (server->destroy != NULL) @@ -953,10 +912,8 @@ void nfs_free_server(struct nfs_server *server) ida_destroy(&server->lockowner_id); ida_destroy(&server->openowner_id); nfs_free_iostats(server->io_stats); - bdi_destroy(&server->backing_dev_info); kfree(server); nfs_release_automount_timer(); - dprintk("<-- nfs_free_server()\n"); } EXPORT_SYMBOL_GPL(nfs_free_server); @@ -1036,10 +993,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, struct nfs_fattr *fattr_fsinfo; int error; - dprintk("--> nfs_clone_server(,%llx:%llx,)\n", - (unsigned long long) fattr->fsid.major, - (unsigned long long) fattr->fsid.minor); - server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); @@ -1071,10 +1024,6 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; - dprintk("Cloned FSID: %llx:%llx\n", - (unsigned long long) server->fsid.major, - (unsigned long long) server->fsid.minor); - error = nfs_start_lockd(server); if (error < 0) goto out_free_server; @@ -1083,13 +1032,11 @@ struct nfs_server *nfs_clone_server(struct nfs_server *source, server->mount_time = jiffies; nfs_free_fattr(fattr_fsinfo); - dprintk("<-- nfs_clone_server() = %p\n", server); return server; out_free_server: nfs_free_fattr(fattr_fsinfo); nfs_free_server(server); - dprintk("<-- nfs_clone_server() = error %d\n", error); return ERR_PTR(error); } EXPORT_SYMBOL_GPL(nfs_clone_server); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index f92ba8d6c5569099f6c469eda92446ad0d7e148d..32ccd7754f8a2875933d1f9c532b54c656971bfd 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -57,7 +57,7 @@ static void nfs_readdir_clear_array(struct page*); const struct file_operations nfs_dir_operations = { .llseek = nfs_llseek_dir, .read = generic_read_dir, - .iterate_shared = nfs_readdir, + .iterate = nfs_readdir, .open = nfs_opendir, .release = nfs_closedir, .fsync = nfs_fsync_dir, @@ -145,7 +145,6 @@ struct nfs_cache_array_entry { }; struct nfs_cache_array { - atomic_t refcount; int size; int eof_index; u64 last_cookie; @@ -170,27 +169,6 @@ typedef struct { unsigned int eof:1; } nfs_readdir_descriptor_t; -/* - * The caller is responsible for calling nfs_readdir_release_array(page) - */ -static -struct nfs_cache_array *nfs_readdir_get_array(struct page *page) -{ - void *ptr; - if (page == NULL) - return ERR_PTR(-EIO); - ptr = kmap(page); - if (ptr == NULL) - return ERR_PTR(-ENOMEM); - return ptr; -} - -static -void nfs_readdir_release_array(struct page *page) -{ - kunmap(page); -} - /* * we are freeing strings created by nfs_add_to_readdir_array() */ @@ -201,18 +179,9 @@ void nfs_readdir_clear_array(struct page *page) int i; array = kmap_atomic(page); - if (atomic_dec_and_test(&array->refcount)) - for (i = 0; i < array->size; i++) - kfree(array->array[i].string.name); - kunmap_atomic(array); -} - -static bool grab_page(struct page *page) -{ - struct nfs_cache_array *array = kmap_atomic(page); - bool res = 
atomic_inc_not_zero(&array->refcount); + for (i = 0; i < array->size; i++) + kfree(array->array[i].string.name); kunmap_atomic(array); - return res; } /* @@ -239,13 +208,10 @@ int nfs_readdir_make_qstr(struct qstr *string, const char *name, unsigned int le static int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) { - struct nfs_cache_array *array = nfs_readdir_get_array(page); + struct nfs_cache_array *array = kmap(page); struct nfs_cache_array_entry *cache_entry; int ret; - if (IS_ERR(array)) - return PTR_ERR(array); - cache_entry = &array->array[array->size]; /* Check that this entry lies within the page bounds */ @@ -264,7 +230,7 @@ int nfs_readdir_add_to_array(struct nfs_entry *entry, struct page *page) if (entry->eof != 0) array->eof_index = array->size; out: - nfs_readdir_release_array(page); + kunmap(page); return ret; } @@ -353,11 +319,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) struct nfs_cache_array *array; int status; - array = nfs_readdir_get_array(desc->page); - if (IS_ERR(array)) { - status = PTR_ERR(array); - goto out; - } + array = kmap(desc->page); if (*desc->dir_cookie == 0) status = nfs_readdir_search_for_pos(array, desc); @@ -369,8 +331,7 @@ int nfs_readdir_search_array(nfs_readdir_descriptor_t *desc) desc->current_index += array->size; desc->page_index++; } - nfs_readdir_release_array(desc->page); -out: + kunmap(desc->page); return status; } @@ -606,13 +567,10 @@ int nfs_readdir_page_filler(nfs_readdir_descriptor_t *desc, struct nfs_entry *en out_nopages: if (count == 0 || (status == -EBADCOOKIE && entry->eof != 0)) { - array = nfs_readdir_get_array(page); - if (!IS_ERR(array)) { - array->eof_index = array->size; - status = 0; - nfs_readdir_release_array(page); - } else - status = PTR_ERR(array); + array = kmap(page); + array->eof_index = array->size; + status = 0; + kunmap(page); } put_page(scratch); @@ -674,13 +632,8 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, goto out; } - array = nfs_readdir_get_array(page); - if (IS_ERR(array)) { - status = PTR_ERR(array); - goto out_label_free; - } + array = kmap(page); memset(array, 0, sizeof(struct nfs_cache_array)); - atomic_set(&array->refcount, 1); array->eof_index = -1; status = nfs_readdir_alloc_pages(pages, array_size); @@ -703,8 +656,7 @@ int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, nfs_readdir_free_pages(pages, array_size); out_release_array: - nfs_readdir_release_array(page); -out_label_free: + kunmap(page); nfs4_label_free(entry.label); out: nfs_free_fattr(entry.fattr); @@ -743,7 +695,8 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page* page) static void cache_page_release(nfs_readdir_descriptor_t *desc) { - nfs_readdir_clear_array(desc->page); + if (!desc->page->mapping) + nfs_readdir_clear_array(desc->page); put_page(desc->page); desc->page = NULL; } @@ -751,16 +704,8 @@ void cache_page_release(nfs_readdir_descriptor_t *desc) static struct page *get_cache_page(nfs_readdir_descriptor_t *desc) { - struct page *page; - - for (;;) { - page = read_cache_page(desc->file->f_mapping, + return read_cache_page(desc->file->f_mapping, desc->page_index, (filler_t *)nfs_readdir_filler, desc); - if (IS_ERR(page) || grab_page(page)) - break; - put_page(page); - } - return page; } /* @@ -809,12 +754,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) struct nfs_cache_array *array = NULL; struct nfs_open_dir_context *ctx = file->private_data; - array = nfs_readdir_get_array(desc->page); - if 
(IS_ERR(array)) { - res = PTR_ERR(array); - goto out; - } - + array = kmap(desc->page); for (i = desc->cache_entry_index; i < array->size; i++) { struct nfs_cache_array_entry *ent; @@ -835,8 +775,7 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc) if (array->eof_index >= 0) desc->eof = 1; - nfs_readdir_release_array(desc->page); -out: + kunmap(desc->page); cache_page_release(desc); dfprintk(DIRCACHE, "NFS: nfs_do_filldir() filling ended @ cookie %Lu; returning = %d\n", (unsigned long long)*desc->dir_cookie, res); @@ -966,11 +905,13 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) { + struct inode *inode = file_inode(filp); struct nfs_open_dir_context *dir_ctx = filp->private_data; dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); + inode_lock(inode); switch (whence) { case 1: offset += filp->f_pos; @@ -978,13 +919,16 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) if (offset >= 0) break; default: - return -EINVAL; + offset = -EINVAL; + goto out; } if (offset != filp->f_pos) { filp->f_pos = offset; dir_ctx->dir_cookie = 0; dir_ctx->duped = 0; } +out: + inode_unlock(inode); return offset; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index aab32fc3d6a84965ea879854c942b12888548411..6fb9fad2d1e6cf6909cfe6dfb2e482bff969e7df 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -392,16 +392,6 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) nfs_direct_req_release(dreq); } -static void nfs_direct_readpage_release(struct nfs_page *req) -{ - dprintk("NFS: direct read done (%s/%llu %d@%lld)\n", - req->wb_context->dentry->d_sb->s_id, - (unsigned long long)NFS_FILEID(d_inode(req->wb_context->dentry)), - req->wb_bytes, - (long long)req_offset(req)); - nfs_release_request(req); -} - static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) { unsigned long bytes = 0; @@ -426,7 +416,7 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr) set_page_dirty(page); bytes += req->wb_bytes; nfs_list_remove_request(req); - nfs_direct_readpage_release(req); + nfs_release_request(req); } out_put: if (put_dreq(dreq)) @@ -537,7 +527,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, if (put_dreq(dreq)) nfs_direct_complete(dreq); - return 0; + return requested_bytes; } /** @@ -566,7 +556,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) struct inode *inode = mapping->host; struct nfs_direct_req *dreq; struct nfs_lock_context *l_ctx; - ssize_t result = -EINVAL; + ssize_t result = -EINVAL, requested; size_t count = iov_iter_count(iter); nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count); @@ -600,14 +590,19 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter) nfs_start_io_direct(inode); NFS_I(inode)->read_io += count; - result = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); + requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); nfs_end_io_direct(inode); - if (!result) { + if (requested > 0) { result = nfs_direct_wait(dreq); - if (result > 0) + if (result > 0) { + requested -= result; iocb->ki_pos += result; + } + iov_iter_revert(iter, requested); + } else { + result = requested; } out_release: @@ -695,16 +690,9 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) int status = data->task.tk_status; nfs_init_cinfo_from_dreq(&cinfo, dreq); - if (status < 0) { - dprintk("NFS: %5u commit failed 
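The direct-read rework above changes the contract of the scheduling helper: instead of returning zero, it now reports how many bytes it queued, which is also how far it advanced the iov_iter, so the caller can rewind whatever never completed. A condensed sketch of that bookkeeping, with schedule_io() standing in for nfs_direct_read_schedule_iovec() (hypothetical name, same shape):

	ssize_t requested, result;

	requested = schedule_io(dreq, iter);	/* iter advanced by 'requested' bytes */
	if (requested > 0) {
		result = nfs_direct_wait(dreq);	/* bytes that actually completed */
		if (result > 0)
			requested -= result;	/* completed bytes stay consumed */
		iov_iter_revert(iter, requested); /* rewind the remainder for retry */
	} else {
		result = requested;		/* scheduling error, nothing queued */
	}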
with error %d.\n", - data->task.tk_pid, status); + if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data)) dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } else if (nfs_direct_cmp_commit_data_verf(dreq, data)) { - dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; - } - dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); @@ -954,7 +942,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, if (put_dreq(dreq)) nfs_direct_write_complete(dreq); - return 0; + return requested_bytes; } /** @@ -979,7 +967,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, */ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) { - ssize_t result = -EINVAL; + ssize_t result = -EINVAL, requested; size_t count; struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -1022,7 +1010,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) nfs_start_io_direct(inode); - result = nfs_direct_write_schedule_iovec(dreq, iter, pos); + requested = nfs_direct_write_schedule_iovec(dreq, iter, pos); if (mapping->nrpages) { invalidate_inode_pages2_range(mapping, @@ -1031,13 +1019,17 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) nfs_end_io_direct(inode); - if (!result) { + if (requested > 0) { result = nfs_direct_wait(dreq); if (result > 0) { + requested -= result; iocb->ki_pos = pos + result; /* XXX: should check the generic_write_sync retval */ generic_write_sync(iocb, result); } + iov_iter_revert(iter, requested); + } else { + result = requested; } out_release: nfs_direct_req_release(dreq); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 668213984d68708c3d54cccce36ad3a0c048657e..5713eb32a45ea20c1de50f2ee28f4ddcae468f67 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -482,7 +482,7 @@ static int nfs_launder_page(struct page *page) inode->i_ino, (long long)page_offset(page)); nfs_fscache_wait_on_page_write(nfsi, page); - return nfs_wb_launder_page(inode, page); + return nfs_wb_page(inode, page); } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -697,14 +697,14 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) if (!IS_ERR(l_ctx)) { status = nfs_iocounter_wait(l_ctx); nfs_put_lock_context(l_ctx); - if (status < 0) + /* NOTE: special case + * If we're signalled while cleaning up locks on process exit, we + * still need to complete the unlock. + */ + if (status < 0 && !(fl->fl_flags & FL_CLOSE)) return status; } - /* NOTE: special case - * If we're signalled while cleaning up locks on process exit, we - * still need to complete the unlock. - */ /* * Use local locking if mounted with "-onolock" or with appropriate * "-olocal_lock=" @@ -820,9 +820,23 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) if (NFS_SERVER(inode)->flags & NFS_MOUNT_LOCAL_FLOCK) is_local = 1; - /* We're simulating flock() locks using posix locks on the server */ - if (fl->fl_type == F_UNLCK) + /* + * VFS doesn't require the open mode to match a flock() lock's type. + * NFS, however, may simulate flock() locking with posix locking which + * requires the open mode to match the lock type. 
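For context on the check that follows: the VFS does not require an flock() lock type to match the open mode, but NFS may have to emulate flock() with posix (byte-range) locks on the server, and those are tied to the mode the file was opened with. A user-space view of a request that must therefore be refused (illustrative only):

	#include <fcntl.h>
	#include <sys/file.h>

	int try_shared_lock(const char *path)
	{
		int fd = open(path, O_WRONLY);	/* write-only open */

		if (fd < 0)
			return -1;
		/* On an NFS mount emulating flock() with posix locks, a shared
		 * (read) lock on a write-only descriptor fails with EBADF. */
		return flock(fd, LOCK_SH);
	}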
+ */ + switch (fl->fl_type) { + case F_UNLCK: return do_unlk(filp, cmd, fl, is_local); + case F_RDLCK: + if (!(filp->f_mode & FMODE_READ)) + return -EBADF; + break; + case F_WRLCK: + if (!(filp->f_mode & FMODE_WRITE)) + return -EBADF; + } + return do_setlk(filp, cmd, fl, is_local); } EXPORT_SYMBOL_GPL(nfs_flock); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index acd30baca46166c902aa5dfae1663184cc30e235..1cf85d65b7489bd56c69447d618360263febc52c 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -921,11 +921,11 @@ fl_pnfs_update_layout(struct inode *ino, fl = FILELAYOUT_LSEG(lseg); status = filelayout_check_deviceid(lo, fl, gfp_flags); - if (status) + if (status) { + pnfs_put_lseg(lseg); lseg = ERR_PTR(status); + } out: - if (IS_ERR(lseg)) - pnfs_put_lseg(lseg); return lseg; } @@ -933,6 +933,7 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { + pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -959,6 +960,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; + pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode, req->wb_context, diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 42dedf2d625fca62a418c5e4f82935dc41a53987..f5714ee01000de49c5efcd3675226a3a3b7a7074 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -846,6 +846,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, int ds_idx; retry: + pnfs_generic_pg_check_layout(pgio); /* Use full layout for now */ if (!pgio->pg_lseg) ff_layout_pg_get_read(pgio, req, false); @@ -894,6 +895,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, int status; retry: + pnfs_generic_pg_check_layout(pgio); if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -1800,16 +1802,16 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); if (!ds) - return PNFS_NOT_ATTEMPTED; + goto out_failed; ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, hdr->inode); if (IS_ERR(ds_clnt)) - return PNFS_NOT_ATTEMPTED; + goto out_failed; ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); if (!ds_cred) - return PNFS_NOT_ATTEMPTED; + goto out_failed; vers = nfs4_ff_layout_ds_version(lseg, idx); @@ -1839,6 +1841,11 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) sync, RPC_TASK_SOFTCONN); put_rpccred(ds_cred); return PNFS_ATTEMPTED; + +out_failed: + if (ff_layout_avoid_mds_available_ds(lseg)) + return PNFS_TRY_AGAIN; + return PNFS_NOT_ATTEMPTED; } static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) @@ -2354,10 +2361,21 @@ ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) return 0; } +static int +ff_layout_set_layoutdriver(struct nfs_server *server, + const struct nfs_fh *dummy) +{ +#if IS_ENABLED(CONFIG_NFS_V4_2) + server->caps |= NFS_CAP_LAYOUTSTATS; +#endif + return 0; +} + static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", .owner = THIS_MODULE, + .set_layoutdriver = ff_layout_set_layoutdriver, .alloc_layout_hdr = ff_layout_alloc_layout_hdr, .free_layout_hdr = ff_layout_free_layout_hdr, .alloc_lseg = 
ff_layout_alloc_lseg, diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 457cfeb1d5c162e4177450eb941460a2fe39f3b1..6df7a0cf566015378aa3f76c480115675454297d 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -119,7 +119,13 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE) ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE; - if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) { + /* + * check for valid major/minor combination. + * currently we support dataserver which talk: + * v3, v4.0, v4.1, v4.2 + */ + if (!((ds_versions[i].version == 3 && ds_versions[i].minor_version == 0) || + (ds_versions[i].version == 4 && ds_versions[i].minor_version < 3))) { dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__, i, ds_versions[i].version, ds_versions[i].minor_version); @@ -415,7 +421,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror->mirror_ds->ds_versions[0].minor_version); /* connect success, check rsize/wsize limit */ - if (ds->ds_clp) { + if (!status) { max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index f489a5a71bd5cd89f0667b455e04930753e24e84..1de93ba78dc95a34292cc6160198d9be7c843de1 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -734,7 +734,10 @@ int nfs_getattr(const struct path *path, struct kstat *stat, if (need_atime || nfs_need_revalidate_inode(inode)) { struct nfs_server *server = NFS_SERVER(inode); - nfs_readdirplus_parent_cache_miss(path->dentry); + if (!(server->flags & NFS_MOUNT_NOAC)) + nfs_readdirplus_parent_cache_miss(path->dentry); + else + nfs_readdirplus_parent_cache_hit(path->dentry); err = __nfs_revalidate_inode(server, inode); } else nfs_readdirplus_parent_cache_hit(path->dentry); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7b38fedb7e032824ec509edca5cf465a22147851..e9b4c3320e371a90020ea25f4be44d2d76508db7 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -495,7 +495,6 @@ void nfs_mark_request_commit(struct nfs_page *req, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); -int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, @@ -756,9 +755,13 @@ static inline bool nfs_error_is_fatal(int err) { switch (err) { case -ERESTARTSYS: + case -EACCES: + case -EDQUOT: + case -EFBIG: case -EIO: case -ENOSPC: case -EROFS: + case -ESTALE: case -E2BIG: return true; default: diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 786f175805827df3a76ef4e37307927f81372628..1a224a33a6c23c362e1bbacb8150bb9bbf02b3fd 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -143,11 +143,8 @@ struct vfsmount *nfs_d_automount(struct path *path) struct nfs_fh *fh = NULL; struct nfs_fattr *fattr = NULL; - dprintk("--> nfs_d_automount()\n"); - - mnt = ERR_PTR(-ESTALE); if (IS_ROOT(path->dentry)) - goto out_nofree; + return ERR_PTR(-ESTALE); mnt = ERR_PTR(-ENOMEM); fh = nfs_alloc_fhandle(); @@ -155,13 +152,10 @@ struct vfsmount *nfs_d_automount(struct path *path) if (fh == NULL || fattr == NULL) goto out; - dprintk("%s: enter\n", __func__); - mnt = 
server->nfs_client->rpc_ops->submount(server, path->dentry, fh, fattr); if (IS_ERR(mnt)) goto out; - dprintk("%s: done, success\n", __func__); mntget(mnt); /* prevent immediate expiration */ mnt_set_expiry(mnt, &nfs_automount_list); schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout); @@ -169,11 +163,6 @@ struct vfsmount *nfs_d_automount(struct path *path) out: nfs_free_fattr(fattr); nfs_free_fhandle(fh); -out_nofree: - if (IS_ERR(mnt)) - dprintk("<-- %s(): error %ld\n", __func__, PTR_ERR(mnt)); - else - dprintk("<-- %s() = %p\n", __func__, mnt); return mnt; } @@ -248,27 +237,20 @@ struct vfsmount *nfs_do_submount(struct dentry *dentry, struct nfs_fh *fh, .fattr = fattr, .authflavor = authflavor, }; - struct vfsmount *mnt = ERR_PTR(-ENOMEM); + struct vfsmount *mnt; char *page = (char *) __get_free_page(GFP_USER); char *devname; - dprintk("--> nfs_do_submount()\n"); - - dprintk("%s: submounting on %pd2\n", __func__, - dentry); if (page == NULL) - goto out; + return ERR_PTR(-ENOMEM); + devname = nfs_devname(dentry, page, PAGE_SIZE); - mnt = (struct vfsmount *)devname; if (IS_ERR(devname)) - goto free_page; - mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); -free_page: - free_page((unsigned long)page); -out: - dprintk("%s: done\n", __func__); + mnt = (struct vfsmount *)devname; + else + mnt = nfs_do_clone_mount(NFS_SB(dentry->d_sb), devname, &mountdata); - dprintk("<-- nfs_do_submount() = %p\n", mnt); + free_page((unsigned long)page); return mnt; } EXPORT_SYMBOL_GPL(nfs_do_submount); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index dc925b531f32632bb708c3f8490d0d00c7315495..0c07b567118dcd8a99d924a738c0fe7a0149bf43 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -865,12 +865,63 @@ static void nfs3_proc_commit_setup(struct nfs_commit_data *data, struct rpc_mess msg->rpc_proc = &nfs3_procedures[NFS3PROC_COMMIT]; } +static void nfs3_nlm_alloc_call(void *data) +{ + struct nfs_lock_context *l_ctx = data; + if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) { + get_nfs_open_context(l_ctx->open_context); + nfs_get_lock_context(l_ctx->open_context); + } +} + +static bool nfs3_nlm_unlock_prepare(struct rpc_task *task, void *data) +{ + struct nfs_lock_context *l_ctx = data; + if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) + return nfs_async_iocounter_wait(task, l_ctx); + return false; + +} + +static void nfs3_nlm_release_call(void *data) +{ + struct nfs_lock_context *l_ctx = data; + struct nfs_open_context *ctx; + if (l_ctx && test_bit(NFS_CONTEXT_UNLOCK, &l_ctx->open_context->flags)) { + ctx = l_ctx->open_context; + nfs_put_lock_context(l_ctx); + put_nfs_open_context(ctx); + } +} + +const struct nlmclnt_operations nlmclnt_fl_close_lock_ops = { + .nlmclnt_alloc_call = nfs3_nlm_alloc_call, + .nlmclnt_unlock_prepare = nfs3_nlm_unlock_prepare, + .nlmclnt_release_call = nfs3_nlm_release_call, +}; + static int nfs3_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); + struct nfs_lock_context *l_ctx = NULL; + struct nfs_open_context *ctx = nfs_file_open_context(filp); + int status; - return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); + if (fl->fl_flags & FL_CLOSE) { + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) + l_ctx = NULL; + else + set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags); + } + + status = nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, l_ctx); + + if (l_ctx) + nfs_put_lock_context(l_ctx); + + return status; } static int 
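The three hooks above form a reference-counting bracket around an asynchronous NLM call: nlmclnt_alloc_call pins the open and lock contexts before the RPC is set up, nlmclnt_unlock_prepare may put the task to sleep until outstanding I/O on the lock context drains, and nlmclnt_release_call drops the references once the RPC is torn down. A sketch of the order in which the lockd client would drive them (hypothetical caller, shown only to make the sequence concrete):

	static void example_unlock_call(const struct nlmclnt_operations *ops,
					struct rpc_task *task, void *data)
	{
		if (!ops)
			return;
		if (ops->nlmclnt_alloc_call)
			ops->nlmclnt_alloc_call(data);		/* pin refs before going async */
		if (ops->nlmclnt_unlock_prepare &&
		    ops->nlmclnt_unlock_prepare(task, data))
			return;					/* task waits for I/O to drain */
		/* ... the UNLOCK RPC runs here ... */
		if (ops->nlmclnt_release_call)
			ops->nlmclnt_release_call(data);	/* drop the refs taken above */
	}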
nfs3_have_delegation(struct inode *inode, fmode_t flags) @@ -921,6 +972,7 @@ const struct nfs_rpc_ops nfs_v3_clientops = { .dir_inode_ops = &nfs3_dir_inode_operations, .file_inode_ops = &nfs3_file_inode_operations, .file_ops = &nfs_file_operations, + .nlmclnt_ops = &nlmclnt_fl_close_lock_ops, .getroot = nfs3_proc_get_root, .submount = nfs_submount, .try_mount = nfs_try_mount, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 1e486c73ec9438d99e2e4a5b6637ec64594b626c..929d09a5310ad7df79be527a45a9aa524825b7f0 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -167,23 +167,29 @@ static ssize_t _nfs42_proc_copy(struct file *src, if (status) return status; + res->commit_res.verf = kzalloc(sizeof(struct nfs_writeverf), GFP_NOFS); + if (!res->commit_res.verf) + return -ENOMEM; status = nfs4_call_sync(server->client, server, &msg, &args->seq_args, &res->seq_res, 0); if (status == -ENOTSUPP) server->caps &= ~NFS_CAP_COPY; if (status) - return status; + goto out; - if (res->write_res.verifier.committed != NFS_FILE_SYNC) { - status = nfs_commit_file(dst, &res->write_res.verifier.verifier); - if (status) - return status; + if (nfs_write_verifier_cmp(&res->write_res.verifier.verifier, + &res->commit_res.verf->verifier)) { + status = -EAGAIN; + goto out; } truncate_pagecache_range(dst_inode, pos_dst, pos_dst + res->write_res.count); - return res->write_res.count; + status = res->write_res.count; +out: + kfree(res->commit_res.verf); + return status; } ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, @@ -240,6 +246,9 @@ ssize_t nfs42_proc_copy(struct file *src, loff_t pos_src, if (err == -ENOTSUPP) { err = -EOPNOTSUPP; break; + } else if (err == -EAGAIN) { + dst_exception.retry = 1; + continue; } err2 = nfs4_handle_exception(server, err, &src_exception); @@ -379,6 +388,7 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) pnfs_mark_layout_stateid_invalid(lo, &head); spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&head); + nfs_commit_inode(inode, 0); } else spin_unlock(&inode->i_lock); break; @@ -400,8 +410,6 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) case -EOPNOTSUPP: NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; } - - dprintk("%s server returns %d\n", __func__, task->tk_status); } static void diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 6c7296454bbc05c3ab1c18dd178314603916e40a..528362f69cc1ba6156102b113fdd5ba18c566c71 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -66,12 +66,14 @@ encode_putfh_maxsz + \ encode_savefh_maxsz + \ encode_putfh_maxsz + \ - encode_copy_maxsz) + encode_copy_maxsz + \ + encode_commit_maxsz) #define NFS4_dec_copy_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_savefh_maxsz + \ decode_putfh_maxsz + \ - decode_copy_maxsz) + decode_copy_maxsz + \ + decode_commit_maxsz) #define NFS4_enc_deallocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ encode_deallocate_maxsz + \ @@ -222,6 +224,18 @@ static void nfs4_xdr_enc_allocate(struct rpc_rqst *req, encode_nops(&hdr); } +static void encode_copy_commit(struct xdr_stream *xdr, + struct nfs42_copy_args *args, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_COMMIT, decode_commit_maxsz, hdr); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->dst_pos); + *p = cpu_to_be32(args->count); +} + /* * Encode COPY request */ @@ -239,6 +253,7 @@ static void nfs4_xdr_enc_copy(struct rpc_rqst *req, encode_savefh(xdr, &hdr); encode_putfh(xdr, args->dst_fh, &hdr); encode_copy(xdr, args, &hdr); +
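The retry logic above hinges on write-verifier semantics: the server returns an opaque verifier with every unstable write, and again with the COMMIT now appended to the COPY compound. If the two differ, the server may have restarted and discarded the copied data before it reached stable storage, so the client must redo the operation. In condensed form (a sketch; nfs_write_verifier_cmp() has memcmp semantics and returns 0 on a match):

	/* nonzero => verifiers differ => data may be lost => retry the copy */
	if (nfs_write_verifier_cmp(&copy_verf, &commit_verf)) {
		status = -EAGAIN;
		goto out;
	}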
encode_copy_commit(xdr, args, &hdr); encode_nops(&hdr); } @@ -481,6 +496,9 @@ static int nfs4_xdr_dec_copy(struct rpc_rqst *rqstp, if (status) goto out; status = decode_copy(xdr, res); + if (status) + goto out; + status = decode_commit(xdr, &res->commit_res); out: return status; } diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 8346ccbf2d52e518b6fa61d0c8cbb3d033ec1f02..692a7a8bfc7afd05ad40d6e04a4a53db07e01753 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -359,11 +359,9 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, struct nfs_client *old; int error; - if (clp->cl_cons_state == NFS_CS_READY) { + if (clp->cl_cons_state == NFS_CS_READY) /* the client is initialised already */ - dprintk("<-- nfs4_init_client() = 0 [already %p]\n", clp); return clp; - } /* Check NFS protocol revision and initialize RPC op vector */ clp->rpc_ops = &nfs_v4_clientops; @@ -421,7 +419,6 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error: nfs_mark_client_ready(clp, error); nfs_put_client(clp); - dprintk("<-- nfs4_init_client() = xerror %d\n", error); return ERR_PTR(error); } @@ -469,6 +466,50 @@ static bool nfs4_same_verifier(nfs4_verifier *v1, nfs4_verifier *v2) return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0; } +static int nfs4_match_client(struct nfs_client *pos, struct nfs_client *new, + struct nfs_client **prev, struct nfs_net *nn) +{ + int status; + + if (pos->rpc_ops != new->rpc_ops) + return 1; + + if (pos->cl_minorversion != new->cl_minorversion) + return 1; + + /* If "pos" isn't marked ready, we can't trust the + * remaining fields in "pos", especially the client + * ID and serverowner fields. Wait for CREATE_SESSION + * to finish. */ + if (pos->cl_cons_state > NFS_CS_READY) { + atomic_inc(&pos->cl_count); + spin_unlock(&nn->nfs_client_lock); + + nfs_put_client(*prev); + *prev = pos; + + status = nfs_wait_client_init_complete(pos); + spin_lock(&nn->nfs_client_lock); + + if (status < 0) + return status; + } + + if (pos->cl_cons_state != NFS_CS_READY) + return 1; + + if (pos->cl_clientid != new->cl_clientid) + return 1; + + /* NFSv4.1 always uses the uniform string, however someone + * might switch the uniquifier string on us. 
+ */ + if (!nfs4_match_client_owner_id(pos, new)) + return 1; + + return 0; +} + /** * nfs40_walk_client_list - Find server that recognizes a client ID * @@ -497,34 +538,10 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - - /* If "pos" isn't marked ready, we can't trust the - * remaining fields in "pos" */ - if (pos->cl_cons_state > NFS_CS_READY) { - atomic_inc(&pos->cl_count); - spin_unlock(&nn->nfs_client_lock); - - nfs_put_client(prev); - prev = pos; - - status = nfs_wait_client_init_complete(pos); - if (status < 0) - goto out; - status = -NFS4ERR_STALE_CLIENTID; - spin_lock(&nn->nfs_client_lock); - } - if (pos->cl_cons_state != NFS_CS_READY) - continue; - - if (pos->cl_clientid != new->cl_clientid) - continue; - - if (!nfs4_match_client_owner_id(pos, new)) + status = nfs4_match_client(pos, new, &prev, nn); + if (status < 0) + goto out_unlock; + if (status != 0) continue; /* * We just sent a new SETCLIENTID, which should have @@ -557,8 +574,6 @@ int nfs40_walk_client_list(struct nfs_client *new, prev = NULL; *result = pos; - dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", - __func__, pos, atomic_read(&pos->cl_count)); goto out; case -ERESTARTSYS: case -ETIMEDOUT: @@ -567,36 +582,22 @@ */ nfs4_schedule_path_down_recovery(pos); default: - goto out; + spin_lock(&nn->nfs_client_lock); + goto out_unlock; } spin_lock(&nn->nfs_client_lock); } +out_unlock: spin_unlock(&nn->nfs_client_lock); /* No match found. The server lost our clientid */ out: nfs_put_client(prev); - dprintk("NFS: <-- %s status = %d\n", __func__, status); return status; } #ifdef CONFIG_NFS_V4_1 -/* - * Returns true if the client IDs match - */ -static bool nfs4_match_clientids(u64 a, u64 b) -{ - if (a != b) { - dprintk("NFS: --> %s client ID %llx does not match %llx\n", - __func__, a, b); - return false; - } - dprintk("NFS: --> %s client ID %llx matches %llx\n", - __func__, a, b); - return true; -} - /* * Returns true if the server major ids match */ @@ -605,36 +606,8 @@ nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1, struct nfs41_server_owner *o2) { if (o1->major_id_sz != o2->major_id_sz) - goto out_major_mismatch; - if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0) - goto out_major_mismatch; - - dprintk("NFS: --> %s server owner major IDs match\n", __func__); - return true; - -out_major_mismatch: - dprintk("NFS: --> %s server owner major IDs do not match\n", - __func__); - return false; -} - -/* - * Returns true if server minor ids match - */ -static bool -nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1, - struct nfs41_server_owner *o2) -{ - /* Check eir_server_owner so_minor_id */ - if (o1->minor_id != o2->minor_id) - goto out_minor_mismatch; - - dprintk("NFS: --> %s server owner minor IDs match\n", __func__); - return true; - -out_minor_mismatch: - dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__); - return false; + return false; + return memcmp(o1->major_id, o2->major_id, o1->major_id_sz) == 0; } /* @@ -645,18 +618,9 @@ nfs4_check_server_scope(struct nfs41_server_scope *s1, struct nfs41_server_scope *s2) { if (s1->server_scope_sz != s2->server_scope_sz) - goto out_scope_mismatch; - if (memcmp(s1->server_scope, s2->server_scope, - s1->server_scope_sz) != 0) - goto out_scope_mismatch; - - dprintk("NFS: --> %s 
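nfs4_match_client() is called with the per-net client lock held and preserves that invariant across its sleep: it pins the candidate with a reference, drops the lock, waits for the candidate's initialisation to finish, then re-takes the lock before returning. The skeleton of that pattern, simplified from the helper above (a sketch, not the full function):

	atomic_inc(&pos->cl_count);		/* pin the client while unlocked */
	spin_unlock(&nn->nfs_client_lock);

	status = nfs_wait_client_init_complete(pos);

	spin_lock(&nn->nfs_client_lock);	/* the caller still expects the lock */
	if (status < 0)
		return status;			/* error, but the lock is held again */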
server scopes match\n", __func__); - return true; - -out_scope_mismatch: - dprintk("NFS: --> %s server scopes do not match\n", - __func__); - return false; + return false; + return memcmp(s1->server_scope, s2->server_scope, + s1->server_scope_sz) == 0; } /** @@ -680,7 +644,7 @@ int nfs4_detect_session_trunking(struct nfs_client *clp, struct rpc_xprt *xprt) { /* Check eir_clientid */ - if (!nfs4_match_clientids(clp->cl_clientid, res->clientid)) + if (clp->cl_clientid != res->clientid) goto out_err; /* Check eir_server_owner so_major_id */ @@ -689,8 +653,7 @@ int nfs4_detect_session_trunking(struct nfs_client *clp, goto out_err; /* Check eir_server_owner so_minor_id */ - if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner, - res->server_owner)) + if (clp->cl_serverowner->minor_id != res->server_owner->minor_id) goto out_err; /* Check eir_server_scope */ @@ -739,33 +702,10 @@ int nfs41_walk_client_list(struct nfs_client *new, if (pos == new) goto found; - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - - /* If "pos" isn't marked ready, we can't trust the - * remaining fields in "pos", especially the client - * ID and serverowner fields. Wait for CREATE_SESSION - * to finish. */ - if (pos->cl_cons_state > NFS_CS_READY) { - atomic_inc(&pos->cl_count); - spin_unlock(&nn->nfs_client_lock); - - nfs_put_client(prev); - prev = pos; - - status = nfs_wait_client_init_complete(pos); - spin_lock(&nn->nfs_client_lock); - if (status < 0) - break; - status = -NFS4ERR_STALE_CLIENTID; - } - if (pos->cl_cons_state != NFS_CS_READY) - continue; - - if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid)) + status = nfs4_match_client(pos, new, &prev, nn); + if (status < 0) + goto out; + if (status != 0) continue; /* @@ -777,23 +717,15 @@ int nfs41_walk_client_list(struct nfs_client *new, new->cl_serverowner)) continue; - /* Unlike NFSv4.0, we know that NFSv4.1 always uses the - * uniform string, however someone might switch the - * uniquifier string on us. 
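With the dprintk-laden helpers folded away, the session-trunking test reduces to a conjunction of identity checks against the EXCHANGE_ID result. Roughly (a sketch only; the result type and field layout are assumed from the surrounding code):

	static bool same_server(const struct nfs_client *clp,
				const struct nfs41_exchange_id_res *res)
	{
		return clp->cl_clientid == res->clientid &&
		       nfs4_check_serverowner_major_id(clp->cl_serverowner,
						       res->server_owner) &&
		       clp->cl_serverowner->minor_id == res->server_owner->minor_id;
		/* the server-scope comparison above completes the real test */
	}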
- */ - if (!nfs4_match_client_owner_id(pos, new)) - continue; found: atomic_inc(&pos->cl_count); *result = pos; status = 0; - dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", - __func__, pos, atomic_read(&pos->cl_count)); break; } +out: spin_unlock(&nn->nfs_client_lock); - dprintk("NFS: <-- %s status = %d\n", __func__, status); nfs_put_client(prev); return status; } @@ -916,9 +848,6 @@ static int nfs4_set_client(struct nfs_server *server, .timeparms = timeparms, }; struct nfs_client *clp; - int error; - - dprintk("--> nfs4_set_client()\n"); if (server->flags & NFS_MOUNT_NORESVPORT) set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); @@ -927,15 +856,11 @@ static int nfs4_set_client(struct nfs_server *server, /* Allocate or find a client reference we can use */ clp = nfs_get_client(&cl_init); - if (IS_ERR(clp)) { - error = PTR_ERR(clp); - goto error; - } + if (IS_ERR(clp)) + return PTR_ERR(clp); - if (server->nfs_client == clp) { - error = -ELOOP; - goto error; - } + if (server->nfs_client == clp) + return -ELOOP; /* * Query for the lease time on clientid setup or renewal @@ -947,11 +872,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_CHECK_LEASE_TIME, &clp->cl_res_state); server->nfs_client = clp; - dprintk("<-- nfs4_set_client() = 0 [new %p]\n", clp); return 0; -error: - dprintk("<-- nfs4_set_client() = xerror %d\n", error); - return error; } /* @@ -982,7 +903,6 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, .net = mds_clp->cl_net, .timeparms = &ds_timeout, }; - struct nfs_client *clp; char buf[INET6_ADDRSTRLEN + 1]; if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0) @@ -998,10 +918,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, * (section 13.1 RFC 5661). */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init); - - dprintk("<-- %s %p\n", __func__, clp); - return clp; + return nfs_get_client(&cl_init); } EXPORT_SYMBOL_GPL(nfs4_set_ds_client); @@ -1098,8 +1015,6 @@ static int nfs4_init_server(struct nfs_server *server, struct rpc_timeout timeparms; int error; - dprintk("--> nfs4_init_server()\n"); - nfs_init_timeout_values(&timeparms, data->nfs_server.protocol, data->timeo, data->retrans); @@ -1127,7 +1042,7 @@ static int nfs4_init_server(struct nfs_server *server, data->minorversion, data->net); if (error < 0) - goto error; + return error; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -1138,16 +1053,10 @@ static int nfs4_init_server(struct nfs_server *server, server->acregmax = data->acregmax * HZ; server->acdirmin = data->acdirmin * HZ; server->acdirmax = data->acdirmax * HZ; + server->port = data->nfs_server.port; - server->port = data->nfs_server.port; - - error = nfs_init_server_rpcclient(server, &timeparms, - data->selected_flavor); - -error: - /* Done */ - dprintk("<-- nfs4_init_server() = %d\n", error); - return error; + return nfs_init_server_rpcclient(server, &timeparms, + data->selected_flavor); } /* @@ -1163,8 +1072,6 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, bool auth_probe; int error; - dprintk("--> nfs4_create_server()\n"); - server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); @@ -1180,12 +1087,10 @@ struct nfs_server *nfs4_create_server(struct nfs_mount_info *mount_info, if (error < 0) goto error; - dprintk("<-- nfs4_create_server() = %p\n", server); return server; error: nfs_free_server(server); - dprintk("<-- nfs4_create_server() = error %d\n", error); return ERR_PTR(error); } @@ 
-1200,8 +1105,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, bool auth_probe; int error; - dprintk("--> nfs4_create_referral_server()\n"); - server = nfs_alloc_server(); if (!server) return ERR_PTR(-ENOMEM); @@ -1235,12 +1138,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, if (error < 0) goto error; - dprintk("<-- nfs_create_referral_server() = %p\n", server); return server; error: nfs_free_server(server); - dprintk("<-- nfs4_create_referral_server() = error %d\n", error); return ERR_PTR(error); } @@ -1300,31 +1201,16 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, struct sockaddr *localaddr = (struct sockaddr *)&address; int error; - dprintk("--> %s: move FSID %llx:%llx to \"%s\")\n", __func__, - (unsigned long long)server->fsid.major, - (unsigned long long)server->fsid.minor, - hostname); - error = rpc_switch_client_transport(clnt, &xargs, clnt->cl_timeout); - if (error != 0) { - dprintk("<-- %s(): rpc_switch_client_transport returned %d\n", - __func__, error); - goto out; - } + if (error != 0) + return error; error = rpc_localaddr(clnt, localaddr, sizeof(address)); - if (error != 0) { - dprintk("<-- %s(): rpc_localaddr returned %d\n", - __func__, error); - goto out; - } + if (error != 0) + return error; - error = -EAFNOSUPPORT; - if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) { - dprintk("<-- %s(): rpc_ntop returned %d\n", - __func__, error); - goto out; - } + if (rpc_ntop(localaddr, buf, sizeof(buf)) == 0) + return -EAFNOSUPPORT; nfs_server_remove_lists(server); error = nfs4_set_client(server, hostname, sap, salen, buf, @@ -1333,21 +1219,12 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, nfs_put_client(clp); if (error != 0) { nfs_server_insert_lists(server); - dprintk("<-- %s(): nfs4_set_client returned %d\n", - __func__, error); - goto out; + return error; } if (server->nfs_client->cl_hostname == NULL) server->nfs_client->cl_hostname = kstrdup(hostname, GFP_KERNEL); nfs_server_insert_lists(server); - error = nfs_probe_destination(server); - if (error < 0) - goto out; - - dprintk("<-- %s() succeeded\n", __func__); - -out: - return error; + return nfs_probe_destination(server); } diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c index 039b3eb6d83404f33224961465406d52203ff570..ac84060189626ccb07f24b366594d36372717cbb 100644 --- a/fs/nfs/nfs4getroot.c +++ b/fs/nfs/nfs4getroot.c @@ -14,8 +14,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p struct nfs_fsinfo fsinfo; int ret = -ENOMEM; - dprintk("--> nfs4_get_rootfh()\n"); - fsinfo.fattr = nfs_alloc_fattr(); if (fsinfo.fattr == NULL) goto out; @@ -38,6 +36,5 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); out: nfs_free_fattr(fsinfo.fattr); - dprintk("<-- nfs4_get_rootfh() = %d\n", ret); return ret; } diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index d8b040bd9814d3199e9189519f7bdcb09c04801c..7d531da1bae37e1cd957c3181423930e1033ab8b 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -340,7 +340,6 @@ static struct vfsmount *nfs_follow_referral(struct dentry *dentry, out: free_page((unsigned long) page); free_page((unsigned long) page2); - dprintk("%s: done\n", __func__); return mnt; } @@ -358,11 +357,9 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry * int err; /* BUG_ON(IS_ROOT(dentry)); */ - dprintk("%s: 
enter\n", __func__); - page = alloc_page(GFP_KERNEL); if (page == NULL) - goto out; + return mnt; fs_locations = kmalloc(sizeof(struct nfs4_fs_locations), GFP_KERNEL); if (fs_locations == NULL) @@ -386,8 +383,6 @@ static struct vfsmount *nfs_do_refmount(struct rpc_clnt *client, struct dentry * out_free: __free_page(page); kfree(fs_locations); -out: - dprintk("%s: done\n", __func__); return mnt; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 201ca3f2c4bac14986220fcdf6a6c37b734ffa96..c08c46a3b8cde00ef5aa40fae87ed2fce06faea1 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -698,7 +698,8 @@ static int nfs41_sequence_process(struct rpc_task *task, session = slot->table->session; if (slot->interrupted) { - slot->interrupted = 0; + if (res->sr_status != -NFS4ERR_DELAY) + slot->interrupted = 0; interrupted = true; } @@ -2300,8 +2301,10 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) if (status != 0) return status; } - if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) + if (!(o_res->f_attr->valid & NFS_ATTR_FATTR)) { + nfs4_sequence_free_slot(&o_res->seq_res); nfs4_proc_getattr(server, &o_res->fh, o_res->f_attr, o_res->f_label); + } return 0; } @@ -3265,6 +3268,7 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f .rpc_resp = &res, }; int status; + int i; bitmask[0] = FATTR4_WORD0_SUPPORTED_ATTRS | FATTR4_WORD0_FH_EXPIRE_TYPE | @@ -3330,8 +3334,13 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; server->cache_consistency_bitmask[2] = 0; + + /* Avoid a regression due to buggy server */ + for (i = 0; i < ARRAY_SIZE(res.exclcreat_bitmask); i++) + res.exclcreat_bitmask[i] &= res.attr_bitmask[i]; memcpy(server->exclcreat_bitmask, res.exclcreat_bitmask, sizeof(server->exclcreat_bitmask)); + server->acl_bitmask = res.acl_bitmask; server->fh_expire_type = res.fh_expire_type; } @@ -4610,7 +4619,7 @@ static int nfs4_proc_pgio_rpc_prepare(struct rpc_task *task, return 0; if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, - hdr->rw_ops->rw_mode) == -EIO) + hdr->rw_mode) == -EIO) return -EIO; if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) return -EIO; @@ -4804,8 +4813,10 @@ static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, if (!atomic_inc_not_zero(&clp->cl_count)) return -EIO; data = kmalloc(sizeof(*data), GFP_NOFS); - if (data == NULL) + if (data == NULL) { + nfs_put_client(clp); return -ENOMEM; + } data->client = clp; data->timestamp = jiffies; return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT, @@ -5782,6 +5793,7 @@ struct nfs4_unlockdata { struct nfs_locku_res res; struct nfs4_lock_state *lsp; struct nfs_open_context *ctx; + struct nfs_lock_context *l_ctx; struct file_lock fl; struct nfs_server *server; unsigned long timestamp; @@ -5806,6 +5818,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, atomic_inc(&lsp->ls_count); /* Ensure we don't close file until we're done freeing locks! 
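The renew fix above illustrates a rule this series applies consistently: once cl_count has been pinned with atomic_inc_not_zero(), every failure exit must drop that pin before returning. In outline (a sketch of the corrected shape; the same pattern recurs in _nfs4_proc_exchange_id() below):

	if (!atomic_inc_not_zero(&clp->cl_count))
		return -EIO;			/* client already being torn down */
	data = kmalloc(sizeof(*data), GFP_NOFS);
	if (data == NULL) {
		nfs_put_client(clp);		/* balance the pin taken above */
		return -ENOMEM;
	}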
*/ p->ctx = get_nfs_open_context(ctx); + p->l_ctx = nfs_get_lock_context(ctx); memcpy(&p->fl, fl, sizeof(p->fl)); p->server = NFS_SERVER(inode); return p; @@ -5816,6 +5829,7 @@ static void nfs4_locku_release_calldata(void *data) struct nfs4_unlockdata *calldata = data; nfs_free_seqid(calldata->arg.seqid); nfs4_put_lock_state(calldata->lsp); + nfs_put_lock_context(calldata->l_ctx); put_nfs_open_context(calldata->ctx); kfree(calldata); } @@ -5857,6 +5871,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) { struct nfs4_unlockdata *calldata = data; + if (test_bit(NFS_CONTEXT_UNLOCK, &calldata->l_ctx->open_context->flags) && + nfs_async_iocounter_wait(task, calldata->l_ctx)) + return; + if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) goto out_wait; nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid); @@ -5908,6 +5926,8 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, * canceled lock is passed in, and it won't be an unlock. */ fl->fl_type = F_UNLCK; + if (fl->fl_flags & FL_CLOSE) + set_bit(NFS_CONTEXT_UNLOCK, &ctx->flags); data = nfs4_alloc_unlockdata(fl, ctx, lsp, seqid); if (data == NULL) { @@ -6445,9 +6465,6 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) ctx = nfs_file_open_context(filp); state = ctx->state; - if (request->fl_start < 0 || request->fl_end < 0) - return -EINVAL; - if (IS_GETLK(cmd)) { if (state != NULL) return nfs4_proc_getlk(state, F_GETLK, request); @@ -6470,20 +6487,6 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request) !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) return -ENOLCK; - /* - * Don't rely on the VFS having checked the file open mode, - * since it won't do this for flock() locks. - */ - switch (request->fl_type) { - case F_RDLCK: - if (!(filp->f_mode & FMODE_READ)) - return -EBADF; - break; - case F_WRLCK: - if (!(filp->f_mode & FMODE_WRITE)) - return -EBADF; - } - status = nfs4_set_lock_state(state, request); if (status != 0) return status; @@ -7155,8 +7158,6 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt, }; struct rpc_task *task; - dprintk("--> %s\n", __func__); - nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id); if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) args.dir = NFS4_CDFC4_FORE; @@ -7176,24 +7177,20 @@ int nfs4_proc_bind_one_conn_to_session(struct rpc_clnt *clnt, if (memcmp(res.sessionid.data, clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("NFS: %s: Session ID mismatch\n", __func__); - status = -EIO; - goto out; + return -EIO; } if ((res.dir & args.dir) != res.dir || res.dir == 0) { dprintk("NFS: %s: Unexpected direction from server\n", __func__); - status = -EIO; - goto out; + return -EIO; } if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) { dprintk("NFS: %s: Server returned RDMA mode = true\n", __func__); - status = -EIO; - goto out; + return -EIO; } } -out: - dprintk("<-- %s status= %d\n", __func__, status); + return status; } @@ -7459,15 +7456,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, }; struct nfs41_exchange_id_data *calldata; struct rpc_task *task; - int status = -EIO; + int status; if (!atomic_inc_not_zero(&clp->cl_count)) - goto out; + return -EIO; - status = -ENOMEM; calldata = kzalloc(sizeof(*calldata), GFP_NOFS); - if (!calldata) - goto out; + if (!calldata) { + nfs_put_client(clp); + return -ENOMEM; + } if (!xprt) nfs4_init_boot_verifier(clp, &verifier); @@ -7476,10 +7474,6 @@ static int _nfs4_proc_exchange_id(struct 
nfs_client *clp, struct rpc_cred *cred, if (status) goto out_calldata; - dprintk("NFS call exchange_id auth=%s, '%s'\n", - clp->cl_rpcclient->cl_auth->au_ops->au_name, - clp->cl_owner_id); - calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), GFP_NOFS); status = -ENOMEM; @@ -7545,13 +7539,6 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, rpc_put_task(task); out: - if (clp->cl_implid != NULL) - dprintk("NFS reply exchange_id: Server Implementation ID: " - "domain: %s, name: %s, date: %llu,%u\n", - clp->cl_implid->domain, clp->cl_implid->name, - clp->cl_implid->date.seconds, - clp->cl_implid->date.nseconds); - dprintk("NFS reply exchange_id: %d\n", status); return status; out_impl_id: @@ -7769,17 +7756,13 @@ int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo) nfs4_init_sequence(&args.la_seq_args, &res.lr_seq_res, 0); nfs4_set_sequence_privileged(&args.la_seq_args); - dprintk("--> %s\n", __func__); task = rpc_run_task(&task_setup); if (IS_ERR(task)) - status = PTR_ERR(task); - else { - status = task->tk_status; - rpc_put_task(task); - } - dprintk("<-- %s return %d\n", __func__, status); + return PTR_ERR(task); + status = task->tk_status; + rpc_put_task(task); return status; } @@ -8180,6 +8163,12 @@ static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nf /* fall through */ case -NFS4ERR_RETRY_UNCACHED_REP: return -EAGAIN; + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: + nfs4_schedule_session_recovery(clp->cl_session, + task->tk_status); + break; default: nfs4_schedule_lease_recovery(clp); } @@ -8258,7 +8247,6 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp, if (status == 0) status = task->tk_status; rpc_put_task(task); - return 0; out: dprintk("<-- %s status=%d\n", __func__, status); return status; @@ -8357,6 +8345,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task, */ pnfs_mark_layout_stateid_invalid(lo, &head); spin_unlock(&inode->i_lock); + nfs_commit_inode(inode, 0); pnfs_free_lseg_list(&head); status = -EAGAIN; goto out; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 8156bad6b441095199878a900e35594863cad571..b34de036501bc90e48be043aec38485f7f755e55 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1649,13 +1649,14 @@ static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp) nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot); } -static void nfs4_reclaim_complete(struct nfs_client *clp, +static int nfs4_reclaim_complete(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops, struct rpc_cred *cred) { /* Notify the server we're done reclaiming our state */ if (ops->reclaim_complete) - (void)ops->reclaim_complete(clp, cred); + return ops->reclaim_complete(clp, cred); + return 0; } static void nfs4_clear_reclaim_server(struct nfs_server *server) @@ -1702,13 +1703,16 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) { const struct nfs4_state_recovery_ops *ops; struct rpc_cred *cred; + int err; if (!nfs4_state_clear_reclaim_reboot(clp)) return; ops = clp->cl_mvops->reboot_recovery_ops; cred = nfs4_get_clid_cred(clp); - nfs4_reclaim_complete(clp, ops, cred); + err = nfs4_reclaim_complete(clp, ops, cred); put_rpccred(cred); + if (err == -NFS4ERR_CONN_NOT_BOUND_TO_SESSION) + set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state); } static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp) diff --git a/fs/nfs/nfs4xdr.c 
b/fs/nfs/nfs4xdr.c index 80ce289eea05326336a7edecbe8a132ee4900d23..3aebfdc82b30320625b0011b4f968207e436fb48 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1000,8 +1000,9 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, + const umode_t *umask, const struct nfs_server *server, - bool excl_check, const umode_t *umask) + const uint32_t attrmask[]) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; @@ -1016,22 +1017,20 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, /* * We reserve enough space to write the entire attribute buffer at once. */ - if (iap->ia_valid & ATTR_SIZE) { + if ((iap->ia_valid & ATTR_SIZE) && (attrmask[0] & FATTR4_WORD0_SIZE)) { bmval[0] |= FATTR4_WORD0_SIZE; len += 8; } - if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) - umask = NULL; if (iap->ia_valid & ATTR_MODE) { - if (umask) { + if (umask && (attrmask[2] & FATTR4_WORD2_MODE_UMASK)) { bmval[2] |= FATTR4_WORD2_MODE_UMASK; len += 8; - } else { + } else if (attrmask[1] & FATTR4_WORD1_MODE) { bmval[1] |= FATTR4_WORD1_MODE; len += 4; } } - if (iap->ia_valid & ATTR_UID) { + if ((iap->ia_valid & ATTR_UID) && (attrmask[1] & FATTR4_WORD1_OWNER)) { owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); if (owner_namelen < 0) { dprintk("nfs: couldn't resolve uid %d to string\n", @@ -1044,7 +1043,8 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[1] |= FATTR4_WORD1_OWNER; len += 4 + (XDR_QUADLEN(owner_namelen) << 2); } - if (iap->ia_valid & ATTR_GID) { + if ((iap->ia_valid & ATTR_GID) && + (attrmask[1] & FATTR4_WORD1_OWNER_GROUP)) { owner_grouplen = nfs_map_gid_to_group(server, iap->ia_gid, owner_group, IDMAP_NAMESZ); if (owner_grouplen < 0) { dprintk("nfs: couldn't resolve gid %d to string\n", @@ -1056,32 +1056,26 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, bmval[1] |= FATTR4_WORD1_OWNER_GROUP; len += 4 + (XDR_QUADLEN(owner_grouplen) << 2); } - if (iap->ia_valid & ATTR_ATIME_SET) { - bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; - len += 16; - } else if (iap->ia_valid & ATTR_ATIME) { - bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; - len += 4; - } - if (iap->ia_valid & ATTR_MTIME_SET) { - bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; - len += 16; - } else if (iap->ia_valid & ATTR_MTIME) { - bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; - len += 4; + if (attrmask[1] & FATTR4_WORD1_TIME_ACCESS_SET) { + if (iap->ia_valid & ATTR_ATIME_SET) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; + len += 16; + } else if (iap->ia_valid & ATTR_ATIME) { + bmval[1] |= FATTR4_WORD1_TIME_ACCESS_SET; + len += 4; + } } - - if (excl_check) { - const u32 *excl_bmval = server->exclcreat_bitmask; - bmval[0] &= excl_bmval[0]; - bmval[1] &= excl_bmval[1]; - bmval[2] &= excl_bmval[2]; - - if (!(excl_bmval[2] & FATTR4_WORD2_SECURITY_LABEL)) - label = NULL; + if (attrmask[1] & FATTR4_WORD1_TIME_MODIFY_SET) { + if (iap->ia_valid & ATTR_MTIME_SET) { + bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; + len += 16; + } else if (iap->ia_valid & ATTR_MTIME) { + bmval[1] |= FATTR4_WORD1_TIME_MODIFY_SET; + len += 4; + } } - if (label) { + if (label && (attrmask[2] & FATTR4_WORD2_SECURITY_LABEL)) { len += 4 + 4 + 4 + (XDR_QUADLEN(label->len) << 2); bmval[2] |= FATTR4_WORD2_SECURITY_LABEL; } @@ -1188,8 +1182,8 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } 
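encode_attrs() now takes the permitted-attribute bitmask as an explicit parameter instead of a boolean, so an attribute is emitted only when the caller asked for it and the chosen mask allows it. The payoff is at the call sites: ordinary creates pass the server's general attr_bitmask, while exclusive creates pass exclcreat_bitmask, silently dropping attributes the server cannot accept alongside the verifier. Schematically (condensed from the encode_createmode() call sites below):

	/* UNCHECKED/GUARDED create: anything the server supports */
	encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
		     arg->server, arg->server->attr_bitmask);

	/* EXCLUSIVE4_1 create: only what may accompany the verifier */
	encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask,
		     arg->server, arg->server->exclcreat_bitmask);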
encode_string(xdr, create->name->len, create->name->name); - encode_attrs(xdr, create->attrs, create->label, create->server, false, - &create->umask); + encode_attrs(xdr, create->attrs, create->label, &create->umask, + create->server, create->server->attr_bitmask); } static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) @@ -1409,13 +1403,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op switch(arg->createmode) { case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, - &arg->umask); + encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask, + arg->server, arg->server->attr_bitmask); break; case NFS4_CREATE_GUARDED: *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, - &arg->umask); + encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask, + arg->server, arg->server->attr_bitmask); break; case NFS4_CREATE_EXCLUSIVE: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); @@ -1424,8 +1418,8 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op case NFS4_CREATE_EXCLUSIVE4_1: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); encode_nfs4_verifier(xdr, &arg->u.verifier); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true, - &arg->umask); + encode_attrs(xdr, arg->u.attrs, arg->label, &arg->umask, + arg->server, arg->server->exclcreat_bitmask); } } @@ -1681,7 +1675,8 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); encode_nfs4_stateid(xdr, &arg->stateid); - encode_attrs(xdr, arg->iap, arg->label, server, false, NULL); + encode_attrs(xdr, arg->iap, arg->label, NULL, server, + server->attr_bitmask); } static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) @@ -2005,16 +2000,10 @@ encode_layoutcommit(struct xdr_stream *xdr, *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ - if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) { - NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( - NFS_I(inode)->layout, xdr, args); - } else { - encode_uint32(xdr, args->layoutupdate_len); - if (args->layoutupdate_pages) { - xdr_write_pages(xdr, args->layoutupdate_pages, 0, - args->layoutupdate_len); - } - } + encode_uint32(xdr, args->layoutupdate_len); + if (args->layoutupdate_pages) + xdr_write_pages(xdr, args->layoutupdate_pages, 0, + args->layoutupdate_len); return 0; } @@ -2024,7 +2013,6 @@ encode_layoutreturn(struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args, struct compound_hdr *hdr) { - const struct pnfs_layoutdriver_type *lr_ops = NFS_SERVER(args->inode)->pnfs_curr_ld; __be32 *p; encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr); @@ -2041,8 +2029,6 @@ encode_layoutreturn(struct xdr_stream *xdr, spin_unlock(&args->inode->i_lock); if (args->ld_private->ops && args->ld_private->ops->encode) args->ld_private->ops->encode(xdr, args, args->ld_private); - else if (lr_ops->encode_layoutreturn) - lr_ops->encode_layoutreturn(xdr, args); else encode_uint32(xdr, 0); } @@ -5579,6 +5565,8 @@ static int decode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map) unsigned int i; p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; bitmap_words = be32_to_cpup(p++); if (bitmap_words > 
NFS4_OP_MAP_NUM_WORDS) return -EIO; diff --git a/fs/nfs/objlayout/Kbuild b/fs/nfs/objlayout/Kbuild deleted file mode 100644 index ed30ea072bb85561045a864846b2e5d647adb489..0000000000000000000000000000000000000000 --- a/fs/nfs/objlayout/Kbuild +++ /dev/null @@ -1,5 +0,0 @@ -# -# Makefile for the pNFS Objects Layout Driver kernel module -# -objlayoutdriver-y := objio_osd.o pnfs_osd_xdr_cli.o objlayout.o -obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayoutdriver.o diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c deleted file mode 100644 index 049c1b1f2932be74266a47d97733426d89ec8154..0000000000000000000000000000000000000000 --- a/fs/nfs/objlayout/objio_osd.c +++ /dev/null @@ -1,675 +0,0 @@ -/* - * pNFS Objects layout implementation over open-osd initiator library - * - * Copyright (C) 2009 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy - * Boaz Harrosh - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
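Two distinct things happen around this point: decode_op_map() gains a NULL check on the xdr_inline_decode() result, so a truncated reply now yields -EIO instead of a NULL dereference, and the entire fs/nfs/objlayout/ pNFS objects (OSD) layout driver starts being deleted. The decode fix follows the standard defensive-XDR pattern, modelled below in plain C — the stream type is a stand-in, and NFS4_OP_MAP_NUM_WORDS is given an illustrative value (the kernel derives it from the operation count):

/*
 * Userspace model of the decode_op_map() hardening.
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <arpa/inet.h>  /* ntohl() as a stand-in for be32_to_cpup() */

#define EIO 5
#define NFS4_OP_MAP_NUM_WORDS 4 /* illustrative */

struct xdr_model {
    const uint32_t *p;
    size_t words_left;
};

/* Returns NULL on a short buffer, like xdr_inline_decode(). */
static const uint32_t *inline_decode(struct xdr_model *xdr, size_t nwords)
{
    const uint32_t *p;

    if (xdr->words_left < nwords)
        return NULL;
    p = xdr->p;
    xdr->p += nwords;
    xdr->words_left -= nwords;
    return p;
}

static int decode_op_map(struct xdr_model *xdr)
{
    const uint32_t *p = inline_decode(xdr, 1);
    uint32_t bitmap_words;

    if (!p)                 /* the added check: short reply => -EIO */
        return -EIO;
    bitmap_words = ntohl(*p);
    if (bitmap_words > NFS4_OP_MAP_NUM_WORDS)
        return -EIO;
    return 0;
}

int main(void)
{
    struct xdr_model empty = { .p = NULL, .words_left = 0 };

    printf("decode on empty buffer: %d\n", decode_op_map(&empty));
    return 0;
}

Everything from here to the pagelist.c hunk is straight deletion — Kbuild, objio_osd.c, objlayout.c, objlayout.h and pnfs_osd_xdr_cli.c all go away — and the objlayout driver is what set the encode_layoutcommit/encode_layoutreturn hooks (visible in its pnfs_layoutdriver_type below), which is why those hooks could be dropped from nfs4xdr.c above.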
- */ - -#include -#include - -#include "objlayout.h" -#include "../internal.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -struct objio_dev_ent { - struct nfs4_deviceid_node id_node; - struct ore_dev od; -}; - -static void -objio_free_deviceid_node(struct nfs4_deviceid_node *d) -{ - struct objio_dev_ent *de = container_of(d, struct objio_dev_ent, id_node); - - dprintk("%s: free od=%p\n", __func__, de->od.od); - osduld_put_device(de->od.od); - kfree_rcu(d, rcu); -} - -struct objio_segment { - struct pnfs_layout_segment lseg; - - struct ore_layout layout; - struct ore_components oc; -}; - -static inline struct objio_segment * -OBJIO_LSEG(struct pnfs_layout_segment *lseg) -{ - return container_of(lseg, struct objio_segment, lseg); -} - -struct objio_state { - /* Generic layer */ - struct objlayout_io_res oir; - - bool sync; - /*FIXME: Support for extra_bytes at ore_get_rw_state() */ - struct ore_io_state *ios; -}; - -/* Send and wait for a get_device_info of devices in the layout, - then look them up with the osd_initiator library */ -struct nfs4_deviceid_node * -objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, - gfp_t gfp_flags) -{ - struct pnfs_osd_deviceaddr *deviceaddr; - struct objio_dev_ent *ode = NULL; - struct osd_dev *od; - struct osd_dev_info odi; - bool retry_flag = true; - __be32 *p; - int err; - - deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags); - if (!deviceaddr) - return NULL; - - p = page_address(pdev->pages[0]); - pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p); - - odi.systemid_len = deviceaddr->oda_systemid.len; - if (odi.systemid_len > sizeof(odi.systemid)) { - dprintk("%s: odi.systemid_len > sizeof(systemid=%zd)\n", - __func__, sizeof(odi.systemid)); - err = -EINVAL; - goto out; - } else if (odi.systemid_len) - memcpy(odi.systemid, deviceaddr->oda_systemid.data, - odi.systemid_len); - odi.osdname_len = deviceaddr->oda_osdname.len; - odi.osdname = (u8 *)deviceaddr->oda_osdname.data; - - if (!odi.osdname_len && !odi.systemid_len) { - dprintk("%s: !odi.osdname_len && !odi.systemid_len\n", - __func__); - err = -ENODEV; - goto out; - } - -retry_lookup: - od = osduld_info_lookup(&odi); - if (IS_ERR(od)) { - err = PTR_ERR(od); - dprintk("%s: osduld_info_lookup => %d\n", __func__, err); - if (err == -ENODEV && retry_flag) { - err = objlayout_autologin(deviceaddr); - if (likely(!err)) { - retry_flag = false; - goto retry_lookup; - } - } - goto out; - } - - dprintk("Adding new dev_id(%llx:%llx)\n", - _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id)); - - ode = kzalloc(sizeof(*ode), gfp_flags); - if (!ode) { - dprintk("%s: -ENOMEM od=%p\n", __func__, od); - goto out; - } - - nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id); - kfree(deviceaddr); - - ode->od.od = od; - return &ode->id_node; - -out: - kfree(deviceaddr); - return NULL; -} - -static void copy_single_comp(struct ore_components *oc, unsigned c, - struct pnfs_osd_object_cred *src_comp) -{ - struct ore_comp *ocomp = &oc->comps[c]; - - WARN_ON(src_comp->oc_cap_key.cred_len > 0); /* libosd is NO_SEC only */ - WARN_ON(src_comp->oc_cap.cred_len > sizeof(ocomp->cred)); - - ocomp->obj.partition = src_comp->oc_object_id.oid_partition_id; - ocomp->obj.id = src_comp->oc_object_id.oid_object_id; - - memcpy(ocomp->cred, src_comp->oc_cap.cred, sizeof(ocomp->cred)); -} - -static int __alloc_objio_seg(unsigned numdevs, gfp_t gfp_flags, - struct objio_segment **pseg) -{ -/* This is the in memory structure of the objio_segment - * - * struct __alloc_objio_segment { - * struct objio_segment 
olseg; - * struct ore_dev *ods[numdevs]; - * struct ore_comp comps[numdevs]; - * } *aolseg; - * NOTE: The code as above compiles and runs perfectly. It is elegant, - * type safe and compact. At some Past time Linus has decided he does not - * like variable length arrays, For the sake of this principal we uglify - * the code as below. - */ - struct objio_segment *lseg; - size_t lseg_size = sizeof(*lseg) + - numdevs * sizeof(lseg->oc.ods[0]) + - numdevs * sizeof(*lseg->oc.comps); - - lseg = kzalloc(lseg_size, gfp_flags); - if (unlikely(!lseg)) { - dprintk("%s: Failed allocation numdevs=%d size=%zd\n", __func__, - numdevs, lseg_size); - return -ENOMEM; - } - - lseg->oc.numdevs = numdevs; - lseg->oc.single_comp = EC_MULTPLE_COMPS; - lseg->oc.ods = (void *)(lseg + 1); - lseg->oc.comps = (void *)(lseg->oc.ods + numdevs); - - *pseg = lseg; - return 0; -} - -int objio_alloc_lseg(struct pnfs_layout_segment **outp, - struct pnfs_layout_hdr *pnfslay, - struct pnfs_layout_range *range, - struct xdr_stream *xdr, - gfp_t gfp_flags) -{ - struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode); - struct objio_segment *objio_seg; - struct pnfs_osd_xdr_decode_layout_iter iter; - struct pnfs_osd_layout layout; - struct pnfs_osd_object_cred src_comp; - unsigned cur_comp; - int err; - - err = pnfs_osd_xdr_decode_layout_map(&layout, &iter, xdr); - if (unlikely(err)) - return err; - - err = __alloc_objio_seg(layout.olo_num_comps, gfp_flags, &objio_seg); - if (unlikely(err)) - return err; - - objio_seg->layout.stripe_unit = layout.olo_map.odm_stripe_unit; - objio_seg->layout.group_width = layout.olo_map.odm_group_width; - objio_seg->layout.group_depth = layout.olo_map.odm_group_depth; - objio_seg->layout.mirrors_p1 = layout.olo_map.odm_mirror_cnt + 1; - objio_seg->layout.raid_algorithm = layout.olo_map.odm_raid_algorithm; - - err = ore_verify_layout(layout.olo_map.odm_num_comps, - &objio_seg->layout); - if (unlikely(err)) - goto err; - - objio_seg->oc.first_dev = layout.olo_comps_index; - cur_comp = 0; - while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { - struct nfs4_deviceid_node *d; - struct objio_dev_ent *ode; - - copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); - - d = nfs4_find_get_deviceid(server, - &src_comp.oc_object_id.oid_device_id, - pnfslay->plh_lc_cred, gfp_flags); - if (!d) { - err = -ENXIO; - goto err; - } - - ode = container_of(d, struct objio_dev_ent, id_node); - objio_seg->oc.ods[cur_comp++] = &ode->od; - } - /* pnfs_osd_xdr_decode_layout_comp returns false on error */ - if (unlikely(err)) - goto err; - - *outp = &objio_seg->lseg; - return 0; - -err: - kfree(objio_seg); - dprintk("%s: Error: return %d\n", __func__, err); - *outp = NULL; - return err; -} - -void objio_free_lseg(struct pnfs_layout_segment *lseg) -{ - int i; - struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - - for (i = 0; i < objio_seg->oc.numdevs; i++) { - struct ore_dev *od = objio_seg->oc.ods[i]; - struct objio_dev_ent *ode; - - if (!od) - break; - ode = container_of(od, typeof(*ode), od); - nfs4_put_deviceid_node(&ode->id_node); - } - kfree(objio_seg); -} - -static int -objio_alloc_io_state(struct pnfs_layout_hdr *pnfs_layout_type, bool is_reading, - struct pnfs_layout_segment *lseg, struct page **pages, unsigned pgbase, - loff_t offset, size_t count, void *rpcdata, gfp_t gfp_flags, - struct objio_state **outp) -{ - struct objio_segment *objio_seg = OBJIO_LSEG(lseg); - struct ore_io_state *ios; - int ret; - struct __alloc_objio_state { - struct objio_state objios; - struct pnfs_osd_ioerr 
ioerrs[objio_seg->oc.numdevs]; - } *aos; - - aos = kzalloc(sizeof(*aos), gfp_flags); - if (unlikely(!aos)) - return -ENOMEM; - - objlayout_init_ioerrs(&aos->objios.oir, objio_seg->oc.numdevs, - aos->ioerrs, rpcdata, pnfs_layout_type); - - ret = ore_get_rw_state(&objio_seg->layout, &objio_seg->oc, is_reading, - offset, count, &ios); - if (unlikely(ret)) { - kfree(aos); - return ret; - } - - ios->pages = pages; - ios->pgbase = pgbase; - ios->private = aos; - BUG_ON(ios->nr_pages > (pgbase + count + PAGE_SIZE - 1) >> PAGE_SHIFT); - - aos->objios.sync = 0; - aos->objios.ios = ios; - *outp = &aos->objios; - return 0; -} - -void objio_free_result(struct objlayout_io_res *oir) -{ - struct objio_state *objios = container_of(oir, struct objio_state, oir); - - ore_put_io_state(objios->ios); - kfree(objios); -} - -static enum pnfs_osd_errno osd_pri_2_pnfs_err(enum osd_err_priority oep) -{ - switch (oep) { - case OSD_ERR_PRI_NO_ERROR: - return (enum pnfs_osd_errno)0; - - case OSD_ERR_PRI_CLEAR_PAGES: - BUG_ON(1); - return 0; - - case OSD_ERR_PRI_RESOURCE: - return PNFS_OSD_ERR_RESOURCE; - case OSD_ERR_PRI_BAD_CRED: - return PNFS_OSD_ERR_BAD_CRED; - case OSD_ERR_PRI_NO_ACCESS: - return PNFS_OSD_ERR_NO_ACCESS; - case OSD_ERR_PRI_UNREACHABLE: - return PNFS_OSD_ERR_UNREACHABLE; - case OSD_ERR_PRI_NOT_FOUND: - return PNFS_OSD_ERR_NOT_FOUND; - case OSD_ERR_PRI_NO_SPACE: - return PNFS_OSD_ERR_NO_SPACE; - default: - WARN_ON(1); - /* fallthrough */ - case OSD_ERR_PRI_EIO: - return PNFS_OSD_ERR_EIO; - } -} - -static void __on_dev_error(struct ore_io_state *ios, - struct ore_dev *od, unsigned dev_index, enum osd_err_priority oep, - u64 dev_offset, u64 dev_len) -{ - struct objio_state *objios = ios->private; - struct pnfs_osd_objid pooid; - struct objio_dev_ent *ode = container_of(od, typeof(*ode), od); - /* FIXME: what to do with more-then-one-group layouts. 
We need to - * translate from ore_io_state index to oc->comps index - */ - unsigned comp = dev_index; - - pooid.oid_device_id = ode->id_node.deviceid; - pooid.oid_partition_id = ios->oc->comps[comp].obj.partition; - pooid.oid_object_id = ios->oc->comps[comp].obj.id; - - objlayout_io_set_result(&objios->oir, comp, - &pooid, osd_pri_2_pnfs_err(oep), - dev_offset, dev_len, !ios->reading); -} - -/* - * read - */ -static void _read_done(struct ore_io_state *ios, void *private) -{ - struct objio_state *objios = private; - ssize_t status; - int ret = ore_check_io(ios, &__on_dev_error); - - /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ - - if (likely(!ret)) - status = ios->length; - else - status = ret; - - objlayout_read_done(&objios->oir, status, objios->sync); -} - -int objio_read_pagelist(struct nfs_pgio_header *hdr) -{ - struct objio_state *objios; - int ret; - - ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, true, - hdr->lseg, hdr->args.pages, hdr->args.pgbase, - hdr->args.offset, hdr->args.count, hdr, - GFP_KERNEL, &objios); - if (unlikely(ret)) - return ret; - - objios->ios->done = _read_done; - dprintk("%s: offset=0x%llx length=0x%x\n", __func__, - hdr->args.offset, hdr->args.count); - ret = ore_read(objios->ios); - if (unlikely(ret)) - objio_free_result(&objios->oir); - return ret; -} - -/* - * write - */ -static void _write_done(struct ore_io_state *ios, void *private) -{ - struct objio_state *objios = private; - ssize_t status; - int ret = ore_check_io(ios, &__on_dev_error); - - /* FIXME: _io_free(ios) can we dealocate the libosd resources; */ - - if (likely(!ret)) { - /* FIXME: should be based on the OSD's persistence model - * See OSD2r05 Section 4.13 Data persistence model */ - objios->oir.committed = NFS_FILE_SYNC; - status = ios->length; - } else { - status = ret; - } - - objlayout_write_done(&objios->oir, status, objios->sync); -} - -static struct page *__r4w_get_page(void *priv, u64 offset, bool *uptodate) -{ - struct objio_state *objios = priv; - struct nfs_pgio_header *hdr = objios->oir.rpcdata; - struct address_space *mapping = hdr->inode->i_mapping; - pgoff_t index = offset / PAGE_SIZE; - struct page *page; - loff_t i_size = i_size_read(hdr->inode); - - if (offset >= i_size) { - *uptodate = true; - dprintk("%s: g_zero_page index=0x%lx\n", __func__, index); - return ZERO_PAGE(0); - } - - page = find_get_page(mapping, index); - if (!page) { - page = find_or_create_page(mapping, index, GFP_NOFS); - if (unlikely(!page)) { - dprintk("%s: grab_cache_page Failed index=0x%lx\n", - __func__, index); - return NULL; - } - unlock_page(page); - } - *uptodate = PageUptodate(page); - dprintk("%s: index=0x%lx uptodate=%d\n", __func__, index, *uptodate); - return page; -} - -static void __r4w_put_page(void *priv, struct page *page) -{ - dprintk("%s: index=0x%lx\n", __func__, - (page == ZERO_PAGE(0)) ? 
-1UL : page->index); - if (ZERO_PAGE(0) != page) - put_page(page); - return; -} - -static const struct _ore_r4w_op _r4w_op = { - .get_page = &__r4w_get_page, - .put_page = &__r4w_put_page, -}; - -int objio_write_pagelist(struct nfs_pgio_header *hdr, int how) -{ - struct objio_state *objios; - int ret; - - ret = objio_alloc_io_state(NFS_I(hdr->inode)->layout, false, - hdr->lseg, hdr->args.pages, hdr->args.pgbase, - hdr->args.offset, hdr->args.count, hdr, GFP_NOFS, - &objios); - if (unlikely(ret)) - return ret; - - objios->sync = 0 != (how & FLUSH_SYNC); - objios->ios->r4w = &_r4w_op; - - if (!objios->sync) - objios->ios->done = _write_done; - - dprintk("%s: offset=0x%llx length=0x%x\n", __func__, - hdr->args.offset, hdr->args.count); - ret = ore_write(objios->ios); - if (unlikely(ret)) { - objio_free_result(&objios->oir); - return ret; - } - - if (objios->sync) - _write_done(objios->ios, objios); - - return 0; -} - -/* - * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number - * of bytes (maximum @req->wb_bytes) that can be coalesced. - */ -static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio, - struct nfs_page *prev, struct nfs_page *req) -{ - struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio); - unsigned int size; - - size = pnfs_generic_pg_test(pgio, prev, req); - - if (!size || mirror->pg_count + req->wb_bytes > - (unsigned long)pgio->pg_layout_private) - return 0; - - return min(size, req->wb_bytes); -} - -static void objio_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - pnfs_generic_pg_init_read(pgio, req); - if (unlikely(pgio->pg_lseg == NULL)) - return; /* Not pNFS */ - - pgio->pg_layout_private = (void *) - OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; -} - -static bool aligned_on_raid_stripe(u64 offset, struct ore_layout *layout, - unsigned long *stripe_end) -{ - u32 stripe_off; - unsigned stripe_size; - - if (layout->raid_algorithm == PNFS_OSD_RAID_0) - return true; - - stripe_size = layout->stripe_unit * - (layout->group_width - layout->parity); - - div_u64_rem(offset, stripe_size, &stripe_off); - if (!stripe_off) - return true; - - *stripe_end = stripe_size - stripe_off; - return false; -} - -static void objio_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) -{ - unsigned long stripe_end = 0; - u64 wb_size; - - if (pgio->pg_dreq == NULL) - wb_size = i_size_read(pgio->pg_inode) - req_offset(req); - else - wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); - - pnfs_generic_pg_init_write(pgio, req, wb_size); - if (unlikely(pgio->pg_lseg == NULL)) - return; /* Not pNFS */ - - if (req->wb_offset || - !aligned_on_raid_stripe(req->wb_index * PAGE_SIZE, - &OBJIO_LSEG(pgio->pg_lseg)->layout, - &stripe_end)) { - pgio->pg_layout_private = (void *)stripe_end; - } else { - pgio->pg_layout_private = (void *) - OBJIO_LSEG(pgio->pg_lseg)->layout.max_io_length; - } -} - -static const struct nfs_pageio_ops objio_pg_read_ops = { - .pg_init = objio_init_read, - .pg_test = objio_pg_test, - .pg_doio = pnfs_generic_pg_readpages, - .pg_cleanup = pnfs_generic_pg_cleanup, -}; - -static const struct nfs_pageio_ops objio_pg_write_ops = { - .pg_init = objio_init_write, - .pg_test = objio_pg_test, - .pg_doio = pnfs_generic_pg_writepages, - .pg_cleanup = pnfs_generic_pg_cleanup, -}; - -static struct pnfs_layoutdriver_type objlayout_type = { - .id = LAYOUT_OSD2_OBJECTS, - .name = "LAYOUT_OSD2_OBJECTS", - .flags = PNFS_LAYOUTRET_ON_SETATTR | - PNFS_LAYOUTRET_ON_ERROR, - - .max_deviceinfo_size = PAGE_SIZE, - .owner = 
THIS_MODULE, - .alloc_layout_hdr = objlayout_alloc_layout_hdr, - .free_layout_hdr = objlayout_free_layout_hdr, - - .alloc_lseg = objlayout_alloc_lseg, - .free_lseg = objlayout_free_lseg, - - .read_pagelist = objlayout_read_pagelist, - .write_pagelist = objlayout_write_pagelist, - .pg_read_ops = &objio_pg_read_ops, - .pg_write_ops = &objio_pg_write_ops, - - .sync = pnfs_generic_sync, - - .free_deviceid_node = objio_free_deviceid_node, - - .encode_layoutcommit = objlayout_encode_layoutcommit, - .encode_layoutreturn = objlayout_encode_layoutreturn, -}; - -MODULE_DESCRIPTION("pNFS Layout Driver for OSD2 objects"); -MODULE_AUTHOR("Benny Halevy "); -MODULE_LICENSE("GPL"); - -static int __init -objlayout_init(void) -{ - int ret = pnfs_register_layoutdriver(&objlayout_type); - - if (ret) - printk(KERN_INFO - "NFS: %s: Registering OSD pNFS Layout Driver failed: error=%d\n", - __func__, ret); - else - printk(KERN_INFO "NFS: %s: Registered OSD pNFS Layout Driver\n", - __func__); - return ret; -} - -static void __exit -objlayout_exit(void) -{ - pnfs_unregister_layoutdriver(&objlayout_type); - printk(KERN_INFO "NFS: %s: Unregistered OSD pNFS Layout Driver\n", - __func__); -} - -MODULE_ALIAS("nfs-layouttype4-2"); - -module_init(objlayout_init); -module_exit(objlayout_exit); diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c deleted file mode 100644 index 8f3d2acb81c3d816f7bb12f78fa6c50fb8a20528..0000000000000000000000000000000000000000 --- a/fs/nfs/objlayout/objlayout.c +++ /dev/null @@ -1,706 +0,0 @@ -/* - * pNFS Objects layout driver high level definitions - * - * Copyright (C) 2007 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy - * Boaz Harrosh - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include -#include -#include -#include -#include "objlayout.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD -/* - * Create a objlayout layout structure for the given inode and return it. - */ -struct pnfs_layout_hdr * -objlayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) -{ - struct objlayout *objlay; - - objlay = kzalloc(sizeof(struct objlayout), gfp_flags); - if (!objlay) - return NULL; - spin_lock_init(&objlay->lock); - INIT_LIST_HEAD(&objlay->err_list); - dprintk("%s: Return %p\n", __func__, objlay); - return &objlay->pnfs_layout; -} - -/* - * Free an objlayout layout structure - */ -void -objlayout_free_layout_hdr(struct pnfs_layout_hdr *lo) -{ - struct objlayout *objlay = OBJLAYOUT(lo); - - dprintk("%s: objlay %p\n", __func__, objlay); - - WARN_ON(!list_empty(&objlay->err_list)); - kfree(objlay); -} - -/* - * Unmarshall layout and store it in pnfslay. - */ -struct pnfs_layout_segment * -objlayout_alloc_lseg(struct pnfs_layout_hdr *pnfslay, - struct nfs4_layoutget_res *lgr, - gfp_t gfp_flags) -{ - int status = -ENOMEM; - struct xdr_stream stream; - struct xdr_buf buf = { - .pages = lgr->layoutp->pages, - .page_len = lgr->layoutp->len, - .buflen = lgr->layoutp->len, - .len = lgr->layoutp->len, - }; - struct page *scratch; - struct pnfs_layout_segment *lseg; - - dprintk("%s: Begin pnfslay %p\n", __func__, pnfslay); - - scratch = alloc_page(gfp_flags); - if (!scratch) - goto err_nofree; - - xdr_init_decode(&stream, &buf, NULL); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); - - status = objio_alloc_lseg(&lseg, pnfslay, &lgr->range, &stream, gfp_flags); - if (unlikely(status)) { - dprintk("%s: objio_alloc_lseg Return err %d\n", __func__, - status); - goto err; - } - - __free_page(scratch); - - dprintk("%s: Return %p\n", __func__, lseg); - return lseg; - -err: - __free_page(scratch); -err_nofree: - dprintk("%s: Err Return=>%d\n", __func__, status); - return ERR_PTR(status); -} - -/* - * Free a layout segement - */ -void -objlayout_free_lseg(struct pnfs_layout_segment *lseg) -{ - dprintk("%s: freeing layout segment %p\n", __func__, lseg); - - if (unlikely(!lseg)) - return; - - objio_free_lseg(lseg); -} - -/* - * I/O Operations - */ -static inline u64 -end_offset(u64 start, u64 len) -{ - u64 end; - - end = start + len; - return end >= start ? end : NFS4_MAX_UINT64; -} - -static void _fix_verify_io_params(struct pnfs_layout_segment *lseg, - struct page ***p_pages, unsigned *p_pgbase, - u64 offset, unsigned long count) -{ - u64 lseg_end_offset; - - BUG_ON(offset < lseg->pls_range.offset); - lseg_end_offset = end_offset(lseg->pls_range.offset, - lseg->pls_range.length); - BUG_ON(offset >= lseg_end_offset); - WARN_ON(offset + count > lseg_end_offset); - - if (*p_pgbase > PAGE_SIZE) { - dprintk("%s: pgbase(0x%x) > PAGE_SIZE\n", __func__, *p_pgbase); - *p_pages += *p_pgbase >> PAGE_SHIFT; - *p_pgbase &= ~PAGE_MASK; - } -} - -/* - * I/O done common code - */ -static void -objlayout_iodone(struct objlayout_io_res *oir) -{ - if (likely(oir->status >= 0)) { - objio_free_result(oir); - } else { - struct objlayout *objlay = oir->objlay; - - spin_lock(&objlay->lock); - objlay->delta_space_valid = OBJ_DSU_INVALID; - list_add(&objlay->err_list, &oir->err_list); - spin_unlock(&objlay->lock); - } -} - -/* - * objlayout_io_set_result - Set an osd_error code on a specific osd comp. - * - * The @index component IO failed (error returned from target). Register - * the error for later reporting at layout-return. 
- */ -void -objlayout_io_set_result(struct objlayout_io_res *oir, unsigned index, - struct pnfs_osd_objid *pooid, int osd_error, - u64 offset, u64 length, bool is_write) -{ - struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[index]; - - BUG_ON(index >= oir->num_comps); - if (osd_error) { - ioerr->oer_component = *pooid; - ioerr->oer_comp_offset = offset; - ioerr->oer_comp_length = length; - ioerr->oer_iswrite = is_write; - ioerr->oer_errno = osd_error; - - dprintk("%s: err[%d]: errno=%d is_write=%d dev(%llx:%llx) " - "par=0x%llx obj=0x%llx offset=0x%llx length=0x%llx\n", - __func__, index, ioerr->oer_errno, - ioerr->oer_iswrite, - _DEVID_LO(&ioerr->oer_component.oid_device_id), - _DEVID_HI(&ioerr->oer_component.oid_device_id), - ioerr->oer_component.oid_partition_id, - ioerr->oer_component.oid_object_id, - ioerr->oer_comp_offset, - ioerr->oer_comp_length); - } else { - /* User need not call if no error is reported */ - ioerr->oer_errno = 0; - } -} - -/* Function scheduled on rpc workqueue to call ->nfs_readlist_complete(). - * This is because the osd completion is called with ints-off from - * the block layer - */ -static void _rpc_read_complete(struct work_struct *work) -{ - struct rpc_task *task; - struct nfs_pgio_header *hdr; - - dprintk("%s enter\n", __func__); - task = container_of(work, struct rpc_task, u.tk_work); - hdr = container_of(task, struct nfs_pgio_header, task); - - pnfs_ld_read_done(hdr); -} - -void -objlayout_read_done(struct objlayout_io_res *oir, ssize_t status, bool sync) -{ - struct nfs_pgio_header *hdr = oir->rpcdata; - - oir->status = hdr->task.tk_status = status; - if (status >= 0) - hdr->res.count = status; - else - hdr->pnfs_error = status; - objlayout_iodone(oir); - /* must not use oir after this point */ - - dprintk("%s: Return status=%zd eof=%d sync=%d\n", __func__, - status, hdr->res.eof, sync); - - if (sync) - pnfs_ld_read_done(hdr); - else { - INIT_WORK(&hdr->task.u.tk_work, _rpc_read_complete); - schedule_work(&hdr->task.u.tk_work); - } -} - -/* - * Perform sync or async reads. - */ -enum pnfs_try_status -objlayout_read_pagelist(struct nfs_pgio_header *hdr) -{ - struct inode *inode = hdr->inode; - loff_t offset = hdr->args.offset; - size_t count = hdr->args.count; - int err; - loff_t eof; - - eof = i_size_read(inode); - if (unlikely(offset + count > eof)) { - if (offset >= eof) { - err = 0; - hdr->res.count = 0; - hdr->res.eof = 1; - /*FIXME: do we need to call pnfs_ld_read_done() */ - goto out; - } - count = eof - offset; - } - - hdr->res.eof = (offset + count) >= eof; - _fix_verify_io_params(hdr->lseg, &hdr->args.pages, - &hdr->args.pgbase, - hdr->args.offset, hdr->args.count); - - dprintk("%s: inode(%lx) offset 0x%llx count 0x%zx eof=%d\n", - __func__, inode->i_ino, offset, count, hdr->res.eof); - - err = objio_read_pagelist(hdr); - out: - if (unlikely(err)) { - hdr->pnfs_error = err; - dprintk("%s: Returned Error %d\n", __func__, err); - return PNFS_NOT_ATTEMPTED; - } - return PNFS_ATTEMPTED; -} - -/* Function scheduled on rpc workqueue to call ->nfs_writelist_complete(). 
- * This is because the osd completion is called with ints-off from - * the block layer - */ -static void _rpc_write_complete(struct work_struct *work) -{ - struct rpc_task *task; - struct nfs_pgio_header *hdr; - - dprintk("%s enter\n", __func__); - task = container_of(work, struct rpc_task, u.tk_work); - hdr = container_of(task, struct nfs_pgio_header, task); - - pnfs_ld_write_done(hdr); -} - -void -objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync) -{ - struct nfs_pgio_header *hdr = oir->rpcdata; - - oir->status = hdr->task.tk_status = status; - if (status >= 0) { - hdr->res.count = status; - hdr->verf.committed = oir->committed; - } else { - hdr->pnfs_error = status; - } - objlayout_iodone(oir); - /* must not use oir after this point */ - - dprintk("%s: Return status %zd committed %d sync=%d\n", __func__, - status, hdr->verf.committed, sync); - - if (sync) - pnfs_ld_write_done(hdr); - else { - INIT_WORK(&hdr->task.u.tk_work, _rpc_write_complete); - schedule_work(&hdr->task.u.tk_work); - } -} - -/* - * Perform sync or async writes. - */ -enum pnfs_try_status -objlayout_write_pagelist(struct nfs_pgio_header *hdr, int how) -{ - int err; - - _fix_verify_io_params(hdr->lseg, &hdr->args.pages, - &hdr->args.pgbase, - hdr->args.offset, hdr->args.count); - - err = objio_write_pagelist(hdr, how); - if (unlikely(err)) { - hdr->pnfs_error = err; - dprintk("%s: Returned Error %d\n", __func__, err); - return PNFS_NOT_ATTEMPTED; - } - return PNFS_ATTEMPTED; -} - -void -objlayout_encode_layoutcommit(struct pnfs_layout_hdr *pnfslay, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *args) -{ - struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct pnfs_osd_layoutupdate lou; - __be32 *start; - - dprintk("%s: Begin\n", __func__); - - spin_lock(&objlay->lock); - lou.dsu_valid = (objlay->delta_space_valid == OBJ_DSU_VALID); - lou.dsu_delta = objlay->delta_space_used; - objlay->delta_space_used = 0; - objlay->delta_space_valid = OBJ_DSU_INIT; - lou.olu_ioerr_flag = !list_empty(&objlay->err_list); - spin_unlock(&objlay->lock); - - start = xdr_reserve_space(xdr, 4); - - BUG_ON(pnfs_osd_xdr_encode_layoutupdate(xdr, &lou)); - - *start = cpu_to_be32((xdr->p - start - 1) * 4); - - dprintk("%s: Return delta_space_used %lld err %d\n", __func__, - lou.dsu_delta, lou.olu_ioerr_flag); -} - -static int -err_prio(u32 oer_errno) -{ - switch (oer_errno) { - case 0: - return 0; - - case PNFS_OSD_ERR_RESOURCE: - return OSD_ERR_PRI_RESOURCE; - case PNFS_OSD_ERR_BAD_CRED: - return OSD_ERR_PRI_BAD_CRED; - case PNFS_OSD_ERR_NO_ACCESS: - return OSD_ERR_PRI_NO_ACCESS; - case PNFS_OSD_ERR_UNREACHABLE: - return OSD_ERR_PRI_UNREACHABLE; - case PNFS_OSD_ERR_NOT_FOUND: - return OSD_ERR_PRI_NOT_FOUND; - case PNFS_OSD_ERR_NO_SPACE: - return OSD_ERR_PRI_NO_SPACE; - default: - WARN_ON(1); - /* fallthrough */ - case PNFS_OSD_ERR_EIO: - return OSD_ERR_PRI_EIO; - } -} - -static void -merge_ioerr(struct pnfs_osd_ioerr *dest_err, - const struct pnfs_osd_ioerr *src_err) -{ - u64 dest_end, src_end; - - if (!dest_err->oer_errno) { - *dest_err = *src_err; - /* accumulated device must be blank */ - memset(&dest_err->oer_component.oid_device_id, 0, - sizeof(dest_err->oer_component.oid_device_id)); - - return; - } - - if (dest_err->oer_component.oid_partition_id != - src_err->oer_component.oid_partition_id) - dest_err->oer_component.oid_partition_id = 0; - - if (dest_err->oer_component.oid_object_id != - src_err->oer_component.oid_object_id) - dest_err->oer_component.oid_object_id = 0; - - if 
(dest_err->oer_comp_offset > src_err->oer_comp_offset) - dest_err->oer_comp_offset = src_err->oer_comp_offset; - - dest_end = end_offset(dest_err->oer_comp_offset, - dest_err->oer_comp_length); - src_end = end_offset(src_err->oer_comp_offset, - src_err->oer_comp_length); - if (dest_end < src_end) - dest_end = src_end; - - dest_err->oer_comp_length = dest_end - dest_err->oer_comp_offset; - - if ((src_err->oer_iswrite == dest_err->oer_iswrite) && - (err_prio(src_err->oer_errno) > err_prio(dest_err->oer_errno))) { - dest_err->oer_errno = src_err->oer_errno; - } else if (src_err->oer_iswrite) { - dest_err->oer_iswrite = true; - dest_err->oer_errno = src_err->oer_errno; - } -} - -static void -encode_accumulated_error(struct objlayout *objlay, __be32 *p) -{ - struct objlayout_io_res *oir, *tmp; - struct pnfs_osd_ioerr accumulated_err = {.oer_errno = 0}; - - list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { - unsigned i; - - for (i = 0; i < oir->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; - - if (!ioerr->oer_errno) - continue; - - printk(KERN_ERR "NFS: %s: err[%d]: errno=%d " - "is_write=%d dev(%llx:%llx) par=0x%llx " - "obj=0x%llx offset=0x%llx length=0x%llx\n", - __func__, i, ioerr->oer_errno, - ioerr->oer_iswrite, - _DEVID_LO(&ioerr->oer_component.oid_device_id), - _DEVID_HI(&ioerr->oer_component.oid_device_id), - ioerr->oer_component.oid_partition_id, - ioerr->oer_component.oid_object_id, - ioerr->oer_comp_offset, - ioerr->oer_comp_length); - - merge_ioerr(&accumulated_err, ioerr); - } - list_del(&oir->err_list); - objio_free_result(oir); - } - - pnfs_osd_xdr_encode_ioerr(p, &accumulated_err); -} - -void -objlayout_encode_layoutreturn(struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args) -{ - struct pnfs_layout_hdr *pnfslay = args->layout; - struct objlayout *objlay = OBJLAYOUT(pnfslay); - struct objlayout_io_res *oir, *tmp; - __be32 *start; - - dprintk("%s: Begin\n", __func__); - start = xdr_reserve_space(xdr, 4); - BUG_ON(!start); - - spin_lock(&objlay->lock); - - list_for_each_entry_safe(oir, tmp, &objlay->err_list, err_list) { - __be32 *last_xdr = NULL, *p; - unsigned i; - int res = 0; - - for (i = 0; i < oir->num_comps; i++) { - struct pnfs_osd_ioerr *ioerr = &oir->ioerrs[i]; - - if (!ioerr->oer_errno) - continue; - - dprintk("%s: err[%d]: errno=%d is_write=%d " - "dev(%llx:%llx) par=0x%llx obj=0x%llx " - "offset=0x%llx length=0x%llx\n", - __func__, i, ioerr->oer_errno, - ioerr->oer_iswrite, - _DEVID_LO(&ioerr->oer_component.oid_device_id), - _DEVID_HI(&ioerr->oer_component.oid_device_id), - ioerr->oer_component.oid_partition_id, - ioerr->oer_component.oid_object_id, - ioerr->oer_comp_offset, - ioerr->oer_comp_length); - - p = pnfs_osd_xdr_ioerr_reserve_space(xdr); - if (unlikely(!p)) { - res = -E2BIG; - break; /* accumulated_error */ - } - - last_xdr = p; - pnfs_osd_xdr_encode_ioerr(p, &oir->ioerrs[i]); - } - - /* TODO: use xdr_write_pages */ - if (unlikely(res)) { - /* no space for even one error descriptor */ - BUG_ON(!last_xdr); - - /* we've encountered a situation with lots and lots of - * errors and no space to encode them all. Use the last - * available slot to report the union of all the - * remaining errors. 
- */ - encode_accumulated_error(objlay, last_xdr); - goto loop_done; - } - list_del(&oir->err_list); - objio_free_result(oir); - } -loop_done: - spin_unlock(&objlay->lock); - - *start = cpu_to_be32((xdr->p - start - 1) * 4); - dprintk("%s: Return\n", __func__); -} - -enum { - OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, - OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, - OSD_LOGIN_UPCALL_PATHLEN = 256 -}; - -static char osd_login_prog[OSD_LOGIN_UPCALL_PATHLEN] = "/sbin/osd_login"; - -module_param_string(osd_login_prog, osd_login_prog, sizeof(osd_login_prog), - 0600); -MODULE_PARM_DESC(osd_login_prog, "Path to the osd_login upcall program"); - -struct __auto_login { - char uri[OBJLAYOUT_MAX_URI_LEN]; - char osdname[OBJLAYOUT_MAX_OSDNAME_LEN]; - char systemid_hex[OBJLAYOUT_MAX_SYSID_HEX_LEN]; -}; - -static int __objlayout_upcall(struct __auto_login *login) -{ - static char *envp[] = { "HOME=/", - "TERM=linux", - "PATH=/sbin:/usr/sbin:/bin:/usr/bin", - NULL - }; - char *argv[8]; - int ret; - - if (unlikely(!osd_login_prog[0])) { - dprintk("%s: osd_login_prog is disabled\n", __func__); - return -EACCES; - } - - dprintk("%s uri: %s\n", __func__, login->uri); - dprintk("%s osdname %s\n", __func__, login->osdname); - dprintk("%s systemid_hex %s\n", __func__, login->systemid_hex); - - argv[0] = (char *)osd_login_prog; - argv[1] = "-u"; - argv[2] = login->uri; - argv[3] = "-o"; - argv[4] = login->osdname; - argv[5] = "-s"; - argv[6] = login->systemid_hex; - argv[7] = NULL; - - ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); - /* - * Disable the upcall mechanism if we're getting an ENOENT or - * EACCES error. The admin can re-enable it on the fly by using - * sysfs to set the objlayoutdriver.osd_login_prog module parameter once - * the problem has been fixed. 
- */ - if (ret == -ENOENT || ret == -EACCES) { - printk(KERN_ERR "PNFS-OBJ: %s was not found please set " - "objlayoutdriver.osd_login_prog kernel parameter!\n", - osd_login_prog); - osd_login_prog[0] = '\0'; - } - dprintk("%s %s return value: %d\n", __func__, osd_login_prog, ret); - - return ret; -} - -/* Assume dest is all zeros */ -static void __copy_nfsS_and_zero_terminate(struct nfs4_string s, - char *dest, int max_len, - const char *var_name) -{ - if (!s.len) - return; - - if (s.len >= max_len) { - pr_warn_ratelimited( - "objlayout_autologin: %s: s.len(%d) >= max_len(%d)", - var_name, s.len, max_len); - s.len = max_len - 1; /* space for null terminator */ - } - - memcpy(dest, s.data, s.len); -} - -/* Assume sysid is all zeros */ -static void _sysid_2_hex(struct nfs4_string s, - char sysid[OBJLAYOUT_MAX_SYSID_HEX_LEN]) -{ - int i; - char *cur; - - if (!s.len) - return; - - if (s.len != OSD_SYSTEMID_LEN) { - pr_warn_ratelimited( - "objlayout_autologin: systemid_len(%d) != OSD_SYSTEMID_LEN", - s.len); - if (s.len > OSD_SYSTEMID_LEN) - s.len = OSD_SYSTEMID_LEN; - } - - cur = sysid; - for (i = 0; i < s.len; i++) - cur = hex_byte_pack(cur, s.data[i]); -} - -int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr) -{ - int rc; - struct __auto_login login; - - if (!deviceaddr->oda_targetaddr.ota_netaddr.r_addr.len) - return -ENODEV; - - memset(&login, 0, sizeof(login)); - __copy_nfsS_and_zero_terminate( - deviceaddr->oda_targetaddr.ota_netaddr.r_addr, - login.uri, sizeof(login.uri), "URI"); - - __copy_nfsS_and_zero_terminate( - deviceaddr->oda_osdname, - login.osdname, sizeof(login.osdname), "OSDNAME"); - - _sysid_2_hex(deviceaddr->oda_systemid, login.systemid_hex); - - rc = __objlayout_upcall(&login); - if (rc > 0) /* script returns positive values */ - rc = -ENODEV; - - return rc; -} diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h deleted file mode 100644 index fc94a5872ed417b94b89213a34ead3e2105f78ec..0000000000000000000000000000000000000000 --- a/fs/nfs/objlayout/objlayout.h +++ /dev/null @@ -1,183 +0,0 @@ -/* - * Data types and function declerations for interfacing with the - * pNFS standard object layout driver. - * - * Copyright (C) 2007 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy - * Boaz Harrosh - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef _OBJLAYOUT_H -#define _OBJLAYOUT_H - -#include -#include -#include "../pnfs.h" - -/* - * per-inode layout - */ -struct objlayout { - struct pnfs_layout_hdr pnfs_layout; - - /* for layout_commit */ - enum osd_delta_space_valid_enum { - OBJ_DSU_INIT = 0, - OBJ_DSU_VALID, - OBJ_DSU_INVALID, - } delta_space_valid; - s64 delta_space_used; /* consumed by write ops */ - - /* for layout_return */ - spinlock_t lock; - struct list_head err_list; -}; - -static inline struct objlayout * -OBJLAYOUT(struct pnfs_layout_hdr *lo) -{ - return container_of(lo, struct objlayout, pnfs_layout); -} - -/* - * per-I/O operation state - * embedded in objects provider io_state data structure - */ -struct objlayout_io_res { - struct objlayout *objlay; - - void *rpcdata; - int status; /* res */ - int committed; /* res */ - - /* Error reporting (layout_return) */ - struct list_head err_list; - unsigned num_comps; - /* Pointer to array of error descriptors of size num_comps. - * It should contain as many entries as devices in the osd_layout - * that participate in the I/O. It is up to the io_engine to allocate - * needed space and set num_comps. - */ - struct pnfs_osd_ioerr *ioerrs; -}; - -static inline -void objlayout_init_ioerrs(struct objlayout_io_res *oir, unsigned num_comps, - struct pnfs_osd_ioerr *ioerrs, void *rpcdata, - struct pnfs_layout_hdr *pnfs_layout_type) -{ - oir->objlay = OBJLAYOUT(pnfs_layout_type); - oir->rpcdata = rpcdata; - INIT_LIST_HEAD(&oir->err_list); - oir->num_comps = num_comps; - oir->ioerrs = ioerrs; -} - -/* - * Raid engine I/O API - */ -extern int objio_alloc_lseg(struct pnfs_layout_segment **outp, - struct pnfs_layout_hdr *pnfslay, - struct pnfs_layout_range *range, - struct xdr_stream *xdr, - gfp_t gfp_flags); -extern void objio_free_lseg(struct pnfs_layout_segment *lseg); - -/* objio_free_result will free these @oir structs received from - * objlayout_{read,write}_done - */ -extern void objio_free_result(struct objlayout_io_res *oir); - -extern int objio_read_pagelist(struct nfs_pgio_header *rdata); -extern int objio_write_pagelist(struct nfs_pgio_header *wdata, int how); - -/* - * callback API - */ -extern void objlayout_io_set_result(struct objlayout_io_res *oir, - unsigned index, struct pnfs_osd_objid *pooid, - int osd_error, u64 offset, u64 length, bool is_write); - -static inline void -objlayout_add_delta_space_used(struct objlayout *objlay, s64 space_used) -{ - /* If one of the I/Os errored out and the delta_space_used was - * invalid we render the complete report as invalid. Protocol mandate - * the DSU be accurate or not reported. 
- */ - spin_lock(&objlay->lock); - if (objlay->delta_space_valid != OBJ_DSU_INVALID) { - objlay->delta_space_valid = OBJ_DSU_VALID; - objlay->delta_space_used += space_used; - } - spin_unlock(&objlay->lock); -} - -extern void objlayout_read_done(struct objlayout_io_res *oir, - ssize_t status, bool sync); -extern void objlayout_write_done(struct objlayout_io_res *oir, - ssize_t status, bool sync); - -/* - * exported generic objects function vectors - */ - -extern struct pnfs_layout_hdr *objlayout_alloc_layout_hdr(struct inode *, gfp_t gfp_flags); -extern void objlayout_free_layout_hdr(struct pnfs_layout_hdr *); - -extern struct pnfs_layout_segment *objlayout_alloc_lseg( - struct pnfs_layout_hdr *, - struct nfs4_layoutget_res *, - gfp_t gfp_flags); -extern void objlayout_free_lseg(struct pnfs_layout_segment *); - -extern enum pnfs_try_status objlayout_read_pagelist( - struct nfs_pgio_header *); - -extern enum pnfs_try_status objlayout_write_pagelist( - struct nfs_pgio_header *, - int how); - -extern void objlayout_encode_layoutcommit( - struct pnfs_layout_hdr *, - struct xdr_stream *, - const struct nfs4_layoutcommit_args *); - -extern void objlayout_encode_layoutreturn( - struct xdr_stream *, - const struct nfs4_layoutreturn_args *); - -extern int objlayout_autologin(struct pnfs_osd_deviceaddr *deviceaddr); - -#endif /* _OBJLAYOUT_H */ diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c deleted file mode 100644 index f093c7ec983bbc2bb0b2cb21d0aa8c1eaba411d8..0000000000000000000000000000000000000000 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ /dev/null @@ -1,415 +0,0 @@ -/* - * Object-Based pNFS Layout XDR layer - * - * Copyright (C) 2007 Panasas Inc. [year of first publication] - * All rights reserved. - * - * Benny Halevy - * Boaz Harrosh - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * See the file COPYING included with this distribution for more details. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the Panasas company nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE - * DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR - * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF - * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING - * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -/* - * The following implementation is based on RFC5664 - */ - -/* - * struct pnfs_osd_objid { - * struct nfs4_deviceid oid_device_id; - * u64 oid_partition_id; - * u64 oid_object_id; - * }; // xdr size 32 bytes - */ -static __be32 * -_osd_xdr_decode_objid(__be32 *p, struct pnfs_osd_objid *objid) -{ - p = xdr_decode_opaque_fixed(p, objid->oid_device_id.data, - sizeof(objid->oid_device_id.data)); - - p = xdr_decode_hyper(p, &objid->oid_partition_id); - p = xdr_decode_hyper(p, &objid->oid_object_id); - return p; -} -/* - * struct pnfs_osd_opaque_cred { - * u32 cred_len; - * void *cred; - * }; // xdr size [variable] - * The return pointers are from the xdr buffer - */ -static int -_osd_xdr_decode_opaque_cred(struct pnfs_osd_opaque_cred *opaque_cred, - struct xdr_stream *xdr) -{ - __be32 *p = xdr_inline_decode(xdr, 1); - - if (!p) - return -EINVAL; - - opaque_cred->cred_len = be32_to_cpu(*p++); - - p = xdr_inline_decode(xdr, opaque_cred->cred_len); - if (!p) - return -EINVAL; - - opaque_cred->cred = p; - return 0; -} - -/* - * struct pnfs_osd_object_cred { - * struct pnfs_osd_objid oc_object_id; - * u32 oc_osd_version; - * u32 oc_cap_key_sec; - * struct pnfs_osd_opaque_cred oc_cap_key - * struct pnfs_osd_opaque_cred oc_cap; - * }; // xdr size 32 + 4 + 4 + [variable] + [variable] - */ -static int -_osd_xdr_decode_object_cred(struct pnfs_osd_object_cred *comp, - struct xdr_stream *xdr) -{ - __be32 *p = xdr_inline_decode(xdr, 32 + 4 + 4); - int ret; - - if (!p) - return -EIO; - - p = _osd_xdr_decode_objid(p, &comp->oc_object_id); - comp->oc_osd_version = be32_to_cpup(p++); - comp->oc_cap_key_sec = be32_to_cpup(p); - - ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap_key, xdr); - if (unlikely(ret)) - return ret; - - ret = _osd_xdr_decode_opaque_cred(&comp->oc_cap, xdr); - return ret; -} - -/* - * struct pnfs_osd_data_map { - * u32 odm_num_comps; - * u64 odm_stripe_unit; - * u32 odm_group_width; - * u32 odm_group_depth; - * u32 odm_mirror_cnt; - * u32 odm_raid_algorithm; - * }; // xdr size 4 + 8 + 4 + 4 + 4 + 4 - */ -static inline int -_osd_data_map_xdr_sz(void) -{ - return 4 + 8 + 4 + 4 + 4 + 4; -} - -static __be32 * -_osd_xdr_decode_data_map(__be32 *p, struct pnfs_osd_data_map *data_map) -{ - data_map->odm_num_comps = be32_to_cpup(p++); - p = xdr_decode_hyper(p, &data_map->odm_stripe_unit); - data_map->odm_group_width = be32_to_cpup(p++); - data_map->odm_group_depth = be32_to_cpup(p++); - data_map->odm_mirror_cnt = be32_to_cpup(p++); - data_map->odm_raid_algorithm = be32_to_cpup(p++); - dprintk("%s: odm_num_comps=%u odm_stripe_unit=%llu odm_group_width=%u " - "odm_group_depth=%u odm_mirror_cnt=%u odm_raid_algorithm=%u\n", - __func__, - data_map->odm_num_comps, - (unsigned long long)data_map->odm_stripe_unit, - data_map->odm_group_width, - data_map->odm_group_depth, - data_map->odm_mirror_cnt, - data_map->odm_raid_algorithm); - return p; -} - -int pnfs_osd_xdr_decode_layout_map(struct pnfs_osd_layout *layout, - struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr) -{ - __be32 *p; - - memset(iter, 0, sizeof(*iter)); - - p = xdr_inline_decode(xdr, _osd_data_map_xdr_sz() + 4 + 4); - if (unlikely(!p)) - return -EINVAL; - - p = _osd_xdr_decode_data_map(p, &layout->olo_map); - layout->olo_comps_index = be32_to_cpup(p++); - layout->olo_num_comps = be32_to_cpup(p++); - dprintk("%s: olo_comps_index=%d olo_num_comps=%d\n", __func__, - layout->olo_comps_index, layout->olo_num_comps); - - iter->total_comps = layout->olo_num_comps; - 
return 0; -} - -bool pnfs_osd_xdr_decode_layout_comp(struct pnfs_osd_object_cred *comp, - struct pnfs_osd_xdr_decode_layout_iter *iter, struct xdr_stream *xdr, - int *err) -{ - BUG_ON(iter->decoded_comps > iter->total_comps); - if (iter->decoded_comps == iter->total_comps) - return false; - - *err = _osd_xdr_decode_object_cred(comp, xdr); - if (unlikely(*err)) { - dprintk("%s: _osd_xdr_decode_object_cred=>%d decoded_comps=%d " - "total_comps=%d\n", __func__, *err, - iter->decoded_comps, iter->total_comps); - return false; /* stop the loop */ - } - dprintk("%s: dev(%llx:%llx) par=0x%llx obj=0x%llx " - "key_len=%u cap_len=%u\n", - __func__, - _DEVID_LO(&comp->oc_object_id.oid_device_id), - _DEVID_HI(&comp->oc_object_id.oid_device_id), - comp->oc_object_id.oid_partition_id, - comp->oc_object_id.oid_object_id, - comp->oc_cap_key.cred_len, comp->oc_cap.cred_len); - - iter->decoded_comps++; - return true; -} - -/* - * Get Device Information Decoding - * - * Note: since Device Information is currently done synchronously, all - * variable strings fields are left inside the rpc buffer and are only - * pointed to by the pnfs_osd_deviceaddr members. So the read buffer - * should not be freed while the returned information is in use. - */ -/* - *struct nfs4_string { - * unsigned int len; - * char *data; - *}; // size [variable] - * NOTE: Returned string points to inside the XDR buffer - */ -static __be32 * -__read_u8_opaque(__be32 *p, struct nfs4_string *str) -{ - str->len = be32_to_cpup(p++); - str->data = (char *)p; - - p += XDR_QUADLEN(str->len); - return p; -} - -/* - * struct pnfs_osd_targetid { - * u32 oti_type; - * struct nfs4_string oti_scsi_device_id; - * };// size 4 + [variable] - */ -static __be32 * -__read_targetid(__be32 *p, struct pnfs_osd_targetid* targetid) -{ - u32 oti_type; - - oti_type = be32_to_cpup(p++); - targetid->oti_type = oti_type; - - switch (oti_type) { - case OBJ_TARGET_SCSI_NAME: - case OBJ_TARGET_SCSI_DEVICE_ID: - p = __read_u8_opaque(p, &targetid->oti_scsi_device_id); - } - - return p; -} - -/* - * struct pnfs_osd_net_addr { - * struct nfs4_string r_netid; - * struct nfs4_string r_addr; - * }; - */ -static __be32 * -__read_net_addr(__be32 *p, struct pnfs_osd_net_addr* netaddr) -{ - p = __read_u8_opaque(p, &netaddr->r_netid); - p = __read_u8_opaque(p, &netaddr->r_addr); - - return p; -} - -/* - * struct pnfs_osd_targetaddr { - * u32 ota_available; - * struct pnfs_osd_net_addr ota_netaddr; - * }; - */ -static __be32 * -__read_targetaddr(__be32 *p, struct pnfs_osd_targetaddr *targetaddr) -{ - u32 ota_available; - - ota_available = be32_to_cpup(p++); - targetaddr->ota_available = ota_available; - - if (ota_available) - p = __read_net_addr(p, &targetaddr->ota_netaddr); - - - return p; -} - -/* - * struct pnfs_osd_deviceaddr { - * struct pnfs_osd_targetid oda_targetid; - * struct pnfs_osd_targetaddr oda_targetaddr; - * u8 oda_lun[8]; - * struct nfs4_string oda_systemid; - * struct pnfs_osd_object_cred oda_root_obj_cred; - * struct nfs4_string oda_osdname; - * }; - */ - -/* We need this version for the pnfs_osd_xdr_decode_deviceaddr which does - * not have an xdr_stream - */ -static __be32 * -__read_opaque_cred(__be32 *p, - struct pnfs_osd_opaque_cred *opaque_cred) -{ - opaque_cred->cred_len = be32_to_cpu(*p++); - opaque_cred->cred = p; - return p + XDR_QUADLEN(opaque_cred->cred_len); -} - -static __be32 * -__read_object_cred(__be32 *p, struct pnfs_osd_object_cred *comp) -{ - p = _osd_xdr_decode_objid(p, &comp->oc_object_id); - comp->oc_osd_version = be32_to_cpup(p++); - 
comp->oc_cap_key_sec = be32_to_cpup(p++); - - p = __read_opaque_cred(p, &comp->oc_cap_key); - p = __read_opaque_cred(p, &comp->oc_cap); - return p; -} - -void pnfs_osd_xdr_decode_deviceaddr( - struct pnfs_osd_deviceaddr *deviceaddr, __be32 *p) -{ - p = __read_targetid(p, &deviceaddr->oda_targetid); - - p = __read_targetaddr(p, &deviceaddr->oda_targetaddr); - - p = xdr_decode_opaque_fixed(p, deviceaddr->oda_lun, - sizeof(deviceaddr->oda_lun)); - - p = __read_u8_opaque(p, &deviceaddr->oda_systemid); - - p = __read_object_cred(p, &deviceaddr->oda_root_obj_cred); - - p = __read_u8_opaque(p, &deviceaddr->oda_osdname); - - /* libosd likes this terminated in dbg. It's last, so no problems */ - deviceaddr->oda_osdname.data[deviceaddr->oda_osdname.len] = 0; -} - -/* - * struct pnfs_osd_layoutupdate { - * u32 dsu_valid; - * s64 dsu_delta; - * u32 olu_ioerr_flag; - * }; xdr size 4 + 8 + 4 - */ -int -pnfs_osd_xdr_encode_layoutupdate(struct xdr_stream *xdr, - struct pnfs_osd_layoutupdate *lou) -{ - __be32 *p = xdr_reserve_space(xdr, 4 + 8 + 4); - - if (!p) - return -E2BIG; - - *p++ = cpu_to_be32(lou->dsu_valid); - if (lou->dsu_valid) - p = xdr_encode_hyper(p, lou->dsu_delta); - *p++ = cpu_to_be32(lou->olu_ioerr_flag); - return 0; -} - -/* - * struct pnfs_osd_objid { - * struct nfs4_deviceid oid_device_id; - * u64 oid_partition_id; - * u64 oid_object_id; - * }; // xdr size 32 bytes - */ -static inline __be32 * -pnfs_osd_xdr_encode_objid(__be32 *p, struct pnfs_osd_objid *object_id) -{ - p = xdr_encode_opaque_fixed(p, &object_id->oid_device_id.data, - sizeof(object_id->oid_device_id.data)); - p = xdr_encode_hyper(p, object_id->oid_partition_id); - p = xdr_encode_hyper(p, object_id->oid_object_id); - - return p; -} - -/* - * struct pnfs_osd_ioerr { - * struct pnfs_osd_objid oer_component; - * u64 oer_comp_offset; - * u64 oer_comp_length; - * u32 oer_iswrite; - * u32 oer_errno; - * }; // xdr size 32 + 24 bytes - */ -void pnfs_osd_xdr_encode_ioerr(__be32 *p, struct pnfs_osd_ioerr *ioerr) -{ - p = pnfs_osd_xdr_encode_objid(p, &ioerr->oer_component); - p = xdr_encode_hyper(p, ioerr->oer_comp_offset); - p = xdr_encode_hyper(p, ioerr->oer_comp_length); - *p++ = cpu_to_be32(ioerr->oer_iswrite); - *p = cpu_to_be32(ioerr->oer_errno); -} - -__be32 *pnfs_osd_xdr_ioerr_reserve_space(struct xdr_stream *xdr) -{ - __be32 *p; - - p = xdr_reserve_space(xdr, 32 + 24); - if (unlikely(!p)) - dprintk("%s: out of xdr space\n", __func__); - - return p; -} diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 6e629b856a00f2ebe52b4ba6badb356d49f380e6..ad92b401326c69a154b514b11764a2bc4a04a1cb 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -29,19 +29,6 @@ static struct kmem_cache *nfs_page_cachep; static const struct rpc_call_ops nfs_pgio_common_ops; -static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) -{ - p->npages = pagecount; - if (pagecount <= ARRAY_SIZE(p->page_array)) - p->pagevec = p->page_array; - else { - p->pagevec = kcalloc(pagecount, sizeof(struct page *), GFP_KERNEL); - if (!p->pagevec) - p->npages = 0; - } - return p->pagevec != NULL; -} - struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc) { @@ -115,6 +102,35 @@ nfs_iocounter_wait(struct nfs_lock_context *l_ctx) TASK_KILLABLE); } +/** + * nfs_async_iocounter_wait - wait on a rpc_waitqueue for I/O + * to complete + * @task: the rpc_task that should wait + * @l_ctx: nfs_lock_context with io_counter to check + * + * Returns true if there is outstanding I/O to wait on and the + * task has been put 
to sleep. + */ +bool +nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx) +{ + struct inode *inode = d_inode(l_ctx->open_context->dentry); + bool ret = false; + + if (atomic_read(&l_ctx->io_count) > 0) { + rpc_sleep_on(&NFS_SERVER(inode)->uoc_rpcwaitq, task, NULL); + ret = true; + } + + if (atomic_read(&l_ctx->io_count) == 0) { + rpc_wake_up_queued_task(&NFS_SERVER(inode)->uoc_rpcwaitq, task); + ret = false; + } + + return ret; +} +EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait); + /* * nfs_page_group_lock - lock the head of the page group * @req - request in group that is to be locked @@ -398,8 +414,11 @@ static void nfs_clear_request(struct nfs_page *req) req->wb_page = NULL; } if (l_ctx != NULL) { - if (atomic_dec_and_test(&l_ctx->io_count)) + if (atomic_dec_and_test(&l_ctx->io_count)) { wake_up_atomic_t(&l_ctx->io_count); + if (test_bit(NFS_CONTEXT_UNLOCK, &ctx->flags)) + rpc_wake_up(&NFS_SERVER(d_inode(ctx->dentry))->uoc_rpcwaitq); + } nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } @@ -677,7 +696,8 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int io_flags) + int io_flags, + gfp_t gfp_flags) { struct nfs_pgio_mirror *new; int i; @@ -701,7 +721,7 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc, /* until we have a request, we don't have an lseg and no * idea how many mirrors there will be */ new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX, - sizeof(struct nfs_pgio_mirror), GFP_KERNEL); + sizeof(struct nfs_pgio_mirror), gfp_flags); desc->pg_mirrors_dynamic = new; desc->pg_mirrors = new; @@ -754,13 +774,24 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, *last_page; struct list_head *head = &mirror->pg_list; struct nfs_commit_info cinfo; + struct nfs_page_array *pg_array = &hdr->page_array; unsigned int pagecount, pageused; + gfp_t gfp_flags = GFP_KERNEL; pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); - if (!nfs_pgarray_set(&hdr->page_array, pagecount)) { - nfs_pgio_error(hdr); - desc->pg_error = -ENOMEM; - return desc->pg_error; + + if (pagecount <= ARRAY_SIZE(pg_array->page_array)) + pg_array->pagevec = pg_array->page_array; + else { + if (hdr->rw_mode == FMODE_WRITE) + gfp_flags = GFP_NOIO; + pg_array->pagevec = kcalloc(pagecount, sizeof(struct page *), gfp_flags); + if (!pg_array->pagevec) { + pg_array->npages = 0; + nfs_pgio_error(hdr); + desc->pg_error = -ENOMEM; + return desc->pg_error; + } } nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); @@ -1256,8 +1287,10 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) mirror = &desc->pg_mirrors[midx]; if (!list_empty(&mirror->pg_list)) { prev = nfs_list_entry(mirror->pg_list.prev); - if (index != prev->wb_index + 1) - nfs_pageio_complete_mirror(desc, midx); + if (index != prev->wb_index + 1) { + nfs_pageio_complete(desc); + break; + } } } } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index dd042498ce7c67df24b4919c2e98dc8f48db13ca..adc6ec28d4b59d3181c76ca8f33a9fed25d68ce9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -322,9 +322,15 @@ pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, static void pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) { + struct pnfs_layout_segment *lseg; lo->plh_return_iomode = 0; lo->plh_return_seq = 0; clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + list_for_each_entry(lseg, &lo->plh_segs, pls_list) { + if (!test_bit(NFS_LSEG_LAYOUTRETURN, 
&lseg->pls_flags)) + continue; + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); + } } static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) @@ -367,9 +373,9 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, *next; set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - pnfs_clear_layoutreturn_info(lo); list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) pnfs_clear_lseg_state(lseg, lseg_list); + pnfs_clear_layoutreturn_info(lo); pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) @@ -563,7 +569,6 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) } } } -EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked); /* * is l2 fully contained in l1? @@ -728,6 +733,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); + nfs_commit_inode(&nfsi->vfs_inode, 0); pnfs_put_layout_hdr(lo); } else spin_unlock(&nfsi->vfs_inode.i_lock); @@ -1209,7 +1215,6 @@ _pnfs_return_layout(struct inode *ino) dprintk("<-- %s status: %d\n", __func__, status); return status; } -EXPORT_SYMBOL_GPL(_pnfs_return_layout); int pnfs_commit_and_return_layout(struct inode *inode) @@ -1991,6 +1996,8 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) spin_unlock(&ino->i_lock); lseg->pls_layout = lo; NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + if (!pnfs_layout_is_valid(lo)) + nfs_commit_inode(ino, 0); return ERR_PTR(-EAGAIN); } @@ -2051,9 +2058,11 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, bool return_now = false; spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo)) { + spin_unlock(&inode->i_lock); + return; + } pnfs_set_plh_return_info(lo, range.iomode, 0); - /* Block LAYOUTGET */ - set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. 
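As an aside, the pnfs.c hunks around this point share one defensive pattern: validate cached layout state before acting on it. A minimal sketch of that pattern, mirroring the pnfs_generic_pg_check_layout() helper the patch introduces just below; the function name here is illustrative only, not part of the patch:

/* Illustrative sketch: a cached lseg that has lost NFS_LSEG_VALID
 * must be dropped so the next request triggers a fresh LAYOUTGET
 * instead of reusing returned state.
 */
static void sketch_pg_check_layout(struct nfs_pageio_descriptor *pgio)
{
	if (pgio->pg_lseg == NULL ||
	    test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags))
		return;			/* nothing cached, or still valid */
	pnfs_put_lseg(pgio->pg_lseg);	/* release the stale segment */
	pgio->pg_lseg = NULL;		/* force a fresh layout lookup */
}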
See pnfs_put_lseg() @@ -2074,11 +2083,23 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, } EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); +void +pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio) +{ + if (pgio->pg_lseg == NULL || + test_bit(NFS_LSEG_VALID, &pgio->pg_lseg->pls_flags)) + return; + pnfs_put_lseg(pgio->pg_lseg); + pgio->pg_lseg = NULL; +} +EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout); + void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { u64 rd_size = req->wb_bytes; + pnfs_generic_pg_check_layout(pgio); if (pgio->pg_lseg == NULL) { if (pgio->pg_dreq == NULL) rd_size = i_size_read(pgio->pg_inode) - req_offset(req); @@ -2109,6 +2130,7 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { + pnfs_generic_pg_check_layout(pgio); if (pgio->pg_lseg == NULL) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, @@ -2277,8 +2299,20 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc, enum pnfs_try_status trypnfs; trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); - if (trypnfs == PNFS_NOT_ATTEMPTED) + switch (trypnfs) { + case PNFS_NOT_ATTEMPTED: pnfs_write_through_mds(desc, hdr); + case PNFS_ATTEMPTED: + break; + case PNFS_TRY_AGAIN: + /* cleanup hdr and prepare to redo pnfs */ + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + list_splice_init(&hdr->pages, &mirror->pg_list); + mirror->pg_recoalesce = 1; + } + hdr->mds_ops->rpc_release(hdr); + } } static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) @@ -2408,10 +2442,20 @@ pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) enum pnfs_try_status trypnfs; trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); - if (trypnfs == PNFS_TRY_AGAIN) - pnfs_read_resend_pnfs(hdr); - if (trypnfs == PNFS_NOT_ATTEMPTED || hdr->task.tk_status) + switch (trypnfs) { + case PNFS_NOT_ATTEMPTED: pnfs_read_through_mds(desc, hdr); + case PNFS_ATTEMPTED: + break; + case PNFS_TRY_AGAIN: + /* cleanup hdr and prepare to redo pnfs */ + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); + list_splice_init(&hdr->pages, &mirror->pg_list); + mirror->pg_recoalesce = 1; + } + hdr->mds_ops->rpc_release(hdr); + } } static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 590e1e35781f0b737b5b277d76ab56092f8e3f3b..2d05b756a8d6504e79796d71eb5471a578d9ef5a 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -173,14 +173,9 @@ struct pnfs_layoutdriver_type { gfp_t gfp_flags); int (*prepare_layoutreturn) (struct nfs4_layoutreturn_args *); - void (*encode_layoutreturn) (struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args); void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args); - void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *args); int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); }; @@ -239,6 +234,7 @@ void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *); void unset_pnfs_layoutdriver(struct nfs_server *); +void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio); void pnfs_generic_pg_init_read(struct 
nfs_pageio_descriptor *, struct nfs_page *); int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 7250b95549ecc73bd1dbdae9ec909aac64f93a49..d40755a0984bbb0942e96aee388ed50fe7629a6e 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -217,7 +217,14 @@ pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo, for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) { if (list_empty(&bucket->committing)) continue; - data = nfs_commitdata_alloc(); + /* + * If the layout segment is invalid, then let + * pnfs_generic_retry_commit() clean up the bucket. + */ + if (bucket->clseg && !pnfs_is_valid_lseg(bucket->clseg) && + !test_bit(NFS_LSEG_LAYOUTRETURN, &bucket->clseg->pls_flags)) + break; + data = nfs_commitdata_alloc(false); if (!data) break; data->ds_commit_index = i; @@ -283,16 +290,10 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, unsigned int nreq = 0; if (!list_empty(mds_pages)) { - data = nfs_commitdata_alloc(); - if (data != NULL) { - data->ds_commit_index = -1; - list_add(&data->pages, &list); - nreq++; - } else { - nfs_retry_commit(mds_pages, NULL, cinfo, 0); - pnfs_generic_retry_commit(cinfo, 0); - return -ENOMEM; - } + data = nfs_commitdata_alloc(true); + data->ds_commit_index = -1; + list_add(&data->pages, &list); + nreq++; } nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); @@ -619,7 +620,6 @@ void nfs4_pnfs_v3_ds_connect_unload(void) get_v3_ds_connect = NULL; } } -EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload); static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index b7bca83039895b5692163d2a119f29c126ea5d83..9872cf676a50a7cfe945bfc5294602b4f67b5673 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -638,7 +638,7 @@ nfs_proc_lock(struct file *filp, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(filp); - return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl); + return nlmclnt_proc(NFS_SERVER(inode)->nlm_host, cmd, fl, NULL); } /* Helper functions for NFS lock bounds checking */ diff --git a/fs/nfs/read.c b/fs/nfs/read.c index defc9233e9858c43a9438b2cf918a899b7f56afa..a8421d9dab6a125c4eb6b7ad9f074d8ce91a5a99 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -35,7 +35,11 @@ static struct kmem_cache *nfs_rdata_cachep; static struct nfs_pgio_header *nfs_readhdr_alloc(void) { - return kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); + struct nfs_pgio_header *p = kmem_cache_zalloc(nfs_rdata_cachep, GFP_KERNEL); + + if (p) + p->rw_mode = FMODE_READ; + return p; } static void nfs_readhdr_free(struct nfs_pgio_header *rhdr) @@ -64,7 +68,7 @@ void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_read_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_read_ops, - server->rsize, 0); + server->rsize, 0, GFP_KERNEL); } EXPORT_SYMBOL_GPL(nfs_pageio_init_read); @@ -451,7 +455,6 @@ void nfs_destroy_readpagecache(void) } static const struct nfs_rw_ops nfs_rw_read_ops = { - .rw_mode = FMODE_READ, .rw_alloc_header = nfs_readhdr_alloc, .rw_free_header = nfs_readhdr_free, .rw_done = nfs_readpage_done, diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 54e0f9f2dd94903d2d6d96de46287ebc31fe1b6e..2f3822a4a7d565e9e2e607b61d2c580c1befdd05 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2315,8 +2315,6 @@ inline void nfs_initialise_sb(struct super_block *sb) 
sb->s_blocksize = nfs_block_bits(server->wsize, &sb->s_blocksize_bits); - sb->s_bdi = &server->backing_dev_info; - nfs_super_set_maxbytes(sb, server->maxfilesize); } @@ -2522,11 +2520,6 @@ static void nfs_get_cache_cookie(struct super_block *sb, } #endif -static int nfs_bdi_register(struct nfs_server *server) -{ - return bdi_register_dev(&server->backing_dev_info, server->s_dev); -} - int nfs_set_sb_security(struct super_block *s, struct dentry *mntroot, struct nfs_mount_info *mount_info) { @@ -2594,11 +2587,13 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server, nfs_free_server(server); server = NULL; } else { - error = nfs_bdi_register(server); + error = super_setup_bdi_name(s, "%u:%u", MAJOR(server->s_dev), + MINOR(server->s_dev)); if (error) { mntroot = ERR_PTR(error); goto error_splat_super; } + s->s_bdi->ra_pages = server->rpages * NFS_MAX_READAHEAD; server->super = s; } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index abb2c8a3be42e4755f747c62a1cec5466f13ee77..db7ba542559e7def882448708bb5dc0d530a3416 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -60,14 +60,28 @@ static mempool_t *nfs_wdata_mempool; static struct kmem_cache *nfs_cdata_cachep; static mempool_t *nfs_commit_mempool; -struct nfs_commit_data *nfs_commitdata_alloc(void) +struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail) { - struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); + struct nfs_commit_data *p; - if (p) { - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->pages); + if (never_fail) + p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); + else { + /* It is OK to do some reclaim, but not safe to wait * for anything to be returned to the pool. * mempool_alloc() cannot handle that particular combination, * so we need two separate attempts.
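The comment above describes a constraint worth spelling out: mempool_alloc() either fails fast or sleeps until an element returns to the pool, so "reclaim is allowed but waiting on the pool is not" takes two separate attempts. A generic, hedged sketch of the pattern (the pool and cache arguments are placeholders; in the patch they are nfs_commit_mempool and nfs_cdata_cachep):

/* Sketch: a non-waiting pool attempt, then a slab fallback that may
 * reclaim but never retries or warns.  Returns NULL only if both
 * attempts fail.  When the pool is slab-backed by the same cache,
 * freeing through mempool_free() copes with elements from either.
 */
static void *sketch_alloc_nowait(mempool_t *pool, struct kmem_cache *cache)
{
	void *p = mempool_alloc(pool, GFP_NOWAIT);

	if (!p)
		p = kmem_cache_alloc(cache, GFP_NOIO |
				     __GFP_NOWARN | __GFP_NORETRY);
	return p;
}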
+ */ + p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT); + if (!p) + p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO | + __GFP_NOWARN | __GFP_NORETRY); + if (!p) + return NULL; } + + memset(p, 0, sizeof(*p)); + INIT_LIST_HEAD(&p->pages); return p; } EXPORT_SYMBOL_GPL(nfs_commitdata_alloc); @@ -82,8 +96,10 @@ static struct nfs_pgio_header *nfs_writehdr_alloc(void) { struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); - if (p) + if (p) { memset(p, 0, sizeof(*p)); + p->rw_mode = FMODE_WRITE; + } return p; } @@ -263,16 +279,15 @@ int nfs_congestion_kb; static void nfs_set_page_writeback(struct page *page) { - struct nfs_server *nfss = NFS_SERVER(page_file_mapping(page)->host); + struct inode *inode = page_file_mapping(page)->host; + struct nfs_server *nfss = NFS_SERVER(inode); int ret = test_set_page_writeback(page); WARN_ON_ONCE(ret != 0); if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) { - set_bdi_congested(&nfss->backing_dev_info, - BLK_RW_ASYNC); - } + NFS_CONGESTION_ON_THRESH) + set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } static void nfs_end_page_writeback(struct nfs_page *req) @@ -285,7 +300,7 @@ static void nfs_end_page_writeback(struct nfs_page *req) end_page_writeback(req->wb_page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); } @@ -548,9 +563,21 @@ static void nfs_write_error_remove_page(struct nfs_page *req) { nfs_unlock_request(req); nfs_end_page_writeback(req); - nfs_release_request(req); generic_error_remove_page(page_file_mapping(req->wb_page), req->wb_page); + nfs_release_request(req); +} + +static bool +nfs_error_is_fatal_on_server(int err) +{ + switch (err) { + case 0: + case -ERESTARTSYS: + case -EINTR: + return false; + } + return nfs_error_is_fatal(err); } /* @@ -558,8 +585,7 @@ static void nfs_write_error_remove_page(struct nfs_page *req) * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock, - bool launder) + struct page *page, bool nonblock) { struct nfs_page *req; int ret = 0; @@ -575,19 +601,19 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); ret = 0; + /* If there is a fatal error that covers this write, just exit */ + if (nfs_error_is_fatal_on_server(req->wb_context->error)) + goto out_launder; + if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* - * Remove the problematic req upon fatal errors - * in launder case, while other dirty pages can - * still be around until they get flushed. 
+ * Remove the problematic req upon fatal errors on the server */ if (nfs_error_is_fatal(ret)) { nfs_context_set_write_error(req->wb_context, ret); - if (launder) { - nfs_write_error_remove_page(req); - goto out; - } + if (nfs_error_is_fatal_on_server(ret)) + goto out_launder; } nfs_redirty_request(req); ret = -EAGAIN; @@ -596,16 +622,18 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, NFSIOS_WRITEPAGES, 1); out: return ret; +out_launder: + nfs_write_error_remove_page(req); + return ret; } static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio, bool launder) + struct nfs_pageio_descriptor *pgio) { int ret; nfs_pageio_cond_complete(pgio, page_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE, - launder); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -617,8 +645,7 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, * Write an mmapped page to the server. */ static int nfs_writepage_locked(struct page *page, - struct writeback_control *wbc, - bool launder) + struct writeback_control *wbc) { struct nfs_pageio_descriptor pgio; struct inode *inode = page_file_mapping(page)->host; @@ -627,7 +654,7 @@ static int nfs_writepage_locked(struct page *page, nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, 0, false, &nfs_async_write_completion_ops); - err = nfs_do_writepage(page, wbc, &pgio, launder); + err = nfs_do_writepage(page, wbc, &pgio); nfs_pageio_complete(&pgio); if (err < 0) return err; @@ -640,7 +667,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) { int ret; - ret = nfs_writepage_locked(page, wbc, false); + ret = nfs_writepage_locked(page, wbc); unlock_page(page); return ret; } @@ -649,7 +676,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * { int ret; - ret = nfs_do_writepage(page, wbc, data, false); + ret = nfs_do_writepage(page, wbc, data); unlock_page(page); return ret; } @@ -1368,7 +1395,7 @@ void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, pg_ops = server->pnfs_curr_ld->pg_write_ops; #endif nfs_pageio_init(pgio, inode, pg_ops, compl_ops, &nfs_rw_write_ops, - server->wsize, ioflags); + server->wsize, ioflags, GFP_NOIO); } EXPORT_SYMBOL_GPL(nfs_pageio_init_write); @@ -1705,50 +1732,14 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, if (list_empty(head)) return 0; - data = nfs_commitdata_alloc(); - - if (!data) - goto out_bad; + data = nfs_commitdata_alloc(true); /* Set up the argument struct */ nfs_init_commit(data, head, NULL, cinfo); atomic_inc(&cinfo->mds->rpcs_out); return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode), data->mds_ops, how, 0); - out_bad: - nfs_retry_commit(head, NULL, cinfo, 0); - return -ENOMEM; -} - -int nfs_commit_file(struct file *file, struct nfs_write_verifier *verf) -{ - struct inode *inode = file_inode(file); - struct nfs_open_context *open; - struct nfs_commit_info cinfo; - struct nfs_page *req; - int ret; - - open = get_nfs_open_context(nfs_file_open_context(file)); - req = nfs_create_request(open, NULL, NULL, 0, i_size_read(inode)); - if (IS_ERR(req)) { - ret = PTR_ERR(req); - goto out_put; - } - - nfs_init_cinfo_from_inode(&cinfo, inode); - - memcpy(&req->wb_verf, verf, sizeof(struct nfs_write_verifier)); - nfs_request_add_commit_list(req, &cinfo); - ret = 
nfs_commit_inode(inode, FLUSH_SYNC); - if (ret > 0) - ret = 0; - - nfs_free_request(req); -out_put: - put_nfs_open_context(open); - return ret; } -EXPORT_SYMBOL_GPL(nfs_commit_file); /* * COMMIT call returned @@ -1808,7 +1799,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) } nfss = NFS_SERVER(data->inode); if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + clear_bdi_congested(inode_to_bdi(data->inode), BLK_RW_ASYNC); nfs_init_cinfo(&cinfo, data->inode, data->dreq); nfs_commit_end(cinfo.mds); @@ -1986,7 +1977,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) /* * Write back all requests on one page - we do this before reading it. */ -int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) +int nfs_wb_page(struct inode *inode, struct page *page) { loff_t range_start = page_file_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_SIZE - 1); @@ -2003,7 +1994,7 @@ int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc, launder); + ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out_error; continue; @@ -2108,7 +2099,6 @@ void nfs_destroy_writepagecache(void) } static const struct nfs_rw_ops nfs_rw_write_ops = { - .rw_mode = FMODE_WRITE, .rw_alloc_header = nfs_writehdr_alloc, .rw_free_header = nfs_writehdr_free, .rw_done = nfs_writeback_done, diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 92b4b41d19d2a2e23b469ce993e3a9f159f5f578..fb5213afc854e2c28edafc238f374a5a93c01d9f 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -242,10 +242,11 @@ static int nfsd4_scsi_identify_device(struct block_device *bdev, req->cmd[4] = bufflen & 0xff; req->cmd_len = COMMAND_SIZE(INQUIRY); - error = blk_execute_rq(rq->q, NULL, rq, 1); - if (error) { + blk_execute_rq(rq->q, NULL, rq, 1); + if (req->result) { pr_err("pNFS: INQUIRY 0x83 failed with: %x\n", - rq->errors); + req->result); + error = -EIO; goto out_put_request; } diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index dba2ff8eaa68e3365deac3d61df3da5641099d9e..12feac6ee2fd461a46c7b06b7a0ed0359fb4dfd1 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -334,8 +334,11 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, if (!p) return 0; p = xdr_decode_hyper(p, &args->offset); - args->count = ntohl(*p++); + + if (!xdr_argsize_check(rqstp, p)) + return 0; + len = min(args->count, max_blocksize); /* set up the kvec */ @@ -349,7 +352,7 @@ nfs3svc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, v++; } args->vlen = v; - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -358,6 +361,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, { unsigned int len, v, hdr, dlen; u32 max_blocksize = svc_max_payload(rqstp); + struct kvec *head = rqstp->rq_arg.head; + struct kvec *tail = rqstp->rq_arg.tail; p = decode_fh(p, &args->fh); if (!p) @@ -367,6 +372,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, args->count = ntohl(*p++); args->stable = ntohl(*p++); len = args->len = ntohl(*p++); + if ((void *)p > head->iov_base + head->iov_len) + return 0; /* * The count must equal the amount of data passed. */ @@ -377,9 +384,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, * Check to make sure that we got the right number of * bytes. 
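As an aside, the byte-count sanity check being reworked in this hunk reduces to one comparison, using the rounding rule restated in the comment that follows. A hedged sketch; the helper name is hypothetical:

/* The decoded byte count, rounded up to whole 4-byte XDR units,
 * must fit within the data actually received with the request.
 */
static bool sketch_v3_payload_fits(u32 len, unsigned int dlen)
{
	return XDR_QUADLEN(len) * 4 <= dlen;	/* XDR_QUADLEN rounds up */
}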
*/ - hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; - dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len - + rqstp->rq_arg.tail[0].iov_len - hdr; + hdr = (void*)p - head->iov_base; + dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr; /* * Round the length of the data which was specified up to * the next multiple of XDR units and then compare that @@ -396,7 +402,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, len = args->len = max_blocksize; } rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + rqstp->rq_vec[0].iov_len = head->iov_len - hdr; v = 0; while (len > rqstp->rq_vec[v].iov_len) { len -= rqstp->rq_vec[v].iov_len; @@ -471,6 +477,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p, /* first copy and check from the first page */ old = (char*)p; vec = &rqstp->rq_arg.head[0]; + if ((void *)old > vec->iov_base + vec->iov_len) + return 0; avail = vec->iov_len - (old - (char*)vec->iov_base); while (len && avail && *old) { *new++ = *old++; @@ -536,9 +544,11 @@ nfs3svc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, p = decode_fh(p, &args->fh); if (!p) return 0; + if (!xdr_argsize_check(rqstp, p)) + return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -564,10 +574,14 @@ nfs3svc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, args->verf = p; p += 2; args->dircount = ~0; args->count = ntohl(*p++); + + if (!xdr_argsize_check(rqstp, p)) + return 0; + args->count = min_t(u32, args->count, PAGE_SIZE); args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -585,6 +599,9 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, args->dircount = ntohl(*p++); args->count = ntohl(*p++); + if (!xdr_argsize_check(rqstp, p)) + return 0; + len = args->count = min(args->count, max_blocksize); while (len > 0) { struct page *p = *(rqstp->rq_next_page++); @@ -592,8 +609,7 @@ nfs3svc_decode_readdirplusargs(struct svc_rqst *rqstp, __be32 *p, args->buffer = page_address(p); len -= PAGE_SIZE; } - - return xdr_argsize_check(rqstp, p); + return 1; } int diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index cbeeda1e94a2fbbba61e2adeeb4f9ba89287eaf9..c453a1998e003d3e900407b266f1a15de5d5d94b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1259,7 +1259,8 @@ nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type) return NULL; } - if (!(exp->ex_layout_types & (1 << layout_type))) { + if (layout_type >= LAYOUT_TYPE_MAX || + !(exp->ex_layout_types & (1 << layout_type))) { dprintk("%s: layout type %d not supported\n", __func__, layout_type); return NULL; @@ -2489,7 +2490,7 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) { - if (op->opnum == OP_ILLEGAL) + if (op->opnum == OP_ILLEGAL || op->status == nfserr_notsupp) return op_encode_hdr_size * sizeof(__be32); BUG_ON(OPDESC(op)->op_rsize_bop == NULL); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index e9ef50addddb4489534bc07f138cd8c321d9193d..22002fb75a1827f2ca08bd1dcf23a775669fdd14 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1912,28 +1912,15 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) target->cl_clientid.cl_id = source->cl_clientid.cl_id; } -int strdup_if_nonnull(char **target, char *source) -{ - if (source) { - *target = kstrdup(source, 
GFP_KERNEL); - if (!*target) - return -ENOMEM; - } else - *target = NULL; - return 0; -} - static int copy_cred(struct svc_cred *target, struct svc_cred *source) { - int ret; + target->cr_principal = kstrdup(source->cr_principal, GFP_KERNEL); + target->cr_raw_principal = kstrdup(source->cr_raw_principal, + GFP_KERNEL); + if ((source->cr_principal && ! target->cr_principal) || + (source->cr_raw_principal && ! target->cr_raw_principal)) + return -ENOMEM; - ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal); - if (ret) - return ret; - ret = strdup_if_nonnull(&target->cr_raw_principal, - source->cr_raw_principal); - if (ret) - return ret; target->cr_flavor = source->cr_flavor; target->cr_uid = source->cr_uid; target->cr_gid = source->cr_gid; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 33017d652b1da23165ff75ef0c8abb529ac55234..26780d53a6f9412ba50cf3b3711b32d53d061723 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2831,9 +2831,14 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, } #endif /* CONFIG_NFSD_PNFS */ if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) { - status = nfsd4_encode_bitmap(xdr, NFSD_SUPPATTR_EXCLCREAT_WORD0, - NFSD_SUPPATTR_EXCLCREAT_WORD1, - NFSD_SUPPATTR_EXCLCREAT_WORD2); + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); + supp[0] &= NFSD_SUPPATTR_EXCLCREAT_WORD0; + supp[1] &= NFSD_SUPPATTR_EXCLCREAT_WORD1; + supp[2] &= NFSD_SUPPATTR_EXCLCREAT_WORD2; + + status = nfsd4_encode_bitmap(xdr, supp[0], supp[1], supp[2]); if (status) goto out; } @@ -4119,8 +4124,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_getdeviceinfo *gdev) { struct xdr_stream *xdr = &resp->xdr; - const struct nfsd4_layout_ops *ops = - nfsd4_layout_ops[gdev->gd_layout_type]; + const struct nfsd4_layout_ops *ops; u32 starting_len = xdr->buf->len, needed_len; __be32 *p; @@ -4137,6 +4141,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr, /* If maxcount is 0 then just update notifications */ if (gdev->gd_maxcount != 0) { + ops = nfsd4_layout_ops[gdev->gd_layout_type]; nfserr = ops->encode_getdeviceinfo(xdr, gdev); if (nfserr) { /* @@ -4189,8 +4194,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_layoutget *lgp) { struct xdr_stream *xdr = &resp->xdr; - const struct nfsd4_layout_ops *ops = - nfsd4_layout_ops[lgp->lg_layout_type]; + const struct nfsd4_layout_ops *ops; __be32 *p; dprintk("%s: err %d\n", __func__, nfserr); @@ -4213,6 +4217,7 @@ nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr, *p++ = cpu_to_be32(lgp->lg_seg.iomode); *p++ = cpu_to_be32(lgp->lg_layout_type); + ops = nfsd4_layout_ops[lgp->lg_layout_type]; nfserr = ops->encode_layoutget(xdr, lgp); out: kfree(lgp->lg_content); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 8bf8f667a8cf2fe8359f74b41497bc9436ca0efb..6493df6b1bd5f192646d1f63b2230af840833651 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1146,7 +1146,7 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) static int nfsd_fill_super(struct super_block * sb, void * data, int silent) { - static struct tree_descr nfsd_files[] = { + static const struct tree_descr nfsd_files[] = { [NFSD_List] = {"exports", &exports_nfsd_operations, S_IRUGO}, [NFSD_Export_features] = {"export_features", &export_features_operations, S_IRUGO}, diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 
31e1f959345715a59f8f6b9d9ec00a0ec00fece5..59979f0bbd4bf255f5ea6f8fbd33cb9b2a5aa073 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -747,6 +747,37 @@ static __be32 map_new_errors(u32 vers, __be32 nfserr) return nfserr; } +/* + * A write procedure can have a large argument, and a read procedure can + * have a large reply, but no NFSv2 or NFSv3 procedure has argument and + * reply that can both be larger than a page. The xdr code has taken + * advantage of this assumption to be sloppy about bounds checking in + * some cases. Pending a rewrite of the NFSv2/v3 xdr code to fix that + * problem, we enforce these assumptions here: + */ +static bool nfs_request_too_big(struct svc_rqst *rqstp, + struct svc_procedure *proc) +{ + /* + * The ACL code has more careful bounds-checking and is not + * susceptible to this problem: + */ + if (rqstp->rq_prog != NFS_PROGRAM) + return false; + /* + * Ditto NFSv4 (which can in theory have argument and reply both + * more than a page): + */ + if (rqstp->rq_vers >= 4) + return false; + /* The reply will be small, we're OK: */ + if (proc->pc_xdrressize > 0 && + proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE)) + return false; + + return rqstp->rq_arg.len > PAGE_SIZE; +} + int nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) { @@ -759,6 +790,11 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp) rqstp->rq_vers, rqstp->rq_proc); proc = rqstp->rq_procinfo; + if (nfs_request_too_big(rqstp, proc)) { + dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers); + *statp = rpc_garbage_args; + return 1; + } /* * Give the xdr decoder a chance to change this if it wants * (necessary in the NFSv4.0 compound case) diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 41b468a6a90f807fe3f3d2e4ceeaa9f8c7ae0f8c..6a4947a3f4fa82be4118e4ed538a171118f4baa8 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -257,6 +257,9 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, len = args->count = ntohl(*p++); p++; /* totalcount - unused */ + if (!xdr_argsize_check(rqstp, p)) + return 0; + len = min_t(unsigned int, len, NFSSVC_MAXBLKSIZE_V2); /* set up somewhere to store response. @@ -272,7 +275,7 @@ nfssvc_decode_readargs(struct svc_rqst *rqstp, __be32 *p, v++; } args->vlen = v; - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -280,6 +283,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_writeargs *args) { unsigned int len, hdr, dlen; + struct kvec *head = rqstp->rq_arg.head; int v; p = decode_fh(p, &args->fh); if (!p) return 0; @@ -300,9 +304,10 @@ * Check to make sure that we got the right number of * bytes.
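The v2 decoder fixes in this file repeat one defensive idea: after consuming fixed-size fields, confirm the cursor still lies inside the received head buffer before trusting any decoded length. A condensed sketch; the function name is hypothetical, and the real hunks use xdr_argsize_check() or an explicit comparison:

/* A cursor past the end of the head iovec means the client sent a
 * short request; decoding must fail instead of reading garbage.
 */
static bool sketch_cursor_in_head(struct svc_rqst *rqstp, __be32 *p)
{
	struct kvec *head = rqstp->rq_arg.head;

	return (void *)p <= head->iov_base + head->iov_len;
}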
*/ - hdr = (void*)p - rqstp->rq_arg.head[0].iov_base; - dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len - - hdr; + hdr = (void*)p - head->iov_base; + if (hdr > head->iov_len) + return 0; + dlen = head->iov_len + rqstp->rq_arg.page_len - hdr; /* * Round the length of the data which was specified up to @@ -316,7 +321,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p, return 0; rqstp->rq_vec[0].iov_base = (void*)p; - rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr; + rqstp->rq_vec[0].iov_len = head->iov_len - hdr; v = 0; while (len > rqstp->rq_vec[v].iov_len) { len -= rqstp->rq_vec[v].iov_len; @@ -360,9 +365,11 @@ nfssvc_decode_readlinkargs(struct svc_rqst *rqstp, __be32 *p, struct nfsd_readli p = decode_fh(p, &args->fh); if (!p) return 0; + if (!xdr_argsize_check(rqstp, p)) + return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } int @@ -400,9 +407,11 @@ nfssvc_decode_readdirargs(struct svc_rqst *rqstp, __be32 *p, args->cookie = ntohl(*p++); args->count = ntohl(*p++); args->count = min_t(u32, args->count, PAGE_SIZE); + if (!xdr_argsize_check(rqstp, p)) + return 0; args->buffer = page_address(*(rqstp->rq_next_page++)); - return xdr_argsize_check(rqstp, p); + return 1; } /* diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 19d50f600e8d48c6f493130076606a6213de258d..2be32955d7f27d285d0cba1dad39e88c885dc7e0 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -94,6 +94,12 @@ nfsd_cross_mnt(struct svc_rqst *rqstp, struct dentry **dpp, err = follow_down(&path); if (err < 0) goto out; + if (path.mnt == exp->ex_path.mnt && path.dentry == dentry && + nfsd_mountpoint(dentry, exp) == 2) { + /* This is only a mountpoint in some other namespace */ + path_put(&path); + goto out; + } exp2 = rqst_exp_get_by_name(rqstp, &path); if (IS_ERR(exp2)) { @@ -167,16 +173,26 @@ static int nfsd_lookup_parent(struct svc_rqst *rqstp, struct dentry *dparent, st /* * For nfsd purposes, we treat V4ROOT exports as though there was an * export at *every* directory. + * We return: + * '1' if this dentry *must* be an export point, + * '2' if it might be, if there is really a mount here, and + * '0' if there is no chance of an export point here. */ int nfsd_mountpoint(struct dentry *dentry, struct svc_export *exp) { - if (d_mountpoint(dentry)) + if (!d_inode(dentry)) + return 0; + if (exp->ex_flags & NFSEXP_V4ROOT) return 1; if (nfsd4_is_junction(dentry)) return 1; - if (!(exp->ex_flags & NFSEXP_V4ROOT)) - return 0; - return d_inode(dentry) != NULL; + if (d_mountpoint(dentry)) + /* + * Might only be a mountpoint in a different namespace, + * but we need to check. 
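To make the 0/1/2 contract documented above concrete, a hypothetical consumer could branch as below; follow_here() is a placeholder standing in for the follow_down() handling done in nfsd_cross_mnt(), not an existing helper:

/* Only a return of 2 requires actually following the mount to learn
 * whether something is really mounted here in this namespace.
 */
static bool sketch_is_export_point(struct dentry *dentry,
				   struct svc_export *exp)
{
	switch (nfsd_mountpoint(dentry, exp)) {
	case 1:
		return true;		/* V4ROOT export or junction */
	case 2:
		return follow_here(dentry);	/* placeholder check */
	default:
		return false;		/* 0: no export point possible */
	}
}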
+ */ + return 2; + return 0; } __be32 @@ -1004,7 +1020,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, else err = nfserrno(host_err); if (test_bit(RQ_LOCAL, &rqstp->rq_flags)) - tsk_restore_flags(current, pflags, PF_LESS_THROTTLE); + current_restore_flags(pflags, PF_LESS_THROTTLE); return err; } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index e1872f36147f590765cf1d7548cce2e0d5243752..926682981d61f0bd1671e9f49c1a0a8b80e72f03 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1068,7 +1068,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; - sb->s_bdi = bdev_get_queue(sb->s_bdev)->backing_dev_info; + sb->s_bdi = bdi_get(sb->s_bdev->bd_bdi); err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/notify/Makefile b/fs/notify/Makefile index 96d3420d0242ba6ab1457c3312a395a57ef3ad86..3e969ae91b60dd1bbc55cb51698387f2c01ba412 100644 --- a/fs/notify/Makefile +++ b/fs/notify/Makefile @@ -1,5 +1,5 @@ -obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o inode_mark.o \ - mark.o vfsmount_mark.o fdinfo.o +obj-$(CONFIG_FSNOTIFY) += fsnotify.o notification.o group.o mark.o \ + fdinfo.o obj-y += dnotify/ obj-y += inotify/ diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 5a4ec309e28375eb60981f9c32314e1ae28c1efd..2430a0415995d9e8d056b2c24eda8b749a79998f 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -52,7 +52,7 @@ struct dnotify_mark { */ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) { - __u32 new_mask, old_mask; + __u32 new_mask = 0; struct dnotify_struct *dn; struct dnotify_mark *dn_mark = container_of(fsn_mark, struct dnotify_mark, @@ -60,17 +60,13 @@ static void dnotify_recalc_inode_mask(struct fsnotify_mark *fsn_mark) assert_spin_locked(&fsn_mark->lock); - old_mask = fsn_mark->mask; - new_mask = 0; for (dn = dn_mark->dn; dn != NULL; dn = dn->dn_next) new_mask |= (dn->dn_mask & ~FS_DN_MULTISHOT); - fsnotify_set_mark_mask_locked(fsn_mark, new_mask); - - if (old_mask == new_mask) + if (fsn_mark->mask == new_mask) return; + fsn_mark->mask = new_mask; - if (fsn_mark->inode) - fsnotify_recalc_inode_mask(fsn_mark->inode); + fsnotify_recalc_mask(fsn_mark->connector); } /* @@ -86,7 +82,8 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { struct dnotify_mark *dn_mark; struct dnotify_struct *dn; @@ -138,6 +135,7 @@ static void dnotify_free_mark(struct fsnotify_mark *fsn_mark) static struct fsnotify_ops dnotify_fsnotify_ops = { .handle_event = dnotify_handle_event, + .free_mark = dnotify_free_mark, }; /* @@ -160,7 +158,7 @@ void dnotify_flush(struct file *filp, fl_owner_t id) if (!S_ISDIR(inode->i_mode)) return; - fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); if (!fsn_mark) return; dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); @@ -308,7 +306,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) /* set up the new_fsn_mark and new_dn_mark */ new_fsn_mark = &new_dn_mark->fsn_mark; - fsnotify_init_mark(new_fsn_mark, dnotify_free_mark); + fsnotify_init_mark(new_fsn_mark, dnotify_group); new_fsn_mark->mask = mask; new_dn_mark->dn = 
NULL; @@ -316,13 +314,12 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) mutex_lock(&dnotify_group->mark_mutex); /* add the new_fsn_mark or find an old one. */ - fsn_mark = fsnotify_find_inode_mark(dnotify_group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, dnotify_group); if (fsn_mark) { dn_mark = container_of(fsn_mark, struct dnotify_mark, fsn_mark); spin_lock(&fsn_mark->lock); } else { - fsnotify_add_mark_locked(new_fsn_mark, dnotify_group, inode, - NULL, 0); + fsnotify_add_mark_locked(new_fsn_mark, inode, NULL, 0); spin_lock(&new_fsn_mark->lock); fsn_mark = new_fsn_mark; dn_mark = new_dn_mark; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e5f7e47de68e4bf3cc75dc1cf03e29fed3fda8be..2fa99aeaa0959e43e80db466a95e7a09ea0dbfa7 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -57,14 +57,26 @@ static int fanotify_merge(struct list_head *list, struct fsnotify_event *event) #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS static int fanotify_get_response(struct fsnotify_group *group, - struct fanotify_perm_event_info *event) + struct fanotify_perm_event_info *event, + struct fsnotify_iter_info *iter_info) { int ret; pr_debug("%s: group=%p event=%p\n", __func__, group, event); + /* + * fsnotify_prepare_user_wait() fails if we race with mark deletion. + * Just let the operation pass in that case. + */ + if (!fsnotify_prepare_user_wait(iter_info)) { + event->response = FAN_ALLOW; + goto out; + } + wait_event(group->fanotify_data.access_waitq, event->response); + fsnotify_finish_user_wait(iter_info); +out: /* userspace responded, convert to something usable */ switch (event->response) { case FAN_ALLOW: @@ -174,7 +186,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { int ret = 0; struct fanotify_event_info *event; @@ -215,7 +228,8 @@ static int fanotify_handle_event(struct fsnotify_group *group, #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS if (mask & FAN_ALL_PERM_EVENTS) { - ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event)); + ret = fanotify_get_response(group, FANOTIFY_PE(fsn_event), + iter_info); fsnotify_destroy_event(group, fsn_event); } #endif @@ -248,8 +262,14 @@ static void fanotify_free_event(struct fsnotify_event *fsn_event) kmem_cache_free(fanotify_event_cachep, event); } +static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + kmem_cache_free(fanotify_mark_cache, fsn_mark); +} + const struct fsnotify_ops fanotify_fsnotify_ops = { .handle_event = fanotify_handle_event, .free_group_priv = fanotify_free_group_priv, .free_event = fanotify_free_event, + .free_mark = fanotify_free_mark, }; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 4500a74f8d3873173ddc13a11ef7e5ab4c03cfa5..4eb6f5efa282527f4d2d4824fcf80267f0b68be7 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -2,6 +2,7 @@ #include #include +extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 2b37f27858345ccf7bca6747768df11680067a1e..907a481ac78158612ce918e4a1f4c46ea860fb30 100644 --- 
a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -41,7 +41,7 @@ extern const struct fsnotify_ops fanotify_fsnotify_ops; -static struct kmem_cache *fanotify_mark_cache __read_mostly; +struct kmem_cache *fanotify_mark_cache __read_mostly; struct kmem_cache *fanotify_event_cachep __read_mostly; struct kmem_cache *fanotify_perm_event_cachep __read_mostly; @@ -295,27 +295,37 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, } ret = copy_event_to_user(group, kevent, buf); + if (unlikely(ret == -EOPENSTALE)) { + /* + * We cannot report events with stale fd so drop it. + * Setting ret to 0 will continue the event loop and + * do the right thing if there are no more events to + * read (i.e. return bytes read, -EAGAIN or wait). + */ + ret = 0; + } + /* * Permission events get queued to wait for response. Other * events can be destroyed now. */ if (!(kevent->mask & FAN_ALL_PERM_EVENTS)) { fsnotify_destroy_event(group, kevent); - if (ret < 0) - break; } else { #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS - if (ret < 0) { + if (ret <= 0) { FANOTIFY_PE(kevent)->response = FAN_DENY; wake_up(&group->fanotify_data.access_waitq); - break; + } else { + spin_lock(&group->notification_lock); + list_add_tail(&kevent->list, + &group->fanotify_data.access_list); + spin_unlock(&group->notification_lock); } - spin_lock(&group->notification_lock); - list_add_tail(&kevent->list, - &group->fanotify_data.access_list); - spin_unlock(&group->notification_lock); #endif } + if (ret < 0) + break; buf += ret; count -= ret; } @@ -445,11 +455,6 @@ static const struct file_operations fanotify_fops = { .llseek = noop_llseek, }; -static void fanotify_free_mark(struct fsnotify_mark *fsn_mark) -{ - kmem_cache_free(fanotify_mark_cache, fsn_mark); -} - static int fanotify_find_path(int dfd, const char __user *filename, struct path *path, unsigned int flags) { @@ -511,13 +516,12 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark, tmask &= ~FAN_ONDIR; oldmask = fsn_mark->mask; - fsnotify_set_mark_mask_locked(fsn_mark, tmask); + fsn_mark->mask = tmask; } else { __u32 tmask = fsn_mark->ignored_mask & ~mask; if (flags & FAN_MARK_ONDIR) tmask &= ~FAN_ONDIR; - - fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + fsn_mark->ignored_mask = tmask; } *destroy = !(fsn_mark->mask | fsn_mark->ignored_mask); spin_unlock(&fsn_mark->lock); @@ -534,7 +538,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, int destroy_mark; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks, + group); if (!fsn_mark) { mutex_unlock(&group->mark_mutex); return -ENOENT; @@ -542,6 +547,8 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); + if (removed & real_mount(mnt)->mnt_fsnotify_mask) + fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks); if (destroy_mark) fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); @@ -549,9 +556,6 @@ static int fanotify_remove_vfsmount_mark(struct fsnotify_group *group, fsnotify_free_mark(fsn_mark); fsnotify_put_mark(fsn_mark); - if (removed & real_mount(mnt)->mnt_fsnotify_mask) - fsnotify_recalc_vfsmount_mask(mnt); - return 0; } @@ -564,7 +568,7 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, int destroy_mark; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_inode_mark(group, 
inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) { mutex_unlock(&group->mark_mutex); return -ENOENT; @@ -572,16 +576,16 @@ static int fanotify_remove_inode_mark(struct fsnotify_group *group, removed = fanotify_mark_remove_from_mask(fsn_mark, mask, flags, &destroy_mark); + if (removed & inode->i_fsnotify_mask) + fsnotify_recalc_mask(inode->i_fsnotify_marks); if (destroy_mark) fsnotify_detach_mark(fsn_mark); mutex_unlock(&group->mark_mutex); if (destroy_mark) fsnotify_free_mark(fsn_mark); - /* matches the fsnotify_find_inode_mark() */ + /* matches the fsnotify_find_mark() */ fsnotify_put_mark(fsn_mark); - if (removed & inode->i_fsnotify_mask) - fsnotify_recalc_inode_mask(inode); return 0; } @@ -600,13 +604,13 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark, tmask |= FAN_ONDIR; oldmask = fsn_mark->mask; - fsnotify_set_mark_mask_locked(fsn_mark, tmask); + fsn_mark->mask = tmask; } else { __u32 tmask = fsn_mark->ignored_mask | mask; if (flags & FAN_MARK_ONDIR) tmask |= FAN_ONDIR; - fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask); + fsn_mark->ignored_mask = tmask; if (flags & FAN_MARK_IGNORED_SURV_MODIFY) fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY; } @@ -629,8 +633,8 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, if (!mark) return ERR_PTR(-ENOMEM); - fsnotify_init_mark(mark, fanotify_free_mark); - ret = fsnotify_add_mark_locked(mark, group, inode, mnt, 0); + fsnotify_init_mark(mark, group); + ret = fsnotify_add_mark_locked(mark, inode, mnt, 0); if (ret) { fsnotify_put_mark(mark); return ERR_PTR(ret); @@ -648,7 +652,8 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, __u32 added; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_vfsmount_mark(group, mnt); + fsn_mark = fsnotify_find_mark(&real_mount(mnt)->mnt_fsnotify_marks, + group); if (!fsn_mark) { fsn_mark = fanotify_add_new_mark(group, NULL, mnt); if (IS_ERR(fsn_mark)) { @@ -657,10 +662,9 @@ static int fanotify_add_vfsmount_mark(struct fsnotify_group *group, } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - mutex_unlock(&group->mark_mutex); - if (added & ~real_mount(mnt)->mnt_fsnotify_mask) - fsnotify_recalc_vfsmount_mask(mnt); + fsnotify_recalc_mask(real_mount(mnt)->mnt_fsnotify_marks); + mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); return 0; @@ -686,7 +690,7 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, return 0; mutex_lock(&group->mark_mutex); - fsn_mark = fsnotify_find_inode_mark(group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) { fsn_mark = fanotify_add_new_mark(group, inode, NULL); if (IS_ERR(fsn_mark)) { @@ -695,10 +699,9 @@ static int fanotify_add_inode_mark(struct fsnotify_group *group, } } added = fanotify_mark_add_to_mask(fsn_mark, mask, flags); - mutex_unlock(&group->mark_mutex); - if (added & ~inode->i_fsnotify_mask) - fsnotify_recalc_inode_mask(inode); + fsnotify_recalc_mask(inode->i_fsnotify_marks); + mutex_unlock(&group->mark_mutex); fsnotify_put_mark(fsn_mark); return 0; diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index fd98e5100cabedaa2a71ecef4bd8997334ae528e..dd63aa9a6f9a3fe2696496fee7afa17a4ea45210 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -76,12 +76,11 @@ static void inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) struct inotify_inode_mark *inode_mark; struct inode *inode; - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) || - 
!(mark->flags & FSNOTIFY_MARK_FLAG_INODE)) + if (!(mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE)) return; inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark); - inode = igrab(mark->inode); + inode = igrab(mark->connector->inode); if (inode) { /* * IN_ALL_EVENTS represents all of the mask bits @@ -113,14 +112,11 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) unsigned int mflags = 0; struct inode *inode; - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) - return; - if (mark->flags & FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY) mflags |= FAN_MARK_IGNORED_SURV_MODIFY; - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { - inode = igrab(mark->inode); + if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_INODE) { + inode = igrab(mark->connector->inode); if (!inode) return; seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ", @@ -129,8 +125,8 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) show_mark_fhandle(m, inode); seq_putc(m, '\n'); iput(inode); - } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) { - struct mount *mnt = real_mount(mark->mnt); + } else if (mark->connector->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + struct mount *mnt = real_mount(mark->connector->mnt); seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n", mnt->mnt_id, mflags, mark->mask, mark->ignored_mask); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index b41515d3f0815246185b264532d71cc43824ef9a..01a9f0f007d4518563fd9ad1f06d3b48e2aa872a 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -41,6 +41,63 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) fsnotify_clear_marks_by_mount(mnt); } +/** + * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. + * @sb: superblock being unmounted. + * + * Called during unmount with no locks held, so needs to be safe against + * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. + */ +void fsnotify_unmount_inodes(struct super_block *sb) +{ + struct inode *inode, *iput_inode = NULL; + + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + /* + * We cannot __iget() an inode in state I_FREEING, + * I_WILL_FREE, or I_NEW which is fine because by that point + * the inode cannot have any associated watches. + */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { + spin_unlock(&inode->i_lock); + continue; + } + + /* + * If i_count is zero, the inode cannot have any watches and + * doing an __iget/iput with MS_ACTIVE clear would actually + * evict all inodes with zero i_count from icache which is + * unnecessarily violent and may in fact be illegal to do. + */ + if (!atomic_read(&inode->i_count)) { + spin_unlock(&inode->i_lock); + continue; + } + + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&sb->s_inode_list_lock); + + if (iput_inode) + iput(iput_inode); + + /* for each watch, send FS_UNMOUNT and then remove it */ + fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); + + fsnotify_inode_delete(inode); + + iput_inode = inode; + + spin_lock(&sb->s_inode_list_lock); + } + spin_unlock(&sb->s_inode_list_lock); + + if (iput_inode) + iput(iput_inode); +} + /* * Given an inode, first check if we care what happens to our children. Inotify * and dnotify both tell their parents about events. 
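The locking pattern in fsnotify_unmount_inodes() above generalizes to any blocking walk of sb->s_inodes: pin the current inode, drop the list lock for the blocking work, and defer iput() of the previous inode (which may itself sleep) until the lock is retaken. A skeleton sketch with the per-inode work abstracted out; the function name is illustrative:

static void sketch_walk_sb_inodes(struct super_block *sb,
				  void (*work)(struct inode *))
{
	struct inode *inode, *iput_inode = NULL;

	spin_lock(&sb->s_inode_list_lock);
	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
		spin_lock(&inode->i_lock);
		/* skip inodes being freed, or with no users to pin */
		if ((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
		    !atomic_read(&inode->i_count)) {
			spin_unlock(&inode->i_lock);
			continue;
		}
		__iget(inode);			/* pin before dropping locks */
		spin_unlock(&inode->i_lock);
		spin_unlock(&sb->s_inode_list_lock);

		if (iput_inode)
			iput(iput_inode);	/* safe: no locks held */
		work(inode);			/* may block */
		iput_inode = inode;		/* stays pinned one more pass */

		spin_lock(&sb->s_inode_list_lock);
	}
	spin_unlock(&sb->s_inode_list_lock);
	if (iput_inode)
		iput(iput_inode);
}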
If we care about any event @@ -127,7 +184,8 @@ static int send_to_group(struct inode *to_tell, struct fsnotify_mark *vfsmount_mark, __u32 mask, const void *data, int data_is, u32 cookie, - const unsigned char *file_name) + const unsigned char *file_name, + struct fsnotify_iter_info *iter_info) { struct fsnotify_group *group = NULL; __u32 inode_test_mask = 0; @@ -178,7 +236,7 @@ static int send_to_group(struct inode *to_tell, return group->ops->handle_event(group, to_tell, inode_mark, vfsmount_mark, mask, data, data_is, - file_name, cookie); + file_name, cookie, iter_info); } /* @@ -193,8 +251,10 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; struct fsnotify_mark *inode_mark = NULL, *vfsmount_mark = NULL; struct fsnotify_group *inode_group, *vfsmount_group; + struct fsnotify_mark_connector *inode_conn, *vfsmount_conn; + struct fsnotify_iter_info iter_info; struct mount *mnt; - int idx, ret = 0; + int ret = 0; /* global tests shouldn't care about events on child only the specific event */ __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); @@ -210,8 +270,8 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, * SRCU because we have no references to any objects and do not * need SRCU to keep them "alive". */ - if (hlist_empty(&to_tell->i_fsnotify_marks) && - (!mnt || hlist_empty(&mnt->mnt_fsnotify_marks))) + if (!to_tell->i_fsnotify_marks && + (!mnt || !mnt->mnt_fsnotify_marks)) return 0; /* * if this is a modify event we may need to clear the ignored masks @@ -223,19 +283,30 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, !(mnt && test_mask & mnt->mnt_fsnotify_mask)) return 0; - idx = srcu_read_lock(&fsnotify_mark_srcu); + iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); if ((mask & FS_MODIFY) || - (test_mask & to_tell->i_fsnotify_mask)) - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + (test_mask & to_tell->i_fsnotify_mask)) { + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, &fsnotify_mark_srcu); + if (inode_conn) + inode_node = srcu_dereference(inode_conn->list.first, + &fsnotify_mark_srcu); + } if (mnt && ((mask & FS_MODIFY) || (test_mask & mnt->mnt_fsnotify_mask))) { - vfsmount_node = srcu_dereference(mnt->mnt_fsnotify_marks.first, - &fsnotify_mark_srcu); - inode_node = srcu_dereference(to_tell->i_fsnotify_marks.first, + inode_conn = srcu_dereference(to_tell->i_fsnotify_marks, &fsnotify_mark_srcu); + if (inode_conn) + inode_node = srcu_dereference(inode_conn->list.first, + &fsnotify_mark_srcu); + vfsmount_conn = srcu_dereference(mnt->mnt_fsnotify_marks, + &fsnotify_mark_srcu); + if (vfsmount_conn) + vfsmount_node = srcu_dereference( + vfsmount_conn->list.first, + &fsnotify_mark_srcu); } /* @@ -272,8 +343,13 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, vfsmount_mark = NULL; } } + + iter_info.inode_mark = inode_mark; + iter_info.vfsmount_mark = vfsmount_mark; + ret = send_to_group(to_tell, inode_mark, vfsmount_mark, mask, - data, data_is, cookie, file_name); + data, data_is, cookie, file_name, + &iter_info); if (ret && (mask & ALL_FSNOTIFY_PERM_EVENTS)) goto out; @@ -287,12 +363,14 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, } ret = 0; out: - srcu_read_unlock(&fsnotify_mark_srcu, idx); + srcu_read_unlock(&fsnotify_mark_srcu, iter_info.srcu_idx); return ret; } EXPORT_SYMBOL_GPL(fsnotify); +extern struct kmem_cache 
*fsnotify_mark_connector_cachep; + static __init int fsnotify_init(void) { int ret; @@ -303,6 +381,9 @@ static __init int fsnotify_init(void) if (ret) panic("initializing fsnotify_mark_srcu"); + fsnotify_mark_connector_cachep = KMEM_CACHE(fsnotify_mark_connector, + SLAB_PANIC); + return 0; } core_initcall(fsnotify_init); diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 0a3bc2cf192cfdf94f51e3a814c175f26a586e74..bf012e8ecd14be9fb3d9a526eba95baa9fe7c9be 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -8,60 +8,36 @@ #include "../mount.h" +struct fsnotify_iter_info { + struct fsnotify_mark *inode_mark; + struct fsnotify_mark *vfsmount_mark; + int srcu_idx; +}; + /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); /* protects reads of inode and vfsmount marks list */ extern struct srcu_struct fsnotify_mark_srcu; -/* Calculate mask of events for a list of marks */ -extern u32 fsnotify_recalc_mask(struct hlist_head *head); - /* compare two groups for sorting of marks lists */ extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); -extern void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *fsn_mark, - __u32 mask); -/* Add mark to a proper place in mark list */ -extern int fsnotify_add_mark_list(struct hlist_head *head, - struct fsnotify_mark *mark, - int allow_dups); -/* add a mark to an inode */ -extern int fsnotify_add_inode_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct inode *inode, - int allow_dups); -/* add a mark to a vfsmount */ -extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct vfsmount *mnt, - int allow_dups); - -/* vfsmount specific destruction of a mark */ -extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); -/* inode specific destruction of a mark */ -extern void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark); -/* Find mark belonging to given group in the list of marks */ -extern struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, - struct fsnotify_group *group); -/* Destroy all marks in the given list protected by 'lock' */ -extern void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock); +/* Destroy all marks connected via given connector */ +extern void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp); /* run the list of all marks associated with inode and destroy them */ static inline void fsnotify_clear_marks_by_inode(struct inode *inode) { - fsnotify_destroy_marks(&inode->i_fsnotify_marks, &inode->i_lock); + fsnotify_destroy_marks(&inode->i_fsnotify_marks); } /* run the list of all marks associated with vfsmount and destroy them */ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { - fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks, - &mnt->mnt_root->d_lock); + fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } -/* prepare for freeing all marks associated with given group */ -extern void fsnotify_detach_group_marks(struct fsnotify_group *group); -/* - * wait for fsnotify_mark_srcu period to end and free all marks in destroy_list - */ -extern void fsnotify_mark_destroy_list(void); +/* Wait until all marks queued for destruction are destroyed */ +extern void fsnotify_wait_marks_destroyed(void); /* * update the dentry->d_flags of all of inode's children to indicate if inode cares diff --git a/fs/notify/group.c b/fs/notify/group.c 
index fbe3cbebec1671247231ba80b79694ef5b03157e..32357534de18db462a2f154e2e361528471acd0c 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -66,14 +66,23 @@ void fsnotify_destroy_group(struct fsnotify_group *group)
 	 */
 	fsnotify_group_stop_queueing(group);
 
-	/* clear all inode marks for this group, attach them to destroy_list */
-	fsnotify_detach_group_marks(group);
+	/* Clear all marks for this group and queue them for destruction */
+	fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_ALL_TYPES);
 
 	/*
-	 * Wait for fsnotify_mark_srcu period to end and free all marks in
-	 * destroy_list
+	 * Some marks can still be pinned when waiting for response from
+	 * userspace. Wait for those now. fsnotify_prepare_user_wait() will
+	 * not succeed now so this wait is race-free.
 	 */
-	fsnotify_mark_destroy_list();
+	wait_event(group->notification_waitq, !atomic_read(&group->user_waits));
+
+	/*
+	 * Wait until all marks get really destroyed. We could actually destroy
+	 * them ourselves instead of waiting for worker to do it, however that
+	 * would be racy as worker can already be processing some marks before
+	 * we even entered fsnotify_destroy_group().
+	 */
+	fsnotify_wait_marks_destroyed();
 
 	/*
 	 * Since we have waited for fsnotify_mark_srcu in
@@ -124,6 +133,7 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 	/* set to 0 when there a no external references to this group */
 	atomic_set(&group->refcnt, 1);
 	atomic_set(&group->num_marks, 0);
+	atomic_set(&group->user_waits, 0);
 
 	spin_lock_init(&group->notification_lock);
 	INIT_LIST_HEAD(&group->notification_list);
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
deleted file mode 100644
index a3645249f7ecfa4cbdae8434bb87b47ffd02dbb1..0000000000000000000000000000000000000000
--- a/fs/notify/inode_mark.c
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-
-#include <linux/atomic.h>
-
-#include <linux/fsnotify_backend.h>
-#include "fsnotify.h"
-
-#include "../internal.h"
-
-/*
- * Recalculate the inode->i_fsnotify_mask, or the mask of all FS_* event types
- * any notifier is interested in hearing for this inode.
- */ -void fsnotify_recalc_inode_mask(struct inode *inode) -{ - spin_lock(&inode->i_lock); - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); - spin_unlock(&inode->i_lock); - - __fsnotify_update_child_dentry_flags(inode); -} - -void fsnotify_destroy_inode_mark(struct fsnotify_mark *mark) -{ - struct inode *inode = mark->inode; - - BUG_ON(!mutex_is_locked(&mark->group->mark_mutex)); - assert_spin_locked(&mark->lock); - - spin_lock(&inode->i_lock); - - hlist_del_init_rcu(&mark->obj_list); - mark->inode = NULL; - - /* - * this mark is now off the inode->i_fsnotify_marks list and we - * hold the inode->i_lock, so this is the perfect time to update the - * inode->i_fsnotify_mask - */ - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); - spin_unlock(&inode->i_lock); -} - -/* - * Given a group clear all of the inode marks associated with that group. - */ -void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) -{ - fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_INODE); -} - -/* - * given a group and inode, find the mark associated with that combination. - * if found take a reference to that mark and return it, else return NULL - */ -struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, - struct inode *inode) -{ - struct fsnotify_mark *mark; - - spin_lock(&inode->i_lock); - mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); - spin_unlock(&inode->i_lock); - - return mark; -} - -/* - * If we are setting a mark mask on an inode mark we should pin the inode - * in memory. - */ -void fsnotify_set_inode_mark_mask_locked(struct fsnotify_mark *mark, - __u32 mask) -{ - struct inode *inode; - - assert_spin_locked(&mark->lock); - - if (mask && - mark->inode && - !(mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) { - mark->flags |= FSNOTIFY_MARK_FLAG_OBJECT_PINNED; - inode = igrab(mark->inode); - /* - * we shouldn't be able to get here if the inode wasn't - * already safely held in memory. But bug in case it - * ever is wrong. - */ - BUG_ON(!inode); - } -} - -/* - * Attach an initialized mark to a given inode. - * These marks may be used for the fsnotify backend to determine which - * event types should be delivered to which group and for which inodes. These - * marks are ordered according to priority, highest number first, and then by - * the group's location in memory. - */ -int fsnotify_add_inode_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct inode *inode, - int allow_dups) -{ - int ret; - - mark->flags |= FSNOTIFY_MARK_FLAG_INODE; - - BUG_ON(!mutex_is_locked(&group->mark_mutex)); - assert_spin_locked(&mark->lock); - - spin_lock(&inode->i_lock); - mark->inode = inode; - ret = fsnotify_add_mark_list(&inode->i_fsnotify_marks, mark, - allow_dups); - inode->i_fsnotify_mask = fsnotify_recalc_mask(&inode->i_fsnotify_marks); - spin_unlock(&inode->i_lock); - - return ret; -} - -/** - * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. - * @sb: superblock being unmounted. - * - * Called during unmount with no locks held, so needs to be safe against - * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. 
- */ -void fsnotify_unmount_inodes(struct super_block *sb) -{ - struct inode *inode, *iput_inode = NULL; - - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - /* - * We cannot __iget() an inode in state I_FREEING, - * I_WILL_FREE, or I_NEW which is fine because by that point - * the inode cannot have any associated watches. - */ - spin_lock(&inode->i_lock); - if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) { - spin_unlock(&inode->i_lock); - continue; - } - - /* - * If i_count is zero, the inode cannot have any watches and - * doing an __iget/iput with MS_ACTIVE clear would actually - * evict all inodes with zero i_count from icache which is - * unnecessarily violent and may in fact be illegal to do. - */ - if (!atomic_read(&inode->i_count)) { - spin_unlock(&inode->i_lock); - continue; - } - - __iget(inode); - spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); - - if (iput_inode) - iput(iput_inode); - - /* for each watch, send FS_UNMOUNT and then remove it */ - fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); - - fsnotify_inode_delete(inode); - - iput_inode = inode; - - spin_lock(&sb->s_inode_list_lock); - } - spin_unlock(&sb->s_inode_list_lock); - - if (iput_inode) - iput(iput_inode); -} diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index 7c461fd49c4ccf1f81d87d1eddcacd16c3c3ab86..9ff67b61da8a6abb3e573ae7b1f31c71235b4139 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -27,9 +27,11 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie); + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info); extern const struct fsnotify_ops inotify_fsnotify_ops; +extern struct kmem_cache *inotify_inode_mark_cachep; #ifdef CONFIG_INOTIFY_USER static inline void dec_inotify_instances(struct ucounts *ucounts) diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 1aeb837ae41405c94c72041bbba87fa7e9d0e1a5..8b73332735ba5071f67c5f39b6079c506cdea6f7 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -68,7 +68,8 @@ int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { struct inotify_inode_mark *i_mark; struct inotify_event_info *event; @@ -156,8 +157,8 @@ static int idr_callback(int id, void *p, void *data) * BUG() that was here. 
*/ if (fsn_mark) - printk(KERN_WARNING "fsn_mark->group=%p inode=%p wd=%d\n", - fsn_mark->group, fsn_mark->inode, i_mark->wd); + printk(KERN_WARNING "fsn_mark->group=%p wd=%d\n", + fsn_mark->group, i_mark->wd); return 0; } @@ -175,9 +176,20 @@ static void inotify_free_event(struct fsnotify_event *fsn_event) kfree(INOTIFY_E(fsn_event)); } +/* ding dong the mark is dead */ +static void inotify_free_mark(struct fsnotify_mark *fsn_mark) +{ + struct inotify_inode_mark *i_mark; + + i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); + + kmem_cache_free(inotify_inode_mark_cachep, i_mark); +} + const struct fsnotify_ops inotify_fsnotify_ops = { .handle_event = inotify_handle_event, .free_group_priv = inotify_free_group_priv, .free_event = inotify_free_event, .freeing_mark = inotify_freeing_mark, + .free_mark = inotify_free_mark, }; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 498d609b26c7dbf4d81537477b338aedc1a64156..7cc7d3fb1862fcb557933e63ec66361cf8bcfb7a 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -47,7 +47,7 @@ /* configurable via /proc/sys/fs/inotify/ */ static int inotify_max_queued_events __read_mostly; -static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; +struct kmem_cache *inotify_inode_mark_cachep __read_mostly; #ifdef CONFIG_SYSCTL @@ -395,21 +395,6 @@ static struct inotify_inode_mark *inotify_idr_find(struct fsnotify_group *group, return i_mark; } -static void do_inotify_remove_from_idr(struct fsnotify_group *group, - struct inotify_inode_mark *i_mark) -{ - struct idr *idr = &group->inotify_data.idr; - spinlock_t *idr_lock = &group->inotify_data.idr_lock; - int wd = i_mark->wd; - - assert_spin_locked(idr_lock); - - idr_remove(idr, wd); - - /* removed from the idr, drop that ref */ - fsnotify_put_mark(&i_mark->fsn_mark); -} - /* * Remove the mark from the idr (if present) and drop the reference * on the mark because it was in the idr. @@ -417,6 +402,7 @@ static void do_inotify_remove_from_idr(struct fsnotify_group *group, static void inotify_remove_from_idr(struct fsnotify_group *group, struct inotify_inode_mark *i_mark) { + struct idr *idr = &group->inotify_data.idr; spinlock_t *idr_lock = &group->inotify_data.idr_lock; struct inotify_inode_mark *found_i_mark = NULL; int wd; @@ -429,18 +415,16 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, * if it wasn't.... 
*/ if (wd == -1) { - WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" - " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.inode); + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", + __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); goto out; } /* Lets look in the idr to see if we find it */ found_i_mark = inotify_idr_find_locked(group, wd); if (unlikely(!found_i_mark)) { - WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" - " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.inode); + WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", + __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); goto out; } @@ -451,35 +435,33 @@ static void inotify_remove_from_idr(struct fsnotify_group *group, */ if (unlikely(found_i_mark != i_mark)) { WARN_ONCE(1, "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p " - "mark->inode=%p found_i_mark=%p found_i_mark->wd=%d " - "found_i_mark->group=%p found_i_mark->inode=%p\n", - __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group, - i_mark->fsn_mark.inode, found_i_mark, found_i_mark->wd, - found_i_mark->fsn_mark.group, - found_i_mark->fsn_mark.inode); + "found_i_mark=%p found_i_mark->wd=%d " + "found_i_mark->group=%p\n", __func__, i_mark, + i_mark->wd, i_mark->fsn_mark.group, found_i_mark, + found_i_mark->wd, found_i_mark->fsn_mark.group); goto out; } /* * One ref for being in the idr - * one ref held by the caller trying to kill us * one ref grabbed by inotify_idr_find */ - if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 3)) { - printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p" - " i_mark->inode=%p\n", __func__, i_mark, i_mark->wd, - i_mark->fsn_mark.group, i_mark->fsn_mark.inode); + if (unlikely(atomic_read(&i_mark->fsn_mark.refcnt) < 2)) { + printk(KERN_ERR "%s: i_mark=%p i_mark->wd=%d i_mark->group=%p\n", + __func__, i_mark, i_mark->wd, i_mark->fsn_mark.group); /* we can't really recover with bad ref cnting.. */ BUG(); } - do_inotify_remove_from_idr(group, i_mark); + idr_remove(idr, wd); + /* Removed from the idr, drop that ref. 
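+	 * Per the ledger just above, at least two references must still
+	 * be held here: the idr slot's own reference (dropped immediately
+	 * below) and the one taken by inotify_idr_find_locked() (dropped
+	 * at the "out" label).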
*/ + fsnotify_put_mark(&i_mark->fsn_mark); out: + i_mark->wd = -1; + spin_unlock(idr_lock); /* match the ref taken by inotify_idr_find_locked() */ if (found_i_mark) fsnotify_put_mark(&found_i_mark->fsn_mark); - i_mark->wd = -1; - spin_unlock(idr_lock); } /* @@ -492,7 +474,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, /* Queue ignore event for the watch */ inotify_handle_event(group, NULL, fsn_mark, NULL, FS_IN_IGNORED, - NULL, FSNOTIFY_EVENT_NONE, NULL, 0); + NULL, FSNOTIFY_EVENT_NONE, NULL, 0, NULL); i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); /* remove this mark from the idr */ @@ -501,16 +483,6 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, dec_inotify_watches(group->inotify_data.ucounts); } -/* ding dong the mark is dead */ -static void inotify_free_mark(struct fsnotify_mark *fsn_mark) -{ - struct inotify_inode_mark *i_mark; - - i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); - - kmem_cache_free(inotify_inode_mark_cachep, i_mark); -} - static int inotify_update_existing_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) @@ -524,21 +496,19 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, mask = inotify_arg_to_mask(arg); - fsn_mark = fsnotify_find_inode_mark(group, inode); + fsn_mark = fsnotify_find_mark(&inode->i_fsnotify_marks, group); if (!fsn_mark) return -ENOENT; i_mark = container_of(fsn_mark, struct inotify_inode_mark, fsn_mark); spin_lock(&fsn_mark->lock); - old_mask = fsn_mark->mask; if (add) - fsnotify_set_mark_mask_locked(fsn_mark, (fsn_mark->mask | mask)); + fsn_mark->mask |= mask; else - fsnotify_set_mark_mask_locked(fsn_mark, mask); + fsn_mark->mask = mask; new_mask = fsn_mark->mask; - spin_unlock(&fsn_mark->lock); if (old_mask != new_mask) { @@ -549,7 +519,7 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, /* update the inode with this new fsn_mark */ if (dropped || do_inode) - fsnotify_recalc_inode_mask(inode); + fsnotify_recalc_mask(inode->i_fsnotify_marks); } @@ -578,7 +548,7 @@ static int inotify_new_watch(struct fsnotify_group *group, if (unlikely(!tmp_i_mark)) return -ENOMEM; - fsnotify_init_mark(&tmp_i_mark->fsn_mark, inotify_free_mark); + fsnotify_init_mark(&tmp_i_mark->fsn_mark, group); tmp_i_mark->fsn_mark.mask = mask; tmp_i_mark->wd = -1; @@ -594,8 +564,7 @@ static int inotify_new_watch(struct fsnotify_group *group, } /* we are on the idr, now get on the inode */ - ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, group, inode, - NULL, 0); + ret = fsnotify_add_mark_locked(&tmp_i_mark->fsn_mark, inode, NULL, 0); if (ret) { /* we failed to get on the inode, get off the idr */ inotify_remove_from_idr(group, tmp_i_mark); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 6043306e8e21518daeb945ebf2569d19fe9edbbf..9991f88267342f9e699655cc29129225334e8dd6 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -33,7 +33,7 @@ * * group->mark_mutex * mark->lock - * inode->i_lock + * mark->connector->lock * * group->mark_mutex protects the marks_list anchored inside a given group and * each mark is hooked via the g_list. It also protects the groups private @@ -44,14 +44,22 @@ * is assigned to as well as the access to a reference of the inode/vfsmount * that is being watched by the mark. * - * inode->i_lock protects the i_fsnotify_marks list anchored inside a - * given inode and each mark is hooked via the i_list. 
(and sorta the - * free_i_list) + * mark->connector->lock protects the list of marks anchored inside an + * inode / vfsmount and each mark is hooked via the i_list. * + * A list of notification marks relating to inode / mnt is contained in + * fsnotify_mark_connector. That structure is alive as long as there are any + * marks in the list and is also protected by fsnotify_mark_srcu. A mark gets + * detached from fsnotify_mark_connector when last reference to the mark is + * dropped. Thus having mark reference is enough to protect mark->connector + * pointer and to make sure fsnotify_mark_connector cannot disappear. Also + * because we remove mark from g_list before dropping mark reference associated + * with that, any mark found through g_list is guaranteed to have + * mark->connector set until we drop group->mark_mutex. * * LIFETIME: * Inode marks survive between when they are added to an inode and when their - * refcnt==0. + * refcnt==0. Marks are also protected by fsnotify_mark_srcu. * * The inode mark can be cleared for a number of different reasons including: * - The inode is unlinked for the last time. (fsnotify_inode_remove) @@ -61,17 +69,6 @@ * - The fsnotify_group associated with the mark is going away and all such marks * need to be cleaned up. (fsnotify_clear_marks_by_group) * - * Worst case we are given an inode and need to clean up all the marks on that - * inode. We take i_lock and walk the i_fsnotify_marks safely. For each - * mark on the list we take a reference (so the mark can't disappear under us). - * We remove that mark form the inode's list of marks and we add this mark to a - * private list anchored on the stack using i_free_list; we walk i_free_list - * and before we destroy the mark we make sure that we dont race with a - * concurrent destroy_group by getting a ref to the marks group and taking the - * groups mutex. - - * Very similarly for freeing by group, except we use free_g_list. - * * This has the very interesting property of being able to run concurrently with * any (or all) other directions. */ @@ -94,94 +91,281 @@ #define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ struct srcu_struct fsnotify_mark_srcu; +struct kmem_cache *fsnotify_mark_connector_cachep; + static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); +static struct fsnotify_mark_connector *connector_destroy_list; static void fsnotify_mark_destroy_workfn(struct work_struct *work); static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy_workfn); +static void fsnotify_connector_destroy_workfn(struct work_struct *work); +static DECLARE_WORK(connector_reaper_work, fsnotify_connector_destroy_workfn); + void fsnotify_get_mark(struct fsnotify_mark *mark) { + WARN_ON_ONCE(!atomic_read(&mark->refcnt)); atomic_inc(&mark->refcnt); } -void fsnotify_put_mark(struct fsnotify_mark *mark) +/* + * Get mark reference when we found the mark via lockless traversal of object + * list. Mark can be already removed from the list by now and on its way to be + * destroyed once SRCU period ends. 
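+ *
+ * As a sketch of the intended use (illustrative only; the real walkers
+ * in fsnotify.c go through srcu_dereference() on each link), a lockless
+ * traversal pins a mark roughly like:
+ *
+ *	idx = srcu_read_lock(&fsnotify_mark_srcu);
+ *	hlist_for_each_entry_rcu(mark, &conn->list, obj_list)
+ *		if (fsnotify_get_mark_safe(mark))
+ *			break;		(mark is now pinned)
+ *	srcu_read_unlock(&fsnotify_mark_srcu, idx);
+ *
+ * Callers must tolerate the get failing for marks that are already on
+ * their way out.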
+ */ +static bool fsnotify_get_mark_safe(struct fsnotify_mark *mark) { - if (atomic_dec_and_test(&mark->refcnt)) { - if (mark->group) - fsnotify_put_group(mark->group); - mark->free_mark(mark); - } + return atomic_inc_not_zero(&mark->refcnt); } -/* Calculate mask of events for a list of marks */ -u32 fsnotify_recalc_mask(struct hlist_head *head) +static void __fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) { u32 new_mask = 0; struct fsnotify_mark *mark; - hlist_for_each_entry(mark, head, obj_list) - new_mask |= mark->mask; - return new_mask; + assert_spin_locked(&conn->lock); + hlist_for_each_entry(mark, &conn->list, obj_list) { + if (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) + new_mask |= mark->mask; + } + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) + conn->inode->i_fsnotify_mask = new_mask; + else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) + real_mount(conn->mnt)->mnt_fsnotify_mask = new_mask; } /* - * Remove mark from inode / vfsmount list, group list, drop inode reference - * if we got one. - * - * Must be called with group->mark_mutex held. + * Calculate mask of events for a list of marks. The caller must make sure + * connector and connector->inode cannot disappear under us. Callers achieve + * this by holding a mark->lock or mark->group->mark_mutex for a mark on this + * list. */ -void fsnotify_detach_mark(struct fsnotify_mark *mark) +void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn) +{ + if (!conn) + return; + + spin_lock(&conn->lock); + __fsnotify_recalc_mask(conn); + spin_unlock(&conn->lock); + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) + __fsnotify_update_child_dentry_flags(conn->inode); +} + +/* Free all connectors queued for freeing once SRCU period ends */ +static void fsnotify_connector_destroy_workfn(struct work_struct *work) +{ + struct fsnotify_mark_connector *conn, *free; + + spin_lock(&destroy_lock); + conn = connector_destroy_list; + connector_destroy_list = NULL; + spin_unlock(&destroy_lock); + + synchronize_srcu(&fsnotify_mark_srcu); + while (conn) { + free = conn; + conn = conn->destroy_next; + kmem_cache_free(fsnotify_mark_connector_cachep, free); + } +} + +static struct inode *fsnotify_detach_connector_from_object( + struct fsnotify_mark_connector *conn) { struct inode *inode = NULL; + + if (conn->flags & FSNOTIFY_OBJ_TYPE_INODE) { + inode = conn->inode; + rcu_assign_pointer(inode->i_fsnotify_marks, NULL); + inode->i_fsnotify_mask = 0; + conn->inode = NULL; + conn->flags &= ~FSNOTIFY_OBJ_TYPE_INODE; + } else if (conn->flags & FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + rcu_assign_pointer(real_mount(conn->mnt)->mnt_fsnotify_marks, + NULL); + real_mount(conn->mnt)->mnt_fsnotify_mask = 0; + conn->mnt = NULL; + conn->flags &= ~FSNOTIFY_OBJ_TYPE_VFSMOUNT; + } + + return inode; +} + +static void fsnotify_final_mark_destroy(struct fsnotify_mark *mark) +{ struct fsnotify_group *group = mark->group; - BUG_ON(!mutex_is_locked(&group->mark_mutex)); + if (WARN_ON_ONCE(!group)) + return; + group->ops->free_mark(mark); + fsnotify_put_group(group); +} - spin_lock(&mark->lock); +void fsnotify_put_mark(struct fsnotify_mark *mark) +{ + struct fsnotify_mark_connector *conn; + struct inode *inode = NULL; + bool free_conn = false; - /* something else already called this function on this mark */ - if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { - spin_unlock(&mark->lock); + /* Catch marks that were actually never attached to object */ + if (!mark->connector) { + if (atomic_dec_and_test(&mark->refcnt)) + fsnotify_final_mark_destroy(mark); return; } - mark->flags &= 
~FSNOTIFY_MARK_FLAG_ATTACHED; + /* + * We have to be careful so that traversals of obj_list under lock can + * safely grab mark reference. + */ + if (!atomic_dec_and_lock(&mark->refcnt, &mark->connector->lock)) + return; - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) { - inode = mark->inode; - fsnotify_destroy_inode_mark(mark); - } else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) - fsnotify_destroy_vfsmount_mark(mark); - else - BUG(); + conn = mark->connector; + hlist_del_init_rcu(&mark->obj_list); + if (hlist_empty(&conn->list)) { + inode = fsnotify_detach_connector_from_object(conn); + free_conn = true; + } else { + __fsnotify_recalc_mask(conn); + } + mark->connector = NULL; + spin_unlock(&conn->lock); + + iput(inode); + + if (free_conn) { + spin_lock(&destroy_lock); + conn->destroy_next = connector_destroy_list; + connector_destroy_list = conn; + spin_unlock(&destroy_lock); + queue_work(system_unbound_wq, &connector_reaper_work); + } /* * Note that we didn't update flags telling whether inode cares about * what's happening with children. We update these flags from * __fsnotify_parent() lazily when next event happens on one of our * children. */ + spin_lock(&destroy_lock); + list_add(&mark->g_list, &destroy_list); + spin_unlock(&destroy_lock); + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); +} - list_del_init(&mark->g_list); +bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info) +{ + struct fsnotify_group *group; - spin_unlock(&mark->lock); + if (WARN_ON_ONCE(!iter_info->inode_mark && !iter_info->vfsmount_mark)) + return false; + + if (iter_info->inode_mark) + group = iter_info->inode_mark->group; + else + group = iter_info->vfsmount_mark->group; + + /* + * Since acquisition of mark reference is an atomic op as well, we can + * be sure this inc is seen before any effect of refcount increment. + */ + atomic_inc(&group->user_waits); + + if (iter_info->inode_mark) { + /* This can fail if mark is being removed */ + if (!fsnotify_get_mark_safe(iter_info->inode_mark)) + goto out_wait; + } + if (iter_info->vfsmount_mark) { + if (!fsnotify_get_mark_safe(iter_info->vfsmount_mark)) + goto out_inode; + } - if (inode && (mark->flags & FSNOTIFY_MARK_FLAG_OBJECT_PINNED)) - iput(inode); + /* + * Now that both marks are pinned by refcount in the inode / vfsmount + * lists, we can drop SRCU lock, and safely resume the list iteration + * once userspace returns. + */ + srcu_read_unlock(&fsnotify_mark_srcu, iter_info->srcu_idx); + + return true; +out_inode: + if (iter_info->inode_mark) + fsnotify_put_mark(iter_info->inode_mark); +out_wait: + if (atomic_dec_and_test(&group->user_waits) && group->shutdown) + wake_up(&group->notification_waitq); + return false; +} + +void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info) +{ + struct fsnotify_group *group = NULL; + + iter_info->srcu_idx = srcu_read_lock(&fsnotify_mark_srcu); + if (iter_info->inode_mark) { + group = iter_info->inode_mark->group; + fsnotify_put_mark(iter_info->inode_mark); + } + if (iter_info->vfsmount_mark) { + group = iter_info->vfsmount_mark->group; + fsnotify_put_mark(iter_info->vfsmount_mark); + } + /* + * We abuse notification_waitq on group shutdown for waiting for all + * marks pinned when waiting for userspace. + */ + if (atomic_dec_and_test(&group->user_waits) && group->shutdown) + wake_up(&group->notification_waitq); +} + +/* + * Mark mark as detached, remove it from group list. Mark still stays in object + * list until its last reference is dropped. 
Note that we rely on mark being + * removed from group list before corresponding reference to it is dropped. In + * particular we rely on mark->connector being valid while we hold + * group->mark_mutex if we found the mark through g_list. + * + * Must be called with group->mark_mutex held. The caller must either hold + * reference to the mark or be protected by fsnotify_mark_srcu. + */ +void fsnotify_detach_mark(struct fsnotify_mark *mark) +{ + struct fsnotify_group *group = mark->group; + + WARN_ON_ONCE(!mutex_is_locked(&group->mark_mutex)); + WARN_ON_ONCE(!srcu_read_lock_held(&fsnotify_mark_srcu) && + atomic_read(&mark->refcnt) < 1 + + !!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)); + + spin_lock(&mark->lock); + /* something else already called this function on this mark */ + if (!(mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { + spin_unlock(&mark->lock); + return; + } + mark->flags &= ~FSNOTIFY_MARK_FLAG_ATTACHED; + list_del_init(&mark->g_list); + spin_unlock(&mark->lock); atomic_dec(&group->num_marks); + + /* Drop mark reference acquired in fsnotify_add_mark_locked() */ + fsnotify_put_mark(mark); } /* - * Prepare mark for freeing and add it to the list of marks prepared for - * freeing. The actual freeing must happen after SRCU period ends and the - * caller is responsible for this. + * Free fsnotify mark. The mark is actually only marked as being freed. The + * freeing is actually happening only once last reference to the mark is + * dropped from a workqueue which first waits for srcu period end. * - * The function returns true if the mark was added to the list of marks for - * freeing. The function returns false if someone else has already called - * __fsnotify_free_mark() for the mark. + * Caller must have a reference to the mark or be protected by + * fsnotify_mark_srcu. */ -static bool __fsnotify_free_mark(struct fsnotify_mark *mark) +void fsnotify_free_mark(struct fsnotify_mark *mark) { struct fsnotify_group *group = mark->group; @@ -189,7 +373,7 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark) /* something else already called this function on this mark */ if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE)) { spin_unlock(&mark->lock); - return false; + return; } mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; spin_unlock(&mark->lock); @@ -201,25 +385,6 @@ static bool __fsnotify_free_mark(struct fsnotify_mark *mark) */ if (group->ops->freeing_mark) group->ops->freeing_mark(mark, group); - - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - - return true; -} - -/* - * Free fsnotify mark. The freeing is actually happening from a workqueue which - * first waits for srcu period end. Caller must have a reference to the mark - * or be protected by fsnotify_mark_srcu. - */ -void fsnotify_free_mark(struct fsnotify_mark *mark) -{ - if (__fsnotify_free_mark(mark)) { - queue_delayed_work(system_unbound_wq, &reaper_work, - FSNOTIFY_REAPER_DELAY); - } } void fsnotify_destroy_mark(struct fsnotify_mark *mark, @@ -231,54 +396,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, fsnotify_free_mark(mark); } -void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) -{ - struct fsnotify_mark *mark; - - while (1) { - /* - * We have to be careful since we can race with e.g. - * fsnotify_clear_marks_by_group() and once we drop 'lock', - * mark can get removed from the obj_list and destroyed. But - * we are holding mark reference so mark cannot be freed and - * calling fsnotify_destroy_mark() more than once is fine. 
- */ - spin_lock(lock); - if (hlist_empty(head)) { - spin_unlock(lock); - break; - } - mark = hlist_entry(head->first, struct fsnotify_mark, obj_list); - /* - * We don't update i_fsnotify_mask / mnt_fsnotify_mask here - * since inode / mount is going away anyway. So just remove - * mark from the list. - */ - hlist_del_init_rcu(&mark->obj_list); - fsnotify_get_mark(mark); - spin_unlock(lock); - fsnotify_destroy_mark(mark, mark->group); - fsnotify_put_mark(mark); - } -} - -void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask) -{ - assert_spin_locked(&mark->lock); - - mark->mask = mask; - - if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) - fsnotify_set_inode_mark_mask_locked(mark, mask); -} - -void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask) -{ - assert_spin_locked(&mark->lock); - - mark->ignored_mask = mask; -} - /* * Sorting function for lists of fsnotify marks. * @@ -315,37 +432,133 @@ int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b) return -1; } -/* Add mark into proper place in given list of marks */ -int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, - int allow_dups) +static int fsnotify_attach_connector_to_object( + struct fsnotify_mark_connector __rcu **connp, + struct inode *inode, + struct vfsmount *mnt) +{ + struct fsnotify_mark_connector *conn; + + conn = kmem_cache_alloc(fsnotify_mark_connector_cachep, GFP_KERNEL); + if (!conn) + return -ENOMEM; + spin_lock_init(&conn->lock); + INIT_HLIST_HEAD(&conn->list); + if (inode) { + conn->flags = FSNOTIFY_OBJ_TYPE_INODE; + conn->inode = igrab(inode); + } else { + conn->flags = FSNOTIFY_OBJ_TYPE_VFSMOUNT; + conn->mnt = mnt; + } + /* + * cmpxchg() provides the barrier so that readers of *connp can see + * only initialized structure + */ + if (cmpxchg(connp, NULL, conn)) { + /* Someone else created list structure for us */ + if (inode) + iput(inode); + kmem_cache_free(fsnotify_mark_connector_cachep, conn); + } + + return 0; +} + +/* + * Get mark connector, make sure it is alive and return with its lock held. + * This is for users that get connector pointer from inode or mount. Users that + * hold reference to a mark on the list may directly lock connector->lock as + * they are sure list cannot go away under them. + */ +static struct fsnotify_mark_connector *fsnotify_grab_connector( + struct fsnotify_mark_connector __rcu **connp) +{ + struct fsnotify_mark_connector *conn; + int idx; + + idx = srcu_read_lock(&fsnotify_mark_srcu); + conn = srcu_dereference(*connp, &fsnotify_mark_srcu); + if (!conn) + goto out; + spin_lock(&conn->lock); + if (!(conn->flags & (FSNOTIFY_OBJ_TYPE_INODE | + FSNOTIFY_OBJ_TYPE_VFSMOUNT))) { + spin_unlock(&conn->lock); + srcu_read_unlock(&fsnotify_mark_srcu, idx); + return NULL; + } +out: + srcu_read_unlock(&fsnotify_mark_srcu, idx); + return conn; +} + +/* + * Add mark into proper place in given list of marks. These marks may be used + * for the fsnotify backend to determine which event types should be delivered + * to which group and for which inodes. These marks are ordered according to + * priority, highest number first, and then by the group's location in memory. 
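+ *
+ * For context, the priorities involved are the existing FS_PRIO_*
+ * values from fsnotify_backend.h (not touched by this change):
+ *
+ *	FS_PRIO_0	normal notifiers, no permission decisions
+ *	FS_PRIO_1	fanotify content-based access control
+ *	FS_PRIO_2	fanotify pre-content access
+ *
+ * so permission-granting groups are consulted before purely
+ * observational ones watching the same object.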
+ */ +static int fsnotify_add_mark_list(struct fsnotify_mark *mark, + struct inode *inode, struct vfsmount *mnt, + int allow_dups) { struct fsnotify_mark *lmark, *last = NULL; + struct fsnotify_mark_connector *conn; + struct fsnotify_mark_connector __rcu **connp; int cmp; + int err = 0; + + if (WARN_ON(!inode && !mnt)) + return -EINVAL; + if (inode) + connp = &inode->i_fsnotify_marks; + else + connp = &real_mount(mnt)->mnt_fsnotify_marks; +restart: + spin_lock(&mark->lock); + conn = fsnotify_grab_connector(connp); + if (!conn) { + spin_unlock(&mark->lock); + err = fsnotify_attach_connector_to_object(connp, inode, mnt); + if (err) + return err; + goto restart; + } /* is mark the first mark? */ - if (hlist_empty(head)) { - hlist_add_head_rcu(&mark->obj_list, head); - return 0; + if (hlist_empty(&conn->list)) { + hlist_add_head_rcu(&mark->obj_list, &conn->list); + goto added; } /* should mark be in the middle of the current list? */ - hlist_for_each_entry(lmark, head, obj_list) { + hlist_for_each_entry(lmark, &conn->list, obj_list) { last = lmark; - if ((lmark->group == mark->group) && !allow_dups) - return -EEXIST; + if ((lmark->group == mark->group) && + (lmark->flags & FSNOTIFY_MARK_FLAG_ATTACHED) && + !allow_dups) { + err = -EEXIST; + goto out_err; + } cmp = fsnotify_compare_groups(lmark->group, mark->group); if (cmp >= 0) { hlist_add_before_rcu(&mark->obj_list, &lmark->obj_list); - return 0; + goto added; } } BUG_ON(last == NULL); /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); - return 0; +added: + mark->connector = conn; +out_err: + spin_unlock(&conn->lock); + spin_unlock(&mark->lock); + return err; } /* @@ -353,10 +566,10 @@ int fsnotify_add_mark_list(struct hlist_head *head, struct fsnotify_mark *mark, * These marks may be used for the fsnotify backend to determine which * event types should be delivered to which group. */ -int fsnotify_add_mark_locked(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct inode *inode, +int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct inode *inode, struct vfsmount *mnt, int allow_dups) { + struct fsnotify_group *group = mark->group; int ret = 0; BUG_ON(inode && mnt); @@ -367,61 +580,42 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark, * LOCKING ORDER!!!! 
* group->mark_mutex * mark->lock - * inode->i_lock + * mark->connector->lock */ spin_lock(&mark->lock); mark->flags |= FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_ATTACHED; - fsnotify_get_group(group); - mark->group = group; list_add(&mark->g_list, &group->marks_list); atomic_inc(&group->num_marks); - fsnotify_get_mark(mark); /* for i_list and g_list */ - - if (inode) { - ret = fsnotify_add_inode_mark(mark, group, inode, allow_dups); - if (ret) - goto err; - } else if (mnt) { - ret = fsnotify_add_vfsmount_mark(mark, group, mnt, allow_dups); - if (ret) - goto err; - } else { - BUG(); - } - - /* this will pin the object if appropriate */ - fsnotify_set_mark_mask_locked(mark, mark->mask); + fsnotify_get_mark(mark); /* for g_list */ spin_unlock(&mark->lock); - if (inode) - __fsnotify_update_child_dentry_flags(inode); + ret = fsnotify_add_mark_list(mark, inode, mnt, allow_dups); + if (ret) + goto err; + + if (mark->mask) + fsnotify_recalc_mask(mark->connector); return ret; err: - mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE; + mark->flags &= ~(FSNOTIFY_MARK_FLAG_ALIVE | + FSNOTIFY_MARK_FLAG_ATTACHED); list_del_init(&mark->g_list); - fsnotify_put_group(group); - mark->group = NULL; atomic_dec(&group->num_marks); - spin_unlock(&mark->lock); - - spin_lock(&destroy_lock); - list_add(&mark->g_list, &destroy_list); - spin_unlock(&destroy_lock); - queue_delayed_work(system_unbound_wq, &reaper_work, - FSNOTIFY_REAPER_DELAY); - + fsnotify_put_mark(mark); return ret; } -int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, - struct inode *inode, struct vfsmount *mnt, int allow_dups) +int fsnotify_add_mark(struct fsnotify_mark *mark, struct inode *inode, + struct vfsmount *mnt, int allow_dups) { int ret; + struct fsnotify_group *group = mark->group; + mutex_lock(&group->mark_mutex); - ret = fsnotify_add_mark_locked(mark, group, inode, mnt, allow_dups); + ret = fsnotify_add_mark_locked(mark, inode, mnt, allow_dups); mutex_unlock(&group->mark_mutex); return ret; } @@ -430,29 +624,42 @@ int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, * Given a list of marks, find the mark associated with given group. If found * take a reference to that mark and return it, else return NULL. */ -struct fsnotify_mark *fsnotify_find_mark(struct hlist_head *head, - struct fsnotify_group *group) +struct fsnotify_mark *fsnotify_find_mark( + struct fsnotify_mark_connector __rcu **connp, + struct fsnotify_group *group) { + struct fsnotify_mark_connector *conn; struct fsnotify_mark *mark; - hlist_for_each_entry(mark, head, obj_list) { - if (mark->group == group) { + conn = fsnotify_grab_connector(connp); + if (!conn) + return NULL; + + hlist_for_each_entry(mark, &conn->list, obj_list) { + if (mark->group == group && + (mark->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { fsnotify_get_mark(mark); + spin_unlock(&conn->lock); return mark; } } + spin_unlock(&conn->lock); return NULL; } -/* - * clear any marks in a group in which mark->flags & flags is true - */ -void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, - unsigned int flags) +/* Clear any marks in a group with given type */ +void fsnotify_clear_marks_by_group(struct fsnotify_group *group, + unsigned int type) { struct fsnotify_mark *lmark, *mark; LIST_HEAD(to_free); + struct list_head *head = &to_free; + /* Skip selection step if we want to clear all marks. */ + if (type == FSNOTIFY_OBJ_ALL_TYPES) { + head = &group->marks_list; + goto clear; + } /* * We have to be really careful here. 
Anytime we drop mark_mutex, e.g. * fsnotify_clear_marks_by_inode() can come and free marks. Even in our @@ -464,18 +671,19 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, */ mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) { - if (mark->flags & flags) + if (mark->connector->flags & type) list_move(&mark->g_list, &to_free); } mutex_unlock(&group->mark_mutex); +clear: while (1) { mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); - if (list_empty(&to_free)) { + if (list_empty(head)) { mutex_unlock(&group->mark_mutex); break; } - mark = list_first_entry(&to_free, struct fsnotify_mark, g_list); + mark = list_first_entry(head, struct fsnotify_mark, g_list); fsnotify_get_mark(mark); fsnotify_detach_mark(mark); mutex_unlock(&group->mark_mutex); @@ -484,49 +692,62 @@ void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, } } -/* - * Given a group, prepare for freeing all the marks associated with that group. - * The marks are attached to the list of marks prepared for destruction, the - * caller is responsible for freeing marks in that list after SRCU period has - * ended. - */ -void fsnotify_detach_group_marks(struct fsnotify_group *group) +/* Destroy all marks attached to inode / vfsmount */ +void fsnotify_destroy_marks(struct fsnotify_mark_connector __rcu **connp) { - struct fsnotify_mark *mark; + struct fsnotify_mark_connector *conn; + struct fsnotify_mark *mark, *old_mark = NULL; + struct inode *inode; - while (1) { - mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING); - if (list_empty(&group->marks_list)) { - mutex_unlock(&group->mark_mutex); - break; - } - mark = list_first_entry(&group->marks_list, - struct fsnotify_mark, g_list); + conn = fsnotify_grab_connector(connp); + if (!conn) + return; + /* + * We have to be careful since we can race with e.g. + * fsnotify_clear_marks_by_group() and once we drop the conn->lock, the + * list can get modified. However we are holding mark reference and + * thus our mark cannot be removed from obj_list so we can continue + * iteration after regaining conn->lock. + */ + hlist_for_each_entry(mark, &conn->list, obj_list) { fsnotify_get_mark(mark); - fsnotify_detach_mark(mark); - mutex_unlock(&group->mark_mutex); - __fsnotify_free_mark(mark); - fsnotify_put_mark(mark); + spin_unlock(&conn->lock); + if (old_mark) + fsnotify_put_mark(old_mark); + old_mark = mark; + fsnotify_destroy_mark(mark, mark->group); + spin_lock(&conn->lock); } + /* + * Detach list from object now so that we don't pin inode until all + * mark references get dropped. It would lead to strange results such + * as delaying inode deletion or blocking unmount. + */ + inode = fsnotify_detach_connector_from_object(conn); + spin_unlock(&conn->lock); + if (old_mark) + fsnotify_put_mark(old_mark); + iput(inode); } /* * Nothing fancy, just initialize lists and locks and counters. */ void fsnotify_init_mark(struct fsnotify_mark *mark, - void (*free_mark)(struct fsnotify_mark *mark)) + struct fsnotify_group *group) { memset(mark, 0, sizeof(*mark)); spin_lock_init(&mark->lock); atomic_set(&mark->refcnt, 1); - mark->free_mark = free_mark; + fsnotify_get_group(group); + mark->group = group; } /* * Destroy all marks in destroy_list, waits for SRCU period to finish before * actually freeing marks. 
-void fsnotify_mark_destroy_list(void)
+static void fsnotify_mark_destroy_workfn(struct work_struct *work)
 {
 	struct fsnotify_mark *mark, *next;
 	struct list_head private_destroy_list;
@@ -540,11 +761,12 @@ void fsnotify_mark_destroy_list(void)
 
 	list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
 		list_del_init(&mark->g_list);
-		fsnotify_put_mark(mark);
+		fsnotify_final_mark_destroy(mark);
 	}
 }
 
-static void fsnotify_mark_destroy_workfn(struct work_struct *work)
+/* Wait for all marks queued for destruction to be actually destroyed */
+void fsnotify_wait_marks_destroyed(void)
 {
-	fsnotify_mark_destroy_list();
+	flush_delayed_work(&reaper_work);
 }
diff --git a/fs/notify/vfsmount_mark.c b/fs/notify/vfsmount_mark.c
deleted file mode 100644
index a8fcab68faef1cdc826103d095b5b387895dfb20..0000000000000000000000000000000000000000
--- a/fs/notify/vfsmount_mark.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING. If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-#include <linux/fs.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/mount.h>
-#include <linux/mutex.h>
-#include <linux/spinlock.h>
-
-#include <linux/atomic.h>
-
-#include <linux/fsnotify_backend.h>
-#include "fsnotify.h"
-
-void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group)
-{
-	fsnotify_clear_marks_by_group_flags(group, FSNOTIFY_MARK_FLAG_VFSMOUNT);
-}
-
-/*
- * Recalculate the mnt->mnt_fsnotify_mask, or the mask of all FS_* event types
- * any notifier is interested in hearing for this mount point
- */
-void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt)
-{
-	struct mount *m = real_mount(mnt);
-
-	spin_lock(&mnt->mnt_root->d_lock);
-	m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
-	spin_unlock(&mnt->mnt_root->d_lock);
-}
-
-void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark)
-{
-	struct vfsmount *mnt = mark->mnt;
-	struct mount *m = real_mount(mnt);
-
-	BUG_ON(!mutex_is_locked(&mark->group->mark_mutex));
-	assert_spin_locked(&mark->lock);
-
-	spin_lock(&mnt->mnt_root->d_lock);
-
-	hlist_del_init_rcu(&mark->obj_list);
-	mark->mnt = NULL;
-
-	m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks);
-	spin_unlock(&mnt->mnt_root->d_lock);
-}
-
-/*
- * given a group and vfsmount, find the mark associated with that combination.
- * if found take a reference to that mark and return it, else return NULL
- */
-struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group,
-						  struct vfsmount *mnt)
-{
-	struct mount *m = real_mount(mnt);
-	struct fsnotify_mark *mark;
-
-	spin_lock(&mnt->mnt_root->d_lock);
-	mark = fsnotify_find_mark(&m->mnt_fsnotify_marks, group);
-	spin_unlock(&mnt->mnt_root->d_lock);
-
-	return mark;
-}
-
-/*
- * Attach an initialized mark to a given group and vfsmount.
- * These marks may be used for the fsnotify backend to determine which
- * event types should be delivered to which groups.
- */ -int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, - struct fsnotify_group *group, struct vfsmount *mnt, - int allow_dups) -{ - struct mount *m = real_mount(mnt); - int ret; - - mark->flags |= FSNOTIFY_MARK_FLAG_VFSMOUNT; - - BUG_ON(!mutex_is_locked(&group->mark_mutex)); - assert_spin_locked(&mark->lock); - - spin_lock(&mnt->mnt_root->d_lock); - mark->mnt = mnt; - ret = fsnotify_add_mark_list(&m->mnt_fsnotify_marks, mark, allow_dups); - m->mnt_fsnotify_mask = fsnotify_recalc_mask(&m->mnt_fsnotify_marks); - spin_unlock(&mnt->mnt_root->d_lock); - - return ret; -} diff --git a/fs/nsfs.c b/fs/nsfs.c index 1656843e87d2bef47b69d65b23c23946e8936f33..f3db56e83dd204279895751e8cd885f15c28b2f5 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -91,6 +91,7 @@ static void *__ns_get_path(struct path *path, struct ns_common *ns) return ERR_PTR(-ENOMEM); } d_instantiate(dentry, inode); + dentry->d_flags |= DCACHE_RCUACCESS; dentry->d_fsdata = (void *)ns->ops; d = atomic_long_cmpxchg(&ns->stashed, 0, (unsigned long)dentry); if (d) { @@ -195,9 +196,11 @@ int ns_get_name(char *buf, size_t size, struct task_struct *task, { struct ns_common *ns; int res = -ENOENT; + const char *name; ns = ns_ops->get(task); if (ns) { - res = snprintf(buf, size, "%s:[%u]", ns_ops->name, ns->inum); + name = ns_ops->real_ns_name ? : ns_ops->name; + res = snprintf(buf, size, "%s:[%u]", name, ns->inum); ns_ops->put(ns); } return res; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f6e871760f8d97265f4974c159dca7a7b6439b7f..0da0332725aafcf253707f7ca59c9385cd6b4fb4 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2242,13 +2242,13 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group, spin_unlock(&o2hb_live_lock); } -static ssize_t o2hb_heartbeat_group_threshold_show(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_show(struct config_item *item, char *page) { return sprintf(page, "%u\n", o2hb_dead_threshold); } -static ssize_t o2hb_heartbeat_group_threshold_store(struct config_item *item, +static ssize_t o2hb_heartbeat_group_dead_threshold_store(struct config_item *item, const char *page, size_t count) { unsigned long tmp; @@ -2297,11 +2297,11 @@ static ssize_t o2hb_heartbeat_group_mode_store(struct config_item *item, } -CONFIGFS_ATTR(o2hb_heartbeat_group_, threshold); +CONFIGFS_ATTR(o2hb_heartbeat_group_, dead_threshold); CONFIGFS_ATTR(o2hb_heartbeat_group_, mode); static struct configfs_attribute *o2hb_heartbeat_group_attrs[] = { - &o2hb_heartbeat_group_attr_threshold, + &o2hb_heartbeat_group_attr_dead_threshold, &o2hb_heartbeat_group_attr_mode, NULL, }; diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index d0ab7e56d0b41a7a97f3640eb0ab4801f747dffc..8d779227370ab1d121fdc2fc6f546f7844e95a5b 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -450,9 +450,8 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node) INIT_WORK(&sc->sc_shutdown_work, o2net_shutdown_sc); INIT_DELAYED_WORK(&sc->sc_keepalive_work, o2net_sc_send_keep_req); - init_timer(&sc->sc_idle_timeout); - sc->sc_idle_timeout.function = o2net_idle_timer; - sc->sc_idle_timeout.data = (unsigned long)sc; + setup_timer(&sc->sc_idle_timeout, o2net_idle_timer, + (unsigned long)sc); sclog(sc, "alloced\n"); @@ -956,7 +955,7 @@ static void o2net_sendpage(struct o2net_sock_container *sc, mutex_lock(&sc->sc_send_lock); ret = sc->sc_sock->ops->sendpage(sc->sc_sock, virt_to_page(kmalloced_virt), - (long)kmalloced_virt & ~PAGE_MASK, 
+ offset_in_page(kmalloced_virt), size, MSG_DONTWAIT); mutex_unlock(&sc->sc_send_lock); if (ret == size) @@ -1460,27 +1459,10 @@ static void o2net_rx_until_empty(struct work_struct *work) static int o2net_set_nodelay(struct socket *sock) { - int ret, val = 1; - mm_segment_t oldfs; + int val = 1; - oldfs = get_fs(); - set_fs(KERNEL_DS); - - /* - * Dear unsuspecting programmer, - * - * Don't use sock_setsockopt() for SOL_TCP. It doesn't check its level - * argument and assumes SOL_SOCKET so, say, your TCP_NODELAY will - * silently turn into SO_DEBUG. - * - * Yours, - * Keeper of hilariously fragile interfaces. - */ - ret = sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, - (char __user *)&val, sizeof(val)); - - set_fs(oldfs); - return ret; + return kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (void *)&val, sizeof(val)); } static int o2net_set_usertimeout(struct socket *sock) @@ -1488,7 +1470,7 @@ static int o2net_set_usertimeout(struct socket *sock) int user_timeout = O2NET_TCP_USER_TIMEOUT; return kernel_setsockopt(sock, SOL_TCP, TCP_USER_TIMEOUT, - (char *)&user_timeout, sizeof(user_timeout)); + (void *)&user_timeout, sizeof(user_timeout)); } static void o2net_initialize_handshake(void) diff --git a/fs/open.c b/fs/open.c index 949cef29c3bba9395efd2e461d2078d0a9a9b47c..cd0c5be8d01247e8bab3f4e00e6f171803291881 100644 --- a/fs/open.c +++ b/fs/open.c @@ -193,7 +193,8 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small) goto out_putf; error = -EPERM; - if (IS_APPEND(inode)) + /* Check IS_APPEND on real upper inode */ + if (IS_APPEND(file_inode(f.file))) goto out_putf; sb_start_write(inode->i_sb); @@ -459,20 +460,17 @@ SYSCALL_DEFINE1(chdir, const char __user *, filename) SYSCALL_DEFINE1(fchdir, unsigned int, fd) { struct fd f = fdget_raw(fd); - struct inode *inode; - int error = -EBADF; + int error; error = -EBADF; if (!f.file) goto out; - inode = file_inode(f.file); - error = -ENOTDIR; - if (!S_ISDIR(inode->i_mode)) + if (!d_can_lookup(f.file->f_path.dentry)) goto out_putf; - error = inode_permission(inode, MAY_EXEC | MAY_CHDIR); + error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR); if (!error) set_fs_pwd(current->fs, &f.file->f_path); out_putf: @@ -900,6 +898,12 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o int lookup_flags = 0; int acc_mode = ACC_MODE(flags); + /* + * Clear out all open flags we don't know about so that we don't report + * them in fcntl(F_GETFD) or similar interfaces. + */ + flags &= VALID_OPEN_FLAGS; + if (flags & (O_CREAT | __O_TMPFILE)) op->mode = (mode & S_IALLUGO) | S_IFREG; else @@ -1078,6 +1082,26 @@ SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, return do_sys_open(dfd, filename, flags, mode); } +#ifdef CONFIG_COMPAT +/* + * Exactly like sys_open(), except that it doesn't set the + * O_LARGEFILE flag. + */ +COMPAT_SYSCALL_DEFINE3(open, const char __user *, filename, int, flags, umode_t, mode) +{ + return do_sys_open(AT_FDCWD, filename, flags, mode); +} + +/* + * Exactly like sys_openat(), except that it doesn't set the + * O_LARGEFILE flag. 
+ */ +COMPAT_SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags, umode_t, mode) +{ + return do_sys_open(dfd, filename, flags, mode); +} +#endif + #ifndef __alpha__ /* diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index e1534c9bab16ce69e30eefd6614ca2872595fb04..c19f0787c9c65bddfc8b82ca7fad2f88549f2f91 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -180,6 +180,10 @@ static ssize_t orangefs_devreq_read(struct file *file, return -EINVAL; } + /* Check for an empty list before locking. */ + if (list_empty(&orangefs_request_list)) + return -EAGAIN; + restart: /* Get next op (if any) from top of list. */ spin_lock(&orangefs_request_list_lock); diff --git a/fs/orangefs/dir.c b/fs/orangefs/dir.c index 284373a57a082fa0e38422afbc49fa2e8519b0df..d327cbd17756dcbc18ca787602e436c24e70d198 100644 --- a/fs/orangefs/dir.c +++ b/fs/orangefs/dir.c @@ -1,396 +1,404 @@ /* - * (C) 2001 Clemson University and The University of Chicago - * - * See COPYING in top-level directory. + * Copyright 2017 Omnibond Systems, L.L.C. */ #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-bufmap.h" +struct orangefs_dir_part { + struct orangefs_dir_part *next; + size_t len; +}; + +struct orangefs_dir { + __u64 token; + struct orangefs_dir_part *part; + loff_t end; + int error; +}; + +#define PART_SHIFT (24) +#define PART_SIZE (1<<24) +#define PART_MASK (~(PART_SIZE - 1)) + /* - * decode routine used by kmod to deal with the blob sent from - * userspace for readdirs. The blob contains zero or more of these - * sub-blobs: - * __u32 - represents length of the character string that follows. - * string - between 1 and ORANGEFS_NAME_MAX bytes long. - * padding - (if needed) to cause the __u32 plus the string to be - * eight byte aligned. - * khandle - sizeof(khandle) bytes. + * There can be up to 512 directory entries. Each entry is encoded as + * follows: + * 4 bytes: string size (n) + * n bytes: string + * 1 byte: trailing zero + * padding to 8 bytes + * 16 bytes: khandle + * padding to 8 bytes + * + * The trailer_buf starts with a struct orangefs_readdir_response_s + * which must be skipped to get to the directory data. + * + * The data which is received from the userspace daemon is termed a + * part and is stored in a linked list in case more than one part is + * needed for a large directory. + * + * The position pointer (ctx->pos) encodes the part and offset at which + * to begin reading. Bits above PART_SHIFT encode the part and bits + * below PART_SHIFT encode the offset. Parts are stored in a linked + * list which grows as data is received from the server. The overhead + * associated with managing the list is presumed to be small compared to + * the overhead of communicating with the server. + * + * As data is received from the server, it is placed at the end of the + * part list. Data is parsed from the current position as it is needed. + * When data is determined to be corrupt, it is either because the + * userspace component has sent back corrupt data or because the file + * pointer has been moved to an invalid location. Since the two cannot + * be differentiated, return EIO. + * + * Part zero is synthesized to contain `.' and `..'. Part one is the + * first part of the part list.
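As an illustrative aside, the position encoding described above is easy to exercise outside the kernel. The sketch below is not part of the patch (pos_part and pos_offset are invented helper names); it splits a ctx->pos value into a part index and a byte offset using the same PART_SHIFT/PART_MASK values as dir.c:

	#include <stdio.h>

	#define PART_SHIFT (24)
	#define PART_SIZE (1 << 24)
	#define PART_MASK (~((long long)PART_SIZE - 1))

	/* Bits above PART_SHIFT select the part. */
	static long long pos_part(long long pos)
	{
		return (pos & PART_MASK) >> PART_SHIFT;
	}

	/* Bits below PART_SHIFT are the byte offset within that part. */
	static long long pos_offset(long long pos)
	{
		return pos & ~PART_MASK;
	}

	int main(void)
	{
		/* Offset 64 into part 2, i.e. the second server-filled part. */
		long long pos = (2LL << PART_SHIFT) | 64;

		printf("part=%lld offset=%lld\n", pos_part(pos), pos_offset(pos));
		return 0;
	}

The synthesized part zero holds only `.' and `..', which is why orangefs_dir_iterate below bumps ctx->pos straight to 1 << PART_SHIFT once dot-dot has been emitted.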
*/ -static long decode_dirents(char *ptr, size_t size, - struct orangefs_readdir_response_s *readdir) + +static int do_readdir(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry, + struct orangefs_kernel_op_s *op) { - int i; - struct orangefs_readdir_response_s *rd = - (struct orangefs_readdir_response_s *) ptr; - char *buf = ptr; - int khandle_size = sizeof(struct orangefs_khandle); - size_t offset = offsetof(struct orangefs_readdir_response_s, - dirent_array); - /* 8 reflects eight byte alignment */ - int smallest_blob = khandle_size + 8; - __u32 len; - int aligned_len; - int sizeof_u32 = sizeof(__u32); - long ret; - - gossip_debug(GOSSIP_DIR_DEBUG, "%s: size:%zu:\n", __func__, size); - - /* size is = offset on empty dirs, > offset on non-empty dirs... */ - if (size < offset) { - gossip_err("%s: size:%zu: offset:%zu:\n", - __func__, - size, - offset); - ret = -EINVAL; - goto out; - } + struct orangefs_readdir_response_s *resp; + int bufi, r; - if ((size == offset) && (readdir->orangefs_dirent_outcount != 0)) { - gossip_err("%s: size:%zu: dirent_outcount:%d:\n", - __func__, - size, - readdir->orangefs_dirent_outcount); - ret = -EINVAL; - goto out; - } + /* + * Despite the badly named field, readdir does not use shared + * memory. However, there are a limited number of readdir + * slots, which must be allocated here. This flag simply tells + * the op scheduler to return the op here for retry. + */ + op->uses_shared_memory = 1; + op->upcall.req.readdir.refn = oi->refn; + op->upcall.req.readdir.token = od->token; + op->upcall.req.readdir.max_dirent_count = + ORANGEFS_MAX_DIRENT_COUNT_READDIR; - readdir->token = rd->token; - readdir->orangefs_dirent_outcount = rd->orangefs_dirent_outcount; - readdir->dirent_array = kcalloc(readdir->orangefs_dirent_outcount, - sizeof(*readdir->dirent_array), - GFP_KERNEL); - if (readdir->dirent_array == NULL) { - gossip_err("%s: kcalloc failed.\n", __func__); - ret = -ENOMEM; - goto out; +again: + bufi = orangefs_readdir_index_get(); + if (bufi < 0) { + od->error = bufi; + return bufi; } - buf += offset; - size -= offset; - - for (i = 0; i < readdir->orangefs_dirent_outcount; i++) { - if (size < smallest_blob) { - gossip_err("%s: size:%zu: smallest_blob:%d:\n", - __func__, - size, - smallest_blob); - ret = -EINVAL; - goto free; - } + op->upcall.req.readdir.buf_index = bufi; - len = *(__u32 *)buf; - if ((len < 1) || (len > ORANGEFS_NAME_MAX)) { - gossip_err("%s: len:%d:\n", __func__, len); - ret = -EINVAL; - goto free; - } + r = service_operation(op, "orangefs_readdir", + get_interruptible_flag(dentry->d_inode)); - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: size:%zu: len:%d:\n", - __func__, - size, - len); + orangefs_readdir_index_put(bufi); - readdir->dirent_array[i].d_name = buf + sizeof_u32; - readdir->dirent_array[i].d_length = len; + if (op_state_purged(op)) { + if (r == -EAGAIN) { + vfree(op->downcall.trailer_buf); + goto again; + } else if (r == -EIO) { + vfree(op->downcall.trailer_buf); + od->error = r; + return r; + } + } - /* - * Calculate "aligned" length of this string and its - * associated __u32 descriptor. 
- */ - aligned_len = ((sizeof_u32 + len + 1) + 7) & ~7; - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: aligned_len:%d:\n", - __func__, - aligned_len); + if (r < 0) { + vfree(op->downcall.trailer_buf); + od->error = r; + return r; + } else if (op->downcall.status) { + vfree(op->downcall.trailer_buf); + od->error = op->downcall.status; + return op->downcall.status; + } - /* - * The end of the blob should coincide with the end - * of the last sub-blob. - */ - if (size < aligned_len + khandle_size) { - gossip_err("%s: ran off the end of the blob.\n", - __func__); - ret = -EINVAL; - goto free; - } - size -= aligned_len + khandle_size; + /* + * The maximum size is size per entry times the 512 entries plus + * the header. This is well under the limit. + */ + if (op->downcall.trailer_size > PART_SIZE) { + vfree(op->downcall.trailer_buf); + od->error = -EIO; + return -EIO; + } - buf += aligned_len; + resp = (struct orangefs_readdir_response_s *) + op->downcall.trailer_buf; + od->token = resp->token; + return 0; +} - readdir->dirent_array[i].khandle = - *(struct orangefs_khandle *) buf; - buf += khandle_size; +static int parse_readdir(struct orangefs_dir *od, + struct orangefs_kernel_op_s *op) +{ + struct orangefs_dir_part *part, *new; + size_t count; + + count = 1; + part = od->part; + while (part) { + count++; + if (part->next) + part = part->next; + else + break; } - ret = buf - ptr; - gossip_debug(GOSSIP_DIR_DEBUG, "%s: returning:%ld:\n", __func__, ret); - goto out; -free: - kfree(readdir->dirent_array); - readdir->dirent_array = NULL; + new = (void *)op->downcall.trailer_buf; + new->next = NULL; + new->len = op->downcall.trailer_size - + sizeof(struct orangefs_readdir_response_s); + if (!od->part) + od->part = new; + else + part->next = new; + count++; + od->end = count << PART_SHIFT; -out: - return ret; + return 0; } -/* - * Read directory entries from an instance of an open directory. - */ -static int orangefs_readdir(struct file *file, struct dir_context *ctx) +static int orangefs_dir_more(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry) { - int ret = 0; - int buffer_index; - /* - * ptoken supports Orangefs' distributed directory logic, added - * in 2.9.2. - */ - __u64 *ptoken = file->private_data; - __u64 pos = 0; - ino_t ino = 0; - struct dentry *dentry = file->f_path.dentry; - struct orangefs_kernel_op_s *new_op = NULL; - struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(dentry->d_inode); - struct orangefs_readdir_response_s readdir_response; - void *dents_buf; - int i = 0; - int len = 0; - ino_t current_ino = 0; - char *current_entry = NULL; - long bytes_decoded; - - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: ctx->pos:%lld, ptoken = %llu\n", - __func__, - lld(ctx->pos), - llu(*ptoken)); - - pos = (__u64) ctx->pos; - - /* are we done? 
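To make the padding arithmetic in fill_from_part concrete, here is a minimal stand-alone sketch (record_size is an invented name, not part of the patch) that computes the encoded size of one record from the layout given at the top of dir.c; a 5-byte name comes out at 32 bytes:

	#include <stdio.h>

	static unsigned long record_size(unsigned long n)
	{
		/* __u32 length + name + trailing zero, padded to 8 bytes... */
		unsigned long head = 4 + n + 1;
		unsigned long padlen = head + (8 - head % 8) % 8;
		/* ...followed by a 16-byte khandle, also padded to 8 bytes. */
		unsigned long khandle = 16 + (8 - 16 % 8) % 8;

		return padlen + khandle;
	}

	int main(void)
	{
		/* n = 5: 4 + 5 + 1 = 10, padded to 16, plus 16 = 32. */
		printf("%lu\n", record_size(5));
		return 0;
	}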
*/ - if (pos == ORANGEFS_READDIR_END) { - gossip_debug(GOSSIP_DIR_DEBUG, - "Skipping to termination path\n"); - return 0; + struct orangefs_kernel_op_s *op; + int r; + + op = op_alloc(ORANGEFS_VFS_OP_READDIR); + if (!op) { + od->error = -ENOMEM; + return -ENOMEM; + } + r = do_readdir(oi, od, dentry, op); + if (r) { + od->error = r; + goto out; + } + r = parse_readdir(od, op); + if (r) { + od->error = r; + goto out; } - gossip_debug(GOSSIP_DIR_DEBUG, - "orangefs_readdir called on %pd (pos=%llu)\n", - dentry, llu(pos)); + od->error = 0; +out: + op_release(op); + return od->error; +} - memset(&readdir_response, 0, sizeof(readdir_response)); +static int fill_from_part(struct orangefs_dir_part *part, + struct dir_context *ctx) +{ + const int offset = sizeof(struct orangefs_readdir_response_s); + struct orangefs_khandle *khandle; + __u32 *len, padlen; + loff_t i; + char *s; + i = ctx->pos & ~PART_MASK; - new_op = op_alloc(ORANGEFS_VFS_OP_READDIR); - if (!new_op) - return -ENOMEM; + /* The file offset from userspace is too large. */ + if (i > part->len) + return 1; /* - * Only the indices are shared. No memory is actually shared, but the - * mechanism is used. + * If the seek pointer is positioned just before an entry it + * should find the next entry. */ - new_op->uses_shared_memory = 1; - new_op->upcall.req.readdir.refn = orangefs_inode->refn; - new_op->upcall.req.readdir.max_dirent_count = - ORANGEFS_MAX_DIRENT_COUNT_READDIR; - - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: upcall.req.readdir.refn.khandle: %pU\n", - __func__, - &new_op->upcall.req.readdir.refn.khandle); + if (i % 8) + i = i + (8 - i%8)%8; - new_op->upcall.req.readdir.token = *ptoken; - -get_new_buffer_index: - buffer_index = orangefs_readdir_index_get(); - if (buffer_index < 0) { - ret = buffer_index; - gossip_lerr("orangefs_readdir: orangefs_readdir_index_get() failure (%d)\n", - ret); - goto out_free_op; + while (i < part->len) { + if (part->len < i + sizeof *len) + break; + len = (void *)part + offset + i; + /* + * len is the size of the string itself. padlen is the + * total size of the encoded string. + */ + padlen = (sizeof *len + *len + 1) + + (8 - (sizeof *len + *len + 1)%8)%8; + if (part->len < i + padlen + sizeof *khandle) + goto next; + s = (void *)part + offset + i + sizeof *len; + if (s[*len] != 0) + goto next; + khandle = (void *)part + offset + i + padlen; + if (!dir_emit(ctx, s, *len, + orangefs_khandle_to_ino(khandle), + DT_UNKNOWN)) + return 0; + i += padlen + sizeof *khandle; + i = i + (8 - i%8)%8; + BUG_ON(i > part->len); + ctx->pos = (ctx->pos & PART_MASK) | i; + continue; +next: + i += 8; } - new_op->upcall.req.readdir.buf_index = buffer_index; - - ret = service_operation(new_op, - "orangefs_readdir", - get_interruptible_flag(dentry->d_inode)); + return 1; +} - gossip_debug(GOSSIP_DIR_DEBUG, - "Readdir downcall status is %d. ret:%d\n", - new_op->downcall.status, - ret); +static int orangefs_dir_fill(struct orangefs_inode_s *oi, + struct orangefs_dir *od, struct dentry *dentry, + struct dir_context *ctx) +{ + struct orangefs_dir_part *part; + size_t count; - orangefs_readdir_index_put(buffer_index); + count = ((ctx->pos & PART_MASK) >> PART_SHIFT) - 1; - if (ret == -EAGAIN && op_state_purged(new_op)) { - /* Client-core indices are invalid after it restarted. 
*/ - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: Getting new buffer_index for retry of readdir..\n", - __func__); - goto get_new_buffer_index; + part = od->part; + while (part->next && count) { + count--; + part = part->next; } - - if (ret == -EIO && op_state_purged(new_op)) { - gossip_err("%s: Client is down. Aborting readdir call.\n", - __func__); - goto out_free_op; + /* This means the userspace file offset is invalid. */ + if (count) { + od->error = -EIO; + return -EIO; } - if (ret < 0 || new_op->downcall.status != 0) { - gossip_debug(GOSSIP_DIR_DEBUG, - "Readdir request failed. Status:%d\n", - new_op->downcall.status); - if (ret >= 0) - ret = new_op->downcall.status; - goto out_free_op; + while (part && part->len) { + int r; + r = fill_from_part(part, ctx); + if (r < 0) { + od->error = r; + return r; + } else if (r == 0) { + /* Userspace buffer is full. */ + break; + } else { + /* + * The part ran out of data. Move to the next + * part. */ + ctx->pos = (ctx->pos & PART_MASK) + + (1 << PART_SHIFT); + part = part->next; + } } + return 0; +} - dents_buf = new_op->downcall.trailer_buf; - if (dents_buf == NULL) { - gossip_err("Invalid NULL buffer in readdir response\n"); - ret = -ENOMEM; - goto out_free_op; +static loff_t orangefs_dir_llseek(struct file *file, loff_t offset, + int whence) +{ + struct orangefs_dir *od = file->private_data; + /* + * Delete the stored data so userspace sees new directory + * entries. + */ + if (!whence && offset < od->end) { + struct orangefs_dir_part *part = od->part; + while (part) { + struct orangefs_dir_part *next = part->next; + vfree(part); + part = next; + } + od->token = ORANGEFS_ITERATE_START; + od->part = NULL; + od->end = 1 << PART_SHIFT; } + return default_llseek(file, offset, whence); +} - bytes_decoded = decode_dirents(dents_buf, new_op->downcall.trailer_size, - &readdir_response); - if (bytes_decoded < 0) { - ret = bytes_decoded; - gossip_err("Could not decode readdir from buffer %d\n", ret); - goto out_vfree; - } +static int orangefs_dir_iterate(struct file *file, + struct dir_context *ctx) +{ + struct orangefs_inode_s *oi; + struct orangefs_dir *od; + struct dentry *dentry; + int r; - if (bytes_decoded != new_op->downcall.trailer_size) { - gossip_err("orangefs_readdir: # bytes decoded (%ld) " - "!= trailer size (%ld)\n", - bytes_decoded, - (long)new_op->downcall.trailer_size); - ret = -EINVAL; - goto out_destroy_handle; - } + dentry = file->f_path.dentry; + oi = ORANGEFS_I(dentry->d_inode); + od = file->private_data; - /* - * orangefs doesn't actually store dot and dot-dot, but - * we need to have them represented. - */ - if (pos == 0) { - ino = get_ino_from_khandle(dentry->d_inode); - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: calling dir_emit of \".\" with pos = %llu\n", - __func__, - llu(pos)); - ret = dir_emit(ctx, ".", 1, ino, DT_DIR); - pos += 1; - } + if (od->error) + return od->error; - if (pos == 1) { - ino = get_parent_ino_from_dentry(dentry); - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: calling dir_emit of \"..\" with pos = %llu\n", - __func__, - llu(pos)); - ret = dir_emit(ctx, "..", 2, ino, DT_DIR); - pos += 1; + if (ctx->pos == 0) { + if (!dir_emit_dot(file, ctx)) + return 0; + ctx->pos++; + } + if (ctx->pos == 1) { + if (!dir_emit_dotdot(file, ctx)) + return 0; + ctx->pos = 1 << PART_SHIFT; } /* - * we stored ORANGEFS_ITERATE_NEXT in ctx->pos last time around - * to prevent "finding" dot and dot-dot on any iteration - * other than the first. + * The seek position is in the first synthesized part but is not + * valid. 
*/ - if (ctx->pos == ORANGEFS_ITERATE_NEXT) - ctx->pos = 0; - - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: dirent_outcount:%d:\n", - __func__, - readdir_response.orangefs_dirent_outcount); - for (i = ctx->pos; - i < readdir_response.orangefs_dirent_outcount; - i++) { - len = readdir_response.dirent_array[i].d_length; - current_entry = readdir_response.dirent_array[i].d_name; - current_ino = orangefs_khandle_to_ino( - &readdir_response.dirent_array[i].khandle); - - gossip_debug(GOSSIP_DIR_DEBUG, - "calling dir_emit for %s with len %d" - ", ctx->pos %ld\n", - current_entry, - len, - (unsigned long)ctx->pos); - /* - * type is unknown. We don't return object type - * in the dirent_array. This leaves getdents - * clueless about type. - */ - ret = - dir_emit(ctx, current_entry, len, current_ino, DT_UNKNOWN); - if (!ret) - break; - ctx->pos++; - gossip_debug(GOSSIP_DIR_DEBUG, - "%s: ctx->pos:%lld\n", - __func__, - lld(ctx->pos)); + if ((ctx->pos & PART_MASK) == 0) + return -EIO; - } + r = 0; /* - * we ran all the way through the last batch, set up for - * getting another batch... + * Must read more if the user has sought past what has been read + * so far. Stop a user who has sought past the end. */ - if (ret) { - *ptoken = readdir_response.token; - ctx->pos = ORANGEFS_ITERATE_NEXT; + while (od->token != ORANGEFS_ITERATE_END && + ctx->pos > od->end) { + r = orangefs_dir_more(oi, od, dentry); + if (r) + return r; + } + if (od->token == ORANGEFS_ITERATE_END && ctx->pos > od->end) + return -EIO; + + /* Then try to fill if there's any left in the buffer. */ + if (ctx->pos < od->end) { + r = orangefs_dir_fill(oi, od, dentry, ctx); + if (r) + return r; } - /* - * Did we hit the end of the directory? - */ - if (readdir_response.token == ORANGEFS_READDIR_END) { - gossip_debug(GOSSIP_DIR_DEBUG, - "End of dir detected; setting ctx->pos to ORANGEFS_READDIR_END.\n"); - ctx->pos = ORANGEFS_READDIR_END; + /* Finally get some more and try to fill. 
*/ + if (od->token != ORANGEFS_ITERATE_END) { + r = orangefs_dir_more(oi, od, dentry); + if (r) + return r; + r = orangefs_dir_fill(oi, od, dentry, ctx); } -out_destroy_handle: - /* kfree(NULL) is safe */ - kfree(readdir_response.dirent_array); -out_vfree: - gossip_debug(GOSSIP_DIR_DEBUG, "vfree %p\n", dents_buf); - vfree(dents_buf); -out_free_op: - op_release(new_op); - gossip_debug(GOSSIP_DIR_DEBUG, "orangefs_readdir returning %d\n", ret); - return ret; + return r; } static int orangefs_dir_open(struct inode *inode, struct file *file) { - __u64 *ptoken; - - file->private_data = kmalloc(sizeof(__u64), GFP_KERNEL); + struct orangefs_dir *od; + file->private_data = kmalloc(sizeof(struct orangefs_dir), + GFP_KERNEL); if (!file->private_data) return -ENOMEM; - - ptoken = file->private_data; - *ptoken = ORANGEFS_READDIR_START; + od = file->private_data; + od->token = ORANGEFS_ITERATE_START; + od->part = NULL; + od->end = 1 << PART_SHIFT; + od->error = 0; return 0; } static int orangefs_dir_release(struct inode *inode, struct file *file) { + struct orangefs_dir *od = file->private_data; + struct orangefs_dir_part *part = od->part; orangefs_flush_inode(inode); - kfree(file->private_data); + while (part) { + struct orangefs_dir_part *next = part->next; + vfree(part); + part = next; + } + kfree(od); return 0; } -/** ORANGEFS implementation of VFS directory operations */ const struct file_operations orangefs_dir_operations = { + .llseek = orangefs_dir_llseek, .read = generic_read_dir, - .iterate = orangefs_readdir, + .iterate = orangefs_dir_iterate, .open = orangefs_dir_open, - .release = orangefs_dir_release, + .release = orangefs_dir_release }; diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h index 3b8923f8bf21d6902f0d8f440e2102cb57be94bf..163001c9550184feff1255364a3b59a61de1ecf9 100644 --- a/fs/orangefs/downcall.h +++ b/fs/orangefs/downcall.h @@ -40,16 +40,6 @@ struct orangefs_mkdir_response { struct orangefs_object_kref refn; }; -/* - * duplication of some system interface structures so that I don't have - * to allocate extra memory - */ -struct orangefs_dirent { - char *d_name; - int d_length; - struct orangefs_khandle khandle; -}; - struct orangefs_statfs_response { __s64 block_size; __s64 blocks_total; @@ -131,12 +121,16 @@ struct orangefs_downcall_s { } resp; }; +/* + * The readdir response comes in the trailer. It is followed by the + * directory entries as described in dir.c. 
+ */ + struct orangefs_readdir_response_s { __u64 token; __u64 directory_version; __u32 __pad2; __u32 orangefs_dirent_outcount; - struct orangefs_dirent *dirent_array; }; #endif /* __DOWNCALL_H */ diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index e6bbc8083d77235919ee56e093605283bb2d8dbb..28f38d813ad23f69f24900a4e8bd6ee75e57659f 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -114,7 +114,6 @@ static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inod struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_khandle *handle = &orangefs_inode->refn.khandle; struct orangefs_kernel_op_s *new_op = NULL; - struct iov_iter saved = *iter; int buffer_index = -1; ssize_t ret; @@ -193,7 +192,7 @@ static ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inod orangefs_bufmap_put(buffer_index); buffer_index = -1; if (type == ORANGEFS_IO_WRITE) - *iter = saved; + iov_iter_revert(iter, total_size); gossip_debug(GOSSIP_FILE_DEBUG, "%s:going to repopulate_shared_memory.\n", __func__); @@ -475,7 +474,8 @@ static ssize_t orangefs_file_write_iter(struct kiocb *iocb, struct iov_iter *ite /* Make sure generic_write_checks sees an up to date inode size. */ if (file->f_flags & O_APPEND) { - rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1); + rc = orangefs_inode_getattr(file->f_mapping->host, 0, 1, + STATX_SIZE); if (rc == -ESTALE) rc = -EIO; if (rc) { @@ -693,7 +693,8 @@ static loff_t orangefs_file_llseek(struct file *file, loff_t offset, int origin) * NOTE: We are only interested in file size here, * so we set mask accordingly. */ - ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1); + ret = orangefs_inode_getattr(file->f_mapping->host, 0, 1, + STATX_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index a304bf34b2127d47d4aff2d8c8b4b0a99a55ec46..9428ea0aac16d32e14fa74501ad470b5eab1ffda 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -161,7 +161,7 @@ static int orangefs_setattr_size(struct inode *inode, struct iattr *iattr) iattr->ia_size); /* Ensure that we have a up to date size, so we know if it changed. */ - ret = orangefs_inode_getattr(inode, 0, 1); + ret = orangefs_inode_getattr(inode, 0, 1, STATX_SIZE); if (ret == -ESTALE) ret = -EIO; if (ret) { @@ -218,8 +218,7 @@ int orangefs_setattr(struct dentry *dentry, struct iattr *iattr) if (ret) goto out; - if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(inode)) { + if (iattr->ia_valid & ATTR_SIZE) { ret = orangefs_setattr_size(inode, iattr); if (ret) goto out; @@ -256,13 +255,19 @@ int orangefs_getattr(const struct path *path, struct kstat *stat, "orangefs_getattr: called on %pd\n", path->dentry); - ret = orangefs_inode_getattr(inode, 0, 0); + ret = orangefs_inode_getattr(inode, 0, 0, request_mask); if (ret == 0) { generic_fillattr(inode, stat); /* override block size reported to stat */ orangefs_inode = ORANGEFS_I(inode); stat->blksize = orangefs_inode->blksize; + + if (request_mask & STATX_SIZE) + stat->result_mask = STATX_BASIC_STATS; + else + stat->result_mask = STATX_BASIC_STATS & + ~STATX_SIZE; } return ret; } @@ -277,7 +282,7 @@ int orangefs_permission(struct inode *inode, int mask) gossip_debug(GOSSIP_INODE_DEBUG, "%s: refreshing\n", __func__); /* Make sure the permission (and other common attrs) are up to date. 
*/ - ret = orangefs_inode_getattr(inode, 0, 0); + ret = orangefs_inode_getattr(inode, 0, 0, STATX_MODE); if (ret < 0) return ret; @@ -375,7 +380,7 @@ struct inode *orangefs_iget(struct super_block *sb, struct orangefs_object_kref if (!inode || !(inode->i_state & I_NEW)) return inode; - error = orangefs_inode_getattr(inode, 1, 1); + error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL); if (error) { iget_failed(inode); return ERR_PTR(error); @@ -420,7 +425,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir, orangefs_set_inode(inode, ref); inode->i_ino = hash; /* needed for stat etc */ - error = orangefs_inode_getattr(inode, 1, 1); + error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL); if (error) goto out_iput; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index a290ff6ec7569dc3b5eea9ded2df0e8483c49d65..478e88bd7f9d3d3d79e4f8edf76d4df5a5acc395 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -74,6 +74,7 @@ static int orangefs_create(struct inode *dir, unlock_new_inode(inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; + ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "%s: dentry instantiated for %pd\n", @@ -193,8 +194,6 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry, goto out; } - ORANGEFS_I(inode)->getattr_time = jiffies - 1; - gossip_debug(GOSSIP_NAME_DEBUG, "%s:%s:%d " "Found good inode [%lu] with count [%d]\n", @@ -324,6 +323,7 @@ static int orangefs_symlink(struct inode *dir, unlock_new_inode(inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; + ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "Inode (Symlink) %pU -> %pd\n", @@ -388,6 +388,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode unlock_new_inode(inode); orangefs_set_timeout(dentry); ORANGEFS_I(inode)->getattr_time = jiffies - 1; + ORANGEFS_I(inode)->getattr_mask = STATX_BASIC_STATS; gossip_debug(GOSSIP_NAME_DEBUG, "Inode (Directory) %pU -> %pd\n", diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index 6333cbbdfef7ae652c1a4e6c4d2818ae1cd188d7..83b506020718980a69e5285a8c86496c14a2d050 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -521,13 +521,11 @@ int orangefs_bufmap_copy_from_iovec(struct iov_iter *iter, size_t n = size; if (n > PAGE_SIZE) n = PAGE_SIZE; - n = copy_page_from_iter(page, 0, n, iter); - if (!n) + if (copy_page_from_iter(page, 0, n, iter) != n) return -EFAULT; size -= n; } return 0; - } /* diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 791912da97d7487d99c618ce8ac6f5fbe3cd11fb..716ed337f1663ddfaf0729c5d8275b0366308f19 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -440,6 +440,9 @@ static ssize_t orangefs_debug_write(struct file *file, "orangefs_debug_write: %pD\n", file); + if (count == 0) + return 0; + /* * Thwart users who try to jamb a ridiculous number * of bytes into the debug file... diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h index f380f9ed1b286a7eedfb41fa5f7f8f9259abfdf3..efe08c763e566bffc500439645757966c65cc93c 100644 --- a/fs/orangefs/orangefs-dev-proto.h +++ b/fs/orangefs/orangefs-dev-proto.h @@ -52,12 +52,7 @@ */ #define ORANGEFS_MAX_DEBUG_STRING_LEN 0x00000800 -/* - * The maximum number of directory entries in a single request is 96. - * XXX: Why can this not be higher. 
The client-side code can handle up to 512. - * XXX: What happens if we expect more than the client can return? - */ -#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 96 +#define ORANGEFS_MAX_DIRENT_COUNT_READDIR 512 #include "upcall.h" #include "downcall.h" diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 8afac46fcc87a1e1d3ea8c658e0ae60c336c3d5d..ea0ce507a6ab641d17c1983b898fb2f12a2d0ed2 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -215,6 +215,7 @@ struct orangefs_inode_s { unsigned long pinode_flags; unsigned long getattr_time; + u32 getattr_mask; }; #define P_ATIME_FLAG 0 @@ -340,11 +341,6 @@ static inline struct orangefs_khandle *get_khandle_from_ino(struct inode *inode) return &(ORANGEFS_I(inode)->refn.khandle); } -static inline __s32 get_fsid_from_ino(struct inode *inode) -{ - return ORANGEFS_I(inode)->refn.fs_id; -} - static inline ino_t get_ino_from_khandle(struct inode *inode) { struct orangefs_khandle *khandle; @@ -500,7 +496,8 @@ int orangefs_inode_setxattr(struct inode *inode, size_t size, int flags); -int orangefs_inode_getattr(struct inode *inode, int new, int bypass); +int orangefs_inode_getattr(struct inode *inode, int new, int bypass, + u32 request_mask); int orangefs_inode_check_changed(struct inode *inode); diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c index 9b96b99539d62501718bd752f26d059900095864..aab6f1842963d3d0e5891900f46fc9eb570e9a4f 100644 --- a/fs/orangefs/orangefs-utils.c +++ b/fs/orangefs/orangefs-utils.c @@ -251,7 +251,8 @@ static int orangefs_inode_is_stale(struct inode *inode, int new, return 0; } -int orangefs_inode_getattr(struct inode *inode, int new, int bypass) +int orangefs_inode_getattr(struct inode *inode, int new, int bypass, + u32 request_mask) { struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode); struct orangefs_kernel_op_s *new_op; @@ -262,7 +263,13 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) get_khandle_from_ino(inode)); if (!new && !bypass) { - if (time_before(jiffies, orangefs_inode->getattr_time)) + /* + * Must have all the attributes in the mask and be within cache + * time. + */ + if ((request_mask & orangefs_inode->getattr_mask) == + request_mask && + time_before(jiffies, orangefs_inode->getattr_time)) return 0; } @@ -270,7 +277,15 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) if (!new_op) return -ENOMEM; new_op->upcall.req.getattr.refn = orangefs_inode->refn; - new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT; + /* + * Size is the hardest attribute to get. The incremental cost of any + * other attribute is essentially zero. 
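The cache test above boils down to a subset check plus a timeout. A minimal sketch, with invented names standing in for the jiffies-based kernel test:

	#include <stdbool.h>
	#include <stdint.h>

	static bool getattr_cache_ok(uint32_t request_mask, uint32_t cached_mask,
				     unsigned long now, unsigned long expires)
	{
		/* Every requested attribute bit must be in the cached mask... */
		if ((request_mask & cached_mask) != request_mask)
			return false;
		/* ...and the cached copy must still be within its lifetime. */
		return now < expires;
	}

So a request that skipped STATX_SIZE can be served from a cache entry written by an earlier size-less getattr, while the first request that does include STATX_SIZE forces a fresh round trip to the server.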
+ */ + if (request_mask & STATX_SIZE || new) + new_op->upcall.req.getattr.mask = ORANGEFS_ATTR_SYS_ALL_NOHINT; + else + new_op->upcall.req.getattr.mask = + ORANGEFS_ATTR_SYS_ALL_NOHINT & ~ORANGEFS_ATTR_SYS_SIZE; ret = service_operation(new_op, __func__, get_interruptible_flag(inode)); @@ -291,25 +306,29 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) case S_IFREG: inode->i_flags = orangefs_inode_flags(&new_op-> downcall.resp.getattr.attributes); - inode_size = (loff_t)new_op-> - downcall.resp.getattr.attributes.size; - rounded_up_size = - (inode_size + (4096 - (inode_size % 4096))); - inode->i_size = inode_size; - orangefs_inode->blksize = - new_op->downcall.resp.getattr.attributes.blksize; - spin_lock(&inode->i_lock); - inode->i_bytes = inode_size; - inode->i_blocks = - (unsigned long)(rounded_up_size / 512); - spin_unlock(&inode->i_lock); + if (request_mask & STATX_SIZE || new) { + inode_size = (loff_t)new_op-> + downcall.resp.getattr.attributes.size; + rounded_up_size = + (inode_size + (4096 - (inode_size % 4096))); + inode->i_size = inode_size; + orangefs_inode->blksize = + new_op->downcall.resp.getattr.attributes.blksize; + spin_lock(&inode->i_lock); + inode->i_bytes = inode_size; + inode->i_blocks = + (unsigned long)(rounded_up_size / 512); + spin_unlock(&inode->i_lock); + } break; case S_IFDIR: - inode->i_size = PAGE_SIZE; - orangefs_inode->blksize = i_blocksize(inode); - spin_lock(&inode->i_lock); - inode_set_bytes(inode, inode->i_size); - spin_unlock(&inode->i_lock); + if (request_mask & STATX_SIZE || new) { + inode->i_size = PAGE_SIZE; + orangefs_inode->blksize = i_blocksize(inode); + spin_lock(&inode->i_lock); + inode_set_bytes(inode, inode->i_size); + spin_unlock(&inode->i_lock); + } set_nlink(inode, 1); break; case S_IFLNK: @@ -349,6 +368,10 @@ int orangefs_inode_getattr(struct inode *inode, int new, int bypass) orangefs_inode->getattr_time = jiffies + orangefs_getattr_timeout_msecs*HZ/1000; + if (request_mask & STATX_SIZE || new) + orangefs_inode->getattr_mask = STATX_BASIC_STATS; + else + orangefs_inode->getattr_mask = STATX_BASIC_STATS & ~STATX_SIZE; ret = 0; out: op_release(new_op); @@ -500,41 +523,6 @@ int orangefs_flush_inode(struct inode *inode) return ret; } -int orangefs_unmount_sb(struct super_block *sb) -{ - int ret = -EINVAL; - struct orangefs_kernel_op_s *new_op = NULL; - - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_unmount_sb called on sb %p\n", - sb); - - new_op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT); - if (!new_op) - return -ENOMEM; - new_op->upcall.req.fs_umount.id = ORANGEFS_SB(sb)->id; - new_op->upcall.req.fs_umount.fs_id = ORANGEFS_SB(sb)->fs_id; - strncpy(new_op->upcall.req.fs_umount.orangefs_config_server, - ORANGEFS_SB(sb)->devname, - ORANGEFS_MAX_SERVER_ADDR_LEN); - - gossip_debug(GOSSIP_UTILS_DEBUG, - "Attempting ORANGEFS Unmount via host %s\n", - new_op->upcall.req.fs_umount.orangefs_config_server); - - ret = service_operation(new_op, "orangefs_fs_umount", 0); - - gossip_debug(GOSSIP_UTILS_DEBUG, - "orangefs_unmount: got return value of %d\n", ret); - if (ret) - sb = ERR_PTR(ret); - else - ORANGEFS_SB(sb)->mount_pending = 1; - - op_release(new_op); - return ret; -} - void orangefs_make_bad_inode(struct inode *inode) { if (is_root_handle(inode)) { diff --git a/fs/orangefs/protocol.h b/fs/orangefs/protocol.h index 971307ad69be4b08cd7be8df8b5a213a314a25b7..48bcc1bbe415f1a4f2394fb1b72ab704b2d0a1d3 100644 --- a/fs/orangefs/protocol.h +++ b/fs/orangefs/protocol.h @@ -138,13 +138,8 @@ typedef __s64 ORANGEFS_offset; #define 
ORANGEFS_G_SGID (1 << 10) #define ORANGEFS_U_SUID (1 << 11) -/* definition taken from stdint.h */ -#define INT32_MAX (2147483647) -#define ORANGEFS_ITERATE_START (INT32_MAX - 1) -#define ORANGEFS_ITERATE_END (INT32_MAX - 2) -#define ORANGEFS_ITERATE_NEXT (INT32_MAX - 3) -#define ORANGEFS_READDIR_START ORANGEFS_ITERATE_START -#define ORANGEFS_READDIR_END ORANGEFS_ITERATE_END +#define ORANGEFS_ITERATE_START 2147483646 +#define ORANGEFS_ITERATE_END 2147483645 #define ORANGEFS_IMMUTABLE_FL FS_IMMUTABLE_FL #define ORANGEFS_APPEND_FL FS_APPEND_FL #define ORANGEFS_NOATIME_FL FS_NOATIME_FL diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index 629d8c917fa679886715360fcfe8f6cee1b5a37f..5c7c273e17ec3b32c3c87d313e2c68eb82bdac84 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -376,6 +376,25 @@ static const struct export_operations orangefs_export_ops = { .fh_to_dentry = orangefs_fh_to_dentry, }; +static int orangefs_unmount(int id, __s32 fs_id, const char *devname) +{ + struct orangefs_kernel_op_s *op; + int r; + op = op_alloc(ORANGEFS_VFS_OP_FS_UMOUNT); + if (!op) + return -ENOMEM; + op->upcall.req.fs_umount.id = id; + op->upcall.req.fs_umount.fs_id = fs_id; + strncpy(op->upcall.req.fs_umount.orangefs_config_server, + devname, ORANGEFS_MAX_SERVER_ADDR_LEN); + r = service_operation(op, "orangefs_fs_umount", 0); + /* Not much to do about an error here. */ + if (r) + gossip_err("orangefs_unmount: service_operation %d\n", r); + op_release(op); + return r; +} + static int orangefs_fill_sb(struct super_block *sb, struct orangefs_fs_mount_response *fs_mount, void *data, int silent) @@ -484,6 +503,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst, if (IS_ERR(sb)) { d = ERR_CAST(sb); + orangefs_unmount(new_op->downcall.resp.fs_mount.id, + new_op->downcall.resp.fs_mount.fs_id, devname); goto free_op; } @@ -539,6 +560,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, free_sb_and_op: /* Will call orangefs_kill_sb with sb not in list. */ ORANGEFS_SB(sb)->no_list = 1; + /* ORANGEFS_VFS_OP_FS_UMOUNT is done by orangefs_kill_sb. */ deactivate_locked_super(sb); free_op: gossip_err("orangefs_mount: mount request failed with %d\n", ret); @@ -554,6 +576,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, void orangefs_kill_sb(struct super_block *sb) { + int r; gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_kill_sb: called\n"); /* provided sb cleanup */ @@ -563,7 +586,10 @@ void orangefs_kill_sb(struct super_block *sb) * issue the unmount to userspace to tell it to remove the * dynamic mount info it has for this superblock */ - orangefs_unmount_sb(sb); + r = orangefs_unmount(ORANGEFS_SB(sb)->id, ORANGEFS_SB(sb)->fs_id, + ORANGEFS_SB(sb)->devname); + if (!r) + ORANGEFS_SB(sb)->mount_pending = 1; if (!ORANGEFS_SB(sb)->no_list) { /* remove the sb from our list of orangefs specific sb's */ diff --git a/fs/orangefs/waitqueue.c b/fs/orangefs/waitqueue.c index abcfa3fa9992957006461c202c54380018a17787..61e2ca7fec553d3a694cf80d5e7083da98853015 100644 --- a/fs/orangefs/waitqueue.c +++ b/fs/orangefs/waitqueue.c @@ -124,7 +124,14 @@ int service_operation(struct orangefs_kernel_op_s *op, gossip_debug(GOSSIP_WAIT_DEBUG, "%s:client core is NOT in service.\n", __func__); - timeout = op_timeout_secs * HZ; + /* + * Don't wait for the userspace component to return if + * the filesystem is being umounted anyway. 
+ */ + if (op->upcall.type == ORANGEFS_VFS_OP_FS_UMOUNT) + timeout = 0; + else + timeout = op_timeout_secs * HZ; } spin_unlock(&orangefs_request_list_lock); diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 74a81b1daaac8850b68f37bc46110657082dfcc1..237c9c04dc3b49ffcaae3de63a752dbb4891fd0c 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -76,11 +76,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, if (S_ISLNK(inode->i_mode)) return -EOPNOTSUPP; - if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err("Invalid key length (%d)\n", - (int)strlen(name)); + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) return -EINVAL; - } fsuid = from_kuid(&init_user_ns, current_fsuid()); fsgid = from_kgid(&init_user_ns, current_fsgid()); @@ -172,6 +169,9 @@ static int orangefs_inode_removexattr(struct inode *inode, const char *name, struct orangefs_kernel_op_s *new_op = NULL; int ret = -ENOMEM; + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) + return -EINVAL; + down_write(&orangefs_inode->xattr_sem); new_op = op_alloc(ORANGEFS_VFS_OP_REMOVEXATTR); if (!new_op) @@ -231,23 +231,13 @@ int orangefs_inode_setxattr(struct inode *inode, const char *name, "%s: name %s, buffer_size %zd\n", __func__, name, size); - if (size >= ORANGEFS_MAX_XATTR_VALUELEN || - flags < 0) { - gossip_err("orangefs_inode_setxattr: bogus values of size(%d), flags(%d)\n", - (int)size, - flags); + if (size > ORANGEFS_MAX_XATTR_VALUELEN) + return -EINVAL; + if (strlen(name) > ORANGEFS_MAX_XATTR_NAMELEN) return -EINVAL; - } internal_flag = convert_to_internal_xattr_flags(flags); - if (strlen(name) >= ORANGEFS_MAX_XATTR_NAMELEN) { - gossip_err - ("orangefs_inode_setxattr: bogus key size (%d)\n", - (int)(strlen(name))); - return -EINVAL; - } - /* This is equivalent to a removexattr */ if (size == 0 && value == NULL) { gossip_debug(GOSSIP_XATTR_DEBUG, @@ -358,7 +348,7 @@ ssize_t orangefs_listxattr(struct dentry *dentry, char *buffer, size_t size) returned_count = new_op->downcall.resp.listxattr.returned_count; if (returned_count < 0 || - returned_count >= ORANGEFS_MAX_XATTR_LISTLEN) { + returned_count > ORANGEFS_MAX_XATTR_LISTLEN) { gossip_err("%s: impossible value for returned_count:%d:\n", __func__, returned_count); diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 906ea6c93260179c9c4947abe97c35750162c3e3..9008ab9fbd2ebe89d419c249455eb740c48a9eb1 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "overlayfs.h" #include "ovl_entry.h" @@ -232,6 +233,79 @@ int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) return err; } +static struct ovl_fh *ovl_encode_fh(struct dentry *lower, uuid_be *uuid) +{ + struct ovl_fh *fh; + int fh_type, fh_len, dwords; + void *buf; + int buflen = MAX_HANDLE_SZ; + + buf = kmalloc(buflen, GFP_TEMPORARY); + if (!buf) + return ERR_PTR(-ENOMEM); + + /* + * We encode a non-connectable file handle for non-dir, because we + * only need to find the lower inode number and we don't want to pay + * the price of reconnecting the dentry.
+ */ + dwords = buflen >> 2; + fh_type = exportfs_encode_fh(lower, buf, &dwords, 0); + buflen = (dwords << 2); + + fh = ERR_PTR(-EIO); + if (WARN_ON(fh_type < 0) || + WARN_ON(buflen > MAX_HANDLE_SZ) || + WARN_ON(fh_type == FILEID_INVALID)) + goto out; + + BUILD_BUG_ON(MAX_HANDLE_SZ + offsetof(struct ovl_fh, fid) > 255); + fh_len = offsetof(struct ovl_fh, fid) + buflen; + fh = kmalloc(fh_len, GFP_KERNEL); + if (!fh) { + fh = ERR_PTR(-ENOMEM); + goto out; + } + + fh->version = OVL_FH_VERSION; + fh->magic = OVL_FH_MAGIC; + fh->type = fh_type; + fh->flags = OVL_FH_FLAG_CPU_ENDIAN; + fh->len = fh_len; + fh->uuid = *uuid; + memcpy(fh->fid, buf, buflen); + +out: + kfree(buf); + return fh; +} + +static int ovl_set_origin(struct dentry *dentry, struct dentry *lower, + struct dentry *upper) +{ + struct super_block *sb = lower->d_sb; + uuid_be *uuid = (uuid_be *) &sb->s_uuid; + const struct ovl_fh *fh = NULL; + int err; + + /* + * When the lower layer doesn't support export operations, store a 'null' fh, + * so we can use the overlay.origin xattr to distinguish between a copy + * up and a pure upper inode. + */ + if (sb->s_export_op && sb->s_export_op->fh_to_dentry && + uuid_be_cmp(*uuid, NULL_UUID_BE)) { + fh = ovl_encode_fh(lower, uuid); + if (IS_ERR(fh)) + return PTR_ERR(fh); + } + + err = ovl_do_setxattr(upper, OVL_XATTR_ORIGIN, fh, fh ? fh->len : 0, 0); + kfree(fh); + + return err; +} + static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, struct dentry *dentry, struct path *lowerpath, struct kstat *stat, const char *link, @@ -316,6 +390,14 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (err) goto out_cleanup; + /* + * Store identifier of lower inode in upper inode xattr to + * allow lookup of the copy up origin inode. + */ + err = ovl_set_origin(dentry, lowerpath->dentry, temp); + if (err) + goto out_cleanup; + if (tmpfile) err = ovl_do_link(temp, udir, upper, true); else diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 6515796460dfe170fb33b109feb09c9ac0b17519..723b98b9069876d1656b74735dabcaf01484e26d 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -138,36 +138,6 @@ static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) return err; } -static int ovl_dir_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) -{ - struct dentry *dentry = path->dentry; - int err; - enum ovl_path_type type; - struct path realpath; - const struct cred *old_cred; - - type = ovl_path_real(dentry, &realpath); - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getattr(&realpath, stat, request_mask, flags); - revert_creds(old_cred); - if (err) - return err; - - stat->dev = dentry->d_sb->s_dev; - stat->ino = dentry->d_inode->i_ino; - - /* - * It's probably not worth it to count subdirs to get the - * correct link count. nlink=1 seems to pacify 'find' and - * other utilities.
- */ - if (OVL_TYPE_MERGE(type)) - stat->nlink = 1; - - return 0; -} - /* Common operations required to be done after creation of file on upper */ static void ovl_instantiate(struct dentry *dentry, struct inode *inode, struct dentry *newdentry, bool hardlink) @@ -182,6 +152,9 @@ static void ovl_instantiate(struct dentry *dentry, struct inode *inode, inc_nlink(inode); } d_instantiate(dentry, inode); + /* Force lookup of new upper hardlink to find its lower */ + if (hardlink) + d_drop(dentry); } static bool ovl_type_merge(struct dentry *dentry) @@ -210,7 +183,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, if (err) goto out_dput; - if (ovl_type_merge(dentry->d_parent)) { + if (ovl_type_merge(dentry->d_parent) && d_is_dir(newdentry)) { /* Setting opaque here is just an optimization, allow to fail */ ovl_set_opaque(dentry, newdentry); } @@ -1070,7 +1043,7 @@ const struct inode_operations ovl_dir_inode_operations = { .create = ovl_create, .mknod = ovl_mknod, .permission = ovl_permission, - .getattr = ovl_dir_getattr, + .getattr = ovl_getattr, .listxattr = ovl_listxattr, .get_acl = ovl_get_acl, .update_time = ovl_update_time, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index f8fe6bf2036df3bd531132877101cb979c1feed0..ad9547f82da57fa4bd51eb5738cb8f02235e4455 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -57,18 +57,78 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) return err; } -static int ovl_getattr(const struct path *path, struct kstat *stat, - u32 request_mask, unsigned int flags) +int ovl_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags) { struct dentry *dentry = path->dentry; + enum ovl_path_type type; struct path realpath; const struct cred *old_cred; + bool is_dir = S_ISDIR(dentry->d_inode->i_mode); int err; - ovl_path_real(dentry, &realpath); + type = ovl_path_real(dentry, &realpath); old_cred = ovl_override_creds(dentry->d_sb); err = vfs_getattr(&realpath, stat, request_mask, flags); + if (err) + goto out; + + /* + * When all layers are on the same fs, all real inode numbers are + * unique, so we use the overlay st_dev, which is friendly to du -x. + * + * We also use st_ino of the copy up origin, if we know it. + * This guarantees constant st_dev/st_ino across copy up. + * + * If the filesystem supports NFS export ops, this also guarantees + * persistent st_ino across mount cycles. + */ + if (ovl_same_sb(dentry->d_sb)) { + if (OVL_TYPE_ORIGIN(type)) { + struct kstat lowerstat; + u32 lowermask = STATX_INO | (!is_dir ? STATX_NLINK : 0); + + ovl_path_lower(dentry, &realpath); + err = vfs_getattr(&realpath, &lowerstat, + lowermask, flags); + if (err) + goto out; + + WARN_ON_ONCE(stat->dev != lowerstat.dev); + /* + * Lower hardlinks are broken on copy up to different + * upper files, so we cannot use the lower origin st_ino + * for those different files, even for the same fs case. + */ + if (is_dir || lowerstat.nlink == 1) + stat->ino = lowerstat.ino; + } + stat->dev = dentry->d_sb->s_dev; + } else if (is_dir) { + /* + * If not all layers are on the same fs, the pair {real st_ino; + * overlay st_dev} is not unique, so use the non-persistent + * overlay st_ino. + * + * Always use the overlay st_dev for directories, so 'find + * -xdev' will scan the entire overlay mount and won't cross the + * overlay mount boundaries.
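Condensing the two comment blocks above, the st_dev/st_ino selection can be summarized by the following sketch (illustrative only; the names are invented and the lower-hardlink nlink == 1 refinement is omitted):

	#include <stdbool.h>

	struct ids { unsigned long dev; unsigned long ino; };

	static struct ids ovl_stat_ids(bool same_fs, bool has_origin, bool is_dir,
				       struct ids ovl, struct ids real,
				       unsigned long origin_ino)
	{
		if (same_fs)
			/* Real inos are unique; the origin ino stays
			 * constant across copy up. */
			return (struct ids){ ovl.dev,
					     has_origin ? origin_ino : real.ino };
		if (is_dir)
			/* Keep 'find -xdev' and 'du -x' inside the overlay. */
			return ovl;
		/* Non-dirs on mixed layers keep their real identity. */
		return real;
	}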
+ */ + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + } + + /* + * It's probably not worth it to count subdirs to get the + * correct link count. nlink=1 seems to pacify 'find' and + * other utilities. + */ + if (is_dir && OVL_TYPE_MERGE(type)) + stat->nlink = 1; + +out: revert_creds(old_cred); + return err; } @@ -303,6 +363,41 @@ static const struct inode_operations ovl_symlink_inode_operations = { .update_time = ovl_update_time, }; +/* + * It is possible to stack an overlayfs instance on top of another + * overlayfs instance as a lower layer. We need to annotate the + * stackable i_mutex locks according to the stack level of the super + * block instance. An overlayfs instance can never be in stack + * depth 0 (there is always a real fs below it). An overlayfs + * inode lock will use the lockdep annotation ovl_i_mutex_key[depth]. + * + * For example, here is a snip from /proc/lockdep_chains after + * dir_iterate of nested overlayfs: + * + * [...] &ovl_i_mutex_dir_key[depth] (stack_depth=2) + * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1) + * [...] &type->i_mutex_dir_key (stack_depth=0) + */ +#define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH + +static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) +{ +#ifdef CONFIG_LOCKDEP + static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING]; + static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING]; + + int depth = inode->i_sb->s_stack_depth - 1; + + if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING)) + depth = 0; + + if (S_ISDIR(inode->i_mode)) + lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]); + else + lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]); +#endif +} + static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_ino = get_next_ino(); @@ -312,6 +407,8 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; #endif + ovl_lockdep_annotate_inode_mutex_key(inode); + switch (mode & S_IFMT) { case S_IFREG: inode->i_op = &ovl_file_inode_operations; diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index b8b077821fb03bea9d63b3bf3508039836409e1b..bad0f665a63521efde00b4c488d4ed2ba85a5b75 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include "overlayfs.h" #include "ovl_entry.h" @@ -81,6 +83,90 @@ static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, goto err_free; } +static int ovl_acceptable(void *ctx, struct dentry *dentry) +{ + return 1; +} + +static struct dentry *ovl_get_origin(struct dentry *dentry, + struct vfsmount *mnt) +{ + int res; + struct ovl_fh *fh = NULL; + struct dentry *origin = NULL; + int bytes; + + res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, NULL, 0); + if (res < 0) { + if (res == -ENODATA || res == -EOPNOTSUPP) + return NULL; + goto fail; + } + /* Zero size value means "copied up but origin unknown" */ + if (res == 0) + return NULL; + + fh = kzalloc(res, GFP_TEMPORARY); + if (!fh) + return ERR_PTR(-ENOMEM); + + res = vfs_getxattr(dentry, OVL_XATTR_ORIGIN, fh, res); + if (res < 0) + goto fail; + + if (res < sizeof(struct ovl_fh) || res < fh->len) + goto invalid; + + if (fh->magic != OVL_FH_MAGIC) + goto invalid; + + /* Treat larger version and unknown flags as "origin unknown" */ + if (fh->version > OVL_FH_VERSION || fh->flags & ~OVL_FH_FLAG_ALL) + goto out; + + /* Treat endianness mismatch as "origin unknown" */ + if
(!(fh->flags & OVL_FH_FLAG_ANY_ENDIAN) && + (fh->flags & OVL_FH_FLAG_BIG_ENDIAN) != OVL_FH_FLAG_CPU_ENDIAN) + goto out; + + bytes = (fh->len - offsetof(struct ovl_fh, fid)); + + /* + * Make sure that the stored uuid matches the uuid of the lower + * layer where the file handle will be decoded. + */ + if (uuid_be_cmp(fh->uuid, *(uuid_be *) &mnt->mnt_sb->s_uuid)) + goto out; + + origin = exportfs_decode_fh(mnt, (struct fid *)fh->fid, + bytes >> 2, (int)fh->type, + ovl_acceptable, NULL); + if (IS_ERR(origin)) { + /* Treat stale file handle as "origin unknown" */ + if (origin == ERR_PTR(-ESTALE)) + origin = NULL; + goto out; + } + + if (ovl_dentry_weird(origin) || + ((d_inode(origin)->i_mode ^ d_inode(dentry)->i_mode) & S_IFMT)) { + dput(origin); + origin = NULL; + goto invalid; + } + +out: + kfree(fh); + return origin; + +fail: + pr_warn_ratelimited("overlayfs: failed to get origin (%i)\n", res); + goto out; +invalid: + pr_warn_ratelimited("overlayfs: invalid origin (%*phN)\n", res, fh); + goto out; +} + static bool ovl_is_opaquedir(struct dentry *dentry) { int res; @@ -192,6 +278,45 @@ static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, return 0; } + +static int ovl_check_origin(struct dentry *dentry, struct dentry *upperdentry, + struct path **stackp, unsigned int *ctrp) +{ + struct super_block *same_sb = ovl_same_sb(dentry->d_sb); + struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; + struct vfsmount *mnt; + struct dentry *origin; + + if (!same_sb || !roe->numlower) + return 0; + + /* + * Since all layers are on the same fs, we use the first layer for + * decoding the file handle. We may get a disconnected dentry, + * which is fine, because we only need to hold the origin inode in + * cache and use its inode number. We may even get a connected dentry + * that is not under the first layer's root. That is also fine for + * using its inode number - it's the same as if we held a reference + * to a dentry in the first layer that was moved under us. + */ + mnt = roe->lowerstack[0].mnt; + + origin = ovl_get_origin(upperdentry, mnt); + if (IS_ERR_OR_NULL(origin)) + return PTR_ERR(origin); + + BUG_ON(*stackp || *ctrp); + *stackp = kmalloc(sizeof(struct path), GFP_TEMPORARY); + if (!*stackp) { + dput(origin); + return -ENOMEM; + } + **stackp = (struct path) { .dentry = origin, .mnt = mnt }; + *ctrp = 1; + + return 0; +} + /* * Returns next layer in stack starting from top. * Returns -1 if this is the last layer.
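Because the origin identifier is stored in an ordinary xattr, it can be inspected from userspace. A rough sketch, not part of the patch: it assumes the struct ovl_fh layout defined in overlayfs.h further below, and reading trusted.* xattrs requires CAP_SYS_ADMIN:

	#include <stdio.h>
	#include <sys/xattr.h>

	int main(int argc, char **argv)
	{
		unsigned char buf[255];
		ssize_t n;

		if (argc != 2)
			return 1;
		n = getxattr(argv[1], "trusted.overlay.origin", buf, sizeof(buf));
		if (n < 0) {
			perror("getxattr");
			return 1;
		}
		if (n == 0) {
			/* Zero size means "copied up but origin unknown". */
			puts("null fh");
			return 0;
		}
		/* Header is version, magic, len, flags, type; then the uuid. */
		if (n < 5 || buf[0] != 0 || buf[1] != 0xfb || buf[2] > n) {
			puts("invalid");
			return 1;
		}
		printf("version=%d magic=0x%x len=%d flags=0x%x type=%d\n",
		       buf[0], buf[1], buf[2], buf[3], buf[4]);
		return 0;
	}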
@@ -220,6 +345,7 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, const struct cred *old_cred; struct ovl_fs *ofs = dentry->d_sb->s_fs_info; struct ovl_entry *poe = dentry->d_parent->d_fsdata; + struct ovl_entry *roe = dentry->d_sb->s_root->d_fsdata; struct path *stack = NULL; struct dentry *upperdir, *upperdentry = NULL; unsigned int ctr = 0; @@ -253,13 +379,20 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, err = -EREMOTE; goto out; } + if (upperdentry && !d.is_dir) { + BUG_ON(!d.stop || d.redirect); + err = ovl_check_origin(dentry, upperdentry, + &stack, &ctr); + if (err) + goto out; + } if (d.redirect) { upperredirect = kstrdup(d.redirect, GFP_KERNEL); if (!upperredirect) goto out_put_upper; if (d.redirect[0] == '/') - poe = dentry->d_sb->s_root->d_fsdata; + poe = roe; } upperopaque = d.opaque; } @@ -290,10 +423,8 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, if (d.stop) break; - if (d.redirect && - d.redirect[0] == '/' && - poe != dentry->d_sb->s_root->d_fsdata) { - poe = dentry->d_sb->s_root->d_fsdata; + if (d.redirect && d.redirect[0] == '/' && poe != roe) { + poe = roe; /* Find the current layer on the root dentry */ for (i = 0; i < poe->numlower; i++) diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 741dc0b6931fe90fc62874989c93e659f7e7a4ef..caa36cb9c46de9838805dc40e21672217407d40e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -8,18 +8,56 @@ */ #include +#include enum ovl_path_type { __OVL_PATH_UPPER = (1 << 0), __OVL_PATH_MERGE = (1 << 1), + __OVL_PATH_ORIGIN = (1 << 2), }; #define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER) #define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) +#define OVL_TYPE_ORIGIN(type) ((type) & __OVL_PATH_ORIGIN) #define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay." 
#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque" #define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect" +#define OVL_XATTR_ORIGIN OVL_XATTR_PREFIX "origin" + +/* + * The tuple (fh,uuid) is a universal unique identifier for a copy up origin, + * where: + * origin.fh - exported file handle of the lower file + * origin.uuid - uuid of the lower filesystem + */ +#define OVL_FH_VERSION 0 +#define OVL_FH_MAGIC 0xfb + +/* CPU byte order required for fid decoding: */ +#define OVL_FH_FLAG_BIG_ENDIAN (1 << 0) +#define OVL_FH_FLAG_ANY_ENDIAN (1 << 1) + +#define OVL_FH_FLAG_ALL (OVL_FH_FLAG_BIG_ENDIAN | OVL_FH_FLAG_ANY_ENDIAN) + +#if defined(__LITTLE_ENDIAN) +#define OVL_FH_FLAG_CPU_ENDIAN 0 +#elif defined(__BIG_ENDIAN) +#define OVL_FH_FLAG_CPU_ENDIAN OVL_FH_FLAG_BIG_ENDIAN +#else +#error Endianness not defined +#endif + +/* On-disk and in-memory format for redirect by file handle */ +struct ovl_fh { + u8 version; /* 0 */ + u8 magic; /* 0xfb */ + u8 len; /* size of this header + size of fid */ + u8 flags; /* OVL_FH_FLAG_* */ + u8 type; /* fid_type of fid */ + uuid_be uuid; /* uuid of filesystem */ + u8 fid[0]; /* file identifier */ +} __packed; #define OVL_ISUPPER_MASK 1UL @@ -151,6 +189,7 @@ int ovl_want_write(struct dentry *dentry); void ovl_drop_write(struct dentry *dentry); struct dentry *ovl_workdir(struct dentry *dentry); const struct cred *ovl_override_creds(struct super_block *sb); +struct super_block *ovl_same_sb(struct super_block *sb); struct ovl_entry *ovl_alloc_entry(unsigned int numlower); bool ovl_dentry_remote(struct dentry *dentry); bool ovl_dentry_weird(struct dentry *dentry); @@ -197,6 +236,8 @@ void ovl_workdir_cleanup(struct inode *dir, struct vfsmount *mnt, /* inode.c */ int ovl_setattr(struct dentry *dentry, struct iattr *attr); +int ovl_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int flags); int ovl_permission(struct inode *inode, int mask); int ovl_xattr_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 59614faa14c315fb8f6e5998f91bc7061f818bab..b2023ddb85323725b8bbfa687f31fc854c1ba5e9 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -29,6 +29,8 @@ struct ovl_fs { const struct cred *creator_cred; bool tmpfile; wait_queue_head_t copyup_wq; + /* sb common to all layers */ + struct super_block *same_sb; }; /* private information held for every overlayfs dentry */ diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index c9e70d39c1ea1cafd730be5caaea6924a29e7c58..9828b7de89992e64a1900a277b58423dde7b992a 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -49,11 +49,28 @@ static void ovl_dentry_release(struct dentry *dentry) } } +static int ovl_check_append_only(struct inode *inode, int flag) +{ + /* + * This test was moot in vfs may_open() because overlay inode does + * not have the S_APPEND flag, so re-check on real upper inode + */ + if (IS_APPEND(inode)) { + if ((flag & O_ACCMODE) != O_RDONLY && !(flag & O_APPEND)) + return -EPERM; + if (flag & O_TRUNC) + return -EPERM; + } + + return 0; +} + static struct dentry *ovl_d_real(struct dentry *dentry, const struct inode *inode, unsigned int open_flags) { struct dentry *real; + int err; if (!d_is_reg(dentry)) { if (!inode || inode == d_inode(dentry)) @@ -65,15 +82,20 @@ static struct dentry *ovl_d_real(struct dentry *dentry, return dentry; if (open_flags) { - int err = ovl_open_maybe_copy_up(dentry, open_flags); - + err = 
ovl_open_maybe_copy_up(dentry, open_flags); if (err) return ERR_PTR(err); } real = ovl_dentry_upper(dentry); - if (real && (!inode || inode == d_inode(real))) + if (real && (!inode || inode == d_inode(real))) { + if (!inode) { + err = ovl_check_append_only(d_inode(real), open_flags); + if (err) + return ERR_PTR(err); + } return real; + } real = ovl_dentry_lower(dentry); if (!real) @@ -709,8 +731,8 @@ static const struct xattr_handler *ovl_xattr_handlers[] = { static int ovl_fill_super(struct super_block *sb, void *data, int silent) { - struct path upperpath = { NULL, NULL }; - struct path workpath = { NULL, NULL }; + struct path upperpath = { }; + struct path workpath = { }; struct dentry *root_dentry; struct inode *realinode; struct ovl_entry *oe; @@ -892,11 +914,19 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) ufs->lower_mnt[ufs->numlower] = mnt; ufs->numlower++; + + /* Check if all lower layers are on same sb */ + if (i == 0) + ufs->same_sb = mnt->mnt_sb; + else if (ufs->same_sb != mnt->mnt_sb) + ufs->same_sb = NULL; } /* If the upper fs is nonexistent, we mark overlayfs r/o too */ if (!ufs->upper_mnt) sb->s_flags |= MS_RDONLY; + else if (ufs->upper_mnt->mnt_sb != ufs->same_sb) + ufs->same_sb = NULL; if (remote) sb->s_d_op = &ovl_reval_dentry_operations; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 6e610a205e1556477ba80e512f1243629c193141..cfdea47313a10e22a9c06193e4cc422891badaae 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -40,6 +40,13 @@ const struct cred *ovl_override_creds(struct super_block *sb) return override_creds(ofs->creator_cred); } +struct super_block *ovl_same_sb(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->same_sb; +} + struct ovl_entry *ovl_alloc_entry(unsigned int numlower) { size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); @@ -75,11 +82,13 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) type = __OVL_PATH_UPPER; /* - * Non-dir dentry can hold lower dentry from previous - * location. + * Non-dir dentry can hold lower dentry of its copy up origin. */ - if (oe->numlower && d_is_dir(dentry)) - type |= __OVL_PATH_MERGE; + if (oe->numlower) { + type |= __OVL_PATH_ORIGIN; + if (d_is_dir(dentry)) + type |= __OVL_PATH_MERGE; + } } else { if (oe->numlower > 1) type |= __OVL_PATH_MERGE; @@ -100,7 +109,7 @@ void ovl_path_lower(struct dentry *dentry, struct path *path) { struct ovl_entry *oe = dentry->d_fsdata; - *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL }; + *path = oe->numlower ? oe->lowerstack[0] : (struct path) { }; } enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) diff --git a/fs/proc/base.c b/fs/proc/base.c index c87b6b9a8a76b0a11078a391b0f6e4772cd1941e..45f6bf68fff3ed30df0f85d98c0f644c320a9bf7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -821,10 +821,7 @@ static ssize_t mem_rw(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - /* Maybe we should limit FOLL_FORCE to actual ptrace users? */ - flags = FOLL_FORCE; - if (write) - flags |= FOLL_WRITE; + flags = write ? 
FOLL_WRITE : 0; while (count > 0) { int this_len = min_t(int, count, PAGE_SIZE); @@ -2834,6 +2831,15 @@ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, return err; } +#ifdef CONFIG_LIVEPATCH +static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + seq_printf(m, "%d\n", task->patch_state); + return 0; +} +#endif /* CONFIG_LIVEPATCH */ + /* * Thread groups */ @@ -2933,6 +2939,9 @@ static const struct pid_entry tgid_base_stuff[] = { REG("timers", S_IRUGO, proc_timers_operations), #endif REG("timerslack_ns", S_IRUGO|S_IWUGO, proc_pid_set_timerslack_ns_operations), +#ifdef CONFIG_LIVEPATCH + ONE("patch_state", S_IRUSR, proc_pid_patch_state), +#endif }; static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx) @@ -3315,6 +3324,9 @@ static const struct pid_entry tid_base_stuff[] = { REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif +#ifdef CONFIG_LIVEPATCH + ONE("patch_state", S_IRUSR, proc_pid_patch_state), +#endif }; static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx) diff --git a/fs/proc/generic.c b/fs/proc/generic.c index ee27feb34cf4d50fc867dc138198c2da9cd7d9ef..9425c0d97262b548d8c8fbf7ce222cbad6e1ef58 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -472,6 +472,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) ent->data = NULL; ent->proc_fops = NULL; ent->proc_iops = NULL; + parent->nlink++; if (proc_register(parent, ent) < 0) { kfree(ent); parent->nlink--; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2cc7a8030275c1bc98a0da9adb870b33d99f5433..e250910cffc8fd54af942e8009da413ce76093ac 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,7 +58,7 @@ static struct inode *proc_alloc_inode(struct super_block *sb) struct proc_inode *ei; struct inode *inode; - ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(proc_inode_cachep, GFP_KERNEL); if (!ei) return NULL; ei->pid = NULL; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 766f0c637ad1b429c83749ce01a48f56b5863899..3803b24ca220a4dc6cd02b4637e7576a071a9f98 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -23,6 +23,7 @@ static const struct proc_ns_operations *ns_entries[] = { #endif #ifdef CONFIG_PID_NS &pidns_operations, + &pidns_for_children_operations, #endif #ifdef CONFIG_USER_NS &userns_operations, diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index d04ea43499096e4fc02eec0f363683e3713f65ba..67985a7233c24023e9810e25057e2a28ae7874e0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -408,10 +408,6 @@ static void next_entry(struct ctl_table_header **phead, struct ctl_table **pentr *pentry = entry; } -void register_sysctl_root(struct ctl_table_root *root) -{ -} - /* * sysctl_perm does NOT grant the superuser all rights automatically, because * some sysctl variables are readonly even to root. 
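Note on the fs/proc hunks above: the new per-task /proc/<pid>/patch_state entry simply prints task->patch_state with "%d\n" and exists only when the kernel is built with CONFIG_LIVEPATCH. A small userspace reader is sketched below; interpreting the values as -1 undefined, 0 unpatched, 1 patched follows the KLP_UNDEFINED, KLP_UNPATCHED and KLP_PATCHED constants from include/linux/livepatch.h, which this hunk does not itself show, so treat that mapping as an assumption of the sketch.

#include <stdio.h>

/* Assumed value mapping (per include/linux/livepatch.h):
 *   -1  KLP_UNDEFINED - no live-patch transition state recorded
 *    0  KLP_UNPATCHED - task still runs the original functions
 *    1  KLP_PATCHED   - task has switched to the patched functions
 */
int main(int argc, char **argv)
{
	char path[64];
	int state;
	FILE *f;

	/* Default to the calling task when no pid is given. */
	snprintf(path, sizeof(path), "/proc/%s/patch_state",
		 argc > 1 ? argv[1] : "self");
	f = fopen(path, "r");
	if (!f) {
		perror(path);	/* absent unless CONFIG_LIVEPATCH=y */
		return 1;
	}
	if (fscanf(f, "%d", &state) == 1)
		printf("%s: patch_state=%d\n", path, state);
	fclose(f);
	return 0;
}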
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 312578089544dbd652aac303d0cbabee8fbcb968..f0c8b33d99b137e0c607448237216c2826df3fe8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -441,6 +441,7 @@ struct mem_size_stats { unsigned long private_dirty; unsigned long referenced; unsigned long anonymous; + unsigned long lazyfree; unsigned long anonymous_thp; unsigned long shmem_thp; unsigned long swap; @@ -457,8 +458,11 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, int i, nr = compound ? 1 << compound_order(page) : 1; unsigned long size = nr * PAGE_SIZE; - if (PageAnon(page)) + if (PageAnon(page)) { mss->anonymous += size; + if (!PageSwapBacked(page) && !dirty && !PageDirty(page)) + mss->lazyfree += size; + } mss->resident += size; /* Accumulate the size in pages that have been accessed. */ @@ -771,6 +775,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" "Anonymous: %8lu kB\n" + "LazyFree: %8lu kB\n" "AnonHugePages: %8lu kB\n" "ShmemPmdMapped: %8lu kB\n" "Shared_Hugetlb: %8lu kB\n" @@ -789,6 +794,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.private_dirty >> 10, mss.referenced >> 10, mss.anonymous >> 10, + mss.lazyfree >> 10, mss.anonymous_thp >> 10, mss.shmem_thp >> 10, mss.shared_hugetlb >> 10, diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index 899d0ba0bd6cca44593f25be751efbe52a507d31..06aab07b6bb71bdabd8fa794c39a7b11cb8cb0a0 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -37,6 +37,12 @@ static void notrace pstore_ftrace_call(unsigned long ip, { unsigned long flags; struct pstore_ftrace_record rec = {}; + struct pstore_record record = { + .type = PSTORE_TYPE_FTRACE, + .buf = (char *)&rec, + .size = sizeof(rec), + .psi = psinfo, + }; if (unlikely(oops_in_progress)) return; @@ -47,8 +53,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, rec.parent_ip = parent_ip; pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++); pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); - psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, - 0, sizeof(rec), psinfo); + psinfo->write(&record); local_irq_restore(flags); } @@ -117,7 +122,7 @@ void pstore_register_ftrace(void) { struct dentry *file; - if (!psinfo->write_buf) + if (!psinfo->write) return; pstore_ftrace_dir = debugfs_create_dir("pstore", NULL); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 57c0646479f51cc0baa9bd82ba3cbb5689f41696..792a4e5f9226d16fefa0b6fd47061a76c79416ba 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -47,12 +47,8 @@ static LIST_HEAD(allpstore); struct pstore_private { struct list_head list; - struct pstore_info *psi; - enum pstore_type_id type; - u64 id; - int count; - ssize_t size; - char data[]; + struct pstore_record *record; + size_t total_size; }; struct pstore_ftrace_seq_data { @@ -63,6 +59,17 @@ struct pstore_ftrace_seq_data { #define REC_SIZE sizeof(struct pstore_ftrace_record) +static void free_pstore_private(struct pstore_private *private) +{ + if (!private) + return; + if (private->record) { + kfree(private->record->buf); + kfree(private->record); + } + kfree(private); +} + static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos) { struct pstore_private *ps = s->private; @@ -72,9 +79,9 @@ static void *pstore_ftrace_seq_start(struct seq_file *s, loff_t *pos) if (!data) return NULL; - data->off = ps->size % REC_SIZE; + data->off = ps->total_size % REC_SIZE; data->off += *pos * REC_SIZE; - if 
(data->off + REC_SIZE > ps->size) { + if (data->off + REC_SIZE > ps->total_size) { kfree(data); return NULL; } @@ -94,7 +101,7 @@ static void *pstore_ftrace_seq_next(struct seq_file *s, void *v, loff_t *pos) struct pstore_ftrace_seq_data *data = v; data->off += REC_SIZE; - if (data->off + REC_SIZE > ps->size) + if (data->off + REC_SIZE > ps->total_size) return NULL; (*pos)++; @@ -105,7 +112,9 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v) { struct pstore_private *ps = s->private; struct pstore_ftrace_seq_data *data = v; - struct pstore_ftrace_record *rec = (void *)(ps->data + data->off); + struct pstore_ftrace_record *rec; + + rec = (struct pstore_ftrace_record *)(ps->record->buf + data->off); seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n", pstore_ftrace_decode_cpu(rec), @@ -125,7 +134,7 @@ static const struct seq_operations pstore_ftrace_seq_ops = { static int pstore_check_syslog_permissions(struct pstore_private *ps) { - switch (ps->type) { + switch (ps->record->type) { case PSTORE_TYPE_DMESG: case PSTORE_TYPE_CONSOLE: return check_syslog_permissions(SYSLOG_ACTION_READ_ALL, @@ -141,9 +150,10 @@ static ssize_t pstore_file_read(struct file *file, char __user *userbuf, struct seq_file *sf = file->private_data; struct pstore_private *ps = sf->private; - if (ps->type == PSTORE_TYPE_FTRACE) + if (ps->record->type == PSTORE_TYPE_FTRACE) return seq_read(file, userbuf, count, ppos); - return simple_read_from_buffer(userbuf, count, ppos, ps->data, ps->size); + return simple_read_from_buffer(userbuf, count, ppos, + ps->record->buf, ps->total_size); } static int pstore_file_open(struct inode *inode, struct file *file) @@ -157,7 +167,7 @@ static int pstore_file_open(struct inode *inode, struct file *file) if (err) return err; - if (ps->type == PSTORE_TYPE_FTRACE) + if (ps->record->type == PSTORE_TYPE_FTRACE) sops = &pstore_ftrace_seq_ops; err = seq_open(file, sops); @@ -193,20 +203,19 @@ static const struct file_operations pstore_file_operations = { static int pstore_unlink(struct inode *dir, struct dentry *dentry) { struct pstore_private *p = d_inode(dentry)->i_private; + struct pstore_record *record = p->record; int err; err = pstore_check_syslog_permissions(p); if (err) return err; - if (p->psi->erase) { - mutex_lock(&p->psi->read_mutex); - p->psi->erase(p->type, p->id, p->count, - d_inode(dentry)->i_ctime, p->psi); - mutex_unlock(&p->psi->read_mutex); - } else { + if (!record->psi->erase) return -EPERM; - } + + mutex_lock(&record->psi->read_mutex); + record->psi->erase(record); + mutex_unlock(&record->psi->read_mutex); return simple_unlink(dir, dentry); } @@ -221,7 +230,7 @@ static void pstore_evict_inode(struct inode *inode) spin_lock_irqsave(&allpstore_lock, flags); list_del(&p->list); spin_unlock_irqrestore(&allpstore_lock, flags); - kfree(p); + free_pstore_private(p); } } @@ -302,23 +311,23 @@ bool pstore_is_mounted(void) * Load it up with "size" bytes of data from "buf". * Set the mtime & ctime to the date that this record was originally stored. 
*/ -int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, - char *data, bool compressed, size_t size, - struct timespec time, struct pstore_info *psi) +int pstore_mkfile(struct dentry *root, struct pstore_record *record) { - struct dentry *root = pstore_sb->s_root; struct dentry *dentry; struct inode *inode; int rc = 0; char name[PSTORE_NAMELEN]; struct pstore_private *private, *pos; unsigned long flags; + size_t size = record->size + record->ecc_notice_size; + + WARN_ON(!inode_is_locked(d_inode(root))); spin_lock_irqsave(&allpstore_lock, flags); list_for_each_entry(pos, &allpstore, list) { - if (pos->type == type && - pos->id == id && - pos->psi == psi) { + if (pos->record->type == record->type && + pos->record->id == record->id && + pos->record->psi == record->psi) { rc = -EEXIST; break; } @@ -328,72 +337,74 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, return rc; rc = -ENOMEM; - inode = pstore_get_inode(pstore_sb); + inode = pstore_get_inode(root->d_sb); if (!inode) goto fail; inode->i_mode = S_IFREG | 0444; inode->i_fop = &pstore_file_operations; - private = kmalloc(sizeof *private + size, GFP_KERNEL); + private = kzalloc(sizeof(*private), GFP_KERNEL); if (!private) goto fail_alloc; - private->type = type; - private->id = id; - private->count = count; - private->psi = psi; + private->record = record; - switch (type) { + switch (record->type) { case PSTORE_TYPE_DMESG: scnprintf(name, sizeof(name), "dmesg-%s-%lld%s", - psname, id, compressed ? ".enc.z" : ""); + record->psi->name, record->id, + record->compressed ? ".enc.z" : ""); break; case PSTORE_TYPE_CONSOLE: - scnprintf(name, sizeof(name), "console-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "console-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_FTRACE: - scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "ftrace-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_MCE: - scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "mce-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_PPC_RTAS: - scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "rtas-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_PPC_OF: scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld", - psname, id); + record->psi->name, record->id); break; case PSTORE_TYPE_PPC_COMMON: scnprintf(name, sizeof(name), "powerpc-common-%s-%lld", - psname, id); + record->psi->name, record->id); break; case PSTORE_TYPE_PMSG: - scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "pmsg-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_PPC_OPAL: - sprintf(name, "powerpc-opal-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "powerpc-opal-%s-%lld", + record->psi->name, record->id); break; case PSTORE_TYPE_UNKNOWN: - scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id); + scnprintf(name, sizeof(name), "unknown-%s-%lld", + record->psi->name, record->id); break; default: scnprintf(name, sizeof(name), "type%d-%s-%lld", - type, psname, id); + record->type, record->psi->name, record->id); break; } - inode_lock(d_inode(root)); - dentry = d_alloc_name(root, name); if (!dentry) - goto fail_lockedalloc; + goto fail_private; - memcpy(private->data, data, size); - inode->i_size = private->size = size; + inode->i_size = private->total_size = size; inode->i_private = private; - if 
(time.tv_sec) - inode->i_mtime = inode->i_ctime = time; + if (record->time.tv_sec) + inode->i_mtime = inode->i_ctime = record->time; d_add(dentry, inode); @@ -401,13 +412,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, list_add(&private->list, &allpstore); spin_unlock_irqrestore(&allpstore_lock, flags); - inode_unlock(d_inode(root)); - return 0; -fail_lockedalloc: - inode_unlock(d_inode(root)); - kfree(private); +fail_private: + free_pstore_private(private); fail_alloc: iput(inode); @@ -415,6 +423,27 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, return rc; } +/* + * Read all the records from the persistent store. Create + * files in our filesystem. Don't warn about -EEXIST errors + * when we are re-scanning the backing store looking to add new + * error records. + */ +void pstore_get_records(int quiet) +{ + struct pstore_info *psi = psinfo; + struct dentry *root; + + if (!psi || !pstore_sb) + return; + + root = pstore_sb->s_root; + + inode_lock(d_inode(root)); + pstore_get_backend_records(psi, root, quiet); + inode_unlock(d_inode(root)); +} + static int pstore_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index da416e6591c9d91cd937b36438e31a51bde6bfa4..c416e653dc4f59d34a261ee2a617d82850067587 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -25,10 +25,10 @@ extern struct pstore_info *psinfo; extern void pstore_set_kmsg_bytes(int); extern void pstore_get_records(int); -extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, - int count, char *data, bool compressed, - size_t size, struct timespec time, - struct pstore_info *psi); +extern void pstore_get_backend_records(struct pstore_info *psi, + struct dentry *root, int quiet); +extern int pstore_mkfile(struct dentry *root, + struct pstore_record *record); extern bool pstore_is_mounted(void); #endif diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index efab7b64925ba76e2aa0a2a39d9e659c0a94472e..d468eec9b8a6ba9aa574055c0e76c615e2ede9d0 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -267,7 +267,7 @@ static void free_zlib(void) big_oops_buf_sz = 0; } -static struct pstore_zbackend backend_zlib = { +static const struct pstore_zbackend backend_zlib = { .compress = compress_zlib, .decompress = decompress_zlib, .allocate = allocate_zlib, @@ -328,7 +328,7 @@ static void free_lzo(void) big_oops_buf_sz = 0; } -static struct pstore_zbackend backend_lzo = { +static const struct pstore_zbackend backend_lzo = { .compress = compress_lzo, .decompress = decompress_lzo, .allocate = allocate_lzo, @@ -393,7 +393,7 @@ static void free_lz4(void) big_oops_buf_sz = 0; } -static struct pstore_zbackend backend_lz4 = { +static const struct pstore_zbackend backend_lz4 = { .compress = compress_lz4, .decompress = decompress_lz4, .allocate = allocate_lz4, @@ -402,7 +402,7 @@ static struct pstore_zbackend backend_lz4 = { }; #endif -static struct pstore_zbackend *zbackend = +static const struct pstore_zbackend *zbackend = #if defined(CONFIG_PSTORE_ZLIB_COMPRESS) &backend_zlib; #elif defined(CONFIG_PSTORE_LZO_COMPRESS) @@ -484,7 +484,6 @@ static void pstore_dump(struct kmsg_dumper *dumper, { unsigned long total = 0; const char *why; - u64 id; unsigned int part = 1; unsigned long flags = 0; int is_locked; @@ -506,48 +505,59 @@ static void pstore_dump(struct kmsg_dumper *dumper, oopscount++; while (total < kmsg_bytes) { char *dst; - unsigned long size; - int hsize; + 
size_t dst_size; + int header_size; int zipped_len = -1; - size_t len; - bool compressed = false; - size_t total_len; + size_t dump_size; + struct pstore_record record = { + .type = PSTORE_TYPE_DMESG, + .count = oopscount, + .reason = reason, + .part = part, + .compressed = false, + .buf = psinfo->buf, + .psi = psinfo, + }; if (big_oops_buf && is_locked) { dst = big_oops_buf; - size = big_oops_buf_sz; + dst_size = big_oops_buf_sz; } else { dst = psinfo->buf; - size = psinfo->bufsize; + dst_size = psinfo->bufsize; } - hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part); - size -= hsize; + /* Write dump header. */ + header_size = snprintf(dst, dst_size, "%s#%d Part%u\n", why, + oopscount, part); + dst_size -= header_size; - if (!kmsg_dump_get_buffer(dumper, true, dst + hsize, - size, &len)) + /* Write dump contents. */ + if (!kmsg_dump_get_buffer(dumper, true, dst + header_size, + dst_size, &dump_size)) break; if (big_oops_buf && is_locked) { zipped_len = pstore_compress(dst, psinfo->buf, - hsize + len, psinfo->bufsize); + header_size + dump_size, + psinfo->bufsize); if (zipped_len > 0) { - compressed = true; - total_len = zipped_len; + record.compressed = true; + record.size = zipped_len; } else { - total_len = copy_kmsg_to_buffer(hsize, len); + record.size = copy_kmsg_to_buffer(header_size, + dump_size); } } else { - total_len = hsize + len; + record.size = header_size + dump_size; } - ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, - oopscount, compressed, total_len, psinfo); + ret = psinfo->write(&record); if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) pstore_new_entry = 1; - total += total_len; + total += record.size; part++; } if (is_locked) @@ -577,8 +587,11 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) const char *e = s + c; while (s < e) { + struct pstore_record record = { + .type = PSTORE_TYPE_CONSOLE, + .psi = psinfo, + }; unsigned long flags; - u64 id; if (c > psinfo->bufsize) c = psinfo->bufsize; @@ -589,8 +602,9 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) } else { spin_lock_irqsave(&psinfo->buf_lock, flags); } - psinfo->write_buf(PSTORE_TYPE_CONSOLE, 0, &id, 0, - s, 0, c, psinfo); + record.buf = (char *)s; + record.size = c; + psinfo->write(&record); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; @@ -618,48 +632,30 @@ static void pstore_register_console(void) {} static void pstore_unregister_console(void) {} #endif -static int pstore_write_compat(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, int count, - bool compressed, size_t size, - struct pstore_info *psi) -{ - return psi->write_buf(type, reason, id, part, psinfo->buf, compressed, - size, psi); -} - -static int pstore_write_buf_user_compat(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, - const char __user *buf, - bool compressed, size_t size, - struct pstore_info *psi) -{ - unsigned long flags = 0; - size_t i, bufsize = size; - long ret = 0; - - if (unlikely(!access_ok(VERIFY_READ, buf, size))) - return -EFAULT; - if (bufsize > psinfo->bufsize) - bufsize = psinfo->bufsize; - spin_lock_irqsave(&psinfo->buf_lock, flags); - for (i = 0; i < size; ) { - size_t c = min(size - i, bufsize); - - ret = __copy_from_user(psinfo->buf, buf + i, c); - if (unlikely(ret != 0)) { - ret = -EFAULT; - break; - } - ret = psi->write_buf(type, reason, id, part, psinfo->buf, - compressed, c, psi); - if (unlikely(ret < 0)) - break; - 
i += c; +static int pstore_write_user_compat(struct pstore_record *record, + const char __user *buf) +{ + int ret = 0; + + if (record->buf) + return -EINVAL; + + record->buf = kmalloc(record->size, GFP_KERNEL); + if (!record->buf) + return -ENOMEM; + + if (unlikely(copy_from_user(record->buf, buf, record->size))) { + ret = -EFAULT; + goto out; } - spin_unlock_irqrestore(&psinfo->buf_lock, flags); - return unlikely(ret < 0) ? ret : size; + + ret = record->psi->write(record); + +out: + kfree(record->buf); + record->buf = NULL; + + return unlikely(ret < 0) ? ret : record->size; } /* @@ -673,19 +669,35 @@ int pstore_register(struct pstore_info *psi) { struct module *owner = psi->owner; - if (backend && strcmp(backend, psi->name)) + if (backend && strcmp(backend, psi->name)) { + pr_warn("ignoring unexpected backend '%s'\n", psi->name); return -EPERM; + } + + /* Sanity check flags. */ + if (!psi->flags) { + pr_warn("backend '%s' must support at least one frontend\n", + psi->name); + return -EINVAL; + } + + /* Check for required functions. */ + if (!psi->read || !psi->write) { + pr_warn("backend '%s' must implement read() and write()\n", + psi->name); + return -EINVAL; + } spin_lock(&pstore_lock); if (psinfo) { + pr_warn("backend '%s' already loaded: ignoring '%s'\n", + psinfo->name, psi->name); spin_unlock(&pstore_lock); return -EBUSY; } - if (!psi->write) - psi->write = pstore_write_compat; - if (!psi->write_buf_user) - psi->write_buf_user = pstore_write_buf_user_compat; + if (!psi->write_user) + psi->write_user = pstore_write_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); spin_unlock(&pstore_lock); @@ -709,6 +721,7 @@ int pstore_register(struct pstore_info *psi) if (psi->flags & PSTORE_FLAGS_PMSG) pstore_register_pmsg(); + /* Start watching for new records, if desired. */ if (pstore_update_ms >= 0) { pstore_timer.expires = jiffies + msecs_to_jiffies(pstore_update_ms); @@ -721,16 +734,21 @@ int pstore_register(struct pstore_info *psi) */ backend = psi->name; - module_put(owner); - pr_info("Registered %s as persistent store backend\n", psi->name); + module_put(owner); + return 0; } EXPORT_SYMBOL_GPL(pstore_register); void pstore_unregister(struct pstore_info *psi) { + /* Stop timer and make sure all work has finished. */ + pstore_update_ms = -1; + del_timer_sync(&pstore_timer); + flush_work(&pstore_work); + if (psi->flags & PSTORE_FLAGS_PMSG) pstore_unregister_pmsg(); if (psi->flags & PSTORE_FLAGS_FTRACE) @@ -747,66 +765,99 @@ void pstore_unregister(struct pstore_info *psi) } EXPORT_SYMBOL_GPL(pstore_unregister); +static void decompress_record(struct pstore_record *record) +{ + int unzipped_len; + char *decompressed; + + /* Only PSTORE_TYPE_DMESG supports compression. */ + if (!record->compressed) + return; + + if (record->type != PSTORE_TYPE_DMESG) { + pr_warn("ignored compressed record type %d\n", record->type); + return; + } + + /* No compression method has created the common buffer. */ + if (!big_oops_buf) { + pr_warn("no decompression buffer allocated\n"); + return; + } + + unzipped_len = pstore_decompress(record->buf, big_oops_buf, + record->size, big_oops_buf_sz); + if (unzipped_len <= 0) { + pr_err("decompression failed: %d\n", unzipped_len); + return; + } + + /* Build new buffer for decompressed contents. */ + decompressed = kmalloc(unzipped_len + record->ecc_notice_size, + GFP_KERNEL); + if (!decompressed) { + pr_err("decompression ran out of memory\n"); + return; + } + memcpy(decompressed, big_oops_buf, unzipped_len); + + /* Append ECC notice to decompressed buffer. 
*/ + memcpy(decompressed + unzipped_len, record->buf + record->size, + record->ecc_notice_size); + + /* Swap out compressed contents with decompressed contents. */ + kfree(record->buf); + record->buf = decompressed; + record->size = unzipped_len; + record->compressed = false; +} + /* - * Read all the records from the persistent store. Create + * Read all the records from one persistent store backend. Create * files in our filesystem. Don't warn about -EEXIST errors * when we are re-scanning the backing store looking to add new * error records. */ -void pstore_get_records(int quiet) -{ - struct pstore_info *psi = psinfo; - char *buf = NULL; - ssize_t size; - u64 id; - int count; - enum pstore_type_id type; - struct timespec time; - int failed = 0, rc; - bool compressed; - int unzipped_len = -1; - ssize_t ecc_notice_size = 0; - - if (!psi) +void pstore_get_backend_records(struct pstore_info *psi, + struct dentry *root, int quiet) +{ + int failed = 0; + + if (!psi || !root) return; mutex_lock(&psi->read_mutex); if (psi->open && psi->open(psi)) goto out; - while ((size = psi->read(&id, &type, &count, &time, &buf, &compressed, - &ecc_notice_size, psi)) > 0) { - if (compressed && (type == PSTORE_TYPE_DMESG)) { - if (big_oops_buf) - unzipped_len = pstore_decompress(buf, - big_oops_buf, size, - big_oops_buf_sz); - - if (unzipped_len > 0) { - if (ecc_notice_size) - memcpy(big_oops_buf + unzipped_len, - buf + size, ecc_notice_size); - kfree(buf); - buf = big_oops_buf; - size = unzipped_len; - compressed = false; - } else { - pr_err("decompression failed;returned %d\n", - unzipped_len); - compressed = true; - } + /* + * Backend callback read() allocates record.buf. decompress_record() + * may reallocate record.buf. On success, pstore_mkfile() will keep + * the record.buf, so free it only on failure. + */ + for (;;) { + struct pstore_record *record; + int rc; + + record = kzalloc(sizeof(*record), GFP_KERNEL); + if (!record) { + pr_err("out of memory creating record\n"); + break; + } + record->psi = psi; + + record->size = psi->read(record); + + /* No more records left in backend? */ + if (record->size <= 0) + break; + + decompress_record(record); + rc = pstore_mkfile(root, record); + if (rc) { + /* pstore_mkfile() did not take record, so free it. 
*/ + kfree(record->buf); + kfree(record); + if (rc != -EEXIST || !quiet) + failed++; } - rc = pstore_mkfile(type, psi->name, id, count, buf, - compressed, size + ecc_notice_size, - time, psi); - if (unzipped_len < 0) { - /* Free buffer other than big oops */ - kfree(buf); - buf = NULL; - } else - unzipped_len = -1; - if (rc && (rc != -EEXIST || !quiet)) - failed++; } if (psi->close) psi->close(psi); @@ -830,7 +881,9 @@ static void pstore_timefunc(unsigned long dummy) schedule_work(&pstore_work); } - mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); + if (pstore_update_ms >= 0) + mod_timer(&pstore_timer, + jiffies + msecs_to_jiffies(pstore_update_ms)); } module_param(backend, charp, 0444); diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c index 78f6176c020f8256dad089221af85d9560e5ac2f..209755e0d7c8ee92e3f19ff8fd4b45b6a9a9d1fa 100644 --- a/fs/pstore/pmsg.c +++ b/fs/pstore/pmsg.c @@ -15,7 +15,6 @@ #include #include #include -#include #include "internal.h" static DEFINE_MUTEX(pmsg_lock); @@ -23,19 +22,22 @@ static DEFINE_MUTEX(pmsg_lock); static ssize_t write_pmsg(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - u64 id; + struct pstore_record record = { + .type = PSTORE_TYPE_PMSG, + .size = count, + .psi = psinfo, + }; int ret; if (!count) return 0; - /* check outside lock, page in any data. write_buf_user also checks */ + /* check outside lock, page in any data. write_user also checks */ if (!access_ok(VERIFY_READ, buf, count)) return -EFAULT; mutex_lock(&pmsg_lock); - ret = psinfo->write_buf_user(PSTORE_TYPE_PMSG, 0, &id, 0, buf, 0, count, - psinfo); + ret = psinfo->write_user(&record, buf); mutex_unlock(&pmsg_lock); return ret ? ret : count; } diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 11f918d34b1e50556fda04f6f1f50e62f5dc5e3c..5cb022c8cd33f23b566721627dc25dd0263da3a8 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -58,7 +58,7 @@ module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400); MODULE_PARM_DESC(pmsg_size, "size of user space message log"); static unsigned long long mem_address; -module_param(mem_address, ullong, 0400); +module_param_hw(mem_address, ullong, other, 0400); MODULE_PARM_DESC(mem_address, "start of reserved RAM used to store oops/panic logs"); @@ -235,35 +235,34 @@ static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest, return 0; } -static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, - int *count, struct timespec *time, - char **buf, bool *compressed, - ssize_t *ecc_notice_size, - struct pstore_info *psi) +static ssize_t ramoops_pstore_read(struct pstore_record *record) { ssize_t size = 0; - struct ramoops_context *cxt = psi->data; + struct ramoops_context *cxt = record->psi->data; struct persistent_ram_zone *prz = NULL; int header_length = 0; bool free_prz = false; - /* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but + /* + * Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but * PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have * valid time stamps, so it is initialized to zero. 
*/ - time->tv_sec = 0; - time->tv_nsec = 0; - *compressed = false; + record->time.tv_sec = 0; + record->time.tv_nsec = 0; + record->compressed = false; /* Find the next valid persistent_ram_zone for DMESG */ while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) { prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt, - cxt->max_dump_cnt, id, type, + cxt->max_dump_cnt, &record->id, + &record->type, PSTORE_TYPE_DMESG, 1); if (!prz_ok(prz)) continue; header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), - time, compressed); + &record->time, + &record->compressed); /* Clear and skip this DMESG record if it has no valid header */ if (!header_length) { persistent_ram_free_old(prz); @@ -274,18 +273,20 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, - 1, id, type, PSTORE_TYPE_CONSOLE, 0); + 1, &record->id, &record->type, + PSTORE_TYPE_CONSOLE, 0); if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt, - 1, id, type, PSTORE_TYPE_PMSG, 0); + 1, &record->id, &record->type, + PSTORE_TYPE_PMSG, 0); /* ftrace is last since it may want to dynamically allocate memory. */ if (!prz_ok(prz)) { if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) { prz = ramoops_get_next_prz(cxt->fprzs, - &cxt->ftrace_read_cnt, 1, id, type, - PSTORE_TYPE_FTRACE, 0); + &cxt->ftrace_read_cnt, 1, &record->id, + &record->type, PSTORE_TYPE_FTRACE, 0); } else { /* * Build a new dummy record which combines all the @@ -302,8 +303,10 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) { prz_next = ramoops_get_next_prz(cxt->fprzs, &cxt->ftrace_read_cnt, - cxt->max_ftrace_cnt, id, - type, PSTORE_TYPE_FTRACE, 0); + cxt->max_ftrace_cnt, + &record->id, + &record->type, + PSTORE_TYPE_FTRACE, 0); if (!prz_ok(prz_next)) continue; @@ -316,7 +319,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (size) goto out; } - *id = 0; + record->id = 0; prz = tmp_prz; } } @@ -329,17 +332,19 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, size = persistent_ram_old_size(prz) - header_length; /* ECC correction notice */ - *ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); + record->ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); - *buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL); - if (*buf == NULL) { + record->buf = kmalloc(size + record->ecc_notice_size + 1, GFP_KERNEL); + if (record->buf == NULL) { size = -ENOMEM; goto out; } - memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size); + memcpy(record->buf, (char *)persistent_ram_old(prz) + header_length, + size); - persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1); + persistent_ram_ecc_string(prz, record->buf + size, + record->ecc_notice_size + 1); out: if (free_prz) { @@ -373,23 +378,18 @@ static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz, return len; } -static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, - const char *buf, - bool compressed, size_t size, - struct pstore_info *psi) +static int notrace ramoops_pstore_write(struct pstore_record *record) { - struct ramoops_context *cxt = psi->data; + struct ramoops_context *cxt = record->psi->data; struct persistent_ram_zone *prz; - size_t hlen; + size_t size, hlen; - if (type == PSTORE_TYPE_CONSOLE) { + if (record->type == 
PSTORE_TYPE_CONSOLE) { if (!cxt->cprz) return -ENOMEM; - persistent_ram_write(cxt->cprz, buf, size); + persistent_ram_write(cxt->cprz, record->buf, record->size); return 0; - } else if (type == PSTORE_TYPE_FTRACE) { + } else if (record->type == PSTORE_TYPE_FTRACE) { int zonenum; if (!cxt->fprzs) @@ -402,33 +402,36 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, else zonenum = 0; - persistent_ram_write(cxt->fprzs[zonenum], buf, size); + persistent_ram_write(cxt->fprzs[zonenum], record->buf, + record->size); return 0; - } else if (type == PSTORE_TYPE_PMSG) { + } else if (record->type == PSTORE_TYPE_PMSG) { pr_warn_ratelimited("PMSG shouldn't call %s\n", __func__); return -EINVAL; } - if (type != PSTORE_TYPE_DMESG) + if (record->type != PSTORE_TYPE_DMESG) return -EINVAL; - /* Out of the various dmesg dump types, ramoops is currently designed + /* + * Out of the various dmesg dump types, ramoops is currently designed * to only store crash logs, rather than storing general kernel logs. */ - if (reason != KMSG_DUMP_OOPS && - reason != KMSG_DUMP_PANIC) + if (record->reason != KMSG_DUMP_OOPS && + record->reason != KMSG_DUMP_PANIC) return -EINVAL; /* Skip Oops when configured to do so. */ - if (reason == KMSG_DUMP_OOPS && !cxt->dump_oops) + if (record->reason == KMSG_DUMP_OOPS && !cxt->dump_oops) return -EINVAL; - /* Explicitly only take the first part of any new crash. + /* + * Explicitly only take the first part of any new crash. * If our buffer is larger than kmsg_bytes, this can never happen, * and if our buffer is smaller than kmsg_bytes, we don't want the * report split across multiple records. */ - if (part != 1) + if (record->part != 1) return -ENOSPC; if (!cxt->dprzs) @@ -436,53 +439,50 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, prz = cxt->dprzs[cxt->dump_write_cnt]; - hlen = ramoops_write_kmsg_hdr(prz, compressed); + /* Build header and append record contents. 
*/ + hlen = ramoops_write_kmsg_hdr(prz, record->compressed); + size = record->size; if (size + hlen > prz->buffer_size) size = prz->buffer_size - hlen; - persistent_ram_write(prz, buf, size); + persistent_ram_write(prz, record->buf, size); cxt->dump_write_cnt = (cxt->dump_write_cnt + 1) % cxt->max_dump_cnt; return 0; } -static int notrace ramoops_pstore_write_buf_user(enum pstore_type_id type, - enum kmsg_dump_reason reason, - u64 *id, unsigned int part, - const char __user *buf, - bool compressed, size_t size, - struct pstore_info *psi) +static int notrace ramoops_pstore_write_user(struct pstore_record *record, + const char __user *buf) { - if (type == PSTORE_TYPE_PMSG) { - struct ramoops_context *cxt = psi->data; + if (record->type == PSTORE_TYPE_PMSG) { + struct ramoops_context *cxt = record->psi->data; if (!cxt->mprz) return -ENOMEM; - return persistent_ram_write_user(cxt->mprz, buf, size); + return persistent_ram_write_user(cxt->mprz, buf, record->size); } return -EINVAL; } -static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, - struct timespec time, struct pstore_info *psi) +static int ramoops_pstore_erase(struct pstore_record *record) { - struct ramoops_context *cxt = psi->data; + struct ramoops_context *cxt = record->psi->data; struct persistent_ram_zone *prz; - switch (type) { + switch (record->type) { case PSTORE_TYPE_DMESG: - if (id >= cxt->max_dump_cnt) + if (record->id >= cxt->max_dump_cnt) return -EINVAL; - prz = cxt->dprzs[id]; + prz = cxt->dprzs[record->id]; break; case PSTORE_TYPE_CONSOLE: prz = cxt->cprz; break; case PSTORE_TYPE_FTRACE: - if (id >= cxt->max_ftrace_cnt) + if (record->id >= cxt->max_ftrace_cnt) return -EINVAL; - prz = cxt->fprzs[id]; + prz = cxt->fprzs[record->id]; break; case PSTORE_TYPE_PMSG: prz = cxt->mprz; @@ -503,8 +503,8 @@ static struct ramoops_context oops_cxt = { .name = "ramoops", .open = ramoops_pstore_open, .read = ramoops_pstore_read, - .write_buf = ramoops_pstore_write_buf, - .write_buf_user = ramoops_pstore_write_buf_user, + .write = ramoops_pstore_write, + .write_user = ramoops_pstore_write_user, .erase = ramoops_pstore_erase, }, }; diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index bc927e30bdccd4fe89fd39d673126ae68d4e0e8e..e11672aa457514b8d84695688edc342f80a57b20 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -532,7 +532,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, } /* Initialize general buffer state. */ - prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock); + raw_spin_lock_init(&prz->buffer_lock); prz->flags = flags; ret = persistent_ram_buffer_map(start, size, prz, memtype); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 74b489e3714d51cb66de83d4a9f3510c250456fe..ebf80c7739e15ebdd2bc55896d66943f3aecef9a 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2188,8 +2188,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) /* This can happen when suspending quotas on remount-ro... 
*/ if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) { inode_lock(toputinode[cnt]); - toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | - S_NOATIME | S_NOQUOTA); + toputinode[cnt]->i_flags &= ~S_NOQUOTA; truncate_inode_pages(&toputinode[cnt]->i_data, 0); inode_unlock(toputinode[cnt]); mark_inode_dirty_sync(toputinode[cnt]); @@ -2237,7 +2236,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, struct super_block *sb = inode->i_sb; struct quota_info *dqopt = sb_dqopt(sb); int error; - int oldflags = -1; if (!fmt) return -ESRCH; @@ -2285,9 +2283,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ inode_lock(inode); - oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | - S_NOQUOTA); - inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; + inode->i_flags |= S_NOQUOTA; inode_unlock(inode); /* * When S_NOQUOTA is set, remove dquot references as no more @@ -2329,14 +2325,9 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, dqopt->files[type] = NULL; iput(inode); out_file_flags: - if (oldflags != -1) { - inode_lock(inode); - /* Set the flags back (in the case of accidental quotaon() - * on a wrong file we don't want to mess up the flags) */ - inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); - inode->i_flags |= oldflags; - inode_unlock(inode); - } + inode_lock(inode); + inode->i_flags &= ~S_NOQUOTA; + inode_unlock(inode); out_fmt: put_quota_format(fmt); @@ -2780,18 +2771,6 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) } EXPORT_SYMBOL(dquot_set_dqinfo); -const struct quotactl_ops dquot_quotactl_ops = { - .quota_on = dquot_quota_on, - .quota_off = dquot_quota_off, - .quota_sync = dquot_quota_sync, - .get_state = dquot_get_state, - .set_info = dquot_set_dqinfo, - .get_dqblk = dquot_get_dqblk, - .get_nextdqblk = dquot_get_next_dqblk, - .set_dqblk = dquot_set_dqblk -}; -EXPORT_SYMBOL(dquot_quotactl_ops); - const struct quotactl_ops dquot_quotactl_sysfile_ops = { .quota_enable = dquot_quota_enable, .quota_disable = dquot_quota_disable, diff --git a/fs/read_write.c b/fs/read_write.c index c4f88afbc67f49ae9e451e06a81c45c37f8171f5..47c1d4484df9e6b69333215efc10d7a5db64210f 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -841,6 +841,81 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, return ret; } +#ifdef CONFIG_COMPAT +ssize_t compat_rw_copy_check_uvector(int type, + const struct compat_iovec __user *uvector, unsigned long nr_segs, + unsigned long fast_segs, struct iovec *fast_pointer, + struct iovec **ret_pointer) +{ + compat_ssize_t tot_len; + struct iovec *iov = *ret_pointer = fast_pointer; + ssize_t ret = 0; + int seg; + + /* + * SuS says "The readv() function *may* fail if the iovcnt argument + * was less than or equal to 0, or greater than {IOV_MAX}. Linux has + * traditionally returned zero for zero segments, so... + */ + if (nr_segs == 0) + goto out; + + ret = -EINVAL; + if (nr_segs > UIO_MAXIOV) + goto out; + if (nr_segs > fast_segs) { + ret = -ENOMEM; + iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL); + if (iov == NULL) + goto out; + } + *ret_pointer = iov; + + ret = -EFAULT; + if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) + goto out; + + /* + * Single unix specification: + * We should -EINVAL if an element length is not >= 0 and fitting an + * ssize_t. 
+ * + * In Linux, the total length is limited to MAX_RW_COUNT, there is + * no overflow possibility. + */ + tot_len = 0; + ret = -EINVAL; + for (seg = 0; seg < nr_segs; seg++) { + compat_uptr_t buf; + compat_ssize_t len; + + if (__get_user(len, &uvector->iov_len) || + __get_user(buf, &uvector->iov_base)) { + ret = -EFAULT; + goto out; + } + if (len < 0) /* size_t not fitting in compat_ssize_t .. */ + goto out; + if (type >= 0 && + !access_ok(vrfy_dir(type), compat_ptr(buf), len)) { + ret = -EFAULT; + goto out; + } + if (len > MAX_RW_COUNT - tot_len) + len = MAX_RW_COUNT - tot_len; + tot_len += len; + iov->iov_base = compat_ptr(buf); + iov->iov_len = (compat_size_t) len; + uvector++; + iov++; + } + ret = tot_len; + +out: + return ret; +} +#endif + static ssize_t __do_readv_writev(int type, struct file *file, struct iov_iter *iter, loff_t *pos, int flags) { diff --git a/fs/readdir.c b/fs/readdir.c index 0e8a7f355f7a1916218a9bd4d874e3ed771c6fc1..89659549c09d81b42d26aef337f62c93ca4c1379 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -18,6 +18,7 @@ #include #include #include +#include #include @@ -324,3 +325,167 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, fdput_pos(f); return error; } + +#ifdef CONFIG_COMPAT +struct compat_old_linux_dirent { + compat_ulong_t d_ino; + compat_ulong_t d_offset; + unsigned short d_namlen; + char d_name[1]; +}; + +struct compat_readdir_callback { + struct dir_context ctx; + struct compat_old_linux_dirent __user *dirent; + int result; +}; + +static int compat_fillonedir(struct dir_context *ctx, const char *name, + int namlen, loff_t offset, u64 ino, + unsigned int d_type) +{ + struct compat_readdir_callback *buf = + container_of(ctx, struct compat_readdir_callback, ctx); + struct compat_old_linux_dirent __user *dirent; + compat_ulong_t d_ino; + + if (buf->result) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; + return -EOVERFLOW; + } + buf->result++; + dirent = buf->dirent; + if (!access_ok(VERIFY_WRITE, dirent, + (unsigned long)(dirent->d_name + namlen + 1) - + (unsigned long)dirent)) + goto efault; + if ( __put_user(d_ino, &dirent->d_ino) || + __put_user(offset, &dirent->d_offset) || + __put_user(namlen, &dirent->d_namlen) || + __copy_to_user(dirent->d_name, name, namlen) || + __put_user(0, dirent->d_name + namlen)) + goto efault; + return 0; +efault: + buf->result = -EFAULT; + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, + struct compat_old_linux_dirent __user *, dirent, unsigned int, count) +{ + int error; + struct fd f = fdget_pos(fd); + struct compat_readdir_callback buf = { + .ctx.actor = compat_fillonedir, + .dirent = dirent + }; + + if (!f.file) + return -EBADF; + + error = iterate_dir(f.file, &buf.ctx); + if (buf.result) + error = buf.result; + + fdput_pos(f); + return error; +} + +struct compat_linux_dirent { + compat_ulong_t d_ino; + compat_ulong_t d_off; + unsigned short d_reclen; + char d_name[1]; +}; + +struct compat_getdents_callback { + struct dir_context ctx; + struct compat_linux_dirent __user *current_dir; + struct compat_linux_dirent __user *previous; + int count; + int error; +}; + +static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct compat_linux_dirent __user * dirent; + struct compat_getdents_callback *buf = + container_of(ctx, struct compat_getdents_callback, ctx); + compat_ulong_t d_ino; + int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) 
+ + namlen + 2, sizeof(compat_long_t)); + + buf->error = -EINVAL; /* only used if we fail.. */ + if (reclen > buf->count) + return -EINVAL; + d_ino = ino; + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; + return -EOVERFLOW; + } + dirent = buf->previous; + if (dirent) { + if (signal_pending(current)) + return -EINTR; + if (__put_user(offset, &dirent->d_off)) + goto efault; + } + dirent = buf->current_dir; + if (__put_user(d_ino, &dirent->d_ino)) + goto efault; + if (__put_user(reclen, &dirent->d_reclen)) + goto efault; + if (copy_to_user(dirent->d_name, name, namlen)) + goto efault; + if (__put_user(0, dirent->d_name + namlen)) + goto efault; + if (__put_user(d_type, (char __user *) dirent + reclen - 1)) + goto efault; + buf->previous = dirent; + dirent = (void __user *)dirent + reclen; + buf->current_dir = dirent; + buf->count -= reclen; + return 0; +efault: + buf->error = -EFAULT; + return -EFAULT; +} + +COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, + struct compat_linux_dirent __user *, dirent, unsigned int, count) +{ + struct fd f; + struct compat_linux_dirent __user * lastdirent; + struct compat_getdents_callback buf = { + .ctx.actor = compat_filldir, + .current_dir = dirent, + .count = count + }; + int error; + + if (!access_ok(VERIFY_WRITE, dirent, count)) + return -EFAULT; + + f = fdget_pos(fd); + if (!f.file) + return -EBADF; + + error = iterate_dir(f.file, &buf.ctx); + if (error >= 0) + error = buf.error; + lastdirent = buf.previous; + if (lastdirent) { + if (put_user(buf.ctx.pos, &lastdirent->d_off)) + error = -EFAULT; + else + error = count - buf.count; + } + fdput_pos(f); + return error; +} +#endif diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index a6ab9d64ea1b32777fa79a42f6396f9f52bcab08..873fc04e9403aa7d6ca9c2d13a7d045dd963fe75 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1375,7 +1375,6 @@ static void init_inode(struct inode *inode, struct treepath *path) static void inode2sd(void *sd, struct inode *inode, loff_t size) { struct stat_data *sd_v2 = (struct stat_data *)sd; - __u16 flags; set_sd_v2_mode(sd_v2, inode->i_mode); set_sd_v2_nlink(sd_v2, inode->i_nlink); @@ -1390,9 +1389,7 @@ static void inode2sd(void *sd, struct inode *inode, loff_t size) set_sd_v2_rdev(sd_v2, new_encode_dev(inode->i_rdev)); else set_sd_v2_generation(sd_v2, inode->i_generation); - flags = REISERFS_I(inode)->i_attrs; - i_attrs_to_sd_attrs(inode, &flags); - set_sd_v2_attrs(sd_v2, flags); + set_sd_v2_attrs(sd_v2, REISERFS_I(inode)->i_attrs); } /* used to copy inode's fields to old stat data */ @@ -2002,10 +1999,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, /* uid and gid must already be set by the caller for quota init */ - /* symlink cannot be immutable or append only, right? 
*/ - if (S_ISLNK(inode->i_mode)) - inode->i_flags &= ~(S_IMMUTABLE | S_APPEND); - inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_size = i_size; inode->i_blocks = 0; @@ -3095,28 +3088,6 @@ void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode) } } -void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs) -{ - if (reiserfs_attrs(inode->i_sb)) { - if (inode->i_flags & S_IMMUTABLE) - *sd_attrs |= REISERFS_IMMUTABLE_FL; - else - *sd_attrs &= ~REISERFS_IMMUTABLE_FL; - if (inode->i_flags & S_SYNC) - *sd_attrs |= REISERFS_SYNC_FL; - else - *sd_attrs &= ~REISERFS_SYNC_FL; - if (inode->i_flags & S_NOATIME) - *sd_attrs |= REISERFS_NOATIME_FL; - else - *sd_attrs &= ~REISERFS_NOATIME_FL; - if (REISERFS_I(inode)->i_flags & i_nopack_mask) - *sd_attrs |= REISERFS_NOTAIL_FL; - else - *sd_attrs &= ~REISERFS_NOTAIL_FL; - } -} - /* * decide if this buffer needs to stay around for data logging or ordered * write purposes diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 1f4692a505a0c08af54f388d09a7fb9a2f0d75fd..acbbaf7a0bb26be8e25039ca4fee2a64a0db02be 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -47,7 +47,6 @@ long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } flags = REISERFS_I(inode)->i_attrs; - i_attrs_to_sd_attrs(inode, (__u16 *) & flags); err = put_user(flags, (int __user *)arg); break; case REISERFS_IOC_SETFLAGS:{ diff --git a/fs/reiserfs/item_ops.c b/fs/reiserfs/item_ops.c index aca73dd739066477a8f4eb6293122cfe84f3f055..e3c558d1b78c02c711ae9868fe409d349127103f 100644 --- a/fs/reiserfs/item_ops.c +++ b/fs/reiserfs/item_ops.c @@ -724,18 +724,18 @@ static void errcatch_print_vi(struct virtual_item *vi) } static struct item_operations errcatch_ops = { - errcatch_bytes_number, - errcatch_decrement_key, - errcatch_is_left_mergeable, - errcatch_print_item, - errcatch_check_item, - - errcatch_create_vi, - errcatch_check_left, - errcatch_check_right, - errcatch_part_size, - errcatch_unit_num, - errcatch_print_vi + .bytes_number = errcatch_bytes_number, + .decrement_key = errcatch_decrement_key, + .is_left_mergeable = errcatch_is_left_mergeable, + .print_item = errcatch_print_item, + .check_item = errcatch_check_item, + + .create_vi = errcatch_create_vi, + .check_left = errcatch_check_left, + .check_right = errcatch_check_right, + .part_size = errcatch_part_size, + .unit_num = errcatch_unit_num, + .print_vi = errcatch_print_vi }; #if ! (TYPE_STAT_DATA == 0 && TYPE_INDIRECT == 1 && TYPE_DIRECT == 2 && TYPE_DIRENTRY == 3) diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index aa40c242f1db3d9901c02eee0fd3e13a351a3c1b..da01f497180a165d163935c4744ef82521bf9151 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1961,7 +1961,7 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, * will be requeued because superblock is being shutdown and doesn't * have MS_ACTIVE set. 
*/ - cancel_delayed_work_sync(&REISERFS_SB(sb)->old_work); + reiserfs_cancel_old_flush(sb); /* wait for all commits to finish */ cancel_delayed_work_sync(&SB_JOURNAL(sb)->j_work); diff --git a/fs/reiserfs/lbalance.c b/fs/reiserfs/lbalance.c index 249594a821e0a5e8c60d96c7cad0cd841fccd70b..f5cebd70d9038571a69ab24ef25af0d4779c600f 100644 --- a/fs/reiserfs/lbalance.c +++ b/fs/reiserfs/lbalance.c @@ -475,7 +475,7 @@ static void leaf_item_bottle(struct buffer_info *dest_bi, * 'cpy_bytes'; create new item header; * n_ih = new item_header; */ - memcpy(&n_ih, ih, SHORT_KEY_SIZE); + memcpy(&n_ih.ih_key, &ih->ih_key, KEY_SIZE); /* Endian safe, both le */ n_ih.ih_version = ih->ih_version; diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 2adcde137c3fb8d569cddcdf83dfd443a11f693f..1d34377fef97624e2ca695b4a552f01e94e84712 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -1326,7 +1326,6 @@ struct cpu_key { #define KEY_NOT_FOUND 0 #define KEY_SIZE (sizeof(struct reiserfs_key)) -#define SHORT_KEY_SIZE (sizeof (__u32) + sizeof (__u32)) /* return values for search_by_key and clones */ #define ITEM_FOUND 1 @@ -2949,6 +2948,7 @@ int reiserfs_allocate_list_bitmaps(struct super_block *s, struct reiserfs_list_bitmap *, unsigned int); void reiserfs_schedule_old_flush(struct super_block *s); +void reiserfs_cancel_old_flush(struct super_block *s); void add_save_link(struct reiserfs_transaction_handle *th, struct inode *inode, int truncate); int remove_save_link(struct inode *inode, int truncate); @@ -3099,7 +3099,6 @@ static inline void reiserfs_update_sd(struct reiserfs_transaction_handle *th, } void sd_attrs_to_i_attrs(__u16 sd_attrs, struct inode *inode); -void i_attrs_to_sd_attrs(struct inode *inode, __u16 * sd_attrs); int reiserfs_setattr(struct dentry *dentry, struct iattr *attr); int __reiserfs_write_begin(struct page *page, unsigned from, unsigned len); diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index feabcde0290d27103a11813682091abd1a191661..685f1e05699868cf59237cec9d3821b1ae2e1a61 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -89,11 +89,27 @@ static void flush_old_commits(struct work_struct *work) sbi = container_of(work, struct reiserfs_sb_info, old_work.work); s = sbi->s_journal->j_work_sb; + /* + * We need s_umount for protecting quota writeback. We have to use + * trylock as reiserfs_cancel_old_flush() may be waiting for this work + * to complete with s_umount held. + */ + if (!down_read_trylock(&s->s_umount)) { + /* Requeue work if we are not cancelling it */ + spin_lock(&sbi->old_work_lock); + if (sbi->work_queued == 1) + queue_delayed_work(system_long_wq, &sbi->old_work, HZ); + spin_unlock(&sbi->old_work_lock); + return; + } spin_lock(&sbi->old_work_lock); - sbi->work_queued = 0; + /* Avoid clobbering the cancel state... 
*/ + if (sbi->work_queued == 1) + sbi->work_queued = 0; spin_unlock(&sbi->old_work_lock); reiserfs_sync_fs(s, 1); + up_read(&s->s_umount); } void reiserfs_schedule_old_flush(struct super_block *s) @@ -117,21 +133,22 @@ void reiserfs_schedule_old_flush(struct super_block *s) spin_unlock(&sbi->old_work_lock); } -static void cancel_old_flush(struct super_block *s) +void reiserfs_cancel_old_flush(struct super_block *s) { struct reiserfs_sb_info *sbi = REISERFS_SB(s); - cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); spin_lock(&sbi->old_work_lock); - sbi->work_queued = 0; + /* Make sure no new flushes will be queued */ + sbi->work_queued = 2; spin_unlock(&sbi->old_work_lock); + cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); } static int reiserfs_freeze(struct super_block *s) { struct reiserfs_transaction_handle th; - cancel_old_flush(s); + reiserfs_cancel_old_flush(s); reiserfs_write_lock(s); if (!(s->s_flags & MS_RDONLY)) { @@ -152,7 +169,13 @@ static int reiserfs_freeze(struct super_block *s) static int reiserfs_unfreeze(struct super_block *s) { + struct reiserfs_sb_info *sbi = REISERFS_SB(s); + reiserfs_allow_writes(s); + spin_lock(&sbi->old_work_lock); + /* Allow old_work to run again */ + sbi->work_queued = 0; + spin_unlock(&sbi->old_work_lock); return 0; } @@ -547,12 +570,28 @@ static void reiserfs_kill_sb(struct super_block *s) kill_block_super(s); } +#ifdef CONFIG_QUOTA +static int reiserfs_quota_off(struct super_block *sb, int type); + +static void reiserfs_quota_off_umount(struct super_block *s) +{ + int type; + + for (type = 0; type < REISERFS_MAXQUOTAS; type++) + reiserfs_quota_off(s, type); +} +#else +static inline void reiserfs_quota_off_umount(struct super_block *s) +{ +} +#endif + static void reiserfs_put_super(struct super_block *s) { struct reiserfs_transaction_handle th; th.t_trans_id = 0; - dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); + reiserfs_quota_off_umount(s); reiserfs_write_lock(s); @@ -817,7 +856,7 @@ static const struct dquot_operations reiserfs_quota_operations = { static const struct quotactl_ops reiserfs_qctl_operations = { .quota_on = reiserfs_quota_on, - .quota_off = dquot_quota_off, + .quota_off = reiserfs_quota_off, .quota_sync = dquot_quota_sync, .get_state = dquot_get_state, .set_info = dquot_set_dqinfo, @@ -2194,7 +2233,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) if (sbi->commit_wq) destroy_workqueue(sbi->commit_wq); - cancel_delayed_work_sync(&REISERFS_SB(s)->old_work); + reiserfs_cancel_old_flush(s); reiserfs_free_bitmap_cache(s); if (SB_BUFFER_WITH_SB(s)) @@ -2405,12 +2444,47 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, goto out; } reiserfs_write_unlock(sb); - return dquot_quota_on(sb, type, format_id, path); + err = dquot_quota_on(sb, type, format_id, path); + if (!err) { + inode_lock(inode); + REISERFS_I(inode)->i_attrs |= REISERFS_IMMUTABLE_FL | + REISERFS_NOATIME_FL; + inode_set_flags(inode, S_IMMUTABLE | S_NOATIME, + S_IMMUTABLE | S_NOATIME); + inode_unlock(inode); + mark_inode_dirty(inode); + } + return err; out: reiserfs_write_unlock(sb); return err; } +static int reiserfs_quota_off(struct super_block *sb, int type) +{ + int err; + struct inode *inode = sb_dqopt(sb)->files[type]; + + if (!inode || !igrab(inode)) + goto out; + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + REISERFS_I(inode)->i_attrs &= ~(REISERFS_IMMUTABLE_FL | + REISERFS_NOATIME_FL); + inode_set_flags(inode, 0, S_IMMUTABLE | 
S_NOATIME); + inode_unlock(inode); + mark_inode_dirty(inode); +out_put: + iput(inode); + return err; +out: + return dquot_quota_off(sb, type); +} + /* * Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code diff --git a/fs/select.c b/fs/select.c index e2112270d75a5f878e291bb5bb681474e3c4eeaf..d6c652a31e99d18d2d924af983c11504315af56f 100644 --- a/fs/select.c +++ b/fs/select.c @@ -338,6 +338,53 @@ static int poll_select_copy_remaining(struct timespec64 *end_time, return ret; } +/* + * Scalable version of the fd_set. + */ + +typedef struct { + unsigned long *in, *out, *ex; + unsigned long *res_in, *res_out, *res_ex; +} fd_set_bits; + +/* + * How many longwords for "nr" bits? + */ +#define FDS_BITPERLONG (8*sizeof(long)) +#define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) +#define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) + +/* + * We do a VERIFY_WRITE here even though we are only reading this time: + * we'll write to it eventually.. + * + * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. + */ +static inline +int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) +{ + nr = FDS_BYTES(nr); + if (ufdset) + return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; + + memset(fdset, 0, nr); + return 0; +} + +static inline unsigned long __must_check +set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) +{ + if (ufdset) + return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); + return 0; +} + +static inline +void zero_fd_set(unsigned long nr, unsigned long *fdset) +{ + memset(fdset, 0, FDS_BYTES(nr)); +} + #define FDS_IN(fds, n) (fds->in + n) #define FDS_OUT(fds, n) (fds->out + n) #define FDS_EX(fds, n) (fds->ex + n) @@ -401,7 +448,7 @@ static inline void wait_key_set(poll_table *wait, unsigned long in, wait->_key |= POLLOUT_SET; } -int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) +static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) { ktime_t expire, *to = NULL; struct poll_wqueues table; @@ -409,7 +456,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) int retval, i, timed_out = 0; u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0; - unsigned long busy_end = 0; + unsigned long busy_start = 0; rcu_read_lock(); retval = max_select_fd(n, fds); @@ -512,11 +559,11 @@ int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { - if (!busy_end) { - busy_end = busy_loop_end_time(); + if (!busy_start) { + busy_start = busy_loop_current_time(); continue; } - if (!busy_loop_timeout(busy_end)) + if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; @@ -586,10 +633,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, goto out_nofds; alloc_size = 6 * size; - bits = kmalloc(alloc_size, GFP_KERNEL|__GFP_NOWARN); - if (!bits && alloc_size > PAGE_SIZE) - bits = vmalloc(alloc_size); - + bits = kvmalloc(alloc_size, GFP_KERNEL); if (!bits) goto out_nofds; } @@ -800,7 +844,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, int timed_out = 0, count = 0; u64 slack = 0; unsigned int busy_flag = net_busy_loop_on() ? 
POLL_BUSY_LOOP : 0; - unsigned long busy_end = 0; + unsigned long busy_start = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -853,11 +897,11 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, /* only if found POLL_BUSY_LOOP sockets && not out of time */ if (can_busy_loop && !need_resched()) { - if (!busy_end) { - busy_end = busy_loop_end_time(); + if (!busy_start) { + busy_start = busy_loop_current_time(); continue; } - if (!busy_loop_timeout(busy_end)) + if (!busy_loop_timeout(busy_start)) continue; } busy_flag = 0; @@ -881,7 +925,7 @@ static int do_poll(struct poll_list *list, struct poll_wqueues *wait, #define N_STACK_PPS ((sizeof(stack_pps) - sizeof(struct poll_list)) / \ sizeof(struct pollfd)) -int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, +static int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, struct timespec64 *end_time) { struct poll_wqueues table; @@ -1053,3 +1097,373 @@ SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, return ret; } + +#ifdef CONFIG_COMPAT +#define __COMPAT_NFDBITS (8 * sizeof(compat_ulong_t)) + +static +int compat_poll_select_copy_remaining(struct timespec *end_time, void __user *p, + int timeval, int ret) +{ + struct timespec ts; + + if (!p) + return ret; + + if (current->personality & STICKY_TIMEOUTS) + goto sticky; + + /* No update for zero timeout */ + if (!end_time->tv_sec && !end_time->tv_nsec) + return ret; + + ktime_get_ts(&ts); + ts = timespec_sub(*end_time, ts); + if (ts.tv_sec < 0) + ts.tv_sec = ts.tv_nsec = 0; + + if (timeval) { + struct compat_timeval rtv; + + rtv.tv_sec = ts.tv_sec; + rtv.tv_usec = ts.tv_nsec / NSEC_PER_USEC; + + if (!copy_to_user(p, &rtv, sizeof(rtv))) + return ret; + } else { + struct compat_timespec rts; + + rts.tv_sec = ts.tv_sec; + rts.tv_nsec = ts.tv_nsec; + + if (!copy_to_user(p, &rts, sizeof(rts))) + return ret; + } + /* + * If an application puts its timeval in read-only memory, we + * don't want the Linux-specific update to the timeval to + * cause a fault after the select has completed + * successfully. However, because we're not updating the + * timeval, we can't restart the system call. + */ + +sticky: + if (ret == -ERESTARTNOHAND) + ret = -EINTR; + return ret; +} + +/* + * Ooo, nasty. We need here to frob 32-bit unsigned longs to + * 64-bit unsigned longs. + */ +static +int compat_get_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + if (ufdset) { + unsigned long odd; + + if (!access_ok(VERIFY_WRITE, ufdset, nr*sizeof(compat_ulong_t))) + return -EFAULT; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + if (__get_user(l, ufdset) || __get_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + *fdset++ = h << 32 | l; + nr -= 2; + } + if (odd && __get_user(*fdset, ufdset)) + return -EFAULT; + } else { + /* Tricky, must clear full unsigned long in the + * kernel fdset at the end, this makes sure that + * actually happens. 
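+	 *
+	 * Illustrative example: for nr = 3 compat words, (nr + 1) & ~1
+	 * rounds up to 4 words, i.e. two full 64-bit longs are cleared,
+	 * so the high half of the long that would hold the odd trailing
+	 * word never keeps stale bits.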
+ */ + memset(fdset, 0, ((nr + 1) & ~1)*sizeof(compat_ulong_t)); + } + return 0; +} + +static +int compat_set_fd_set(unsigned long nr, compat_ulong_t __user *ufdset, + unsigned long *fdset) +{ + unsigned long odd; + nr = DIV_ROUND_UP(nr, __COMPAT_NFDBITS); + + if (!ufdset) + return 0; + + odd = nr & 1UL; + nr &= ~1UL; + while (nr) { + unsigned long h, l; + l = *fdset++; + h = l >> 32; + if (__put_user(l, ufdset) || __put_user(h, ufdset+1)) + return -EFAULT; + ufdset += 2; + nr -= 2; + } + if (odd && __put_user(*fdset, ufdset)) + return -EFAULT; + return 0; +} + + +/* + * This is a virtual copy of sys_select from fs/select.c and probably + * should be compared to it from time to time + */ + +/* + * We can actually return ERESTARTSYS instead of EINTR, but I'd + * like to be certain this leads to no problems. So I return + * EINTR just for safety. + * + * Update: ERESTARTSYS breaks at least the xview clock binary, so + * I'm trying ERESTARTNOHAND which restart only when you want to. + */ +static int compat_core_sys_select(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct timespec *end_time) +{ + fd_set_bits fds; + void *bits; + int size, max_fds, ret = -EINVAL; + struct fdtable *fdt; + long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; + + if (n < 0) + goto out_nofds; + + /* max_fds can increase, so grab it once to avoid race */ + rcu_read_lock(); + fdt = files_fdtable(current->files); + max_fds = fdt->max_fds; + rcu_read_unlock(); + if (n > max_fds) + n = max_fds; + + /* + * We need 6 bitmaps (in/out/ex for both incoming and outgoing), + * since we used fdset we need to allocate memory in units of + * long-words. + */ + size = FDS_BYTES(n); + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + bits = kmalloc(6 * size, GFP_KERNEL); + ret = -ENOMEM; + if (!bits) + goto out_nofds; + } + fds.in = (unsigned long *) bits; + fds.out = (unsigned long *) (bits + size); + fds.ex = (unsigned long *) (bits + 2*size); + fds.res_in = (unsigned long *) (bits + 3*size); + fds.res_out = (unsigned long *) (bits + 4*size); + fds.res_ex = (unsigned long *) (bits + 5*size); + + if ((ret = compat_get_fd_set(n, inp, fds.in)) || + (ret = compat_get_fd_set(n, outp, fds.out)) || + (ret = compat_get_fd_set(n, exp, fds.ex))) + goto out; + zero_fd_set(n, fds.res_in); + zero_fd_set(n, fds.res_out); + zero_fd_set(n, fds.res_ex); + + ret = do_select(n, &fds, end_time); + + if (ret < 0) + goto out; + if (!ret) { + ret = -ERESTARTNOHAND; + if (signal_pending(current)) + goto out; + ret = 0; + } + + if (compat_set_fd_set(n, inp, fds.res_in) || + compat_set_fd_set(n, outp, fds.res_out) || + compat_set_fd_set(n, exp, fds.res_ex)) + ret = -EFAULT; +out: + if (bits != stack_fds) + kfree(bits); +out_nofds: + return ret; +} + +COMPAT_SYSCALL_DEFINE5(select, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timeval __user *, tvp) +{ + struct timespec end_time, *to = NULL; + struct compat_timeval tv; + int ret; + + if (tvp) { + if (copy_from_user(&tv, tvp, sizeof(tv))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, + tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), + (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) + return -EINVAL; + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = compat_poll_select_copy_remaining(&end_time, tvp, 1, ret); + + return ret; +} + +struct compat_sel_arg_struct { + compat_ulong_t n; + compat_uptr_t inp; + compat_uptr_t outp; + compat_uptr_t exp; + 
compat_uptr_t tvp; +}; + +COMPAT_SYSCALL_DEFINE1(old_select, struct compat_sel_arg_struct __user *, arg) +{ + struct compat_sel_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp), + compat_ptr(a.exp), compat_ptr(a.tvp)); +} + +static long do_compat_pselect(int n, compat_ulong_t __user *inp, + compat_ulong_t __user *outp, compat_ulong_t __user *exp, + struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, + compat_size_t sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = compat_core_sys_select(n, inp, outp, exp, to); + ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); + + if (ret == -ERESTARTNOHAND) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. + */ + if (sigmask) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return ret; +} + +COMPAT_SYSCALL_DEFINE6(pselect6, int, n, compat_ulong_t __user *, inp, + compat_ulong_t __user *, outp, compat_ulong_t __user *, exp, + struct compat_timespec __user *, tsp, void __user *, sig) +{ + compat_size_t sigsetsize = 0; + compat_uptr_t up = 0; + + if (sig) { + if (!access_ok(VERIFY_READ, sig, + sizeof(compat_uptr_t)+sizeof(compat_size_t)) || + __get_user(up, (compat_uptr_t __user *)sig) || + __get_user(sigsetsize, + (compat_size_t __user *)(sig+sizeof(up)))) + return -EFAULT; + } + return do_compat_pselect(n, inp, outp, exp, tsp, compat_ptr(up), + sigsetsize); +} + +COMPAT_SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, + unsigned int, nfds, struct compat_timespec __user *, tsp, + const compat_sigset_t __user *, sigmask, compat_size_t, sigsetsize) +{ + compat_sigset_t ss32; + sigset_t ksigmask, sigsaved; + struct compat_timespec ts; + struct timespec end_time, *to = NULL; + int ret; + + if (tsp) { + if (copy_from_user(&ts, tsp, sizeof(ts))) + return -EFAULT; + + to = &end_time; + if (poll_select_set_timeout(to, ts.tv_sec, ts.tv_nsec)) + return -EINVAL; + } + + if (sigmask) { + if (sigsetsize != sizeof(compat_sigset_t)) + return -EINVAL; + if (copy_from_user(&ss32, sigmask, sizeof(ss32))) + return -EFAULT; + sigset_from_compat(&ksigmask, &ss32); + + sigdelsetmask(&ksigmask, sigmask(SIGKILL)|sigmask(SIGSTOP)); + sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); + } + + ret = do_sys_poll(ufds, nfds, to); + + /* We can restart this syscall, usually */ + if (ret == -EINTR) { + /* + * Don't restore the signal mask yet. Let do_signal() deliver + * the signal on the way back to userspace, before the signal + * mask is restored. 
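+	 * (Restoring it here would re-block a signal that is only
+	 * unblocked by the temporary mask before do_signal() could
+	 * deliver it, so the handler might never run even though the
+	 * wait was interrupted.)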
+ */ + if (sigmask) { + memcpy(&current->saved_sigmask, &sigsaved, + sizeof(sigsaved)); + set_restore_sigmask(); + } + ret = -ERESTARTNOHAND; + } else if (sigmask) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + ret = compat_poll_select_copy_remaining(&end_time, tsp, 0, ret); + + return ret; +} +#endif diff --git a/fs/seq_file.c b/fs/seq_file.c index ca69fb99e41a8872d6d204d8fd8cffe7f08b3336..dc7c2be963ed4e2c2751313a44270b7655a45b4d 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -25,21 +25,7 @@ static void seq_set_overflow(struct seq_file *m) static void *seq_buf_alloc(unsigned long size) { - void *buf; - gfp_t gfp = GFP_KERNEL; - - /* - * For high order allocations, use __GFP_NORETRY to avoid oom-killing - - * it's better to fall back to vmalloc() than to kill things. For small - * allocations, just use GFP_KERNEL which will oom kill, thus no need - * for vmalloc fallback. - */ - if (size > PAGE_SIZE) - gfp |= __GFP_NORETRY | __GFP_NOWARN; - buf = kmalloc(size, gfp); - if (!buf && size > PAGE_SIZE) - buf = vmalloc(size); - return buf; + return kvmalloc(size, GFP_KERNEL); } /** diff --git a/fs/signalfd.c b/fs/signalfd.c index 270221fcef42cc42fcfdbc098b587b571be65a12..7e3d71109f51334d2bb8c71a9d2419e6ca4eeb3f 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -38,7 +38,7 @@ void signalfd_cleanup(struct sighand_struct *sighand) /* * The lockless check can race with remove_wait_queue() in progress, * but in this case its caller should run under rcu_read_lock() and - * sighand_cachep is SLAB_DESTROY_BY_RCU, we can safely return. + * sighand_cachep is SLAB_TYPESAFE_BY_RCU, we can safely return. */ if (likely(!waitqueue_active(wqh))) return; diff --git a/fs/splice.c b/fs/splice.c index 006ba50f4ece671f48367b644641ab58b04b65d8..540c4a44756c20cdd0e7e2f3eb3db98712858210 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -247,11 +247,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) } EXPORT_SYMBOL(add_to_pipe); -void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) -{ - put_page(spd->pages[i]); -} - /* * Check if we need to grow the arrays holding pages and partial page * descriptions. 
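The seq_buf_alloc() change above is one instance of a tree-wide cleanup in this series: the open-coded "try kmalloc(), fall back to vmalloc() for large buffers" pattern is replaced by the new kvmalloc() helper, which implements the same fallback policy internally. A minimal sketch of the equivalence (illustrative only; helper names invented):

	/* Old pattern, as removed from seq_buf_alloc() above. */
	static void *buf_alloc_old(size_t size)
	{
		gfp_t gfp = GFP_KERNEL;
		void *buf;

		/* High-order attempts must not OOM-kill; prefer the vmalloc fallback. */
		if (size > PAGE_SIZE)
			gfp |= __GFP_NORETRY | __GFP_NOWARN;
		buf = kmalloc(size, gfp);
		if (!buf && size > PAGE_SIZE)
			buf = vmalloc(size);
		return buf;
	}

	/* New equivalent: kvmalloc() applies the fallback policy itself. */
	static void *buf_alloc_new(size_t size)
	{
		return kvmalloc(size, GFP_KERNEL);	/* free with kvfree() */
	}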
@@ -393,7 +388,7 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct iov_iter to; struct page **pages; unsigned int nr_pages; - size_t offset, dummy, copied = 0; + size_t offset, base, copied = 0; ssize_t res; int i; @@ -408,12 +403,11 @@ static ssize_t default_file_splice_read(struct file *in, loff_t *ppos, iov_iter_pipe(&to, ITER_PIPE | READ, pipe, len + offset); - res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &dummy); + res = iov_iter_get_pages_alloc(&to, &pages, len + offset, &base); if (res <= 0) return -ENOMEM; - BUG_ON(dummy); - nr_pages = DIV_ROUND_UP(res, PAGE_SIZE); + nr_pages = DIV_ROUND_UP(res + base, PAGE_SIZE); vec = __vec; if (nr_pages > PIPE_DEF_BUFFERS) { @@ -1359,6 +1353,8 @@ SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, struct fd f; long error; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; if (unlikely(nr_segs > UIO_MAXIOV)) return -EINVAL; else if (unlikely(!nr_segs)) @@ -1409,6 +1405,9 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, if (unlikely(!len)) return 0; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; + error = -EBADF; in = fdget(fd_in); if (in.file) { @@ -1737,6 +1736,9 @@ SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) struct fd in; int error; + if (unlikely(flags & ~SPLICE_F_ALL)) + return -EINVAL; + if (unlikely(!len)) return 0; diff --git a/fs/stat.c b/fs/stat.c index c6c963b2546b4f2d0301f40706064498c31fd3b6..f494b182c7c785232b1d686aae893ebbd70937c7 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -547,13 +548,13 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) /** * sys_statx - System call to get enhanced stats * @dfd: Base directory to pathwalk from *or* fd to stat. - * @filename: File to stat *or* NULL. + * @filename: File to stat or "" with AT_EMPTY_PATH * @flags: AT_* flags to control pathwalk. * @mask: Parts of statx struct actually required. * @buffer: Result buffer. * - * Note that if filename is NULL, then it does the equivalent of fstat() using - * dfd to indicate the file of interest. + * Note that fstat() can be emulated by setting dfd to the fd of interest, + * supplying "" as the filename and setting AT_EMPTY_PATH in the flags. 
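+ *
+ * Illustrative userspace equivalent of fstat(fd, ...):
+ *
+ *	struct statx stx;
+ *	statx(fd, "", AT_EMPTY_PATH, STATX_BASIC_STATS, &stx);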
*/ SYSCALL_DEFINE5(statx, int, dfd, const char __user *, filename, unsigned, flags, @@ -568,16 +569,98 @@ SYSCALL_DEFINE5(statx, if ((flags & AT_STATX_SYNC_TYPE) == AT_STATX_SYNC_TYPE) return -EINVAL; - if (filename) - error = vfs_statx(dfd, filename, flags, &stat, mask); - else - error = vfs_statx_fd(dfd, &stat, mask, flags); + error = vfs_statx(dfd, filename, flags, &stat, mask); if (error) return error; return cp_statx(&stat, buffer); } +#ifdef CONFIG_COMPAT +static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) +{ + struct compat_stat tmp; + + if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev)) + return -EOVERFLOW; + + memset(&tmp, 0, sizeof(tmp)); + tmp.st_dev = old_encode_dev(stat->dev); + tmp.st_ino = stat->ino; + if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino) + return -EOVERFLOW; + tmp.st_mode = stat->mode; + tmp.st_nlink = stat->nlink; + if (tmp.st_nlink != stat->nlink) + return -EOVERFLOW; + SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid)); + SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid)); + tmp.st_rdev = old_encode_dev(stat->rdev); + if ((u64) stat->size > MAX_NON_LFS) + return -EOVERFLOW; + tmp.st_size = stat->size; + tmp.st_atime = stat->atime.tv_sec; + tmp.st_atime_nsec = stat->atime.tv_nsec; + tmp.st_mtime = stat->mtime.tv_sec; + tmp.st_mtime_nsec = stat->mtime.tv_nsec; + tmp.st_ctime = stat->ctime.tv_sec; + tmp.st_ctime_nsec = stat->ctime.tv_nsec; + tmp.st_blocks = stat->blocks; + tmp.st_blksize = stat->blksize; + return copy_to_user(ubuf, &tmp, sizeof(tmp)) ? -EFAULT : 0; +} + +COMPAT_SYSCALL_DEFINE2(newstat, const char __user *, filename, + struct compat_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_stat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} + +COMPAT_SYSCALL_DEFINE2(newlstat, const char __user *, filename, + struct compat_stat __user *, statbuf) +{ + struct kstat stat; + int error; + + error = vfs_lstat(filename, &stat); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} + +#ifndef __ARCH_WANT_STAT64 +COMPAT_SYSCALL_DEFINE4(newfstatat, unsigned int, dfd, + const char __user *, filename, + struct compat_stat __user *, statbuf, int, flag) +{ + struct kstat stat; + int error; + + error = vfs_fstatat(dfd, filename, &stat, flag); + if (error) + return error; + return cp_compat_stat(&stat, statbuf); +} +#endif + +COMPAT_SYSCALL_DEFINE2(newfstat, unsigned int, fd, + struct compat_stat __user *, statbuf) +{ + struct kstat stat; + int error = vfs_fstat(fd, &stat); + + if (!error) + error = cp_compat_stat(&stat, statbuf); + return error; +} +#endif + /* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ void __inode_add_bytes(struct inode *inode, loff_t bytes) { diff --git a/fs/statfs.c b/fs/statfs.c index 13ae259d48795a782e566b7deaceadc73f86d8c7..4e4623c7a1260eafaf564e87b4812091a9b3eb0e 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -7,6 +7,7 @@ #include #include #include +#include #include "internal.h" static int flags_by_mnt(int mnt_flags) @@ -239,3 +240,142 @@ SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf) return copy_to_user(ubuf, &tmp, sizeof(struct ustat)) ? 
-EFAULT : 0; } + +#ifdef CONFIG_COMPAT +static int put_compat_statfs(struct compat_statfs __user *ubuf, struct kstatfs *kbuf) +{ + if (sizeof ubuf->f_blocks == 4) { + if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | + kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + return -EOVERFLOW; + /* f_files and f_ffree may be -1; it's okay + * to stuff that into 32 bits */ + if (kbuf->f_files != 0xffffffffffffffffULL + && (kbuf->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (kbuf->f_ffree != 0xffffffffffffffffULL + && (kbuf->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || + __put_user(kbuf->f_type, &ubuf->f_type) || + __put_user(kbuf->f_bsize, &ubuf->f_bsize) || + __put_user(kbuf->f_blocks, &ubuf->f_blocks) || + __put_user(kbuf->f_bfree, &ubuf->f_bfree) || + __put_user(kbuf->f_bavail, &ubuf->f_bavail) || + __put_user(kbuf->f_files, &ubuf->f_files) || + __put_user(kbuf->f_ffree, &ubuf->f_ffree) || + __put_user(kbuf->f_namelen, &ubuf->f_namelen) || + __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || + __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + return -EFAULT; + return 0; +} + +/* + * The following statfs calls are copies of code from fs/statfs.c and + * should be checked against those from time to time + */ +COMPAT_SYSCALL_DEFINE2(statfs, const char __user *, pathname, struct compat_statfs __user *, buf) +{ + struct kstatfs tmp; + int error = user_statfs(pathname, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); + return error; +} + +COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, buf) +{ + struct kstatfs tmp; + int error = fd_statfs(fd, &tmp); + if (!error) + error = put_compat_statfs(buf, &tmp); + return error; +} + +static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) +{ + if (sizeof(ubuf->f_bsize) == 4) { + if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | + kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) + return -EOVERFLOW; + /* f_files and f_ffree may be -1; it's okay + * to stuff that into 32 bits */ + if (kbuf->f_files != 0xffffffffffffffffULL + && (kbuf->f_files & 0xffffffff00000000ULL)) + return -EOVERFLOW; + if (kbuf->f_ffree != 0xffffffffffffffffULL + && (kbuf->f_ffree & 0xffffffff00000000ULL)) + return -EOVERFLOW; + } + if (!access_ok(VERIFY_WRITE, ubuf, sizeof(*ubuf)) || + __put_user(kbuf->f_type, &ubuf->f_type) || + __put_user(kbuf->f_bsize, &ubuf->f_bsize) || + __put_user(kbuf->f_blocks, &ubuf->f_blocks) || + __put_user(kbuf->f_bfree, &ubuf->f_bfree) || + __put_user(kbuf->f_bavail, &ubuf->f_bavail) || + __put_user(kbuf->f_files, &ubuf->f_files) || + __put_user(kbuf->f_ffree, &ubuf->f_ffree) || + __put_user(kbuf->f_namelen, &ubuf->f_namelen) || + __put_user(kbuf->f_fsid.val[0], &ubuf->f_fsid.val[0]) || + __put_user(kbuf->f_fsid.val[1], &ubuf->f_fsid.val[1]) || + __put_user(kbuf->f_frsize, &ubuf->f_frsize) || + __put_user(kbuf->f_flags, &ubuf->f_flags) || + __clear_user(ubuf->f_spare, sizeof(ubuf->f_spare))) + return -EFAULT; + return 0; +} + +COMPAT_SYSCALL_DEFINE3(statfs64, const char __user *, pathname, compat_size_t, sz, struct compat_statfs64 __user *, buf) +{ + struct kstatfs tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = user_statfs(pathname, &tmp); + if (!error) + error = 
put_compat_statfs64(buf, &tmp); + return error; +} + +COMPAT_SYSCALL_DEFINE3(fstatfs64, unsigned int, fd, compat_size_t, sz, struct compat_statfs64 __user *, buf) +{ + struct kstatfs tmp; + int error; + + if (sz != sizeof(*buf)) + return -EINVAL; + + error = fd_statfs(fd, &tmp); + if (!error) + error = put_compat_statfs64(buf, &tmp); + return error; +} + +/* + * This is a copy of sys_ustat, just dealing with a structure layout. + * Given how simple this syscall is, that approach is more maintainable + * than the various conversion hacks. + */ +COMPAT_SYSCALL_DEFINE2(ustat, unsigned, dev, struct compat_ustat __user *, u) +{ + struct compat_ustat tmp; + struct kstatfs sbuf; + int err = vfs_ustat(new_decode_dev(dev), &sbuf); + if (err) + return err; + + memset(&tmp, 0, sizeof(struct compat_ustat)); + tmp.f_tfree = sbuf.f_bfree; + tmp.f_tinode = sbuf.f_ffree; + if (copy_to_user(u, &tmp, sizeof(struct compat_ustat))) + return -EFAULT; + return 0; +} +#endif diff --git a/fs/super.c b/fs/super.c index b8b6a086c03b9a5478dc6a67e0b5054127032f47..adb0c0de428c2c88c0322ae739cbe108f01ae085 100644 --- a/fs/super.c +++ b/fs/super.c @@ -446,6 +446,10 @@ void generic_shutdown_super(struct super_block *sb) hlist_del_init(&sb->s_instances); spin_unlock(&sb_lock); up_write(&sb->s_umount); + if (sb->s_bdi != &noop_backing_dev_info) { + bdi_put(sb->s_bdi); + sb->s_bdi = &noop_backing_dev_info; + } } EXPORT_SYMBOL(generic_shutdown_super); @@ -1049,12 +1053,8 @@ static int set_bdev_super(struct super_block *s, void *data) { s->s_bdev = data; s->s_dev = s->s_bdev->bd_dev; + s->s_bdi = bdi_get(s->s_bdev->bd_bdi); - /* - * We set the bdi here to the queue backing, file systems can - * overwrite this in ->fill_super() - */ - s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info; return 0; } @@ -1255,6 +1255,49 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data) return ERR_PTR(error); } +/* + * Setup private BDI for given superblock. It gets automatically cleaned up + * in generic_shutdown_super(). + */ +int super_setup_bdi_name(struct super_block *sb, char *fmt, ...) +{ + struct backing_dev_info *bdi; + int err; + va_list args; + + bdi = bdi_alloc(GFP_KERNEL); + if (!bdi) + return -ENOMEM; + + bdi->name = sb->s_type->name; + + va_start(args, fmt); + err = bdi_register_va(bdi, fmt, args); + va_end(args); + if (err) { + bdi_put(bdi); + return err; + } + WARN_ON(sb->s_bdi != &noop_backing_dev_info); + sb->s_bdi = bdi; + + return 0; +} +EXPORT_SYMBOL(super_setup_bdi_name); + +/* + * Setup private BDI for given superblock. It gets automatically cleaned up + * in generic_shutdown_super(). + */ +int super_setup_bdi(struct super_block *sb) +{ + static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); + + return super_setup_bdi_name(sb, "%.28s-%ld", sb->s_type->name, + atomic_long_inc_return(&bdi_seq)); +} +EXPORT_SYMBOL(super_setup_bdi); + /* * This is an internal function, please use sb_end_{write,pagefault,intwrite} * instead. 
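super_setup_bdi_name() above gives filesystems without a real backing device a private, refcounted BDI that generic_shutdown_super() now releases automatically. A hypothetical fill_super excerpt using it (filesystem name invented for illustration; the ubifs conversion later in this series is the in-tree example):

	static int examplefs_fill_super(struct super_block *sb, void *data, int silent)
	{
		int err;

		/*
		 * Replaces the old open-coded bdi_init()/bdi_register()
		 * pair; the BDI is dropped in generic_shutdown_super().
		 */
		err = super_setup_bdi(sb);
		if (err)
			return err;

		/* e.g. disable read-ahead, as ubifs does below */
		sb->s_bdi->ra_pages = 0;

		/* ... rest of fill_super ... */
		return 0;
	}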
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 21d36d28473562d4a40f75134ae3916b11c8041c..328e89c2cf835d4ff3cec1572ae30a30621fa183 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -266,7 +266,7 @@ static const struct super_operations tracefs_super_operations = { static int trace_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr trace_files[] = {{""}}; + static const struct tree_descr trace_files[] = {{""}}; struct tracefs_fs_info *fsi; int err; diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index b0d0623c83ed88eae3afa31f079dfd2486767d07..83a961bf7280112f84a2567e2dd775f5679af3f3 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -61,3 +61,16 @@ config UBIFS_FS_ENCRYPTION feature is similar to ecryptfs, but it is more memory efficient since it avoids caching the encrypted and decrypted pages in the page cache. + +config UBIFS_FS_SECURITY + bool "UBIFS Security Labels" + depends on UBIFS_FS + default y + help + Security labels provide an access control facility to support Linux + Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO + Linux. This option enables an extended attribute handler for file + security labels in the ubifs filesystem, so that it requires enabling + the extended attribute support in advance. + + If you are not using a security module, say N. diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 1e712a36468064a75910e8ba9c0e1cfb2bd3ab28..7cd8a7b95299c2b096c93d0d0fbfd7838f8dff2c 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "ubifs.h" static DEFINE_SPINLOCK(dbg_lock); @@ -286,8 +287,10 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) break; } - pr_err("\t%d: %s (%s)\n", - count++, dent->name, get_dent_type(dent->type)); + pr_err("\t%d: inode %llu, type %s, len %d\n", + count++, (unsigned long long) le64_to_cpu(dent->inum), + get_dent_type(dent->type), + le16_to_cpu(dent->nlen)); fname_name(&nm) = dent->name; fname_len(&nm) = le16_to_cpu(dent->nlen); @@ -464,7 +467,8 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) pr_err("(bad name length, not printing, bad or corrupted node)"); else { for (i = 0; i < nlen && dent->name[i]; i++) - pr_cont("%c", dent->name[i]); + pr_cont("%c", isprint(dent->name[i]) ? 
+ dent->name[i] : '?'); } pr_cont("\n"); @@ -2387,8 +2391,8 @@ int dbg_check_nondata_nodes_order(struct ubifs_info *c, struct list_head *head) ubifs_dump_node(c, sa->node); return -EINVAL; } - if (sa->type != UBIFS_INO_NODE && sa->type != UBIFS_DENT_NODE && - sa->type != UBIFS_XENT_NODE) { + if (sb->type != UBIFS_INO_NODE && sb->type != UBIFS_DENT_NODE && + sb->type != UBIFS_XENT_NODE) { ubifs_err(c, "bad node type %d", sb->type); ubifs_dump_node(c, sb->node); return -EINVAL; diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 30825d882aa94a4c2486d47581f33d1dfca1a409..566079d9b402cdde6082260fca6e07f42f0064e8 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -121,7 +121,7 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, inode_init_owner(inode, dir, mode); inode->i_mtime = inode->i_atime = inode->i_ctime = - ubifs_current_time(inode); + current_time(inode); inode->i_mapping->nrpages = 0; switch (mode & S_IFMT) { @@ -285,6 +285,15 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, goto out_dent; } + if (ubifs_crypt_is_encrypted(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { + ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; + goto out_inode; + } + done: kfree(dent); fscrypt_free_filename(&nm); @@ -295,6 +304,8 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, d_add(dentry, inode); return NULL; +out_inode: + iput(inode); out_dent: kfree(dent); out_fname: @@ -606,8 +617,8 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) } while (1) { - dbg_gen("feed '%s', ino %llu, new f_pos %#x", - dent->name, (unsigned long long)le64_to_cpu(dent->inum), + dbg_gen("ino %llu, new f_pos %#x", + (unsigned long long)le64_to_cpu(dent->inum), key_hash_flash(c, &dent->key)); ubifs_assert(le64_to_cpu(dent->ch.sqnum) > ubifs_inode(dir)->creat_sqnum); @@ -748,9 +759,14 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, goto out_fname; lock_2_inodes(dir, inode); + + /* Handle O_TMPFILE corner case, it is allowed to link a O_TMPFILE. 
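+	 * A typical userspace sequence that reaches this path:
+	 *	fd = open("/mnt/dir", O_TMPFILE | O_RDWR, 0600);
+	 *	linkat(fd, "", AT_FDCWD, "/mnt/dir/name", AT_EMPTY_PATH);
+	 * (or the unprivileged /proc/self/fd trick with
+	 * AT_SYMLINK_FOLLOW). The still-anonymous inode stays on the
+	 * orphan list until the link gives it a name again.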
*/ + if (inode->i_nlink == 0) + ubifs_delete_orphan(c, inode->i_ino); + inc_nlink(inode); ihold(inode); - inode->i_ctime = ubifs_current_time(inode); + inode->i_ctime = current_time(inode); dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; @@ -768,6 +784,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; drop_nlink(inode); + if (inode->i_nlink == 0) + ubifs_add_orphan(c, inode->i_ino); unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); iput(inode); @@ -823,7 +841,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) } lock_2_inodes(dir, inode); - inode->i_ctime = ubifs_current_time(dir); + inode->i_ctime = current_time(dir); drop_nlink(inode); dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; @@ -927,7 +945,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) } lock_2_inodes(dir, inode); - inode->i_ctime = ubifs_current_time(dir); + inode->i_ctime = current_time(dir); clear_nlink(inode); drop_nlink(dir); dir->i_size -= sz_change; @@ -1068,8 +1086,10 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, } err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); - if (err) + if (err) { + kfree(dev); goto out_budg; + } sz_change = CALC_DENT_SIZE(fname_len(&nm)); @@ -1316,9 +1336,6 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, unsigned int uninitialized_var(saved_nlink); struct fscrypt_name old_nm, new_nm; - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - /* * Budget request settings: deletion direntry, new direntry, removing * the old inode, and changing old and new parent directory inodes. @@ -1405,7 +1422,7 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the @i_ctime for inodes on a * rename. 
*/ - time = ubifs_current_time(old_dir); + time = current_time(old_dir); old_inode->i_ctime = time; /* We must adjust parent link count when renaming directories */ @@ -1578,7 +1595,7 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, lock_4_inodes(old_dir, new_dir, NULL, NULL); - time = ubifs_current_time(old_dir); + time = current_time(old_dir); fst_inode->i_ctime = time; snd_inode->i_ctime = time; old_dir->i_mtime = old_dir->i_ctime = time; diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index d9ae86f96df7b52255c259489b5343a7cd94faf8..2cda3d67e2d01c0e047a58c327a62e2778c9a117 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1196,7 +1196,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); ui->ui_size = inode->i_size; /* Truncation changes inode [mc]time */ - inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); /* Other attributes may be changed at the same time as well */ do_attr_changes(inode, attr); err = ubifs_jnl_truncate(c, inode, old_size, new_size); @@ -1243,7 +1243,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode, mutex_lock(&ui->ui_mutex); if (attr->ia_valid & ATTR_SIZE) { /* Truncation changes inode [mc]time */ - inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); /* 'truncate_setsize()' changed @i_size, update @ui_size */ ui->ui_size = inode->i_size; } @@ -1420,7 +1420,7 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, */ static int update_mctime(struct inode *inode) { - struct timespec now = ubifs_current_time(inode); + struct timespec now = current_time(inode); struct ubifs_inode *ui = ubifs_inode(inode); struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -1434,7 +1434,7 @@ static int update_mctime(struct inode *inode) return err; mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); @@ -1511,7 +1511,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf) struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct ubifs_info *c = inode->i_sb->s_fs_info; - struct timespec now = ubifs_current_time(inode); + struct timespec now = current_time(inode); struct ubifs_budget_req req = { .new_page = 1 }; int err, update_time; @@ -1579,7 +1579,7 @@ static int ubifs_vm_page_mkwrite(struct vm_fault *vmf) struct ubifs_inode *ui = ubifs_inode(inode); mutex_lock(&ui->ui_mutex); - inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index da519ba205f614fb7a5ea9d3eb72b4c58b4504e9..fdc311246807a4c3a018ff53e06f44614f5c6800 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -53,7 +53,7 @@ void ubifs_set_inode_flags(struct inode *inode) * ioctl2ubifs - convert ioctl inode flags to UBIFS inode flags. * @ioctl_flags: flags to convert * - * This function convert ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags + * This function converts ioctl flags (@FS_COMPR_FL, etc) to UBIFS inode flags * (@UBIFS_COMPR_FL, etc). 
*/ static int ioctl2ubifs(int ioctl_flags) @@ -78,8 +78,8 @@ static int ioctl2ubifs(int ioctl_flags) * ubifs2ioctl - convert UBIFS inode flags to ioctl inode flags. * @ubifs_flags: flags to convert * - * This function convert UBIFS (@UBIFS_COMPR_FL, etc) to ioctl flags - * (@FS_COMPR_FL, etc). + * This function converts UBIFS inode flags (@UBIFS_COMPR_FL, etc) to ioctl + * flags (@FS_COMPR_FL, etc). */ static int ubifs2ioctl(int ubifs_flags) { @@ -126,7 +126,7 @@ static int setflags(struct inode *inode, int flags) ui->flags = ioctl2ubifs(flags); ubifs_set_inode_flags(inode); - inode->i_ctime = ubifs_current_time(inode); + inode->i_ctime = current_time(inode); release = ui->dirty; mark_inode_dirty_sync(inode); mutex_unlock(&ui->ui_mutex); diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 8ece6ca58c0ba128f658ea0c42ddc7976d942314..caf83d68fb38c2918bf9d421b4a67cd8a96571b7 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -224,16 +224,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c, return (void *)((struct ubifs_branch *)idx->branches)->key; } -/** - * ubifs_current_time - round current time to time granularity. - * @inode: inode - */ -static inline struct timespec ubifs_current_time(struct inode *inode) -{ - return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? - current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; -} - /** * ubifs_tnc_lookup - look up a file-system node. * @c: UBIFS file-system description object diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 586d59347fff00c4572af57bf6e487f6fa2932d8..3af4472061cc0139638efa80245516f27fe175b0 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -442,7 +442,6 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, { int empty_offs, pad_len; - lnum = lnum; dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); ubifs_assert(!(*offs & 7)); diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 7f1ead29e7270a39374bef080035beaf42374820..8c25081a510969055405b95ed57e8d94ec556f44 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -84,6 +84,8 @@ static int create_default_filesystem(struct ubifs_info *c) int min_leb_cnt = UBIFS_MIN_LEB_CNT; long long tmp64, main_bytes; __le64 tmp_le64; + __le32 tmp_le32; + struct timespec ts; /* Some functions called from here depend on the @c->key_len filed */ c->key_len = UBIFS_SK_LEN; @@ -298,13 +300,17 @@ static int create_default_filesystem(struct ubifs_info *c) ino->ch.node_type = UBIFS_INO_NODE; ino->creat_sqnum = cpu_to_le64(++c->max_sqnum); ino->nlink = cpu_to_le32(2); - tmp_le64 = cpu_to_le64(CURRENT_TIME_SEC.tv_sec); + + ktime_get_real_ts(&ts); + ts = timespec_trunc(ts, DEFAULT_TIME_GRAN); + tmp_le64 = cpu_to_le64(ts.tv_sec); ino->atime_sec = tmp_le64; ino->ctime_sec = tmp_le64; ino->mtime_sec = tmp_le64; - ino->atime_nsec = 0; - ino->ctime_nsec = 0; - ino->mtime_nsec = 0; + tmp_le32 = cpu_to_le32(ts.tv_nsec); + ino->atime_nsec = tmp_le32; + ino->ctime_nsec = tmp_le32; + ino->mtime_nsec = tmp_le32; ino->mode = cpu_to_le32(S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO); ino->size = cpu_to_le64(UBIFS_INO_NODE_SZ); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b73811bd7676d6221243f21d52b06d9efed7cf4a..cf4cc99b75b55531789a03b065d86f521489f67e 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1827,7 +1827,6 @@ static void ubifs_put_super(struct super_block *sb) } ubifs_umount(c); - bdi_destroy(&c->bdi); ubi_close_volume(c->ubi); mutex_unlock(&c->umount_mutex); } @@ -2019,29 +2018,25 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int 
silent) goto out; } + err = ubifs_parse_options(c, data, 0); + if (err) + goto out_close; + /* * UBIFS provides 'backing_dev_info' in order to disable read-ahead. For * UBIFS, I/O is not deferred, it is done immediately in readpage, * which means the user would have to wait not just for their own I/O * but the read-ahead I/O as well i.e. completely pointless. * - * Read-ahead will be disabled because @c->bdi.ra_pages is 0. + * Read-ahead will be disabled because @sb->s_bdi->ra_pages is 0. Also + * @sb->s_bdi->capabilities are initialized to 0 so there won't be any + * writeback happening. */ - c->bdi.name = "ubifs", - c->bdi.capabilities = 0; - err = bdi_init(&c->bdi); + err = super_setup_bdi_name(sb, "ubifs_%d_%d", c->vi.ubi_num, + c->vi.vol_id); if (err) goto out_close; - err = bdi_register(&c->bdi, NULL, "ubifs_%d_%d", - c->vi.ubi_num, c->vi.vol_id); - if (err) - goto out_bdi; - - err = ubifs_parse_options(c, data, 0); - if (err) - goto out_bdi; - sb->s_bdi = &c->bdi; sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; sb->s_blocksize = UBIFS_BLOCK_SIZE; @@ -2080,8 +2075,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) ubifs_umount(c); out_unlock: mutex_unlock(&c->umount_mutex); -out_bdi: - bdi_destroy(&c->bdi); out_close: ubi_close_volume(c->ubi); out: diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 4d57e488038e342f3b5ed84f5554862a86e440f3..298b4d89eee98b74440d84097d1789ba5de8a906 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -972,7 +972,6 @@ struct ubifs_debug_info; * struct ubifs_info - UBIFS file-system description data structure * (per-superblock). * @vfs_sb: VFS @struct super_block object - * @bdi: backing device info object to make VFS happy and disable read-ahead * * @highest_inum: highest used inode number * @max_sqnum: current global sequence number @@ -1220,7 +1219,6 @@ struct ubifs_debug_info; */ struct ubifs_info { struct super_block *vfs_sb; - struct backing_dev_info bdi; ino_t highest_inum; unsigned long long max_sqnum; @@ -1461,7 +1459,6 @@ extern const struct inode_operations ubifs_file_inode_operations; extern const struct file_operations ubifs_dir_operations; extern const struct inode_operations ubifs_dir_inode_operations; extern const struct inode_operations ubifs_symlink_inode_operations; -extern struct backing_dev_info ubifs_backing_dev_info; extern struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; /* io.c */ @@ -1756,13 +1753,23 @@ int ubifs_check_dir_empty(struct inode *dir); /* xattr.c */ extern const struct xattr_handler *ubifs_xattr_handlers[]; ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); -int ubifs_init_security(struct inode *dentry, struct inode *inode, - const struct qstr *qstr); int ubifs_xattr_set(struct inode *host, const char *name, const void *value, size_t size, int flags); ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, size_t size); +#ifdef CONFIG_UBIFS_FS_SECURITY +extern int ubifs_init_security(struct inode *dentry, struct inode *inode, + const struct qstr *qstr); +#else +static inline int ubifs_init_security(struct inode *dentry, + struct inode *inode, const struct qstr *qstr) +{ + return 0; +} +#endif + + /* super.c */ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index efe00fcb8b750961918a7e2c98ee3d670968ca72..6c9e62c2ef559b2d6b3d3383436775795e927375 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -152,7 +152,7 @@ static int create_xattr(struct 
ubifs_info *c, struct inode *host, ui->data_len = size; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = ubifs_current_time(host); + host->i_ctime = current_time(host); host_ui->xattr_cnt += 1; host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -234,7 +234,7 @@ static int change_xattr(struct ubifs_info *c, struct inode *host, mutex_unlock(&ui->ui_mutex); mutex_lock(&host_ui->ui_mutex); - host->i_ctime = ubifs_current_time(host); + host->i_ctime = current_time(host); host_ui->xattr_size -= CALC_XATTR_BYTES(old_size); host_ui->xattr_size += CALC_XATTR_BYTES(size); @@ -488,7 +488,7 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, return err; mutex_lock(&host_ui->ui_mutex); - host->i_ctime = ubifs_current_time(host); + host->i_ctime = current_time(host); host_ui->xattr_cnt -= 1; host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); @@ -559,6 +559,7 @@ static int ubifs_xattr_remove(struct inode *host, const char *name) return err; } +#ifdef CONFIG_UBIFS_FS_SECURITY static int init_xattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { @@ -599,6 +600,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode, } return err; } +#endif static int xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, @@ -639,15 +641,19 @@ static const struct xattr_handler ubifs_trusted_xattr_handler = { .set = xattr_set, }; +#ifdef CONFIG_UBIFS_FS_SECURITY static const struct xattr_handler ubifs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = xattr_get, .set = xattr_set, }; +#endif const struct xattr_handler *ubifs_xattr_handlers[] = { &ubifs_user_xattr_handler, &ubifs_trusted_xattr_handler, +#ifdef CONFIG_UBIFS_FS_SECURITY &ubifs_security_xattr_handler, +#endif NULL }; diff --git a/fs/udf/file.c b/fs/udf/file.c index e04cc0cdca9d736683c4fc5ac647e7fbdcc872c4..f5eb2d5b3bac86d91eaaddddbec6237dbbadfa38 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -44,12 +44,12 @@ static void __udf_adinicb_readpage(struct page *page) char *kaddr; struct udf_inode_info *iinfo = UDF_I(inode); - kaddr = kmap(page); + kaddr = kmap_atomic(page); memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, inode->i_size); memset(kaddr + inode->i_size, 0, PAGE_SIZE - inode->i_size); flush_dcache_page(page); SetPageUptodate(page); - kunmap(page); + kunmap_atomic(kaddr); } static int udf_adinicb_readpage(struct file *file, struct page *page) @@ -70,11 +70,11 @@ static int udf_adinicb_writepage(struct page *page, BUG_ON(!PageLocked(page)); - kaddr = kmap(page); + kaddr = kmap_atomic(page); memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); - mark_inode_dirty(inode); SetPageUptodate(page); - kunmap(page); + kunmap_atomic(kaddr); + mark_inode_dirty(inode); unlock_page(page); return 0; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a8d8f71ef8bdb577675284e9cb5f03d38f1d34f7..98c510e172032df27a92b296507a3c1cb3017cdb 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -276,14 +276,14 @@ int udf_expand_file_adinicb(struct inode *inode) return -ENOMEM; if (!PageUptodate(page)) { - kaddr = kmap(page); + kaddr = kmap_atomic(page); memset(kaddr + iinfo->i_lenAlloc, 0x00, PAGE_SIZE - iinfo->i_lenAlloc); memcpy(kaddr, iinfo->i_ext.i_data + iinfo->i_lenEAttr, iinfo->i_lenAlloc); flush_dcache_page(page); SetPageUptodate(page); - kunmap(page); + kunmap_atomic(kaddr); } down_write(&iinfo->i_data_sem); 
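	/*
	 * Note the ordering above: the atomic kmap is dropped before
	 * i_data_sem is taken, since code holding a kmap_atomic()
	 * mapping must not sleep and down_write() may block.
	 */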
memset(iinfo->i_ext.i_data + iinfo->i_lenEAttr, 0x00, @@ -300,11 +300,11 @@ int udf_expand_file_adinicb(struct inode *inode) if (err) { /* Restore everything back so that we don't lose data... */ lock_page(page); - kaddr = kmap(page); down_write(&iinfo->i_data_sem); + kaddr = kmap_atomic(page); memcpy(iinfo->i_ext.i_data + iinfo->i_lenEAttr, kaddr, inode->i_size); - kunmap(page); + kunmap_atomic(kaddr); unlock_page(page); iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; inode->i_data.a_ops = &udf_adinicb_aops; @@ -1535,7 +1535,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode) inode->i_data.a_ops = &udf_symlink_aops; inode->i_op = &udf_symlink_inode_operations; inode_nohighmem(inode); - inode->i_mode = S_IFLNK | S_IRWXUGO; + inode->i_mode = S_IFLNK | 0777; break; case ICBTAG_FILE_TYPE_MAIN: udf_debug("METADATA FILE-----\n"); @@ -1591,9 +1591,9 @@ static umode_t udf_convert_permissions(struct fileEntry *fe) permissions = le32_to_cpu(fe->permissions); flags = le16_to_cpu(fe->icbTag.flags); - mode = ((permissions) & S_IRWXO) | - ((permissions >> 2) & S_IRWXG) | - ((permissions >> 4) & S_IRWXU) | + mode = ((permissions) & 0007) | + ((permissions >> 2) & 0070) | + ((permissions >> 4) & 0700) | ((flags & ICBTAG_FLAG_SETUID) ? S_ISUID : 0) | ((flags & ICBTAG_FLAG_SETGID) ? S_ISGID : 0) | ((flags & ICBTAG_FLAG_STICKY) ? S_ISVTX : 0); @@ -1669,9 +1669,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) else fe->gid = cpu_to_le32(i_gid_read(inode)); - udfperms = ((inode->i_mode & S_IRWXO)) | - ((inode->i_mode & S_IRWXG) << 2) | - ((inode->i_mode & S_IRWXU) << 4); + udfperms = ((inode->i_mode & 0007)) | + ((inode->i_mode & 0070) << 2) | + ((inode->i_mode & 0700) << 4); udfperms |= (le32_to_cpu(fe->permissions) & (FE_PERM_O_DELETE | FE_PERM_O_CHATTR | diff --git a/fs/udf/namei.c b/fs/udf/namei.c index babf48d0e55330ec798460c2a7500878b90eeb06..385ee89d58242d9ffc920b9d2cb6db55999d073b 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -906,7 +906,7 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) static int udf_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO); + struct inode *inode = udf_new_inode(dir, S_IFLNK | 0777); struct pathComponent *pc; const char *compstart; struct extent_position epos = {}; diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c index 9774555b37217eba4c7fa5375e6df0c7cfa64ae8..d1dd8cc33179137988aedf9bb15098547906a561 100644 --- a/fs/ufs/ialloc.c +++ b/fs/ufs/ialloc.c @@ -176,6 +176,7 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) struct ufs_cg_private_info * ucpi; struct ufs_cylinder_group * ucg; struct inode * inode; + struct timespec64 ts; unsigned cg, bit, i, j, start; struct ufs_inode_info *ufsi; int err = -ENOSPC; @@ -323,8 +324,9 @@ struct inode *ufs_new_inode(struct inode *dir, umode_t mode) lock_buffer(bh); ufs2_inode = (struct ufs2_inode *)bh->b_data; ufs2_inode += ufs_inotofsbo(inode->i_ino); - ufs2_inode->ui_birthtime = cpu_to_fs64(sb, CURRENT_TIME.tv_sec); - ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, CURRENT_TIME.tv_nsec); + ktime_get_real_ts64(&ts); + ufs2_inode->ui_birthtime = cpu_to_fs64(sb, ts.tv_sec); + ufs2_inode->ui_birthnsec = cpu_to_fs32(sb, ts.tv_nsec); mark_buffer_dirty(bh); unlock_buffer(bh); if (sb->s_flags & MS_SYNCHRONOUS) diff --git a/fs/utimes.c b/fs/utimes.c index 32b15b3f6629023e55c34474bae0fa942a8df6ea..6571d8c848a04054e7517f7ad29b55d5b9022232 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -1,14 +1,10 @@ 
-#include #include -#include -#include #include #include -#include -#include #include #include #include +#include #include #ifdef __ARCH_WANT_SYS_UTIME @@ -219,3 +215,63 @@ SYSCALL_DEFINE2(utimes, char __user *, filename, { return sys_futimesat(AT_FDCWD, filename, utimes); } + +#ifdef CONFIG_COMPAT +/* + * Not all architectures have sys_utime, so implement this in terms + * of sys_utimes. + */ +COMPAT_SYSCALL_DEFINE2(utime, const char __user *, filename, + struct compat_utimbuf __user *, t) +{ + struct timespec tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t->actime) || + get_user(tv[1].tv_sec, &t->modtime)) + return -EFAULT; + tv[0].tv_nsec = 0; + tv[1].tv_nsec = 0; + } + return do_utimes(AT_FDCWD, filename, t ? tv : NULL, 0); +} + +COMPAT_SYSCALL_DEFINE4(utimensat, unsigned int, dfd, const char __user *, filename, struct compat_timespec __user *, t, int, flags) +{ + struct timespec tv[2]; + + if (t) { + if (compat_get_timespec(&tv[0], &t[0]) || + compat_get_timespec(&tv[1], &t[1])) + return -EFAULT; + + if (tv[0].tv_nsec == UTIME_OMIT && tv[1].tv_nsec == UTIME_OMIT) + return 0; + } + return do_utimes(dfd, filename, t ? tv : NULL, flags); +} + +COMPAT_SYSCALL_DEFINE3(futimesat, unsigned int, dfd, const char __user *, filename, struct compat_timeval __user *, t) +{ + struct timespec tv[2]; + + if (t) { + if (get_user(tv[0].tv_sec, &t[0].tv_sec) || + get_user(tv[0].tv_nsec, &t[0].tv_usec) || + get_user(tv[1].tv_sec, &t[1].tv_sec) || + get_user(tv[1].tv_nsec, &t[1].tv_usec)) + return -EFAULT; + if (tv[0].tv_nsec >= 1000000 || tv[0].tv_nsec < 0 || + tv[1].tv_nsec >= 1000000 || tv[1].tv_nsec < 0) + return -EINVAL; + tv[0].tv_nsec *= 1000; + tv[1].tv_nsec *= 1000; + } + return do_utimes(dfd, filename, t ? tv : NULL, 0); +} + +COMPAT_SYSCALL_DEFINE2(utimes, const char __user *, filename, struct compat_timeval __user *, t) +{ + return compat_sys_futimesat(AT_FDCWD, filename, t); +} +#endif diff --git a/fs/xattr.c b/fs/xattr.c index 7e3317cf4045523a7d0b17f102d51775808a8950..464c94bf65f9e55339ec9d4bc515ec990691deb7 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -431,12 +431,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, if (size) { if (size > XATTR_SIZE_MAX) return -E2BIG; - kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!kvalue) { - kvalue = vmalloc(size); - if (!kvalue) - return -ENOMEM; - } + kvalue = kvmalloc(size, GFP_KERNEL); + if (!kvalue) + return -ENOMEM; if (copy_from_user(kvalue, value, size)) { error = -EFAULT; goto out; @@ -528,12 +525,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, if (size) { if (size > XATTR_SIZE_MAX) size = XATTR_SIZE_MAX; - kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!kvalue) { - kvalue = vmalloc(size); - if (!kvalue) - return -ENOMEM; - } + kvalue = kvzalloc(size, GFP_KERNEL); + if (!kvalue) + return -ENOMEM; } error = vfs_getxattr(d, kname, kvalue, size); @@ -611,12 +605,9 @@ listxattr(struct dentry *d, char __user *list, size_t size) if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; - klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL); - if (!klist) { - klist = vmalloc(size); - if (!klist) - return -ENOMEM; - } + klist = kvmalloc(size, GFP_KERNEL); + if (!klist) + return -ENOMEM; } error = vfs_listxattr(d, klist, size); diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 26ef1958b65b5289583090b6143b2ddec03c8c90..5c90f82b8f6b88d4f93a3f7f33b3c0e1b8ec4b20 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -79,6 +79,7 @@ xfs-y += xfs_aops.o \ xfs_extent_busy.o \ 
xfs_file.o \
 	xfs_filestream.o \
+	xfs_fsmap.o \
 	xfs_fsops.o \
 	xfs_globals.o \
 	xfs_icache.o \
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 70a5b55e0870a0523c0dd8ce629debf2fccebe25..393b6849aeb31991ed43808ff213fb3cadba00d0 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -48,7 +48,7 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
 void *
 kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 {
-	unsigned noio_flag = 0;
+	unsigned nofs_flag = 0;
 	void	*ptr;
 	gfp_t	lflags;
 
@@ -60,17 +60,17 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 	 * __vmalloc() will allocate data pages and auxiliary structures (e.g.
 	 * pagetables) with GFP_KERNEL, yet we may be under GFP_NOFS context
 	 * here. Hence we need to tell memory reclaim that we are in such a
-	 * context via PF_MEMALLOC_NOIO to prevent memory reclaim re-entering
+	 * context via PF_MEMALLOC_NOFS to prevent memory reclaim re-entering
 	 * the filesystem here and potentially deadlocking.
 	 */
-	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
-		noio_flag = memalloc_noio_save();
+	if (flags & KM_NOFS)
+		nofs_flag = memalloc_nofs_save();
 
 	lflags = kmem_flags_convert(flags);
-	ptr = __vmalloc(size, lflags | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
+	ptr = __vmalloc(size, lflags | __GFP_ZERO, PAGE_KERNEL);
 
-	if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
-		memalloc_noio_restore(noio_flag);
+	if (flags & KM_NOFS)
+		memalloc_nofs_restore(nofs_flag);
 
 	return ptr;
 }
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index f0fc84fcaac2553283f90bc3f157b924bd03d932..d6ea520162b26530bbb560b667d925b6c6690b63 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -50,7 +50,7 @@ kmem_flags_convert(xfs_km_flags_t flags)
 		lflags = GFP_ATOMIC | __GFP_NOWARN;
 	} else {
 		lflags = GFP_KERNEL | __GFP_NOWARN;
-		if ((current->flags & PF_FSTRANS) || (flags & KM_NOFS))
+		if (flags & KM_NOFS)
 			lflags &= ~__GFP_FS;
 	}
 
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 369adcc18c02900f096a5ccd543a8c424dff7894..7486401ccbd36e0b076c020f15f433e9592322cc 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -2868,3 +2868,60 @@ xfs_free_extent(
 	xfs_trans_brelse(tp, agbp);
 	return error;
 }
+
+struct xfs_alloc_query_range_info {
+	xfs_alloc_query_range_fn	fn;
+	void				*priv;
+};
+
+/* Format btree record and pass to our callback. */
+STATIC int
+xfs_alloc_query_range_helper(
+	struct xfs_btree_cur		*cur,
+	union xfs_btree_rec		*rec,
+	void				*priv)
+{
+	struct xfs_alloc_query_range_info	*query = priv;
+	struct xfs_alloc_rec_incore		irec;
+
+	irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
+	irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
+	return query->fn(cur, &irec, query->priv);
+}
+
+/* Find all free space within a given range of blocks. */
+int
+xfs_alloc_query_range(
+	struct xfs_btree_cur		*cur,
+	struct xfs_alloc_rec_incore	*low_rec,
+	struct xfs_alloc_rec_incore	*high_rec,
+	xfs_alloc_query_range_fn	fn,
+	void				*priv)
+{
+	union xfs_btree_irec		low_brec;
+	union xfs_btree_irec		high_brec;
+	struct xfs_alloc_query_range_info	query;
+
+	ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
+	low_brec.a = *low_rec;
+	high_brec.a = *high_rec;
+	query.priv = priv;
+	query.fn = fn;
+	return xfs_btree_query_range(cur, &low_brec, &high_brec,
+			xfs_alloc_query_range_helper, &query);
+}
+
+/* Find all free space records.
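+ * This walks every record in the AG's by-block (bnobt) free space btree
+ * by querying from the smallest possible key to the largest, calling @fn
+ * on each record found.  A minimal sketch of a caller (hypothetical
+ * callback name, error handling elided):
+ *
+ *	cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
+ *	error = xfs_alloc_query_all(cur, show_free_extent, &my_data);
+ *	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);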
*/ +int +xfs_alloc_query_all( + struct xfs_btree_cur *cur, + xfs_alloc_query_range_fn fn, + void *priv) +{ + struct xfs_alloc_query_range_info query; + + ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); + query.priv = priv; + query.fn = fn; + return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); +} diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 2a8d0fa6fbbea3973f96183934d2ff2e0e581f53..77d9c27330abc2fbe7dbf192a8d25e8c2352fa17 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -219,4 +219,16 @@ int xfs_free_extent_fix_freelist(struct xfs_trans *tp, xfs_agnumber_t agno, xfs_extlen_t xfs_prealloc_blocks(struct xfs_mount *mp); +typedef int (*xfs_alloc_query_range_fn)( + struct xfs_btree_cur *cur, + struct xfs_alloc_rec_incore *rec, + void *priv); + +int xfs_alloc_query_range(struct xfs_btree_cur *cur, + struct xfs_alloc_rec_incore *low_rec, + struct xfs_alloc_rec_incore *high_rec, + xfs_alloc_query_range_fn fn, void *priv); +int xfs_alloc_query_all(struct xfs_btree_cur *cur, xfs_alloc_query_range_fn fn, + void *priv); + #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index efb467b10a71d35927426a5e916b5d96d7dd9978..e1fcfe7f0a9a4fb5665c296c6b171d69cdf80542 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -205,19 +205,37 @@ xfs_allocbt_init_key_from_rec( union xfs_btree_key *key, union xfs_btree_rec *rec) { - ASSERT(rec->alloc.ar_startblock != 0); - key->alloc.ar_startblock = rec->alloc.ar_startblock; key->alloc.ar_blockcount = rec->alloc.ar_blockcount; } +STATIC void +xfs_bnobt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + __u32 x; + + x = be32_to_cpu(rec->alloc.ar_startblock); + x += be32_to_cpu(rec->alloc.ar_blockcount) - 1; + key->alloc.ar_startblock = cpu_to_be32(x); + key->alloc.ar_blockcount = 0; +} + +STATIC void +xfs_cntbt_init_high_key_from_rec( + union xfs_btree_key *key, + union xfs_btree_rec *rec) +{ + key->alloc.ar_blockcount = rec->alloc.ar_blockcount; + key->alloc.ar_startblock = 0; +} + STATIC void xfs_allocbt_init_rec_from_cur( struct xfs_btree_cur *cur, union xfs_btree_rec *rec) { - ASSERT(cur->bc_rec.a.ar_startblock != 0); - rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock); rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount); } @@ -236,18 +254,24 @@ xfs_allocbt_init_ptr_from_cur( } STATIC __int64_t -xfs_allocbt_key_diff( +xfs_bnobt_key_diff( struct xfs_btree_cur *cur, union xfs_btree_key *key) { xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; xfs_alloc_key_t *kp = &key->alloc; - __int64_t diff; - if (cur->bc_btnum == XFS_BTNUM_BNO) { - return (__int64_t)be32_to_cpu(kp->ar_startblock) - - rec->ar_startblock; - } + return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; +} + +STATIC __int64_t +xfs_cntbt_key_diff( + struct xfs_btree_cur *cur, + union xfs_btree_key *key) +{ + xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a; + xfs_alloc_key_t *kp = &key->alloc; + __int64_t diff; diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount; if (diff) @@ -256,6 +280,33 @@ xfs_allocbt_key_diff( return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; } +STATIC __int64_t +xfs_bnobt_diff_two_keys( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return (__int64_t)be32_to_cpu(k1->alloc.ar_startblock) - + be32_to_cpu(k2->alloc.ar_startblock); +} + +STATIC __int64_t +xfs_cntbt_diff_two_keys( + 
struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + __int64_t diff; + + diff = be32_to_cpu(k1->alloc.ar_blockcount) - + be32_to_cpu(k2->alloc.ar_blockcount); + if (diff) + return diff; + + return be32_to_cpu(k1->alloc.ar_startblock) - + be32_to_cpu(k2->alloc.ar_startblock); +} + static bool xfs_allocbt_verify( struct xfs_buf *bp) @@ -346,44 +397,54 @@ const struct xfs_buf_ops xfs_allocbt_buf_ops = { #if defined(DEBUG) || defined(XFS_WARN) STATIC int -xfs_allocbt_keys_inorder( +xfs_bnobt_keys_inorder( struct xfs_btree_cur *cur, union xfs_btree_key *k1, union xfs_btree_key *k2) { - if (cur->bc_btnum == XFS_BTNUM_BNO) { - return be32_to_cpu(k1->alloc.ar_startblock) < - be32_to_cpu(k2->alloc.ar_startblock); - } else { - return be32_to_cpu(k1->alloc.ar_blockcount) < - be32_to_cpu(k2->alloc.ar_blockcount) || - (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount && - be32_to_cpu(k1->alloc.ar_startblock) < - be32_to_cpu(k2->alloc.ar_startblock)); - } + return be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock); } STATIC int -xfs_allocbt_recs_inorder( +xfs_bnobt_recs_inorder( struct xfs_btree_cur *cur, union xfs_btree_rec *r1, union xfs_btree_rec *r2) { - if (cur->bc_btnum == XFS_BTNUM_BNO) { - return be32_to_cpu(r1->alloc.ar_startblock) + - be32_to_cpu(r1->alloc.ar_blockcount) <= - be32_to_cpu(r2->alloc.ar_startblock); - } else { - return be32_to_cpu(r1->alloc.ar_blockcount) < - be32_to_cpu(r2->alloc.ar_blockcount) || - (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount && - be32_to_cpu(r1->alloc.ar_startblock) < - be32_to_cpu(r2->alloc.ar_startblock)); - } + return be32_to_cpu(r1->alloc.ar_startblock) + + be32_to_cpu(r1->alloc.ar_blockcount) <= + be32_to_cpu(r2->alloc.ar_startblock); +} + +STATIC int +xfs_cntbt_keys_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_key *k1, + union xfs_btree_key *k2) +{ + return be32_to_cpu(k1->alloc.ar_blockcount) < + be32_to_cpu(k2->alloc.ar_blockcount) || + (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount && + be32_to_cpu(k1->alloc.ar_startblock) < + be32_to_cpu(k2->alloc.ar_startblock)); } -#endif /* DEBUG */ -static const struct xfs_btree_ops xfs_allocbt_ops = { +STATIC int +xfs_cntbt_recs_inorder( + struct xfs_btree_cur *cur, + union xfs_btree_rec *r1, + union xfs_btree_rec *r2) +{ + return be32_to_cpu(r1->alloc.ar_blockcount) < + be32_to_cpu(r2->alloc.ar_blockcount) || + (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount && + be32_to_cpu(r1->alloc.ar_startblock) < + be32_to_cpu(r2->alloc.ar_startblock)); +} +#endif /* DEBUG */ + +static const struct xfs_btree_ops xfs_bnobt_ops = { .rec_len = sizeof(xfs_alloc_rec_t), .key_len = sizeof(xfs_alloc_key_t), @@ -395,13 +456,39 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { .get_minrecs = xfs_allocbt_get_minrecs, .get_maxrecs = xfs_allocbt_get_maxrecs, .init_key_from_rec = xfs_allocbt_init_key_from_rec, + .init_high_key_from_rec = xfs_bnobt_init_high_key_from_rec, .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, - .key_diff = xfs_allocbt_key_diff, + .key_diff = xfs_bnobt_key_diff, .buf_ops = &xfs_allocbt_buf_ops, + .diff_two_keys = xfs_bnobt_diff_two_keys, #if defined(DEBUG) || defined(XFS_WARN) - .keys_inorder = xfs_allocbt_keys_inorder, - .recs_inorder = xfs_allocbt_recs_inorder, + .keys_inorder = xfs_bnobt_keys_inorder, + .recs_inorder = xfs_bnobt_recs_inorder, +#endif +}; + +static const struct xfs_btree_ops xfs_cntbt_ops = { + .rec_len = sizeof(xfs_alloc_rec_t), + .key_len = 
sizeof(xfs_alloc_key_t), + + .dup_cursor = xfs_allocbt_dup_cursor, + .set_root = xfs_allocbt_set_root, + .alloc_block = xfs_allocbt_alloc_block, + .free_block = xfs_allocbt_free_block, + .update_lastrec = xfs_allocbt_update_lastrec, + .get_minrecs = xfs_allocbt_get_minrecs, + .get_maxrecs = xfs_allocbt_get_maxrecs, + .init_key_from_rec = xfs_allocbt_init_key_from_rec, + .init_high_key_from_rec = xfs_cntbt_init_high_key_from_rec, + .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, + .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, + .key_diff = xfs_cntbt_key_diff, + .buf_ops = &xfs_allocbt_buf_ops, + .diff_two_keys = xfs_cntbt_diff_two_keys, +#if defined(DEBUG) || defined(XFS_WARN) + .keys_inorder = xfs_cntbt_keys_inorder, + .recs_inorder = xfs_cntbt_recs_inorder, #endif }; @@ -427,16 +514,15 @@ xfs_allocbt_init_cursor( cur->bc_mp = mp; cur->bc_btnum = btnum; cur->bc_blocklog = mp->m_sb.sb_blocklog; - cur->bc_ops = &xfs_allocbt_ops; - if (btnum == XFS_BTNUM_BNO) - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); - else - cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); if (btnum == XFS_BTNUM_CNT) { + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); + cur->bc_ops = &xfs_cntbt_ops; cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); cur->bc_flags = XFS_BTREE_LASTREC_UPDATE; } else { + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); + cur->bc_ops = &xfs_bnobt_ops; cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]); } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9bd104f32908962046af6d2dd4437a045fecdb36..f02eb767339219e8e59418429c666b3934d795f4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -764,7 +764,6 @@ xfs_bmap_extents_to_btree( args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino); } else if (dfops->dop_low) { args.type = XFS_ALLOCTYPE_START_BNO; -try_another_ag: args.fsbno = *firstblock; } else { args.type = XFS_ALLOCTYPE_NEAR_BNO; @@ -779,20 +778,6 @@ xfs_bmap_extents_to_btree( return error; } - /* - * During a CoW operation, the allocation and bmbt updates occur in - * different transactions. The mapping code tries to put new bmbt - * blocks near extents being mapped, but the only way to guarantee this - * is if the alloc and the mapping happen in a single transaction that - * has a block reservation. That isn't the case here, so if we run out - * of space we'll try again with another AG. - */ - if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && - args.fsbno == NULLFSBLOCK && - args.type == XFS_ALLOCTYPE_NEAR_BNO) { - args.type = XFS_ALLOCTYPE_FIRST_AG; - goto try_another_ag; - } if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) { xfs_iroot_realloc(ip, -1, whichfork); xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -925,7 +910,6 @@ xfs_bmap_local_to_extents( * file currently fits in an inode. */ if (*firstblock == NULLFSBLOCK) { -try_another_ag: args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino); args.type = XFS_ALLOCTYPE_START_BNO; } else { @@ -938,19 +922,6 @@ xfs_bmap_local_to_extents( if (error) goto done; - /* - * During a CoW operation, the allocation and bmbt updates occur in - * different transactions. The mapping code tries to put new bmbt - * blocks near extents being mapped, but the only way to guarantee this - * is if the alloc and the mapping happen in a single transaction that - * has a block reservation. That isn't the case here, so if we run out - * of space we'll try again with another AG. 
- */ - if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) && - args.fsbno == NULLFSBLOCK && - args.type == XFS_ALLOCTYPE_NEAR_BNO) { - goto try_another_ag; - } /* Can't fail, the space was reserved. */ ASSERT(args.fsbno != NULLFSBLOCK); ASSERT(args.len == 1); @@ -1260,7 +1231,6 @@ xfs_bmap_read_extents( xfs_fsblock_t bno; /* block # of "block" */ xfs_buf_t *bp; /* buffer for "block" */ int error; /* error return value */ - xfs_exntfmt_t exntf; /* XFS_EXTFMT_NOSTATE, if checking */ xfs_extnum_t i, j; /* index into the extents list */ xfs_ifork_t *ifp; /* fork structure */ int level; /* btree level, for checking */ @@ -1271,8 +1241,6 @@ xfs_bmap_read_extents( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); - exntf = (whichfork != XFS_DATA_FORK) ? XFS_EXTFMT_NOSTATE : - XFS_EXTFMT_INODE(ip); block = ifp->if_broot; /* * Root level must use BMAP_BROOT_PTR_ADDR macro to get ptr out. @@ -1340,18 +1308,9 @@ xfs_bmap_read_extents( xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i); trp->l0 = be64_to_cpu(frp->l0); trp->l1 = be64_to_cpu(frp->l1); - } - if (exntf == XFS_EXTFMT_NOSTATE) { - /* - * Check all attribute bmap btree records and - * any "older" data bmap btree records for a - * set bit in the "extent flag" position. - */ - if (unlikely(xfs_check_nostate_extents(ifp, - start, num_recs))) { + if (!xfs_bmbt_validate_extent(mp, whichfork, trp)) { XFS_ERROR_REPORT("xfs_bmap_read_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); + XFS_ERRLEVEL_LOW, mp); goto error0; } } @@ -2879,27 +2838,30 @@ xfs_bmap_add_extent_hole_delay( */ STATIC int /* error */ xfs_bmap_add_extent_hole_real( - struct xfs_bmalloca *bma, - int whichfork) + struct xfs_trans *tp, + struct xfs_inode *ip, + int whichfork, + xfs_extnum_t *idx, + struct xfs_btree_cur **curp, + struct xfs_bmbt_irec *new, + xfs_fsblock_t *first, + struct xfs_defer_ops *dfops, + int *logflagsp) { - struct xfs_bmbt_irec *new = &bma->got; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_mount *mp = ip->i_mount; + struct xfs_btree_cur *cur = *curp; int error; /* error return value */ int i; /* temp state */ - xfs_ifork_t *ifp; /* inode fork pointer */ xfs_bmbt_irec_t left; /* left neighbor extent entry */ xfs_bmbt_irec_t right; /* right neighbor extent entry */ int rval=0; /* return value (logging flags) */ int state; /* state bits, accessed thru macros */ - struct xfs_mount *mp; - mp = bma->ip->i_mount; - ifp = XFS_IFORK_PTR(bma->ip, whichfork); - - ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= xfs_iext_count(ifp)); + ASSERT(*idx >= 0); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); - ASSERT(!bma->cur || - !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); + ASSERT(!cur || !(cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2912,9 +2874,9 @@ xfs_bmap_add_extent_hole_real( /* * Check and set flags if this segment has a left neighbor. */ - if (bma->idx > 0) { + if (*idx > 0) { state |= BMAP_LEFT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &left); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx - 1), &left); if (isnullstartblock(left.br_startblock)) state |= BMAP_LEFT_DELAY; } @@ -2923,9 +2885,9 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. 
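	 * (BMAP_RIGHT_VALID below means there is an extent record at *idx
	 * that the new allocation may be able to merge with.)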
*/ - if (bma->idx < xfs_iext_count(ifp)) { + if (*idx < xfs_iext_count(ifp)) { state |= BMAP_RIGHT_VALID; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); if (isnullstartblock(right.br_startblock)) state |= BMAP_RIGHT_DELAY; } @@ -2962,36 +2924,36 @@ xfs_bmap_add_extent_hole_real( * left and on the right. * Merge all three into a single extent record. */ - --bma->idx; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), left.br_blockcount + new->br_blockcount + right.br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - xfs_iext_remove(bma->ip, bma->idx + 1, 1, state); + xfs_iext_remove(ip, *idx + 1, 1, state); - XFS_IFORK_NEXT_SET(bma->ip, whichfork, - XFS_IFORK_NEXTENTS(bma->ip, whichfork) - 1); - if (bma->cur == NULL) { + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, right.br_startoff, + error = xfs_bmbt_lookup_eq(cur, right.br_startoff, right.br_startblock, right.br_blockcount, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_btree_delete(bma->cur, &i); + error = xfs_btree_delete(cur, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_btree_decrement(bma->cur, 0, &i); + error = xfs_btree_decrement(cur, 0, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, left.br_startoff, + error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount + @@ -3008,23 +2970,23 @@ xfs_bmap_add_extent_hole_real( * on the left. * Merge the new allocation with the left neighbor. */ - --bma->idx; - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, bma->idx), + --*idx; + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_blockcount(xfs_iext_get_ext(ifp, *idx), left.br_blockcount + new->br_blockcount); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (bma->cur == NULL) { + if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, left.br_startoff, + error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, left.br_blockcount, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, left.br_startoff, + error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, left.br_blockcount + new->br_blockcount, @@ -3040,25 +3002,25 @@ xfs_bmap_add_extent_hole_real( * on the right. * Merge the new allocation with the right neighbor. 
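		 * The right record is rewritten in place so that it starts at
		 * the new extent's offset and covers both ranges.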
*/ - trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_); - xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, bma->idx), + trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_); + xfs_bmbt_set_allf(xfs_iext_get_ext(ifp, *idx), new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, right.br_state); - trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_); + trace_xfs_bmap_post_update(ip, *idx, state, _THIS_IP_); - if (bma->cur == NULL) { + if (cur == NULL) { rval = xfs_ilog_fext(whichfork); } else { rval = 0; - error = xfs_bmbt_lookup_eq(bma->cur, + error = xfs_bmbt_lookup_eq(cur, right.br_startoff, right.br_startblock, right.br_blockcount, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); - error = xfs_bmbt_update(bma->cur, new->br_startoff, + error = xfs_bmbt_update(cur, new->br_startoff, new->br_startblock, new->br_blockcount + right.br_blockcount, @@ -3074,22 +3036,22 @@ xfs_bmap_add_extent_hole_real( * real allocation. * Insert a new entry. */ - xfs_iext_insert(bma->ip, bma->idx, 1, new, state); - XFS_IFORK_NEXT_SET(bma->ip, whichfork, - XFS_IFORK_NEXTENTS(bma->ip, whichfork) + 1); - if (bma->cur == NULL) { + xfs_iext_insert(ip, *idx, 1, new, state); + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) + 1); + if (cur == NULL) { rval = XFS_ILOG_CORE | xfs_ilog_fext(whichfork); } else { rval = XFS_ILOG_CORE; - error = xfs_bmbt_lookup_eq(bma->cur, + error = xfs_bmbt_lookup_eq(cur, new->br_startoff, new->br_startblock, new->br_blockcount, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 0, done); - bma->cur->bc_rec.b.br_state = new->br_state; - error = xfs_btree_insert(bma->cur, &i); + cur->bc_rec.b.br_state = new->br_state; + error = xfs_btree_insert(cur, &i); if (error) goto done; XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); @@ -3098,30 +3060,30 @@ xfs_bmap_add_extent_hole_real( } /* add reverse mapping */ - error = xfs_rmap_map_extent(mp, bma->dfops, bma->ip, whichfork, new); + error = xfs_rmap_map_extent(mp, dfops, ip, whichfork, new); if (error) goto done; /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(bma->ip, whichfork)) { + if (xfs_bmap_needs_btree(ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ - ASSERT(bma->cur == NULL); - error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, - bma->firstblock, bma->dfops, &bma->cur, + ASSERT(cur == NULL); + error = xfs_bmap_extents_to_btree(tp, ip, first, dfops, curp, 0, &tmp_logflags, whichfork); - bma->logflags |= tmp_logflags; + *logflagsp |= tmp_logflags; + cur = *curp; if (error) goto done; } /* clear out the allocated field, done with it now in any case. */ - if (bma->cur) - bma->cur->bc_private.b.allocated = 0; + if (cur) + cur->bc_private.b.allocated = 0; - xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); + xfs_bmap_check_leaf_extents(cur, ip, whichfork); done: - bma->logflags |= rval; + *logflagsp |= rval; return error; } @@ -3852,60 +3814,6 @@ xfs_bmap_btalloc( return 0; } -/* - * For a remap operation, just "allocate" an extent at the address that the - * caller passed in, and ensure that the AGFL is the right size. The caller - * will then map the "allocated" extent into the file somewhere. 
- */ -STATIC int -xfs_bmap_remap_alloc( - struct xfs_bmalloca *ap) -{ - struct xfs_trans *tp = ap->tp; - struct xfs_mount *mp = tp->t_mountp; - xfs_agblock_t bno; - struct xfs_alloc_arg args; - int error; - - /* - * validate that the block number is legal - the enables us to detect - * and handle a silent filesystem corruption rather than crashing. - */ - memset(&args, 0, sizeof(struct xfs_alloc_arg)); - args.tp = ap->tp; - args.mp = ap->tp->t_mountp; - bno = *ap->firstblock; - args.agno = XFS_FSB_TO_AGNO(mp, bno); - args.agbno = XFS_FSB_TO_AGBNO(mp, bno); - if (args.agno >= mp->m_sb.sb_agcount || - args.agbno >= mp->m_sb.sb_agblocks) - return -EFSCORRUPTED; - - /* "Allocate" the extent from the range we passed in. */ - trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length); - ap->blkno = bno; - ap->ip->i_d.di_nblocks += ap->length; - xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE); - - /* Fix the freelist, like a real allocator does. */ - args.datatype = ap->datatype; - args.pag = xfs_perag_get(args.mp, args.agno); - ASSERT(args.pag); - - /* - * The freelist fixing code will decline the allocation if - * the size and shape of the free space doesn't allow for - * allocating the extent and updating all the metadata that - * happens during an allocation. We're remapping, not - * allocating, so skip that check by pretending to be freeing. - */ - error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); - xfs_perag_put(args.pag); - if (error) - trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_); - return error; -} - /* * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file. * It figures out where to ask the underlying allocator to put the new extent. @@ -3914,8 +3822,6 @@ STATIC int xfs_bmap_alloc( struct xfs_bmalloca *ap) /* bmap alloc argument struct */ { - if (ap->flags & XFS_BMAPI_REMAP) - return xfs_bmap_remap_alloc(ap); if (XFS_IS_REALTIME_INODE(ap->ip) && xfs_alloc_is_userdata(ap->datatype)) return xfs_bmap_rtalloc(ap); @@ -4386,7 +4292,9 @@ xfs_bmapi_allocate( if (bma->wasdel) error = xfs_bmap_add_extent_delay_real(bma, whichfork); else - error = xfs_bmap_add_extent_hole_real(bma, whichfork); + error = xfs_bmap_add_extent_hole_real(bma->tp, bma->ip, + whichfork, &bma->idx, &bma->cur, &bma->got, + bma->firstblock, bma->dfops, &bma->logflags); bma->logflags |= tmp_logflags; if (error) @@ -4549,9 +4457,7 @@ xfs_bmapi_write( ASSERT(len > 0); ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); - ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK); - ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP)); - ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP)); + ASSERT(!(flags & XFS_BMAPI_REMAP)); /* zeroing is for currently only for data extents, not metadata */ ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) != @@ -4635,13 +4541,8 @@ xfs_bmapi_write( } else { need_alloc = true; } - } else { - /* - * Make sure we only reflink into a hole. 
- */ - ASSERT(!(flags & XFS_BMAPI_REMAP)); - if (isnullstartblock(bma.got.br_startblock)) - wasdelay = true; + } else if (isnullstartblock(bma.got.br_startblock)) { + wasdelay = true; } /* @@ -4770,6 +4671,93 @@ xfs_bmapi_write( return error; } +static int +xfs_bmapi_remap( + struct xfs_trans *tp, + struct xfs_inode *ip, + xfs_fileoff_t bno, + xfs_filblks_t len, + xfs_fsblock_t startblock, + struct xfs_defer_ops *dfops) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); + struct xfs_btree_cur *cur = NULL; + xfs_fsblock_t firstblock = NULLFSBLOCK; + struct xfs_bmbt_irec got; + xfs_extnum_t idx; + int logflags = 0, error; + + ASSERT(len > 0); + ASSERT(len <= (xfs_filblks_t)MAXEXTLEN); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + + if (unlikely(XFS_TEST_ERROR( + (XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_FORMAT(ip, XFS_DATA_FORK) != XFS_DINODE_FMT_BTREE), + mp, XFS_ERRTAG_BMAPIFORMAT, XFS_RANDOM_BMAPIFORMAT))) { + XFS_ERROR_REPORT("xfs_bmapi_remap", XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } + + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + return error; + } + + if (xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) { + /* make sure we only reflink into a hole. */ + ASSERT(got.br_startoff > bno); + ASSERT(got.br_startoff - bno >= len); + } + + ip->i_d.di_nblocks += len; + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + + if (ifp->if_flags & XFS_IFBROOT) { + cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK); + cur->bc_private.b.firstblock = firstblock; + cur->bc_private.b.dfops = dfops; + cur->bc_private.b.flags = 0; + } + + got.br_startoff = bno; + got.br_startblock = startblock; + got.br_blockcount = len; + got.br_state = XFS_EXT_NORM; + + error = xfs_bmap_add_extent_hole_real(tp, ip, XFS_DATA_FORK, &idx, &cur, + &got, &firstblock, dfops, &logflags); + if (error) + goto error0; + + if (xfs_bmap_wants_extents(ip, XFS_DATA_FORK)) { + int tmp_logflags = 0; + + error = xfs_bmap_btree_to_extents(tp, ip, cur, + &tmp_logflags, XFS_DATA_FORK); + logflags |= tmp_logflags; + } + +error0: + if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS) + logflags &= ~XFS_ILOG_DEXT; + else if (ip->i_d.di_format != XFS_DINODE_FMT_BTREE) + logflags &= ~XFS_ILOG_DBROOT; + + if (logflags) + xfs_trans_log_inode(tp, ip, logflags); + if (cur) { + xfs_btree_del_cursor(cur, + error ? 
XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	}
+	return error;
+}
+
 /*
  * When a delalloc extent is split (e.g., due to a hole punch), the original
  * indlen reservation must be shared across the two new extents that are left
@@ -4887,7 +4875,7 @@ xfs_bmap_del_extent_delay(
 	ASSERT(got_endoff >= del_endoff);
 
 	if (isrt) {
-		int64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
+		uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount);
 
 		do_div(rtexts, mp->m_sb.sb_rextsize);
 		xfs_mod_frextents(mp, rtexts);
@@ -6488,27 +6476,15 @@ xfs_bmap_finish_one(
 	xfs_filblks_t			blockcount,
 	xfs_exntst_t			state)
 {
-	struct xfs_bmbt_irec		bmap;
-	int				nimaps = 1;
-	xfs_fsblock_t			firstfsb;
-	int				flags = XFS_BMAPI_REMAP;
-	int				done;
-	int				error = 0;
-
-	bmap.br_startblock = startblock;
-	bmap.br_startoff = startoff;
-	bmap.br_blockcount = blockcount;
-	bmap.br_state = state;
+	int				error = 0, done;
 
 	trace_xfs_bmap_deferred(tp->t_mountp,
 			XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
 			XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
 			ip->i_ino, whichfork, startoff, blockcount, state);
 
-	if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
+	if (WARN_ON_ONCE(whichfork != XFS_DATA_FORK))
 		return -EFSCORRUPTED;
-	if (whichfork == XFS_ATTR_FORK)
-		flags |= XFS_BMAPI_ATTRFORK;
 
 	if (XFS_TEST_ERROR(false, tp->t_mountp,
 			XFS_ERRTAG_BMAP_FINISH_ONE,
@@ -6517,16 +6493,12 @@ xfs_bmap_finish_one(
 
 	switch (type) {
 	case XFS_BMAP_MAP:
-		firstfsb = bmap.br_startblock;
-		error = xfs_bmapi_write(tp, ip, bmap.br_startoff,
-				bmap.br_blockcount, flags, &firstfsb,
-				bmap.br_blockcount, &bmap, &nimaps,
-				dfops);
+		error = xfs_bmapi_remap(tp, ip, startoff, blockcount,
+				startblock, dfops);
 		break;
 	case XFS_BMAP_UNMAP:
-		error = xfs_bunmapi(tp, ip, bmap.br_startoff,
-				bmap.br_blockcount, flags, 1, &firstfsb,
-				dfops, &done);
+		error = xfs_bunmapi(tp, ip, startoff, blockcount,
+				XFS_BMAPI_REMAP, 1, &startblock, dfops, &done);
 		ASSERT(done);
 		break;
 	default:
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index cdef87db5262bdc2f43b17f5a1e76fb45a107233..c35a14fa15272ab5ef38f8b47953ae120d72fb4e 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -171,6 +171,18 @@ static inline int xfs_bmapi_whichfork(int bmapi_flags)
 	{ BMAP_COWFORK,		"COW" }
 
+/*
+ * Return true if the extent is a real, allocated extent, or false if it is a
+ * delayed allocation, an unwritten extent or a hole.
+ */
+static inline bool xfs_bmap_is_real_extent(struct xfs_bmbt_irec *irec)
+{
+	return irec->br_state != XFS_EXT_UNWRITTEN &&
+		irec->br_startblock != HOLESTARTBLOCK &&
+		irec->br_startblock != DELAYSTARTBLOCK &&
+		!isnullstartblock(irec->br_startblock);
+}
+
 /*
  * This macro is used to determine how many extents will be shifted
 * in one write transaction.
We could require two splits, @@ -232,8 +244,6 @@ int xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_bmbt_irec *del); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, xfs_extnum_t *idx, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); -int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, - xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *next_fsb, xfs_fileoff_t offset_shift_fsb, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index fd55db47938562868d25d4998407ce7650c4da4f..6cba69aff077470752daa8457116ff57802575f2 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -366,32 +366,6 @@ xfs_bmbt_to_bmdr( memcpy(tpp, fpp, sizeof(*fpp) * dmxr); } -/* - * Check extent records, which have just been read, for - * any bit in the extent flag field. ASSERT on debug - * kernels, as this condition should not occur. - * Return an error condition (1) if any flags found, - * otherwise return 0. - */ - -int -xfs_check_nostate_extents( - xfs_ifork_t *ifp, - xfs_extnum_t idx, - xfs_extnum_t num) -{ - for (; num > 0; num--, idx++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); - if ((ep->l0 >> - (64 - BMBT_EXNTFLAG_BITLEN)) != 0) { - ASSERT(0); - return 1; - } - } - return 0; -} - - STATIC struct xfs_btree_cur * xfs_bmbt_dup_cursor( struct xfs_btree_cur *cur) @@ -448,7 +422,6 @@ xfs_bmbt_alloc_block( if (args.fsbno == NULLFSBLOCK) { args.fsbno = be64_to_cpu(start->l); args.type = XFS_ALLOCTYPE_START_BNO; -try_another_ag: /* * Make sure there is sufficient room left in the AG to * complete a full tree split for an extent insert. If @@ -477,22 +450,6 @@ xfs_bmbt_alloc_block( if (error) goto error0; - /* - * During a CoW operation, the allocation and bmbt updates occur in - * different transactions. The mapping code tries to put new bmbt - * blocks near extents being mapped, but the only way to guarantee this - * is if the alloc and the mapping happen in a single transaction that - * has a block reservation. That isn't the case here, so if we run out - * of space we'll try again with another AG. - */ - if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) && - args.fsbno == NULLFSBLOCK && - args.type == XFS_ALLOCTYPE_NEAR_BNO) { - args.fsbno = cur->bc_private.b.firstblock; - args.type = XFS_ALLOCTYPE_FIRST_AG; - goto try_another_ag; - } - if (args.fsbno == NULLFSBLOCK && args.minleft) { /* * Could not find an AG with enough free space to satisfy diff --git a/fs/xfs/libxfs/xfs_bmap_btree.h b/fs/xfs/libxfs/xfs_bmap_btree.h index 819a8a4dee952d4648ee6333a7c61ea7dff6e78f..9da5a8d4f184b6384638f4090b6441626a6a1ab0 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.h +++ b/fs/xfs/libxfs/xfs_bmap_btree.h @@ -24,14 +24,6 @@ struct xfs_mount; struct xfs_inode; struct xfs_trans; -/* - * Extent state and extent format macros. - */ -#define XFS_EXTFMT_INODE(x) \ - (xfs_sb_version_hasextflgbit(&((x)->i_mount->m_sb)) ? \ - XFS_EXTFMT_HASSTATE : XFS_EXTFMT_NOSTATE) -#define ISUNWRITTEN(x) ((x)->br_state == XFS_EXT_UNWRITTEN) - /* * Btree block header size depends on a superblock flag. */ @@ -140,4 +132,18 @@ extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip, extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, struct xfs_trans *, struct xfs_inode *, int); +/* + * Check that the extent does not contain an invalid unwritten extent flag. 
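+ * The unwritten flag is only ever valid on a data fork extent, and only
+ * when the superblock advertises the extent flag feature; extents in the
+ * attr fork (or on filesystems without the feature) must have it clear.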
+ */ +static inline bool xfs_bmbt_validate_extent(struct xfs_mount *mp, int whichfork, + struct xfs_bmbt_rec_host *ep) +{ + if (ep->l0 >> (64 - BMBT_EXNTFLAG_BITLEN) == 0) + return true; + if (whichfork == XFS_DATA_FORK && + xfs_sb_version_hasextflgbit(&mp->m_sb)) + return true; + return false; +} + #endif /* __XFS_BMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index c3decedc94557e14c3cb7fa7eee81d09e54f4c9e..5392674bf8930550d949f6c01d0f47ec03228ba8 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2886,7 +2886,7 @@ xfs_btree_split_worker( struct xfs_btree_split_args *args = container_of(work, struct xfs_btree_split_args, work); unsigned long pflags; - unsigned long new_pflags = PF_FSTRANS; + unsigned long new_pflags = PF_MEMALLOC_NOFS; /* * we are in a transaction context here, but may also be doing work @@ -4842,6 +4842,21 @@ xfs_btree_query_range( fn, priv); } +/* Query a btree for all records. */ +int +xfs_btree_query_all( + struct xfs_btree_cur *cur, + xfs_btree_query_range_fn fn, + void *priv) +{ + union xfs_btree_irec low_rec; + union xfs_btree_irec high_rec; + + memset(&low_rec, 0, sizeof(low_rec)); + memset(&high_rec, 0xFF, sizeof(high_rec)); + return xfs_btree_query_range(cur, &low_rec, &high_rec, fn, priv); +} + /* * Calculate the number of blocks needed to store a given number of records * in a short-format (per-AG metadata) btree. diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 4bb62580a7fd0029702b7d5092638af34b78c70b..27bed08261c530fac9508f9138f00b0f13a8cdd9 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -496,6 +496,8 @@ typedef int (*xfs_btree_query_range_fn)(struct xfs_btree_cur *cur, int xfs_btree_query_range(struct xfs_btree_cur *cur, union xfs_btree_irec *low_rec, union xfs_btree_irec *high_rec, xfs_btree_query_range_fn fn, void *priv); +int xfs_btree_query_all(struct xfs_btree_cur *cur, xfs_btree_query_range_fn fn, + void *priv); typedef int (*xfs_btree_visit_blocks_fn)(struct xfs_btree_cur *cur, int level, void *data); diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index ac9a003dd29acf80d7c766788cb40d1f98d9132c..747085b4ef4406d387464b40fdea302b83c25c4c 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -35,13 +35,8 @@ int xfs_calc_dquots_per_chunk( unsigned int nbblks) /* basic block units */ { - unsigned int ndquots; - ASSERT(nbblks > 0); - ndquots = BBTOB(nbblks); - do_div(ndquots, sizeof(xfs_dqblk_t)); - - return ndquots; + return BBTOB(nbblks) / sizeof(xfs_dqblk_t); } /* diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 6b7579e7b60a228ee6e633221bc64952f4a93a41..a1dccd8d96bcecaae119d7404f4c461022cab07d 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -930,10 +930,8 @@ static inline uint xfs_dinode_size(int version) /* * The 32 bit link count in the inode theoretically maxes out at UINT_MAX. * Since the pathconf interface is signed, we use 2^31 - 1 instead. - * The old inode format had a 16 bit link count, so its maximum is USHRT_MAX. */ #define XFS_MAXLINK ((1U << 31) - 1U) -#define XFS_MAXLINK_1 65535U /* * Values for di_format @@ -1577,20 +1575,11 @@ static inline xfs_filblks_t startblockval(xfs_fsblock_t x) return (xfs_filblks_t)((x) & ~STARTBLOCKMASK); } -/* - * Possible extent formats. - */ -typedef enum { - XFS_EXTFMT_NOSTATE = 0, - XFS_EXTFMT_HASSTATE -} xfs_exntfmt_t; - /* * Possible extent states. 
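 * XFS_EXT_NORM is an ordinary written extent, while XFS_EXT_UNWRITTEN marks
 * an allocated extent whose contents have not been written yet and so reads
 * back as zeroes.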
*/ typedef enum { XFS_EXT_NORM, XFS_EXT_UNWRITTEN, - XFS_EXT_DMAPI_OFFLINE, XFS_EXT_INVALID } xfs_exntst_t; /* diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b72dc821d78b97ba5f93a21122eab9946e2e947a..095bdf049a3fe2242aed690fceb2167d4b97eaad 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -92,6 +92,18 @@ struct getbmapx { #define BMV_OF_LAST 0x4 /* segment is the last in the file */ #define BMV_OF_SHARED 0x8 /* segment shared with another file */ +/* fmr_owner special values for FS_IOC_GETFSMAP */ +#define XFS_FMR_OWN_FREE FMR_OWN_FREE /* free space */ +#define XFS_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */ +#define XFS_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */ +#define XFS_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */ +#define XFS_FMR_OWN_AG FMR_OWNER('X', 3) /* per-AG metadata */ +#define XFS_FMR_OWN_INOBT FMR_OWNER('X', 4) /* inode btree blocks */ +#define XFS_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */ +#define XFS_FMR_OWN_REFC FMR_OWNER('X', 6) /* refcount tree */ +#define XFS_FMR_OWN_COW FMR_OWNER('X', 7) /* cow staging */ +#define XFS_FMR_OWN_DEFECTIVE FMR_OWNER('X', 8) /* bad blocks */ + /* * Structure for XFS_IOC_FSSETDM. * For use by backup and restore programs to set the XFS on-disk inode @@ -502,6 +514,7 @@ typedef struct xfs_swapext #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) #define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_fs_eofblocks) +/* XFS_IOC_GETFSMAP ------ hoisted 59 */ /* * ioctl commands that replace IRIX syssgi()'s diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index d93f9d918cfc11ada2ded50d21e0eb9226381f93..09c3d1aecef267cbb207e274f892542be22d1275 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -508,7 +508,7 @@ xfs_iread( /* even unallocated inodes are verified */ if (!xfs_dinode_verify(mp, ip->i_ino, dip)) { - xfs_alert(mp, "%s: validation failed for inode %lld failed", + xfs_alert(mp, "%s: validation failed for inode %lld", __func__, ip->i_ino); XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, dip); diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 8a37efe04de3235d72357914647e5f480cbbd512..0e80f34fe97c509cef3eef61ff7237c20a19bdd4 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -42,35 +42,6 @@ STATIC int xfs_iformat_local(xfs_inode_t *, xfs_dinode_t *, int, int); STATIC int xfs_iformat_extents(xfs_inode_t *, xfs_dinode_t *, int); STATIC int xfs_iformat_btree(xfs_inode_t *, xfs_dinode_t *, int); -#ifdef DEBUG -/* - * Make sure that the extents in the given memory buffer - * are valid. - */ -void -xfs_validate_extents( - xfs_ifork_t *ifp, - int nrecs, - xfs_exntfmt_t fmt) -{ - xfs_bmbt_irec_t irec; - xfs_bmbt_rec_host_t rec; - int i; - - for (i = 0; i < nrecs; i++) { - xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); - rec.l0 = get_unaligned(&ep->l0); - rec.l1 = get_unaligned(&ep->l1); - xfs_bmbt_get_all(&rec, &irec); - if (fmt == XFS_EXTFMT_NOSTATE) - ASSERT(irec.br_state == XFS_EXT_NORM); - } -} -#else /* DEBUG */ -#define xfs_validate_extents(ifp, nrecs, fmt) -#endif /* DEBUG */ - - /* * Move inode type and inode format specific information from the * on-disk inode to the in-core inode. For fifos, devs, and sockets @@ -352,40 +323,33 @@ xfs_iformat_local( } /* - * The file consists of a set of extents all - * of which fit into the on-disk inode. 
- * If there are few enough extents to fit into - * the if_inline_ext, then copy them there. - * Otherwise allocate a buffer for them and copy - * them into it. Either way, set if_extents - * to point at the extents. + * The file consists of a set of extents all of which fit into the on-disk + * inode. If there are few enough extents to fit into the if_inline_ext, then + * copy them there. Otherwise allocate a buffer for them and copy them into it. + * Either way, set if_extents to point at the extents. */ STATIC int xfs_iformat_extents( - xfs_inode_t *ip, - xfs_dinode_t *dip, - int whichfork) + struct xfs_inode *ip, + struct xfs_dinode *dip, + int whichfork) { - xfs_bmbt_rec_t *dp; - xfs_ifork_t *ifp; - int nex; - int size; - int i; - - ifp = XFS_IFORK_PTR(ip, whichfork); - nex = XFS_DFORK_NEXTENTS(dip, whichfork); - size = nex * (uint)sizeof(xfs_bmbt_rec_t); + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + int nex = XFS_DFORK_NEXTENTS(dip, whichfork); + int size = nex * sizeof(xfs_bmbt_rec_t); + struct xfs_bmbt_rec *dp; + int i; /* - * If the number of extents is unreasonable, then something - * is wrong and we just bail out rather than crash in - * kmem_alloc() or memcpy() below. + * If the number of extents is unreasonable, then something is wrong and + * we just bail out rather than crash in kmem_alloc() or memcpy() below. */ - if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, ip->i_mount, whichfork))) { + if (unlikely(size < 0 || size > XFS_DFORK_SIZE(dip, mp, whichfork))) { xfs_warn(ip->i_mount, "corrupt inode %Lu ((a)extents = %d).", (unsigned long long) ip->i_ino, nex); XFS_CORRUPTION_ERROR("xfs_iformat_extents(1)", XFS_ERRLEVEL_LOW, - ip->i_mount, dip); + mp, dip); return -EFSCORRUPTED; } @@ -400,22 +364,17 @@ xfs_iformat_extents( ifp->if_bytes = size; if (size) { dp = (xfs_bmbt_rec_t *) XFS_DFORK_PTR(dip, whichfork); - xfs_validate_extents(ifp, nex, XFS_EXTFMT_INODE(ip)); for (i = 0; i < nex; i++, dp++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); ep->l0 = get_unaligned_be64(&dp->l0); ep->l1 = get_unaligned_be64(&dp->l1); + if (!xfs_bmbt_validate_extent(mp, whichfork, ep)) { + XFS_ERROR_REPORT("xfs_iformat_extents(2)", + XFS_ERRLEVEL_LOW, mp); + return -EFSCORRUPTED; + } } XFS_BMAP_TRACE_EXLIST(ip, nex, whichfork); - if (whichfork != XFS_DATA_FORK || - XFS_EXTFMT_INODE(ip) == XFS_EXTFMT_NOSTATE) - if (unlikely(xfs_check_nostate_extents( - ifp, 0, nex))) { - XFS_ERROR_REPORT("xfs_iformat_extents(2)", - XFS_ERRLEVEL_LOW, - ip->i_mount); - return -EFSCORRUPTED; - } } ifp->if_flags |= XFS_IFEXTENTS; return 0; @@ -518,7 +477,6 @@ xfs_iread_extents( xfs_iext_destroy(ifp); return error; } - xfs_validate_extents(ifp, nextents, XFS_EXTFMT_INODE(ip)); ifp->if_flags |= XFS_IFEXTENTS; return 0; } @@ -837,6 +795,9 @@ xfs_iextents_copy( copied = 0; for (i = 0; i < nrecs; i++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, i); + + ASSERT(xfs_bmbt_validate_extent(ip->i_mount, whichfork, ep)); + start_block = xfs_bmbt_get_startblock(ep); if (isnullstartblock(start_block)) { /* @@ -852,7 +813,6 @@ xfs_iextents_copy( copied++; } ASSERT(copied != 0); - xfs_validate_extents(ifp, copied, XFS_EXTFMT_INODE(ip)); return (copied * (uint)sizeof(xfs_bmbt_rec_t)); } diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c index 3a8cc7139912bf83d0830090c3c88a91e79b6677..06cfb93c2ef95b2c9454272db24d9ae7b7f6816a 100644 --- a/fs/xfs/libxfs/xfs_rmap.c +++ b/fs/xfs/libxfs/xfs_rmap.c @@ -2001,14 +2001,14 @@ xfs_rmap_query_range_helper( /* Find 
all rmaps between two keys. */ int xfs_rmap_query_range( - struct xfs_btree_cur *cur, - struct xfs_rmap_irec *low_rec, - struct xfs_rmap_irec *high_rec, - xfs_rmap_query_range_fn fn, - void *priv) + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *low_rec, + struct xfs_rmap_irec *high_rec, + xfs_rmap_query_range_fn fn, + void *priv) { - union xfs_btree_irec low_brec; - union xfs_btree_irec high_brec; + union xfs_btree_irec low_brec; + union xfs_btree_irec high_brec; struct xfs_rmap_query_range_info query; low_brec.r = *low_rec; @@ -2019,6 +2019,20 @@ xfs_rmap_query_range( xfs_rmap_query_range_helper, &query); } +/* Find all rmaps. */ +int +xfs_rmap_query_all( + struct xfs_btree_cur *cur, + xfs_rmap_query_range_fn fn, + void *priv) +{ + struct xfs_rmap_query_range_info query; + + query.priv = priv; + query.fn = fn; + return xfs_btree_query_all(cur, xfs_rmap_query_range_helper, &query); +} + /* Clean up after calling xfs_rmap_finish_one. */ void xfs_rmap_finish_one_cleanup( @@ -2291,3 +2305,31 @@ xfs_rmap_free_extent( return __xfs_rmap_add(mp, dfops, XFS_RMAP_FREE, owner, XFS_DATA_FORK, &bmap); } + +/* Compare rmap records. Returns -1 if a < b, 1 if a > b, and 0 if equal. */ +int +xfs_rmap_compare( + const struct xfs_rmap_irec *a, + const struct xfs_rmap_irec *b) +{ + __u64 oa; + __u64 ob; + + oa = xfs_rmap_irec_offset_pack(a); + ob = xfs_rmap_irec_offset_pack(b); + + if (a->rm_startblock < b->rm_startblock) + return -1; + else if (a->rm_startblock > b->rm_startblock) + return 1; + else if (a->rm_owner < b->rm_owner) + return -1; + else if (a->rm_owner > b->rm_owner) + return 1; + else if (oa < ob) + return -1; + else if (oa > ob) + return 1; + else + return 0; +} diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h index 789930599339058ed8db437a6ff5be061095877c..98f908fea103508a55955c97fc42f9052cf3316b 100644 --- a/fs/xfs/libxfs/xfs_rmap.h +++ b/fs/xfs/libxfs/xfs_rmap.h @@ -162,6 +162,8 @@ typedef int (*xfs_rmap_query_range_fn)( int xfs_rmap_query_range(struct xfs_btree_cur *cur, struct xfs_rmap_irec *low_rec, struct xfs_rmap_irec *high_rec, xfs_rmap_query_range_fn fn, void *priv); +int xfs_rmap_query_all(struct xfs_btree_cur *cur, xfs_rmap_query_range_fn fn, + void *priv); enum xfs_rmap_intent_type { XFS_RMAP_MAP, @@ -212,5 +214,7 @@ int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno, int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno, uint64_t owner, uint64_t offset, unsigned int flags, struct xfs_rmap_irec *irec, int *stat); +int xfs_rmap_compare(const struct xfs_rmap_irec *a, + const struct xfs_rmap_irec *b); #endif /* __XFS_RMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index ea45584a9913bbd51908c9a477bb25c90d000fa9..e47b99e59f6035c0bb76a98aa76df998c659348c 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1016,3 +1016,73 @@ xfs_rtfree_extent( } return 0; } + +/* Find all the free records within a given range. */ +int +xfs_rtalloc_query_range( + struct xfs_trans *tp, + struct xfs_rtalloc_rec *low_rec, + struct xfs_rtalloc_rec *high_rec, + xfs_rtalloc_query_range_fn fn, + void *priv) +{ + struct xfs_rtalloc_rec rec; + struct xfs_mount *mp = tp->t_mountp; + xfs_rtblock_t rtstart; + xfs_rtblock_t rtend; + xfs_rtblock_t rem; + int is_free; + int error = 0; + + if (low_rec->ar_startblock > high_rec->ar_startblock) + return -EINVAL; + else if (low_rec->ar_startblock == high_rec->ar_startblock) + return 0; + + /* Iterate the bitmap, looking for discrepancies. 
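+	 * Each pass checks whether the first remaining block is free, finds
+	 * how far the current run of the bitmap extends, reports the run to
+	 * @fn if it is free, and then advances past it.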
*/ + rtstart = low_rec->ar_startblock; + rem = high_rec->ar_startblock - rtstart; + while (rem) { + /* Is the first block free? */ + error = xfs_rtcheck_range(mp, tp, rtstart, 1, 1, &rtend, + &is_free); + if (error) + break; + + /* How long does the extent go for? */ + error = xfs_rtfind_forw(mp, tp, rtstart, + high_rec->ar_startblock - 1, &rtend); + if (error) + break; + + if (is_free) { + rec.ar_startblock = rtstart; + rec.ar_blockcount = rtend - rtstart + 1; + + error = fn(tp, &rec, priv); + if (error) + break; + } + + rem -= rtend - rtstart + 1; + rtstart = rtend + 1; + } + + return error; +} + +/* Find all the free records. */ +int +xfs_rtalloc_query_all( + struct xfs_trans *tp, + xfs_rtalloc_query_range_fn fn, + void *priv) +{ + struct xfs_rtalloc_rec keys[2]; + + keys[0].ar_startblock = 0; + keys[1].ar_startblock = tp->t_mountp->m_sb.sb_rblocks; + keys[0].ar_blockcount = keys[1].ar_blockcount = 0; + + return xfs_rtalloc_query_range(tp, &keys[0], &keys[1], fn, priv); +} diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index 7917f6e44286a4591e9109138a7be07dc1a18eb1..d787c677d2a3201cc62d042fbee55415923934f6 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -21,8 +21,20 @@ /* * Components of space reservations. */ + +/* Worst case number of rmaps that can be held in a block. */ #define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) \ (((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0])) + +/* Adding one rmap could split every level up to the top of the tree. */ +#define XFS_RMAPADD_SPACE_RES(mp) ((mp)->m_rmap_maxlevels) + +/* Blocks we might need to add "b" rmaps to a tree. */ +#define XFS_NRMAPADD_SPACE_RES(mp, b)\ + (((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ + XFS_RMAPADD_SPACE_RES(mp)) + #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) \ (((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0])) #define XFS_EXTENTADD_SPACE_RES(mp,w) (XFS_BM_MAXLEVELS(mp,w) - 1) @@ -30,13 +42,12 @@ (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ XFS_EXTENTADD_SPACE_RES(mp,w)) + +/* Blocks we might need to add "b" mappings & rmappings to a file. */ #define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\ - (((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \ - XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \ - XFS_EXTENTADD_SPACE_RES(mp,w) + \ - ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \ - XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \ - (mp)->m_rmap_maxlevels) + (XFS_NEXTENTADD_SPACE_RES((mp), (b), (w)) + \ + XFS_NRMAPADD_SPACE_RES((mp), (b))) + #define XFS_DAENTER_1B(mp,w) \ ((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1) #define XFS_DAENTER_DBS(mp,w) \ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 61494295d92fe1acb7d343bc3a4e1594f09027ab..09af0f7cd55e278312881999755d3d8d0793d5c8 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -111,11 +111,11 @@ xfs_finish_page_writeback( bsize = bh->b_size; do { + if (off > end) + break; next = bh->b_this_page; if (off < bvec->bv_offset) goto next_bh; - if (off > end) - break; bh->b_end_io(bh, !error); next_bh: off += bsize; @@ -189,7 +189,7 @@ xfs_setfilesize_trans_alloc( * We hand off the transaction to the completion thread now, so * clear the flag here. */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return 0; } @@ -252,7 +252,7 @@ xfs_setfilesize_ioend( * thus we need to mark ourselves as being in a transaction manually. 
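	 * (PF_MEMALLOC_NOFS also keeps memory reclaim from recursing back
	 * into the filesystem while this thread runs the update.)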
* Similarly for freeze protection. */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ @@ -1016,7 +1016,7 @@ xfs_do_writepage( * Given that we do not allow direct reclaim to call us, we should * never be called while in a filesystem transaction. */ - if (WARN_ON_ONCE(current->flags & PF_FSTRANS)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) goto redirty; /* @@ -1261,8 +1261,8 @@ xfs_get_blocks( if (nimaps) { trace_xfs_get_blocks_found(ip, offset, size, - ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN - : XFS_IO_OVERWRITE, &imap); + imap.br_state == XFS_EXT_UNWRITTEN ? + XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); xfs_iunlock(ip, lockmode); } else { trace_xfs_get_blocks_notfound(ip, offset, size); @@ -1276,9 +1276,7 @@ xfs_get_blocks( * For unwritten extents do not report a disk address in the buffered * read case (treat as if we're reading into a hole). */ - if (imap.br_startblock != HOLESTARTBLOCK && - imap.br_startblock != DELAYSTARTBLOCK && - !ISUNWRITTEN(&imap)) + if (xfs_bmap_is_real_extent(&imap)) xfs_map_buffer(inode, bh_result, &imap, offset); /* diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 9bf57c76623b404ebf5e1e57dd1582ad52de7c92..d419d23fa2149ae3b022cdec3aba6879c987eec1 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -34,6 +34,8 @@ #include "xfs_bmap.h" #include "xfs_icache.h" #include "xfs_trace.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" kmem_zone_t *xfs_bui_zone; @@ -215,6 +217,7 @@ void xfs_bui_release( struct xfs_bui_log_item *buip) { + ASSERT(atomic_read(&buip->bui_refcount) > 0); if (atomic_dec_and_test(&buip->bui_refcount)) { xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR); xfs_bui_item_free(buip); @@ -446,7 +449,8 @@ xfs_bui_recover( return -EIO; } - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK), 0, 0, &tp); if (error) return error; budp = xfs_trans_get_bud(tp, buip); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 828532ce0adca80ddfa174540aae6aa06663302b..2b954308a1d671e9f09d06a5353713297f11be76 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -81,7 +81,7 @@ xfs_zero_extent( return blkdev_issue_zeroout(xfs_find_bdev_for_inode(VFS_I(ip)), block << (mp->m_super->s_blocksize_bits - 9), count_fsb << (mp->m_super->s_blocksize_bits - 9), - GFP_NOFS, true); + GFP_NOFS, 0); } int @@ -448,10 +448,9 @@ xfs_getbmap_adjust_shared( next_map->br_blockcount = 0; /* Only written data blocks can be shared. */ - if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK || - map->br_startblock == DELAYSTARTBLOCK || - map->br_startblock == HOLESTARTBLOCK || - ISUNWRITTEN(map)) + if (!xfs_is_reflink_inode(ip) || + whichfork != XFS_DATA_FORK || + !xfs_bmap_is_real_extent(map)) return 0; agno = XFS_FSB_TO_AGNO(mp, map->br_startblock); @@ -904,9 +903,9 @@ xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) } /* - * This is called by xfs_inactive to free any blocks beyond eof - * when the link count isn't zero and by xfs_dm_punch_hole() when - * punching a hole to EOF. + * This is called to free any blocks beyond eof. The caller must hold + * IOLOCK_EXCL unless we are in the inode reclaim path and have the only + * reference to the inode. 
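+ * Either condition is enough to keep other writers from racing with the
+ * removal of the post-EOF blocks.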
 */ int xfs_free_eofblocks( @@ -921,8 +920,6 @@ xfs_free_eofblocks( struct xfs_bmbt_irec imap; struct xfs_mount *mp = ip->i_mount; - ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); - /* * Figure out if there are any blocks beyond the end * of the file. If not, then there is nothing to do. @@ -1209,11 +1206,8 @@ xfs_adjust_extent_unmap_boundaries( return error; if (nimap && imap.br_startblock != HOLESTARTBLOCK) { - xfs_daddr_t block; - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - block = imap.br_startblock; - mod = do_div(block, mp->m_sb.sb_rextsize); + mod = do_mod(imap.br_startblock, mp->m_sb.sb_rextsize); if (mod) *startoffset_fsb += mp->m_sb.sb_rextsize - mod; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b6208728ba39767bbb2898880fbb515abcc5ab4f..62fa39276a24bd91c26e3aa18c9f162f19842b95 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -443,17 +443,17 @@ _xfs_buf_map_pages( bp->b_addr = NULL; } else { int retried = 0; - unsigned noio_flag; + unsigned nofs_flag; /* * vm_map_ram() will allocate auxiliary structures (e.g. * pagetables) with GFP_KERNEL, yet we are likely to be under * GFP_NOFS context here. Hence we need to tell memory reclaim - * that we are in such a context via PF_MEMALLOC_NOIO to prevent + * that we are in such a context via PF_MEMALLOC_NOFS to prevent * memory reclaim re-entering the filesystem here and * potentially deadlocking. */ - noio_flag = memalloc_noio_save(); + nofs_flag = memalloc_nofs_save(); do { bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, -1, PAGE_KERNEL); @@ -461,7 +461,7 @@ _xfs_buf_map_pages( break; vm_unmap_aliases(); } while (retried++ <= 1); - memalloc_noio_restore(noio_flag); + memalloc_nofs_restore(nofs_flag); if (!bp->b_addr) return -ENOMEM; @@ -1079,6 +1079,8 @@ void xfs_buf_unlock( struct xfs_buf *bp) { + ASSERT(xfs_buf_islocked(bp)); + XB_CLEAR_OWNER(bp); up(&bp->b_sema); @@ -1814,6 +1816,28 @@ xfs_alloc_buftarg( return NULL; } +/* + * Cancel a delayed write list. + * + * Remove each buffer from the list, clear the delwri queue flag and drop the + * associated buffer reference. + */ +void +xfs_buf_delwri_cancel( + struct list_head *list) +{ + struct xfs_buf *bp; + + while (!list_empty(list)) { + bp = list_first_entry(list, struct xfs_buf, b_list); + + xfs_buf_lock(bp); + bp->b_flags &= ~_XBF_DELWRI_Q; + list_del_init(&bp->b_list); + xfs_buf_relse(bp); + } +} + /* * Add a buffer to the delayed write list.
* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 3c867e5a63e1d591bd51cc9ba34ef185bbd9e0e7..8d1d44f87ce98834ad67cee6840d96d07d5666c0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -291,7 +291,6 @@ xfs_buf_readahead( return xfs_buf_readahead_map(target, &map, 1, ops); } -struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); void xfs_buf_set_empty(struct xfs_buf *bp, size_t numblks); int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); @@ -330,6 +329,7 @@ extern void *xfs_buf_offset(struct xfs_buf *, size_t); extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ +extern void xfs_buf_delwri_cancel(struct list_head *); extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); extern int xfs_buf_delwri_submit(struct list_head *); extern int xfs_buf_delwri_submit_nowait(struct list_head *); diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index ad9396e516f6e389b88bca5dc2dc41d3372ed714..20b7a5c6eb2f68e93e1ad11be1d086a0d1d6aa33 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -394,6 +394,7 @@ xfs_dir2_leaf_readbuf( /* * Do we need more readahead? + * Each loop tries to process 1 full dir blk; last may be partial. */ blk_start_plug(&plug); for (mip->ra_index = mip->ra_offset = i = 0; @@ -404,7 +405,8 @@ xfs_dir2_leaf_readbuf( * Read-ahead a contiguous directory block. */ if (i > mip->ra_current && - map[mip->ra_index].br_blockcount >= geo->fsbcount) { + (map[mip->ra_index].br_blockcount - mip->ra_offset) >= + geo->fsbcount) { xfs_dir3_data_readahead(dp, map[mip->ra_index].br_startoff + mip->ra_offset, XFS_FSB_TO_DADDR(dp->i_mount, @@ -425,14 +427,19 @@ xfs_dir2_leaf_readbuf( } /* - * Advance offset through the mapping table. + * Advance offset through the mapping table, processing a full + * dir block even if it is fragmented into several extents. + * But stop if we have consumed all valid mappings, even if + * it's not yet a full directory block. */ - for (j = 0; j < geo->fsbcount; j += length ) { + for (j = 0; + j < geo->fsbcount && mip->ra_index < mip->map_valid; + j += length ) { /* * The rest of this extent but not more than a dir * block. 
*/ - length = min_t(int, geo->fsbcount, + length = min_t(int, geo->fsbcount - j, map[mip->ra_index].br_blockcount - mip->ra_offset); mip->ra_offset += length; diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index d796ffac729690ab5273a6007380e76170602cef..6a05d278da64f666e525ca96dfae306face43565 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -132,6 +132,11 @@ xfs_trim_extents( error = xfs_btree_decrement(cur, 0, &i); if (error) goto out_del_cursor; + + if (fatal_signal_pending(current)) { + error = -ERESTARTSYS; + goto out_del_cursor; + } } out_del_cursor: @@ -196,8 +201,11 @@ xfs_ioc_trim( for (agno = start_agno; agno <= end_agno; agno++) { error = xfs_trim_extents(mp, agno, start, end, minlen, &blocks_trimmed); - if (error) + if (error) { last_error = error; + if (error == -ERESTARTSYS) + break; + } } if (last_error) diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index d7bc14906af87f14ef570a968ead42100445c8bd..44f8c54512102577dbe768e5f137298ba1ee79c7 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -290,6 +290,7 @@ void xfs_efi_release( struct xfs_efi_log_item *efip) { + ASSERT(atomic_read(&efip->efi_refcount) > 0); if (atomic_dec_and_test(&efip->efi_refcount)) { xfs_trans_ail_remove(&efip->efi_item, SHUTDOWN_LOG_IO_ERROR); xfs_efi_item_free(efip); diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c new file mode 100644 index 0000000000000000000000000000000000000000..3683819887a5658eff23d6e3258d21567bbdc0b4 --- /dev/null +++ b/fs/xfs/xfs_fsmap.c @@ -0,0 +1,940 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_sb.h" +#include "xfs_mount.h" +#include "xfs_defer.h" +#include "xfs_inode.h" +#include "xfs_trans.h" +#include "xfs_error.h" +#include "xfs_btree.h" +#include "xfs_rmap_btree.h" +#include "xfs_trace.h" +#include "xfs_log.h" +#include "xfs_rmap.h" +#include "xfs_alloc.h" +#include "xfs_bit.h" +#include +#include "xfs_fsmap.h" +#include "xfs_refcount.h" +#include "xfs_refcount_btree.h" +#include "xfs_alloc_btree.h" +#include "xfs_rtalloc.h" + +/* Convert an xfs_fsmap to an fsmap. */ +void +xfs_fsmap_from_internal( + struct fsmap *dest, + struct xfs_fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = BBTOB(src->fmr_physical); + dest->fmr_owner = src->fmr_owner; + dest->fmr_offset = BBTOB(src->fmr_offset); + dest->fmr_length = BBTOB(src->fmr_length); + dest->fmr_reserved[0] = 0; + dest->fmr_reserved[1] = 0; + dest->fmr_reserved[2] = 0; +} + +/* Convert an fsmap to an xfs_fsmap. 
*/ +void +xfs_fsmap_to_internal( + struct xfs_fsmap *dest, + struct fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = BTOBBT(src->fmr_physical); + dest->fmr_owner = src->fmr_owner; + dest->fmr_offset = BTOBBT(src->fmr_offset); + dest->fmr_length = BTOBBT(src->fmr_length); +} + +/* Convert an fsmap owner into an rmapbt owner. */ +static int +xfs_fsmap_owner_to_rmap( + struct xfs_rmap_irec *dest, + struct xfs_fsmap *src) +{ + if (!(src->fmr_flags & FMR_OF_SPECIAL_OWNER)) { + dest->rm_owner = src->fmr_owner; + return 0; + } + + switch (src->fmr_owner) { + case 0: /* "lowest owner id possible" */ + case -1ULL: /* "highest owner id possible" */ + dest->rm_owner = 0; + break; + case XFS_FMR_OWN_FREE: + dest->rm_owner = XFS_RMAP_OWN_NULL; + break; + case XFS_FMR_OWN_UNKNOWN: + dest->rm_owner = XFS_RMAP_OWN_UNKNOWN; + break; + case XFS_FMR_OWN_FS: + dest->rm_owner = XFS_RMAP_OWN_FS; + break; + case XFS_FMR_OWN_LOG: + dest->rm_owner = XFS_RMAP_OWN_LOG; + break; + case XFS_FMR_OWN_AG: + dest->rm_owner = XFS_RMAP_OWN_AG; + break; + case XFS_FMR_OWN_INOBT: + dest->rm_owner = XFS_RMAP_OWN_INOBT; + break; + case XFS_FMR_OWN_INODES: + dest->rm_owner = XFS_RMAP_OWN_INODES; + break; + case XFS_FMR_OWN_REFC: + dest->rm_owner = XFS_RMAP_OWN_REFC; + break; + case XFS_FMR_OWN_COW: + dest->rm_owner = XFS_RMAP_OWN_COW; + break; + case XFS_FMR_OWN_DEFECTIVE: /* not implemented */ + /* fall through */ + default: + return -EINVAL; + } + return 0; +} + +/* Convert an rmapbt owner into an fsmap owner. */ +static int +xfs_fsmap_owner_from_rmap( + struct xfs_fsmap *dest, + struct xfs_rmap_irec *src) +{ + dest->fmr_flags = 0; + if (!XFS_RMAP_NON_INODE_OWNER(src->rm_owner)) { + dest->fmr_owner = src->rm_owner; + return 0; + } + dest->fmr_flags |= FMR_OF_SPECIAL_OWNER; + + switch (src->rm_owner) { + case XFS_RMAP_OWN_FS: + dest->fmr_owner = XFS_FMR_OWN_FS; + break; + case XFS_RMAP_OWN_LOG: + dest->fmr_owner = XFS_FMR_OWN_LOG; + break; + case XFS_RMAP_OWN_AG: + dest->fmr_owner = XFS_FMR_OWN_AG; + break; + case XFS_RMAP_OWN_INOBT: + dest->fmr_owner = XFS_FMR_OWN_INOBT; + break; + case XFS_RMAP_OWN_INODES: + dest->fmr_owner = XFS_FMR_OWN_INODES; + break; + case XFS_RMAP_OWN_REFC: + dest->fmr_owner = XFS_FMR_OWN_REFC; + break; + case XFS_RMAP_OWN_COW: + dest->fmr_owner = XFS_FMR_OWN_COW; + break; + case XFS_RMAP_OWN_NULL: /* "free" */ + dest->fmr_owner = XFS_FMR_OWN_FREE; + break; + default: + return -EFSCORRUPTED; + } + return 0; +} + +/* getfsmap query state */ +struct xfs_getfsmap_info { + struct xfs_fsmap_head *head; + xfs_fsmap_format_t formatter; /* formatting fn */ + void *format_arg; /* format buffer */ + struct xfs_buf *agf_bp; /* AGF, for refcount queries */ + xfs_daddr_t next_daddr; /* next daddr we expect */ + u64 missing_owner; /* owner of holes */ + u32 dev; /* device id */ + xfs_agnumber_t agno; /* AG number, if applicable */ + struct xfs_rmap_irec low; /* low rmap key */ + struct xfs_rmap_irec high; /* high rmap key */ + bool last; /* last extent? */ +}; + +/* Associate a device with a getfsmap handler. */ +struct xfs_getfsmap_dev { + u32 dev; + int (*fn)(struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info); +}; + +/* Compare two getfsmap device handlers. */ +static int +xfs_getfsmap_dev_compare( + const void *p1, + const void *p2) +{ + const struct xfs_getfsmap_dev *d1 = p1; + const struct xfs_getfsmap_dev *d2 = p2; + + return d1->dev - d2->dev; +} + +/* Decide if this mapping is shared. 
*/ +STATIC int +xfs_getfsmap_is_shared( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_rmap_irec *rec, + bool *stat) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *cur; + xfs_agblock_t fbno; + xfs_extlen_t flen; + int error; + + *stat = false; + if (!xfs_sb_version_hasreflink(&mp->m_sb)) + return 0; + /* rt files will have agno set to NULLAGNUMBER */ + if (info->agno == NULLAGNUMBER) + return 0; + + /* Are there any shared blocks here? */ + flen = 0; + cur = xfs_refcountbt_init_cursor(mp, tp, info->agf_bp, + info->agno, NULL); + + error = xfs_refcount_find_shared(cur, rec->rm_startblock, + rec->rm_blockcount, &fbno, &flen, false); + + xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR); + if (error) + return error; + + *stat = flen > 0; + return 0; +} + +/* + * Format a reverse mapping for getfsmap, having translated rm_startblock + * into the appropriate daddr units. + */ +STATIC int +xfs_getfsmap_helper( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_rmap_irec *rec, + xfs_daddr_t rec_daddr) +{ + struct xfs_fsmap fmr; + struct xfs_mount *mp = tp->t_mountp; + bool shared; + int error; + + if (fatal_signal_pending(current)) + return -EINTR; + + /* + * Filter out records that start before our startpoint, if the + * caller requested that. + */ + if (xfs_rmap_compare(rec, &info->low) < 0) { + rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (info->next_daddr < rec_daddr) + info->next_daddr = rec_daddr; + return XFS_BTREE_QUERY_RANGE_CONTINUE; + } + + /* Are we just counting mappings? */ + if (info->head->fmh_count == 0) { + if (rec_daddr > info->next_daddr) + info->head->fmh_entries++; + + if (info->last) + return XFS_BTREE_QUERY_RANGE_CONTINUE; + + info->head->fmh_entries++; + + rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (info->next_daddr < rec_daddr) + info->next_daddr = rec_daddr; + return XFS_BTREE_QUERY_RANGE_CONTINUE; + } + + /* + * If the record starts past the last physical block we saw, + * then we've found a gap. Report the gap as being owned by + * whatever the caller specified is the missing owner. 
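+ * For example, if the previous mapping ended at daddr 100 and this
+ * record starts at daddr 120, the 20-sector hole is reported first,
+ * owned by info->missing_owner.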
+ */ + if (rec_daddr > info->next_daddr) { + if (info->head->fmh_entries >= info->head->fmh_count) + return XFS_BTREE_QUERY_RANGE_ABORT; + + fmr.fmr_device = info->dev; + fmr.fmr_physical = info->next_daddr; + fmr.fmr_owner = info->missing_owner; + fmr.fmr_offset = 0; + fmr.fmr_length = rec_daddr - info->next_daddr; + fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; + error = info->formatter(&fmr, info->format_arg); + if (error) + return error; + info->head->fmh_entries++; + } + + if (info->last) + goto out; + + /* Fill out the extent we found */ + if (info->head->fmh_entries >= info->head->fmh_count) + return XFS_BTREE_QUERY_RANGE_ABORT; + + trace_xfs_fsmap_mapping(mp, info->dev, info->agno, rec); + + fmr.fmr_device = info->dev; + fmr.fmr_physical = rec_daddr; + error = xfs_fsmap_owner_from_rmap(&fmr, rec); + if (error) + return error; + fmr.fmr_offset = XFS_FSB_TO_BB(mp, rec->rm_offset); + fmr.fmr_length = XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (rec->rm_flags & XFS_RMAP_UNWRITTEN) + fmr.fmr_flags |= FMR_OF_PREALLOC; + if (rec->rm_flags & XFS_RMAP_ATTR_FORK) + fmr.fmr_flags |= FMR_OF_ATTR_FORK; + if (rec->rm_flags & XFS_RMAP_BMBT_BLOCK) + fmr.fmr_flags |= FMR_OF_EXTENT_MAP; + if (fmr.fmr_flags == 0) { + error = xfs_getfsmap_is_shared(tp, info, rec, &shared); + if (error) + return error; + if (shared) + fmr.fmr_flags |= FMR_OF_SHARED; + } + error = info->formatter(&fmr, info->format_arg); + if (error) + return error; + info->head->fmh_entries++; + +out: + rec_daddr += XFS_FSB_TO_BB(mp, rec->rm_blockcount); + if (info->next_daddr < rec_daddr) + info->next_daddr = rec_daddr; + return XFS_BTREE_QUERY_RANGE_CONTINUE; +} + +/* Transform a rmapbt irec into a fsmap */ +STATIC int +xfs_getfsmap_datadev_helper( + struct xfs_btree_cur *cur, + struct xfs_rmap_irec *rec, + void *priv) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_getfsmap_info *info = priv; + xfs_fsblock_t fsb; + xfs_daddr_t rec_daddr; + + fsb = XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, rec->rm_startblock); + rec_daddr = XFS_FSB_TO_DADDR(mp, fsb); + + return xfs_getfsmap_helper(cur->bc_tp, info, rec, rec_daddr); +} + +/* Transform a rtbitmap "record" into a fsmap */ +STATIC int +xfs_getfsmap_rtdev_rtbitmap_helper( + struct xfs_trans *tp, + struct xfs_rtalloc_rec *rec, + void *priv) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_getfsmap_info *info = priv; + struct xfs_rmap_irec irec; + xfs_daddr_t rec_daddr; + + rec_daddr = XFS_FSB_TO_BB(mp, rec->ar_startblock); + + irec.rm_startblock = rec->ar_startblock; + irec.rm_blockcount = rec->ar_blockcount; + irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ + irec.rm_offset = 0; + irec.rm_flags = 0; + + return xfs_getfsmap_helper(tp, info, &irec, rec_daddr); +} + +/* Transform a bnobt irec into a fsmap */ +STATIC int +xfs_getfsmap_datadev_bnobt_helper( + struct xfs_btree_cur *cur, + struct xfs_alloc_rec_incore *rec, + void *priv) +{ + struct xfs_mount *mp = cur->bc_mp; + struct xfs_getfsmap_info *info = priv; + struct xfs_rmap_irec irec; + xfs_daddr_t rec_daddr; + + rec_daddr = XFS_AGB_TO_DADDR(mp, cur->bc_private.a.agno, + rec->ar_startblock); + + irec.rm_startblock = rec->ar_startblock; + irec.rm_blockcount = rec->ar_blockcount; + irec.rm_owner = XFS_RMAP_OWN_NULL; /* "free" */ + irec.rm_offset = 0; + irec.rm_flags = 0; + + return xfs_getfsmap_helper(cur->bc_tp, info, &irec, rec_daddr); +} + +/* Set rmap flags based on the getfsmap flags */ +static void +xfs_getfsmap_set_irec_flags( + struct xfs_rmap_irec *irec, + struct xfs_fsmap *fmr) +{ + irec->rm_flags = 0; + if (fmr->fmr_flags 
& FMR_OF_ATTR_FORK) + irec->rm_flags |= XFS_RMAP_ATTR_FORK; + if (fmr->fmr_flags & FMR_OF_EXTENT_MAP) + irec->rm_flags |= XFS_RMAP_BMBT_BLOCK; + if (fmr->fmr_flags & FMR_OF_PREALLOC) + irec->rm_flags |= XFS_RMAP_UNWRITTEN; +} + +/* Execute a getfsmap query against the log device. */ +STATIC int +xfs_getfsmap_logdev( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rmap_irec rmap; + int error; + + /* Set up search keys */ + info->low.rm_startblock = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); + info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->low, keys); + if (error) + return error; + info->low.rm_blockcount = 0; + xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + + error = xfs_fsmap_owner_to_rmap(&info->high, keys + 1); + if (error) + return error; + info->high.rm_startblock = -1U; + info->high.rm_owner = ULLONG_MAX; + info->high.rm_offset = ULLONG_MAX; + info->high.rm_blockcount = 0; + info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; + info->missing_owner = XFS_FMR_OWN_FREE; + + trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high); + + if (keys[0].fmr_physical > 0) + return 0; + + /* Fabricate an rmap entry for the external log device. */ + rmap.rm_startblock = 0; + rmap.rm_blockcount = mp->m_sb.sb_logblocks; + rmap.rm_owner = XFS_RMAP_OWN_LOG; + rmap.rm_offset = 0; + rmap.rm_flags = 0; + + return xfs_getfsmap_helper(tp, info, &rmap, 0); +} + +/* Execute a getfsmap query against the realtime device. */ +STATIC int +__xfs_getfsmap_rtdev( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + int (*query_fn)(struct xfs_trans *, + struct xfs_getfsmap_info *), + struct xfs_getfsmap_info *info) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_fsblock_t start_fsb; + xfs_fsblock_t end_fsb; + xfs_daddr_t eofs; + int error = 0; + + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); + if (keys[0].fmr_physical >= eofs) + return 0; + if (keys[1].fmr_physical >= eofs) + keys[1].fmr_physical = eofs - 1; + start_fsb = XFS_BB_TO_FSBT(mp, keys[0].fmr_physical); + end_fsb = XFS_BB_TO_FSB(mp, keys[1].fmr_physical); + + /* Set up search keys */ + info->low.rm_startblock = start_fsb; + error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); + if (error) + return error; + info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); + info->low.rm_blockcount = 0; + xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + + info->high.rm_startblock = end_fsb; + error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); + if (error) + return error; + info->high.rm_offset = XFS_BB_TO_FSBT(mp, keys[1].fmr_offset); + info->high.rm_blockcount = 0; + xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); + + trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, info->agno, &info->high); + + return query_fn(tp, info); +} + +/* Actually query the realtime bitmap. 
*/ +STATIC int +xfs_getfsmap_rtdev_rtbitmap_query( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info) +{ + struct xfs_rtalloc_rec alow; + struct xfs_rtalloc_rec ahigh; + int error; + + xfs_ilock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); + + alow.ar_startblock = info->low.rm_startblock; + ahigh.ar_startblock = info->high.rm_startblock; + error = xfs_rtalloc_query_range(tp, &alow, &ahigh, + xfs_getfsmap_rtdev_rtbitmap_helper, info); + if (error) + goto err; + + /* Report any gaps at the end of the rtbitmap */ + info->last = true; + error = xfs_getfsmap_rtdev_rtbitmap_helper(tp, &ahigh, info); + if (error) + goto err; +err: + xfs_iunlock(tp->t_mountp->m_rbmip, XFS_ILOCK_SHARED); + return error; +} + +/* Execute a getfsmap query against the realtime device rtbitmap. */ +STATIC int +xfs_getfsmap_rtdev_rtbitmap( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + info->missing_owner = XFS_FMR_OWN_UNKNOWN; + return __xfs_getfsmap_rtdev(tp, keys, xfs_getfsmap_rtdev_rtbitmap_query, + info); +} + +/* Execute a getfsmap query against the regular data device. */ +STATIC int +__xfs_getfsmap_datadev( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info, + int (*query_fn)(struct xfs_trans *, + struct xfs_getfsmap_info *, + struct xfs_btree_cur **, + void *), + void *priv) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_btree_cur *bt_cur = NULL; + xfs_fsblock_t start_fsb; + xfs_fsblock_t end_fsb; + xfs_agnumber_t start_ag; + xfs_agnumber_t end_ag; + xfs_daddr_t eofs; + int error = 0; + + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); + if (keys[0].fmr_physical >= eofs) + return 0; + if (keys[1].fmr_physical >= eofs) + keys[1].fmr_physical = eofs - 1; + start_fsb = XFS_DADDR_TO_FSB(mp, keys[0].fmr_physical); + end_fsb = XFS_DADDR_TO_FSB(mp, keys[1].fmr_physical); + + /* + * Convert the fsmap low/high keys to AG based keys. Initialize + * low to the fsmap low key and max out the high key to the end + * of the AG. + */ + info->low.rm_startblock = XFS_FSB_TO_AGBNO(mp, start_fsb); + info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); + if (error) + return error; + info->low.rm_blockcount = 0; + xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + + info->high.rm_startblock = -1U; + info->high.rm_owner = ULLONG_MAX; + info->high.rm_offset = ULLONG_MAX; + info->high.rm_blockcount = 0; + info->high.rm_flags = XFS_RMAP_KEY_FLAGS | XFS_RMAP_REC_FLAGS; + + start_ag = XFS_FSB_TO_AGNO(mp, start_fsb); + end_ag = XFS_FSB_TO_AGNO(mp, end_fsb); + + /* Query each AG */ + for (info->agno = start_ag; info->agno <= end_ag; info->agno++) { + /* + * Set the AG high key from the fsmap high key if this + * is the last AG that we're querying. 
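+ * For all earlier AGs the high key stays maxed out, so the query
+ * sweeps each of those AGs in full.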
+ */ + if (info->agno == end_ag) { + info->high.rm_startblock = XFS_FSB_TO_AGBNO(mp, + end_fsb); + info->high.rm_offset = XFS_BB_TO_FSBT(mp, + keys[1].fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->high, &keys[1]); + if (error) + goto err; + xfs_getfsmap_set_irec_flags(&info->high, &keys[1]); + } + + if (bt_cur) { + xfs_btree_del_cursor(bt_cur, XFS_BTREE_NOERROR); + bt_cur = NULL; + xfs_trans_brelse(tp, info->agf_bp); + info->agf_bp = NULL; + } + + error = xfs_alloc_read_agf(mp, tp, info->agno, 0, + &info->agf_bp); + if (error) + goto err; + + trace_xfs_fsmap_low_key(mp, info->dev, info->agno, &info->low); + trace_xfs_fsmap_high_key(mp, info->dev, info->agno, + &info->high); + + error = query_fn(tp, info, &bt_cur, priv); + if (error) + goto err; + + /* + * Set the AG low key to the start of the AG prior to + * moving on to the next AG. + */ + if (info->agno == start_ag) { + info->low.rm_startblock = 0; + info->low.rm_owner = 0; + info->low.rm_offset = 0; + info->low.rm_flags = 0; + } + } + + /* Report any gap at the end of the AG */ + info->last = true; + error = query_fn(tp, info, &bt_cur, priv); + if (error) + goto err; + +err: + if (bt_cur) + xfs_btree_del_cursor(bt_cur, error < 0 ? XFS_BTREE_ERROR : + XFS_BTREE_NOERROR); + if (info->agf_bp) { + xfs_trans_brelse(tp, info->agf_bp); + info->agf_bp = NULL; + } + + return error; +} + +/* Actually query the rmap btree. */ +STATIC int +xfs_getfsmap_datadev_rmapbt_query( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_btree_cur **curpp, + void *priv) +{ + /* Report any gap at the end of the last AG. */ + if (info->last) + return xfs_getfsmap_datadev_helper(*curpp, &info->high, info); + + /* Allocate cursor for this AG and query_range it. */ + *curpp = xfs_rmapbt_init_cursor(tp->t_mountp, tp, info->agf_bp, + info->agno); + return xfs_rmap_query_range(*curpp, &info->low, &info->high, + xfs_getfsmap_datadev_helper, info); +} + +/* Execute a getfsmap query against the regular data device rmapbt. */ +STATIC int +xfs_getfsmap_datadev_rmapbt( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + info->missing_owner = XFS_FMR_OWN_FREE; + return __xfs_getfsmap_datadev(tp, keys, info, + xfs_getfsmap_datadev_rmapbt_query, NULL); +} + +/* Actually query the bno btree. */ +STATIC int +xfs_getfsmap_datadev_bnobt_query( + struct xfs_trans *tp, + struct xfs_getfsmap_info *info, + struct xfs_btree_cur **curpp, + void *priv) +{ + struct xfs_alloc_rec_incore *key = priv; + + /* Report any gap at the end of the last AG. */ + if (info->last) + return xfs_getfsmap_datadev_bnobt_helper(*curpp, &key[1], info); + + /* Allocate cursor for this AG and query_range it. */ + *curpp = xfs_allocbt_init_cursor(tp->t_mountp, tp, info->agf_bp, + info->agno, XFS_BTNUM_BNO); + key->ar_startblock = info->low.rm_startblock; + key[1].ar_startblock = info->high.rm_startblock; + return xfs_alloc_query_range(*curpp, key, &key[1], + xfs_getfsmap_datadev_bnobt_helper, info); +} + +/* Execute a getfsmap query against the regular data device's bnobt. */ +STATIC int +xfs_getfsmap_datadev_bnobt( + struct xfs_trans *tp, + struct xfs_fsmap *keys, + struct xfs_getfsmap_info *info) +{ + struct xfs_alloc_rec_incore akeys[2]; + + info->missing_owner = XFS_FMR_OWN_UNKNOWN; + return __xfs_getfsmap_datadev(tp, keys, info, + xfs_getfsmap_datadev_bnobt_query, &akeys[0]); +} + +/* Do we recognize the device? 
*/ +STATIC bool +xfs_getfsmap_is_valid_device( + struct xfs_mount *mp, + struct xfs_fsmap *fm) +{ + if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || + fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev)) + return true; + if (mp->m_logdev_targp && + fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev)) + return true; + if (mp->m_rtdev_targp && + fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev)) + return true; + return false; +} + +/* Ensure that the low key is less than the high key. */ +STATIC bool +xfs_getfsmap_check_keys( + struct xfs_fsmap *low_key, + struct xfs_fsmap *high_key) +{ + if (low_key->fmr_device > high_key->fmr_device) + return false; + if (low_key->fmr_device < high_key->fmr_device) + return true; + + if (low_key->fmr_physical > high_key->fmr_physical) + return false; + if (low_key->fmr_physical < high_key->fmr_physical) + return true; + + if (low_key->fmr_owner > high_key->fmr_owner) + return false; + if (low_key->fmr_owner < high_key->fmr_owner) + return true; + + if (low_key->fmr_offset > high_key->fmr_offset) + return false; + if (low_key->fmr_offset < high_key->fmr_offset) + return true; + + return false; +} + +#define XFS_GETFSMAP_DEVS 3 +/* + * Get filesystem's extents as described in head, and format for + * output. Calls formatter to fill the user's buffer until all + * extents are mapped, until the passed-in head->fmh_count slots have + * been filled, or until the formatter short-circuits the loop, if it + * is tracking filled-in extents on its own. + * + * Key to Confusion + * ---------------- + * There are multiple levels of keys and counters at work here: + * xfs_fsmap_head.fmh_keys -- low and high fsmap keys passed in; + * these reflect fs-wide sector addrs. + * dkeys -- fmh_keys used to query each device; + * these are fmh_keys but w/ the low key + * bumped up by fmr_length. + * xfs_getfsmap_info.next_daddr -- next disk addr we expect to see; this + * is how we detect gaps in the fsmap + records and report them. + * xfs_getfsmap_info.low/high -- per-AG low/high keys computed from + * dkeys; used to query the metadata. + */ +int +xfs_getfsmap( + struct xfs_mount *mp, + struct xfs_fsmap_head *head, + xfs_fsmap_format_t formatter, + void *arg) +{ + struct xfs_trans *tp = NULL; + struct xfs_fsmap dkeys[2]; /* per-dev keys */ + struct xfs_getfsmap_dev handlers[XFS_GETFSMAP_DEVS]; + struct xfs_getfsmap_info info = { NULL }; + int i; + int error = 0; + + if (head->fmh_iflags & ~FMH_IF_VALID) + return -EINVAL; + if (!xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[0]) || + !xfs_getfsmap_is_valid_device(mp, &head->fmh_keys[1])) + return -EINVAL; + + head->fmh_entries = 0; + + /* Set up our device handlers. */ + memset(handlers, 0, sizeof(handlers)); + handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + handlers[0].fn = xfs_getfsmap_datadev_rmapbt; + else + handlers[0].fn = xfs_getfsmap_datadev_bnobt; + if (mp->m_logdev_targp != mp->m_ddev_targp) { + handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); + handlers[1].fn = xfs_getfsmap_logdev; + } + if (mp->m_rtdev_targp) { + handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); + handlers[2].fn = xfs_getfsmap_rtdev_rtbitmap; + } + + xfs_sort(handlers, XFS_GETFSMAP_DEVS, sizeof(struct xfs_getfsmap_dev), + xfs_getfsmap_dev_compare); + + /* + * To continue where we left off, we allow userspace to use the + * last mapping from a previous call as the low key of the next. 
+ * This is identified by a non-zero length in the low key. We + * have to increment the low key in this scenario to ensure we + * don't return the same mapping again, and instead return the + * very next mapping. + * + * If the low key mapping refers to file data, the same physical + * blocks could be mapped to several other files/offsets. + * According to rmapbt record ordering, the minimal next + * possible record for the block range is the next starting + * offset in the same inode. Therefore, bump the file offset to + * continue the search appropriately. For all other low key + * mapping types (attr blocks, metadata), bump the physical + * offset as there can be no other mapping for the same physical + * block range. + */ + dkeys[0] = head->fmh_keys[0]; + if (dkeys[0].fmr_flags & (FMR_OF_SPECIAL_OWNER | FMR_OF_EXTENT_MAP)) { + dkeys[0].fmr_physical += dkeys[0].fmr_length; + dkeys[0].fmr_owner = 0; + if (dkeys[0].fmr_offset) + return -EINVAL; + } else + dkeys[0].fmr_offset += dkeys[0].fmr_length; + dkeys[0].fmr_length = 0; + memset(&dkeys[1], 0xFF, sizeof(struct xfs_fsmap)); + + if (!xfs_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) + return -EINVAL; + + info.next_daddr = head->fmh_keys[0].fmr_physical + + head->fmh_keys[0].fmr_length; + info.formatter = formatter; + info.format_arg = arg; + info.head = head; + + /* For each device we support... */ + for (i = 0; i < XFS_GETFSMAP_DEVS; i++) { + /* Is this device within the range the user asked for? */ + if (!handlers[i].fn) + continue; + if (head->fmh_keys[0].fmr_device > handlers[i].dev) + continue; + if (head->fmh_keys[1].fmr_device < handlers[i].dev) + break; + + /* + * If this device number matches the high key, we have + * to pass the high key to the handler to limit the + * query results. If the device number exceeds the + * low key, zero out the low key so that we get + * everything from the beginning. + */ + if (handlers[i].dev == head->fmh_keys[1].fmr_device) + dkeys[1] = head->fmh_keys[1]; + if (handlers[i].dev > head->fmh_keys[0].fmr_device) + memset(&dkeys[0], 0, sizeof(struct xfs_fsmap)); + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + break; + + info.dev = handlers[i].dev; + info.last = false; + info.agno = NULLAGNUMBER; + error = handlers[i].fn(tp, dkeys, &info); + if (error) + break; + xfs_trans_cancel(tp); + tp = NULL; + info.next_daddr = 0; + } + + if (tp) + xfs_trans_cancel(tp); + head->fmh_oflags = FMH_OF_DEV_T; + return error; +} diff --git a/fs/xfs/xfs_fsmap.h b/fs/xfs/xfs_fsmap.h new file mode 100644 index 0000000000000000000000000000000000000000..0b9bf822595c0e7d43c6f43ae9104c8fcdaa995c --- /dev/null +++ b/fs/xfs/xfs_fsmap.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. 
+ */ +#ifndef __XFS_FSMAP_H__ +#define __XFS_FSMAP_H__ + +struct fsmap; + +/* internal fsmap representation */ +struct xfs_fsmap { + dev_t fmr_device; /* device id */ + uint32_t fmr_flags; /* mapping flags */ + uint64_t fmr_physical; /* device offset of segment */ + uint64_t fmr_owner; /* owner id */ + xfs_fileoff_t fmr_offset; /* file offset of segment */ + xfs_filblks_t fmr_length; /* length of segment, blocks */ +}; + +struct xfs_fsmap_head { + uint32_t fmh_iflags; /* control flags */ + uint32_t fmh_oflags; /* output flags */ + unsigned int fmh_count; /* # of entries in array incl. input */ + unsigned int fmh_entries; /* # of entries filled in (output). */ + + struct xfs_fsmap fmh_keys[2]; /* low and high keys */ +}; + +void xfs_fsmap_from_internal(struct fsmap *dest, struct xfs_fsmap *src); +void xfs_fsmap_to_internal(struct xfs_fsmap *dest, struct fsmap *src); + +/* fsmap to userspace formatter - copy to user & advance pointer */ +typedef int (*xfs_fsmap_format_t)(struct xfs_fsmap *, void *); + +int xfs_getfsmap(struct xfs_mount *mp, struct xfs_fsmap_head *head, + xfs_fsmap_format_t formatter, void *arg); + +#endif /* __XFS_FSMAP_H__ */ diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 3531f8f72fa5e10b83f0fa8bd37afc560b2dbf0a..f61c84f8e31a363ce144424336359630647eac07 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -262,6 +262,22 @@ xfs_inode_clear_reclaim_tag( xfs_perag_clear_reclaim_tag(pag); } +static void +xfs_inew_wait( + struct xfs_inode *ip) +{ + wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_INEW_BIT); + DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_INEW_BIT); + + do { + prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); + if (!xfs_iflags_test(ip, XFS_INEW)) + break; + schedule(); + } while (true); + finish_wait(wq, &wait.wait); +} + /* * When we recycle a reclaimable inode, we need to re-initialise the VFS inode * part of the structure. This is made more complex by the fact we store @@ -366,14 +382,17 @@ xfs_iget_cache_hit( error = xfs_reinit_inode(mp, inode); if (error) { + bool wake; /* * Re-initializing the inode failed, and we are in deep * trouble. Try to re-add it to the reclaim list. */ rcu_read_lock(); spin_lock(&ip->i_flags_lock); - + wake = !!__xfs_iflags_test(ip, XFS_INEW); ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); + if (wake) + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); ASSERT(ip->i_flags & XFS_IRECLAIMABLE); trace_xfs_iget_reclaim_fail(ip); goto out_error; @@ -623,9 +642,11 @@ xfs_iget( STATIC int xfs_inode_ag_walk_grab( - struct xfs_inode *ip) + struct xfs_inode *ip, + int flags) { struct inode *inode = VFS_I(ip); + bool newinos = !!(flags & XFS_AGITER_INEW_WAIT); ASSERT(rcu_read_lock_held()); @@ -643,7 +664,8 @@ xfs_inode_ag_walk_grab( goto out_unlock_noent; /* avoid new or reclaimable inodes. 
Leave for reclaim code to flush */ - if (__xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + if ((!newinos && __xfs_iflags_test(ip, XFS_INEW)) || + __xfs_iflags_test(ip, XFS_IRECLAIMABLE | XFS_IRECLAIM)) goto out_unlock_noent; spin_unlock(&ip->i_flags_lock); @@ -671,7 +693,8 @@ xfs_inode_ag_walk( void *args), int flags, void *args, - int tag) + int tag, + int iter_flags) { uint32_t first_index; int last_error = 0; @@ -713,7 +736,7 @@ xfs_inode_ag_walk( for (i = 0; i < nr_found; i++) { struct xfs_inode *ip = batch[i]; - if (done || xfs_inode_ag_walk_grab(ip)) + if (done || xfs_inode_ag_walk_grab(ip, iter_flags)) batch[i] = NULL; /* @@ -741,6 +764,9 @@ xfs_inode_ag_walk( for (i = 0; i < nr_found; i++) { if (!batch[i]) continue; + if ((iter_flags & XFS_AGITER_INEW_WAIT) && + xfs_iflags_test(batch[i], XFS_INEW)) + xfs_inew_wait(batch[i]); error = execute(batch[i], flags, args); IRELE(batch[i]); if (error == -EAGAIN) { @@ -820,12 +846,13 @@ xfs_cowblocks_worker( } int -xfs_inode_ag_iterator( +xfs_inode_ag_iterator_flags( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, - void *args) + void *args, + int iter_flags) { struct xfs_perag *pag; int error = 0; @@ -835,7 +862,8 @@ xfs_inode_ag_iterator( ag = 0; while ((pag = xfs_perag_get(mp, ag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1, + iter_flags); xfs_perag_put(pag); if (error) { last_error = error; @@ -846,6 +874,17 @@ xfs_inode_ag_iterator( return last_error; } +int +xfs_inode_ag_iterator( + struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, + void *args), + int flags, + void *args) +{ + return xfs_inode_ag_iterator_flags(mp, execute, flags, args, 0); +} + int xfs_inode_ag_iterator_tag( struct xfs_mount *mp, @@ -863,7 +902,8 @@ xfs_inode_ag_iterator_tag( ag = 0; while ((pag = xfs_perag_get_tag(mp, ag, tag))) { ag = pag->pag_agno + 1; - error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); + error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag, + 0); xfs_perag_put(pag); if (error) { last_error = error; diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h index 8a7c849b4deade24831871a28d76c9af5c7f3629..9183f77958ef7946b50d6b3fde672ad6108702d4 100644 --- a/fs/xfs/xfs_icache.h +++ b/fs/xfs/xfs_icache.h @@ -48,6 +48,11 @@ struct xfs_eofblocks { #define XFS_IGET_UNTRUSTED 0x2 #define XFS_IGET_DONTCACHE 0x4 +/* + * flags for AG inode iterator + */ +#define XFS_AGITER_INEW_WAIT 0x1 /* wait on new inodes */ + int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, uint flags, uint lock_flags, xfs_inode_t **ipp); @@ -79,6 +84,9 @@ void xfs_cowblocks_worker(struct work_struct *); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args); +int xfs_inode_ag_iterator_flags(struct xfs_mount *mp, + int (*execute)(struct xfs_inode *ip, int flags, void *args), + int flags, void *args, int iter_flags); int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, int flags, void *args), int flags, void *args, int tag); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 7605d83965963566e1d6acef679591cf9787d161..ec9826c565009e5244682beae93c4efc5261b487 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1906,12 +1906,13 @@ xfs_inactive( * force is true because we are evicting an inode from the * cache. 
Post-eof blocks must be freed, lest we end up with * broken free space accounting. + * + * Note: don't bother with iolock here since lockdep complains + * about acquiring it in reclaim context. We have the only + * reference to the inode at this point anyways. */ - if (xfs_can_free_eofblocks(ip, true)) { - xfs_ilock(ip, XFS_IOLOCK_EXCL); + if (xfs_can_free_eofblocks(ip, true)) xfs_free_eofblocks(ip); - xfs_iunlock(ip, XFS_IOLOCK_EXCL); - } return; } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 10dcf27b4c85ae9d437e77a554dd15f139a40bb3..10e89fcb49d7441d0a4e5e931809562de4c4b754 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -216,7 +216,8 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) #define XFS_IRECLAIM (1 << 0) /* started reclaiming this inode */ #define XFS_ISTALE (1 << 1) /* inode has been staled */ #define XFS_IRECLAIMABLE (1 << 2) /* inode can be reclaimed */ -#define XFS_INEW (1 << 3) /* inode has just been allocated */ +#define __XFS_INEW_BIT 3 /* inode has just been allocated */ +#define XFS_INEW (1 << __XFS_INEW_BIT) #define XFS_ITRUNCATED (1 << 5) /* truncated down so flush-on-close */ #define XFS_IDIRTY_RELEASE (1 << 6) /* dirty release already seen */ #define __XFS_IFLOCK_BIT 7 /* inode is being flushed right now */ @@ -464,6 +465,7 @@ static inline void xfs_finish_inode_setup(struct xfs_inode *ip) xfs_iflags_clear(ip, XFS_INEW); barrier(); unlock_new_inode(VFS_I(ip)); + wake_up_bit(&ip->i_flags, __XFS_INEW_BIT); } static inline void xfs_setup_existing_inode(struct xfs_inode *ip) diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d90e7811ccdd9ccec14f9ebb29219bc065d36270..08cb7d1a4a3a40ebc5f456c3767d998f433c8ed4 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -731,22 +731,27 @@ xfs_iflush_done( * holding the lock before removing the inode from the AIL. */ if (need_ail) { - struct xfs_log_item *log_items[need_ail]; - int i = 0; + bool mlip_changed = false; + + /* this is an opencoded batch version of xfs_trans_ail_delete */ spin_lock(&ailp->xa_lock); for (blip = lip; blip; blip = blip->li_bio_list) { - iip = INODE_ITEM(blip); - if (iip->ili_logged && - blip->li_lsn == iip->ili_flush_lsn) { - log_items[i++] = blip; - } - ASSERT(i <= need_ail); + if (INODE_ITEM(blip)->ili_logged && + blip->li_lsn == INODE_ITEM(blip)->ili_flush_lsn) + mlip_changed |= xfs_ail_delete_one(ailp, blip); } - /* xfs_trans_ail_delete_bulk() drops the AIL lock. */ - xfs_trans_ail_delete_bulk(ailp, log_items, i, - SHUTDOWN_CORRUPT_INCORE); - } + if (mlip_changed) { + if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) + xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (list_empty(&ailp->xa_ail)) + wake_up_all(&ailp->xa_empty); + } + spin_unlock(&ailp->xa_lock); + + if (mlip_changed) + xfs_log_space_wake(ailp->xa_mount); + } /* * clean up and unlock the flush lock now we are done. We can clear the diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 2fd7fdf5438f0be85220b9981bff09efce773079..6190697603c947d3706e102e4c00b9c391bfbe2f 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -41,6 +41,9 @@ #include "xfs_trans.h" #include "xfs_pnfs.h" #include "xfs_acl.h" +#include "xfs_btree.h" +#include +#include "xfs_fsmap.h" #include #include @@ -1543,10 +1546,11 @@ xfs_ioc_getbmap( unsigned int cmd, void __user *arg) { - struct getbmapx bmx; + struct getbmapx bmx = { 0 }; int error; - if (copy_from_user(&bmx, arg, sizeof(struct getbmapx))) + /* struct getbmap is a strict subset of struct getbmapx. 
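+ * Copying only that prefix is therefore enough; bmv_iflags stays
+ * zero thanks to the initializer above.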
*/ + if (copy_from_user(&bmx, arg, offsetof(struct getbmapx, bmv_iflags))) return -EFAULT; if (bmx.bmv_count < 2) @@ -1608,6 +1612,84 @@ xfs_ioc_getbmapx( return 0; } +struct getfsmap_info { + struct xfs_mount *mp; + struct fsmap_head __user *data; + unsigned int idx; + __u32 last_flags; +}; + +STATIC int +xfs_getfsmap_format(struct xfs_fsmap *xfm, void *priv) +{ + struct getfsmap_info *info = priv; + struct fsmap fm; + + trace_xfs_getfsmap_mapping(info->mp, xfm); + + info->last_flags = xfm->fmr_flags; + xfs_fsmap_from_internal(&fm, xfm); + if (copy_to_user(&info->data->fmh_recs[info->idx++], &fm, + sizeof(struct fsmap))) + return -EFAULT; + + return 0; +} + +STATIC int +xfs_ioc_getfsmap( + struct xfs_inode *ip, + struct fsmap_head __user *arg) +{ + struct getfsmap_info info = { NULL }; + struct xfs_fsmap_head xhead = {0}; + struct fsmap_head head; + bool aborted = false; + int error; + + if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) + return -EFAULT; + if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || + memchr_inv(head.fmh_keys[0].fmr_reserved, 0, + sizeof(head.fmh_keys[0].fmr_reserved)) || + memchr_inv(head.fmh_keys[1].fmr_reserved, 0, + sizeof(head.fmh_keys[1].fmr_reserved))) + return -EINVAL; + + xhead.fmh_iflags = head.fmh_iflags; + xhead.fmh_count = head.fmh_count; + xfs_fsmap_to_internal(&xhead.fmh_keys[0], &head.fmh_keys[0]); + xfs_fsmap_to_internal(&xhead.fmh_keys[1], &head.fmh_keys[1]); + + trace_xfs_getfsmap_low_key(ip->i_mount, &xhead.fmh_keys[0]); + trace_xfs_getfsmap_high_key(ip->i_mount, &xhead.fmh_keys[1]); + + info.mp = ip->i_mount; + info.data = arg; + error = xfs_getfsmap(ip->i_mount, &xhead, xfs_getfsmap_format, &info); + if (error == XFS_BTREE_QUERY_RANGE_ABORT) { + error = 0; + aborted = true; + } else if (error) + return error; + + /* If we didn't abort, set the "last" flag in the last fmx */ + if (!aborted && info.idx) { + info.last_flags |= FMR_OF_LAST; + if (copy_to_user(&info.data->fmh_recs[info.idx - 1].fmr_flags, + &info.last_flags, sizeof(info.last_flags))) + return -EFAULT; + } + + /* copy back header */ + head.fmh_entries = xhead.fmh_entries; + head.fmh_oflags = xhead.fmh_oflags; + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) + return -EFAULT; + + return 0; +} + int xfs_ioc_swapext( xfs_swapext_t *sxp) @@ -1788,6 +1870,9 @@ xfs_file_ioctl( case XFS_IOC_GETBMAPX: return xfs_ioc_getbmapx(ip, arg); + case FS_IOC_GETFSMAP: + return xfs_ioc_getfsmap(ip, arg); + case XFS_IOC_FD_TO_HANDLE: case XFS_IOC_PATH_TO_HANDLE: case XFS_IOC_PATH_TO_FSHANDLE: { diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 7c49938c5aed417cf0a83b08526d32fea1190fad..fa0bc4d46065ab01024bf65fb690282bef020b62 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" @@ -554,6 +555,7 @@ xfs_file_compat_ioctl( case XFS_IOC_GOINGDOWN: case XFS_IOC_ERROR_INJECTION: case XFS_IOC_ERROR_CLEARALL: + case FS_IOC_GETFSMAP: return xfs_file_ioctl(filp, cmd, p); #ifndef BROKEN_X86_ALIGNMENT /* These are handled fine if no alignment issues */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 288ee5b840d738116b8981e9618fac36fb24614f..a63f61c256bdc54f0a8ee6088702dbd5e91ee9aa 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -240,7 +240,7 @@ xfs_iomap_write_direct( */ if (IS_DAX(VFS_I(ip))) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; - if (ISUNWRITTEN(imap)) { + if (imap->br_state == XFS_EXT_UNWRITTEN) { tflags 
|= XFS_TRANS_RESERVE; resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; } @@ -945,7 +945,7 @@ static inline bool imap_needs_alloc(struct inode *inode, return !nimaps || imap->br_startblock == HOLESTARTBLOCK || imap->br_startblock == DELAYSTARTBLOCK || - (IS_DAX(inode) && ISUNWRITTEN(imap)); + (IS_DAX(inode) && imap->br_state == XFS_EXT_UNWRITTEN); } static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) @@ -976,6 +976,7 @@ xfs_file_iomap_begin( int nimaps = 1, error = 0; bool shared = false, trimmed = false; unsigned lockmode; + struct block_device *bdev; if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1063,6 +1064,14 @@ xfs_file_iomap_begin( } xfs_bmbt_to_iomap(ip, iomap, &imap); + + /* optionally associate a dax device with the iomap bdev */ + bdev = iomap->bdev; + if (blk_queue_dax(bdev->bd_queue)) + iomap->dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); + else + iomap->dax_dev = NULL; + if (shared) iomap->flags |= IOMAP_F_SHARED; return 0; @@ -1140,6 +1149,7 @@ xfs_file_iomap_end( unsigned flags, struct iomap *iomap) { + put_dax(iomap->dax_dev); if ((flags & IOMAP_WRITE) && iomap->type == IOMAP_DELALLOC) return xfs_file_iomap_end_delalloc(XFS_I(inode), offset, length, written, iomap); @@ -1170,10 +1180,10 @@ xfs_xattr_iomap_begin( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - lockmode = xfs_ilock_data_map_shared(ip); + lockmode = xfs_ilock_attr_map_shared(ip); /* if there are no attribute fork or extents, return ENOENT */ - if (XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { + if (!XFS_IFORK_Q(ip) || !ip->i_d.di_anextents) { error = -ENOENT; goto out_unlock; } diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 592fdf7111cbfb2e69d536cebe117eb00b2626be..044fb0e15390c7c5538514ec2f49b72fa5942ade 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -212,88 +212,6 @@ static inline kgid_t xfs_gid_to_kgid(__uint32_t gid) #define xfs_sort(a,n,s,fn) sort(a,n,s,fn,NULL) #define xfs_stack_trace() dump_stack() - -/* Move the kernel do_div definition off to one side */ - -#if defined __i386__ -/* For ia32 we need to pull some tricks to get past various versions - * of the compiler which do not like us using do_div in the middle - * of large functions. 
- */ -static inline __u32 xfs_do_div(void *a, __u32 b, int n) -{ - __u32 mod; - - switch (n) { - case 4: - mod = *(__u32 *)a % b; - *(__u32 *)a = *(__u32 *)a / b; - return mod; - case 8: - { - unsigned long __upper, __low, __high, __mod; - __u64 c = *(__u64 *)a; - __upper = __high = c >> 32; - __low = c; - if (__high) { - __upper = __high % (b); - __high = __high / (b); - } - asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); - asm("":"=A" (c):"a" (__low),"d" (__high)); - *(__u64 *)a = c; - return __mod; - } - } - - /* NOTREACHED */ - return 0; -} - -/* Side effect free 64 bit mod operation */ -static inline __u32 xfs_do_mod(void *a, __u32 b, int n) -{ - switch (n) { - case 4: - return *(__u32 *)a % b; - case 8: - { - unsigned long __upper, __low, __high, __mod; - __u64 c = *(__u64 *)a; - __upper = __high = c >> 32; - __low = c; - if (__high) { - __upper = __high % (b); - __high = __high / (b); - } - asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (b), "0" (__low), "1" (__upper)); - asm("":"=A" (c):"a" (__low),"d" (__high)); - return __mod; - } - } - - /* NOTREACHED */ - return 0; -} -#else -static inline __u32 xfs_do_div(void *a, __u32 b, int n) -{ - __u32 mod; - - switch (n) { - case 4: - mod = *(__u32 *)a % b; - *(__u32 *)a = *(__u32 *)a / b; - return mod; - case 8: - mod = do_div(*(__u64 *)a, b); - return mod; - } - - /* NOTREACHED */ - return 0; -} - /* Side effect free 64 bit mod operation */ static inline __u32 xfs_do_mod(void *a, __u32 b, int n) { @@ -310,10 +228,7 @@ static inline __u32 xfs_do_mod(void *a, __u32 b, int n) /* NOTREACHED */ return 0; } -#endif -#undef do_div -#define do_div(a, b) xfs_do_div(&(a), (b), sizeof(a)) #define do_mod(a, b) xfs_do_mod(&(a), (b), sizeof(a)) static inline __uint64_t roundup_64(__uint64_t x, __uint32_t y) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index b1469f0a91a6c04329cb2fff2dc6e9953485a071..3731f13f63e982e50c0342ec7550c3e6e0a1294b 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1293,7 +1293,7 @@ void xfs_log_work_queue( struct xfs_mount *mp) { - queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, + queue_delayed_work(mp->m_sync_workqueue, &mp->m_log->l_work, msecs_to_jiffies(xfs_syncd_centisecs * 10)); } @@ -1852,7 +1852,7 @@ xlog_sync( */ if (log->l_badcrc_factor && (prandom_u32() % log->l_badcrc_factor == 0)) { - iclog->ic_header.h_crc &= 0xAAAAAAAA; + iclog->ic_header.h_crc &= cpu_to_le32(0xAAAAAAAA); iclog->ic_state |= XLOG_STATE_IOABORT; xfs_warn(log->l_mp, "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 4a98762ec8b45f3c0dd021c04cb5bcf9a74e63bc..cd0b077deb354b6de5dda287cffaeb351e0e8cce 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3796,7 +3796,7 @@ xlog_recover_bud_pass2( * This routine is called when an inode create format structure is found in a * committed transaction in the log. Its purpose is to initialise the inodes * being allocated on disk. This requires us to get inode cluster buffers that - * match the range to be intialised, stamped with inode templates and written + * match the range to be initialised, stamped with inode templates and written * by delayed write so that subsequent modifications will hit the cached buffer * and only need writing out at the end of recovery.
 */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 688ebff1f66384a309cca74539cbe4d27172b177..2eaf8185916610ee115ebd98f8869f2df59b5fe9 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -73,6 +73,10 @@ xfs_uuid_mount( uuid_t *uuid = &mp->m_sb.sb_uuid; int hole, i; + /* Publish UUID in struct super_block */ + BUILD_BUG_ON(sizeof(mp->m_super->s_uuid) != sizeof(uuid_t)); + memcpy(&mp->m_super->s_uuid, uuid, sizeof(uuid_t)); + if (mp->m_flags & XFS_MOUNT_NOUUID) return 0; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 6db6fd6b82b0ae7eaee47fd88735325734aa51e0..9fa312a41c930cc188ec14f43097b0400a07bb27 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -183,6 +183,7 @@ typedef struct xfs_mount { struct workqueue_struct *m_reclaim_workqueue; struct workqueue_struct *m_log_workqueue; struct workqueue_struct *m_eofblocks_workqueue; + struct workqueue_struct *m_sync_workqueue; /* * Generation of the filesystem layout. This is incremented by each @@ -312,7 +313,7 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname, static inline xfs_agnumber_t xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) { - xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d); + xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d); do_div(ld, mp->m_sb.sb_agblocks); return (xfs_agnumber_t) ld; } @@ -320,7 +321,7 @@ xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d) static inline xfs_agblock_t xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) { - xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d); + xfs_rfsblock_t ld = XFS_BB_TO_FSBT(mp, d); return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks); } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index b669b123287bb115e561a6ea1c0be8ae4db359db..5fe6e70b88efe39847ea238945ba7963a741e34a 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -851,8 +851,8 @@ xfs_qm_reset_dqcounts( * started afresh by xfs_qm_quotacheck.
*/ #ifdef DEBUG - j = XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB); - do_div(j, sizeof(xfs_dqblk_t)); + j = (int)XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) / + sizeof(xfs_dqblk_t); ASSERT(mp->m_quotainfo->qi_dqperchunk == j); #endif dqb = bp->b_addr; @@ -1384,12 +1384,7 @@ xfs_qm_quotacheck( mp->m_qflags |= flags; error_return: - while (!list_empty(&buffer_list)) { - struct xfs_buf *bp = - list_first_entry(&buffer_list, struct xfs_buf, b_list); - list_del_init(&bp->b_list); - xfs_buf_relse(bp); - } + xfs_buf_delwri_cancel(&buffer_list); if (error) { xfs_warn(mp, diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 475a3882a81fef1cedaabf33383d04c1ee8d3e4b..9cb5c381b01cf3b53cbaadbc738906ec8952402c 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -759,5 +759,6 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); + xfs_inode_ag_iterator_flags(mp, xfs_dqrele_inode, flags, NULL, + XFS_AGITER_INEW_WAIT); } diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index 6e4c7446c3d4561f85d86686d5b4a40bc4cd0ce6..96fe209b5eb61fb5b45fcc46f6733c6b46f29308 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -221,6 +221,7 @@ void xfs_cui_release( struct xfs_cui_log_item *cuip) { + ASSERT(atomic_read(&cuip->cui_refcount) > 0); if (atomic_dec_and_test(&cuip->cui_refcount)) { xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR); xfs_cui_item_free(cuip); diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 4a84c5ea266d8f8fcec61aa55776fec339d27aaf..ffe6fe7a7eb546721827bcdb8e810a8777a1455b 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -206,11 +206,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_reflink_inode(ip) || - ISUNWRITTEN(irec) || - irec->br_startblock == HOLESTARTBLOCK || - irec->br_startblock == DELAYSTARTBLOCK || - isnullstartblock(irec->br_startblock)) { + if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_real_extent(irec)) { *shared = false; return 0; } @@ -709,8 +705,22 @@ xfs_reflink_end_cow( offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count); - /* Start a rolling transaction to switch the mappings */ - resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK); + /* + * Start a rolling transaction to switch the mappings. We're + * unlikely ever to have to remap 16T worth of single-block + * extents, so just cap the worst case extent count to 2^32-1. + * Stick a warning in just in case, and avoid 64-bit division. 
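+ * (At the common 4k block size, 2^32-1 single-block extents already
+ * map roughly 16T, matching the bound above.)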
+ */ + BUILD_BUG_ON(MAX_RW_COUNT > UINT_MAX); + if (end_fsb - offset_fsb > UINT_MAX) { + error = -EFSCORRUPTED; + xfs_force_shutdown(ip->i_mount, SHUTDOWN_CORRUPT_INCORE); + ASSERT(0); + goto out; + } + resblks = XFS_NEXTENTADD_SPACE_RES(ip->i_mount, + (unsigned int)(end_fsb - offset_fsb), + XFS_DATA_FORK); error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write, resblks, 0, 0, &tp); if (error) @@ -1045,12 +1055,12 @@ xfs_reflink_remap_extent( xfs_off_t new_isize) { struct xfs_mount *mp = ip->i_mount; + bool real_extent = xfs_bmap_is_real_extent(irec); struct xfs_trans *tp; xfs_fsblock_t firstfsb; unsigned int resblks; struct xfs_defer_ops dfops; struct xfs_bmbt_irec uirec; - bool real_extent; xfs_filblks_t rlen; xfs_filblks_t unmap_len; xfs_off_t newlen; @@ -1059,11 +1069,6 @@ xfs_reflink_remap_extent( unmap_len = irec->br_startoff + irec->br_blockcount - destoff; trace_xfs_reflink_punch_range(ip, destoff, unmap_len); - /* Only remap normal extents. */ - real_extent = (irec->br_startblock != HOLESTARTBLOCK && - irec->br_startblock != DELAYSTARTBLOCK && - !ISUNWRITTEN(irec)); - /* No reflinking if we're low on space */ if (real_extent) { error = xfs_reflink_ag_has_free_space(mp, @@ -1359,9 +1364,7 @@ xfs_reflink_dirty_extents( goto out; if (nmaps == 0) break; - if (map[0].br_startblock == HOLESTARTBLOCK || - map[0].br_startblock == DELAYSTARTBLOCK || - ISUNWRITTEN(&map[0])) + if (!xfs_bmap_is_real_extent(&map[0])) goto next; map[1] = map[0]; @@ -1435,9 +1438,7 @@ xfs_reflink_clear_inode_flag( return error; if (nmaps == 0) break; - if (map.br_startblock == HOLESTARTBLOCK || - map.br_startblock == DELAYSTARTBLOCK || - ISUNWRITTEN(&map)) + if (!xfs_bmap_is_real_extent(&map)) goto next; agno = XFS_FSB_TO_AGNO(mp, map.br_startblock); diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 73c827831551942550c3e0a3a66c4a7abdf52ef6..f3b139c9aa1674a3f652ec622286e9aba616d5c3 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -243,6 +243,7 @@ void xfs_rui_release( struct xfs_rui_log_item *ruip) { + ASSERT(atomic_read(&ruip->rui_refcount) > 0); if (atomic_dec_and_test(&ruip->rui_refcount)) { xfs_trans_ail_remove(&ruip->rui_item, SHUTDOWN_LOG_IO_ERROR); xfs_rui_item_free(ruip); diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 51dd3c7266086d5039e5b77f8ec9ea72c5d95f31..f13133e6f19fa4a25e4359d19d7cadebf5e2b970 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -23,6 +23,16 @@ struct xfs_mount; struct xfs_trans; +struct xfs_rtalloc_rec { + xfs_rtblock_t ar_startblock; + xfs_rtblock_t ar_blockcount; +}; + +typedef int (*xfs_rtalloc_query_range_fn)( + struct xfs_trans *tp, + struct xfs_rtalloc_rec *rec, + void *priv); + #ifdef CONFIG_XFS_RT /* * Function prototypes for exported functions. 
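The xfs_rtalloc_query_range_fn typedef just introduced is a per-record callback: xfs_rtalloc_query_range() walks free-extent records of the realtime volume and invokes the callback once per record, with an opaque priv pointer threaded through. A standalone sketch of that callback pattern is below; the range filtering and the stop-on-nonzero-return convention are assumptions based on common kernel iterators, not details taken from this patch:

#include <stdint.h>
#include <stdio.h>

struct rtalloc_rec {
        uint64_t ar_startblock;
        uint64_t ar_blockcount;
};

typedef int (*rtalloc_query_range_fn)(const struct rtalloc_rec *rec,
                                      void *priv);

/* Walk records intersecting [low, high]; a nonzero callback return is
 * assumed to abort the walk and propagate the error. */
static int query_range(const struct rtalloc_rec *recs, int nrecs,
                       uint64_t low, uint64_t high,
                       rtalloc_query_range_fn fn, void *priv)
{
        for (int i = 0; i < nrecs; i++) {
                if (recs[i].ar_startblock > high)
                        break;
                if (recs[i].ar_startblock + recs[i].ar_blockcount <= low)
                        continue;
                int error = fn(&recs[i], priv);
                if (error)
                        return error;
        }
        return 0;
}

static int count_extents(const struct rtalloc_rec *rec, void *priv)
{
        (*(int *)priv)++;
        printf("free extent: start %llu len %llu\n",
               (unsigned long long)rec->ar_startblock,
               (unsigned long long)rec->ar_blockcount);
        return 0;
}

int main(void)
{
        struct rtalloc_rec recs[] = { { 0, 16 }, { 64, 32 }, { 256, 8 } };
        int n = 0;

        query_range(recs, 3, 10, 300, count_extents, &n);
        printf("%d extents intersect the range\n", n);
        return 0;
}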
@@ -118,13 +128,21 @@ int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, int xfs_rtfree_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, struct xfs_buf **rbpp, xfs_fsblock_t *rsb); - - +int xfs_rtalloc_query_range(struct xfs_trans *tp, + struct xfs_rtalloc_rec *low_rec, + struct xfs_rtalloc_rec *high_rec, + xfs_rtalloc_query_range_fn fn, + void *priv); +int xfs_rtalloc_query_all(struct xfs_trans *tp, + xfs_rtalloc_query_range_fn fn, + void *priv); #else # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) +# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) +# define xfs_rtalloc_query_all(t,f,p) (ENOSYS) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 685c042a120f16a8a9a8dad69d8ee7ce6f9274a0..455a575f101db1f22b8c8c0aca247d627f82ea4c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -52,6 +52,7 @@ #include "xfs_reflink.h" #include +#include #include #include #include @@ -877,8 +878,15 @@ xfs_init_mount_workqueues( if (!mp->m_eofblocks_workqueue) goto out_destroy_log; + mp->m_sync_workqueue = alloc_workqueue("xfs-sync/%s", WQ_FREEZABLE, 0, + mp->m_fsname); + if (!mp->m_sync_workqueue) + goto out_destroy_eofb; + return 0; +out_destroy_eofb: + destroy_workqueue(mp->m_eofblocks_workqueue); out_destroy_log: destroy_workqueue(mp->m_log_workqueue); out_destroy_reclaim: @@ -899,6 +907,7 @@ STATIC void xfs_destroy_mount_workqueues( struct xfs_mount *mp) { + destroy_workqueue(mp->m_sync_workqueue); destroy_workqueue(mp->m_eofblocks_workqueue); destroy_workqueue(mp->m_log_workqueue); destroy_workqueue(mp->m_reclaim_workqueue); diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 7f17ae6d709a1013f277ab608eb98f64298573f2..5d95fe34829438a0a2486bc50278d84b0d0714bf 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -47,6 +47,7 @@ #include "xfs_inode_item.h" #include "xfs_bmap_btree.h" #include "xfs_filestream.h" +#include "xfs_fsmap.h" /* * We include this last to have the helpers above available for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 383ac227ce2c324cee29b5e8a209fa6b98583192..7c5a16528d8bb0506e999d2673404604c2b76920 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -40,6 +40,8 @@ struct xfs_inode_log_format; struct xfs_bmbt_irec; struct xfs_btree_cur; struct xfs_refcount_irec; +struct xfs_fsmap; +struct xfs_rmap_irec; DECLARE_EVENT_CLASS(xfs_attr_list_class, TP_PROTO(struct xfs_attr_list_context *ctx), @@ -2190,7 +2192,7 @@ DECLARE_EVENT_CLASS(xfs_discard_class, __entry->agbno = agbno; __entry->len = len; ), - TP_printk("dev %d:%d agno %u agbno %u len %u\n", + TP_printk("dev %d:%d agno %u agbno %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2253,8 +2255,8 @@ DECLARE_EVENT_CLASS(xfs_defer_class, TP_STRUCT__entry( __field(dev_t, dev) __field(void *, dop) - __field(bool, committed) - __field(bool, low) + __field(char, committed) + __field(char, low) ), TP_fast_assign( __entry->dev = mp ? 
mp->m_super->s_dev : 0; @@ -2262,7 +2264,7 @@ DECLARE_EVENT_CLASS(xfs_defer_class, __entry->committed = dop->dop_committed; __entry->low = dop->dop_low; ), - TP_printk("dev %d:%d ops %p committed %d low %d\n", + TP_printk("dev %d:%d ops %p committed %d low %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dop, __entry->committed, @@ -2279,8 +2281,8 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class, TP_STRUCT__entry( __field(dev_t, dev) __field(void *, dop) - __field(bool, committed) - __field(bool, low) + __field(char, committed) + __field(char, low) __field(int, error) ), TP_fast_assign( @@ -2290,7 +2292,7 @@ DECLARE_EVENT_CLASS(xfs_defer_error_class, __entry->low = dop->dop_low; __entry->error = error; ), - TP_printk("dev %d:%d ops %p committed %d low %d err %d\n", + TP_printk("dev %d:%d ops %p committed %d low %d err %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->dop, __entry->committed, @@ -2309,7 +2311,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __field(dev_t, dev) __field(int, type) __field(void *, intent) - __field(bool, committed) + __field(char, committed) __field(int, nr) ), TP_fast_assign( @@ -2319,7 +2321,7 @@ DECLARE_EVENT_CLASS(xfs_defer_pending_class, __entry->committed = dfp->dfp_done != NULL; __entry->nr = dfp->dfp_count; ), - TP_printk("dev %d:%d optype %d intent %p committed %d nr %d\n", + TP_printk("dev %d:%d optype %d intent %p committed %d nr %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->type, __entry->intent, @@ -2614,7 +2616,8 @@ DECLARE_EVENT_CLASS(xfs_ag_resv_class, __entry->asked = r ? r->ar_asked : 0; __entry->len = len; ), - TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u resv %u ask %u len %u\n", + TP_printk("dev %d:%d agno %u resv %d freeblks %u flcount %u " + "resv %u ask %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->resv, @@ -2667,7 +2670,7 @@ DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class, __entry->agbno = agbno; __entry->dir = dir; ), - TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n", + TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, @@ -2700,7 +2703,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_class, __entry->blockcount = irec->rc_blockcount; __entry->refcount = irec->rc_refcount; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n", + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, @@ -2735,7 +2738,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class, __entry->refcount = irec->rc_refcount; __entry->agbno = agbno; ), - TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n", + TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->startblock, @@ -2776,7 +2779,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class, __entry->i2_refcount = i2->rc_refcount; ), TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u\n", + "agbno %u len %u refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -2822,7 +2825,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class, __entry->agbno = agbno; ), TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u @ agbno %u\n", + "agbno %u len %u refcount %u @ agbno %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -2875,7 
+2878,7 @@ DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class, ), TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- " "agbno %u len %u refcount %u -- " - "agbno %u len %u refcount %u\n", + "agbno %u len %u refcount %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->i1_startblock, @@ -3001,31 +3004,6 @@ DEFINE_EVENT(xfs_inode_error_class, name, \ unsigned long caller_ip), \ TP_ARGS(ip, error, caller_ip)) -/* reflink allocator */ -TRACE_EVENT(xfs_bmap_remap_alloc, - TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno, - xfs_extlen_t len), - TP_ARGS(ip, fsbno, len), - TP_STRUCT__entry( - __field(dev_t, dev) - __field(xfs_ino_t, ino) - __field(xfs_fsblock_t, fsbno) - __field(xfs_extlen_t, len) - ), - TP_fast_assign( - __entry->dev = VFS_I(ip)->i_sb->s_dev; - __entry->ino = ip->i_ino; - __entry->fsbno = fsbno; - __entry->len = len; - ), - TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, - __entry->fsbno, - __entry->len) -); -DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error); - /* reflink tracepoint classes */ /* two-file io tracepoint class */ @@ -3227,7 +3205,7 @@ TRACE_EVENT(xfs_ioctl_clone, ), TP_printk("dev %d:%d " "ino 0x%lx isize 0x%llx -> " - "ino 0x%lx isize 0x%llx\n", + "ino 0x%lx isize 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->src_ino, __entry->src_isize, @@ -3267,6 +3245,88 @@ DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap); DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece); DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error); +/* fsmap traces */ +DECLARE_EVENT_CLASS(xfs_fsmap_class, + TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, + struct xfs_rmap_irec *rmap), + TP_ARGS(mp, keydev, agno, rmap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(xfs_agnumber_t, agno) + __field(xfs_fsblock_t, bno) + __field(xfs_filblks_t, len) + __field(__uint64_t, owner) + __field(__uint64_t, offset) + __field(unsigned int, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->keydev = new_decode_dev(keydev); + __entry->agno = agno; + __entry->bno = rmap->rm_startblock; + __entry->len = rmap->rm_blockcount; + __entry->owner = rmap->rm_owner; + __entry->offset = rmap->rm_offset; + __entry->flags = rmap->rm_flags; + ), + TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld offset %llu flags 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->agno, + __entry->bno, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +) +#define DEFINE_FSMAP_EVENT(name) \ +DEFINE_EVENT(xfs_fsmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, u32 keydev, xfs_agnumber_t agno, \ + struct xfs_rmap_irec *rmap), \ + TP_ARGS(mp, keydev, agno, rmap)) +DEFINE_FSMAP_EVENT(xfs_fsmap_low_key); +DEFINE_FSMAP_EVENT(xfs_fsmap_high_key); +DEFINE_FSMAP_EVENT(xfs_fsmap_mapping); + +DECLARE_EVENT_CLASS(xfs_getfsmap_class, + TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), + TP_ARGS(mp, fsmap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(xfs_daddr_t, block) + __field(xfs_daddr_t, len) + __field(__uint64_t, owner) + __field(__uint64_t, offset) + __field(__uint64_t, flags) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->keydev = new_decode_dev(fsmap->fmr_device); + __entry->block = fsmap->fmr_physical; + __entry->len = fsmap->fmr_length; + __entry->owner = fsmap->fmr_owner; + 
__entry->offset = fsmap->fmr_offset; + __entry->flags = fsmap->fmr_flags; + ), + TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld offset %llu flags 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->block, + __entry->len, + __entry->owner, + __entry->offset, + __entry->flags) +) +#define DEFINE_GETFSMAP_EVENT(name) \ +DEFINE_EVENT(xfs_getfsmap_class, name, \ + TP_PROTO(struct xfs_mount *mp, struct xfs_fsmap *fsmap), \ + TP_ARGS(mp, fsmap)) +DEFINE_GETFSMAP_EVENT(xfs_getfsmap_low_key); +DEFINE_GETFSMAP_EVENT(xfs_getfsmap_high_key); +DEFINE_GETFSMAP_EVENT(xfs_getfsmap_mapping); + #endif /* _TRACE_XFS_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 70f42ea86dfbd4a7de39d0709ee4a90e59f0a758..2011620008de8a4d422643bc20dbc2313873c16e 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -134,7 +134,7 @@ xfs_trans_reserve( bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; /* Mark this thread as being in a transaction */ - current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); /* * Attempt to reserve the needed disk blocks by decrementing @@ -144,7 +144,7 @@ xfs_trans_reserve( if (blocks > 0) { error = xfs_mod_fdblocks(tp->t_mountp, -((int64_t)blocks), rsvd); if (error != 0) { - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return -ENOSPC; } tp->t_blk_res += blocks; @@ -221,7 +221,7 @@ xfs_trans_reserve( tp->t_blk_res = 0; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); return error; } @@ -262,6 +262,28 @@ xfs_trans_alloc( return 0; } +/* + * Create an empty transaction with no reservation. This is a defensive + * mechanism for routines that query metadata without actually modifying + * them -- if the metadata being queried is somehow cross-linked (think a + * btree block pointer that points higher in the tree), we risk deadlock. + * However, blocks grabbed as part of a transaction can be re-grabbed. + * The verifiers will notice the corrupt block and the operation will fail + * back to userspace without deadlocking. + * + * Note the zero-length reservation; this transaction MUST be cancelled + * without any dirty data. + */ +int +xfs_trans_alloc_empty( + struct xfs_mount *mp, + struct xfs_trans **tpp) +{ + struct xfs_trans_res resv = {0}; + + return xfs_trans_alloc(mp, &resv, 0, 0, XFS_TRANS_NO_WRITECOUNT, tpp); +} + /* * Record the indicated change to the given field for application * to the file system's superblock when the transaction commits. 
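The comment on xfs_trans_alloc_empty() describes when a reservation-less transaction is useful: read-only metadata walks that still need transaction context so re-grabbed buffers don't deadlock. A minimal caller would look roughly like the sketch below. This is written against the interfaces added in this patch, but error handling is trimmed and the query step is a placeholder, not a real btree walk:

/* Sketch only: "walk the metadata" stands in for whatever read-only
 * query the caller performs under the empty transaction. */
int query_some_metadata(struct xfs_mount *mp)
{
        struct xfs_trans *tp;
        int error;

        error = xfs_trans_alloc_empty(mp, &tp);
        if (error)
                return error;

        /* ... grab buffers / walk btrees under the empty transaction ... */

        /* Nothing was dirtied, so cancel rather than commit. */
        xfs_trans_cancel(tp);
        return 0;
}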
@@ -914,7 +936,7 @@ __xfs_trans_commit( xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free(tp); /* @@ -944,7 +966,7 @@ __xfs_trans_commit( if (commit_lsn == -1 && !error) error = -EIO; } - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); @@ -998,7 +1020,7 @@ xfs_trans_cancel( xfs_log_done(mp, tp->t_ticket, NULL, false); /* mark this thread as no longer being in a transaction */ - current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); + current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, NULLCOMMITLSN, dirty); xfs_trans_free(tp); @@ -1012,17 +1034,14 @@ xfs_trans_cancel( * chunk we've been working on and get a new transaction to continue. */ int -__xfs_trans_roll( +xfs_trans_roll( struct xfs_trans **tpp, - struct xfs_inode *dp, - int *committed) + struct xfs_inode *dp) { struct xfs_trans *trans; struct xfs_trans_res tres; int error; - *committed = 0; - /* * Ensure that the inode is always logged. */ @@ -1048,7 +1067,6 @@ __xfs_trans_roll( if (error) return error; - *committed = 1; trans = *tpp; /* @@ -1071,12 +1089,3 @@ __xfs_trans_roll( xfs_trans_ijoin(trans, dp, 0); return 0; } - -int -xfs_trans_roll( - struct xfs_trans **tpp, - struct xfs_inode *dp) -{ - int committed; - return __xfs_trans_roll(tpp, dp, &committed); -} diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 1646f659b60fbeaac09985ebbe984eb2d345cd80..a07acbf0bd8a4f491fc4976b31dfcc696e7a5b18 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -158,6 +158,8 @@ typedef struct xfs_trans { int xfs_trans_alloc(struct xfs_mount *mp, struct xfs_trans_res *resp, uint blocks, uint rtextents, uint flags, struct xfs_trans **tpp); +int xfs_trans_alloc_empty(struct xfs_mount *mp, + struct xfs_trans **tpp); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); struct xfs_buf *xfs_trans_get_buf_map(struct xfs_trans *tp, @@ -226,7 +228,6 @@ int xfs_trans_free_extent(struct xfs_trans *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t, struct xfs_owner_info *); int xfs_trans_commit(struct xfs_trans *); -int __xfs_trans_roll(struct xfs_trans **, struct xfs_inode *, int *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index d6c9c3e9e02b2c45f2cd57074cbbcdebde1e804a..9056c0f34a3c42d40a6acaba8a189a55bbf0cc97 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -684,8 +684,23 @@ xfs_trans_ail_update_bulk( } } -/* - * xfs_trans_ail_delete_bulk - remove multiple log items from the AIL +bool +xfs_ail_delete_one( + struct xfs_ail *ailp, + struct xfs_log_item *lip) +{ + struct xfs_log_item *mlip = xfs_ail_min(ailp); + + trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); + xfs_ail_delete(ailp, lip); + lip->li_flags &= ~XFS_LI_IN_AIL; + lip->li_lsn = 0; + + return mlip == lip; +} + +/** + * Remove a log item from the AIL * * @xfs_trans_ail_delete_bulk takes an array of log items that all need to * removed from the AIL. The caller is already holding the AIL lock, and done @@ -706,52 +721,36 @@ xfs_trans_ail_update_bulk( * before returning.
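With the committed out-parameter gone, xfs_trans_roll() reads as a simple "commit this chunk, hand back a fresh transaction" primitive: the caller processes a long-running change in pieces, rolling between pieces so the log reservation is renewed while the inode stays locked and joined. A usage sketch follows; more_work() and do_one_chunk() are hypothetical helpers, not functions from this patch:

/* Sketch: process a long-running change in chunks, rolling the
 * transaction between chunks so the log reservation is renewed. */
int process_in_chunks(struct xfs_trans **tpp, struct xfs_inode *ip)
{
        int error;

        while (more_work(ip)) {
                error = do_one_chunk(*tpp, ip);
                if (error)
                        return error;

                /* Commit this chunk and get a new transaction; the
                 * inode remains joined across the roll. */
                error = xfs_trans_roll(tpp, ip);
                if (error)
                        return error;
        }
        return 0;
}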
*/ void -xfs_trans_ail_delete_bulk( +xfs_trans_ail_delete( struct xfs_ail *ailp, - struct xfs_log_item **log_items, - int nr_items, + struct xfs_log_item *lip, int shutdown_type) __releases(ailp->xa_lock) { - xfs_log_item_t *mlip; - int mlip_changed = 0; - int i; + struct xfs_mount *mp = ailp->xa_mount; + bool mlip_changed; - mlip = xfs_ail_min(ailp); - - for (i = 0; i < nr_items; i++) { - struct xfs_log_item *lip = log_items[i]; - if (!(lip->li_flags & XFS_LI_IN_AIL)) { - struct xfs_mount *mp = ailp->xa_mount; - - spin_unlock(&ailp->xa_lock); - if (!XFS_FORCED_SHUTDOWN(mp)) { - xfs_alert_tag(mp, XFS_PTAG_AILDELETE, - "%s: attempting to delete a log item that is not in the AIL", - __func__); - xfs_force_shutdown(mp, shutdown_type); - } - return; + if (!(lip->li_flags & XFS_LI_IN_AIL)) { + spin_unlock(&ailp->xa_lock); + if (!XFS_FORCED_SHUTDOWN(mp)) { + xfs_alert_tag(mp, XFS_PTAG_AILDELETE, + "%s: attempting to delete a log item that is not in the AIL", + __func__); + xfs_force_shutdown(mp, shutdown_type); } - - trace_xfs_ail_delete(lip, mlip->li_lsn, lip->li_lsn); - xfs_ail_delete(ailp, lip); - lip->li_flags &= ~XFS_LI_IN_AIL; - lip->li_lsn = 0; - if (mlip == lip) - mlip_changed = 1; + return; } + mlip_changed = xfs_ail_delete_one(ailp, lip); if (mlip_changed) { - if (!XFS_FORCED_SHUTDOWN(ailp->xa_mount)) - xlog_assign_tail_lsn_locked(ailp->xa_mount); + if (!XFS_FORCED_SHUTDOWN(mp)) + xlog_assign_tail_lsn_locked(mp); if (list_empty(&ailp->xa_ail)) wake_up_all(&ailp->xa_empty); - spin_unlock(&ailp->xa_lock); + } + spin_unlock(&ailp->xa_lock); + if (mlip_changed) xfs_log_space_wake(ailp->xa_mount); - } else { - spin_unlock(&ailp->xa_lock); - } } int diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index 49931b72da8a8400f767a1cb7d368b31ce62c04b..d91706c56c633a1eb4ed243cf182d2891200e24d 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -106,18 +106,9 @@ xfs_trans_ail_update( xfs_trans_ail_update_bulk(ailp, NULL, &lip, 1, lsn); } -void xfs_trans_ail_delete_bulk(struct xfs_ail *ailp, - struct xfs_log_item **log_items, int nr_items, - int shutdown_type) - __releases(ailp->xa_lock); -static inline void -xfs_trans_ail_delete( - struct xfs_ail *ailp, - xfs_log_item_t *lip, - int shutdown_type) __releases(ailp->xa_lock) -{ - xfs_trans_ail_delete_bulk(ailp, &lip, 1, shutdown_type); -} +bool xfs_ail_delete_one(struct xfs_ail *ailp, struct xfs_log_item *lip); +void xfs_trans_ail_delete(struct xfs_ail *ailp, struct xfs_log_item *lip, + int shutdown_type) __releases(ailp->xa_lock); static inline void xfs_trans_ail_remove( diff --git a/include/Kbuild b/include/Kbuild deleted file mode 100644 index bab1145bc7a7d620c579e60abef2b6652603df78..0000000000000000000000000000000000000000 --- a/include/Kbuild +++ /dev/null @@ -1,2 +0,0 @@ -# Top-level Makefile calls into asm-$(ARCH) -# List only non-arch directories below diff --git a/include/acpi/acconfig.h b/include/acpi/acconfig.h index 07740072da55d9bc64794c5f7232be4d5e256bc0..6db3b4668b1a2b1e6c618d49565f96b1c4521021 100644 --- a/include/acpi/acconfig.h +++ b/include/acpi/acconfig.h @@ -78,6 +78,7 @@ #define ACPI_MAX_EXTPARSE_CACHE_DEPTH 96 /* Parse tree objects */ #define ACPI_MAX_OBJECT_CACHE_DEPTH 96 /* Interpreter operand objects */ #define ACPI_MAX_NAMESPACE_CACHE_DEPTH 96 /* Namespace objects */ +#define ACPI_MAX_COMMENT_CACHE_DEPTH 96 /* Comments for the -ca option */ /* * Should the subsystem abort the loading of an ACPI table if the diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index 
ef0ae8aaa56771a3a1c4e00a4712f55cbdda9114..197f3fffc9a7151ed61d0b960f5e452f6beccb5c 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -88,6 +88,7 @@ acpi_evaluate_dsm_typed(acpi_handle handle, const u8 *uuid, u64 rev, u64 func, } bool acpi_dev_found(const char *hid); +bool acpi_dev_present(const char *hid, const char *uid, s64 hrv); #ifdef CONFIG_ACPI @@ -386,6 +387,7 @@ struct acpi_data_node { const char *name; acpi_handle handle; struct fwnode_handle fwnode; + struct fwnode_handle *parent; struct acpi_device_data data; struct list_head sibling; struct kobject kobj; @@ -575,7 +577,7 @@ struct acpi_pci_root { bool acpi_dma_supported(struct acpi_device *adev); enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev); -void acpi_dma_configure(struct device *dev, enum dev_dma_attr attr); +int acpi_dma_configure(struct device *dev, enum dev_dma_attr attr); void acpi_dma_deconfigure(struct device *dev); struct acpi_device *acpi_find_child_device(struct acpi_device *parent, @@ -586,6 +588,15 @@ struct acpi_pci_root *acpi_pci_find_root(acpi_handle handle); int acpi_enable_wakeup_device_power(struct acpi_device *dev, int state); int acpi_disable_wakeup_device_power(struct acpi_device *dev); +#ifdef CONFIG_X86 +bool acpi_device_always_present(struct acpi_device *adev); +#else +static inline bool acpi_device_always_present(struct acpi_device *adev) +{ + return false; +} +#endif + #ifdef CONFIG_PM acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, void (*work_func)(struct work_struct *work)); diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 3795386ea7068a1e0a77801ee98e9a569c59f3e2..15c86ce4df537bec81aaf1e83e5961b0b9e5913f 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -46,7 +46,7 @@ /* Current ACPICA subsystem version in YYYYMMDD format */ -#define ACPI_CA_VERSION 0x20170119 +#define ACPI_CA_VERSION 0x20170303 #include #include diff --git a/include/acpi/actbl2.h b/include/acpi/actbl2.h index 7aee9fb3bd1f06f88c3a83b708ae0b5e452170cb..faa9f2c0d5de8cd545bd1ea911b352fa74a0128d 100644 --- a/include/acpi/actbl2.h +++ b/include/acpi/actbl2.h @@ -87,6 +87,7 @@ #define ACPI_SIG_WDAT "WDAT" /* Watchdog Action Table */ #define ACPI_SIG_WDDT "WDDT" /* Watchdog Timer Description Table */ #define ACPI_SIG_WDRT "WDRT" /* Watchdog Resource Table */ +#define ACPI_SIG_XXXX "XXXX" /* Intermediate AML header for ASL/ASL+ converter */ #ifdef ACPI_UNDEFINED_TABLES /* @@ -783,6 +784,15 @@ struct acpi_iort_smmu { #define ACPI_IORT_SMMU_DVM_SUPPORTED (1) #define ACPI_IORT_SMMU_COHERENT_WALK (1<<1) +/* Global interrupt format */ + +struct acpi_iort_smmu_gsi { + u32 nsg_irpt; + u32 nsg_irpt_flags; + u32 nsg_cfg_irpt; + u32 nsg_cfg_irpt_flags; +}; + struct acpi_iort_smmu_v3 { u64 base_address; /* SMMUv3 base address */ u32 flags; @@ -1294,6 +1304,7 @@ struct acpi_table_tpm2 { #define ACPI_TPM2_MEMORY_MAPPED 6 #define ACPI_TPM2_COMMAND_BUFFER 7 #define ACPI_TPM2_COMMAND_BUFFER_WITH_START_METHOD 8 +#define ACPI_TPM2_COMMAND_BUFFER_WITH_SMC 11 /******************************************************************************* * diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 427a7c3e6c75c5d3789ed12a4836d0d19c202938..2010c0516f273f05185d8131b363996c37d68903 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -103,6 +103,7 @@ struct cppc_perf_caps { u32 highest_perf; u32 nominal_perf; u32 lowest_perf; + u32 lowest_nonlinear_perf; }; struct cppc_perf_ctrls { @@ -115,7 +116,7 @@ struct cppc_perf_fb_ctrs { u64 
reference; u64 delivered; u64 reference_perf; - u64 ctr_wrap_time; + u64 wraparound_time; }; /* Per CPU container for runtime CPPC management. */ diff --git a/include/asm-generic/Kbuild.asm b/include/asm-generic/Kbuild.asm deleted file mode 100644 index d2ee86b4c091c7bcde862f8b34cb822570468d98..0000000000000000000000000000000000000000 --- a/include/asm-generic/Kbuild.asm +++ /dev/null @@ -1 +0,0 @@ -include include/uapi/asm-generic/Kbuild.asm diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 6f96247226a4d2b9c1e9aa56be423ffa0c4d49df..d6f4aed479a12b0da4ed81c3c905183a08e55e39 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -5,7 +5,9 @@ #ifdef CONFIG_GENERIC_BUG #define BUGFLAG_WARNING (1 << 0) -#define BUGFLAG_TAINT(taint) (BUGFLAG_WARNING | ((taint) << 8)) +#define BUGFLAG_ONCE (1 << 1) +#define BUGFLAG_DONE (1 << 2) +#define BUGFLAG_TAINT(taint) ((taint) << 8) #define BUG_GET_TAINT(bug) ((bug)->flags >> 8) #endif @@ -55,6 +57,18 @@ struct bug_entry { #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0) #endif +#ifdef __WARN_FLAGS +#define __WARN_TAINT(taint) __WARN_FLAGS(BUGFLAG_TAINT(taint)) +#define __WARN_ONCE_TAINT(taint) __WARN_FLAGS(BUGFLAG_ONCE|BUGFLAG_TAINT(taint)) + +#define WARN_ON_ONCE(condition) ({ \ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + __WARN_ONCE_TAINT(TAINT_WARN); \ + unlikely(__ret_warn_on); \ +}) +#endif + /* * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report * significant issues that need prompt attention if they should ever @@ -97,7 +111,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #endif #ifndef WARN -#define WARN(condition, format...) ({ \ +#define WARN(condition, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ __WARN_printf(format); \ @@ -112,6 +126,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, unlikely(__ret_warn_on); \ }) +#ifndef WARN_ON_ONCE #define WARN_ON_ONCE(condition) ({ \ static bool __section(.data.unlikely) __warned; \ int __ret_warn_once = !!(condition); \ @@ -122,6 +137,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, } \ unlikely(__ret_warn_once); \ }) +#endif #define WARN_ONCE(condition, format...) ({ \ static bool __section(.data.unlikely) __warned; \ diff --git a/include/asm-generic/extable.h b/include/asm-generic/extable.h new file mode 100644 index 0000000000000000000000000000000000000000..ca14c6664027d40f88473ca1a97bbaeef6dcc1ed --- /dev/null +++ b/include/asm-generic/extable.h @@ -0,0 +1,26 @@ +#ifndef __ASM_GENERIC_EXTABLE_H +#define __ASM_GENERIC_EXTABLE_H + +/* + * The exception table consists of pairs of addresses: the first is the + * address of an instruction that is allowed to fault, and the second is + * the address at which the program should continue. No registers are + * modified, so it is entirely up to the continuation code to figure out + * what to do. + * + * All the routines below use bits of fixup code that are out of line + * with the main instruction path. This means when everything is well, + * we don't even have to jump over them. Further, they do not intrude + * on our cache or tlb entries. 
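The new __WARN_FLAGS path above lets an architecture encode once-only behaviour in the bug table itself (BUGFLAG_ONCE, with BUGFLAG_DONE presumably latched after the first report) instead of keeping a per-callsite flag. The generic fallback keeps the static-flag scheme. Here is a userspace analogue of that fallback, using GNU C statement expressions just as the kernel macro does:

#include <stdbool.h>
#include <stdio.h>

/* Analogue of the generic WARN_ON_ONCE(): a static flag latches after
 * the first hit, so later hits stay silent. The condition's value is
 * still returned on every evaluation. */
#define warn_on_once(cond) ({                                           \
        static bool __warned;                                           \
        bool __ret = (cond);                                            \
        if (__ret && !__warned) {                                       \
                __warned = true;                                        \
                fprintf(stderr, "warning at %s:%d\n",                   \
                        __FILE__, __LINE__);                            \
        }                                                               \
        __ret;                                                          \
})

int main(void)
{
        for (int i = 0; i < 3; i++)
                if (warn_on_once(i >= 0))       /* warns only on i == 0 */
                        printf("condition was true on pass %d\n", i);
        return 0;
}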
+ */ + +struct exception_table_entry +{ + unsigned long insn, fixup; +}; + + +struct pt_regs; +extern int fixup_exception(struct pt_regs *regs); + +#endif diff --git a/include/asm-generic/mm_hooks.h b/include/asm-generic/mm_hooks.h index cc5d9a1405df925c43ec62a03be03c30ae921f56..41e5b6784b975d1974604c6265486ab858c3865f 100644 --- a/include/asm-generic/mm_hooks.h +++ b/include/asm-generic/mm_hooks.h @@ -32,10 +32,4 @@ static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, /* by default, allow everything */ return true; } - -static inline bool arch_pte_access_permitted(pte_t pte, bool write) -{ - /* by default, allow everything */ - return true; -} #endif /* _ASM_GENERIC_MM_HOOKS_H */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 1fad160f35de8e89953af075173a2ad219c9693b..7dfa767dc68012ac52ac81d48e92b3ec79c97311 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -341,6 +341,31 @@ static inline int pte_unused(pte_t pte) } #endif +#ifndef pte_access_permitted +#define pte_access_permitted(pte, write) \ + (pte_present(pte) && (!(write) || pte_write(pte))) +#endif + +#ifndef pmd_access_permitted +#define pmd_access_permitted(pmd, write) \ + (pmd_present(pmd) && (!(write) || pmd_write(pmd))) +#endif + +#ifndef pud_access_permitted +#define pud_access_permitted(pud, write) \ + (pud_present(pud) && (!(write) || pud_write(pud))) +#endif + +#ifndef p4d_access_permitted +#define p4d_access_permitted(p4d, write) \ + (p4d_present(p4d) && (!(write) || p4d_write(p4d))) +#endif + +#ifndef pgd_access_permitted +#define pgd_access_permitted(pgd, write) \ + (pgd_present(pgd) && (!(write) || pgd_write(pgd))) +#endif + #ifndef __HAVE_ARCH_PMD_SAME #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) diff --git a/include/asm-generic/set_memory.h b/include/asm-generic/set_memory.h new file mode 100644 index 0000000000000000000000000000000000000000..83e81f8996b22c0d3185e7c61ad0e566ac11aa3a --- /dev/null +++ b/include/asm-generic/set_memory.h @@ -0,0 +1,12 @@ +#ifndef __ASM_SET_MEMORY_H +#define __ASM_SET_MEMORY_H + +/* + * Functions to change memory attributes. + */ +int set_memory_ro(unsigned long addr, int numpages); +int set_memory_rw(unsigned long addr, int numpages); +int set_memory_x(unsigned long addr, int numpages); +int set_memory_nx(unsigned long addr, int numpages); + +#endif diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h index cc6bb319e464f46e8e2b8b9010261ba585718e94..bbe4bb438e3934e99748bf9c1a815ace2b0fda7f 100644 --- a/include/asm-generic/uaccess.h +++ b/include/asm-generic/uaccess.h @@ -6,7 +6,6 @@ * on any machine that has kernel and user data in the same * address space, e.g. all NOMMU machines. */ -#include #include #include @@ -35,9 +34,6 @@ static inline void set_fs(mm_segment_t fs) #define segment_eq(a, b) ((a).seg == (b).seg) #endif -#define VERIFY_READ 0 -#define VERIFY_WRITE 1 - #define access_ok(type, addr, size) __access_ok((unsigned long)(addr),(size)) /* @@ -51,87 +47,6 @@ static inline int __access_ok(unsigned long addr, unsigned long size) } #endif -/* - * The exception table consists of pairs of addresses: the first is the - * address of an instruction that is allowed to fault, and the second is - * the address at which the program should continue. No registers are - * modified, so it is entirely up to the continuation code to figure out - * what to do. 
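The pte_access_permitted() fallback added in the pgtable.h hunk reduces to: the entry must be present, and if the access is a write it must also be writable. A toy demonstration with made-up flag bits (real PTE bit layouts are per-architecture):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define PTE_PRESENT     0x1u    /* assumed bit values for the demo */
#define PTE_WRITE       0x2u

typedef uint64_t pte_t;

static bool pte_present(pte_t pte) { return pte & PTE_PRESENT; }
static bool pte_write(pte_t pte)   { return pte & PTE_WRITE; }

/* Mirrors the generic fallback: reads need presence, writes also need
 * the write bit. */
static bool pte_access_permitted(pte_t pte, bool write)
{
        return pte_present(pte) && (!write || pte_write(pte));
}

int main(void)
{
        assert( pte_access_permitted(PTE_PRESENT, false));             /* read of r/o page */
        assert(!pte_access_permitted(PTE_PRESENT, true));              /* write of r/o page */
        assert( pte_access_permitted(PTE_PRESENT | PTE_WRITE, true));  /* write of r/w page */
        assert(!pte_access_permitted(0, false));                       /* not present */
        return 0;
}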
- * - * All the routines below use bits of fixup code that are out of line - * with the main instruction path. This means when everything is well, - * we don't even have to jump over them. Further, they do not intrude - * on our cache or tlb entries. - */ - -struct exception_table_entry -{ - unsigned long insn, fixup; -}; - -/* - * architectures with an MMU should override these two - */ -#ifndef __copy_from_user -static inline __must_check long __copy_from_user(void *to, - const void __user * from, unsigned long n) -{ - if (__builtin_constant_p(n)) { - switch(n) { - case 1: - *(u8 *)to = *(u8 __force *)from; - return 0; - case 2: - *(u16 *)to = *(u16 __force *)from; - return 0; - case 4: - *(u32 *)to = *(u32 __force *)from; - return 0; -#ifdef CONFIG_64BIT - case 8: - *(u64 *)to = *(u64 __force *)from; - return 0; -#endif - default: - break; - } - } - - memcpy(to, (const void __force *)from, n); - return 0; -} -#endif - -#ifndef __copy_to_user -static inline __must_check long __copy_to_user(void __user *to, - const void *from, unsigned long n) -{ - if (__builtin_constant_p(n)) { - switch(n) { - case 1: - *(u8 __force *)to = *(u8 *)from; - return 0; - case 2: - *(u16 __force *)to = *(u16 *)from; - return 0; - case 4: - *(u32 __force *)to = *(u32 *)from; - return 0; -#ifdef CONFIG_64BIT - case 8: - *(u64 __force *)to = *(u64 *)from; - return 0; -#endif - default: - break; - } - } - - memcpy((void __force *)to, from, n); - return 0; -} -#endif - /* * These are the main single-value transfer routines. They automatically * use the right size if we just have the right pointer type. @@ -171,8 +86,7 @@ static inline __must_check long __copy_to_user(void __user *to, static inline int __put_user_fn(size_t size, void __user *ptr, void *x) { - size = __copy_to_user(ptr, x, size); - return size ? -EFAULT : size; + return unlikely(raw_copy_to_user(ptr, x, size)) ? -EFAULT : 0; } #define __put_user_fn(sz, u, k) __put_user_fn(sz, u, k) @@ -187,28 +101,28 @@ extern int __put_user_bad(void) __attribute__((noreturn)); __chk_user_ptr(ptr); \ switch (sizeof(*(ptr))) { \ case 1: { \ - unsigned char __x; \ + unsigned char __x = 0; \ __gu_err = __get_user_fn(sizeof (*(ptr)), \ ptr, &__x); \ (x) = *(__force __typeof__(*(ptr)) *) &__x; \ break; \ }; \ case 2: { \ - unsigned short __x; \ + unsigned short __x = 0; \ __gu_err = __get_user_fn(sizeof (*(ptr)), \ ptr, &__x); \ (x) = *(__force __typeof__(*(ptr)) *) &__x; \ break; \ }; \ case 4: { \ - unsigned int __x; \ + unsigned int __x = 0; \ __gu_err = __get_user_fn(sizeof (*(ptr)), \ ptr, &__x); \ (x) = *(__force __typeof__(*(ptr)) *) &__x; \ break; \ }; \ case 8: { \ - unsigned long long __x; \ + unsigned long long __x = 0; \ __gu_err = __get_user_fn(sizeof (*(ptr)), \ ptr, &__x); \ (x) = *(__force __typeof__(*(ptr)) *) &__x; \ @@ -233,12 +147,7 @@ extern int __put_user_bad(void) __attribute__((noreturn)); #ifndef __get_user_fn static inline int __get_user_fn(size_t size, const void __user *ptr, void *x) { - size_t n = __copy_from_user(x, ptr, size); - if (unlikely(n)) { - memset(x + (size - n), 0, n); - return -EFAULT; - } - return 0; + return unlikely(raw_copy_from_user(x, ptr, size)) ? 
-EFAULT : 0; } #define __get_user_fn(sz, u, k) __get_user_fn(sz, u, k) @@ -247,36 +156,6 @@ static inline int __get_user_fn(size_t size, const void __user *ptr, void *x) extern int __get_user_bad(void) __attribute__((noreturn)); -#ifndef __copy_from_user_inatomic -#define __copy_from_user_inatomic __copy_from_user -#endif - -#ifndef __copy_to_user_inatomic -#define __copy_to_user_inatomic __copy_to_user -#endif - -static inline long copy_from_user(void *to, - const void __user * from, unsigned long n) -{ - unsigned long res = n; - might_fault(); - if (likely(access_ok(VERIFY_READ, from, n))) - res = __copy_from_user(to, from, n); - if (unlikely(res)) - memset(to + (n - res), 0, res); - return res; -} - -static inline long copy_to_user(void __user *to, - const void *from, unsigned long n) -{ - might_fault(); - if (access_ok(VERIFY_WRITE, to, n)) - return __copy_to_user(to, from, n); - else - return n; -} - /* * Copy a null terminated string from userspace. */ @@ -348,4 +227,6 @@ clear_user(void __user *to, unsigned long n) return __clear_user(to, n); } +#include + #endif /* __ASM_GENERIC_UACCESS_H */ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 143db9c523e25f38488bd43302b82f316e3124a9..314a0b9219c64704151ad50680bd89d133f1af57 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -287,8 +287,6 @@ *(.rodata1) \ } \ \ - BUG_TABLE \ - \ /* PCI quirks */ \ .pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_pci_fixups_early) = .; \ @@ -568,7 +566,6 @@ IRQCHIP_OF_MATCH_TABLE() \ ACPI_PROBE_TABLE(irqchip) \ ACPI_PROBE_TABLE(clksrc) \ - ACPI_PROBE_TABLE(iort) \ EARLYCON_TABLE() #define INIT_TEXT \ @@ -857,7 +854,8 @@ READ_MOSTLY_DATA(cacheline) \ DATA_DATA \ CONSTRUCTORS \ - } + } \ + BUG_TABLE #define INIT_TEXT_SECTION(inittext_align) \ . 
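The "__x = 0" initialisations in the uaccess.h hunk matter because __get_user() assigns the temporary to the caller's variable even when the copy fails; pre-zeroing keeps a faulted copy from exposing stale stack contents. A userspace model of that pattern, where fake_copy() stands in for raw_copy_from_user() (returning the number of uncopied bytes, 0 on success) and -14 plays the role of -EFAULT:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* A NULL source models a faulting user pointer. */
static unsigned long fake_copy(void *dst, const void *src, unsigned long n)
{
        if (!src)
                return n;               /* "fault": nothing copied */
        memcpy(dst, src, n);
        return 0;
}

static int get_user_u32(uint32_t *out, const uint32_t *uptr)
{
        uint32_t x = 0;                 /* pre-zeroed, like "__x = 0" */
        int err = fake_copy(&x, uptr, sizeof(x)) ? -14 /* EFAULT */ : 0;

        /* Like the __get_user() macro, the result is stored even on
         * error; the pre-zeroing is what keeps stack garbage from
         * reaching the caller. */
        *out = x;
        return err;
}

int main(void)
{
        uint32_t good = 42, res = 0xdeadbeef;

        if (get_user_u32(&res, NULL))
                printf("faulted, res = %u (defined, not junk)\n", res);
        if (!get_user_u32(&res, &good))
                printf("ok, res = %u\n", res);
        return 0;
}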
= ALIGN(inittext_align); \ diff --git a/include/clocksource/arm_arch_timer.h b/include/clocksource/arm_arch_timer.h index caedb74c92108188e57652b56a1c198f3263c092..cc805b72994acd1442a50a4ec6e7d5593a833d76 100644 --- a/include/clocksource/arm_arch_timer.h +++ b/include/clocksource/arm_arch_timer.h @@ -16,9 +16,13 @@ #ifndef __CLKSOURCE_ARM_ARCH_TIMER_H #define __CLKSOURCE_ARM_ARCH_TIMER_H +#include #include #include +#define ARCH_TIMER_TYPE_CP15 BIT(0) +#define ARCH_TIMER_TYPE_MEM BIT(1) + #define ARCH_TIMER_CTRL_ENABLE (1 << 0) #define ARCH_TIMER_CTRL_IT_MASK (1 << 1) #define ARCH_TIMER_CTRL_IT_STAT (1 << 2) @@ -34,11 +38,27 @@ enum arch_timer_reg { ARCH_TIMER_REG_TVAL, }; +enum arch_timer_ppi_nr { + ARCH_TIMER_PHYS_SECURE_PPI, + ARCH_TIMER_PHYS_NONSECURE_PPI, + ARCH_TIMER_VIRT_PPI, + ARCH_TIMER_HYP_PPI, + ARCH_TIMER_MAX_TIMER_PPI +}; + +enum arch_timer_spi_nr { + ARCH_TIMER_PHYS_SPI, + ARCH_TIMER_VIRT_SPI, + ARCH_TIMER_MAX_TIMER_SPI +}; + #define ARCH_TIMER_PHYS_ACCESS 0 #define ARCH_TIMER_VIRT_ACCESS 1 #define ARCH_TIMER_MEM_PHYS_ACCESS 2 #define ARCH_TIMER_MEM_VIRT_ACCESS 3 +#define ARCH_TIMER_MEM_MAX_FRAMES 8 + #define ARCH_TIMER_USR_PCT_ACCESS_EN (1 << 0) /* physical counter */ #define ARCH_TIMER_USR_VCT_ACCESS_EN (1 << 1) /* virtual counter */ #define ARCH_TIMER_VIRT_EVT_EN (1 << 2) @@ -54,6 +74,20 @@ struct arch_timer_kvm_info { int virtual_irq; }; +struct arch_timer_mem_frame { + bool valid; + phys_addr_t cntbase; + size_t size; + int phys_irq; + int virt_irq; +}; + +struct arch_timer_mem { + phys_addr_t cntctlbase; + size_t size; + struct arch_timer_mem_frame frame[ARCH_TIMER_MEM_MAX_FRAMES]; +}; + #ifdef CONFIG_ARM_ARCH_TIMER extern u32 arch_timer_get_rate(void); diff --git a/include/crypto/gf128mul.h b/include/crypto/gf128mul.h index 592d47e565a8b305406ca8e4b8745d0c9715f109..0977fb18ff68c63ad66eccbb34cc4bff8c514af0 100644 --- a/include/crypto/gf128mul.h +++ b/include/crypto/gf128mul.h @@ -43,12 +43,13 @@ --------------------------------------------------------------------------- Issue Date: 31/01/2006 - An implementation of field multiplication in Galois Field GF(128) + An implementation of field multiplication in Galois Field GF(2^128) */ #ifndef _CRYPTO_GF128MUL_H #define _CRYPTO_GF128MUL_H +#include #include #include @@ -65,7 +66,7 @@ * are left and the lsb's are right. char b[16] is an array and b[0] is * the first octet. * - * 80000000 00000000 00000000 00000000 .... 00000000 00000000 00000000 + * 10000000 00000000 00000000 00000000 .... 00000000 00000000 00000000 * b[0] b[1] b[2] b[3] b[13] b[14] b[15] * * Every bit is a coefficient of some power of X. We can store the bits @@ -85,15 +86,17 @@ * Both of the above formats are easy to implement on big-endian * machines. * - * EME (which is patent encumbered) uses the ble format (bits are stored - * in big endian order and the bytes in little endian). The above buffer - * represents X^7 in this case and the primitive polynomial is b[0] = 0x87. + * XTS and EME (the latter of which is patent encumbered) use the ble + * format (bits are stored in big endian order and the bytes in little + * endian). The above buffer represents X^7 in this case and the + * primitive polynomial is b[0] = 0x87. * * The common machine word-size is smaller than 128 bits, so to make * an efficient implementation we must split into machine word sizes. - * This file uses one 32bit for the moment. Machine endianness comes into - * play. 
The lle format in relation to machine endianness is discussed - * below by the original author of gf128mul Dr Brian Gladman. + * This implementation uses 64-bit words for the moment. Machine + * endianness comes into play. The lle format in relation to machine + * endianness is discussed below by the original author of gf128mul Dr + * Brian Gladman. * * Let's look at the bbe and ble format on a little endian machine. * @@ -127,10 +130,10 @@ * machines this will automatically aligned to wordsize and on a 64-bit * machine also. */ -/* Multiply a GF128 field element by x. Field elements are held in arrays - of bytes in which field bits 8n..8n + 7 are held in byte[n], with lower - indexed bits placed in the more numerically significant bit positions - within bytes. +/* Multiply a GF(2^128) field element by x. Field elements are + held in arrays of bytes in which field bits 8n..8n + 7 are held in + byte[n], with lower indexed bits placed in the more numerically + significant bit positions within bytes. On little endian machines the bit indexes translate into the bit positions within four 32-bit words in the following way @@ -161,8 +164,58 @@ void gf128mul_lle(be128 *a, const be128 *b); void gf128mul_bbe(be128 *a, const be128 *b); -/* multiply by x in ble format, needed by XTS */ -void gf128mul_x_ble(be128 *a, const be128 *b); +/* + * The following functions multiply a field element by x in + * the polynomial field representation. They use 64-bit word operations + * to gain speed but compensate for machine endianness and hence work + * correctly on both styles of machine. + * + * They are defined here for performance. + */ + +static inline u64 gf128mul_mask_from_bit(u64 x, int which) +{ + /* a constant-time version of 'x & ((u64)1 << which) ? (u64)-1 : 0' */ + return ((s64)(x << (63 - which)) >> 63); +} + +static inline void gf128mul_x_lle(be128 *r, const be128 *x) +{ + u64 a = be64_to_cpu(x->a); + u64 b = be64_to_cpu(x->b); + + /* equivalent to gf128mul_table_le[(b << 7) & 0xff] << 48 + * (see crypto/gf128mul.c): */ + u64 _tt = gf128mul_mask_from_bit(b, 0) & ((u64)0xe1 << 56); + + r->b = cpu_to_be64((b >> 1) | (a << 63)); + r->a = cpu_to_be64((a >> 1) ^ _tt); +} + +static inline void gf128mul_x_bbe(be128 *r, const be128 *x) +{ + u64 a = be64_to_cpu(x->a); + u64 b = be64_to_cpu(x->b); + + /* equivalent to gf128mul_table_be[a >> 63] (see crypto/gf128mul.c): */ + u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87; + + r->a = cpu_to_be64((a << 1) | (b >> 63)); + r->b = cpu_to_be64((b << 1) ^ _tt); +} + +/* needed by XTS */ +static inline void gf128mul_x_ble(le128 *r, const le128 *x) +{ + u64 a = le64_to_cpu(x->a); + u64 b = le64_to_cpu(x->b); + + /* equivalent to gf128mul_table_be[b >> 63] (see crypto/gf128mul.c): */ + u64 _tt = gf128mul_mask_from_bit(a, 63) & 0x87; + + r->a = cpu_to_le64((a << 1) | (b >> 63)); + r->b = cpu_to_le64((b << 1) ^ _tt); +} /* 4k table optimization */ @@ -172,8 +225,8 @@ struct gf128mul_4k { struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g); struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g); -void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t); -void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t); +void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t); +void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t); static inline void gf128mul_free_4k(struct gf128mul_4k *t) { @@ -194,6 +247,6 @@ struct gf128mul_64k { */ struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g); void gf128mul_free_64k(struct gf128mul_64k *t); -void gf128mul_64k_bbe(be128 
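gf128mul_mask_from_bit() above builds an all-ones or all-zero word from a single bit with no data-dependent branch, which is what lets the doubling helpers fold in the reduction constant 0x87 in constant time. The standalone check below verifies that this word-wise gf128mul_x_ble() logic agrees with the familiar byte-wise XTS tweak doubling. It assumes a little-endian host, and the local struct mirrors (as I read it) the kernel's le128 layout in which 'a' is the numerically high word:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* 128-bit value in XTS "ble" layout: 16 little-endian bytes, bit 127
 * is the top bit of byte 15. */
struct le128 { uint64_t b; uint64_t a; };      /* b = low, a = high */

static uint64_t mask_from_bit(uint64_t x, int which)
{
        /* constant-time version of: x & (1ULL << which) ? ~0ULL : 0 */
        return (uint64_t)((int64_t)(x << (63 - which)) >> 63);
}

static void x_ble_words(struct le128 *r, const struct le128 *x)
{
        uint64_t a = x->a, b = x->b;
        uint64_t tt = mask_from_bit(a, 63) & 0x87;

        r->a = (a << 1) | (b >> 63);
        r->b = (b << 1) ^ tt;
}

static void x_ble_bytes(uint8_t r[16], const uint8_t x[16])
{
        /* reference byte-wise XTS tweak doubling */
        int carry = x[15] >> 7;

        for (int i = 15; i > 0; i--)
                r[i] = (uint8_t)((x[i] << 1) | (x[i - 1] >> 7));
        r[0] = (uint8_t)(x[0] << 1);
        if (carry)
                r[0] ^= 0x87;
}

int main(void)
{
        uint8_t t[16], ref[16];
        struct le128 v, w;

        for (int i = 0; i < 16; i++)
                t[i] = (uint8_t)(0xf0 + i);     /* arbitrary test pattern */

        memcpy(&v, t, 16);                      /* little-endian host assumed */
        x_ble_words(&w, &v);
        x_ble_bytes(ref, t);
        assert(memcmp(&w, ref, 16) == 0);
        puts("word-wise and byte-wise tweak doubling agree");
        return 0;
}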
*a, struct gf128mul_64k *t); +void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t); #endif /* _CRYPTO_GF128MUL_H */ diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h index 1de2b5af12d7347bce4c593dfbf7386c48ddfa66..51052f65cefc6a81ef465437038492d1fe6c00f4 100644 --- a/include/crypto/internal/acompress.h +++ b/include/crypto/internal/acompress.h @@ -78,4 +78,7 @@ int crypto_register_acomp(struct acomp_alg *alg); */ int crypto_unregister_acomp(struct acomp_alg *alg); +int crypto_register_acomps(struct acomp_alg *algs, int count); +void crypto_unregister_acomps(struct acomp_alg *algs, int count); + #endif diff --git a/include/crypto/internal/hash.h b/include/crypto/internal/hash.h index 1d4f365d8f03a439ab0e20caf27ae34ad6e13e74..f6d9af3efa45a6cc8eb448a71f5d43d72d8fcbd1 100644 --- a/include/crypto/internal/hash.h +++ b/include/crypto/internal/hash.h @@ -166,6 +166,16 @@ static inline struct ahash_instance *ahash_alloc_instance( return crypto_alloc_instance2(name, alg, ahash_instance_headroom()); } +static inline void ahash_request_complete(struct ahash_request *req, int err) +{ + req->base.complete(&req->base, err); +} + +static inline u32 ahash_request_flags(struct ahash_request *req) +{ + return req->base.flags; +} + static inline struct crypto_ahash *crypto_spawn_ahash( struct crypto_ahash_spawn *spawn) { diff --git a/include/crypto/internal/scompress.h b/include/crypto/internal/scompress.h index 3fda3c5655a0e071870dd2f6ea1859bb4b269822..ccad9b2c9bd67679790cf3eb9d884cb5f47b0202 100644 --- a/include/crypto/internal/scompress.h +++ b/include/crypto/internal/scompress.h @@ -133,4 +133,7 @@ int crypto_register_scomp(struct scomp_alg *alg); */ int crypto_unregister_scomp(struct scomp_alg *alg); +int crypto_register_scomps(struct scomp_alg *algs, int count); +void crypto_unregister_scomps(struct scomp_alg *algs, int count); + #endif diff --git a/include/crypto/kpp.h b/include/crypto/kpp.h index 4307a2f2365f49433ecf46a03007a7316ab4df58..ce8e1f79374b64c30eb045d2cff8ad021bf340b1 100644 --- a/include/crypto/kpp.h +++ b/include/crypto/kpp.h @@ -74,7 +74,7 @@ struct crypto_kpp { * @base: Common crypto API algorithm data structure */ struct kpp_alg { - int (*set_secret)(struct crypto_kpp *tfm, void *buffer, + int (*set_secret)(struct crypto_kpp *tfm, const void *buffer, unsigned int len); int (*generate_public_key)(struct kpp_request *req); int (*compute_shared_secret)(struct kpp_request *req); @@ -273,8 +273,8 @@ struct kpp_secret { * * Return: zero on success; error code in case of error */ -static inline int crypto_kpp_set_secret(struct crypto_kpp *tfm, void *buffer, - unsigned int len) +static inline int crypto_kpp_set_secret(struct crypto_kpp *tfm, + const void *buffer, unsigned int len) { struct kpp_alg *alg = crypto_kpp_alg(tfm); diff --git a/include/crypto/public_key.h b/include/crypto/public_key.h index 882ca0e1e7a5967e1dde952c8e5ecdf616b0f2fd..e0b681a717bac93ce3323b1a353fceba8310f811 100644 --- a/include/crypto/public_key.h +++ b/include/crypto/public_key.h @@ -50,9 +50,20 @@ struct key; struct key_type; union key_payload; -extern int restrict_link_by_signature(struct key *trust_keyring, +extern int restrict_link_by_signature(struct key *dest_keyring, const struct key_type *type, - const union key_payload *payload); + const union key_payload *payload, + struct key *trust_keyring); + +extern int restrict_link_by_key_or_keyring(struct key *dest_keyring, + const struct key_type *type, + const union key_payload *payload, + struct key *trusted); + +extern 
int restrict_link_by_key_or_keyring_chain(struct key *trust_keyring, + const struct key_type *type, + const union key_payload *payload, + struct key *trusted); extern int verify_signature(const struct key *key, const struct public_key_signature *sig); diff --git a/include/crypto/xts.h b/include/crypto/xts.h index 77b630672b2c1a7754605a2094fb714b5ff002b8..c0bde308b28ae6566b6f96046b3a53b9951bec7f 100644 --- a/include/crypto/xts.h +++ b/include/crypto/xts.h @@ -11,7 +11,7 @@ struct blkcipher_desc; #define XTS_BLOCK_SIZE 16 struct xts_crypt_req { - be128 *tbuf; + le128 *tbuf; unsigned int tbuflen; void *tweak_ctx; diff --git a/include/drm/drm_mem_util.h b/include/drm/drm_mem_util.h index 70d4e221a3adb485ff7927c33737db23a127b577..d0f6cf2e5324411ac20d3f53515df0899a9601ed 100644 --- a/include/drm/drm_mem_util.h +++ b/include/drm/drm_mem_util.h @@ -37,8 +37,7 @@ static __inline__ void *drm_calloc_large(size_t nmemb, size_t size) if (size * nmemb <= PAGE_SIZE) return kcalloc(nmemb, size, GFP_KERNEL); - return __vmalloc(size * nmemb, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); + return vzalloc(size * nmemb); } /* Modeled after cairo's malloc_ab, it's like calloc but without the zeroing. */ @@ -50,8 +49,7 @@ static __inline__ void *drm_malloc_ab(size_t nmemb, size_t size) if (size * nmemb <= PAGE_SIZE) return kmalloc(nmemb * size, GFP_KERNEL); - return __vmalloc(size * nmemb, - GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL); + return vmalloc(size * nmemb); } static __inline__ void *drm_malloc_gfp(size_t nmemb, size_t size, gfp_t gfp) @@ -69,8 +67,7 @@ static __inline__ void *drm_malloc_gfp(size_t nmemb, size_t size, gfp_t gfp) return ptr; } - return __vmalloc(size * nmemb, - gfp | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size * nmemb, gfp, PAGE_KERNEL); } static __inline void drm_free_large(void *ptr) diff --git a/include/dt-bindings/clock/gxbb-clkc.h b/include/dt-bindings/clock/gxbb-clkc.h index 692846c7941b53ac8449ed69a21b2a946a89c3bf..3190e30b9398ca60037193daf59efd7006bf66ab 100644 --- a/include/dt-bindings/clock/gxbb-clkc.h +++ b/include/dt-bindings/clock/gxbb-clkc.h @@ -10,12 +10,18 @@ #define CLKID_FCLK_DIV2 4 #define CLKID_FCLK_DIV3 5 #define CLKID_FCLK_DIV4 6 +#define CLKID_GP0_PLL 9 #define CLKID_CLK81 12 #define CLKID_MPLL2 15 -#define CLKID_SPI 34 #define CLKID_I2C 22 #define CLKID_SAR_ADC 23 +#define CLKID_RNG0 25 +#define CLKID_SPI 34 #define CLKID_ETH 36 +#define CLKID_AIU_GLUE 38 +#define CLKID_I2S_OUT 40 +#define CLKID_MIXER_IFACE 44 +#define CLKID_AIU 47 #define CLKID_USB0 50 #define CLKID_USB1 51 #define CLKID_USB 55 @@ -24,11 +30,17 @@ #define CLKID_USB0_DDR_BRIDGE 65 #define CLKID_SANA 69 #define CLKID_GCLK_VENCI_INT0 77 +#define CLKID_AOCLK_GATE 80 #define CLKID_AO_I2C 93 #define CLKID_SD_EMMC_A 94 #define CLKID_SD_EMMC_B 95 #define CLKID_SD_EMMC_C 96 #define CLKID_SAR_ADC_CLK 97 #define CLKID_SAR_ADC_SEL 98 +#define CLKID_MALI_0_SEL 100 +#define CLKID_MALI_0 102 +#define CLKID_MALI_1_SEL 103 +#define CLKID_MALI_1 105 +#define CLKID_MALI 106 #endif /* __GXBB_CLKC_H */ diff --git a/include/dt-bindings/clock/hi6220-clock.h b/include/dt-bindings/clock/hi6220-clock.h index 6b03c84f42783635573307c053d63e9f0d3ee083..b8ba665aab7b3486ad39777acda4a4aa0a249a5d 100644 --- a/include/dt-bindings/clock/hi6220-clock.h +++ b/include/dt-bindings/clock/hi6220-clock.h @@ -124,7 +124,10 @@ #define HI6220_CS_DAPB 57 #define HI6220_CS_ATB_DIV 58 -#define HI6220_SYS_NR_CLKS 59 +/* gate clock */ +#define HI6220_DAPB_CLK 59 + +#define HI6220_SYS_NR_CLKS 60 /* clk in Hi6220 media controller 
*/ /* gate clocks */ diff --git a/include/dt-bindings/clock/mt6797-clk.h b/include/dt-bindings/clock/mt6797-clk.h new file mode 100644 index 0000000000000000000000000000000000000000..2f25a5aca01992fa32aa421bb3c5c8c6b3120b38 --- /dev/null +++ b/include/dt-bindings/clock/mt6797-clk.h @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2017 MediaTek Inc. + * Author: Kevin Chen + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _DT_BINDINGS_CLK_MT6797_H +#define _DT_BINDINGS_CLK_MT6797_H + +/* TOPCKGEN */ +#define CLK_TOP_MUX_ULPOSC_AXI_CK_MUX_PRE 1 +#define CLK_TOP_MUX_ULPOSC_AXI_CK_MUX 2 +#define CLK_TOP_MUX_AXI 3 +#define CLK_TOP_MUX_MEM 4 +#define CLK_TOP_MUX_DDRPHYCFG 5 +#define CLK_TOP_MUX_MM 6 +#define CLK_TOP_MUX_PWM 7 +#define CLK_TOP_MUX_VDEC 8 +#define CLK_TOP_MUX_VENC 9 +#define CLK_TOP_MUX_MFG 10 +#define CLK_TOP_MUX_CAMTG 11 +#define CLK_TOP_MUX_UART 12 +#define CLK_TOP_MUX_SPI 13 +#define CLK_TOP_MUX_ULPOSC_SPI_CK_MUX 14 +#define CLK_TOP_MUX_USB20 15 +#define CLK_TOP_MUX_MSDC50_0_HCLK 16 +#define CLK_TOP_MUX_MSDC50_0 17 +#define CLK_TOP_MUX_MSDC30_1 18 +#define CLK_TOP_MUX_MSDC30_2 19 +#define CLK_TOP_MUX_AUDIO 20 +#define CLK_TOP_MUX_AUD_INTBUS 21 +#define CLK_TOP_MUX_PMICSPI 22 +#define CLK_TOP_MUX_SCP 23 +#define CLK_TOP_MUX_ATB 24 +#define CLK_TOP_MUX_MJC 25 +#define CLK_TOP_MUX_DPI0 26 +#define CLK_TOP_MUX_AUD_1 27 +#define CLK_TOP_MUX_AUD_2 28 +#define CLK_TOP_MUX_SSUSB_TOP_SYS 29 +#define CLK_TOP_MUX_SPM 30 +#define CLK_TOP_MUX_BSI_SPI 31 +#define CLK_TOP_MUX_AUDIO_H 32 +#define CLK_TOP_MUX_ANC_MD32 33 +#define CLK_TOP_MUX_MFG_52M 34 +#define CLK_TOP_SYSPLL_CK 35 +#define CLK_TOP_SYSPLL_D2 36 +#define CLK_TOP_SYSPLL1_D2 37 +#define CLK_TOP_SYSPLL1_D4 38 +#define CLK_TOP_SYSPLL1_D8 39 +#define CLK_TOP_SYSPLL1_D16 40 +#define CLK_TOP_SYSPLL_D3 41 +#define CLK_TOP_SYSPLL_D3_D3 42 +#define CLK_TOP_SYSPLL2_D2 43 +#define CLK_TOP_SYSPLL2_D4 44 +#define CLK_TOP_SYSPLL2_D8 45 +#define CLK_TOP_SYSPLL_D5 46 +#define CLK_TOP_SYSPLL3_D2 47 +#define CLK_TOP_SYSPLL3_D4 48 +#define CLK_TOP_SYSPLL_D7 49 +#define CLK_TOP_SYSPLL4_D2 50 +#define CLK_TOP_SYSPLL4_D4 51 +#define CLK_TOP_UNIVPLL_CK 52 +#define CLK_TOP_UNIVPLL_D7 53 +#define CLK_TOP_UNIVPLL_D26 54 +#define CLK_TOP_SSUSB_PHY_48M_CK 55 +#define CLK_TOP_USB_PHY48M_CK 56 +#define CLK_TOP_UNIVPLL_D2 57 +#define CLK_TOP_UNIVPLL1_D2 58 +#define CLK_TOP_UNIVPLL1_D4 59 +#define CLK_TOP_UNIVPLL1_D8 60 +#define CLK_TOP_UNIVPLL_D3 61 +#define CLK_TOP_UNIVPLL2_D2 62 +#define CLK_TOP_UNIVPLL2_D4 63 +#define CLK_TOP_UNIVPLL2_D8 64 +#define CLK_TOP_UNIVPLL_D5 65 +#define CLK_TOP_UNIVPLL3_D2 66 +#define CLK_TOP_UNIVPLL3_D4 67 +#define CLK_TOP_UNIVPLL3_D8 68 +#define CLK_TOP_ULPOSC_CK_ORG 69 +#define CLK_TOP_ULPOSC_CK 70 +#define CLK_TOP_ULPOSC_D2 71 +#define CLK_TOP_ULPOSC_D3 72 +#define CLK_TOP_ULPOSC_D4 73 +#define CLK_TOP_ULPOSC_D8 74 +#define CLK_TOP_ULPOSC_D10 75 +#define CLK_TOP_APLL1_CK 76 +#define CLK_TOP_APLL2_CK 77 +#define CLK_TOP_MFGPLL_CK 78 +#define CLK_TOP_MFGPLL_D2 79 +#define CLK_TOP_IMGPLL_CK 80 +#define CLK_TOP_IMGPLL_D2 81 +#define CLK_TOP_IMGPLL_D4 82 +#define CLK_TOP_CODECPLL_CK 83 +#define CLK_TOP_CODECPLL_D2 84 +#define 
CLK_TOP_VDECPLL_CK 85 +#define CLK_TOP_TVDPLL_CK 86 +#define CLK_TOP_TVDPLL_D2 87 +#define CLK_TOP_TVDPLL_D4 88 +#define CLK_TOP_TVDPLL_D8 89 +#define CLK_TOP_TVDPLL_D16 90 +#define CLK_TOP_MSDCPLL_CK 91 +#define CLK_TOP_MSDCPLL_D2 92 +#define CLK_TOP_MSDCPLL_D4 93 +#define CLK_TOP_MSDCPLL_D8 94 +#define CLK_TOP_NR 95 + +/* APMIXED_SYS */ +#define CLK_APMIXED_MAINPLL 1 +#define CLK_APMIXED_UNIVPLL 2 +#define CLK_APMIXED_MFGPLL 3 +#define CLK_APMIXED_MSDCPLL 4 +#define CLK_APMIXED_IMGPLL 5 +#define CLK_APMIXED_TVDPLL 6 +#define CLK_APMIXED_CODECPLL 7 +#define CLK_APMIXED_VDECPLL 8 +#define CLK_APMIXED_APLL1 9 +#define CLK_APMIXED_APLL2 10 +#define CLK_APMIXED_NR 11 + +/* INFRA_SYS */ +#define CLK_INFRA_PMIC_TMR 1 +#define CLK_INFRA_PMIC_AP 2 +#define CLK_INFRA_PMIC_MD 3 +#define CLK_INFRA_PMIC_CONN 4 +#define CLK_INFRA_SCP 5 +#define CLK_INFRA_SEJ 6 +#define CLK_INFRA_APXGPT 7 +#define CLK_INFRA_SEJ_13M 8 +#define CLK_INFRA_ICUSB 9 +#define CLK_INFRA_GCE 10 +#define CLK_INFRA_THERM 11 +#define CLK_INFRA_I2C0 12 +#define CLK_INFRA_I2C1 13 +#define CLK_INFRA_I2C2 14 +#define CLK_INFRA_I2C3 15 +#define CLK_INFRA_PWM_HCLK 16 +#define CLK_INFRA_PWM1 17 +#define CLK_INFRA_PWM2 18 +#define CLK_INFRA_PWM3 19 +#define CLK_INFRA_PWM4 20 +#define CLK_INFRA_PWM 21 +#define CLK_INFRA_UART0 22 +#define CLK_INFRA_UART1 23 +#define CLK_INFRA_UART2 24 +#define CLK_INFRA_UART3 25 +#define CLK_INFRA_MD2MD_CCIF_0 26 +#define CLK_INFRA_MD2MD_CCIF_1 27 +#define CLK_INFRA_MD2MD_CCIF_2 28 +#define CLK_INFRA_FHCTL 29 +#define CLK_INFRA_BTIF 30 +#define CLK_INFRA_MD2MD_CCIF_3 31 +#define CLK_INFRA_SPI 32 +#define CLK_INFRA_MSDC0 33 +#define CLK_INFRA_MD2MD_CCIF_4 34 +#define CLK_INFRA_MSDC1 35 +#define CLK_INFRA_MSDC2 36 +#define CLK_INFRA_MD2MD_CCIF_5 37 +#define CLK_INFRA_GCPU 38 +#define CLK_INFRA_TRNG 39 +#define CLK_INFRA_AUXADC 40 +#define CLK_INFRA_CPUM 41 +#define CLK_INFRA_AP_C2K_CCIF_0 42 +#define CLK_INFRA_AP_C2K_CCIF_1 43 +#define CLK_INFRA_CLDMA 44 +#define CLK_INFRA_DISP_PWM 45 +#define CLK_INFRA_AP_DMA 46 +#define CLK_INFRA_DEVICE_APC 47 +#define CLK_INFRA_L2C_SRAM 48 +#define CLK_INFRA_CCIF_AP 49 +#define CLK_INFRA_AUDIO 50 +#define CLK_INFRA_CCIF_MD 51 +#define CLK_INFRA_DRAMC_F26M 52 +#define CLK_INFRA_I2C4 53 +#define CLK_INFRA_I2C_APPM 54 +#define CLK_INFRA_I2C_GPUPM 55 +#define CLK_INFRA_I2C2_IMM 56 +#define CLK_INFRA_I2C2_ARB 57 +#define CLK_INFRA_I2C3_IMM 58 +#define CLK_INFRA_I2C3_ARB 59 +#define CLK_INFRA_I2C5 60 +#define CLK_INFRA_SYS_CIRQ 61 +#define CLK_INFRA_SPI1 62 +#define CLK_INFRA_DRAMC_B_F26M 63 +#define CLK_INFRA_ANC_MD32 64 +#define CLK_INFRA_ANC_MD32_32K 65 +#define CLK_INFRA_DVFS_SPM1 66 +#define CLK_INFRA_AES_TOP0 67 +#define CLK_INFRA_AES_TOP1 68 +#define CLK_INFRA_SSUSB_BUS 69 +#define CLK_INFRA_SPI2 70 +#define CLK_INFRA_SPI3 71 +#define CLK_INFRA_SPI4 72 +#define CLK_INFRA_SPI5 73 +#define CLK_INFRA_IRTX 74 +#define CLK_INFRA_SSUSB_SYS 75 +#define CLK_INFRA_SSUSB_REF 76 +#define CLK_INFRA_AUDIO_26M 77 +#define CLK_INFRA_AUDIO_26M_PAD_TOP 78 +#define CLK_INFRA_MODEM_TEMP_SHARE 79 +#define CLK_INFRA_VAD_WRAP_SOC 80 +#define CLK_INFRA_DRAMC_CONF 81 +#define CLK_INFRA_DRAMC_B_CONF 82 +#define CLK_INFRA_MFG_VCG 83 +#define CLK_INFRA_13M 84 +#define CLK_INFRA_NR 85 + +/* IMG_SYS */ +#define CLK_IMG_FDVT 1 +#define CLK_IMG_DPE 2 +#define CLK_IMG_DIP 3 +#define CLK_IMG_LARB6 4 +#define CLK_IMG_NR 5 + +/* MM_SYS */ +#define CLK_MM_SMI_COMMON 1 +#define CLK_MM_SMI_LARB0 2 +#define CLK_MM_SMI_LARB5 3 +#define CLK_MM_CAM_MDP 4 +#define CLK_MM_MDP_RDMA0 5 +#define CLK_MM_MDP_RDMA1 6 
+#define CLK_MM_MDP_RSZ0 7 +#define CLK_MM_MDP_RSZ1 8 +#define CLK_MM_MDP_RSZ2 9 +#define CLK_MM_MDP_TDSHP 10 +#define CLK_MM_MDP_COLOR 11 +#define CLK_MM_MDP_WDMA 12 +#define CLK_MM_MDP_WROT0 13 +#define CLK_MM_MDP_WROT1 14 +#define CLK_MM_FAKE_ENG 15 +#define CLK_MM_DISP_OVL0 16 +#define CLK_MM_DISP_OVL1 17 +#define CLK_MM_DISP_OVL0_2L 18 +#define CLK_MM_DISP_OVL1_2L 19 +#define CLK_MM_DISP_RDMA0 20 +#define CLK_MM_DISP_RDMA1 21 +#define CLK_MM_DISP_WDMA0 22 +#define CLK_MM_DISP_WDMA1 23 +#define CLK_MM_DISP_COLOR 24 +#define CLK_MM_DISP_CCORR 25 +#define CLK_MM_DISP_AAL 26 +#define CLK_MM_DISP_GAMMA 27 +#define CLK_MM_DISP_OD 28 +#define CLK_MM_DISP_DITHER 29 +#define CLK_MM_DISP_UFOE 30 +#define CLK_MM_DISP_DSC 31 +#define CLK_MM_DISP_SPLIT 32 +#define CLK_MM_DSI0_MM_CLOCK 33 +#define CLK_MM_DSI1_MM_CLOCK 34 +#define CLK_MM_DPI_MM_CLOCK 35 +#define CLK_MM_DPI_INTERFACE_CLOCK 36 +#define CLK_MM_LARB4_AXI_ASIF_MM_CLOCK 37 +#define CLK_MM_LARB4_AXI_ASIF_MJC_CLOCK 38 +#define CLK_MM_DISP_OVL0_MOUT_CLOCK 39 +#define CLK_MM_FAKE_ENG2 40 +#define CLK_MM_DSI0_INTERFACE_CLOCK 41 +#define CLK_MM_DSI1_INTERFACE_CLOCK 42 +#define CLK_MM_NR 43 + +/* VDEC_SYS */ +#define CLK_VDEC_CKEN_ENG 1 +#define CLK_VDEC_ACTIVE 2 +#define CLK_VDEC_CKEN 3 +#define CLK_VDEC_LARB1_CKEN 4 +#define CLK_VDEC_NR 5 + +/* VENC_SYS */ +#define CLK_VENC_0 1 +#define CLK_VENC_1 2 +#define CLK_VENC_2 3 +#define CLK_VENC_3 4 +#define CLK_VENC_NR 5 + +#endif /* _DT_BINDINGS_CLK_MT6797_H */ diff --git a/include/dt-bindings/clock/r7s72100-clock.h b/include/dt-bindings/clock/r7s72100-clock.h index ce09915c298fd8179a3289ebac8b2fe101015649..bc256d31099a6eafa64f73078e87ca99b10ec0fa 100644 --- a/include/dt-bindings/clock/r7s72100-clock.h +++ b/include/dt-bindings/clock/r7s72100-clock.h @@ -29,6 +29,9 @@ #define R7S72100_CLK_OSTM0 1 #define R7S72100_CLK_OSTM1 0 +/* MSTP6 */ +#define R7S72100_CLK_RTC 0 + /* MSTP7 */ #define R7S72100_CLK_ETHER 4 @@ -49,7 +52,9 @@ #define R7S72100_CLK_SPI4 3 /* MSTP12 */ -#define R7S72100_CLK_SDHI0 3 -#define R7S72100_CLK_SDHI1 2 +#define R7S72100_CLK_SDHI00 3 +#define R7S72100_CLK_SDHI01 2 +#define R7S72100_CLK_SDHI10 1 +#define R7S72100_CLK_SDHI11 0 #endif /* __DT_BINDINGS_CLOCK_R7S72100_H__ */ diff --git a/include/dt-bindings/clock/r8a73a4-clock.h b/include/dt-bindings/clock/r8a73a4-clock.h index dd11ecdf837e8e65da9e241ef55cdfe376722eb2..4b36681572570c15874e7027b5d987dd61ecaa3d 100644 --- a/include/dt-bindings/clock/r8a73a4-clock.h +++ b/include/dt-bindings/clock/r8a73a4-clock.h @@ -54,6 +54,7 @@ #define R8A73A4_CLK_IIC3 11 #define R8A73A4_CLK_IIC4 10 #define R8A73A4_CLK_IIC5 9 +#define R8A73A4_CLK_INTC_SYS 8 #define R8A73A4_CLK_IRQC 7 /* MSTP5 */ diff --git a/include/dt-bindings/clock/r8a7790-clock.h b/include/dt-bindings/clock/r8a7790-clock.h index fa5e8da809f2e802f3d440dca3d23f1ecb666e82..20641fa68e731ec0a49f3c44b24a8629fca30dd8 100644 --- a/include/dt-bindings/clock/r8a7790-clock.h +++ b/include/dt-bindings/clock/r8a7790-clock.h @@ -82,6 +82,7 @@ /* MSTP4 */ #define R8A7790_CLK_IRQC 7 +#define R8A7790_CLK_INTC_SYS 8 /* MSTP5 */ #define R8A7790_CLK_AUDIO_DMAC1 1 diff --git a/include/dt-bindings/clock/r8a7791-clock.h b/include/dt-bindings/clock/r8a7791-clock.h index ffa11379b3f0ade901ff7d293695ca5f593cb066..adc50dc31ab3423e008efd9959655d2150749403 100644 --- a/include/dt-bindings/clock/r8a7791-clock.h +++ b/include/dt-bindings/clock/r8a7791-clock.h @@ -72,6 +72,7 @@ /* MSTP4 */ #define R8A7791_CLK_IRQC 7 +#define R8A7791_CLK_INTC_SYS 8 /* MSTP5 */ #define R8A7791_CLK_AUDIO_DMAC1 1 diff --git 
a/include/dt-bindings/clock/r8a7792-clock.h b/include/dt-bindings/clock/r8a7792-clock.h index 9a8b392ceb000fc3eebbca97d659a877542c0625..5be90bc23bd7909788e658069f385f6ebec7fc7e 100644 --- a/include/dt-bindings/clock/r8a7792-clock.h +++ b/include/dt-bindings/clock/r8a7792-clock.h @@ -17,7 +17,6 @@ #define R8A7792_CLK_PLL3 3 #define R8A7792_CLK_LB 4 #define R8A7792_CLK_QSPI 5 -#define R8A7792_CLK_Z 6 /* MSTP0 */ #define R8A7792_CLK_MSIOF0 0 @@ -45,6 +44,7 @@ /* MSTP4 */ #define R8A7792_CLK_IRQC 7 +#define R8A7792_CLK_INTC_SYS 8 /* MSTP5 */ #define R8A7792_CLK_AUDIO_DMAC0 2 diff --git a/include/dt-bindings/clock/r8a7793-clock.h b/include/dt-bindings/clock/r8a7793-clock.h index efcbc594fe82e6837a73d575c215b826dcddca6d..7318d45d4e7e9888c41aa1fb66def087b4cb6ff4 100644 --- a/include/dt-bindings/clock/r8a7793-clock.h +++ b/include/dt-bindings/clock/r8a7793-clock.h @@ -77,10 +77,11 @@ /* MSTP4 */ #define R8A7793_CLK_IRQC 7 +#define R8A7793_CLK_INTC_SYS 8 /* MSTP5 */ -#define R8A7793_CLK_AUDIO_DMAC1 1 -#define R8A7793_CLK_AUDIO_DMAC0 2 +#define R8A7793_CLK_AUDIO_DMAC1 1 +#define R8A7793_CLK_AUDIO_DMAC0 2 #define R8A7793_CLK_ADSP_MOD 6 #define R8A7793_CLK_THERMAL 22 #define R8A7793_CLK_PWM 23 diff --git a/include/dt-bindings/clock/r8a7794-clock.h b/include/dt-bindings/clock/r8a7794-clock.h index 88e64846cf370ff4a32c8423b5d00833c8cb5a81..93e99c3ffc8dafebef94d0644cb15532c86ab1b4 100644 --- a/include/dt-bindings/clock/r8a7794-clock.h +++ b/include/dt-bindings/clock/r8a7794-clock.h @@ -64,6 +64,7 @@ /* MSTP4 */ #define R8A7794_CLK_IRQC 7 +#define R8A7794_CLK_INTC_SYS 8 /* MSTP5 */ #define R8A7794_CLK_AUDIO_DMAC0 2 @@ -81,6 +82,7 @@ #define R8A7794_CLK_SCIF2 19 #define R8A7794_CLK_SCIF1 20 #define R8A7794_CLK_SCIF0 21 +#define R8A7794_CLK_DU1 23 #define R8A7794_CLK_DU0 24 /* MSTP8 */ diff --git a/include/dt-bindings/clock/r8a7795-cpg-mssr.h b/include/dt-bindings/clock/r8a7795-cpg-mssr.h index e864aae0a2561c4bc3fac807c0edfc9d25b0a8dd..f047eaf261f34ac783b2187997894daffe552572 100644 --- a/include/dt-bindings/clock/r8a7795-cpg-mssr.h +++ b/include/dt-bindings/clock/r8a7795-cpg-mssr.h @@ -60,4 +60,11 @@ #define R8A7795_CLK_R 45 #define R8A7795_CLK_OSC 46 +/* r8a7795 ES2.0 CPG Core Clocks */ +#define R8A7795_CLK_S0D2 47 +#define R8A7795_CLK_S0D3 48 +#define R8A7795_CLK_S0D6 49 +#define R8A7795_CLK_S0D8 50 +#define R8A7795_CLK_S0D12 51 + #endif /* __DT_BINDINGS_CLOCK_R8A7795_CPG_MSSR_H__ */ diff --git a/include/dt-bindings/clock/rk1108-cru.h b/include/dt-bindings/clock/rk1108-cru.h deleted file mode 100644 index 9350a5527a36c9dd05537b9f04792ebfbdb100ca..0000000000000000000000000000000000000000 --- a/include/dt-bindings/clock/rk1108-cru.h +++ /dev/null @@ -1,269 +0,0 @@ -/* - * Copyright (c) 2016 Rockchip Electronics Co. Ltd. - * Author: Shawn Lin - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
- */ - -#ifndef _DT_BINDINGS_CLK_ROCKCHIP_RK1108_H -#define _DT_BINDINGS_CLK_ROCKCHIP_RK1108_H - -/* pll id */ -#define PLL_APLL 0 -#define PLL_DPLL 1 -#define PLL_GPLL 2 -#define ARMCLK 3 - -/* sclk gates (special clocks) */ -#define SCLK_SPI0 65 -#define SCLK_NANDC 67 -#define SCLK_SDMMC 68 -#define SCLK_SDIO 69 -#define SCLK_EMMC 71 -#define SCLK_UART0 72 -#define SCLK_UART1 73 -#define SCLK_UART2 74 -#define SCLK_I2S0 75 -#define SCLK_I2S1 76 -#define SCLK_I2S2 77 -#define SCLK_TIMER0 78 -#define SCLK_TIMER1 79 -#define SCLK_SFC 80 -#define SCLK_SDMMC_DRV 81 -#define SCLK_SDIO_DRV 82 -#define SCLK_EMMC_DRV 83 -#define SCLK_SDMMC_SAMPLE 84 -#define SCLK_SDIO_SAMPLE 85 -#define SCLK_EMMC_SAMPLE 86 - -/* aclk gates */ -#define ACLK_DMAC 192 -#define ACLK_PRE 193 -#define ACLK_CORE 194 -#define ACLK_ENMCORE 195 - -/* pclk gates */ -#define PCLK_GPIO1 256 -#define PCLK_GPIO2 257 -#define PCLK_GPIO3 258 -#define PCLK_GRF 259 -#define PCLK_I2C1 260 -#define PCLK_I2C2 261 -#define PCLK_I2C3 262 -#define PCLK_SPI 263 -#define PCLK_SFC 264 -#define PCLK_UART0 265 -#define PCLK_UART1 266 -#define PCLK_UART2 267 -#define PCLK_TSADC 268 -#define PCLK_PWM 269 -#define PCLK_TIMER 270 -#define PCLK_PERI 271 - -/* hclk gates */ -#define HCLK_I2S0_8CH 320 -#define HCLK_I2S1_8CH 321 -#define HCLK_I2S2_2CH 322 -#define HCLK_NANDC 323 -#define HCLK_SDMMC 324 -#define HCLK_SDIO 325 -#define HCLK_EMMC 326 -#define HCLK_PERI 327 -#define HCLK_SFC 328 - -#define CLK_NR_CLKS (HCLK_SFC + 1) - -/* reset id */ -#define SRST_CORE_PO_AD 0 -#define SRST_CORE_AD 1 -#define SRST_L2_AD 2 -#define SRST_CPU_NIU_AD 3 -#define SRST_CORE_PO 4 -#define SRST_CORE 5 -#define SRST_L2 6 -#define SRST_CORE_DBG 8 -#define PRST_DBG 9 -#define RST_DAP 10 -#define PRST_DBG_NIU 11 -#define ARST_STRC_SYS_AD 15 - -#define SRST_DDRPHY_CLKDIV 16 -#define SRST_DDRPHY 17 -#define PRST_DDRPHY 18 -#define PRST_HDMIPHY 19 -#define PRST_VDACPHY 20 -#define PRST_VADCPHY 21 -#define PRST_MIPI_CSI_PHY 22 -#define PRST_MIPI_DSI_PHY 23 -#define PRST_ACODEC 24 -#define ARST_BUS_NIU 25 -#define PRST_TOP_NIU 26 -#define ARST_INTMEM 27 -#define HRST_ROM 28 -#define ARST_DMAC 29 -#define SRST_MSCH_NIU 30 -#define PRST_MSCH_NIU 31 - -#define PRST_DDRUPCTL 32 -#define NRST_DDRUPCTL 33 -#define PRST_DDRMON 34 -#define HRST_I2S0_8CH 35 -#define MRST_I2S0_8CH 36 -#define HRST_I2S1_2CH 37 -#define MRST_IS21_2CH 38 -#define HRST_I2S2_2CH 39 -#define MRST_I2S2_2CH 40 -#define HRST_CRYPTO 41 -#define SRST_CRYPTO 42 -#define PRST_SPI 43 -#define SRST_SPI 44 -#define PRST_UART0 45 -#define PRST_UART1 46 -#define PRST_UART2 47 - -#define SRST_UART0 48 -#define SRST_UART1 49 -#define SRST_UART2 50 -#define PRST_I2C1 51 -#define PRST_I2C2 52 -#define PRST_I2C3 53 -#define SRST_I2C1 54 -#define SRST_I2C2 55 -#define SRST_I2C3 56 -#define PRST_PWM1 58 -#define SRST_PWM1 60 -#define PRST_WDT 61 -#define PRST_GPIO1 62 -#define PRST_GPIO2 63 - -#define PRST_GPIO3 64 -#define PRST_GRF 65 -#define PRST_EFUSE 66 -#define PRST_EFUSE512 67 -#define PRST_TIMER0 68 -#define SRST_TIMER0 69 -#define SRST_TIMER1 70 -#define PRST_TSADC 71 -#define SRST_TSADC 72 -#define PRST_SARADC 73 -#define SRST_SARADC 74 -#define HRST_SYSBUS 75 -#define PRST_USBGRF 76 - -#define ARST_PERIPH_NIU 80 -#define HRST_PERIPH_NIU 81 -#define PRST_PERIPH_NIU 82 -#define HRST_PERIPH 83 -#define HRST_SDMMC 84 -#define HRST_SDIO 85 -#define HRST_EMMC 86 -#define HRST_NANDC 87 -#define NRST_NANDC 88 -#define HRST_SFC 89 -#define SRST_SFC 90 -#define ARST_GMAC 91 -#define HRST_OTG 92 -#define SRST_OTG 93 
-#define SRST_OTG_ADP 94 -#define HRST_HOST0 95 - -#define HRST_HOST0_AUX 96 -#define HRST_HOST0_ARB 97 -#define SRST_HOST0_EHCIPHY 98 -#define SRST_HOST0_UTMI 99 -#define SRST_USBPOR 100 -#define SRST_UTMI0 101 -#define SRST_UTMI1 102 - -#define ARST_VIO0_NIU 102 -#define ARST_VIO1_NIU 103 -#define HRST_VIO_NIU 104 -#define PRST_VIO_NIU 105 -#define ARST_VOP 106 -#define HRST_VOP 107 -#define DRST_VOP 108 -#define ARST_IEP 109 -#define HRST_IEP 110 -#define ARST_RGA 111 -#define HRST_RGA 112 -#define SRST_RGA 113 -#define PRST_CVBS 114 -#define PRST_HDMI 115 -#define SRST_HDMI 116 -#define PRST_MIPI_DSI 117 - -#define ARST_ISP_NIU 118 -#define HRST_ISP_NIU 119 -#define HRST_ISP 120 -#define SRST_ISP 121 -#define ARST_VIP0 122 -#define HRST_VIP0 123 -#define PRST_VIP0 124 -#define ARST_VIP1 125 -#define HRST_VIP1 126 -#define PRST_VIP1 127 -#define ARST_VIP2 128 -#define HRST_VIP2 129 -#define PRST_VIP2 120 -#define ARST_VIP3 121 -#define HRST_VIP3 122 -#define PRST_VIP4 123 - -#define PRST_CIF1TO4 124 -#define SRST_CVBS_CLK 125 -#define HRST_CVBS 126 - -#define ARST_VPU_NIU 140 -#define HRST_VPU_NIU 141 -#define ARST_VPU 142 -#define HRST_VPU 143 -#define ARST_RKVDEC_NIU 144 -#define HRST_RKVDEC_NIU 145 -#define ARST_RKVDEC 146 -#define HRST_RKVDEC 147 -#define SRST_RKVDEC_CABAC 148 -#define SRST_RKVDEC_CORE 149 -#define ARST_RKVENC_NIU 150 -#define HRST_RKVENC_NIU 151 -#define ARST_RKVENC 152 -#define HRST_RKVENC 153 -#define SRST_RKVENC_CORE 154 - -#define SRST_DSP_CORE 156 -#define SRST_DSP_SYS 157 -#define SRST_DSP_GLOBAL 158 -#define SRST_DSP_OECM 159 -#define PRST_DSP_IOP_NIU 160 -#define ARST_DSP_EPP_NIU 161 -#define ARST_DSP_EDP_NIU 162 -#define PRST_DSP_DBG_NIU 163 -#define PRST_DSP_CFG_NIU 164 -#define PRST_DSP_GRF 165 -#define PRST_DSP_MAILBOX 166 -#define PRST_DSP_INTC 167 -#define PRST_DSP_PFM_MON 169 -#define SRST_DSP_PFM_MON 170 -#define ARST_DSP_EDAP_NIU 171 - -#define SRST_PMU 172 -#define SRST_PMU_I2C0 173 -#define PRST_PMU_I2C0 174 -#define PRST_PMU_GPIO0 175 -#define PRST_PMU_INTMEM 176 -#define PRST_PMU_PWM0 177 -#define SRST_PMU_PWM0 178 -#define PRST_PMU_GRF 179 -#define SRST_PMU_NIU 180 -#define SRST_PMU_PVTM 181 -#define ARST_DSP_EDP_PERF 184 -#define ARST_DSP_EPP_PERF 185 - -#endif /* _DT_BINDINGS_CLK_ROCKCHIP_RK1108_H */ diff --git a/include/dt-bindings/clock/rk3328-cru.h b/include/dt-bindings/clock/rk3328-cru.h index ee702c8e4c091532480561cd5759a32f5fad8c01..d2b26a4b43ebde2eae06c6fd596895e5147adc76 100644 --- a/include/dt-bindings/clock/rk3328-cru.h +++ b/include/dt-bindings/clock/rk3328-cru.h @@ -97,6 +97,7 @@ #define SCLK_MAC2IO_SRC 99 #define SCLK_MAC2IO 100 #define SCLK_MAC2PHY 101 +#define SCLK_MAC2IO_EXT 102 /* dclk gates */ #define DCLK_LCDC 120 diff --git a/include/dt-bindings/clock/rk3368-cru.h b/include/dt-bindings/clock/rk3368-cru.h index 9c5dd9ba2f6cd9fa06c1870b7b0704a7eebff004..aeb83e581a11ec5af82df30cdcf167c3a45b964b 100644 --- a/include/dt-bindings/clock/rk3368-cru.h +++ b/include/dt-bindings/clock/rk3368-cru.h @@ -44,13 +44,12 @@ #define SCLK_I2S_8CH 82 #define SCLK_SPDIF_8CH 83 #define SCLK_I2S_2CH 84 -#define SCLK_TIMER0 85 -#define SCLK_TIMER1 86 -#define SCLK_TIMER2 87 -#define SCLK_TIMER3 88 -#define SCLK_TIMER4 89 -#define SCLK_TIMER5 90 -#define SCLK_TIMER6 91 +#define SCLK_TIMER00 85 +#define SCLK_TIMER01 86 +#define SCLK_TIMER02 87 +#define SCLK_TIMER03 88 +#define SCLK_TIMER04 89 +#define SCLK_TIMER05 90 #define SCLK_OTGPHY0 93 #define SCLK_OTG_ADP 96 #define SCLK_HSICPHY480M 97 @@ -82,6 +81,12 @@ #define SCLK_SFC 126 #define SCLK_MAC 
127
#define SCLK_MACREF_OUT 128
+#define SCLK_TIMER10 133
+#define SCLK_TIMER11 134
+#define SCLK_TIMER12 135
+#define SCLK_TIMER13 136
+#define SCLK_TIMER14 137
+#define SCLK_TIMER15 138

#define DCLK_VOP 190
#define MCLK_CRYPTO 191
diff --git a/include/dt-bindings/clock/rv1108-cru.h b/include/dt-bindings/clock/rv1108-cru.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae26f81059142867a009197e1c37c8e00d3dbd21
--- /dev/null
+++ b/include/dt-bindings/clock/rv1108-cru.h
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2016 Rockchip Electronics Co. Ltd.
+ * Author: Shawn Lin
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _DT_BINDINGS_CLK_ROCKCHIP_RV1108_H
+#define _DT_BINDINGS_CLK_ROCKCHIP_RV1108_H
+
+/* pll id */
+#define PLL_APLL 0
+#define PLL_DPLL 1
+#define PLL_GPLL 2
+#define ARMCLK 3
+
+/* sclk gates (special clocks) */
+#define SCLK_SPI0 65
+#define SCLK_NANDC 67
+#define SCLK_SDMMC 68
+#define SCLK_SDIO 69
+#define SCLK_EMMC 71
+#define SCLK_UART0 72
+#define SCLK_UART1 73
+#define SCLK_UART2 74
+#define SCLK_I2S0 75
+#define SCLK_I2S1 76
+#define SCLK_I2S2 77
+#define SCLK_TIMER0 78
+#define SCLK_TIMER1 79
+#define SCLK_SFC 80
+#define SCLK_SDMMC_DRV 81
+#define SCLK_SDIO_DRV 82
+#define SCLK_EMMC_DRV 83
+#define SCLK_SDMMC_SAMPLE 84
+#define SCLK_SDIO_SAMPLE 85
+#define SCLK_EMMC_SAMPLE 86
+
+/* aclk gates */
+#define ACLK_DMAC 192
+#define ACLK_PRE 193
+#define ACLK_CORE 194
+#define ACLK_ENMCORE 195
+
+/* pclk gates */
+#define PCLK_GPIO1 256
+#define PCLK_GPIO2 257
+#define PCLK_GPIO3 258
+#define PCLK_GRF 259
+#define PCLK_I2C1 260
+#define PCLK_I2C2 261
+#define PCLK_I2C3 262
+#define PCLK_SPI 263
+#define PCLK_SFC 264
+#define PCLK_UART0 265
+#define PCLK_UART1 266
+#define PCLK_UART2 267
+#define PCLK_TSADC 268
+#define PCLK_PWM 269
+#define PCLK_TIMER 270
+#define PCLK_PERI 271
+
+/* hclk gates */
+#define HCLK_I2S0_8CH 320
+#define HCLK_I2S1_8CH 321
+#define HCLK_I2S2_2CH 322
+#define HCLK_NANDC 323
+#define HCLK_SDMMC 324
+#define HCLK_SDIO 325
+#define HCLK_EMMC 326
+#define HCLK_PERI 327
+#define HCLK_SFC 328
+
+#define CLK_NR_CLKS (HCLK_SFC + 1)
+
+/* reset id */
+#define SRST_CORE_PO_AD 0
+#define SRST_CORE_AD 1
+#define SRST_L2_AD 2
+#define SRST_CPU_NIU_AD 3
+#define SRST_CORE_PO 4
+#define SRST_CORE 5
+#define SRST_L2 6
+#define SRST_CORE_DBG 8
+#define PRST_DBG 9
+#define RST_DAP 10
+#define PRST_DBG_NIU 11
+#define ARST_STRC_SYS_AD 15
+
+#define SRST_DDRPHY_CLKDIV 16
+#define SRST_DDRPHY 17
+#define PRST_DDRPHY 18
+#define PRST_HDMIPHY 19
+#define PRST_VDACPHY 20
+#define PRST_VADCPHY 21
+#define PRST_MIPI_CSI_PHY 22
+#define PRST_MIPI_DSI_PHY 23
+#define PRST_ACODEC 24
+#define ARST_BUS_NIU 25
+#define PRST_TOP_NIU 26
+#define ARST_INTMEM 27
+#define HRST_ROM 28
+#define ARST_DMAC 29
+#define SRST_MSCH_NIU 30
+#define PRST_MSCH_NIU 31
+
+#define PRST_DDRUPCTL 32
+#define NRST_DDRUPCTL 33
+#define PRST_DDRMON 34
+#define HRST_I2S0_8CH 35
+#define MRST_I2S0_8CH 36
+#define HRST_I2S1_2CH 37
+#define MRST_I2S1_2CH 38
+#define HRST_I2S2_2CH 39
+#define MRST_I2S2_2CH 40
+#define 
HRST_CRYPTO 41 +#define SRST_CRYPTO 42 +#define PRST_SPI 43 +#define SRST_SPI 44 +#define PRST_UART0 45 +#define PRST_UART1 46 +#define PRST_UART2 47 + +#define SRST_UART0 48 +#define SRST_UART1 49 +#define SRST_UART2 50 +#define PRST_I2C1 51 +#define PRST_I2C2 52 +#define PRST_I2C3 53 +#define SRST_I2C1 54 +#define SRST_I2C2 55 +#define SRST_I2C3 56 +#define PRST_PWM1 58 +#define SRST_PWM1 60 +#define PRST_WDT 61 +#define PRST_GPIO1 62 +#define PRST_GPIO2 63 + +#define PRST_GPIO3 64 +#define PRST_GRF 65 +#define PRST_EFUSE 66 +#define PRST_EFUSE512 67 +#define PRST_TIMER0 68 +#define SRST_TIMER0 69 +#define SRST_TIMER1 70 +#define PRST_TSADC 71 +#define SRST_TSADC 72 +#define PRST_SARADC 73 +#define SRST_SARADC 74 +#define HRST_SYSBUS 75 +#define PRST_USBGRF 76 + +#define ARST_PERIPH_NIU 80 +#define HRST_PERIPH_NIU 81 +#define PRST_PERIPH_NIU 82 +#define HRST_PERIPH 83 +#define HRST_SDMMC 84 +#define HRST_SDIO 85 +#define HRST_EMMC 86 +#define HRST_NANDC 87 +#define NRST_NANDC 88 +#define HRST_SFC 89 +#define SRST_SFC 90 +#define ARST_GMAC 91 +#define HRST_OTG 92 +#define SRST_OTG 93 +#define SRST_OTG_ADP 94 +#define HRST_HOST0 95 + +#define HRST_HOST0_AUX 96 +#define HRST_HOST0_ARB 97 +#define SRST_HOST0_EHCIPHY 98 +#define SRST_HOST0_UTMI 99 +#define SRST_USBPOR 100 +#define SRST_UTMI0 101 +#define SRST_UTMI1 102 + +#define ARST_VIO0_NIU 102 +#define ARST_VIO1_NIU 103 +#define HRST_VIO_NIU 104 +#define PRST_VIO_NIU 105 +#define ARST_VOP 106 +#define HRST_VOP 107 +#define DRST_VOP 108 +#define ARST_IEP 109 +#define HRST_IEP 110 +#define ARST_RGA 111 +#define HRST_RGA 112 +#define SRST_RGA 113 +#define PRST_CVBS 114 +#define PRST_HDMI 115 +#define SRST_HDMI 116 +#define PRST_MIPI_DSI 117 + +#define ARST_ISP_NIU 118 +#define HRST_ISP_NIU 119 +#define HRST_ISP 120 +#define SRST_ISP 121 +#define ARST_VIP0 122 +#define HRST_VIP0 123 +#define PRST_VIP0 124 +#define ARST_VIP1 125 +#define HRST_VIP1 126 +#define PRST_VIP1 127 +#define ARST_VIP2 128 +#define HRST_VIP2 129 +#define PRST_VIP2 120 +#define ARST_VIP3 121 +#define HRST_VIP3 122 +#define PRST_VIP4 123 + +#define PRST_CIF1TO4 124 +#define SRST_CVBS_CLK 125 +#define HRST_CVBS 126 + +#define ARST_VPU_NIU 140 +#define HRST_VPU_NIU 141 +#define ARST_VPU 142 +#define HRST_VPU 143 +#define ARST_RKVDEC_NIU 144 +#define HRST_RKVDEC_NIU 145 +#define ARST_RKVDEC 146 +#define HRST_RKVDEC 147 +#define SRST_RKVDEC_CABAC 148 +#define SRST_RKVDEC_CORE 149 +#define ARST_RKVENC_NIU 150 +#define HRST_RKVENC_NIU 151 +#define ARST_RKVENC 152 +#define HRST_RKVENC 153 +#define SRST_RKVENC_CORE 154 + +#define SRST_DSP_CORE 156 +#define SRST_DSP_SYS 157 +#define SRST_DSP_GLOBAL 158 +#define SRST_DSP_OECM 159 +#define PRST_DSP_IOP_NIU 160 +#define ARST_DSP_EPP_NIU 161 +#define ARST_DSP_EDP_NIU 162 +#define PRST_DSP_DBG_NIU 163 +#define PRST_DSP_CFG_NIU 164 +#define PRST_DSP_GRF 165 +#define PRST_DSP_MAILBOX 166 +#define PRST_DSP_INTC 167 +#define PRST_DSP_PFM_MON 169 +#define SRST_DSP_PFM_MON 170 +#define ARST_DSP_EDAP_NIU 171 + +#define SRST_PMU 172 +#define SRST_PMU_I2C0 173 +#define PRST_PMU_I2C0 174 +#define PRST_PMU_GPIO0 175 +#define PRST_PMU_INTMEM 176 +#define PRST_PMU_PWM0 177 +#define SRST_PMU_PWM0 178 +#define PRST_PMU_GRF 179 +#define SRST_PMU_NIU 180 +#define SRST_PMU_PVTM 181 +#define ARST_DSP_EDP_PERF 184 +#define ARST_DSP_EPP_PERF 185 + +#endif /* _DT_BINDINGS_CLK_ROCKCHIP_RV1108_H */ diff --git a/include/dt-bindings/clock/sun8i-h3-ccu.h b/include/dt-bindings/clock/sun8i-h3-ccu.h index 
efb7ba2bd51510533006efe7f0c9f9dcb016e24c..c2afc41d69644af3d9f920a56341d62003630883 100644 --- a/include/dt-bindings/clock/sun8i-h3-ccu.h +++ b/include/dt-bindings/clock/sun8i-h3-ccu.h @@ -91,7 +91,7 @@ #define CLK_BUS_UART1 63 #define CLK_BUS_UART2 64 #define CLK_BUS_UART3 65 -#define CLK_BUS_SCR 66 +#define CLK_BUS_SCR0 66 #define CLK_BUS_EPHY 67 #define CLK_BUS_DBG 68 @@ -142,4 +142,7 @@ #define CLK_GPU 114 +/* New clocks imported in H5 */ +#define CLK_BUS_SCR1 115 + #endif /* _DT_BINDINGS_CLK_SUN8I_H3_H_ */ diff --git a/include/dt-bindings/clock/sun8i-r-ccu.h b/include/dt-bindings/clock/sun8i-r-ccu.h new file mode 100644 index 0000000000000000000000000000000000000000..779d20aa0d05cffd8d565f5d570be52f8b94cee6 --- /dev/null +++ b/include/dt-bindings/clock/sun8i-r-ccu.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2016 Icenowy Zheng + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */
+
+#ifndef _DT_BINDINGS_CLK_SUN8I_R_CCU_H_
+#define _DT_BINDINGS_CLK_SUN8I_R_CCU_H_
+
+#define CLK_AR100 0
+
+#define CLK_APB0_PIO 3
+#define CLK_APB0_IR 4
+#define CLK_APB0_TIMER 5
+#define CLK_APB0_RSB 6
+#define CLK_APB0_UART 7
+/* 8 is reserved for CLK_APB0_W1 on A31 */
+#define CLK_APB0_I2C 9
+#define CLK_APB0_TWD 10
+
+#define CLK_IR 11
+
+#endif /* _DT_BINDINGS_CLK_SUN8I_R_CCU_H_ */
diff --git a/include/dt-bindings/clock/tegra114-car.h b/include/dt-bindings/clock/tegra114-car.h
index 534c03f8ad72bc23675e0a4fe7761b2257f1b741..ed5ca218c8573e735ccb3fc96ebd9c0ad43dafc5 100644
--- a/include/dt-bindings/clock/tegra114-car.h
+++ b/include/dt-bindings/clock/tegra114-car.h
@@ -156,7 +156,7 @@
 /* 133 */
 /* 134 */
 /* 135 */
-/* 136 */
+#define TEGRA114_CLK_CEC 136
 /* 137 */
 /* 138 */
 /* 139 */
diff --git a/include/dt-bindings/clock/tegra124-car-common.h b/include/dt-bindings/clock/tegra124-car-common.h
index a2156090563f357ae6076a08639f14cb79469671..9352c7e2ce0ba6349b1d298b8356e2c04ba742fe 100644
--- a/include/dt-bindings/clock/tegra124-car-common.h
+++ b/include/dt-bindings/clock/tegra124-car-common.h
@@ -156,7 +156,7 @@
 /* 133 */
 /* 134 */
 /* 135 */
-/* 136 */
+#define TEGRA124_CLK_CEC 136
 /* 137 */
 /* 138 */
 /* 139 */
diff --git a/include/dt-bindings/clock/tegra210-car.h b/include/dt-bindings/clock/tegra210-car.h
index 35288b20f2c9cffcaebdf5075946b25dce6fad4b..46689cd3750bf11aed2115f4d34df9f1642ce265 100644
--- a/include/dt-bindings/clock/tegra210-car.h
+++ b/include/dt-bindings/clock/tegra210-car.h
@@ -39,7 +39,7 @@
 /* 20 (register bit affects vi and vi_sensor) */
 /* 21 */
 #define TEGRA210_CLK_USBD 22
-#define TEGRA210_CLK_ISP 23
+#define TEGRA210_CLK_ISPA 23
 /* 24 */
 /* 25 */
 #define TEGRA210_CLK_DISP2 26
@@ -156,7 +156,7 @@
 /* 133 */
 /* 134 */
 /* 135 */
-/* 136 */
+#define TEGRA210_CLK_CEC 136
 /* 137 */
 /* 138 */
 /* 139 */
@@ -173,7 +173,7 @@
 #define TEGRA210_CLK_ENTROPY 149
 /* 150 */
 /* 151 */
-/* 152 */
+#define TEGRA210_CLK_DP2 152
 /* 153 */
 /* 154 */
 /* 155 (bit affects dfll_ref and dfll_soc) */
@@ -210,7 +210,7 @@
 #define TEGRA210_CLK_DBGAPB 185
 /* 186 */
 #define TEGRA210_CLK_PLL_P_OUT_ADSP 187
-/* 188 */
+/* 188 (bit affects pll_a_out_adsp and pll_a_out0_out_adsp) */
 #define TEGRA210_CLK_PLL_G_REF 189
 /* 190 */
 /* 191 */
@@ -222,7 +222,7 @@
 /* 196 */
 #define TEGRA210_CLK_DMIC3 197
 #define TEGRA210_CLK_APE 198
-/* 199 */
+#define TEGRA210_CLK_ADSP 199
 /* 200 */
 /* 201 */
 #define TEGRA210_CLK_MAUD 202
@@ -241,10 +241,10 @@
 /* 215 */
 /* 216 */
 /* 217 */
-/* 218 */
+#define TEGRA210_CLK_ADSP_NEON 218
 #define TEGRA210_CLK_NVENC 219
-/* 220 */
-/* 221 */
+#define TEGRA210_CLK_IQC2 220
+#define TEGRA210_CLK_IQC1 221
 #define TEGRA210_CLK_SOR_SAFE 222
 #define TEGRA210_CLK_PLL_P_OUT_CPU 223
@@ -349,9 +349,9 @@
 #define TEGRA210_CLK_PLL_RE_OUT1 319
 /* 320 */
 /* 321 */
-/* 322 */
-/* 323 */
-/* 324 */
+#define TEGRA210_CLK_ISP 322
+#define TEGRA210_CLK_PLL_A_OUT_ADSP 323
+#define TEGRA210_CLK_PLL_A_OUT0_OUT_ADSP 324
 /* 325 */
 /* 326 */
 /* 327 */
@@ -396,6 +396,15 @@
 #define TEGRA210_CLK_PLL_C_UD 364
 #define TEGRA210_CLK_SCLK_MUX 365
 
-#define TEGRA210_CLK_CLK_MAX 366
+#define TEGRA210_CLK_ACLK 370
+
+#define TEGRA210_CLK_DMIC1_SYNC_CLK 388
+#define TEGRA210_CLK_DMIC1_SYNC_CLK_MUX 389
+#define TEGRA210_CLK_DMIC2_SYNC_CLK 390
+#define TEGRA210_CLK_DMIC2_SYNC_CLK_MUX 391
+#define TEGRA210_CLK_DMIC3_SYNC_CLK 392
+#define TEGRA210_CLK_DMIC3_SYNC_CLK_MUX 393
+
+#define TEGRA210_CLK_CLK_MAX 394
 
 #endif /* _DT_BINDINGS_CLOCK_TEGRA210_CAR_H */
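Most of the IDs in these Tegra CAR headers are not arbitrary: the values below 224 appear to index bits in the controller's seven 32-bit peripheral enable/reset register banks, which is what comments such as "155 (bit affects dfll_ref and dfll_soc)" refer to, and why the companion reset header later in this series defines TEGRA210_RESET(x) as (7 * 32 + (x)). A small stand-alone sketch of that bank/bit arithmetic, illustrative only and assuming the bank * 32 + bit convention:

#include <stdio.h>

/* Mirrors TEGRA210_RESET() from dt-bindings/reset/tegra210-car.h:
 * IDs from 7 * 32 = 224 upward fall past the seven hardware banks. */
#define TEGRA210_RESET(x) (7 * 32 + (x))

int main(void)
{
	unsigned int ids[] = {
		136,			/* TEGRA114/124/210_CLK_CEC above */
		TEGRA210_RESET(1),	/* TEGRA210_RST_ADSP */
	};

	for (unsigned int i = 0; i < sizeof(ids) / sizeof(ids[0]); i++)
		printf("ID %3u -> bank %u, bit %2u\n",
		       ids[i], ids[i] / 32, ids[i] % 32);
	/* Prints: ID 136 -> bank 4, bit  8
	 *         ID 225 -> bank 7, bit  1 */
	return 0;
}

diff --git a/include/dt-bindings/clock/tegra30-car.h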
b/include/dt-bindings/clock/tegra30-car.h index 889e49ba0aa3de3f3b83ad27b1d0f4b12521a05a..7213354b9652c77f99fbd41a9a4c26d0bfb4662f 100644 --- a/include/dt-bindings/clock/tegra30-car.h +++ b/include/dt-bindings/clock/tegra30-car.h @@ -156,7 +156,7 @@ /* 133 */ /* 134 */ /* 135 */ -/* 136 */ +#define TEGRA30_CLK_CEC 136 /* 137 */ /* 138 */ /* 139 */ diff --git a/include/dt-bindings/genpd/k2g.h b/include/dt-bindings/genpd/k2g.h new file mode 100644 index 0000000000000000000000000000000000000000..1f31f17e19ebca5e528ad9498047ec9bd973d3c4 --- /dev/null +++ b/include/dt-bindings/genpd/k2g.h @@ -0,0 +1,90 @@ +/* + * TI K2G SoC Device definitions + * + * Copyright (C) 2015-2017 Texas Instruments Incorporated - http://www.ti.com/ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#ifndef _DT_BINDINGS_GENPD_K2G_H +#define _DT_BINDINGS_GENPD_K2G_H + +/* Documented in http://processors.wiki.ti.com/index.php/TISCI */ + +#define K2G_DEV_PMMC0 0x0000 +#define K2G_DEV_MLB0 0x0001 +#define K2G_DEV_DSS0 0x0002 +#define K2G_DEV_MCBSP0 0x0003 +#define K2G_DEV_MCASP0 0x0004 +#define K2G_DEV_MCASP1 0x0005 +#define K2G_DEV_MCASP2 0x0006 +#define K2G_DEV_DCAN0 0x0008 +#define K2G_DEV_DCAN1 0x0009 +#define K2G_DEV_EMIF0 0x000a +#define K2G_DEV_MMCHS0 0x000b +#define K2G_DEV_MMCHS1 0x000c +#define K2G_DEV_GPMC0 0x000d +#define K2G_DEV_ELM0 0x000e +#define K2G_DEV_SPI0 0x0010 +#define K2G_DEV_SPI1 0x0011 +#define K2G_DEV_SPI2 0x0012 +#define K2G_DEV_SPI3 0x0013 +#define K2G_DEV_ICSS0 0x0014 +#define K2G_DEV_ICSS1 0x0015 +#define K2G_DEV_USB0 0x0016 +#define K2G_DEV_USB1 0x0017 +#define K2G_DEV_NSS0 0x0018 +#define K2G_DEV_PCIE0 0x0019 +#define K2G_DEV_GPIO0 0x001b +#define K2G_DEV_GPIO1 0x001c +#define K2G_DEV_TIMER64_0 0x001d +#define K2G_DEV_TIMER64_1 0x001e +#define K2G_DEV_TIMER64_2 0x001f +#define K2G_DEV_TIMER64_3 0x0020 +#define K2G_DEV_TIMER64_4 0x0021 +#define K2G_DEV_TIMER64_5 0x0022 +#define K2G_DEV_TIMER64_6 0x0023 +#define K2G_DEV_MSGMGR0 0x0025 +#define K2G_DEV_BOOTCFG0 0x0026 +#define K2G_DEV_ARM_BOOTROM0 0x0027 +#define K2G_DEV_DSP_BOOTROM0 0x0029 +#define K2G_DEV_DEBUGSS0 0x002b +#define K2G_DEV_UART0 0x002c +#define K2G_DEV_UART1 0x002d +#define K2G_DEV_UART2 0x002e +#define K2G_DEV_EHRPWM0 0x002f +#define K2G_DEV_EHRPWM1 0x0030 +#define K2G_DEV_EHRPWM2 0x0031 +#define K2G_DEV_EHRPWM3 0x0032 +#define K2G_DEV_EHRPWM4 0x0033 +#define K2G_DEV_EHRPWM5 0x0034 +#define K2G_DEV_EQEP0 0x0035 +#define K2G_DEV_EQEP1 0x0036 +#define K2G_DEV_EQEP2 0x0037 +#define K2G_DEV_ECAP0 0x0038 +#define K2G_DEV_ECAP1 0x0039 +#define K2G_DEV_I2C0 0x003a +#define K2G_DEV_I2C1 0x003b +#define K2G_DEV_I2C2 0x003c +#define K2G_DEV_EDMA0 0x003f +#define K2G_DEV_SEMAPHORE0 0x0040 +#define K2G_DEV_INTC0 0x0041 +#define K2G_DEV_GIC0 0x0042 +#define K2G_DEV_QSPI0 0x0043 +#define K2G_DEV_ARM_64B_COUNTER0 0x0044 +#define K2G_DEV_TETRIS0 0x0045 +#define K2G_DEV_CGEM0 0x0046 +#define K2G_DEV_MSMC0 0x0047 +#define K2G_DEV_CBASS0 0x0049 +#define K2G_DEV_BOARD0 0x004c +#define K2G_DEV_EDMA1 0x004f + +#endif diff --git a/include/dt-bindings/gpio/gpio.h 
b/include/dt-bindings/gpio/gpio.h
index c673d2c87c604ae86fc6bfd523a580308752148a..b4f54da694eb4a65ecff5a6a34c3ed2db1c088db 100644
--- a/include/dt-bindings/gpio/gpio.h
+++ b/include/dt-bindings/gpio/gpio.h
@@ -17,11 +17,15 @@
 #define GPIO_PUSH_PULL 0
 #define GPIO_SINGLE_ENDED 2
 
+/* Bit 2 expresses open drain or open source */
+#define GPIO_LINE_OPEN_SOURCE 0
+#define GPIO_LINE_OPEN_DRAIN 4
+
 /*
- * Open Drain/Collector is the combination of single-ended active low,
- * Open Source/Emitter is the combination of single-ended active high.
+ * Open Drain/Collector is the single-ended interface with an open drain output.
+ * Open Source/Emitter is the single-ended interface with an open source output.
 */
-#define GPIO_OPEN_DRAIN (GPIO_SINGLE_ENDED | GPIO_ACTIVE_LOW)
-#define GPIO_OPEN_SOURCE (GPIO_SINGLE_ENDED | GPIO_ACTIVE_HIGH)
+#define GPIO_OPEN_DRAIN (GPIO_SINGLE_ENDED | GPIO_LINE_OPEN_DRAIN)
+#define GPIO_OPEN_SOURCE (GPIO_SINGLE_ENDED | GPIO_LINE_OPEN_SOURCE)
 
 #endif
diff --git a/include/dt-bindings/mfd/stm32f7-rcc.h b/include/dt-bindings/mfd/stm32f7-rcc.h
new file mode 100644
index 0000000000000000000000000000000000000000..e36cc69959c76bc3b75f3aac20176d1cbdbd3e0a
--- /dev/null
+++ b/include/dt-bindings/mfd/stm32f7-rcc.h
@@ -0,0 +1,112 @@
+/*
+ * This header provides constants for the STM32F7 RCC IP
+ */
+
+#ifndef _DT_BINDINGS_MFD_STM32F7_RCC_H
+#define _DT_BINDINGS_MFD_STM32F7_RCC_H
+
+/* AHB1 */
+#define STM32F7_RCC_AHB1_GPIOA 0
+#define STM32F7_RCC_AHB1_GPIOB 1
+#define STM32F7_RCC_AHB1_GPIOC 2
+#define STM32F7_RCC_AHB1_GPIOD 3
+#define STM32F7_RCC_AHB1_GPIOE 4
+#define STM32F7_RCC_AHB1_GPIOF 5
+#define STM32F7_RCC_AHB1_GPIOG 6
+#define STM32F7_RCC_AHB1_GPIOH 7
+#define STM32F7_RCC_AHB1_GPIOI 8
+#define STM32F7_RCC_AHB1_GPIOJ 9
+#define STM32F7_RCC_AHB1_GPIOK 10
+#define STM32F7_RCC_AHB1_CRC 12
+#define STM32F7_RCC_AHB1_BKPSRAM 18
+#define STM32F7_RCC_AHB1_DTCMRAM 20
+#define STM32F7_RCC_AHB1_DMA1 21
+#define STM32F7_RCC_AHB1_DMA2 22
+#define STM32F7_RCC_AHB1_DMA2D 23
+#define STM32F7_RCC_AHB1_ETHMAC 25
+#define STM32F7_RCC_AHB1_ETHMACTX 26
+#define STM32F7_RCC_AHB1_ETHMACRX 27
+#define STM32F7_RCC_AHB1_ETHMACPTP 28
+#define STM32F7_RCC_AHB1_OTGHS 29
+#define STM32F7_RCC_AHB1_OTGHSULPI 30
+
+#define STM32F7_AHB1_RESET(bit) (STM32F7_RCC_AHB1_##bit + (0x10 * 8))
+#define STM32F7_AHB1_CLOCK(bit) (STM32F7_RCC_AHB1_##bit)
+
+
+/* AHB2 */
+#define STM32F7_RCC_AHB2_DCMI 0
+#define STM32F7_RCC_AHB2_CRYP 4
+#define STM32F7_RCC_AHB2_HASH 5
+#define STM32F7_RCC_AHB2_RNG 6
+#define STM32F7_RCC_AHB2_OTGFS 7
+
+#define STM32F7_AHB2_RESET(bit) (STM32F7_RCC_AHB2_##bit + (0x14 * 8))
+#define STM32F7_AHB2_CLOCK(bit) (STM32F7_RCC_AHB2_##bit + 0x20)
+
+/* AHB3 */
+#define STM32F7_RCC_AHB3_FMC 0
+#define STM32F7_RCC_AHB3_QSPI 1
+
+#define STM32F7_AHB3_RESET(bit) (STM32F7_RCC_AHB3_##bit + (0x18 * 8))
+#define STM32F7_AHB3_CLOCK(bit) (STM32F7_RCC_AHB3_##bit + 0x40)
+
+/* APB1 */
+#define STM32F7_RCC_APB1_TIM2 0
+#define STM32F7_RCC_APB1_TIM3 1
+#define STM32F7_RCC_APB1_TIM4 2
+#define STM32F7_RCC_APB1_TIM5 3
+#define STM32F7_RCC_APB1_TIM6 4
+#define STM32F7_RCC_APB1_TIM7 5
+#define STM32F7_RCC_APB1_TIM12 6
+#define STM32F7_RCC_APB1_TIM13 7
+#define STM32F7_RCC_APB1_TIM14 8
+#define STM32F7_RCC_APB1_LPTIM1 9
+#define STM32F7_RCC_APB1_WWDG 11
+#define STM32F7_RCC_APB1_SPI2 14
+#define STM32F7_RCC_APB1_SPI3 15
+#define STM32F7_RCC_APB1_SPDIFRX 16
+#define STM32F7_RCC_APB1_UART2 17
+#define STM32F7_RCC_APB1_UART3 18
+#define STM32F7_RCC_APB1_UART4 19
+#define STM32F7_RCC_APB1_UART5 20
+#define 
STM32F7_RCC_APB1_I2C1 21 +#define STM32F7_RCC_APB1_I2C2 22 +#define STM32F7_RCC_APB1_I2C3 23 +#define STM32F7_RCC_APB1_I2C4 24 +#define STM32F7_RCC_APB1_CAN1 25 +#define STM32F7_RCC_APB1_CAN2 26 +#define STM32F7_RCC_APB1_CEC 27 +#define STM32F7_RCC_APB1_PWR 28 +#define STM32F7_RCC_APB1_DAC 29 +#define STM32F7_RCC_APB1_UART7 30 +#define STM32F7_RCC_APB1_UART8 31 + +#define STM32F7_APB1_RESET(bit) (STM32F7_RCC_APB1_##bit + (0x20 * 8)) +#define STM32F7_APB1_CLOCK(bit) (STM32F7_RCC_APB1_##bit + 0x80) + +/* APB2 */ +#define STM32F7_RCC_APB2_TIM1 0 +#define STM32F7_RCC_APB2_TIM8 1 +#define STM32F7_RCC_APB2_USART1 4 +#define STM32F7_RCC_APB2_USART6 5 +#define STM32F7_RCC_APB2_ADC1 8 +#define STM32F7_RCC_APB2_ADC2 9 +#define STM32F7_RCC_APB2_ADC3 10 +#define STM32F7_RCC_APB2_SDMMC1 11 +#define STM32F7_RCC_APB2_SPI1 12 +#define STM32F7_RCC_APB2_SPI4 13 +#define STM32F7_RCC_APB2_SYSCFG 14 +#define STM32F7_RCC_APB2_TIM9 16 +#define STM32F7_RCC_APB2_TIM10 17 +#define STM32F7_RCC_APB2_TIM11 18 +#define STM32F7_RCC_APB2_SPI5 20 +#define STM32F7_RCC_APB2_SPI6 21 +#define STM32F7_RCC_APB2_SAI1 22 +#define STM32F7_RCC_APB2_SAI2 23 +#define STM32F7_RCC_APB2_LTDC 26 + +#define STM32F7_APB2_RESET(bit) (STM32F7_RCC_APB2_##bit + (0x24 * 8)) +#define STM32F7_APB2_CLOCK(bit) (STM32F7_RCC_APB2_##bit + 0xA0) + +#endif /* _DT_BINDINGS_MFD_STM32F7_RCC_H */ diff --git a/include/dt-bindings/pinctrl/hisi.h b/include/dt-bindings/pinctrl/hisi.h index 38f1ea879ea136685f6a2b03d1087f0cdf8dd7ae..0359bfdc9119720e80f6e703f39ac30f0091fb12 100644 --- a/include/dt-bindings/pinctrl/hisi.h +++ b/include/dt-bindings/pinctrl/hisi.h @@ -56,4 +56,19 @@ #define DRIVE4_08MA (4 << 4) #define DRIVE4_10MA (6 << 4) +/* drive strength definition for hi3660 */ +#define DRIVE6_MASK (15 << 4) +#define DRIVE6_04MA (0 << 4) +#define DRIVE6_12MA (4 << 4) +#define DRIVE6_19MA (8 << 4) +#define DRIVE6_27MA (10 << 4) +#define DRIVE6_32MA (15 << 4) +#define DRIVE7_02MA (0 << 4) +#define DRIVE7_04MA (1 << 4) +#define DRIVE7_06MA (2 << 4) +#define DRIVE7_08MA (3 << 4) +#define DRIVE7_10MA (4 << 4) +#define DRIVE7_12MA (5 << 4) +#define DRIVE7_14MA (6 << 4) +#define DRIVE7_16MA (7 << 4) #endif diff --git a/include/dt-bindings/pinctrl/mt7623-pinfunc.h b/include/dt-bindings/pinctrl/mt7623-pinfunc.h index 2f00bdc4244274143f39b4e43ef185d42ba77420..436a87be864afbbf857ffc0cbfaea4d39b4dc91d 100644 --- a/include/dt-bindings/pinctrl/mt7623-pinfunc.h +++ b/include/dt-bindings/pinctrl/mt7623-pinfunc.h @@ -185,6 +185,12 @@ #define MT7623_PIN_56_SPI0_MO_FUNC_SPI0_MO (MTK_PIN_NO(56) | 1) #define MT7623_PIN_56_SPI0_MO_FUNC_SPI0_MI (MTK_PIN_NO(56) | 2) +#define MT7623_PIN_57_SDA1_FUNC_GPIO57 (MTK_PIN_NO(57) | 0) +#define MT7623_PIN_57_SDA1_FUNC_SDA1 (MTK_PIN_NO(57) | 1) + +#define MT7623_PIN_58_SCL1_FUNC_GPIO58 (MTK_PIN_NO(58) | 0) +#define MT7623_PIN_58_SCL1_FUNC_SCL1 (MTK_PIN_NO(58) | 1) + #define MT7623_PIN_60_WB_RSTB_FUNC_GPIO60 (MTK_PIN_NO(60) | 0) #define MT7623_PIN_60_WB_RSTB_FUNC_WB_RSTB (MTK_PIN_NO(60) | 1) @@ -244,6 +250,22 @@ #define MT7623_PIN_76_SCL0_FUNC_GPIO76 (MTK_PIN_NO(76) | 0) #define MT7623_PIN_76_SCL0_FUNC_SCL0 (MTK_PIN_NO(76) | 1) +#define MT7623_PIN_79_URXD0_FUNC_GPIO79 (MTK_PIN_NO(79) | 0) +#define MT7623_PIN_79_URXD0_FUNC_URXD0 (MTK_PIN_NO(79) | 1) +#define MT7623_PIN_79_URXD0_FUNC_UTXD0 (MTK_PIN_NO(79) | 2) + +#define MT7623_PIN_80_UTXD0_FUNC_GPIO80 (MTK_PIN_NO(80) | 0) +#define MT7623_PIN_80_UTXD0_FUNC_UTXD0 (MTK_PIN_NO(80) | 1) +#define MT7623_PIN_80_UTXD0_FUNC_URXD0 (MTK_PIN_NO(80) | 2) + +#define MT7623_PIN_81_URXD1_FUNC_GPIO81 
(MTK_PIN_NO(81) | 0)
+#define MT7623_PIN_81_URXD1_FUNC_URXD1 (MTK_PIN_NO(81) | 1)
+#define MT7623_PIN_81_URXD1_FUNC_UTXD1 (MTK_PIN_NO(81) | 2)
+
+#define MT7623_PIN_82_UTXD1_FUNC_GPIO82 (MTK_PIN_NO(82) | 0)
+#define MT7623_PIN_82_UTXD1_FUNC_UTXD1 (MTK_PIN_NO(82) | 1)
+#define MT7623_PIN_82_UTXD1_FUNC_URXD1 (MTK_PIN_NO(82) | 2)
+
 #define MT7623_PIN_83_LCM_RST_FUNC_GPIO83 (MTK_PIN_NO(83) | 0)
 #define MT7623_PIN_83_LCM_RST_FUNC_LCM_RST (MTK_PIN_NO(83) | 1)
@@ -351,10 +373,10 @@
 #define MT7623_PIN_122_GPIO122_FUNC_SDA2 (MTK_PIN_NO(122) | 4)
 #define MT7623_PIN_122_GPIO122_FUNC_URXD0 (MTK_PIN_NO(122) | 5)
 
-#define MT7623_PIN_123_GPIO123_FUNC_GPIO123 (MTK_PIN_NO(123) | 0)
-#define MT7623_PIN_123_GPIO123_FUNC_TEST (MTK_PIN_NO(123) | 1)
-#define MT7623_PIN_123_GPIO123_FUNC_SCL2 (MTK_PIN_NO(123) | 4)
-#define MT7623_PIN_123_GPIO123_FUNC_UTXD0 (MTK_PIN_NO(123) | 5)
+#define MT7623_PIN_123_HTPLG_FUNC_GPIO123 (MTK_PIN_NO(123) | 0)
+#define MT7623_PIN_123_HTPLG_FUNC_HTPLG (MTK_PIN_NO(123) | 1)
+#define MT7623_PIN_123_HTPLG_FUNC_SCL2 (MTK_PIN_NO(123) | 4)
+#define MT7623_PIN_123_HTPLG_FUNC_UTXD0 (MTK_PIN_NO(123) | 5)
 
 #define MT7623_PIN_124_GPIO124_FUNC_GPIO124 (MTK_PIN_NO(124) | 0)
 #define MT7623_PIN_124_GPIO124_FUNC_TEST (MTK_PIN_NO(124) | 1)
diff --git a/include/dt-bindings/power/imx7-power.h b/include/dt-bindings/power/imx7-power.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a181e4105171d5a039cee8ce27862254fd7444d
--- /dev/null
+++ b/include/dt-bindings/power/imx7-power.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright (C) 2017 Impinj
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#ifndef __DT_BINDINGS_IMX7_POWER_H__
+#define __DT_BINDINGS_IMX7_POWER_H__
+
+#define IMX7_POWER_DOMAIN_MIPI_PHY 0
+#define IMX7_POWER_DOMAIN_PCIE_PHY 1
+#define IMX7_POWER_DOMAIN_USB_HSIC_PHY 2
+
+#endif
diff --git a/include/dt-bindings/power/r8a7795-sysc.h b/include/dt-bindings/power/r8a7795-sysc.h
index ee2e26ba605ef9a37801d1b2858506f8d17864cb..ad679eeda1370a9dec9dd21c0fae84c1cb53a150 100644
--- a/include/dt-bindings/power/r8a7795-sysc.h
+++ b/include/dt-bindings/power/r8a7795-sysc.h
@@ -33,7 +33,7 @@
 #define R8A7795_PD_CA53_SCU 21
 #define R8A7795_PD_3DG_E 22
 #define R8A7795_PD_A3IR 24
-#define R8A7795_PD_A2VC0 25
+#define R8A7795_PD_A2VC0 25 /* ES1.x only */
 #define R8A7795_PD_A2VC1 26
 
 /* Always-on power area */
diff --git a/include/dt-bindings/reset/altr,rst-mgr-a10sr.h b/include/dt-bindings/reset/altr,rst-mgr-a10sr.h
new file mode 100644
index 0000000000000000000000000000000000000000..9855925e5256a7b9d5ece637b605b8ff937ec99a
--- /dev/null
+++ b/include/dt-bindings/reset/altr,rst-mgr-a10sr.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright Intel Corporation (C) 2017. All Rights Reserved
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Reset binding definitions for Altera Arria10 MAX5 System Resource Chip
+ *
+ * Adapted from altr,rst-mgr-a10.h
+ */
+
+#ifndef _DT_BINDINGS_RESET_ALTR_RST_MGR_A10SR_H
+#define _DT_BINDINGS_RESET_ALTR_RST_MGR_A10SR_H
+
+/* Peripheral PHY resets */
+#define A10SR_RESET_ENET_HPS 0
+#define A10SR_RESET_PCIE 1
+#define A10SR_RESET_FILE 2
+#define A10SR_RESET_BQSPI 3
+#define A10SR_RESET_USB 4
+
+#define A10SR_RESET_NUM 5
+
+#endif
diff --git a/include/dt-bindings/reset/imx7-reset.h b/include/dt-bindings/reset/imx7-reset.h
new file mode 100644
index 0000000000000000000000000000000000000000..63948170c7b2d3ae5b62a219c83770aaa05be8ef
--- /dev/null
+++ b/include/dt-bindings/reset/imx7-reset.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2017 Impinj, Inc.
+ *
+ * Author: Andrey Smirnov
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef DT_BINDING_RESET_IMX7_H
+#define DT_BINDING_RESET_IMX7_H
+
+#define IMX7_RESET_A7_CORE_POR_RESET0 0
+#define IMX7_RESET_A7_CORE_POR_RESET1 1
+#define IMX7_RESET_A7_CORE_RESET0 2
+#define IMX7_RESET_A7_CORE_RESET1 3
+#define IMX7_RESET_A7_DBG_RESET0 4
+#define IMX7_RESET_A7_DBG_RESET1 5
+#define IMX7_RESET_A7_ETM_RESET0 6
+#define IMX7_RESET_A7_ETM_RESET1 7
+#define IMX7_RESET_A7_SOC_DBG_RESET 8
+#define IMX7_RESET_A7_L2RESET 9
+#define IMX7_RESET_SW_M4C_RST 10
+#define IMX7_RESET_SW_M4P_RST 11
+#define IMX7_RESET_EIM_RST 12
+#define IMX7_RESET_HSICPHY_PORT_RST 13
+#define IMX7_RESET_USBPHY1_POR 14
+#define IMX7_RESET_USBPHY1_PORT_RST 15
+#define IMX7_RESET_USBPHY2_POR 16
+#define IMX7_RESET_USBPHY2_PORT_RST 17
+#define IMX7_RESET_MIPI_PHY_MRST 18
+#define IMX7_RESET_MIPI_PHY_SRST 19
+
+/*
+ * IMX7_RESET_PCIEPHY is a logical reset line combining PCIEPHY_BTN
+ * and PCIEPHY_G_RST
+ */
+#define IMX7_RESET_PCIEPHY 20
+#define IMX7_RESET_PCIEPHY_PERST 21
+
+/*
+ * IMX7_RESET_PCIE_CTRL_APPS_EN is not strictly a reset line, but it
+ * can be used to inhibit the PCIe LTSSM, so, in a way, it can be thought
+ * of as one
+ */
+#define IMX7_RESET_PCIE_CTRL_APPS_EN 22
+#define IMX7_RESET_DDRC_PRST 23
+#define IMX7_RESET_DDRC_CORE_RST 24
+
+#define IMX7_RESET_NUM 25
+
+#endif
+
diff --git a/include/dt-bindings/reset/mt2701-resets.h b/include/dt-bindings/reset/mt2701-resets.h
index aaf03057f755c3657e779f24bb821c150a82f413..21deb547cfa404cdc544936dafbbe09b24a7ef4d 100644
--- a/include/dt-bindings/reset/mt2701-resets.h
+++ b/include/dt-bindings/reset/mt2701-resets.h
@@ -80,4 +80,11 @@
 #define MT2701_HIFSYS_PCIE1_RST 25
 #define MT2701_HIFSYS_PCIE2_RST 26
 
+/* ETHSYS resets */
+#define MT2701_ETHSYS_SYS_RST 0
+#define MT2701_ETHSYS_MCM_RST 2
+#define MT2701_ETHSYS_FE_RST 6
+#define MT2701_ETHSYS_GMAC_RST 23
+#define MT2701_ETHSYS_PPE_RST 31
+
 #endif /* _DT_BINDINGS_RESET_CONTROLLER_MT2701 */
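Reset indices like the ones above only name bit positions; a consumer device node lists them in its resets property (for instance resets = <&src IMX7_RESET_PCIEPHY>; with reset-names = "pciephy";), and the driver then drives the line through the kernel's reset controller API. A minimal consumer sketch under those assumptions; the helper name and the settle delay below are illustrative, not part of this patch:

#include <linux/delay.h>
#include <linux/device.h>
#include <linux/err.h>
#include <linux/reset.h>

/* Hypothetical helper: pulse the i.MX7 PCIe PHY reset from a driver whose
 * node carries resets = <&src IMX7_RESET_PCIEPHY>; reset-names = "pciephy"; */
static int example_pulse_pciephy(struct device *dev)
{
	struct reset_control *rstc;
	int ret;

	rstc = devm_reset_control_get_exclusive(dev, "pciephy");
	if (IS_ERR(rstc))
		return PTR_ERR(rstc);

	ret = reset_control_assert(rstc);	/* drive the PHY into reset */
	if (ret)
		return ret;

	usleep_range(10, 20);			/* illustrative settle time */
	return reset_control_deassert(rstc);	/* release the line again */
}

diff --git a/include/dt-bindings/reset/sun8i-h3-ccu.h b/include/dt-bindings/reset/sun8i-h3-ccu.h
index 6b7af80c26ec610d5db90a19efc05022340911e1..484c2a22919d7270aeaed71ec0bf3151c38fb873 100644
--- a/include/dt-bindings/reset/sun8i-h3-ccu.h
+++ 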
b/include/dt-bindings/reset/sun8i-h3-ccu.h @@ -98,6 +98,9 @@ #define RST_BUS_UART1 50 #define RST_BUS_UART2 51 #define RST_BUS_UART3 52 -#define RST_BUS_SCR 53 +#define RST_BUS_SCR0 53 + +/* New resets imported in H5 */ +#define RST_BUS_SCR1 54 #endif /* _DT_BINDINGS_RST_SUN8I_H3_H_ */ diff --git a/include/dt-bindings/reset/sun8i-r-ccu.h b/include/dt-bindings/reset/sun8i-r-ccu.h new file mode 100644 index 0000000000000000000000000000000000000000..4ba64f3d6fc9e9c6122d436e664b07f3f646a008 --- /dev/null +++ b/include/dt-bindings/reset/sun8i-r-ccu.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2016 Icenowy Zheng + * + * This file is dual-licensed: you can use it either under the terms + * of the GPL or the X11 license, at your option. Note that this dual + * licensing only applies to this file, and not this project as a + * whole. + * + * a) This file is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This file is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Or, alternatively, + * + * b) Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _DT_BINDINGS_RST_SUN8I_R_CCU_H_ +#define _DT_BINDINGS_RST_SUN8I_R_CCU_H_ + +#define RST_APB0_IR 0 +#define RST_APB0_TIMER 1 +#define RST_APB0_RSB 2 +#define RST_APB0_UART 3 +/* 4 is reserved for RST_APB0_W1 on A31 */ +#define RST_APB0_I2C 5 + +#endif /* _DT_BINDINGS_RST_SUN8I_R_CCU_H_ */ diff --git a/include/dt-bindings/reset/tegra210-car.h b/include/dt-bindings/reset/tegra210-car.h new file mode 100644 index 0000000000000000000000000000000000000000..296ec6e3f8c0b3ed22e22e71d28a72ea33a40c22 --- /dev/null +++ b/include/dt-bindings/reset/tegra210-car.h @@ -0,0 +1,13 @@ +/* + * This header provides Tegra210-specific constants for binding + * nvidia,tegra210-car. 
+ */ + +#ifndef _DT_BINDINGS_RESET_TEGRA210_CAR_H +#define _DT_BINDINGS_RESET_TEGRA210_CAR_H + +#define TEGRA210_RESET(x) (7 * 32 + (x)) +#define TEGRA210_RST_DFLL_DVCO TEGRA210_RESET(0) +#define TEGRA210_RST_ADSP TEGRA210_RESET(1) + +#endif /* _DT_BINDINGS_RESET_TEGRA210_CAR_H */ diff --git a/include/keys/system_keyring.h b/include/keys/system_keyring.h index fbd4647767e9162f10c5741825796c5c2061cef0..359c2f936004b42d4e892170f6e686213d7e3d63 100644 --- a/include/keys/system_keyring.h +++ b/include/keys/system_keyring.h @@ -18,7 +18,8 @@ extern int restrict_link_by_builtin_trusted(struct key *keyring, const struct key_type *type, - const union key_payload *payload); + const union key_payload *payload, + struct key *restriction_key); #else #define restrict_link_by_builtin_trusted restrict_link_reject @@ -28,11 +29,24 @@ extern int restrict_link_by_builtin_trusted(struct key *keyring, extern int restrict_link_by_builtin_and_secondary_trusted( struct key *keyring, const struct key_type *type, - const union key_payload *payload); + const union key_payload *payload, + struct key *restriction_key); #else #define restrict_link_by_builtin_and_secondary_trusted restrict_link_by_builtin_trusted #endif +#ifdef CONFIG_SYSTEM_BLACKLIST_KEYRING +extern int mark_hash_blacklisted(const char *hash); +extern int is_hash_blacklisted(const u8 *hash, size_t hash_len, + const char *type); +#else +static inline int is_hash_blacklisted(const u8 *hash, size_t hash_len, + const char *type) +{ + return 0; +} +#endif + #ifdef CONFIG_IMA_BLACKLIST_KEYRING extern struct key *ima_blacklist_keyring; diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index fe797d6ef89d90e3163c0e4f927286b314c8d5c8..295584f31a4ef3b04a6e6bf0d68467ada190382c 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -63,6 +63,8 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); +bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu); +void kvm_timer_update_run(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 92e7e97ca8ff0f95ef9dba95b3e7078dd8c73049..1ab4633adf4fd16971edfa32f197a9e7b24c2d7b 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -50,6 +50,8 @@ void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu); +bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu); +void kvm_pmu_update_run(struct kvm_vcpu *vcpu); void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, @@ -85,6 +87,11 @@ static inline void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) {} +static inline bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline void kvm_pmu_update_run(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, 
u64 val) {} static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index c0b3d999c266f2271d61bd58b042352580faf5d2..97b8d3728b3103f1b54f711f15d06fa84691e372 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -148,7 +148,6 @@ struct vgic_its { gpa_t vgic_its_base; bool enabled; - bool initialized; struct vgic_io_device iodev; struct kvm_device *dev; @@ -162,6 +161,9 @@ struct vgic_its { u32 creadr; u32 cwriter; + /* migration ABI revision in use */ + u32 abi_rev; + /* Protects the device and collection lists */ struct mutex its_lock; struct list_head device_list; @@ -225,8 +227,6 @@ struct vgic_dist { struct vgic_v2_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; - u32 vgic_misr; /* Saved only */ - u64 vgic_eisr; /* Saved only */ u64 vgic_elrsr; /* Saved only */ u32 vgic_apr; u32 vgic_lr[VGIC_V2_MAX_LRS]; @@ -236,8 +236,6 @@ struct vgic_v3_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; u32 vgic_sre; /* Restored only, change ignored */ - u32 vgic_misr; /* Saved only */ - u32 vgic_eisr; /* Saved only */ u32 vgic_elrsr; /* Saved only */ u32 vgic_ap0r[4]; u32 vgic_ap1r[4]; @@ -264,8 +262,6 @@ struct vgic_cpu { */ struct list_head ap_list_head; - u64 live_lrs; - /* * Members below are used with GICv3 emulation only and represent * parts of the redistributor. @@ -289,6 +285,7 @@ extern struct static_key_false vgic_v2_cpuif_trap; int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write); void kvm_vgic_early_init(struct kvm *kvm); +int kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu); int kvm_vgic_create(struct kvm *kvm, u32 type); void kvm_vgic_destroy(struct kvm *kvm); void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu); @@ -307,6 +304,9 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq); int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); +void kvm_vgic_load(struct kvm_vcpu *vcpu); +void kvm_vgic_put(struct kvm_vcpu *vcpu); + #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) #define vgic_ready(k) ((k)->arch.vgic.ready) diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 9b05886f9773cde8439a0c3e21b39ad29460c440..137e4a3d89c5225dc6a9535265f13e874bdbb8eb 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -233,10 +233,6 @@ int acpi_numa_init (void); int acpi_table_init (void); int acpi_table_parse(char *id, acpi_tbl_table_handler handler); -int __init acpi_parse_entries(char *id, unsigned long table_size, - acpi_tbl_entry_handler handler, - struct acpi_table_header *table_header, - int entry_id, unsigned int max_entries); int __init acpi_table_parse_entries(char *id, unsigned long table_size, int entry_id, acpi_tbl_entry_handler handler, @@ -595,6 +591,13 @@ enum acpi_reconfig_event { int acpi_reconfig_notifier_register(struct notifier_block *nb); int acpi_reconfig_notifier_unregister(struct notifier_block *nb); +#ifdef CONFIG_ACPI_GTDT +int acpi_gtdt_init(struct acpi_table_header *table, int *platform_timer_count); +int acpi_gtdt_map_ppi(int type); +bool acpi_gtdt_c3stop(int type); +int acpi_arch_timer_mem_init(struct arch_timer_mem *timer_mem, int *timer_count); +#endif + #else /* !CONFIG_ACPI */ #define acpi_disabled 1 @@ -611,6 +614,11 @@ static inline bool acpi_dev_found(const char *hid) return false; } +static inline bool acpi_dev_present(const char *hid, const char *uid, s64 hrv) +{ + return false; +} + static 
inline bool is_acpi_node(struct fwnode_handle *fwnode) { return false; @@ -762,8 +770,11 @@ static inline enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev) return DEV_DMA_NOT_SUPPORTED; } -static inline void acpi_dma_configure(struct device *dev, - enum dev_dma_attr attr) { } +static inline int acpi_dma_configure(struct device *dev, + enum dev_dma_attr attr) +{ + return 0; +} static inline void acpi_dma_deconfigure(struct device *dev) { } @@ -949,6 +960,10 @@ static inline void acpi_dev_remove_driver_gpios(struct acpi_device *adev) adev->driver_gpios = NULL; } +int devm_acpi_dev_add_driver_gpios(struct device *dev, + const struct acpi_gpio_mapping *gpios); +void devm_acpi_dev_remove_driver_gpios(struct device *dev); + int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index); #else static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev, @@ -958,6 +973,13 @@ static inline int acpi_dev_add_driver_gpios(struct acpi_device *adev, } static inline void acpi_dev_remove_driver_gpios(struct acpi_device *adev) {} +static inline int devm_acpi_dev_add_driver_gpios(struct device *dev, + const struct acpi_gpio_mapping *gpios) +{ + return -ENXIO; +} +static inline void devm_acpi_dev_remove_driver_gpios(struct device *dev) {} + static inline int acpi_dev_gpio_irq_get(struct acpi_device *adev, int index) { return -ENXIO; @@ -997,8 +1019,16 @@ int acpi_node_prop_read(struct fwnode_handle *fwnode, const char *propname, int acpi_dev_prop_read(struct acpi_device *adev, const char *propname, enum dev_prop_type proptype, void *val, size_t nval); -struct fwnode_handle *acpi_get_next_subnode(struct device *dev, - struct fwnode_handle *subnode); +struct fwnode_handle *acpi_get_next_subnode(struct fwnode_handle *fwnode, + struct fwnode_handle *child); +struct fwnode_handle *acpi_node_get_parent(struct fwnode_handle *fwnode); + +struct fwnode_handle *acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode, + struct fwnode_handle *prev); +int acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode, + struct fwnode_handle **remote, + struct fwnode_handle **port, + struct fwnode_handle **endpoint); struct acpi_probe_entry; typedef bool (*acpi_probe_entry_validate_subtbl)(struct acpi_subtable_header *, @@ -1115,12 +1145,34 @@ static inline int acpi_dev_prop_read(struct acpi_device *adev, return -ENXIO; } -static inline struct fwnode_handle *acpi_get_next_subnode(struct device *dev, - struct fwnode_handle *subnode) +static inline struct fwnode_handle * +acpi_get_next_subnode(struct fwnode_handle *fwnode, struct fwnode_handle *child) { return NULL; } +static inline struct fwnode_handle * +acpi_node_get_parent(struct fwnode_handle *fwnode) +{ + return NULL; +} + +static inline struct fwnode_handle * +acpi_graph_get_next_endpoint(struct fwnode_handle *fwnode, + struct fwnode_handle *prev) +{ + return ERR_PTR(-ENXIO); +} + +static inline int +acpi_graph_get_remote_endpoint(struct fwnode_handle *fwnode, + struct fwnode_handle **remote, + struct fwnode_handle **port, + struct fwnode_handle **endpoint) +{ + return -ENXIO; +} + #define ACPI_DECLARE_PROBE_ENTRY(table, name, table_id, subtable, valid, data, fn) \ static const void * __acpi_table_##name[] \ __attribute__((unused)) \ diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 77e08099e55404b53225e71f7bd0b9ca34caec4a..3ff9acea86161ae51fde3953de6bf27b8f5ebca0 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -34,6 +34,8 @@ void acpi_iort_init(void); bool iort_node_match(u8 type); u32 
iort_msi_map_rid(struct device *dev, u32 req_id); struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id); +void acpi_configure_pmsi_domain(struct device *dev); +int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id); /* IOMMU interface */ void iort_set_dma_mask(struct device *dev); const struct iommu_ops *iort_iommu_configure(struct device *dev); @@ -45,6 +47,7 @@ static inline u32 iort_msi_map_rid(struct device *dev, u32 req_id) static inline struct irq_domain *iort_get_device_domain(struct device *dev, u32 req_id) { return NULL; } +static inline void acpi_configure_pmsi_domain(struct device *dev) { } /* IOMMU interface */ static inline void iort_set_dma_mask(struct device *dev) { } static inline @@ -52,7 +55,4 @@ const struct iommu_ops *iort_iommu_configure(struct device *dev) { return NULL; } #endif -#define IORT_ACPI_DECLARE(name, table_id, fn) \ - ACPI_DECLARE_PROBE_ENTRY(iort, name, table_id, 0, NULL, 0, fn) - #endif /* __ACPI_IORT_H__ */ diff --git a/include/linux/amba/pl080.h b/include/linux/amba/pl080.h index 91b84a7f053933089e63224aea1f616809b8af57..580b5323a717bfe720919ffba7f093d8e389c957 100644 --- a/include/linux/amba/pl080.h +++ b/include/linux/amba/pl080.h @@ -38,24 +38,16 @@ #define PL080_SOFT_LSREQ (0x2C) #define PL080_CONFIG (0x30) -#define PL080_CONFIG_M2_BE (1 << 2) -#define PL080_CONFIG_M1_BE (1 << 1) -#define PL080_CONFIG_ENABLE (1 << 0) +#define PL080_CONFIG_M2_BE BIT(2) +#define PL080_CONFIG_M1_BE BIT(1) +#define PL080_CONFIG_ENABLE BIT(0) #define PL080_SYNC (0x34) /* Per channel configuration registers */ -#define PL080_Cx_STRIDE (0x20) #define PL080_Cx_BASE(x) ((0x100 + (x * 0x20))) -#define PL080_Cx_SRC_ADDR(x) ((0x100 + (x * 0x20))) -#define PL080_Cx_DST_ADDR(x) ((0x104 + (x * 0x20))) -#define PL080_Cx_LLI(x) ((0x108 + (x * 0x20))) -#define PL080_Cx_CONTROL(x) ((0x10C + (x * 0x20))) -#define PL080_Cx_CONFIG(x) ((0x110 + (x * 0x20))) -#define PL080S_Cx_CONTROL2(x) ((0x110 + (x * 0x20))) -#define PL080S_Cx_CONFIG(x) ((0x114 + (x * 0x20))) - #define PL080_CH_SRC_ADDR (0x00) #define PL080_CH_DST_ADDR (0x04) #define PL080_CH_LLI (0x08) @@ -66,18 +58,18 @@ #define PL080_LLI_ADDR_MASK (0x3fffffff << 2) #define PL080_LLI_ADDR_SHIFT (2) -#define PL080_LLI_LM_AHB2 (1 << 0) +#define PL080_LLI_LM_AHB2 BIT(0) -#define PL080_CONTROL_TC_IRQ_EN (1 << 31) +#define PL080_CONTROL_TC_IRQ_EN BIT(31) #define PL080_CONTROL_PROT_MASK (0x7 << 28) #define PL080_CONTROL_PROT_SHIFT (28) -#define PL080_CONTROL_PROT_CACHE (1 << 30) -#define PL080_CONTROL_PROT_BUFF (1 << 29) -#define PL080_CONTROL_PROT_SYS (1 << 28) -#define PL080_CONTROL_DST_INCR (1 << 27) -#define PL080_CONTROL_SRC_INCR (1 << 26) -#define PL080_CONTROL_DST_AHB2 (1 << 25) -#define PL080_CONTROL_SRC_AHB2 (1 << 24) +#define PL080_CONTROL_PROT_CACHE BIT(30) +#define PL080_CONTROL_PROT_BUFF BIT(29) +#define PL080_CONTROL_PROT_SYS BIT(28) +#define PL080_CONTROL_DST_INCR BIT(27) +#define PL080_CONTROL_SRC_INCR BIT(26) +#define PL080_CONTROL_DST_AHB2 BIT(25) +#define PL080_CONTROL_SRC_AHB2 BIT(24) #define PL080_CONTROL_DWIDTH_MASK (0x7 << 21) #define PL080_CONTROL_DWIDTH_SHIFT (21) #define PL080_CONTROL_SWIDTH_MASK (0x7 << 18) @@ -103,20 +95,20 @@ #define PL080_WIDTH_16BIT (0x1) #define PL080_WIDTH_32BIT (0x2) -#define PL080N_CONFIG_ITPROT (1 << 20) -#define PL080N_CONFIG_SECPROT (1 << 19) -#define PL080_CONFIG_HALT (1 << 18) -#define PL080_CONFIG_ACTIVE (1 << 17) /* RO */ -#define PL080_CONFIG_LOCK (1 << 16) -#define PL080_CONFIG_TC_IRQ_MASK (1 << 15) -#define 
PL080_CONFIG_ERR_IRQ_MASK (1 << 14) +#define PL080N_CONFIG_ITPROT BIT(20) +#define PL080N_CONFIG_SECPROT BIT(19) +#define PL080_CONFIG_HALT BIT(18) +#define PL080_CONFIG_ACTIVE BIT(17) /* RO */ +#define PL080_CONFIG_LOCK BIT(16) +#define PL080_CONFIG_TC_IRQ_MASK BIT(15) +#define PL080_CONFIG_ERR_IRQ_MASK BIT(14) #define PL080_CONFIG_FLOW_CONTROL_MASK (0x7 << 11) #define PL080_CONFIG_FLOW_CONTROL_SHIFT (11) #define PL080_CONFIG_DST_SEL_MASK (0xf << 6) #define PL080_CONFIG_DST_SEL_SHIFT (6) #define PL080_CONFIG_SRC_SEL_MASK (0xf << 1) #define PL080_CONFIG_SRC_SEL_SHIFT (1) -#define PL080_CONFIG_ENABLE (1 << 0) +#define PL080_CONFIG_ENABLE BIT(0) #define PL080_FLOW_MEM2MEM (0x0) #define PL080_FLOW_MEM2PER (0x1) diff --git a/include/linux/amba/pl330.h b/include/linux/amba/pl330.h deleted file mode 100644 index fe93758e84036b8abd8a2b5c5191c0545050094b..0000000000000000000000000000000000000000 --- a/include/linux/amba/pl330.h +++ /dev/null @@ -1,35 +0,0 @@ -/* linux/include/linux/amba/pl330.h - * - * Copyright (C) 2010 Samsung Electronics Co. Ltd. - * Jaswinder Singh - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -#ifndef __AMBA_PL330_H_ -#define __AMBA_PL330_H_ - -#include - -struct dma_pl330_platdata { - /* - * Number of valid peripherals connected to DMAC. - * This may be different from the value read from - * CR0, as the PL330 implementation might have 'holes' - * in the peri list or the peri could also be reached - * from another DMAC which the platform prefers. - */ - u8 nr_valid_peri; - /* Array of valid peripherals */ - u8 *peri_id; - /* Operational capabilities */ - dma_cap_mask_t cap_mask; - /* Bytes to allocate for MC buffer */ - unsigned mcbuf_sz; -}; - -extern bool pl330_filter(struct dma_chan *chan, void *param); -#endif /* __AMBA_PL330_H_ */ diff --git a/include/linux/ata.h b/include/linux/ata.h index af6859b3a93d75194eab0817607fd02366a8a61d..ad7d9ee89ff043eaf2d8db8589d2acaccc07c855 100644 --- a/include/linux/ata.h +++ b/include/linux/ata.h @@ -817,11 +817,6 @@ static inline bool ata_id_sct_error_recovery_ctrl(const u16 *id) return id[ATA_ID_SCT_CMD_XPORT] & (1 << 3) ? true : false; } -static inline bool ata_id_sct_write_same(const u16 *id) -{ - return id[ATA_ID_SCT_CMD_XPORT] & (1 << 2) ? true : false; -} - static inline bool ata_id_sct_long_sector_access(const u16 *id) { return id[ATA_ID_SCT_CMD_XPORT] & (1 << 1) ? 
true : false; diff --git a/include/linux/atomic.h b/include/linux/atomic.h index e71835bf60a977a37277d44c6357a02e70c3a41d..c56be74101305f74524bc816d9c8fdb67ccb895d 100644 --- a/include/linux/atomic.h +++ b/include/linux/atomic.h @@ -423,6 +423,29 @@ #endif #endif /* atomic_cmpxchg_relaxed */ +#ifndef atomic_try_cmpxchg + +#define __atomic_try_cmpxchg(type, _p, _po, _n) \ +({ \ + typeof(_po) __po = (_po); \ + typeof(*(_po)) __r, __o = *__po; \ + __r = atomic_cmpxchg##type((_p), __o, (_n)); \ + if (unlikely(__r != __o)) \ + *__po = __r; \ + likely(__r == __o); \ +}) + +#define atomic_try_cmpxchg(_p, _po, _n) __atomic_try_cmpxchg(, _p, _po, _n) +#define atomic_try_cmpxchg_relaxed(_p, _po, _n) __atomic_try_cmpxchg(_relaxed, _p, _po, _n) +#define atomic_try_cmpxchg_acquire(_p, _po, _n) __atomic_try_cmpxchg(_acquire, _p, _po, _n) +#define atomic_try_cmpxchg_release(_p, _po, _n) __atomic_try_cmpxchg(_release, _p, _po, _n) + +#else /* atomic_try_cmpxchg */ +#define atomic_try_cmpxchg_relaxed atomic_try_cmpxchg +#define atomic_try_cmpxchg_acquire atomic_try_cmpxchg +#define atomic_try_cmpxchg_release atomic_try_cmpxchg +#endif /* atomic_try_cmpxchg */ + /* cmpxchg_relaxed */ #ifndef cmpxchg_relaxed #define cmpxchg_relaxed cmpxchg @@ -996,6 +1019,29 @@ static inline int atomic_dec_if_positive(atomic_t *v) #endif #endif /* atomic64_cmpxchg_relaxed */ +#ifndef atomic64_try_cmpxchg + +#define __atomic64_try_cmpxchg(type, _p, _po, _n) \ +({ \ + typeof(_po) __po = (_po); \ + typeof(*(_po)) __r, __o = *__po; \ + __r = atomic64_cmpxchg##type((_p), __o, (_n)); \ + if (unlikely(__r != __o)) \ + *__po = __r; \ + likely(__r == __o); \ +}) + +#define atomic64_try_cmpxchg(_p, _po, _n) __atomic64_try_cmpxchg(, _p, _po, _n) +#define atomic64_try_cmpxchg_relaxed(_p, _po, _n) __atomic64_try_cmpxchg(_relaxed, _p, _po, _n) +#define atomic64_try_cmpxchg_acquire(_p, _po, _n) __atomic64_try_cmpxchg(_acquire, _p, _po, _n) +#define atomic64_try_cmpxchg_release(_p, _po, _n) __atomic64_try_cmpxchg(_release, _p, _po, _n) + +#else /* atomic64_try_cmpxchg */ +#define atomic64_try_cmpxchg_relaxed atomic64_try_cmpxchg +#define atomic64_try_cmpxchg_acquire atomic64_try_cmpxchg +#define atomic64_try_cmpxchg_release atomic64_try_cmpxchg +#endif /* atomic64_try_cmpxchg */ + #ifndef atomic64_andnot static inline void atomic64_andnot(long long i, atomic64_t *v) { diff --git a/include/linux/audit.h b/include/linux/audit.h index 504e784b7ffa6da23212a0a0d110efe91774eb27..2150bdccfbab2953c33cd56d6d64eb798ca180ea 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -163,8 +163,7 @@ extern void audit_log_task_info(struct audit_buffer *ab, extern int audit_update_lsm_rules(void); /* Private API (for audit.c only) */ -extern int audit_rule_change(int type, __u32 portid, int seq, - void *data, size_t datasz); +extern int audit_rule_change(int type, int seq, void *data, size_t datasz); extern int audit_list_rules_send(struct sk_buff *request_skb, int seq); extern u32 audit_enabled; @@ -332,7 +331,7 @@ static inline void audit_ptrace(struct task_struct *t) /* Private API (for audit.c only) */ extern unsigned int audit_serial(void); extern int auditsc_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial); + struct timespec64 *t, unsigned int *serial); extern int audit_set_loginuid(kuid_t loginuid); static inline kuid_t audit_get_loginuid(struct task_struct *tsk) @@ -511,7 +510,7 @@ static inline void __audit_seccomp(unsigned long syscall, long signr, int code) static inline void audit_seccomp(unsigned long 
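The atomic_try_cmpxchg() family introduced above returns a success boolean and, on failure, refreshes the caller's expected value, which shortens the canonical compare-and-swap retry loop. A minimal usage sketch (the helper name below is made up for illustration):

/* Sketch only: add @a to @v unless @v is zero, using the new helper */
static inline bool example_add_unless_zero(atomic_t *v, int a)
{
	int old = atomic_read(v);

	do {
		if (!old)
			return false;
		/* on failure, 'old' is refreshed with the current value */
	} while (!atomic_try_cmpxchg(v, &old, old + a));

	return true;
}
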
syscall, long signr, int code) { } static inline int auditsc_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) + struct timespec64 *t, unsigned int *serial) { return 0; } diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index ad955817916d00c35b6b41d204ec61013c88a10f..866c433e7d3220b29170ed98ccd1b70803a43964 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -21,6 +21,7 @@ struct dentry; */ enum wb_state { WB_registered, /* bdi_register() was done */ + WB_shutting_down, /* wb_shutdown() in progress */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ }; @@ -54,7 +55,9 @@ struct bdi_writeback_congested { atomic_t refcnt; /* nr of attached wb's and blkg */ #ifdef CONFIG_CGROUP_WRITEBACK - struct backing_dev_info *bdi; /* the associated bdi */ + struct backing_dev_info *__bdi; /* the associated bdi, set to NULL + * on bdi unregistration. For memcg-wb + * internal use only! */ int blkcg_id; /* ID of the associated blkcg */ struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ #endif @@ -143,7 +146,7 @@ struct backing_dev_info { congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ - char *name; + const char *name; struct kref refcnt; /* Reference counter for the structure */ unsigned int capabilities; /* Device capabilities */ @@ -161,7 +164,6 @@ struct backing_dev_info { #ifdef CONFIG_CGROUP_WRITEBACK struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct rb_root cgwb_congested_tree; /* their congested states */ - atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ #else struct bdi_writeback_congested *wb_congested; #endif diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index c52a48cb9a66379971be13008bd8cdfba087807e..557d84063934c65c2aa96c2b6141425af8a567bf 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -17,8 +17,6 @@ #include #include -int __must_check bdi_init(struct backing_dev_info *bdi); - static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) { kref_get(&bdi->refcnt); @@ -27,16 +25,18 @@ static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) void bdi_put(struct backing_dev_info *bdi); -__printf(3, 4) -int bdi_register(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, ...); -int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev); +__printf(2, 3) +int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...); +int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, + va_list args); int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner); void bdi_unregister(struct backing_dev_info *bdi); -int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); -void bdi_destroy(struct backing_dev_info *bdi); struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id); +static inline struct backing_dev_info *bdi_alloc(gfp_t gfp_mask) +{ + return bdi_alloc_node(gfp_mask, NUMA_NO_NODE); +} void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, bool range_cyclic, enum wb_reason reason); diff --git a/include/linux/bcma/bcma_driver_pci.h b/include/linux/bcma/bcma_driver_pci.h index 9657f11d48a7547cbf63639c23f0de9bc07920c4..bca6a5e4ca3db3a6370b83621c80bb5ca876f27a 100644 --- a/include/linux/bcma/bcma_driver_pci.h +++ 
b/include/linux/bcma/bcma_driver_pci.h @@ -80,7 +80,7 @@ struct pci_dev; #define BCMA_CORE_PCI_MDIODATA_DEV_TX 0x1e /* SERDES TX Dev */ #define BCMA_CORE_PCI_MDIODATA_DEV_RX 0x1f /* SERDES RX Dev */ #define BCMA_CORE_PCI_PCIEIND_ADDR 0x0130 /* indirect access to the internal register */ -#define BCMA_CORE_PCI_PCIEIND_DATA 0x0134 /* Data to/from the internal regsiter */ +#define BCMA_CORE_PCI_PCIEIND_DATA 0x0134 /* Data to/from the internal register */ #define BCMA_CORE_PCI_CLKREQENCTRL 0x0138 /* >= rev 6, Clkreq rdma control */ #define BCMA_CORE_PCI_PCICFG0 0x0400 /* PCI config space 0 (rev >= 8) */ #define BCMA_CORE_PCI_PCICFG1 0x0500 /* PCI config space 1 (rev >= 8) */ diff --git a/include/linux/bio.h b/include/linux/bio.h index 8e521194f6fc4ad32138a51c962a365c74debaed..d1b04b0e99cf8c293d4ded6eccb2b0aa2fce0d41 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) -static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec) +static inline unsigned bio_segments(struct bio *bio) { unsigned segs = 0; struct bio_vec bv; @@ -205,17 +205,12 @@ static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec) break; } - __bio_for_each_segment(bv, bio, iter, *bvec) + bio_for_each_segment(bv, bio, iter) segs++; return segs; } -static inline unsigned bio_segments(struct bio *bio) -{ - return __bio_segments(bio, &bio->bi_iter); -} - /* * get a reference to a bio, so it won't disappear. the intended use is * something like: @@ -383,14 +378,12 @@ extern struct bio_set *bioset_create_nobvec(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); extern mempool_t *biovec_create_pool(int pool_entries); -extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); +extern struct bio *bio_alloc_bioset(gfp_t, unsigned int, struct bio_set *); extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); -extern struct bio *bio_clone_bioset_partial(struct bio *, gfp_t, - struct bio_set *, int, int); extern struct bio_set *fs_bio_set; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 9382c5da7a2edfbc7cbccca2ff1304cd042c3bd2..c47aa248c640bc5ea806d5112ce7f7b5fb6b7a46 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -15,7 +15,7 @@ struct blk_mq_hw_ctx { unsigned long state; /* BLK_MQ_S_* flags */ } ____cacheline_aligned_in_smp; - struct work_struct run_work; + struct delayed_work run_work; cpumask_var_t cpumask; int next_cpu; int next_cpu_batch; @@ -51,15 +51,17 @@ struct blk_mq_hw_ctx { atomic_t nr_active; - struct delayed_work delayed_run_work; - struct delayed_work delay_work; - struct hlist_node cpuhp_dead; struct kobject kobj; unsigned long poll_considered; unsigned long poll_invoked; unsigned long poll_success; + +#ifdef CONFIG_BLK_DEBUG_FS + struct dentry *debugfs_dir; + struct dentry *sched_debugfs_dir; +#endif }; struct blk_mq_tag_set { @@ -82,7 +84,6 @@ struct blk_mq_tag_set { struct blk_mq_queue_data { struct request *rq; - struct list_head *list; bool last; }; @@ -90,9 +91,9 @@ typedef int (queue_rq_fn)(struct blk_mq_hw_ctx *, const struct blk_mq_queue_data typedef enum blk_eh_timer_return (timeout_fn)(struct request *, bool); typedef int (init_hctx_fn)(struct 
blk_mq_hw_ctx *, void *, unsigned int); typedef void (exit_hctx_fn)(struct blk_mq_hw_ctx *, unsigned int); -typedef int (init_request_fn)(void *, struct request *, unsigned int, +typedef int (init_request_fn)(struct blk_mq_tag_set *set, struct request *, unsigned int, unsigned int); -typedef void (exit_request_fn)(void *, struct request *, unsigned int, +typedef void (exit_request_fn)(struct blk_mq_tag_set *set, struct request *, unsigned int); typedef int (reinit_request_fn)(void *, struct request *); @@ -143,6 +144,14 @@ struct blk_mq_ops { reinit_request_fn *reinit_request; map_queues_fn *map_queues; + +#ifdef CONFIG_BLK_DEBUG_FS + /* + * Used by the debugfs implementation to show driver-specific + * information about a request. + */ + void (*show_rq)(struct seq_file *m, struct request *rq); +#endif }; enum { @@ -153,7 +162,6 @@ enum { BLK_MQ_F_SHOULD_MERGE = 1 << 0, BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, - BLK_MQ_F_DEFER_ISSUE = 1 << 4, BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_NO_SCHED = 1 << 6, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, @@ -163,6 +171,7 @@ enum { BLK_MQ_S_TAG_ACTIVE = 1, BLK_MQ_S_SCHED_RESTART = 2, BLK_MQ_S_TAG_WAITING = 3, + BLK_MQ_S_START_ON_RUN = 4, BLK_MQ_MAX_DEPTH = 10240, @@ -230,7 +239,7 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, void blk_mq_kick_requeue_list(struct request_queue *q); void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_abort_requeue_list(struct request_queue *q); -void blk_mq_complete_request(struct request *rq, int error); +void blk_mq_complete_request(struct request *rq); bool blk_mq_queue_stopped(struct request_queue *q); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx); @@ -240,13 +249,14 @@ void blk_mq_start_hw_queues(struct request_queue *q); void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async); void blk_mq_run_hw_queues(struct request_queue *q, bool async); void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs); void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv); void blk_mq_freeze_queue(struct request_queue *q); void blk_mq_unfreeze_queue(struct request_queue *q); -void blk_mq_freeze_queue_start(struct request_queue *q); +void blk_freeze_queue_start(struct request_queue *q); void blk_mq_freeze_queue_wait(struct request_queue *q); int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, unsigned long timeout); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index d703acb55d0f0d196296ef4d82f4e97a5efd9c3a..61339bc444006a477bf3e84b32a1d8199a9e5f01 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -17,6 +17,10 @@ struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); +struct blk_issue_stat { + u64 stat; +}; + /* * main unit of I/O for the block layer and lower layers (ie drivers and * stacking drivers) @@ -29,7 +33,7 @@ struct bio { * top bits REQ_OP. Use * accessors. 
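With the init_request_fn/exit_request_fn prototypes above now taking the struct blk_mq_tag_set instead of an opaque void pointer, a driver callback can reach its private data through set->driver_data. A hypothetical callback under the new signature (struct example_cmd is invented for this sketch):

struct example_cmd {
	void *dev;	/* made-up per-request driver state */
};

static int example_init_request(struct blk_mq_tag_set *set,
				struct request *rq, unsigned int hctx_idx,
				unsigned int numa_node)
{
	struct example_cmd *cmd = blk_mq_rq_to_pdu(rq);

	/* what used to arrive as the old void *data argument */
	cmd->dev = set->driver_data;
	return 0;
}
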
*/ - unsigned short bi_flags; /* status, command, etc */ + unsigned short bi_flags; /* status, etc and bvec pool number */ unsigned short bi_ioprio; struct bvec_iter bi_iter; @@ -58,6 +62,10 @@ struct bio { */ struct io_context *bi_ioc; struct cgroup_subsys_state *bi_css; +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + void *bi_cg_private; + struct blk_issue_stat bi_issue_stat; +#endif #endif union { #if defined(CONFIG_BLK_DEV_INTEGRITY) @@ -102,12 +110,9 @@ struct bio { #define BIO_REFFED 8 /* bio has elevated ->bi_cnt */ #define BIO_THROTTLED 9 /* This bio has already been subjected to * throttling rules. Don't do it again. */ - -/* - * Flags starting here get preserved by bio_reset() - this includes - * BVEC_POOL_IDX() - */ -#define BIO_RESET_BITS 10 +#define BIO_TRACE_COMPLETION 10 /* bio_endio() should trace the final completion + * of this bio. */ +/* See BVEC_POOL_OFFSET below before adding new flags */ /* * We support 6 different bvec pools, the last one is magic in that it @@ -117,13 +122,22 @@ struct bio { #define BVEC_POOL_MAX (BVEC_POOL_NR - 1) /* - * Top 4 bits of bio flags indicate the pool the bvecs came from. We add + * Top 3 bits of bio flags indicate the pool the bvecs came from. We add * 1 to the actual index so that 0 indicates that there are no bvecs to be * freed. */ -#define BVEC_POOL_BITS (4) +#define BVEC_POOL_BITS (3) #define BVEC_POOL_OFFSET (16 - BVEC_POOL_BITS) #define BVEC_POOL_IDX(bio) ((bio)->bi_flags >> BVEC_POOL_OFFSET) +#if (1 << BVEC_POOL_BITS) < (BVEC_POOL_NR+1) +# error "BVEC_POOL_BITS is too small" +#endif + +/* + * Flags starting here get preserved by bio_reset() - this includes + * only BVEC_POOL_IDX() + */ +#define BIO_RESET_BITS BVEC_POOL_OFFSET /* * Operations and flags common to the bio and request structures. @@ -160,7 +174,7 @@ enum req_opf { /* write the same sector many times */ REQ_OP_WRITE_SAME = 7, /* write the zero filled sector many times */ - REQ_OP_WRITE_ZEROES = 8, + REQ_OP_WRITE_ZEROES = 9, /* SCSI passthrough using struct scsi_request */ REQ_OP_SCSI_IN = 32, @@ -187,6 +201,10 @@ enum req_flag_bits { __REQ_PREFLUSH, /* request for cache flush */ __REQ_RAHEAD, /* read ahead, can fail anytime */ __REQ_BACKGROUND, /* background IO */ + + /* command specific flags for REQ_OP_WRITE_ZEROES: */ + __REQ_NOUNMAP, /* do not free blocks when zeroing */ + __REQ_NR_BITS, /* stops here */ }; @@ -204,6 +222,8 @@ enum req_flag_bits { #define REQ_RAHEAD (1ULL << __REQ_RAHEAD) #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND) +#define REQ_NOUNMAP (1ULL << __REQ_NOUNMAP) + #define REQ_FAILFAST_MASK \ (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER) @@ -283,12 +303,6 @@ static inline bool blk_qc_t_is_internal(blk_qc_t cookie) return (cookie & BLK_QC_T_INTERNAL) != 0; } -struct blk_issue_stat { - u64 time; -}; - -#define BLK_RQ_STAT_BATCH 64 - struct blk_rq_stat { s64 mean; u64 min; @@ -296,7 +310,6 @@ struct blk_rq_stat { s32 nr_samples; s32 nr_batch; u64 batch; - s64 time; }; #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 01a696b0a4d3ae0118a2742c96d68775ef9f637a..ab92c4ea138b7c665c45b765c0e1f8a4958339ca 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -40,15 +40,20 @@ struct blkcg_gq; struct blk_flush_queue; struct pr_ops; struct rq_wb; +struct blk_queue_stats; +struct blk_stat_callback; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ +/* Must be consistent with blk_mq_poll_stats_bkt() */ +#define BLK_MQ_POLL_STATS_BKTS 16 + /* * Maximum number of 
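The narrowed BVEC_POOL_BITS above reserves the top three bits of the 16-bit bi_flags word for the bvec pool index, stored offset by one so that zero means there are no bvecs to free. A sketch of how such a field can be packed (the helper name is hypothetical, not from this patch):

/* Store pool index @idx in the top bits of bi_flags; 0 == no bvecs */
static inline void example_set_bvec_pool(struct bio *bio, unsigned int idx)
{
	bio->bi_flags &= ~(((1U << BVEC_POOL_BITS) - 1) << BVEC_POOL_OFFSET);
	bio->bi_flags |= (idx + 1) << BVEC_POOL_OFFSET;
}
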
blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. */ -#define BLKCG_MAX_POLS 2 +#define BLKCG_MAX_POLS 3 typedef void (rq_end_io_fn)(struct request *, int); @@ -173,6 +178,7 @@ struct request { struct rb_node rb_node; /* sort/lookup */ struct bio_vec special_vec; void *completion_data; + int error_count; /* for legacy drivers, don't use */ }; /* @@ -213,16 +219,14 @@ struct request { unsigned short ioprio; - void *special; /* opaque pointer available for LLD use */ + unsigned int timeout; - int errors; + void *special; /* opaque pointer available for LLD use */ unsigned int extra_len; /* length of alignment and padding */ unsigned long deadline; struct list_head timeout_list; - unsigned int timeout; - int retries; /* * completion callback. @@ -337,7 +341,6 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; unsigned char cluster; - unsigned char discard_zeroes_data; unsigned char raid_partial_stripes_expensive; enum blk_zoned_model zoned; }; @@ -388,6 +391,7 @@ struct request_queue { int nr_rqs[2]; /* # allocated [a]sync rqs */ int nr_rqs_elvpriv; /* # allocated rqs w/ elvpriv */ + struct blk_queue_stats *stats; struct rq_wb *rq_wb; /* @@ -505,8 +509,6 @@ struct request_queue { unsigned int nr_sorted; unsigned int in_flight[2]; - struct blk_rq_stat rq_stats[2]; - /* * Number of active block driver functions for which blk_drain_queue() * must wait. Must be incremented around functions that unlock the @@ -516,6 +518,10 @@ struct request_queue { unsigned int rq_timeout; int poll_nsec; + + struct blk_stat_callback *poll_cb; + struct blk_rq_stat poll_stat[BLK_MQ_POLL_STATS_BKTS]; + struct timer_list timeout; struct work_struct timeout_work; struct list_head timeout_list; @@ -573,7 +579,7 @@ struct request_queue { #ifdef CONFIG_BLK_DEBUG_FS struct dentry *debugfs_dir; - struct dentry *mq_debugfs_dir; + struct dentry *sched_debugfs_dir; #endif bool mq_sysfs_init_done; @@ -610,6 +616,8 @@ struct request_queue { #define QUEUE_FLAG_FLUSH_NQ 25 /* flush not queueuable */ #define QUEUE_FLAG_DAX 26 /* device supports DAX */ #define QUEUE_FLAG_STATS 27 /* track rq completion times */ +#define QUEUE_FLAG_POLL_STATS 28 /* collecting stats for hybrid polling */ +#define QUEUE_FLAG_REGISTERED 29 /* queue has been registered to a disk */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_STACKABLE) | \ @@ -918,6 +926,7 @@ extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern blk_qc_t generic_make_request(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); +extern void blk_init_request_from_bio(struct request *req, struct bio *bio); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); @@ -963,7 +972,7 @@ extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, uns extern int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); -extern int blk_execute_rq(struct request_queue *, struct gendisk *, +extern void blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); @@ -1080,20 +1089,6 @@ static inline unsigned int blk_rq_count_bios(struct request 
*rq) return nr_bios; } -/* - * blk_rq_set_prio - associate a request with prio from ioc - * @rq: request of interest - * @ioc: target iocontext - * - * Assocate request prio with ioc prio so request based drivers - * can leverage priority information. - */ -static inline void blk_rq_set_prio(struct request *rq, struct io_context *ioc) -{ - if (ioc) - rq->ioprio = ioc->ioprio; -} - /* * Request issue related functions. */ @@ -1120,13 +1115,10 @@ extern void blk_finish_request(struct request *rq, int error); extern bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void blk_end_request_all(struct request *rq, int error); -extern bool blk_end_request_cur(struct request *rq, int error); -extern bool blk_end_request_err(struct request *rq, int error); extern bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes); extern void __blk_end_request_all(struct request *rq, int error); extern bool __blk_end_request_cur(struct request *rq, int error); -extern bool __blk_end_request_err(struct request *rq, int error); extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); @@ -1329,23 +1321,27 @@ static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, return bqt->tag_index[tag]; } +extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); +extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct page *page); #define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ -#define BLKDEV_DISCARD_ZERO (1 << 1) /* must reliably zero data */ -extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *); extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop); -extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, struct page *page); + +#define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ +#define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ + extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, - bool discard); + unsigned flags); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, bool discard); + sector_t nr_sects, gfp_t gfp_mask, unsigned flags); + static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) { @@ -1359,7 +1355,7 @@ static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, return blkdev_issue_zeroout(sb->s_bdev, block << (sb->s_blocksize_bits - 9), nr_blocks << (sb->s_blocksize_bits - 9), - gfp_mask, true); + gfp_mask, 0); } extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); @@ -1529,19 +1525,6 @@ static inline int bdev_discard_alignment(struct block_device *bdev) return q->limits.discard_alignment; } -static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) -{ - if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1) - return 1; - - return 0; -} - -static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev) -{ - return 
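The BLKDEV_ZERO_NOUNMAP and BLKDEV_ZERO_NOFALLBACK flags above replace the old bool discard argument of the zeroout helpers. An illustrative wrapper (the function name is invented) that zeroes a range while keeping the blocks provisioned:

/* Sketch only: zero nr_sects sectors at 'sector' without deallocating
 * them; the device may still write explicit zeroes unless
 * BLKDEV_ZERO_NOFALLBACK is also passed.
 */
static int example_zero_range(struct block_device *bdev, sector_t sector,
			      sector_t nr_sects)
{
	return blkdev_issue_zeroout(bdev, sector, nr_sects, GFP_KERNEL,
				    BLKDEV_ZERO_NOUNMAP);
}
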
queue_discard_zeroes_data(bdev_get_queue(bdev)); -} - static inline unsigned int bdev_write_same(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); @@ -1726,6 +1709,7 @@ int kblockd_schedule_work(struct work_struct *work); int kblockd_schedule_work_on(int cpu, struct work_struct *work); int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); +int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #ifdef CONFIG_BLK_CGROUP /* @@ -1939,28 +1923,12 @@ static inline bool integrity_req_gap_front_merge(struct request *req, #endif /* CONFIG_BLK_DEV_INTEGRITY */ -/** - * struct blk_dax_ctl - control and output parameters for ->direct_access - * @sector: (input) offset relative to a block_device - * @addr: (output) kernel virtual address for @sector populated by driver - * @pfn: (output) page frame number for @addr populated by driver - * @size: (input) number of bytes requested - */ -struct blk_dax_ctl { - sector_t sector; - void *addr; - long size; - pfn_t pfn; -}; - struct block_device_operations { int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); int (*rw_page)(struct block_device *, sector_t, struct page *, bool); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); - long (*direct_access)(struct block_device *, sector_t, void **, pfn_t *, - long); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); /* ->media_changed() is DEPRECATED, use ->check_events() instead */ @@ -1979,9 +1947,6 @@ extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); -extern long bdev_direct_access(struct block_device *, struct blk_dax_ctl *); -extern int bdev_dax_supported(struct super_block *, int); -extern bool bdev_dax_capable(struct block_device *); #else /* CONFIG_BLOCK */ struct block_device; diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 909fc033173a7c893ffe7113f0e32568392b76ae..6bb38d76faf42a18437eb7565380b75a71096f78 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -35,6 +35,7 @@ struct bpf_map_ops { void *(*map_fd_get_ptr)(struct bpf_map *map, struct file *map_file, int fd); void (*map_fd_put_ptr)(void *ptr); + u32 (*map_gen_lookup)(struct bpf_map *map, struct bpf_insn *insn_buf); }; struct bpf_map { @@ -49,12 +50,7 @@ struct bpf_map { const struct bpf_map_ops *ops; struct work_struct work; atomic_t usercnt; -}; - -struct bpf_map_type_list { - struct list_head list_node; - const struct bpf_map_ops *ops; - enum bpf_map_type type; + struct bpf_map *inner_map_meta; }; /* function argument constraints */ @@ -167,12 +163,8 @@ struct bpf_verifier_ops { const struct bpf_insn *src, struct bpf_insn *dst, struct bpf_prog *prog); -}; - -struct bpf_prog_type_list { - struct list_head list_node; - const struct bpf_verifier_ops *ops; - enum bpf_prog_type type; + int (*test_run)(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); }; struct bpf_prog_aux { @@ -231,11 +223,21 @@ typedef unsigned long (*bpf_ctx_copy_t)(void *dst, const void *src, u64 bpf_event_output(struct bpf_map *map, u64 flags, void *meta, u64 meta_size, 
void *ctx, u64 ctx_size, bpf_ctx_copy_t ctx_copy); +int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); +int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr); + #ifdef CONFIG_BPF_SYSCALL DECLARE_PER_CPU(int, bpf_prog_active); -void bpf_register_prog_type(struct bpf_prog_type_list *tl); -void bpf_register_map_type(struct bpf_map_type_list *tl); +#define BPF_PROG_TYPE(_id, _ops) \ + extern const struct bpf_verifier_ops _ops; +#define BPF_MAP_TYPE(_id, _ops) \ + extern const struct bpf_map_ops _ops; +#include +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE struct bpf_prog *bpf_prog_get(u32 ufd); struct bpf_prog *bpf_prog_get_type(u32 ufd, enum bpf_prog_type type); @@ -275,6 +277,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value); int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file, void *key, void *value, u64 map_flags); void bpf_fd_array_map_clear(struct bpf_map *map); +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags); /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and * forced to use 'long' read/writes to try to atomically copy long counters. @@ -295,10 +299,6 @@ static inline void bpf_long_memcpy(void *dst, const void *src, u32 size) /* verify correctness of eBPF program */ int bpf_check(struct bpf_prog **fp, union bpf_attr *attr); #else -static inline void bpf_register_prog_type(struct bpf_prog_type_list *tl) -{ -} - static inline struct bpf_prog *bpf_prog_get(u32 ufd) { return ERR_PTR(-EOPNOTSUPP); diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h new file mode 100644 index 0000000000000000000000000000000000000000..03bf223f18be0001c80f4307ed4f521361984104 --- /dev/null +++ b/include/linux/bpf_types.h @@ -0,0 +1,36 @@ +/* internal file - do not include directly */ + +#ifdef CONFIG_NET +BPF_PROG_TYPE(BPF_PROG_TYPE_SOCKET_FILTER, sk_filter_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_CLS, tc_cls_act_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_SCHED_ACT, tc_cls_act_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_XDP, xdp_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SKB, cg_skb_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_CGROUP_SOCK, cg_sock_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_IN, lwt_inout_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_OUT, lwt_inout_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_LWT_XMIT, lwt_xmit_prog_ops) +#endif +#ifdef CONFIG_BPF_EVENTS +BPF_PROG_TYPE(BPF_PROG_TYPE_KPROBE, kprobe_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_TRACEPOINT, tracepoint_prog_ops) +BPF_PROG_TYPE(BPF_PROG_TYPE_PERF_EVENT, perf_event_prog_ops) +#endif + +BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_PROG_ARRAY, prog_array_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_PERF_EVENT_ARRAY, perf_event_array_map_ops) +#ifdef CONFIG_CGROUPS +BPF_MAP_TYPE(BPF_MAP_TYPE_CGROUP_ARRAY, cgroup_array_map_ops) +#endif +BPF_MAP_TYPE(BPF_MAP_TYPE_HASH, htab_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_HASH, htab_percpu_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_HASH, htab_lru_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_LRU_PERCPU_HASH, htab_lru_percpu_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_LPM_TRIE, trie_map_ops) +#ifdef CONFIG_PERF_EVENTS +BPF_MAP_TYPE(BPF_MAP_TYPE_STACK_TRACE, stack_map_ops) +#endif +BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY_OF_MAPS, array_of_maps_map_ops) +BPF_MAP_TYPE(BPF_MAP_TYPE_HASH_OF_MAPS, 
htab_of_maps_map_ops) diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h index a13b031dc6b807f38c0e7e687ba2cfbfe4c65fb7..5efb4db44e1ef3223d984296ccf1ca0737224d10 100644 --- a/include/linux/bpf_verifier.h +++ b/include/linux/bpf_verifier.h @@ -66,7 +66,10 @@ struct bpf_verifier_state_list { }; struct bpf_insn_aux_data { - enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ + union { + enum bpf_reg_type ptr_type; /* pointer type for load/store insns */ + struct bpf_map *map_ptr; /* pointer for call insn into lookup_elem */ + }; }; #define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */ diff --git a/include/linux/brcmphy.h b/include/linux/brcmphy.h index 55e517130311980b36ad628054043130f88626af..abcda9b458ab65143acb1fb28b50225adbec83fd 100644 --- a/include/linux/brcmphy.h +++ b/include/linux/brcmphy.h @@ -25,6 +25,9 @@ #define PHY_ID_BCM57780 0x03625d90 #define PHY_ID_BCM7250 0xae025280 +#define PHY_ID_BCM7260 0xae025190 +#define PHY_ID_BCM7268 0xae025090 +#define PHY_ID_BCM7271 0xae0253b0 #define PHY_ID_BCM7278 0xae0251a0 #define PHY_ID_BCM7364 0xae025260 #define PHY_ID_BCM7366 0x600d8490 diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 79591c3660cc1dc61e1e3f4b501c470161c41eb9..bd029e52ef5ee232f77ff28ef3a7b4ec391f1d71 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -196,8 +196,6 @@ void ll_rw_block(int, int, int, struct buffer_head * bh[]); int sync_dirty_buffer(struct buffer_head *bh); int __sync_dirty_buffer(struct buffer_head *bh, int op_flags); void write_dirty_buffer(struct buffer_head *bh, int op_flags); -int _submit_bh(int op, int op_flags, struct buffer_head *bh, - unsigned long bio_flags); int submit_bh(int, int, struct buffer_head *); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize); diff --git a/include/linux/bug.h b/include/linux/bug.h index 5828489309bbd22a255f11064418dc3a6b9366de..687b557fc5eb9fca13f4dadd3643eb769886da67 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -105,7 +105,7 @@ static inline int is_warning_bug(const struct bug_entry *bug) return bug->flags & BUGFLAG_WARNING; } -const struct bug_entry *find_bug(unsigned long bugaddr); +struct bug_entry *find_bug(unsigned long bugaddr); enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); diff --git a/include/linux/can/core.h b/include/linux/can/core.h index df08a41d5be5f26cfa4cdc74935f5eae7fa51385..c9a17bb1221c326f7569d423d93304fe241f72e0 100644 --- a/include/linux/can/core.h +++ b/include/linux/can/core.h @@ -5,7 +5,7 @@ * * Authors: Oliver Hartkopp * Urs Thuermann - * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. 
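The new bpf_types.h above is an x-macro list: each consumer defines BPF_PROG_TYPE()/BPF_MAP_TYPE() to whatever expansion it needs and then includes the header, as the bpf.h hunk does to emit extern declarations. A sketch of the complementary expansion, building an id-to-ops lookup table (the array name is illustrative):

/* Second expansion: index the verifier ops by program type id */
static const struct bpf_verifier_ops * const example_prog_ops[] = {
#define BPF_PROG_TYPE(_id, _ops) [_id] = &_ops,
#define BPF_MAP_TYPE(_id, _ops)
#include <linux/bpf_types.h>
#undef BPF_PROG_TYPE
#undef BPF_MAP_TYPE
};
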
* */ @@ -17,7 +17,7 @@ #include #include -#define CAN_VERSION "20120528" +#define CAN_VERSION "20170425" /* increment this number each time you change some user-space interface */ #define CAN_ABI_VERSION "9" @@ -45,12 +45,13 @@ struct can_proto { extern int can_proto_register(const struct can_proto *cp); extern void can_proto_unregister(const struct can_proto *cp); -int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, +int can_rx_register(struct net *net, struct net_device *dev, + canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data, char *ident, struct sock *sk); -extern void can_rx_unregister(struct net_device *dev, canid_t can_id, - canid_t mask, +extern void can_rx_unregister(struct net *net, struct net_device *dev, + canid_t can_id, canid_t mask, void (*func)(struct sk_buff *, void *), void *data); diff --git a/include/linux/can/dev/peak_canfd.h b/include/linux/can/dev/peak_canfd.h new file mode 100644 index 0000000000000000000000000000000000000000..46dceef2cfa6c0efa6d4a52df5467dba6b6a7c2c --- /dev/null +++ b/include/linux/can/dev/peak_canfd.h @@ -0,0 +1,308 @@ +/* + * CAN driver for PEAK System micro-CAN based adapters + * + * Copyright (C) 2003-2011 PEAK System-Technik GmbH + * Copyright (C) 2011-2013 Stephane Grosjean + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published + * by the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +#ifndef PUCAN_H +#define PUCAN_H + +/* uCAN commands opcodes list (low-order 10 bits) */ +#define PUCAN_CMD_NOP 0x000 +#define PUCAN_CMD_RESET_MODE 0x001 +#define PUCAN_CMD_NORMAL_MODE 0x002 +#define PUCAN_CMD_LISTEN_ONLY_MODE 0x003 +#define PUCAN_CMD_TIMING_SLOW 0x004 +#define PUCAN_CMD_TIMING_FAST 0x005 +#define PUCAN_CMD_SET_STD_FILTER 0x006 +#define PUCAN_CMD_RESERVED2 0x007 +#define PUCAN_CMD_FILTER_STD 0x008 +#define PUCAN_CMD_TX_ABORT 0x009 +#define PUCAN_CMD_WR_ERR_CNT 0x00a +#define PUCAN_CMD_SET_EN_OPTION 0x00b +#define PUCAN_CMD_CLR_DIS_OPTION 0x00c +#define PUCAN_CMD_RX_BARRIER 0x010 +#define PUCAN_CMD_END_OF_COLLECTION 0x3ff + +/* uCAN received messages list */ +#define PUCAN_MSG_CAN_RX 0x0001 +#define PUCAN_MSG_ERROR 0x0002 +#define PUCAN_MSG_STATUS 0x0003 +#define PUCAN_MSG_BUSLOAD 0x0004 + +#define PUCAN_MSG_CACHE_CRITICAL 0x0102 + +/* uCAN transmitted messages */ +#define PUCAN_MSG_CAN_TX 0x1000 + +/* uCAN command common header */ +struct __packed pucan_command { + __le16 opcode_channel; + u16 args[3]; +}; + +/* return the opcode from the opcode_channel field of a command */ +static inline u16 pucan_cmd_get_opcode(struct pucan_command *c) +{ + return le16_to_cpu(c->opcode_channel) & 0x3ff; +} + +#define PUCAN_TSLOW_BRP_BITS 10 +#define PUCAN_TSLOW_TSGEG1_BITS 8 +#define PUCAN_TSLOW_TSGEG2_BITS 7 +#define PUCAN_TSLOW_SJW_BITS 7 + +#define PUCAN_TSLOW_BRP_MASK ((1 << PUCAN_TSLOW_BRP_BITS) - 1) +#define PUCAN_TSLOW_TSEG1_MASK ((1 << PUCAN_TSLOW_TSGEG1_BITS) - 1) +#define PUCAN_TSLOW_TSEG2_MASK ((1 << PUCAN_TSLOW_TSGEG2_BITS) - 1) +#define PUCAN_TSLOW_SJW_MASK ((1 << PUCAN_TSLOW_SJW_BITS) - 1) + +/* uCAN TIMING_SLOW command fields */ +#define PUCAN_TSLOW_SJW_T(s, t) (((s) & PUCAN_TSLOW_SJW_MASK) | \ + ((!!(t)) << 7)) +#define PUCAN_TSLOW_TSEG2(t) 
((t) & PUCAN_TSLOW_TSEG2_MASK) +#define PUCAN_TSLOW_TSEG1(t) ((t) & PUCAN_TSLOW_TSEG1_MASK) +#define PUCAN_TSLOW_BRP(b) ((b) & PUCAN_TSLOW_BRP_MASK) + +struct __packed pucan_timing_slow { + __le16 opcode_channel; + + u8 ewl; /* Error Warning limit */ + u8 sjw_t; /* Sync Jump Width + Triple sampling */ + u8 tseg2; /* Timing SEGment 2 */ + u8 tseg1; /* Timing SEGment 1 */ + + __le16 brp; /* BaudRate Prescaler */ +}; + +#define PUCAN_TFAST_BRP_BITS 10 +#define PUCAN_TFAST_TSGEG1_BITS 5 +#define PUCAN_TFAST_TSGEG2_BITS 4 +#define PUCAN_TFAST_SJW_BITS 4 + +#define PUCAN_TFAST_BRP_MASK ((1 << PUCAN_TFAST_BRP_BITS) - 1) +#define PUCAN_TFAST_TSEG1_MASK ((1 << PUCAN_TFAST_TSGEG1_BITS) - 1) +#define PUCAN_TFAST_TSEG2_MASK ((1 << PUCAN_TFAST_TSGEG2_BITS) - 1) +#define PUCAN_TFAST_SJW_MASK ((1 << PUCAN_TFAST_SJW_BITS) - 1) + +/* uCAN TIMING_FAST command fields */ +#define PUCAN_TFAST_SJW(s) ((s) & PUCAN_TFAST_SJW_MASK) +#define PUCAN_TFAST_TSEG2(t) ((t) & PUCAN_TFAST_TSEG2_MASK) +#define PUCAN_TFAST_TSEG1(t) ((t) & PUCAN_TFAST_TSEG1_MASK) +#define PUCAN_TFAST_BRP(b) ((b) & PUCAN_TFAST_BRP_MASK) + +struct __packed pucan_timing_fast { + __le16 opcode_channel; + + u8 unused; + u8 sjw; /* Sync Jump Width */ + u8 tseg2; /* Timing SEGment 2 */ + u8 tseg1; /* Timing SEGment 1 */ + + __le16 brp; /* BaudRate Prescaler */ +}; + +/* uCAN FILTER_STD command fields */ +#define PUCAN_FLTSTD_ROW_IDX_BITS 6 + +struct __packed pucan_filter_std { + __le16 opcode_channel; + + __le16 idx; + __le32 mask; /* CAN-ID bitmask in idx range */ +}; + +#define PUCAN_FLTSTD_ROW_IDX_MAX ((1 << PUCAN_FLTSTD_ROW_IDX_BITS) - 1) + +/* uCAN SET_STD_FILTER command fields */ +struct __packed pucan_std_filter { + __le16 opcode_channel; + + u8 unused; + u8 idx; + __le32 mask; /* CAN-ID bitmask in idx range */ +}; + +/* uCAN TX_ABORT commands fields */ +#define PUCAN_TX_ABORT_FLUSH 0x0001 + +struct __packed pucan_tx_abort { + __le16 opcode_channel; + + __le16 flags; + u32 unused; +}; + +/* uCAN WR_ERR_CNT command fields */ +#define PUCAN_WRERRCNT_TE 0x4000 /* Tx error cntr write Enable */ +#define PUCAN_WRERRCNT_RE 0x8000 /* Rx error cntr write Enable */ + +struct __packed pucan_wr_err_cnt { + __le16 opcode_channel; + + __le16 sel_mask; + u8 tx_counter; /* Tx error counter new value */ + u8 rx_counter; /* Rx error counter new value */ + + u16 unused; +}; + +/* uCAN SET_EN/CLR_DIS _OPTION command fields */ +#define PUCAN_OPTION_ERROR 0x0001 +#define PUCAN_OPTION_BUSLOAD 0x0002 +#define PUCAN_OPTION_CANDFDISO 0x0004 + +struct __packed pucan_options { + __le16 opcode_channel; + + __le16 options; + u32 unused; +}; + +/* uCAN received messages global format */ +struct __packed pucan_msg { + __le16 size; + __le16 type; + __le32 ts_low; + __le32 ts_high; +}; + +/* uCAN flags for CAN/CANFD messages */ +#define PUCAN_MSG_SELF_RECEIVE 0x80 +#define PUCAN_MSG_ERROR_STATE_IND 0x40 /* error state indicator */ +#define PUCAN_MSG_BITRATE_SWITCH 0x20 /* bitrate switch */ +#define PUCAN_MSG_EXT_DATA_LEN 0x10 /* extended data length */ +#define PUCAN_MSG_SINGLE_SHOT 0x08 +#define PUCAN_MSG_LOOPED_BACK 0x04 +#define PUCAN_MSG_EXT_ID 0x02 +#define PUCAN_MSG_RTR 0x01 + +struct __packed pucan_rx_msg { + __le16 size; + __le16 type; + __le32 ts_low; + __le32 ts_high; + __le32 tag_low; + __le32 tag_high; + u8 channel_dlc; + u8 client; + __le16 flags; + __le32 can_id; + u8 d[0]; +}; + +/* uCAN error types */ +#define PUCAN_ERMSG_BIT_ERROR 0 +#define PUCAN_ERMSG_FORM_ERROR 1 +#define PUCAN_ERMSG_STUFF_ERROR 2 +#define PUCAN_ERMSG_OTHER_ERROR 3 +#define 
PUCAN_ERMSG_ERR_CNT_DEC 4 + +struct __packed pucan_error_msg { + __le16 size; + __le16 type; + __le32 ts_low; + __le32 ts_high; + u8 channel_type_d; + u8 code_g; + u8 tx_err_cnt; + u8 rx_err_cnt; +}; + +#define PUCAN_RX_BARRIER 0x10 +#define PUCAN_BUS_PASSIVE 0x20 +#define PUCAN_BUS_WARNING 0x40 +#define PUCAN_BUS_BUSOFF 0x80 + +struct __packed pucan_status_msg { + __le16 size; + __le16 type; + __le32 ts_low; + __le32 ts_high; + u8 channel_p_w_b; + u8 unused[3]; +}; + +static inline int pucan_status_is_rx_barrier(const struct pucan_status_msg *msg) +{ + return msg->channel_p_w_b & PUCAN_RX_BARRIER; +} + +static inline int pucan_status_is_passive(const struct pucan_status_msg *msg) +{ + return msg->channel_p_w_b & PUCAN_BUS_PASSIVE; +} + +static inline int pucan_status_is_warning(const struct pucan_status_msg *msg) +{ + return msg->channel_p_w_b & PUCAN_BUS_WARNING; +} + +static inline int pucan_status_is_busoff(const struct pucan_status_msg *msg) +{ + return msg->channel_p_w_b & PUCAN_BUS_BUSOFF; +} + +/* uCAN transmitted message format */ +#define PUCAN_MSG_CHANNEL_DLC(c, d) (((c) & 0xf) | ((d) << 4)) + +struct __packed pucan_tx_msg { + __le16 size; + __le16 type; + __le32 tag_low; + __le32 tag_high; + u8 channel_dlc; + u8 client; + __le16 flags; + __le32 can_id; + u8 d[0]; +}; + +/* build the cmd opcode_channel field with respect to the correct endianness */ +static inline __le16 pucan_cmd_opcode_channel(int index, int opcode) +{ + return cpu_to_le16(((index) << 12) | ((opcode) & 0x3ff)); +} + +/* return the channel number part from any received message channel_dlc field */ +static inline int pucan_msg_get_channel(const struct pucan_rx_msg *msg) +{ + return msg->channel_dlc & 0xf; +} + +/* return the dlc value from any received message channel_dlc field */ +static inline int pucan_msg_get_dlc(const struct pucan_rx_msg *msg) +{ + return msg->channel_dlc >> 4; +} + +static inline int pucan_ermsg_get_channel(const struct pucan_error_msg *msg) +{ + return msg->channel_type_d & 0x0f; +} + +static inline int pucan_stmsg_get_channel(const struct pucan_status_msg *msg) +{ + return msg->channel_p_w_b & 0x0f; +} + +#endif diff --git a/include/linux/can/platform/ti_hecc.h b/include/linux/can/platform/ti_hecc.h deleted file mode 100644 index a52f47ca6c8ad9c5159c34b4f568b5d84eccbd18..0000000000000000000000000000000000000000 --- a/include/linux/can/platform/ti_hecc.h +++ /dev/null @@ -1,44 +0,0 @@ -#ifndef _CAN_PLATFORM_TI_HECC_H -#define _CAN_PLATFORM_TI_HECC_H - -/* - * TI HECC (High End CAN Controller) driver platform header - * - * Copyright (C) 2009 Texas Instruments Incorporated - http://www.ti.com/ - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation version 2. - * - * This program is distributed as is WITHOUT ANY WARRANTY of any - * kind, whether express or implied; without even the implied warranty - * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. 
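pucan_cmd_opcode_channel() above packs a 4-bit channel index and a 10-bit opcode into one little-endian word, and the PUCAN_TSLOW_* helpers fill the individual timing fields. An illustrative command setup (all timing values below are made up):

/* Sketch only: build a TIMING_SLOW command for channel 0 */
static void example_fill_timing_slow(struct pucan_timing_slow *cmd)
{
	cmd->opcode_channel = pucan_cmd_opcode_channel(0, PUCAN_CMD_TIMING_SLOW);
	cmd->sjw_t = PUCAN_TSLOW_SJW_T(1, 0);	/* SJW = 1, no triple sampling */
	cmd->tseg1 = PUCAN_TSLOW_TSEG1(13);
	cmd->tseg2 = PUCAN_TSLOW_TSEG2(2);
	cmd->brp = cpu_to_le16(PUCAN_TSLOW_BRP(5));
}
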
- * - */ - -/** - * struct hecc_platform_data - HECC Platform Data - * - * @scc_hecc_offset: mostly 0 - should really never change - * @scc_ram_offset: SCC RAM offset - * @hecc_ram_offset: HECC RAM offset - * @mbx_offset: Mailbox RAM offset - * @int_line: Interrupt line to use - 0 or 1 - * @version: version for future use - * @transceiver_switch: platform specific callback fn for transceiver control - * - * Platform data structure to get all platform specific settings. - * this structure also accounts the fact that the IP may have different - * RAM and mailbox offsets for different SOC's - */ -struct ti_hecc_platform_data { - u32 scc_hecc_offset; - u32 scc_ram_offset; - u32 hecc_ram_offset; - u32 mbx_offset; - u32 int_line; - u32 version; - void (*transceiver_switch) (int); -}; -#endif /* !_CAN_PLATFORM_TI_HECC_H */ diff --git a/include/linux/ccp.h b/include/linux/ccp.h index c41b8d99dd0e7f352bc8e57e4e2ffc4dd08c63b7..3285c944194adcccdde5ae31745c939471260da6 100644 --- a/include/linux/ccp.h +++ b/include/linux/ccp.h @@ -123,6 +123,10 @@ enum ccp_aes_mode { CCP_AES_MODE_CFB, CCP_AES_MODE_CTR, CCP_AES_MODE_CMAC, + CCP_AES_MODE_GHASH, + CCP_AES_MODE_GCTR, + CCP_AES_MODE_GCM, + CCP_AES_MODE_GMAC, CCP_AES_MODE__LAST, }; @@ -137,6 +141,9 @@ enum ccp_aes_action { CCP_AES_ACTION_ENCRYPT, CCP_AES_ACTION__LAST, }; +/* Overloaded field */ +#define CCP_AES_GHASHAAD CCP_AES_ACTION_DECRYPT +#define CCP_AES_GHASHFINAL CCP_AES_ACTION_ENCRYPT /** * struct ccp_aes_engine - CCP AES operation @@ -181,6 +188,8 @@ struct ccp_aes_engine { struct scatterlist *cmac_key; /* K1/K2 cmac key required for * final cmac cmd */ u32 cmac_key_len; /* In bytes */ + + u32 aad_len; /* In bytes */ }; /***** XTS-AES engine *****/ @@ -249,6 +258,8 @@ enum ccp_sha_type { CCP_SHA_TYPE_1 = 1, CCP_SHA_TYPE_224, CCP_SHA_TYPE_256, + CCP_SHA_TYPE_384, + CCP_SHA_TYPE_512, CCP_SHA_TYPE__LAST, }; @@ -290,6 +301,60 @@ struct ccp_sha_engine { * final sha cmd */ }; +/***** 3DES engine *****/ +enum ccp_des3_mode { + CCP_DES3_MODE_ECB = 0, + CCP_DES3_MODE_CBC, + CCP_DES3_MODE_CFB, + CCP_DES3_MODE__LAST, +}; + +enum ccp_des3_type { + CCP_DES3_TYPE_168 = 1, + CCP_DES3_TYPE__LAST, +}; + +enum ccp_des3_action { + CCP_DES3_ACTION_DECRYPT = 0, + CCP_DES3_ACTION_ENCRYPT, + CCP_DES3_ACTION__LAST, +}; + +/** + * struct ccp_des3_engine - CCP 3DES operation + * @type: Type of 3DES operation + * @mode: cipher mode + * @action: 3DES operation (decrypt/encrypt) + * @key: key to be used for this 3DES operation + * @key_len: length of key (in bytes) + * @iv: IV to be used for this 3DES operation + * @iv_len: length in bytes of iv + * @src: input data to be used for this operation + * @src_len: length of input data used for this operation (in bytes) + * @dst: output data produced by this operation + * + * Variables required to be set when calling ccp_enqueue_cmd(): + * - type, mode, action, key, key_len, src, dst, src_len + * - iv, iv_len for any mode other than ECB + * + * The iv variable is used as both input and output. On completion of the + * 3DES operation the new IV overwrites the old IV. 
+ */ +struct ccp_des3_engine { + enum ccp_des3_type type; + enum ccp_des3_mode mode; + enum ccp_des3_action action; + + struct scatterlist *key; + u32 key_len; /* In bytes */ + + struct scatterlist *iv; + u32 iv_len; /* In bytes */ + + struct scatterlist *src, *dst; + u64 src_len; /* In bytes */ +}; + /***** RSA engine *****/ /** * struct ccp_rsa_engine - CCP RSA operation @@ -539,7 +604,7 @@ struct ccp_ecc_engine { enum ccp_engine { CCP_ENGINE_AES = 0, CCP_ENGINE_XTS_AES_128, - CCP_ENGINE_RSVD1, + CCP_ENGINE_DES3, CCP_ENGINE_SHA, CCP_ENGINE_RSA, CCP_ENGINE_PASSTHRU, @@ -587,6 +652,7 @@ struct ccp_cmd { union { struct ccp_aes_engine aes; struct ccp_xts_aes_engine xts; + struct ccp_des3_engine des3; struct ccp_sha_engine sha; struct ccp_rsa_engine rsa; struct ccp_passthru_engine passthru; diff --git a/include/linux/cdev.h b/include/linux/cdev.h index f8763615a5f2dc87073cbaa49e64a9e6a1ebe5e4..408bc09ce497bb14fdd058b11debbdda7c22fa78 100644 --- a/include/linux/cdev.h +++ b/include/linux/cdev.h @@ -4,6 +4,7 @@ #include #include #include +#include struct file_operations; struct inode; @@ -26,6 +27,10 @@ void cdev_put(struct cdev *p); int cdev_add(struct cdev *, dev_t, unsigned); +void cdev_set_parent(struct cdev *p, struct kobject *kobj); +int cdev_device_add(struct cdev *cdev, struct device *dev); +void cdev_device_del(struct cdev *cdev, struct device *dev); + void cdev_del(struct cdev *); void cd_forget(struct inode *); diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index ae2f66833762cd7d63bd77eafb9aed13b6ab2308..fd8b2953c78f85539104fd3741e82a9f37e1e1b5 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -105,8 +105,10 @@ static inline u64 ceph_sanitize_features(u64 features) */ #define CEPH_FEATURES_SUPPORTED_DEFAULT \ (CEPH_FEATURE_NOSRCADDR | \ + CEPH_FEATURE_FLOCK | \ CEPH_FEATURE_SUBSCRIBE2 | \ CEPH_FEATURE_RECONNECT_SEQ | \ + CEPH_FEATURE_DIRLAYOUTHASH | \ CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_OSDENC | \ @@ -114,11 +116,13 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_MSG_AUTH | \ CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_REPLY_CREATE_INODE | \ + CEPH_FEATURE_MDSENC | \ CEPH_FEATURE_OSDHASHPSPOOL | \ CEPH_FEATURE_OSD_CACHEPOOL | \ CEPH_FEATURE_CRUSH_V2 | \ CEPH_FEATURE_EXPORT_PEER | \ CEPH_FEATURE_OSDMAP_ENC | \ + CEPH_FEATURE_MDS_INLINE_DATA | \ CEPH_FEATURE_CRUSH_TUNABLES3 | \ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ CEPH_FEATURE_MSGR_KEEPALIVE2 | \ diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index f4b2ee18f38cbd51d2ded16380e15da7697d9f89..ad078ebe25d6beaad4a80cc906be78c3dd26c06e 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -365,6 +365,19 @@ extern const char *ceph_mds_op_name(int op); #define CEPH_READDIR_FRAG_END (1<<0) #define CEPH_READDIR_FRAG_COMPLETE (1<<8) #define CEPH_READDIR_HASH_ORDER (1<<9) +#define CEPH_READDIR_OFFSET_HASH (1<<10) + +/* + * open request flags + */ +#define CEPH_O_RDONLY 00000000 +#define CEPH_O_WRONLY 00000001 +#define CEPH_O_RDWR 00000002 +#define CEPH_O_CREAT 00000100 +#define CEPH_O_EXCL 00000200 +#define CEPH_O_TRUNC 00001000 +#define CEPH_O_DIRECTORY 00200000 +#define CEPH_O_NOFOLLOW 00400000 union ceph_mds_request_args { struct { @@ -384,6 +397,7 @@ union ceph_mds_request_args { __le32 max_entries; /* how many dentries to grab */ __le32 max_bytes; __le16 flags; + __le32 offset_hash; } __attribute__ ((packed)) readdir; struct { __le32 mode; diff --git 
a/include/linux/ceph/cls_lock_client.h b/include/linux/ceph/cls_lock_client.h index 84884d8d4710bc568b1590e2ede3775c863e63a1..0594d3bba774c5a78c0bc7e4fde5fbd741bd1c17 100644 --- a/include/linux/ceph/cls_lock_client.h +++ b/include/linux/ceph/cls_lock_client.h @@ -37,6 +37,11 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc, struct ceph_object_locator *oloc, char *lock_name, char *cookie, struct ceph_entity_name *locker); +int ceph_cls_set_cookie(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, u8 type, char *old_cookie, + char *tag, char *new_cookie); void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers); diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 88cd5dc8e238a2fa7e8c66a034a8e5b0ae248f21..3229ae6c78469019f0eb747c4e09a10137e26b0f 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -161,7 +162,7 @@ struct ceph_client { * dirtied. */ struct ceph_snap_context { - atomic_t nref; + refcount_t nref; u64 seq; u32 num_snaps; u64 snaps[]; @@ -262,10 +263,7 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client); extern void ceph_destroy_options(struct ceph_options *opt); extern int ceph_compare_options(struct ceph_options *new_opt, struct ceph_client *client); -extern struct ceph_client *ceph_create_client(struct ceph_options *opt, - void *private, - u64 supported_features, - u64 required_features); +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private); struct ceph_entity_addr *ceph_client_addr(struct ceph_client *client); u64 ceph_client_gid(struct ceph_client *client); extern void ceph_destroy_client(struct ceph_client *client); diff --git a/include/linux/ceph/mdsmap.h b/include/linux/ceph/mdsmap.h index 8ed5dc505fbb2e0672efd81089d1e35caf8e062c..d5f783f3226a8deb43f9336e7fe3c1b955b0e137 100644 --- a/include/linux/ceph/mdsmap.h +++ b/include/linux/ceph/mdsmap.h @@ -25,6 +25,7 @@ struct ceph_mdsmap { u32 m_session_autoclose; /* seconds */ u64 m_max_file_size; u32 m_max_mds; /* size of m_addr, m_state arrays */ + int m_num_mds; struct ceph_mds_info *m_info; /* which object pools file data can be stored in */ @@ -40,7 +41,7 @@ struct ceph_mdsmap { static inline struct ceph_entity_addr * ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) { - if (w >= m->m_max_mds) + if (w >= m->m_num_mds) return NULL; return &m->m_info[w].addr; } @@ -48,14 +49,14 @@ ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) { BUG_ON(w < 0); - if (w >= m->m_max_mds) + if (w >= m->m_num_mds) return CEPH_MDS_STATE_DNE; return m->m_info[w].state; } static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) { - if (w >= 0 && w < m->m_max_mds) + if (w >= 0 && w < m->m_num_mds) return m->m_info[w].laggy; return false; } diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index c125b5d9e13ceddacd921286f19133607835dee1..85650b415e73ff38487a6b2e6be7253466411a14 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -27,7 +28,7 @@ typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *); /* a given osd we're communicating with */ struct ceph_osd { - atomic_t o_ref; + refcount_t o_ref; struct ceph_osd_client *o_osdc; int o_osd; int o_incarnation; @@ 
-186,12 +187,12 @@ struct ceph_osd_request { struct timespec r_mtime; /* ditto */ u64 r_data_offset; /* ditto */ bool r_linger; /* don't resend on failure */ + bool r_abort_on_full; /* return ENOSPC when full */ /* internal */ unsigned long r_stamp; /* jiffies, send or check time */ unsigned long r_start_stamp; /* jiffies */ int r_attempts; - struct ceph_eversion r_replay_version; /* aka reassert_version */ u32 r_last_force_resend; u32 r_map_dne_bound; @@ -266,6 +267,7 @@ struct ceph_osd_client { struct rb_root osds; /* osds */ struct list_head osd_lru; /* idle osds */ spinlock_t osd_lru_lock; + u32 epoch_barrier; struct ceph_osd homeless_osd; atomic64_t last_tid; /* tid of last request */ u64 last_linger_id; @@ -304,6 +306,7 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg); extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg); +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb); extern void osd_req_op_init(struct ceph_osd_request *osd_req, unsigned int which, u16 opcode, u32 flags); diff --git a/include/linux/ceph/pagelist.h b/include/linux/ceph/pagelist.h index 13d71fe18b0cf54f6097242fd08b9a30c6c48bbe..75a7db21457de5e6fce1e75cfca3749ff539057c 100644 --- a/include/linux/ceph/pagelist.h +++ b/include/linux/ceph/pagelist.h @@ -2,7 +2,7 @@ #define __FS_CEPH_PAGELIST_H #include -#include +#include #include #include @@ -13,7 +13,7 @@ struct ceph_pagelist { size_t room; struct list_head free_list; size_t num_pages_free; - atomic_t refcnt; + refcount_t refcnt; }; struct ceph_pagelist_cursor { @@ -30,7 +30,7 @@ static inline void ceph_pagelist_init(struct ceph_pagelist *pl) pl->room = 0; INIT_LIST_HEAD(&pl->free_list); pl->num_pages_free = 0; - atomic_set(&pl->refcnt, 1); + refcount_set(&pl->refcnt, 1); } extern void ceph_pagelist_release(struct ceph_pagelist *pl); diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index 6a3f850cababb6f96130503014d83d256f2b9213..21745946cae154f53cd87311e9350465388f70a5 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -106,9 +107,6 @@ struct cgroup_subsys_state { /* reference count - access via css_[try]get() and css_put() */ struct percpu_ref refcnt; - /* PI: the parent css */ - struct cgroup_subsys_state *parent; - /* siblings list anchored at the parent's ->children */ struct list_head sibling; struct list_head children; @@ -138,6 +136,12 @@ struct cgroup_subsys_state { /* percpu_ref killing and RCU release */ struct rcu_head rcu_head; struct work_struct destroy_work; + + /* + * PI: the parent css. Placed here for cache proximity to following + * fields of the containing structure. 
+ */ + struct cgroup_subsys_state *parent; }; /* @@ -156,7 +160,7 @@ struct css_set { struct cgroup_subsys_state *subsys[CGROUP_SUBSYS_COUNT]; /* reference count */ - atomic_t refcount; + refcount_t refcount; /* the default cgroup associated with this css_set */ struct cgroup *dfl_cgrp; diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index af9c86e958bdad3be90cfb5f19aad99ede457339..ed2573e149faf070714e04f8160f7056b2fb1d3e 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -17,11 +17,11 @@ #include #include #include -#include #include #include #include #include +#include #include @@ -661,7 +661,7 @@ static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ struct cgroup_namespace { - atomic_t count; + refcount_t count; struct ns_common ns; struct user_namespace *user_ns; struct ucounts *ucounts; @@ -696,12 +696,12 @@ copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, static inline void get_cgroup_ns(struct cgroup_namespace *ns) { if (ns) - atomic_inc(&ns->count); + refcount_inc(&ns->count); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { - if (ns && atomic_dec_and_test(&ns->count)) + if (ns && refcount_dec_and_test(&ns->count)) free_cgroup_ns(ns); } diff --git a/include/linux/clk.h b/include/linux/clk.h index e9d36b3e49de5b1bfa0f01277c093222e1ae2a6d..024cd07870d0e240c48f1eeb2793659150a57eef 100644 --- a/include/linux/clk.h +++ b/include/linux/clk.h @@ -132,8 +132,8 @@ int clk_get_phase(struct clk *clk); * @q: clk compared against p * * Returns true if the two struct clk pointers both point to the same hardware - * clock node. Put differently, returns true if struct clk *p and struct clk *q - * share the same struct clk_core object. + * clock node. Put differently, returns true if @p and @q + * share the same &struct clk_core object. * * Returns false otherwise. Note that two NULL clks are treated as matching. */ diff --git a/include/linux/clk/tegra.h b/include/linux/clk/tegra.h index 7007a5f480802ebcad8951b113f976801981a063..d23c9cf26993b5f4518710c2495f426cb48abf85 100644 --- a/include/linux/clk/tegra.h +++ b/include/linux/clk/tegra.h @@ -125,5 +125,8 @@ extern void tegra210_xusb_pll_hw_control_enable(void); extern void tegra210_xusb_pll_hw_sequence_start(void); extern void tegra210_sata_pll_hw_control_enable(void); extern void tegra210_sata_pll_hw_sequence_start(void); +extern void tegra210_set_sata_pll_seq_sw(bool state); +extern void tegra210_put_utmipll_in_iddq(void); +extern void tegra210_put_utmipll_out_iddq(void); #endif /* __LINUX_CLK_TEGRA_H_ */ diff --git a/include/linux/clk/ti.h b/include/linux/clk/ti.h index 6110fe09ed18c289881e63d3b8e1ecb62dfb08c9..d18da839b81013b0624bfcddb11451460e39bee9 100644 --- a/include/linux/clk/ti.h +++ b/include/linux/clk/ti.h @@ -18,6 +18,18 @@ #include #include +/** + * struct clk_omap_reg - OMAP register declaration + * @offset: offset from the master IP module base address + * @index: index of the master IP module + */ +struct clk_omap_reg { + void __iomem *ptr; + u16 offset; + u8 index; + u8 flags; +}; + /** * struct dpll_data - DPLL registers and integration data * @mult_div1_reg: register containing the DPLL M and N bitfields @@ -67,12 +79,12 @@ * can be placed into read-only space. 
*/ struct dpll_data { - void __iomem *mult_div1_reg; + struct clk_omap_reg mult_div1_reg; u32 mult_mask; u32 div1_mask; struct clk_hw *clk_bypass; struct clk_hw *clk_ref; - void __iomem *control_reg; + struct clk_omap_reg control_reg; u32 enable_mask; unsigned long last_rounded_rate; u16 last_rounded_m; @@ -84,8 +96,8 @@ struct dpll_data { u16 max_divider; unsigned long max_rate; u8 modes; - void __iomem *autoidle_reg; - void __iomem *idlest_reg; + struct clk_omap_reg autoidle_reg; + struct clk_omap_reg idlest_reg; u32 autoidle_mask; u32 freqsel_mask; u32 idlest_mask; @@ -113,10 +125,10 @@ struct clk_hw_omap; */ struct clk_hw_omap_ops { void (*find_idlest)(struct clk_hw_omap *oclk, - void __iomem **idlest_reg, + struct clk_omap_reg *idlest_reg, u8 *idlest_bit, u8 *idlest_val); void (*find_companion)(struct clk_hw_omap *oclk, - void __iomem **other_reg, + struct clk_omap_reg *other_reg, u8 *other_bit); void (*allow_idle)(struct clk_hw_omap *oclk); void (*deny_idle)(struct clk_hw_omap *oclk); @@ -129,8 +141,6 @@ struct clk_hw_omap_ops { * @enable_bit: bitshift to write to enable/disable the clock (see @enable_reg) * @flags: see "struct clk.flags possibilities" above * @clksel_reg: for clksel clks, register va containing src/divisor select - * @clksel_mask: bitmask in @clksel_reg for the src/divisor selector - * @clksel: for clksel clks, pointer to struct clksel for this clock * @dpll_data: for DPLLs, pointer to struct dpll_data for this clock * @clkdm_name: clockdomain name that this clock is contained in * @clkdm: pointer to struct clockdomain, resolved from @clkdm_name at runtime @@ -141,12 +151,10 @@ struct clk_hw_omap { struct list_head node; unsigned long fixed_rate; u8 fixed_div; - void __iomem *enable_reg; + struct clk_omap_reg enable_reg; u8 enable_bit; u8 flags; - void __iomem *clksel_reg; - u32 clksel_mask; - const struct clksel *clksel; + struct clk_omap_reg clksel_reg; struct dpll_data *dpll_data; const char *clkdm_name; struct clockdomain *clkdm; @@ -172,7 +180,6 @@ struct clk_hw_omap { * should be used. This is a temporary solution - a better approach * would be to associate clock type-specific data with the clock, * similar to the struct dpll_data approach. - * MEMMAP_ADDRESSING: Use memmap addressing to access clock registers. */ #define ENABLE_REG_32BIT (1 << 0) /* Use 32-bit access */ #define CLOCK_IDLE_CONTROL (1 << 1) @@ -180,7 +187,6 @@ struct clk_hw_omap { #define ENABLE_ON_INIT (1 << 3) /* Enable upon framework init */ #define INVERT_ENABLE (1 << 4) /* 0 enables, 1 disables */ #define CLOCK_CLKOUTX2 (1 << 5) -#define MEMMAP_ADDRESSING (1 << 6) /* CM_CLKEN_PLL*.EN* bit values - not all are available for every DPLL */ #define DPLL_LOW_POWER_STOP 0x1 @@ -201,22 +207,13 @@ enum { CLK_MAX_MEMMAPS }; -/** - * struct clk_omap_reg - OMAP register declaration - * @offset: offset from the master IP module base address - * @index: index of the master IP module - */ -struct clk_omap_reg { - u16 offset; - u16 index; -}; - /** * struct ti_clk_ll_ops - low-level ops for clocks * @clk_readl: pointer to register read function * @clk_writel: pointer to register write function * @clkdm_clk_enable: pointer to clockdomain enable function * @clkdm_clk_disable: pointer to clockdomain disable function + * @clkdm_lookup: pointer to clockdomain lookup function * @cm_wait_module_ready: pointer to CM module wait ready function * @cm_split_idlest_reg: pointer to CM module function to split idlest reg * @@ -227,20 +224,20 @@ struct clk_omap_reg { * operations not provided directly by clock drivers. 
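+ *
+ * Illustrative sketch (hypothetical platform code; clk_memmaps[] is a
+ * made-up array of ioremapped PRCM instance bases): a read accessor
+ * keyed by struct clk_omap_reg could look like
+ *
+ *	static u32 omap_clk_readl(const struct clk_omap_reg *reg)
+ *	{
+ *		return readl_relaxed(clk_memmaps[reg->index] + reg->offset);
+ *	}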
*/ struct ti_clk_ll_ops { - u32 (*clk_readl)(void __iomem *reg); - void (*clk_writel)(u32 val, void __iomem *reg); + u32 (*clk_readl)(const struct clk_omap_reg *reg); + void (*clk_writel)(u32 val, const struct clk_omap_reg *reg); int (*clkdm_clk_enable)(struct clockdomain *clkdm, struct clk *clk); int (*clkdm_clk_disable)(struct clockdomain *clkdm, struct clk *clk); + struct clockdomain * (*clkdm_lookup)(const char *name); int (*cm_wait_module_ready)(u8 part, s16 prcm_mod, u16 idlest_reg, u8 idlest_shift); - int (*cm_split_idlest_reg)(void __iomem *idlest_reg, s16 *prcm_inst, - u8 *idlest_reg_id); + int (*cm_split_idlest_reg)(struct clk_omap_reg *idlest_reg, + s16 *prcm_inst, u8 *idlest_reg_id); }; #define to_clk_hw_omap(_hw) container_of(_hw, struct clk_hw_omap, hw) -void omap2_init_clk_clkdm(struct clk_hw *clk); int omap2_clk_disable_autoidle_all(void); int omap2_clk_enable_autoidle_all(void); int omap2_clk_allow_idle(struct clk *clk); diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 6d7edc3082f98466566cd1ae67640d7e0bd976d3..acc9ce05e5f022cdee1dd9e22019e6461ddf6475 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -182,7 +182,6 @@ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *e extern void clockevents_register_device(struct clock_event_device *dev); extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu); -extern void clockevents_config(struct clock_event_device *dev, u32 freq); extern void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta); diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index cfc75848a35d2c58ed85720fcd344fdc2c6f04ad..f2b10d9ebd04e7d49d032b5cb097e7f2ae630e56 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -120,7 +120,7 @@ struct clocksource { #define CLOCK_SOURCE_RESELECT 0x100 /* simplify initialization of mask field */ -#define CLOCKSOURCE_MASK(bits) (u64)((bits) < 64 ? 
((1ULL<<(bits))-1) : -1) +#define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0) static inline u32 clocksource_freq2mult(u32 freq, u32 shift_constant, u64 from) { diff --git a/include/linux/cma.h b/include/linux/cma.h index 03f32d0bd1d8a7be5a0fddbc058e0a7f6b059efe..3e8fbf5a5c73ae1b0a40308de1158df65988c2fa 100644 --- a/include/linux/cma.h +++ b/include/linux/cma.h @@ -21,15 +21,19 @@ struct cma; extern unsigned long totalcma_pages; extern phys_addr_t cma_get_base(const struct cma *cma); extern unsigned long cma_get_size(const struct cma *cma); +extern const char *cma_get_name(const struct cma *cma); extern int __init cma_declare_contiguous(phys_addr_t base, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, struct cma **res_cma); + bool fixed, const char *name, struct cma **res_cma); extern int cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, + const char *name, struct cma **res_cma); extern struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, gfp_t gfp_mask); extern bool cma_release(struct cma *cma, const struct page *pages, unsigned int count); + +extern int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data); #endif diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index 5b8721efa948ec7d93fa7caeb79a535c51e8d69b..31e4e1f1547cc1698514d854898110446ba4a4f7 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -15,7 +15,6 @@ struct venus_comm { struct list_head vc_processing; int vc_inuse; struct super_block *vc_sb; - struct backing_dev_info bdi; struct mutex vc_mutex; }; diff --git a/include/linux/compat.h b/include/linux/compat.h index aef47be2a5c1a3fd3ea75161f5bc93627772515c..1c5f3152cbb57796caf1a587103ed3d092324a04 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -295,6 +295,13 @@ struct compat_old_sigaction { }; #endif +struct compat_keyctl_kdf_params { + compat_uptr_t hashname; + compat_uptr_t otherinfo; + __u32 otherinfolen; + __u32 __spare[8]; +}; + struct compat_statfs; struct compat_statfs64; struct compat_old_linux_dirent; @@ -528,11 +535,6 @@ asmlinkage long compat_sys_old_readdir(unsigned int fd, asmlinkage long compat_sys_getdents(unsigned int fd, struct compat_linux_dirent __user *dirent, unsigned int count); -#ifdef __ARCH_WANT_COMPAT_SYS_GETDENTS64 -asmlinkage long compat_sys_getdents64(unsigned int fd, - struct linux_dirent64 __user *dirent, - unsigned int count); -#endif asmlinkage long compat_sys_vmsplice(int fd, const struct compat_iovec __user *, unsigned int nr_segs, unsigned int flags); asmlinkage long compat_sys_open(const char __user *filename, int flags, @@ -723,6 +725,8 @@ asmlinkage long compat_sys_sched_rr_get_interval(compat_pid_t pid, asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32, int, const char __user *); +asmlinkage long compat_sys_arch_prctl(int option, unsigned long arg2); + /* * For most but not all architectures, "am I in a compat syscall?" and * "am I a compat task?" are the same question. 
For architectures on which diff --git a/include/linux/console.h b/include/linux/console.h index 5949d185558990daed3b4c76bbdfdad20fb34608..b8920a031a3e357d71101905e3396ba245160781 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -212,4 +212,6 @@ extern bool vgacon_text_force(void); static inline bool vgacon_text_force(void) { return false; } #endif +extern void console_init(void); + #endif /* _LINUX_CONSOLE_H */ diff --git a/include/linux/coresight.h b/include/linux/coresight.h index 2a5982c37dfb660d344777c79dcde044ca304f1c..035c16c9a5051afccd8e77db5417335866812c8d 100644 --- a/include/linux/coresight.h +++ b/include/linux/coresight.h @@ -201,7 +201,7 @@ struct coresight_ops_sink { void *sink_config); unsigned long (*reset_buffer)(struct coresight_device *csdev, struct perf_output_handle *handle, - void *sink_config, bool *lost); + void *sink_config); void (*update_buffer)(struct coresight_device *csdev, struct perf_output_handle *handle, void *sink_config); diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 87165f06a3079dcfe507674e495859c5627f74ca..a5ce0bbeadb5de5a1d28bb085258d6e4ae94abbf 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -120,6 +120,13 @@ struct cpufreq_policy { bool fast_switch_possible; bool fast_switch_enabled; + /* + * Preferred average time interval between consecutive invocations of + * the driver to set the frequency for this policy. To be set by the + * scaling driver (0, which is the default, means no preference). + */ + unsigned int transition_delay_us; + /* Cached frequency lookup from cpufreq_driver_resolve_freq. */ unsigned int cached_target_freq; int cached_resolved_idx; diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 62d240e962f036ad72d1ddf96933dd41f41acaf2..0f2a80377520ec1d12299c78775d7162c63ac8c2 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -94,6 +94,7 @@ enum cpuhp_state { CPUHP_AP_ARM_VFP_STARTING, CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING, + CPUHP_AP_PERF_ARM_ACPI_STARTING, CPUHP_AP_PERF_ARM_STARTING, CPUHP_AP_ARM_L2X0_STARTING, CPUHP_AP_ARM_ARCH_TIMER_STARTING, @@ -137,6 +138,7 @@ enum cpuhp_state { CPUHP_AP_PERF_ARM_CCN_ONLINE, CPUHP_AP_PERF_ARM_L2X0_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE, + CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_ONLINE_DYN, diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 96f1e88b767c0058bee7f3dbcc0d175e68735cfe..2404ad238c0b8c0309e8c1480339a19967968edf 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -40,9 +40,9 @@ extern int nr_cpu_ids; #ifdef CONFIG_CPUMASK_OFFSTACK /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. 
*/ -#define nr_cpumask_bits nr_cpu_ids +#define nr_cpumask_bits ((unsigned int)nr_cpu_ids) #else -#define nr_cpumask_bits NR_CPUS +#define nr_cpumask_bits ((unsigned int)NR_CPUS) #endif /* @@ -667,6 +667,11 @@ void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); +static inline bool cpumask_available(cpumask_var_t mask) +{ + return mask != NULL; +} + #else typedef struct cpumask cpumask_var_t[1]; @@ -708,6 +713,11 @@ static inline void free_cpumask_var(cpumask_var_t mask) static inline void free_bootmem_cpumask_var(cpumask_var_t mask) { } + +static inline bool cpumask_available(cpumask_var_t mask) +{ + return true; +} #endif /* CONFIG_CPUMASK_OFFSTACK */ /* It's common to want to use cpu_all_mask in struct member initializers, diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 611fce58d67039387215073978c02fda572c93fd..119a3f9604b0d90b499bdbd6627d30fbc936469b 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -42,7 +42,7 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); -extern void cpuset_update_active_cpus(bool cpu_online); +extern void cpuset_update_active_cpus(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -155,7 +155,7 @@ static inline bool cpusets_enabled(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} -static inline void cpuset_update_active_cpus(bool cpu_online) +static inline void cpuset_update_active_cpus(void) { partition_sched_domains(1, NULL, NULL); } diff --git a/include/linux/crash_core.h b/include/linux/crash_core.h new file mode 100644 index 0000000000000000000000000000000000000000..541a197ba4a2107af28dfed68a2625e7bebf8f7c --- /dev/null +++ b/include/linux/crash_core.h @@ -0,0 +1,69 @@ +#ifndef LINUX_CRASH_CORE_H +#define LINUX_CRASH_CORE_H + +#include +#include +#include + +#define CRASH_CORE_NOTE_NAME "CORE" +#define CRASH_CORE_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) +#define CRASH_CORE_NOTE_NAME_BYTES ALIGN(sizeof(CRASH_CORE_NOTE_NAME), 4) +#define CRASH_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) + +#define CRASH_CORE_NOTE_BYTES ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + CRASH_CORE_NOTE_NAME_BYTES + \ + CRASH_CORE_NOTE_DESC_BYTES) + +#define VMCOREINFO_BYTES (4096) +#define VMCOREINFO_NOTE_NAME "VMCOREINFO" +#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) +#define VMCOREINFO_NOTE_SIZE ((CRASH_CORE_NOTE_HEAD_BYTES * 2) + \ + VMCOREINFO_NOTE_NAME_BYTES + \ + VMCOREINFO_BYTES) + +typedef u32 note_buf_t[CRASH_CORE_NOTE_BYTES/4]; + +void crash_save_vmcoreinfo(void); +void arch_crash_save_vmcoreinfo(void); +__printf(1, 2) +void vmcoreinfo_append_str(const char *fmt, ...); +phys_addr_t paddr_vmcoreinfo_note(void); + +#define VMCOREINFO_OSRELEASE(value) \ + vmcoreinfo_append_str("OSRELEASE=%s\n", value) +#define VMCOREINFO_PAGESIZE(value) \ + vmcoreinfo_append_str("PAGESIZE=%ld\n", value) +#define VMCOREINFO_SYMBOL(name) \ + vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) +#define VMCOREINFO_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned long)sizeof(name)) +#define VMCOREINFO_STRUCT_SIZE(name) \ + vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ + (unsigned 
long)sizeof(struct name)) +#define VMCOREINFO_OFFSET(name, field) \ + vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ + (unsigned long)offsetof(struct name, field)) +#define VMCOREINFO_LENGTH(name, value) \ + vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) +#define VMCOREINFO_NUMBER(name) \ + vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) +#define VMCOREINFO_CONFIG(name) \ + vmcoreinfo_append_str("CONFIG_%s=y\n", #name) + +extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +extern size_t vmcoreinfo_size; +extern size_t vmcoreinfo_max_size; + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len); +void final_note(Elf_Word *buf); + +int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); +int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, + unsigned long long *crash_size, unsigned long long *crash_base); + +#endif /* LINUX_CRASH_CORE_H */ diff --git a/include/linux/crypto.h b/include/linux/crypto.h index c0b0cf3d2d2fac2ec58ac9f0b39f7fd30b552ffc..84da9978e9516a591fea2c1ad33af4c7db98a7b5 100644 --- a/include/linux/crypto.h +++ b/include/linux/crypto.h @@ -123,7 +123,7 @@ /* * Miscellaneous stuff. */ -#define CRYPTO_MAX_ALG_NAME 64 +#define CRYPTO_MAX_ALG_NAME 128 /* * The macro CRYPTO_MINALIGN_ATTR (along with the void * type in the actual diff --git a/include/linux/cryptohash.h b/include/linux/cryptohash.h index 3252799832cf933cba189710fb443cdf89061902..df4d3e943d288ed7acd8b27defe890cdd345c8f4 100644 --- a/include/linux/cryptohash.h +++ b/include/linux/cryptohash.h @@ -10,9 +10,4 @@ void sha_init(__u32 *buf); void sha_transform(__u32 *digest, const char *data, __u32 *W); -#define MD5_DIGEST_WORDS 4 -#define MD5_MESSAGE_BYTES 64 - -void md5_transform(__u32 *hash, __u32 const *in); - #endif diff --git a/include/linux/dax.h b/include/linux/dax.h index d8a3dc042e1cbb81f1c3a906ee6fc0d28fead8f8..00ebac854bb79f16ed5b04d74c6dafa641de0bb7 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -7,6 +7,54 @@ #include struct iomap_ops; +struct dax_device; +struct dax_operations { + /* + * direct_access: translate a device-relative + * logical-page-offset into an absolute physical pfn. Return the + * number of pages available for DAX at that pfn. 
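+	 *
+	 * Illustrative sketch (hypothetical pmem-style driver; the
+	 * pmem_device fields are made up): an implementation could
+	 * translate the offset and report the remaining DAX-capable
+	 * range as
+	 *
+	 *	static long pmem_dax_direct_access(struct dax_device *dax_dev,
+	 *			pgoff_t pgoff, long nr_pages, void **kaddr,
+	 *			pfn_t *pfn)
+	 *	{
+	 *		struct pmem_device *pmem = dax_get_private(dax_dev);
+	 *
+	 *		*kaddr = pmem->virt_addr + PFN_PHYS(pgoff);
+	 *		*pfn = phys_to_pfn_t(pmem->phys_addr + PFN_PHYS(pgoff),
+	 *				     pmem->pfn_flags);
+	 *		return PHYS_PFN(pmem->size - PFN_PHYS(pgoff));
+	 *	}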
+ */ + long (*direct_access)(struct dax_device *, pgoff_t, long, + void **, pfn_t *); +}; + +int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); +#if IS_ENABLED(CONFIG_FS_DAX) +int __bdev_dax_supported(struct super_block *sb, int blocksize); +static inline int bdev_dax_supported(struct super_block *sb, int blocksize) +{ + return __bdev_dax_supported(sb, blocksize); +} +#else +static inline int bdev_dax_supported(struct super_block *sb, int blocksize) +{ + return -EOPNOTSUPP; +} +#endif + +#if IS_ENABLED(CONFIG_DAX) +struct dax_device *dax_get_by_host(const char *host); +void put_dax(struct dax_device *dax_dev); +#else +static inline struct dax_device *dax_get_by_host(const char *host) +{ + return NULL; +} + +static inline void put_dax(struct dax_device *dax_dev) +{ +} +#endif + +int dax_read_lock(void); +void dax_read_unlock(int id); +struct dax_device *alloc_dax(void *private, const char *host, + const struct dax_operations *ops); +bool dax_alive(struct dax_device *dax_dev); +void kill_dax(struct dax_device *dax_dev); +void *dax_get_private(struct dax_device *dax_dev); +long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, + void **kaddr, pfn_t *pfn); /* * We use lowest available bit in exceptional entry for locking, one bit for @@ -41,24 +89,19 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, const struct iomap_ops *ops); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); -int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); void dax_wake_mapping_entry_waiter(struct address_space *mapping, pgoff_t index, void *entry, bool wake_all); #ifdef CONFIG_FS_DAX -struct page *read_dax_sector(struct block_device *bdev, sector_t n); -int __dax_zero_page_range(struct block_device *bdev, sector_t sector, +int __dax_zero_page_range(struct block_device *bdev, + struct dax_device *dax_dev, sector_t sector, unsigned int offset, unsigned int length); #else -static inline struct page *read_dax_sector(struct block_device *bdev, - sector_t n) -{ - return ERR_PTR(-ENXIO); -} static inline int __dax_zero_page_range(struct block_device *bdev, - sector_t sector, unsigned int offset, unsigned int length) + struct dax_device *dax_dev, sector_t sector, + unsigned int offset, unsigned int length) { return -ENXIO; } diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h index 7dff776e6d160027b34d56fd7bf13030a07ebaea..9174b0d285824a879611e4ff93b5445d15cc7bfb 100644 --- a/include/linux/debugfs.h +++ b/include/linux/debugfs.h @@ -74,7 +74,7 @@ static const struct file_operations __fops = { \ .release = simple_attr_release, \ .read = debugfs_attr_read, \ .write = debugfs_attr_write, \ - .llseek = generic_file_llseek, \ + .llseek = no_llseek, \ } #if defined(CONFIG_DEBUG_FS) diff --git a/include/linux/dell-led.h b/include/linux/dell-led.h index 7009b8bec77bb4bb305b79c4c1c8437ba61d7078..3f033c48071e65d6bb11af2897d3f234b47643c5 100644 --- a/include/linux/dell-led.h +++ b/include/linux/dell-led.h @@ -1,10 +1,6 @@ #ifndef __DELL_LED_H__ #define __DELL_LED_H__ -enum { - DELL_LED_MICMUTE, -}; - -int dell_app_wmi_led_set(int whichled, int on); +int dell_micmute_led_set(int on); #endif diff --git a/include/linux/devfreq.h b/include/linux/devfreq.h index e0acb0e5243b49552480ed8cfac2d037226f4304..6c220e4ebb6b9d8441a811b822a10333fee21498 100644 --- 
a/include/linux/devfreq.h
+++ b/include/linux/devfreq.h
@@ -27,6 +27,7 @@
 #define DEVFREQ_POSTCHANGE		(1)

 struct devfreq;
+struct devfreq_governor;

 /**
  * struct devfreq_dev_status - Data given from devfreq user device to
@@ -100,35 +101,6 @@ struct devfreq_dev_profile {
 	unsigned int max_state;
 };

-/**
- * struct devfreq_governor - Devfreq policy governor
- * @node:		list node - contains registered devfreq governors
- * @name:		Governor's name
- * @immutable:		Immutable flag for governor. If the value is 1,
- *			this govenror is never changeable to other governor.
- * @get_target_freq:	Returns desired operating frequency for the device.
- *			Basically, get_target_freq will run
- *			devfreq_dev_profile.get_dev_status() to get the
- *			status of the device (load = busy_time / total_time).
- *			If no_central_polling is set, this callback is called
- *			only with update_devfreq() notified by OPP.
- * @event_handler:	Callback for devfreq core framework to notify events
- *			to governors. Events include per device governor
- *			init and exit, opp changes out of devfreq, suspend
- *			and resume of per device devfreq during device idle.
- *
- * Note that the callbacks are called with devfreq->lock locked by devfreq.
- */
-struct devfreq_governor {
-	struct list_head node;
-
-	const char name[DEVFREQ_NAME_LEN];
-	const unsigned int immutable;
-	int (*get_target_freq)(struct devfreq *this, unsigned long *freq);
-	int (*event_handler)(struct devfreq *devfreq,
-				unsigned int event, void *data);
-};
-
 /**
  * struct devfreq - Device devfreq structure
  * @node:	list node - contains the devices with devfreq that have been
diff --git a/include/linux/devfreq_cooling.h b/include/linux/devfreq_cooling.h
index c35d0c0e0ada7a4e99363d217a86fc4cad8dafaf..4635f95000a4c50bff074ac5bb2f174985f7477a 100644
--- a/include/linux/devfreq_cooling.h
+++ b/include/linux/devfreq_cooling.h
@@ -34,6 +34,23 @@
  *		If get_dynamic_power() is NULL, then the
  *		dynamic power is calculated as
  *		@dyn_power_coeff * frequency * voltage^2
+ * @get_real_power:	When this is set, the framework uses it to ask the
+ *			device driver for the actual power.
+ *			Some devices have more sophisticated methods
+ *			(like power counters) to approximate the actual power
+ *			that they use.
+ *			This function provides more accurate data to the
+ *			thermal governor. When the driver does not provide
+ *			such a function, the framework just uses the
+ *			pre-calculated table and scales the power by
+ *			'utilization' (based on 'busy_time' and 'total_time'
+ *			taken from devfreq 'last_status').
+ *			The value returned by this function must be less than
+ *			or equal to the maximum power value for the current
+ *			state (which can be found in power_table[state]).
+ *			When this interface is used, the power_table holds the
+ *			max total (static + dynamic) power value for each OPP.
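+ *
+ * Illustrative sketch (hypothetical driver code; read_power_counter()
+ * is made up): a device with a hardware power counter could provide
+ *
+ *	static int gpu_get_real_power(struct devfreq *df, u32 *power,
+ *			unsigned long freq, unsigned long voltage)
+ *	{
+ *		*power = read_power_counter(); /* mW, <= power_table[state] */
+ *		return 0;
+ *	}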
*/ struct devfreq_cooling_power { unsigned long (*get_static_power)(struct devfreq *devfreq, @@ -41,6 +58,8 @@ struct devfreq_cooling_power { unsigned long (*get_dynamic_power)(struct devfreq *devfreq, unsigned long freq, unsigned long voltage); + int (*get_real_power)(struct devfreq *df, u32 *power, + unsigned long freq, unsigned long voltage); unsigned long dyn_power_coeff; }; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index a7e6903866fdc98689bb5189cffd4b8cb7a715fa..f4c639c0c362fd4c5106c5a6d2d6ceae1f3c795a 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -22,11 +22,13 @@ struct bio_vec; /* * Type of table, mapped_device's mempool and request_queue */ -#define DM_TYPE_NONE 0 -#define DM_TYPE_BIO_BASED 1 -#define DM_TYPE_REQUEST_BASED 2 -#define DM_TYPE_MQ_REQUEST_BASED 3 -#define DM_TYPE_DAX_BIO_BASED 4 +enum dm_queue_mode { + DM_TYPE_NONE = 0, + DM_TYPE_BIO_BASED = 1, + DM_TYPE_REQUEST_BASED = 2, + DM_TYPE_MQ_REQUEST_BASED = 3, + DM_TYPE_DAX_BIO_BASED = 4, +}; typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t; @@ -128,13 +130,15 @@ typedef int (*dm_busy_fn) (struct dm_target *ti); * < 0 : error * >= 0 : the number of bytes accessible at the address */ -typedef long (*dm_direct_access_fn) (struct dm_target *ti, sector_t sector, - void **kaddr, pfn_t *pfn, long size); +typedef long (*dm_dax_direct_access_fn) (struct dm_target *ti, pgoff_t pgoff, + long nr_pages, void **kaddr, pfn_t *pfn); +#define PAGE_SECTORS (PAGE_SIZE / 512) void dm_error(const char *message); struct dm_dev { struct block_device *bdev; + struct dax_device *dax_dev; fmode_t mode; char name[16]; }; @@ -176,7 +180,7 @@ struct target_type { dm_busy_fn busy; dm_iterate_devices_fn iterate_devices; dm_io_hints_fn io_hints; - dm_direct_access_fn direct_access; + dm_dax_direct_access_fn direct_access; /* For internal device-mapper use. */ struct list_head list; @@ -221,6 +225,18 @@ struct target_type { */ typedef unsigned (*dm_num_write_bios_fn) (struct dm_target *ti, struct bio *bio); +/* + * A target implements own bio data integrity. + */ +#define DM_TARGET_INTEGRITY 0x00000010 +#define dm_target_has_integrity(type) ((type)->features & DM_TARGET_INTEGRITY) + +/* + * A target passes integrity data to the lower device. + */ +#define DM_TARGET_PASSES_INTEGRITY 0x00000020 +#define dm_target_passes_integrity(type) ((type)->features & DM_TARGET_PASSES_INTEGRITY) + struct dm_target { struct dm_table *table; struct target_type *type; @@ -254,6 +270,12 @@ struct dm_target { */ unsigned num_write_same_bios; + /* + * The number of WRITE ZEROES bios that will be submitted to the target. + * The bio number can be accessed with dm_bio_get_target_bio_nr. + */ + unsigned num_write_zeroes_bios; + /* * The minimum number of extra bytes allocated in each io for the * target to use. @@ -290,11 +312,6 @@ struct dm_target { * on max_io_len boundary. */ bool split_discard_bios:1; - - /* - * Set if this target does not return zeroes on discarded blocks. - */ - bool discard_zeroes_data_unsupported:1; }; /* Each target can link one of these into the table */ @@ -464,7 +481,7 @@ void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callback * Useful for "hybrid" target (supports both bio-based * and request-based). */ -void dm_table_set_type(struct dm_table *t, unsigned type); +void dm_table_set_type(struct dm_table *t, enum dm_queue_mode type); /* * Finally call this to make the table ready for use. 
@@ -578,6 +595,7 @@ extern struct ratelimit_state dm_ratelimit_state; /* * Definitions of return values from target end_io function. */ +#define DM_ENDIO_DONE 0 #define DM_ENDIO_INCOMPLETE 1 #define DM_ENDIO_REQUEUE 2 @@ -588,6 +606,7 @@ extern struct ratelimit_state dm_ratelimit_state; #define DM_MAPIO_REMAPPED 1 #define DM_MAPIO_REQUEUE DM_ENDIO_REQUEUE #define DM_MAPIO_DELAY_REQUEUE 3 +#define DM_MAPIO_KILL 4 #define dm_sector_div64(x, y)( \ { \ diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index 6048fa404e571165b40a7964cb1408f413bba9f0..a5195a7d6f77e40d23d29ba2793c7b393f4eb0bb 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -229,7 +229,7 @@ static inline struct dma_fence *dma_fence_get_rcu(struct dma_fence *fence) * * Function returns NULL if no refcount could be obtained, or the fence. * This function handles acquiring a reference to a fence that may be - * reallocated within the RCU grace period (such as with SLAB_DESTROY_BY_RCU), + * reallocated within the RCU grace period (such as with SLAB_TYPESAFE_BY_RCU), * so long as the caller is using RCU on the pointer to the fence. * * An alternative mechanism is to employ a seqlock to protect a bunch of @@ -257,7 +257,7 @@ dma_fence_get_rcu_safe(struct dma_fence * __rcu *fencep) * have successfully acquire a reference to it. If it no * longer matches, we are holding a reference to some other * reallocated pointer. This is possible if the allocator - * is using a freelist like SLAB_DESTROY_BY_RCU where the + * is using a freelist like SLAB_TYPESAFE_BY_RCU where the * fence remains valid for the RCU grace period, but it * may be reallocated. When using such allocators, we are * responsible for ensuring the reference we get is to diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h index 5725c94b1f121ece268c0fd8558cb94770c2c9cd..4eac2670bfa1a016679971f36ad698fe1d9f6a4e 100644 --- a/include/linux/dma-iommu.h +++ b/include/linux/dma-iommu.h @@ -20,6 +20,7 @@ #include #ifdef CONFIG_IOMMU_DMA +#include #include #include @@ -71,6 +72,7 @@ int iommu_dma_mapping_error(struct device *dev, dma_addr_t dma_addr); /* The DMA API isn't _quite_ the whole story, though... 
*/ void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg); +void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list); #else @@ -100,6 +102,10 @@ static inline void iommu_dma_map_msi_msg(int irq, struct msi_msg *msg) { } +static inline void iommu_dma_get_resv_regions(struct device *dev, struct list_head *list) +{ +} + #endif /* CONFIG_IOMMU_DMA */ #endif /* __KERNEL__ */ #endif /* __DMA_IOMMU_H */ diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 0977317c6835c2526428f61c12fcfab976650b99..4f3eecedca2d7c75c683cc2f08d64be9ac95118d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -728,6 +728,18 @@ dma_mark_declared_memory_occupied(struct device *dev, } #endif /* CONFIG_HAVE_GENERIC_DMA_COHERENT */ +#ifdef CONFIG_HAS_DMA +int dma_configure(struct device *dev); +void dma_deconfigure(struct device *dev); +#else +static inline int dma_configure(struct device *dev) +{ + return 0; +} + +static inline void dma_deconfigure(struct device *dev) {} +#endif + /* * Managed DMA API */ diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index 187c102997226d4107f2dd703413c105e980e39f..90884072fa73254f5da8bb2091c6c3d2bded2f20 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -39,6 +39,7 @@ extern int iommu_calculate_agaw(struct intel_iommu *iommu); extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu); extern int dmar_disabled; extern int intel_iommu_enabled; +extern int intel_iommu_tboot_noforce; #else static inline int iommu_calculate_agaw(struct intel_iommu *iommu) { diff --git a/include/linux/edac.h b/include/linux/edac.h index 5b6adf964248dd478dd6d266ccc59468d26951ce..8ae0f45fafd650f57b10b9db50cef9d91578506c 100644 --- a/include/linux/edac.h +++ b/include/linux/edac.h @@ -28,12 +28,10 @@ struct device; #define EDAC_OPSTATE_INT 2 extern int edac_op_state; -extern int edac_err_assert; -extern atomic_t edac_handlers; -extern int edac_handler_set(void); -extern void edac_atomic_assert_error(void); -extern struct bus_type *edac_get_sysfs_subsys(void); +struct bus_type *edac_get_sysfs_subsys(void); +int edac_get_report_status(void); +void edac_set_report_status(int new); enum { EDAC_REPORTING_ENABLED, @@ -41,28 +39,6 @@ enum { EDAC_REPORTING_FORCE }; -extern int edac_report_status; -#ifdef CONFIG_EDAC -static inline int get_edac_report_status(void) -{ - return edac_report_status; -} - -static inline void set_edac_report_status(int new) -{ - edac_report_status = new; -} -#else -static inline int get_edac_report_status(void) -{ - return EDAC_REPORTING_DISABLED; -} - -static inline void set_edac_report_status(int new) -{ -} -#endif - static inline void opstate_init(void) { switch (edac_op_state) { diff --git a/include/linux/efi-bgrt.h b/include/linux/efi-bgrt.h index 2fd3993c370b273c9ae8c0f58604a1280527f832..e6f624b53c3d89c6ba575bb61a3d868a251473ae 100644 --- a/include/linux/efi-bgrt.h +++ b/include/linux/efi-bgrt.h @@ -6,6 +6,7 @@ #ifdef CONFIG_ACPI_BGRT void efi_bgrt_init(struct acpi_table_header *table); +int __init acpi_parse_bgrt(struct acpi_table_header *table); /* The BGRT data itself; only valid if bgrt_image != NULL. 
*/ extern size_t bgrt_image_size; @@ -14,6 +15,10 @@ extern struct acpi_table_bgrt bgrt_tab; #else /* !CONFIG_ACPI_BGRT */ static inline void efi_bgrt_init(struct acpi_table_header *table) {} +static inline int __init acpi_parse_bgrt(struct acpi_table_header *table) +{ + return 0; +} #endif /* !CONFIG_ACPI_BGRT */ diff --git a/include/linux/efi.h b/include/linux/efi.h index 94d34e0be24f55477573f48806de2d2817fe9722..ec36f42a2adde07cd9ca9b37ddaaf9442dccee31 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -1435,9 +1435,6 @@ static inline int efi_runtime_map_copy(void *buf, size_t bufsz) /* prototypes shared between arch specific and generic stub code */ -#define pr_efi(sys_table, msg) efi_printk(sys_table, "EFI stub: "msg) -#define pr_efi_err(sys_table, msg) efi_printk(sys_table, "EFI stub: ERROR: "msg) - void efi_printk(efi_system_table_t *sys_table_arg, char *str); void efi_free(efi_system_table_t *sys_table_arg, unsigned long size, @@ -1471,7 +1468,7 @@ efi_status_t handle_cmdline_files(efi_system_table_t *sys_table_arg, unsigned long *load_addr, unsigned long *load_size); -efi_status_t efi_parse_options(char *cmdline); +efi_status_t efi_parse_options(char const *cmdline); efi_status_t efi_setup_gop(efi_system_table_t *sys_table_arg, struct screen_info *si, efi_guid_t *proto, diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 22d39e8d4de16be2b69762cf24c92799dfb445d6..9ec5e22846e0f302e7e1b1d174d9ead17e619ee3 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -8,6 +8,9 @@ struct io_cq; struct elevator_type; +#ifdef CONFIG_BLK_DEBUG_FS +struct blk_mq_debugfs_attr; +#endif /* * Return values from elevator merger @@ -93,6 +96,8 @@ struct blk_mq_hw_ctx; struct elevator_mq_ops { int (*init_sched)(struct request_queue *, struct elevator_type *); void (*exit_sched)(struct elevator_queue *); + int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); + void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); @@ -104,7 +109,7 @@ struct elevator_mq_ops { void (*insert_requests)(struct blk_mq_hw_ctx *, struct list_head *, bool); struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); bool (*has_work)(struct blk_mq_hw_ctx *); - void (*completed_request)(struct blk_mq_hw_ctx *, struct request *); + void (*completed_request)(struct request *); void (*started_request)(struct request *); void (*requeue_request)(struct request *); struct request *(*former_request)(struct request_queue *, struct request *); @@ -142,6 +147,10 @@ struct elevator_type char elevator_name[ELV_NAME_MAX]; struct module *elevator_owner; bool uses_mq; +#ifdef CONFIG_BLK_DEBUG_FS + const struct blk_mq_debugfs_attr *queue_debugfs_attrs; + const struct blk_mq_debugfs_attr *hctx_debugfs_attrs; +#endif /* managed by elevator core */ char icq_cache_name[ELV_NAME_MAX + 5]; /* elvname + "_io_cq" */ @@ -212,7 +221,6 @@ extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); extern int elevator_init(struct request_queue *, char *); extern void elevator_exit(struct request_queue *, struct elevator_queue *); -extern int elevator_change(struct request_queue *, const char *); extern bool elv_bio_merge_ok(struct request *, struct bio *); extern struct elevator_queue *elevator_alloc(struct request_queue *, struct elevator_type *); diff --git a/include/linux/elf.h b/include/linux/elf.h index 
20fa8d8ae31335744c4566eb8b6305997b44db8b..ba069e8f4f7873b002da2d632d81520eb2a662fd 100644 --- a/include/linux/elf.h +++ b/include/linux/elf.h @@ -29,6 +29,7 @@ extern Elf32_Dyn _DYNAMIC []; #define elf_note elf32_note #define elf_addr_t Elf32_Off #define Elf_Half Elf32_Half +#define Elf_Word Elf32_Word #else @@ -39,6 +40,7 @@ extern Elf64_Dyn _DYNAMIC []; #define elf_note elf64_note #define elf_addr_t Elf64_Off #define Elf_Half Elf64_Half +#define Elf_Word Elf64_Word #endif diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index c62b709b1ce087b7891f5d9c76aa2940b7f4a9a9..2d9f80848d4bd2b6e60dc06f1053ba91256c37ac 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -446,21 +446,6 @@ static inline void eth_addr_dec(u8 *addr) u64_to_ether_addr(u, addr); } -/** - * ether_addr_greater - Compare two Ethernet addresses - * @addr1: Pointer to a six-byte array containing the Ethernet address - * @addr2: Pointer other six-byte array containing the Ethernet address - * - * Compare two Ethernet addresses, returns true addr1 is greater than addr2 - */ -static inline bool ether_addr_greater(const u8 *addr1, const u8 *addr2) -{ - u64 u1 = ether_addr_to_u64(addr1); - u64 u2 = ether_addr_to_u64(addr2); - - return u1 > u2; -} - /** * is_etherdev_addr - Tell if given Ethernet address belongs to the device. * @dev: Pointer to a device structure diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index 9ded8c6d8176b909cf68da0e125eef4441b7c9a9..83cc9863444b078765255b9010b015578768be09 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -60,6 +60,7 @@ enum ethtool_phys_id_state { enum { ETH_RSS_HASH_TOP_BIT, /* Configurable RSS hash function - Toeplitz */ ETH_RSS_HASH_XOR_BIT, /* Configurable RSS hash function - Xor */ + ETH_RSS_HASH_CRC32_BIT, /* Configurable RSS hash function - Crc32 */ /* * Add your fresh new hash function bits above and remember to update @@ -73,6 +74,7 @@ enum { #define ETH_RSS_HASH_TOP __ETH_RSS_HASH(TOP) #define ETH_RSS_HASH_XOR __ETH_RSS_HASH(XOR) +#define ETH_RSS_HASH_CRC32 __ETH_RSS_HASH(CRC32) #define ETH_RSS_HASH_UNKNOWN 0 #define ETH_RSS_HASH_NO_CHANGE 0 diff --git a/include/linux/extcon.h b/include/linux/extcon.h index 7010fb01a81a342e20abd9ac917d5c3f7aa3de14..7e206a9f88db81d2feb152ef4a98a9f0119af09e 100644 --- a/include/linux/extcon.h +++ b/include/linux/extcon.h @@ -236,11 +236,11 @@ extern int extcon_set_property_capability(struct extcon_dev *edev, unsigned int id, unsigned int prop); /* - * Following APIs are to monitor every action of a notifier. - * Registrar gets notified for every external port of a connection device. - * Probably this could be used to debug an action of notifier; however, - * we do not recommend to use this for normal 'notifiee' device drivers who - * want to be notified by a specific external port of the notifier. + * Following APIs are to monitor the status change of the external connectors. + * extcon_register_notifier(*edev, id, *nb) : Register a notifier block + * for specific external connector of the extcon. + * extcon_register_notifier_all(*edev, *nb) : Register a notifier block + * for all supported external connectors of the extcon. 
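+ *
+ * Illustrative sketch (hypothetical consumer; my_evt_handler is made
+ * up): a driver interested in every connector of an extcon device
+ * could register with
+ *
+ *	static struct notifier_block my_nb = {
+ *		.notifier_call = my_evt_handler,
+ *	};
+ *
+ *	ret = devm_extcon_register_notifier_all(dev, edev, &my_nb);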
 */
 extern int extcon_register_notifier(struct extcon_dev *edev, unsigned int id,
				     struct notifier_block *nb);
@@ -253,6 +253,17 @@ extern void devm_extcon_unregister_notifier(struct device *dev,
				struct extcon_dev *edev, unsigned int id,
				struct notifier_block *nb);

+extern int extcon_register_notifier_all(struct extcon_dev *edev,
+				struct notifier_block *nb);
+extern int extcon_unregister_notifier_all(struct extcon_dev *edev,
+				struct notifier_block *nb);
+extern int devm_extcon_register_notifier_all(struct device *dev,
+				struct extcon_dev *edev,
+				struct notifier_block *nb);
+extern void devm_extcon_unregister_notifier_all(struct device *dev,
+				struct extcon_dev *edev,
+				struct notifier_block *nb);
+
 /*
  * Following API get the extcon device from devicetree.
  * This function use phandle of devicetree to get extcon device directly.
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index e2d239ed4c60cb05fbf6072e1a3a3ea4e34594f3..b6feed6547ce92862acaf3153c5859d3eb132ccb 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -32,9 +32,9 @@
 /* 0, 1(node nid), 2(meta nid) are reserved node id */
 #define F2FS_RESERVED_NODE_NUM		3

-#define F2FS_ROOT_INO(sbi)	(sbi->root_ino_num)
-#define F2FS_NODE_INO(sbi)	(sbi->node_ino_num)
-#define F2FS_META_INO(sbi)	(sbi->meta_ino_num)
+#define F2FS_ROOT_INO(sbi)	((sbi)->root_ino_num)
+#define F2FS_NODE_INO(sbi)	((sbi)->node_ino_num)
+#define F2FS_META_INO(sbi)	((sbi)->meta_ino_num)

 #define F2FS_IO_SIZE(sbi)	(1 << (sbi)->write_io_size_bits) /* Blocks */
 #define F2FS_IO_SIZE_KB(sbi)	(1 << ((sbi)->write_io_size_bits + 2)) /* KB */
@@ -114,6 +114,7 @@ struct f2fs_super_block {
 /*
  * For checkpoint
  */
+#define CP_TRIMMED_FLAG		0x00000100
 #define CP_NAT_BITS_FLAG	0x00000080
 #define CP_CRC_RECOVERY_FLAG	0x00000040
 #define CP_FASTBOOT_FLAG	0x00000020
@@ -161,7 +162,7 @@ struct f2fs_checkpoint {
  */
 #define F2FS_ORPHANS_PER_BLOCK	1020

-#define GET_ORPHAN_BLOCKS(n)	((n + F2FS_ORPHANS_PER_BLOCK - 1) / \
+#define GET_ORPHAN_BLOCKS(n)	(((n) + F2FS_ORPHANS_PER_BLOCK - 1) / \
					F2FS_ORPHANS_PER_BLOCK)

 struct f2fs_orphan_block {
@@ -301,6 +302,12 @@ struct f2fs_nat_block {
 #define SIT_VBLOCK_MAP_SIZE	64
 #define SIT_ENTRY_PER_BLOCK	(PAGE_SIZE / sizeof(struct f2fs_sit_entry))

+/*
+ * F2FS uses 4 bytes to represent a block address. As a result, the supported
+ * disk size is 16 TB, which equals (16 * 1024 * 1024) / 2 segments.
+ */
+#define F2FS_MAX_SEGMENT	((16 * 1024 * 1024) / 2)
+
 /*
  * Note that f2fs_sit_entry->vblocks has the following bit-field information.
 * [15:10] : allocation type such as CURSEG_XXXX_TYPE
@@ -449,7 +456,7 @@ typedef __le32	f2fs_hash_t;
 #define F2FS_SLOT_LEN		8
 #define F2FS_SLOT_LEN_BITS	3

-#define GET_DENTRY_SLOTS(x)	((x + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS)
+#define GET_DENTRY_SLOTS(x)	(((x) + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS)

 /* MAX level for dir lookup */
 #define MAX_DIR_HASH_DEPTH	63
diff --git a/include/linux/fcntl.h b/include/linux/fcntl.h
index 76ce329e656d1eaca5a6d6792f97d3c067fa37c6..1b48d9c9a5613cc1b619444e2046c36dcfeda872 100644
--- a/include/linux/fcntl.h
+++ b/include/linux/fcntl.h
@@ -3,6 +3,12 @@

 #include

+/* list of all valid flags for the open/openat flags argument: */
+#define VALID_OPEN_FLAGS \
+	(O_RDONLY | O_WRONLY | O_RDWR | O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC | \
+	 O_APPEND | O_NDELAY | O_NONBLOCK | __O_SYNC | O_DSYNC | \
+	 FASYNC | O_DIRECT | O_LARGEFILE | O_DIRECTORY | O_NOFOLLOW | \
+	 O_NOATIME | O_CLOEXEC | O_PATH | __O_TMPFILE)

 #ifndef force_o_largefile
 #define force_o_largefile() (BITS_PER_LONG != 32)
diff --git a/include/linux/filter.h b/include/linux/filter.h
index fbf7b39e81035506b73ddc860e3029119104b255..56197f82af45fb0f7cf492961fe1eb1956105735 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -7,6 +7,7 @@
 #include
 #include
+#include
 #include
 #include
 #include
@@ -18,7 +19,9 @@

 #include

-#include
+#ifdef CONFIG_ARCH_HAS_SET_MEMORY
+#include
+#endif
 #include
 #include
@@ -412,8 +415,7 @@ struct bpf_prog {
				locked:1,	/* Program image locked? */
				gpl_compatible:1, /* Is filter GPL compatible? */
				cb_access:1,	/* Is control block accessed? */
-				dst_needed:1,	/* Do we need dst entry? */
-				xdp_adjust_head:1; /* Adjusting pkt head? */
+				dst_needed:1;	/* Do we need dst entry? */
	kmemcheck_bitfield_end(meta);
	enum bpf_prog_type	type;		/* Type of BPF program */
	u32			len;		/* Number of filter blocks */
@@ -430,7 +432,7 @@ struct bpf_prog {
 };

 struct sk_filter {
-	atomic_t	refcnt;
+	refcount_t	refcnt;
	struct rcu_head	rcu;
	struct bpf_prog	*prog;
 };
@@ -693,6 +695,11 @@ static inline bool bpf_jit_is_ebpf(void)
 # endif
 }

+static inline bool ebpf_jit_enabled(void)
+{
+	return bpf_jit_enable && bpf_jit_is_ebpf();
+}
+
 static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
 {
	return fp->jited && bpf_jit_is_ebpf();
@@ -753,6 +760,11 @@ void bpf_prog_kallsyms_del(struct bpf_prog *fp);

 #else /* CONFIG_BPF_JIT */

+static inline bool ebpf_jit_enabled(void)
+{
+	return false;
+}
+
 static inline bool bpf_prog_ebpf_jited(const struct bpf_prog *fp)
 {
	return false;
diff --git a/include/linux/firmware/meson/meson_sm.h b/include/linux/firmware/meson/meson_sm.h
index 8e953c6f394ae58ef5902c2ae4fd44fd5f3c1173..37a5eaea69dde299a18855ffc99b1227513b25da 100644
--- a/include/linux/firmware/meson/meson_sm.h
+++ b/include/linux/firmware/meson/meson_sm.h
@@ -25,7 +25,7 @@ int meson_sm_call(unsigned int cmd_index, u32 *ret, u32 arg0, u32 arg1,
		  u32 arg2, u32 arg3, u32 arg4);
 int meson_sm_call_write(void *buffer, unsigned int b_size, unsigned int cmd_index,
			u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);
-int meson_sm_call_read(void *buffer, unsigned int cmd_index, u32 arg0, u32 arg1,
-		       u32 arg2, u32 arg3, u32 arg4);
+int meson_sm_call_read(void *buffer, unsigned int bsize, unsigned int cmd_index,
+		       u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4);

 #endif /* _MESON_SM_FW_H_ */
diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h
index b6efb0c64408b35b7874726ca22c72f2e5fe21f3..11366b3ff0b4fda30d8ff466c180ed3e23beef5e 100644
--- a/include/linux/flex_array.h
+++ b/include/linux/flex_array.h
@@ -61,16 +61,83 @@ struct flex_array {
			FLEX_ARRAY_ELEMENTS_PER_PART(__element_size));	\
 }

+/**
+ * flex_array_alloc() - Creates a flexible array.
+ * @element_size:	individual object size.
+ * @total:		maximum number of objects which can be stored.
+ * @flags:		GFP flags
+ *
+ * Return:	Returns a pointer to the new flex_array, or NULL on
+ *		allocation failure.
+ */
 struct flex_array *flex_array_alloc(int element_size, unsigned int total,
		gfp_t flags);
+
+/**
+ * flex_array_prealloc() - Ensures that memory for the elements indexed in the
+ * range defined by start and nr_elements has been allocated.
+ * @fa:		array to allocate memory to.
+ * @start:	index of the first element to preallocate.
+ * @nr_elements: number of elements to be allocated.
+ * @flags:	GFP flags
+ *
+ */
 int flex_array_prealloc(struct flex_array *fa, unsigned int start,
		unsigned int nr_elements, gfp_t flags);
+
+/**
+ * flex_array_free() - Frees all elements of a flexible array and the
+ * array itself.
+ * @fa:		array to be freed.
+ */
 void flex_array_free(struct flex_array *fa);
+
+/**
+ * flex_array_free_parts() - Removes all elements of a flexible array, but
+ * leaves the array itself in place.
+ * @fa:		array to be emptied.
+ */
 void flex_array_free_parts(struct flex_array *fa);
+
+/**
+ * flex_array_put() - Stores data into a flexible array.
+ * @fa:		array where element is to be stored.
+ * @element_nr:	position to copy, must be less than the maximum specified when
+ *		the array was created.
+ * @src:	data source to be copied into the array.
+ * @flags:	GFP flags
+ *
+ * Return:	Returns zero on success, a negative error code otherwise.
+ */
 int flex_array_put(struct flex_array *fa, unsigned int element_nr, void *src,
		gfp_t flags);
+
+/**
+ * flex_array_clear() - Clears an individual element in the array, sets the
+ * given element to FLEX_ARRAY_FREE.
+ * @element_nr:	element position to clear.
+ * @fa:		array to which element to be cleared belongs.
+ *
+ * Return:	Returns zero on success, -EINVAL otherwise.
+ */
 int flex_array_clear(struct flex_array *fa, unsigned int element_nr);
+
+/**
+ * flex_array_get() - Retrieves data from a flexible array.
+ *
+ * @element_nr:	Element position to retrieve data from.
+ * @fa:		array from which data is to be retrieved.
+ *
+ * Return:	Returns a pointer to the data element, or NULL if that
+ *		particular element has never been allocated.
+ */
 void *flex_array_get(struct flex_array *fa, unsigned int element_nr);
+
+/**
+ * flex_array_shrink() - Reduces the allocated size of an array.
+ * @fa:		array to shrink.
+ *
+ * Return:	Returns number of pages of memory actually freed.
+ *
+ */
 int flex_array_shrink(struct flex_array *fa);

 #define flex_array_put_ptr(fa, nr, src, gfp) \
diff --git a/include/linux/fpga/altera-pr-ip-core.h b/include/linux/fpga/altera-pr-ip-core.h
new file mode 100644
index 0000000000000000000000000000000000000000..3810a9033f496c0ca61715e13736602e37c61e96
--- /dev/null
+++ b/include/linux/fpga/altera-pr-ip-core.h
@@ -0,0 +1,29 @@
+/*
+ * Driver for Altera Partial Reconfiguration IP Core
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Based on socfpga-a10.c Copyright (C) 2015-2016 Altera Corporation
+ * by Alan Tull
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program. If not, see . + */ + +#ifndef _ALT_PR_IP_CORE_H +#define _ALT_PR_IP_CORE_H +#include + +int alt_pr_register(struct device *dev, void __iomem *reg_base); +int alt_pr_unregister(struct device *dev); + +#endif /* _ALT_PR_IP_CORE_H */ diff --git a/include/linux/fpga/fpga-mgr.h b/include/linux/fpga/fpga-mgr.h index 57beb5d09bfcb2b876266ba89c25e7a9ed81734b..b4ac24c4411d0fae1a77aa40024f7b9a5285ad5c 100644 --- a/include/linux/fpga/fpga-mgr.h +++ b/include/linux/fpga/fpga-mgr.h @@ -70,17 +70,21 @@ enum fpga_mgr_states { */ #define FPGA_MGR_PARTIAL_RECONFIG BIT(0) #define FPGA_MGR_EXTERNAL_CONFIG BIT(1) +#define FPGA_MGR_ENCRYPTED_BITSTREAM BIT(2) /** * struct fpga_image_info - information specific to a FPGA image * @flags: boolean flags as defined above * @enable_timeout_us: maximum time to enable traffic through bridge (uSec) * @disable_timeout_us: maximum time to disable traffic through bridge (uSec) + * @config_complete_timeout_us: maximum time for FPGA to switch to operating + * status in the write_complete op. */ struct fpga_image_info { u32 flags; u32 enable_timeout_us; u32 disable_timeout_us; + u32 config_complete_timeout_us; }; /** diff --git a/include/linux/fs.h b/include/linux/fs.h index 7251f7bb45e8b80b44f28c2051c5bef8e947e6bb..803e5a9b265422d2c2034678331b6d4dd353d1de 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -29,7 +29,6 @@ #include #include #include -#include #include #include @@ -250,9 +249,8 @@ enum positive_aop_returns { AOP_TRUNCATED_PAGE = 0x80001, }; -#define AOP_FLAG_UNINTERRUPTIBLE 0x0001 /* will not do a short write */ -#define AOP_FLAG_CONT_EXPAND 0x0002 /* called from cont_expand */ -#define AOP_FLAG_NOFS 0x0004 /* used by filesystem to direct +#define AOP_FLAG_CONT_EXPAND 0x0001 /* called from cont_expand */ +#define AOP_FLAG_NOFS 0x0002 /* used by filesystem to direct * helper code (eg buffer layer) * to clear GFP_FS from alloc */ @@ -546,6 +544,8 @@ is_uncached_acl(struct posix_acl *acl) #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 +struct fsnotify_mark_connector; + /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning @@ -645,7 +645,7 @@ struct inode { #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ - struct hlist_head i_fsnotify_marks; + struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif #if IS_ENABLED(CONFIG_FS_ENCRYPTION) @@ -909,6 +909,8 @@ static inline struct file *get_file(struct file *f) #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ +#define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) + /* * Special return value from posix_lock_file() and vfs_lock_file() for * asynchronous locking. 
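To make the flex_array kernel-doc above concrete, here is a minimal usage sketch. It is illustrative only: the element type, the counts, and the function name are invented for the example, and error handling is kept to the bare minimum.

  #include <linux/kernel.h>
  #include <linux/flex_array.h>

  struct foo { int val; };	/* hypothetical element type */

  static int example_flex_array(gfp_t gfp)
  {
  	struct flex_array *fa;
  	struct foo item = { .val = 42 };
  	struct foo *p;
  	int err;

  	/* Room for up to 128 elements; the backing parts are allocated lazily. */
  	fa = flex_array_alloc(sizeof(struct foo), 128, gfp);
  	if (!fa)
  		return -ENOMEM;

  	/* Back elements 0..15 with real memory up front. */
  	err = flex_array_prealloc(fa, 0, 16, gfp);
  	if (err)
  		goto out;

  	err = flex_array_put(fa, 3, &item, gfp);
  	if (err)
  		goto out;

  	p = flex_array_get(fa, 3);	/* NULL if never allocated */
  	if (p)
  		pr_info("element 3 = %d\n", p->val);
  out:
  	flex_array_free(fa);
  	return err;
  }

Note that flex_array_put() takes GFP flags because storing into a part that has not been preallocated may itself need to allocate memory.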
@@ -1429,7 +1431,6 @@ static inline void i_gid_write(struct inode *inode, gid_t gid) inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); } -extern struct timespec current_fs_time(struct super_block *sb); extern struct timespec current_time(struct inode *inode); /* @@ -2121,6 +2122,9 @@ extern int vfs_ustat(dev_t, struct kstatfs *); extern int freeze_super(struct super_block *super); extern int thaw_super(struct super_block *super); extern bool our_mnt(struct vfsmount *mnt); +extern __printf(2, 3) +int super_setup_bdi_name(struct super_block *sb, char *fmt, ...); +extern int super_setup_bdi(struct super_block *sb); extern int current_umask(void); @@ -2921,17 +2925,19 @@ extern int vfs_statx_fd(unsigned int, struct kstat *, u32, unsigned int); static inline int vfs_stat(const char __user *filename, struct kstat *stat) { - return vfs_statx(AT_FDCWD, filename, 0, stat, STATX_BASIC_STATS); + return vfs_statx(AT_FDCWD, filename, AT_NO_AUTOMOUNT, + stat, STATX_BASIC_STATS); } static inline int vfs_lstat(const char __user *name, struct kstat *stat) { - return vfs_statx(AT_FDCWD, name, AT_SYMLINK_NOFOLLOW, + return vfs_statx(AT_FDCWD, name, AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT, stat, STATX_BASIC_STATS); } static inline int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat, int flags) { - return vfs_statx(dfd, filename, flags, stat, STATX_BASIC_STATS); + return vfs_statx(dfd, filename, flags | AT_NO_AUTOMOUNT, + stat, STATX_BASIC_STATS); } static inline int vfs_fstat(int fd, struct kstat *stat) { @@ -2996,9 +3002,10 @@ extern const struct file_operations simple_dir_operations; extern const struct inode_operations simple_dir_inode_operations; extern void make_empty_dir_inode(struct inode *inode); extern bool is_empty_dir_inode(struct inode *inode); -struct tree_descr { char *name; const struct file_operations *ops; int mode; }; +struct tree_descr { const char *name; const struct file_operations *ops; int mode; }; struct dentry *d_alloc_name(struct dentry *, const char *); -extern int simple_fill_super(struct super_block *, unsigned long, struct tree_descr *); +extern int simple_fill_super(struct super_block *, unsigned long, + const struct tree_descr *); extern int simple_pin_fs(struct file_system_type *, struct vfsmount **mount, int *count); extern void simple_release_fs(struct vfsmount **mount, int *count); diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h index 10c1abfbac6c45d1049fdf9f1b1f133371521461..0a30c106c1e535188637d7947a74f77c933968a1 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt_common.h @@ -46,17 +46,6 @@ struct fscrypt_symlink_data { char encrypted_path[1]; } __packed; -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. 
- */ -static inline u32 fscrypt_symlink_data_len(u32 l) -{ - if (l < FS_CRYPTO_BLOCK_SIZE) - l = FS_CRYPTO_BLOCK_SIZE; - return (l + sizeof(struct fscrypt_symlink_data) - 1); -} - struct fscrypt_str { unsigned char *name; u32 len; diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 3511ca7988043e75440886f0530abae2dde1f1d8..ec406aed2f2f8c3331d0002fba8a83036e75c08b 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -147,6 +147,15 @@ static inline int fscrypt_fname_usr_to_disk(struct inode *inode, return -EOPNOTSUPP; } +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + /* Encryption support disabled; use standard comparison */ + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + /* bio.c */ static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index a140f47e9b271212ebc0a6ecc6a1db0527e37afe..cd4e82c17304ffbe97868b4aec146ca623ec2d78 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -57,6 +57,80 @@ extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, struct fscrypt_str *); +#define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32 + +/* Extracts the second-to-last ciphertext block; see explanation below */ +#define FSCRYPT_FNAME_DIGEST(name, len) \ + ((name) + round_down((len) - FS_CRYPTO_BLOCK_SIZE - 1, \ + FS_CRYPTO_BLOCK_SIZE)) + +#define FSCRYPT_FNAME_DIGEST_SIZE FS_CRYPTO_BLOCK_SIZE + +/** + * fscrypt_digested_name - alternate identifier for an on-disk filename + * + * When userspace lists an encrypted directory without access to the key, + * filenames whose ciphertext is longer than FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE + * bytes are shown in this abbreviated form (base64-encoded) rather than as the + * full ciphertext (base64-encoded). This is necessary to allow supporting + * filenames up to NAME_MAX bytes, since base64 encoding expands the length. + * + * To make it possible for filesystems to still find the correct directory entry + * despite not knowing the full on-disk name, we encode any filesystem-specific + * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups, + * followed by the second-to-last ciphertext block of the filename. Due to the + * use of the CBC-CTS encryption mode, the second-to-last ciphertext block + * depends on the full plaintext. (Note that ciphertext stealing causes the + * last two blocks to appear "flipped".) This makes accidental collisions very + * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they + * share the same filesystem-specific hashes. + * + * However, this scheme isn't immune to intentional collisions, which can be + * created by anyone able to create arbitrary plaintext filenames and view them + * without the key. Making the "digest" be a real cryptographic hash like + * SHA-256 over the full ciphertext would prevent this, although it would be + * less efficient and harder to implement, especially since the filesystem would + * need to calculate it for each directory entry examined during a search. 
+ */ +struct fscrypt_digested_name { + u32 hash; + u32 minor_hash; + u8 digest[FSCRYPT_FNAME_DIGEST_SIZE]; +}; + +/** + * fscrypt_match_name() - test whether the given name matches a directory entry + * @fname: the name being searched for + * @de_name: the name from the directory entry + * @de_name_len: the length of @de_name in bytes + * + * Normally @fname->disk_name will be set, and in that case we simply compare + * that to the name stored in the directory entry. The only exception is that + * if we don't have the key for an encrypted directory and a filename in it is + * very long, then we won't have the full disk_name and we'll instead need to + * match against the fscrypt_digested_name. + * + * Return: %true if the name matches, otherwise %false. + */ +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + if (unlikely(!fname->disk_name.name)) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_')) + return false; + if (de_name_len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) + return false; + return !memcmp(FSCRYPT_FNAME_DIGEST(de_name, de_name_len), + n->digest, FSCRYPT_FNAME_DIGEST_SIZE); + } + + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + /* bio.c */ extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); extern void fscrypt_pullback_bio_page(struct page **, bool); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index e6e689b5569e6f2abf00859874645620fadd3155..c6c69318752bd7b8caeecee48496bb39227c2754 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -80,6 +80,7 @@ struct fsnotify_event; struct fsnotify_mark; struct fsnotify_event_private_data; struct fsnotify_fname; +struct fsnotify_iter_info; /* * Each group much define these ops. The fsnotify infrastructure will call @@ -98,10 +99,13 @@ struct fsnotify_ops { struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie); + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_event *event); + /* called on final put+free to free memory */ + void (*free_mark)(struct fsnotify_mark *mark); }; /* @@ -163,6 +167,8 @@ struct fsnotify_group { struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ + atomic_t user_waits; /* Number of tasks waiting for user + * response */ /* groups can define private fields here or use the void *private */ union { @@ -194,6 +200,30 @@ struct fsnotify_group { #define FSNOTIFY_EVENT_PATH 1 #define FSNOTIFY_EVENT_INODE 2 +/* + * Inode / vfsmount point to this structure which tracks all marks attached to + * the inode / vfsmount. The reference to inode / vfsmount is held by this + * structure. We destroy this structure when there are no more marks attached + * to it. The structure is protected by fsnotify_mark_srcu. 
+ */ +struct fsnotify_mark_connector { + spinlock_t lock; +#define FSNOTIFY_OBJ_TYPE_INODE 0x01 +#define FSNOTIFY_OBJ_TYPE_VFSMOUNT 0x02 +#define FSNOTIFY_OBJ_ALL_TYPES (FSNOTIFY_OBJ_TYPE_INODE | \ + FSNOTIFY_OBJ_TYPE_VFSMOUNT) + unsigned int flags; /* Type of object [lock] */ + union { /* Object pointer [lock] */ + struct inode *inode; + struct vfsmount *mnt; + }; + union { + struct hlist_head list; + /* Used listing heads to free after srcu period expires */ + struct fsnotify_mark_connector *destroy_next; + }; +}; + /* * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events @@ -223,22 +253,16 @@ struct fsnotify_mark { struct list_head g_list; /* Protects inode / mnt pointers, flags, masks */ spinlock_t lock; - /* List of marks for inode / vfsmount [obj_lock] */ + /* List of marks for inode / vfsmount [connector->lock, mark ref] */ struct hlist_node obj_list; - union { /* Object pointer [mark->lock, group->mark_mutex] */ - struct inode *inode; /* inode this mark is associated with */ - struct vfsmount *mnt; /* vfsmount this mark is associated with */ - }; + /* Head of list of marks for an object [mark ref] */ + struct fsnotify_mark_connector *connector; /* Events types to ignore [mark->lock, group->mark_mutex] */ __u32 ignored_mask; -#define FSNOTIFY_MARK_FLAG_INODE 0x01 -#define FSNOTIFY_MARK_FLAG_VFSMOUNT 0x02 -#define FSNOTIFY_MARK_FLAG_OBJECT_PINNED 0x04 -#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x08 -#define FSNOTIFY_MARK_FLAG_ALIVE 0x10 -#define FSNOTIFY_MARK_FLAG_ATTACHED 0x20 +#define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x01 +#define FSNOTIFY_MARK_FLAG_ALIVE 0x02 +#define FSNOTIFY_MARK_FLAG_ATTACHED 0x04 unsigned int flags; /* flags [mark->lock] */ - void (*free_mark)(struct fsnotify_mark *mark); /* called on final put+free */ }; #ifdef CONFIG_FSNOTIFY @@ -315,23 +339,18 @@ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group /* functions used to manipulate the marks attached to inodes */ -/* run all marks associated with a vfsmount and update mnt->mnt_fsnotify_mask */ -extern void fsnotify_recalc_vfsmount_mask(struct vfsmount *mnt); -/* run all marks associated with an inode and update inode->i_fsnotify_mask */ -extern void fsnotify_recalc_inode_mask(struct inode *inode); -extern void fsnotify_init_mark(struct fsnotify_mark *mark, void (*free_mark)(struct fsnotify_mark *mark)); -/* find (and take a reference) to a mark associated with group and inode */ -extern struct fsnotify_mark *fsnotify_find_inode_mark(struct fsnotify_group *group, struct inode *inode); -/* find (and take a reference) to a mark associated with group and vfsmount */ -extern struct fsnotify_mark *fsnotify_find_vfsmount_mark(struct fsnotify_group *group, struct vfsmount *mnt); -/* set the ignored_mask of a mark */ -extern void fsnotify_set_mark_ignored_mask_locked(struct fsnotify_mark *mark, __u32 mask); -/* set the mask of a mark (might pin the object into memory */ -extern void fsnotify_set_mark_mask_locked(struct fsnotify_mark *mark, __u32 mask); -/* attach the mark to both the group and the inode */ -extern int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, - struct inode *inode, struct vfsmount *mnt, int allow_dups); -extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct fsnotify_group *group, +/* Calculate mask of events for a list of marks */ +extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); +extern void 
fsnotify_init_mark(struct fsnotify_mark *mark, + struct fsnotify_group *group); +/* Find mark belonging to given group in the list of marks */ +extern struct fsnotify_mark *fsnotify_find_mark( + struct fsnotify_mark_connector __rcu **connp, + struct fsnotify_group *group); +/* attach the mark to the inode or vfsmount */ +extern int fsnotify_add_mark(struct fsnotify_mark *mark, struct inode *inode, + struct vfsmount *mnt, int allow_dups); +extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, struct inode *inode, struct vfsmount *mnt, int allow_dups); /* given a group and a mark, flag mark to be freed when all references are dropped */ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, @@ -340,15 +359,23 @@ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, extern void fsnotify_detach_mark(struct fsnotify_mark *mark); /* free mark */ extern void fsnotify_free_mark(struct fsnotify_mark *mark); +/* run all the marks in a group, and clear all of the marks attached to given object type */ +extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int type); /* run all the marks in a group, and clear all of the vfsmount marks */ -extern void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group); +static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) +{ + fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT); +} /* run all the marks in a group, and clear all of the inode marks */ -extern void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group); -/* run all the marks in a group, and clear all of the marks where mark->flags & flags is true*/ -extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, unsigned int flags); +static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) +{ + fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE); +} extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_unmount_inodes(struct super_block *sb); +extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); +extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); /* put here because inotify does some weird stuff when destroying watches */ extern void fsnotify_init_event(struct fsnotify_event *event, diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 3633e8beff39e3ad67409a133109350128607ddd..473f088aabeac7b3899c155c778628e389e4d3fe 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -42,8 +42,10 @@ /* Main tracing buffer and events set up */ #ifdef CONFIG_TRACING void trace_init(void); +void early_trace_init(void); #else static inline void trace_init(void) { } +static inline void early_trace_init(void) { } #endif struct module; @@ -70,7 +72,7 @@ ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops); * CONTROL, SAVE_REGS, SAVE_REGS_IF_SUPPORTED, RECURSION_SAFE, STUB and * IPMODIFY are a kind of attribute flags which can be set only before * registering the ftrace_ops, and can not be modified while registered. - * Changing those attribute flags after regsitering ftrace_ops will + * Changing those attribute flags after registering ftrace_ops will * cause unexpected results. 
* * ENABLED - set/unset when ftrace_ops is registered/unregistered @@ -144,6 +146,10 @@ struct ftrace_ops_hash { struct ftrace_hash *filter_hash; struct mutex regex_lock; }; + +void ftrace_free_init_mem(void); +#else +static inline void ftrace_free_init_mem(void) { } #endif /* @@ -260,6 +266,7 @@ static inline int ftrace_nr_registered_ops(void) } static inline void clear_ftrace_function(void) { } static inline void ftrace_kill(void) { } +static inline void ftrace_free_init_mem(void) { } #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_STACK_TRACER @@ -279,15 +286,45 @@ int stack_trace_sysctl(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -#endif -struct ftrace_func_command { - struct list_head list; - char *name; - int (*func)(struct ftrace_hash *hash, - char *func, char *cmd, - char *params, int enable); -}; +/* DO NOT MODIFY THIS VARIABLE DIRECTLY! */ +DECLARE_PER_CPU(int, disable_stack_tracer); + +/** + * stack_tracer_disable - temporarily disable the stack tracer + * + * There are a few locations (namely in RCU) where stack tracing + * cannot be executed. This function is used to disable stack + * tracing during those critical sections. + * + * This function must be called with preemption or interrupts + * disabled and stack_tracer_enable() must be called shortly after + * while preemption or interrupts are still disabled. + */ +static inline void stack_tracer_disable(void) +{ + /* Preemption or interrupts must be disabled */ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_inc(disable_stack_tracer); +} + +/** + * stack_tracer_enable - re-enable the stack tracer + * + * After stack_tracer_disable() is called, stack_tracer_enable() + * must be called shortly afterward. 
+ */ +static inline void stack_tracer_enable(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_DEBUG)) + WARN_ON_ONCE(!preempt_count() || !irqs_disabled()); + this_cpu_dec(disable_stack_tracer); +} +#else +static inline void stack_tracer_disable(void) { } +static inline void stack_tracer_enable(void) { } +#endif #ifdef CONFIG_DYNAMIC_FTRACE @@ -315,30 +352,6 @@ void ftrace_bug(int err, struct dyn_ftrace *rec); struct seq_file; -struct ftrace_probe_ops { - void (*func)(unsigned long ip, - unsigned long parent_ip, - void **data); - int (*init)(struct ftrace_probe_ops *ops, - unsigned long ip, void **data); - void (*free)(struct ftrace_probe_ops *ops, - unsigned long ip, void **data); - int (*print)(struct seq_file *m, - unsigned long ip, - struct ftrace_probe_ops *ops, - void *data); -}; - -extern int -register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data); -extern void -unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data); -extern void -unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops); -extern void unregister_ftrace_function_probe_all(char *glob); - extern int ftrace_text_reserved(const void *start, const void *end); extern int ftrace_nr_registered_ops(void); @@ -400,9 +413,6 @@ void ftrace_set_global_notrace(unsigned char *buf, int len, int reset); void ftrace_free_filter(struct ftrace_ops *ops); void ftrace_ops_set_global_filter(struct ftrace_ops *ops); -int register_ftrace_command(struct ftrace_func_command *cmd); -int unregister_ftrace_command(struct ftrace_func_command *cmd); - enum { FTRACE_UPDATE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -433,8 +443,8 @@ enum { FTRACE_ITER_FILTER = (1 << 0), FTRACE_ITER_NOTRACE = (1 << 1), FTRACE_ITER_PRINTALL = (1 << 2), - FTRACE_ITER_DO_HASH = (1 << 3), - FTRACE_ITER_HASH = (1 << 4), + FTRACE_ITER_DO_PROBES = (1 << 3), + FTRACE_ITER_PROBE = (1 << 4), FTRACE_ITER_ENABLED = (1 << 5), }; @@ -618,14 +628,6 @@ static inline void ftrace_enable_daemon(void) { } static inline void ftrace_module_init(struct module *mod) { } static inline void ftrace_module_enable(struct module *mod) { } static inline void ftrace_release_mod(struct module *mod) { } -static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) -{ - return -EINVAL; -} -static inline __init int unregister_ftrace_command(char *cmd_name) -{ - return -EINVAL; -} static inline int ftrace_text_reserved(const void *start, const void *end) { return 0; diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 8bd28ce6d76e2491d10942f60ad77cc15e1edea0..3dff2398a5f0beb8cf0e5f4dd1d139bb727bf480 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -27,4 +27,16 @@ struct fwnode_handle { struct fwnode_handle *secondary; }; +/** + * struct fwnode_endpoint - Fwnode graph endpoint + * @port: Port number + * @id: Endpoint id + * @local_fwnode: reference to the related fwnode + */ +struct fwnode_endpoint { + unsigned int port; + unsigned int id; + const struct fwnode_handle *local_fwnode; +}; + #endif diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 76f39754e7b0299df616bc3cb909f9a35fce9ea1..acff9437e5c3776e48a9357af40291165719416d 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -159,11 +159,11 @@ struct badblocks; #if defined(CONFIG_BLK_DEV_INTEGRITY) struct blk_integrity { - struct blk_integrity_profile *profile; - unsigned char flags; - unsigned char tuple_size; - unsigned char interval_exp; - unsigned char tag_size; + const struct 
blk_integrity_profile *profile; + unsigned char flags; + unsigned char tuple_size; + unsigned char interval_exp; + unsigned char tag_size; }; #endif /* CONFIG_BLK_DEV_INTEGRITY */ @@ -722,11 +722,9 @@ static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) #if defined(CONFIG_BLK_DEV_INTEGRITY) extern void blk_integrity_add(struct gendisk *); extern void blk_integrity_del(struct gendisk *); -extern void blk_integrity_revalidate(struct gendisk *); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline void blk_integrity_add(struct gendisk *disk) { } static inline void blk_integrity_del(struct gendisk *disk) { } -static inline void blk_integrity_revalidate(struct gendisk *disk) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ #else /* CONFIG_BLOCK */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index db373b9d322361f7553cfbe36026d918fb9d09ee..2b1a44f5bdb60e6a28d874630f71dec7a0f4c40b 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -40,6 +40,11 @@ struct vm_area_struct; #define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_WRITE 0x800000u #define ___GFP_KSWAPD_RECLAIM 0x1000000u +#ifdef CONFIG_LOCKDEP +#define ___GFP_NOLOCKDEP 0x4000000u +#else +#define ___GFP_NOLOCKDEP 0 +#endif /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -179,8 +184,11 @@ struct vm_area_struct; #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) +/* Disable lockdep for GFP context tracking */ +#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) + /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT 25 +#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* @@ -202,8 +210,16 @@ struct vm_area_struct; * * GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. + * Please try to avoid using this flag directly and instead use + * memalloc_noio_{save,restore} to mark the whole scope which cannot + * perform any IO with a short explanation why. All allocation requests + * will inherit GFP_NOIO implicitly. * * GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. + * Please try to avoid using this flag directly and instead use + * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't + * recurse into the FS layer with a short explanation why. All allocation + * requests will inherit GFP_NOFS implicitly. * * GFP_USER is for userspace allocations that also need to be directly * accessibly by the kernel or hardware. It is typically used by hardware @@ -297,8 +313,8 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) /* * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the - * zone to use given the lowest 4 bits of gfp_t. Entries are ZONE_SHIFT long - * and there are 16 of them to cover all possible combinations of + * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT + * bits long and there are 16 of them to cover all possible combinations of * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM. * * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. 
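The new GFP_NOIO/GFP_NOFS documentation above recommends the scope APIs over the raw flags. A minimal sketch of that pattern, assuming the memalloc_nofs_save()/memalloc_nofs_restore() helpers the text refers to (from linux/sched/mm.h); the wrapper function itself is hypothetical:

  #include <linux/sched/mm.h>
  #include <linux/slab.h>

  /* Any allocation inside the scope implicitly behaves as GFP_NOFS,
   * so callers and callees can keep passing plain GFP_KERNEL. */
  static void *example_nofs_scope_alloc(size_t size)
  {
  	unsigned int nofs_flags;
  	void *p;

  	nofs_flags = memalloc_nofs_save();	/* enter no-FS-reclaim scope */
  	p = kmalloc(size, GFP_KERNEL);		/* treated as GFP_NOFS here */
  	memalloc_nofs_restore(nofs_flags);	/* leave the scope */

  	return p;
  }

Similarly, the stack_tracer_disable()/stack_tracer_enable() pair added to ftrace.h above is meant to bracket a short section with preemption (or interrupts) already off; a hypothetical caller:

  static void example_no_stack_trace_section(void)
  {
  	preempt_disable();		/* precondition documented above */
  	stack_tracer_disable();

  	/* ... code that must not recurse into the stack tracer ... */

  	stack_tracer_enable();
  	preempt_enable();
  }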
diff --git a/include/linux/gpio/consumer.h b/include/linux/gpio/consumer.h index 933d936566055de430f9db64ae152eb31785b7ff..8f702fcbe4853e349235813c3438493b362fea3c 100644 --- a/include/linux/gpio/consumer.h +++ b/include/linux/gpio/consumer.h @@ -170,14 +170,14 @@ static inline struct gpio_desc *__must_check gpiod_get_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { - return ERR_PTR(-ENOSYS); + return NULL; } static inline struct gpio_desc *__must_check gpiod_get_index_optional(struct device *dev, const char *con_id, unsigned int index, enum gpiod_flags flags) { - return ERR_PTR(-ENOSYS); + return NULL; } static inline struct gpio_descs *__must_check @@ -191,7 +191,7 @@ static inline struct gpio_descs *__must_check gpiod_get_array_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { - return ERR_PTR(-ENOSYS); + return NULL; } static inline void gpiod_put(struct gpio_desc *desc) @@ -231,14 +231,14 @@ static inline struct gpio_desc *__must_check devm_gpiod_get_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { - return ERR_PTR(-ENOSYS); + return NULL; } static inline struct gpio_desc *__must_check devm_gpiod_get_index_optional(struct device *dev, const char *con_id, unsigned int index, enum gpiod_flags flags) { - return ERR_PTR(-ENOSYS); + return NULL; } static inline struct gpio_descs *__must_check @@ -252,7 +252,7 @@ static inline struct gpio_descs *__must_check devm_gpiod_get_array_optional(struct device *dev, const char *con_id, enum gpiod_flags flags) { - return ERR_PTR(-ENOSYS); + return NULL; } static inline void devm_gpiod_put(struct device *dev, struct gpio_desc *desc) diff --git a/include/linux/gpio/driver.h b/include/linux/gpio/driver.h index 846f3b9894805c482242c8b0117fe2537842e5bb..393582867afddb17e2969a659bedcb975f710208 100644 --- a/include/linux/gpio/driver.h +++ b/include/linux/gpio/driver.h @@ -168,7 +168,7 @@ struct gpio_chip { unsigned int irq_base; irq_flow_handler_t irq_handler; unsigned int irq_default_type; - int irq_chained_parent; + unsigned int irq_chained_parent; bool irq_nested; bool irq_need_valid_mask; unsigned long *irq_valid_mask; @@ -244,12 +244,12 @@ int bgpio_init(struct gpio_chip *gc, struct device *dev, void gpiochip_set_chained_irqchip(struct gpio_chip *gpiochip, struct irq_chip *irqchip, - int parent_irq, + unsigned int parent_irq, irq_flow_handler_t parent_handler); void gpiochip_set_nested_irqchip(struct gpio_chip *gpiochip, struct irq_chip *irqchip, - int parent_irq); + unsigned int parent_irq); int gpiochip_irqchip_add_key(struct gpio_chip *gpiochip, struct irq_chip *irqchip, diff --git a/include/linux/gpio/gpio-reg.h b/include/linux/gpio/gpio-reg.h new file mode 100644 index 0000000000000000000000000000000000000000..90e0b9060e6dfc4c6d4ecf5781f0487ee283b657 --- /dev/null +++ b/include/linux/gpio/gpio-reg.h @@ -0,0 +1,13 @@ +#ifndef GPIO_REG_H +#define GPIO_REG_H + +struct device; +struct irq_domain; + +struct gpio_chip *gpio_reg_init(struct device *dev, void __iomem *reg, + int base, int num, const char *label, u32 direction, u32 def_out, + const char *const *names, struct irq_domain *irqdom, const int *irqs); + +int gpio_reg_resume(struct gpio_chip *gc); + +#endif diff --git a/include/linux/hid-sensor-hub.h b/include/linux/hid-sensor-hub.h index 7ef111d3ecc5561c419dc16ff4c08d88016e9085..f32d7c392c1ec9532a51e801e45a22e4942d0661 100644 --- a/include/linux/hid-sensor-hub.h +++ b/include/linux/hid-sensor-hub.h @@ -231,6 +231,8 @@ struct hid_sensor_common { unsigned usage_id; 
atomic_t data_ready; atomic_t user_requested_state; + int poll_interval; + int raw_hystersis; struct iio_trigger *trigger; int timestamp_ns_scale; struct hid_sensor_hub_attribute_info poll; diff --git a/include/linux/hid-sensor-ids.h b/include/linux/hid-sensor-ids.h index 30c7dc45e45fcc0110dfaae19ff69d523621414b..761f86242473609ed94cee70672e9e639e32e818 100644 --- a/include/linux/hid-sensor-ids.h +++ b/include/linux/hid-sensor-ids.h @@ -45,6 +45,14 @@ #define HID_USAGE_SENSOR_DATA_ATMOSPHERIC_PRESSURE 0x200430 #define HID_USAGE_SENSOR_ATMOSPHERIC_PRESSURE 0x200431 +/* Temperature (200033) */ +#define HID_USAGE_SENSOR_TEMPERATURE 0x200033 +#define HID_USAGE_SENSOR_DATA_ENVIRONMENTAL_TEMPERATURE 0x200434 + +/* Humidity (200032) */ +#define HID_USAGE_SENSOR_HUMIDITY 0x200032 +#define HID_USAGE_SENSOR_ATMOSPHERIC_HUMIDITY 0x200433 + /* Gyro 3D: (200076) */ #define HID_USAGE_SENSOR_GYRO_3D 0x200076 #define HID_USAGE_SENSOR_DATA_ANGL_VELOCITY 0x200456 diff --git a/include/linux/hid.h b/include/linux/hid.h index 28f38e2b8f309387b7a983937b50908c3095f7d9..5be325d890d96f9d823753e92296c379e3751076 100644 --- a/include/linux/hid.h +++ b/include/linux/hid.h @@ -268,6 +268,8 @@ struct hid_item { #define HID_CP_APPLICATIONLAUNCHBUTTONS 0x000c0180 #define HID_CP_GENERICGUIAPPLICATIONCONTROLS 0x000c0200 +#define HID_DG_DEVICECONFIG 0x000d000e +#define HID_DG_DEVICESETTINGS 0x000d0023 #define HID_DG_CONFIDENCE 0x000d0047 #define HID_DG_WIDTH 0x000d0048 #define HID_DG_HEIGHT 0x000d0049 @@ -322,7 +324,7 @@ struct hid_item { #define HID_QUIRK_MULTI_INPUT 0x00000040 #define HID_QUIRK_HIDINPUT_FORCE 0x00000080 #define HID_QUIRK_NO_EMPTY_INPUT 0x00000100 -#define HID_QUIRK_NO_INIT_INPUT_REPORTS 0x00000200 +/* 0x00000200 reserved for backward compatibility, was NO_INIT_INPUT_REPORTS */ #define HID_QUIRK_ALWAYS_POLL 0x00000400 #define HID_QUIRK_SKIP_OUTPUT_REPORTS 0x00010000 #define HID_QUIRK_SKIP_OUTPUT_REPORT_ID 0x00020000 @@ -541,7 +543,6 @@ struct hid_device { /* device report descriptor */ struct list_head inputs; /* The list of inputs */ void *hiddev; /* The hiddev structure */ void *hidraw; - int minor; /* Hiddev minor number */ int open; /* is the device open by anyone? */ char name[128]; /* Device name */ diff --git a/include/linux/hiddev.h b/include/linux/hiddev.h index a5dd8148660b7eb15482c99b1a602226007ec840..9216222229578075168262ac1c0cec5f88619f57 100644 --- a/include/linux/hiddev.h +++ b/include/linux/hiddev.h @@ -32,6 +32,18 @@ * In-kernel definitions. 
*/ +struct hiddev { + int minor; + int exist; + int open; + struct mutex existancelock; + wait_queue_head_t wait; + struct hid_device *hid; + struct list_head list; + spinlock_t list_lock; + bool initialized; +}; + struct hid_device; struct hid_usage; struct hid_field; diff --git a/include/linux/host1x.h b/include/linux/host1x.h index 1ffbf2a8cb997a7986e0c44c2f72ce52814445f2..3d04aa1dc83e48207bc97bf2bce32f5b53291d24 100644 --- a/include/linux/host1x.h +++ b/include/linux/host1x.h @@ -26,6 +26,7 @@ enum host1x_class { HOST1X_CLASS_HOST1X = 0x1, HOST1X_CLASS_GR2D = 0x51, HOST1X_CLASS_GR2D_SB = 0x52, + HOST1X_CLASS_VIC = 0x5D, HOST1X_CLASS_GR3D = 0x60, }; diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 249e579ecd4c4d7ecf06f38e3c0fad75ba46000b..8c5b10eb7265fb221360cc9bb1a4eebf0dde06ca 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -276,8 +276,6 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) return timer->base->cpu_base->hres_active; } -extern void hrtimer_peek_ahead_timers(void); - /* * The resolution of the clocks. The resolution value is returned in * the clock_getres() system call to give application programmers an @@ -300,8 +298,6 @@ extern unsigned int hrtimer_resolution; #define hrtimer_resolution (unsigned int)LOW_RES_NSEC -static inline void hrtimer_peek_ahead_timers(void) { } - static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return 0; @@ -456,7 +452,7 @@ static inline u64 hrtimer_forward_now(struct hrtimer *timer, } /* Precise sleep: */ -extern long hrtimer_nanosleep(struct timespec *rqtp, +extern long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid); diff --git a/include/linux/hwmon.h b/include/linux/hwmon.h index 88b6737491210aaf7a2ba66756672188992902ac..ceb751987c401e4015fc0c70ffb13d8d5b3eef27 100644 --- a/include/linux/hwmon.h +++ b/include/linux/hwmon.h @@ -337,7 +337,7 @@ struct hwmon_ops { int (*read)(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long *val); int (*read_string)(struct device *dev, enum hwmon_sensor_types type, - u32 attr, int channel, char **str); + u32 attr, int channel, const char **str); int (*write)(struct device *dev, enum hwmon_sensor_types type, u32 attr, int channel, long val); }; diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index 970771a5f7390268abb5c43ba1e1c8df48f2caa4..e09fc8290c2f02d92689f4499bcf2a22ae8ce8be 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -491,6 +491,12 @@ struct vmbus_channel_rescind_offer { u32 child_relid; } __packed; +static inline u32 +hv_ringbuffer_pending_size(const struct hv_ring_buffer_info *rbi) +{ + return rbi->ring_buffer->pending_send_sz; +} + /* * Request Offer -- no parameters, SynIC message contains the partition ID * Set Snoop -- no parameters, SynIC message contains the partition ID @@ -524,10 +530,10 @@ struct vmbus_channel_open_channel { u32 target_vp; /* - * The upstream ring buffer begins at offset zero in the memory - * described by RingBufferGpadlHandle. The downstream ring buffer - * follows it at this offset (in pages). - */ + * The upstream ring buffer begins at offset zero in the memory + * described by RingBufferGpadlHandle. The downstream ring buffer + * follows it at this offset (in pages). + */ u32 downstream_ringbuffer_pageoffset; /* User-specific data to be passed along to the server endpoint. 
*/ @@ -1013,7 +1019,7 @@ extern int vmbus_open(struct vmbus_channel *channel, u32 recv_ringbuffersize, void *userdata, u32 userdatalen, - void(*onchannel_callback)(void *context), + void (*onchannel_callback)(void *context), void *context); extern void vmbus_close(struct vmbus_channel *channel); @@ -1155,6 +1161,17 @@ static inline void *hv_get_drvdata(struct hv_device *dev) return dev_get_drvdata(&dev->device); } +struct hv_ring_buffer_debug_info { + u32 current_interrupt_mask; + u32 current_read_index; + u32 current_write_index; + u32 bytes_avail_toread; + u32 bytes_avail_towrite; +}; + +void hv_ringbuffer_get_debuginfo(const struct hv_ring_buffer_info *ring_info, + struct hv_ring_buffer_debug_info *debug_info); + /* Vmbus interface */ #define vmbus_driver_register(driver) \ __vmbus_driver_register(driver, THIS_MODULE, KBUILD_MODNAME) @@ -1428,7 +1445,7 @@ struct hyperv_service_callback { char *log_msg; uuid_le data; struct vmbus_channel *channel; - void (*callback) (void *context); + void (*callback)(void *context); }; #define MAX_SRV_VER 0x7ffffff @@ -1504,16 +1521,6 @@ static inline void hv_signal_on_read(struct vmbus_channel *channel) cached_write_sz = hv_get_cached_bytes_to_write(rbi); if (cached_write_sz < pending_sz) vmbus_setevent(channel); - - return; -} - -static inline void -init_cached_read_index(struct vmbus_channel *channel) -{ - struct hv_ring_buffer_info *rbi = &channel->inbound; - - rbi->cached_read_index = rbi->ring_buffer->read_index; } /* @@ -1549,76 +1556,48 @@ static inline u32 hv_end_read(struct hv_ring_buffer_info *rbi) /* * An API to support in-place processing of incoming VMBUS packets. */ -#define VMBUS_PKT_TRAILER 8 -static inline struct vmpacket_descriptor * -get_next_pkt_raw(struct vmbus_channel *channel) +/* Get data payload associated with descriptor */ +static inline void *hv_pkt_data(const struct vmpacket_descriptor *desc) { - struct hv_ring_buffer_info *ring_info = &channel->inbound; - u32 priv_read_loc = ring_info->priv_read_index; - void *ring_buffer = hv_get_ring_buffer(ring_info); - u32 dsize = ring_info->ring_datasize; - /* - * delta is the difference between what is available to read and - * what was already consumed in place. We commit read index after - * the whole batch is processed. - */ - u32 delta = priv_read_loc >= ring_info->ring_buffer->read_index ? - priv_read_loc - ring_info->ring_buffer->read_index : - (dsize - ring_info->ring_buffer->read_index) + priv_read_loc; - u32 bytes_avail_toread = (hv_get_bytes_to_read(ring_info) - delta); - - if (bytes_avail_toread < sizeof(struct vmpacket_descriptor)) - return NULL; - - return ring_buffer + priv_read_loc; + return (void *)((unsigned long)desc + (desc->offset8 << 3)); } -/* - * A helper function to step through packets "in-place" - * This API is to be called after each successful call - * get_next_pkt_raw(). - */ -static inline void put_pkt_raw(struct vmbus_channel *channel, - struct vmpacket_descriptor *desc) +/* Get data size associated with descriptor */ +static inline u32 hv_pkt_datalen(const struct vmpacket_descriptor *desc) { - struct hv_ring_buffer_info *ring_info = &channel->inbound; - u32 packetlen = desc->len8 << 3; - u32 dsize = ring_info->ring_datasize; - - /* - * Include the packet trailer. 
- */ - ring_info->priv_read_index += packetlen + VMBUS_PKT_TRAILER; - ring_info->priv_read_index %= dsize; + return (desc->len8 << 3) - (desc->offset8 << 3); } + +struct vmpacket_descriptor * +hv_pkt_iter_first(struct vmbus_channel *channel); + +struct vmpacket_descriptor * +__hv_pkt_iter_next(struct vmbus_channel *channel, + const struct vmpacket_descriptor *pkt); + +void hv_pkt_iter_close(struct vmbus_channel *channel); + /* - * This call commits the read index and potentially signals the host. - * Here is the pattern for using the "in-place" consumption APIs: - * - * init_cached_read_index(); - * - * while (get_next_pkt_raw() { - * process the packet "in-place"; - * put_pkt_raw(); - * } - * if (packets processed in place) - * commit_rd_index(); + * Get next packet descriptor from iterator + * If at end of list, return NULL and update host. */ -static inline void commit_rd_index(struct vmbus_channel *channel) +static inline struct vmpacket_descriptor * +hv_pkt_iter_next(struct vmbus_channel *channel, + const struct vmpacket_descriptor *pkt) { - struct hv_ring_buffer_info *ring_info = &channel->inbound; - /* - * Make sure all reads are done before we update the read index since - * the writer may start writing to the read area once the read index - * is updated. - */ - virt_rmb(); - ring_info->ring_buffer->read_index = ring_info->priv_read_index; + struct vmpacket_descriptor *nxt; + + nxt = __hv_pkt_iter_next(channel, pkt); + if (!nxt) + hv_pkt_iter_close(channel); - hv_signal_on_read(channel); + return nxt; } +#define foreach_vmbus_pkt(pkt, channel) \ + for (pkt = hv_pkt_iter_first(channel); pkt; \ + pkt = hv_pkt_iter_next(channel, pkt)) #endif /* _HYPERV_H */ diff --git a/include/linux/i2c.h b/include/linux/i2c.h index 6b183521c61697aa3af56082b830247f960d06fb..72d0ece70ed30dba632f8df51a4cc7d229f02300 100644 --- a/include/linux/i2c.h +++ b/include/linux/i2c.h @@ -38,6 +38,7 @@ extern struct bus_type i2c_bus_type; extern struct device_type i2c_adapter_type; +extern struct device_type i2c_client_type; /* --- General options ------------------------------------------------ */ @@ -149,6 +150,7 @@ enum i2c_alert_protocol { * @detect: Callback for device detection * @address_list: The I2C addresses to probe (for detect) * @clients: List of detected clients we created (for i2c-core use only) + * @disable_i2c_core_irq_mapping: Tell the i2c-core to not do irq-mapping * * The driver.owner field should be set to the module owner of this driver. * The driver.name field should be set to the name of this driver. 
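The hv_pkt_iter_*() API above replaces the open-coded get_next_pkt_raw()/put_pkt_raw()/commit_rd_index() pattern that was removed. A sketch of a consumer, assuming it runs in the channel callback context; the callback and payload handler names are invented:

  #include <linux/hyperv.h>

  static void example_consume_payload(const void *data, u32 len)
  {
  	/* hypothetical per-packet processing, done in place */
  }

  static void example_onchannel_callback(void *context)
  {
  	struct vmbus_channel *channel = context;
  	struct vmpacket_descriptor *pkt;

  	/* hv_pkt_iter_next() calls hv_pkt_iter_close() once the batch is
  	 * drained, which commits the read index and signals the host. */
  	foreach_vmbus_pkt(pkt, channel) {
  		example_consume_payload(hv_pkt_data(pkt),
  					hv_pkt_datalen(pkt));
  	}
  }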
@@ -212,6 +214,8 @@ struct i2c_driver { int (*detect)(struct i2c_client *, struct i2c_board_info *); const unsigned short *address_list; struct list_head clients; + + bool disable_i2c_core_irq_mapping; }; #define to_i2c_driver(d) container_of(d, struct i2c_driver, driver) @@ -303,6 +307,8 @@ static inline int i2c_slave_event(struct i2c_client *client, * @of_node: pointer to OpenFirmware device node * @fwnode: device node supplied by the platform firmware * @properties: additional device properties for the device + * @resources: resources associated with the device + * @num_resources: number of resources in the @resources array * @irq: stored in i2c_client.irq * * I2C doesn't actually support hardware probing, although controllers and @@ -325,6 +331,8 @@ struct i2c_board_info { struct device_node *of_node; struct fwnode_handle *fwnode; const struct property_entry *properties; + const struct resource *resources; + unsigned int num_resources; int irq; }; @@ -824,11 +832,18 @@ static inline const struct of_device_id #if IS_ENABLED(CONFIG_ACPI) u32 i2c_acpi_find_bus_speed(struct device *dev); +struct i2c_client *i2c_acpi_new_device(struct device *dev, int index, + struct i2c_board_info *info); #else static inline u32 i2c_acpi_find_bus_speed(struct device *dev) { return 0; } +static inline struct i2c_client *i2c_acpi_new_device(struct device *dev, + int index, struct i2c_board_info *info) +{ + return NULL; +} #endif /* CONFIG_ACPI */ #endif /* _LINUX_I2C_H */ diff --git a/include/linux/i2c/i2c-hid.h b/include/linux/i2c/i2c-hid.h index 7aa901d920585949b2ecf6c546d65cb004daceea..1fb088239d12b9f4a5a849a2ce662343743c8bd9 100644 --- a/include/linux/i2c/i2c-hid.h +++ b/include/linux/i2c/i2c-hid.h @@ -14,9 +14,13 @@ #include +struct regulator; + /** * struct i2chid_platform_data - used by hid over i2c implementation. * @hid_descriptor_address: i2c register where the HID descriptor is stored. + * @supply: regulator for powering on the device. + * @post_power_delay_ms: delay after powering on before device is usable. * * Note that it is the responsibility of the platform driver (or the acpi 5.0 * driver, or the flattened device tree) to setup the irq related to the gpio in @@ -31,6 +35,8 @@ */ struct i2c_hid_platform_data { u16 hid_descriptor_address; + struct regulator *supply; + int post_power_delay_ms; }; #endif /* __LINUX_I2C_HID_H */ diff --git a/include/linux/ide.h b/include/linux/ide.h index 2f51c1724b5af647423770dfadafaa7a8837b600..6980ca322074b9cd80f69517195a02ea837c3f34 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -88,7 +88,7 @@ static inline bool ata_pm_request(struct request *rq) ide_req(rq)->type == ATA_PRIV_PM_RESUME); } -/* Error codes returned in rq->errors to the higher part of the driver. */ +/* Error codes returned in result to the higher part of the driver. */ enum { IDE_DRV_ERROR_GENERAL = 101, IDE_DRV_ERROR_FILEMARK = 102, diff --git a/include/linux/ieee80211.h b/include/linux/ieee80211.h index 0dd9498c694f95c166508e40896c94ccc724075c..69033353d0d1c2bc28cc831b4cea67ca49c0bfdf 100644 --- a/include/linux/ieee80211.h +++ b/include/linux/ieee80211.h @@ -7,7 +7,7 @@ * Copyright (c) 2005, Devicescape Software, Inc. 
* Copyright (c) 2006, Michael Wu * Copyright (c) 2013 - 2014 Intel Mobile Communications GmbH - * Copyright (c) 2016 Intel Deutschland GmbH + * Copyright (c) 2016 - 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1411,6 +1411,8 @@ struct ieee80211_ht_operation { #define IEEE80211_HT_OP_MODE_PROTECTION_NONHT_MIXED 3 #define IEEE80211_HT_OP_MODE_NON_GF_STA_PRSNT 0x0004 #define IEEE80211_HT_OP_MODE_NON_HT_STA_PRSNT 0x0010 +#define IEEE80211_HT_OP_MODE_CCFS2_SHIFT 5 +#define IEEE80211_HT_OP_MODE_CCFS2_MASK 0x1fe0 /* for stbc_param */ #define IEEE80211_HT_STBC_PARAM_DUAL_BEACON 0x0040 @@ -1525,14 +1527,14 @@ enum ieee80211_vht_chanwidth { * This structure is the "VHT operation element" as * described in 802.11ac D3.0 8.4.2.161 * @chan_width: Operating channel width + * @center_freq_seg0_idx: center freq segment 0 index * @center_freq_seg1_idx: center freq segment 1 index - * @center_freq_seg2_idx: center freq segment 2 index * @basic_mcs_set: VHT Basic MCS rate set */ struct ieee80211_vht_operation { u8 chan_width; + u8 center_freq_seg0_idx; u8 center_freq_seg1_idx; - u8 center_freq_seg2_idx; __le16 basic_mcs_set; } __packed; @@ -1721,6 +1723,9 @@ enum ieee80211_statuscode { WLAN_STATUS_REJECT_DSE_BAND = 96, WLAN_STATUS_DENIED_WITH_SUGGESTED_BAND_AND_CHANNEL = 99, WLAN_STATUS_DENIED_DUE_TO_SPECTRUM_MANAGEMENT = 103, + /* 802.11ai */ + WLAN_STATUS_FILS_AUTHENTICATION_FAILURE = 108, + WLAN_STATUS_UNKNOWN_AUTHENTICATION_SERVER = 109, }; @@ -2102,6 +2107,12 @@ enum ieee80211_key_len { #define FILS_NONCE_LEN 16 #define FILS_MAX_KEK_LEN 64 +#define FILS_ERP_MAX_USERNAME_LEN 16 +#define FILS_ERP_MAX_REALM_LEN 253 +#define FILS_ERP_MAX_RRK_LEN 64 + +#define PMK_MAX_LEN 48 + /* Public action codes */ enum ieee80211_pub_actioncode { WLAN_PUB_ACTION_EXT_CHANSW_ANN = 4, @@ -2166,37 +2177,37 @@ enum ieee80211_tdls_actioncode { #define WLAN_BSS_COEX_INFORMATION_REQUEST BIT(0) /** - * enum - mesh synchronization method identifier + * enum ieee80211_mesh_sync_method - mesh synchronization method identifier * * @IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET: the default synchronization method * @IEEE80211_SYNC_METHOD_VENDOR: a vendor specific synchronization method * that will be specified in a vendor specific information element */ -enum { +enum ieee80211_mesh_sync_method { IEEE80211_SYNC_METHOD_NEIGHBOR_OFFSET = 1, IEEE80211_SYNC_METHOD_VENDOR = 255, }; /** - * enum - mesh path selection protocol identifier + * enum ieee80211_mesh_path_protocol - mesh path selection protocol identifier * * @IEEE80211_PATH_PROTOCOL_HWMP: the default path selection protocol * @IEEE80211_PATH_PROTOCOL_VENDOR: a vendor specific protocol that will * be specified in a vendor specific information element */ -enum { +enum ieee80211_mesh_path_protocol { IEEE80211_PATH_PROTOCOL_HWMP = 1, IEEE80211_PATH_PROTOCOL_VENDOR = 255, }; /** - * enum - mesh path selection metric identifier + * enum ieee80211_mesh_path_metric - mesh path selection metric identifier * * @IEEE80211_PATH_METRIC_AIRTIME: the default path selection metric * @IEEE80211_PATH_METRIC_VENDOR: a vendor specific metric that will be * specified in a vendor specific information element */ -enum { +enum ieee80211_mesh_path_metric { IEEE80211_PATH_METRIC_AIRTIME = 1, IEEE80211_PATH_METRIC_VENDOR = 255, }; @@ -2305,6 +2316,32 @@ struct ieee80211_timeout_interval_ie { __le32 value; } __packed; +/** + * enum ieee80211_idle_options - BSS idle options + * 
@WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE: the station should send an RSN + * protected frame to the AP to reset the idle timer at the AP for + * the station. + */ +enum ieee80211_idle_options { + WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE = BIT(0), +}; + +/** + * struct ieee80211_bss_max_idle_period_ie + * + * This structure refers to "BSS Max idle period element" + * + * @max_idle_period: indicates the time period during which a station can + * refrain from transmitting frames to its associated AP without being + * disassociated. In units of 1000 TUs. + * @idle_options: indicates the options associated with the BSS idle capability + * as specified in &enum ieee80211_idle_options. + */ +struct ieee80211_bss_max_idle_period_ie { + __le16 max_idle_period; + u8 idle_options; +} __packed; + /* BACK action code */ enum ieee80211_back_actioncode { WLAN_ACTION_ADDBA_REQ = 0, @@ -2345,13 +2382,21 @@ enum ieee80211_sa_query_action { #define WLAN_CIPHER_SUITE_SMS4 SUITE(0x001472, 1) /* AKM suite selectors */ -#define WLAN_AKM_SUITE_8021X SUITE(0x000FAC, 1) -#define WLAN_AKM_SUITE_PSK SUITE(0x000FAC, 2) -#define WLAN_AKM_SUITE_8021X_SHA256 SUITE(0x000FAC, 5) -#define WLAN_AKM_SUITE_PSK_SHA256 SUITE(0x000FAC, 6) -#define WLAN_AKM_SUITE_TDLS SUITE(0x000FAC, 7) -#define WLAN_AKM_SUITE_SAE SUITE(0x000FAC, 8) -#define WLAN_AKM_SUITE_FT_OVER_SAE SUITE(0x000FAC, 9) +#define WLAN_AKM_SUITE_8021X SUITE(0x000FAC, 1) +#define WLAN_AKM_SUITE_PSK SUITE(0x000FAC, 2) +#define WLAN_AKM_SUITE_FT_8021X SUITE(0x000FAC, 3) +#define WLAN_AKM_SUITE_FT_PSK SUITE(0x000FAC, 4) +#define WLAN_AKM_SUITE_8021X_SHA256 SUITE(0x000FAC, 5) +#define WLAN_AKM_SUITE_PSK_SHA256 SUITE(0x000FAC, 6) +#define WLAN_AKM_SUITE_TDLS SUITE(0x000FAC, 7) +#define WLAN_AKM_SUITE_SAE SUITE(0x000FAC, 8) +#define WLAN_AKM_SUITE_FT_OVER_SAE SUITE(0x000FAC, 9) +#define WLAN_AKM_SUITE_8021X_SUITE_B SUITE(0x000FAC, 11) +#define WLAN_AKM_SUITE_8021X_SUITE_B_192 SUITE(0x000FAC, 12) +#define WLAN_AKM_SUITE_FILS_SHA256 SUITE(0x000FAC, 14) +#define WLAN_AKM_SUITE_FILS_SHA384 SUITE(0x000FAC, 15) +#define WLAN_AKM_SUITE_FT_FILS_SHA256 SUITE(0x000FAC, 16) +#define WLAN_AKM_SUITE_FT_FILS_SHA384 SUITE(0x000FAC, 17) #define WLAN_MAX_KEY_LEN 32 diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h index c5847dc75a937a5ceed4a762ffcc9f2644b499bc..0c16866a7aacc0cbc361e8d5b902252f71323f3b 100644 --- a/include/linux/if_bridge.h +++ b/include/linux/if_bridge.h @@ -48,6 +48,7 @@ struct br_ip_list { #define BR_MCAST_FLOOD BIT(11) #define BR_MULTICAST_TO_UNICAST BIT(12) #define BR_VLAN_TUNNEL BIT(13) +#define BR_BCAST_FLOOD BIT(14) #define BR_DEFAULT_AGEING_TIME (300 * HZ) diff --git a/include/linux/inet.h b/include/linux/inet.h index 4cca05c9678e760a07c0c8dcd88f73cc90ddfdc1..636ebe87e6f886a2edbb6e3e8bd01b360998127a 100644 --- a/include/linux/inet.h +++ b/include/linux/inet.h @@ -43,6 +43,8 @@ #define _LINUX_INET_H #include +#include +#include /* * These mimic similar macros defined in user-space for inet_ntop(3). 
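The inet_pton_with_scope() helper declared in the next hunk parses a string address/port pair straight into a struct sockaddr_storage. A short sketch of a caller, assuming that AF_UNSPEC makes the helper try IPv4 and then IPv6, and that an IPv6 scope suffix such as "%eth0" is resolved against the passed-in namespace; the function name and string literals are illustrative:

  #include <linux/inet.h>
  #include <linux/socket.h>

  static int example_parse_traddr(struct net *net, const char *traddr,
  				const char *trsvcid,
  				struct sockaddr_storage *addr)
  {
  	/* e.g. traddr = "fe80::1%eth0", trsvcid = "4420" */
  	return inet_pton_with_scope(net, AF_UNSPEC, traddr, trsvcid, addr);
  }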
@@ -54,4 +56,8 @@ extern __be32 in_aton(const char *str); extern int in4_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); extern int in6_pton(const char *src, int srclen, u8 *dst, int delim, const char **end); + +extern int inet_pton_with_scope(struct net *net, unsigned short af, + const char *src, const char *port, struct sockaddr_storage *addr); + #endif /* _LINUX_INET_H */ diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index ee971f335a8b659f04e5a3048ca70e5bc361ee5f..a2e9d6ea1349fb85418a9ebafc55dd08d16ca6b0 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -153,8 +153,8 @@ struct in_ifaddr { int register_inetaddr_notifier(struct notifier_block *nb); int unregister_inetaddr_notifier(struct notifier_block *nb); -void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, - struct ipv4_devconf *devconf); +void inet_netconf_notify_devconf(struct net *net, int event, int type, + int ifindex, struct ipv4_devconf *devconf); struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref); static inline struct net_device *ip_dev_find(struct net *net, __be32 addr) diff --git a/include/linux/init.h b/include/linux/init.h index 79af0962fd525980064241c653d748f2f9be63c0..94769d687cf07d8b41081f3e966d7319b5a71ee9 100644 --- a/include/linux/init.h +++ b/include/linux/init.h @@ -39,7 +39,7 @@ /* These are for everybody (although not all archs will actually discard it in modules) */ -#define __init __section(.init.text) __cold notrace __latent_entropy +#define __init __section(.init.text) __cold __inittrace __latent_entropy #define __initdata __section(.init.data) #define __initconst __section(.init.rodata) #define __exitdata __section(.exit.data) @@ -68,8 +68,10 @@ #ifdef MODULE #define __exitused +#define __inittrace notrace #else #define __exitused __used +#define __inittrace #endif #define __exit __section(.exit.text) __exitused __cold notrace diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 91d9049f003938a77002d76f8a2f0b3313437294..e049526bc188f765c7521653cf6fb90a574bc673 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -181,6 +182,7 @@ extern struct cred init_cred; #ifdef CONFIG_RT_MUTEXES # define INIT_RT_MUTEXES(tsk) \ .pi_waiters = RB_ROOT, \ + .pi_top_task = NULL, \ .pi_waiters_leftmost = NULL, #else # define INIT_RT_MUTEXES(tsk) @@ -202,6 +204,13 @@ extern struct cred init_cred; # define INIT_KASAN(tsk) #endif +#ifdef CONFIG_LIVEPATCH +# define INIT_LIVEPATCH(tsk) \ + .patch_state = KLP_UNDEFINED, +#else +# define INIT_LIVEPATCH(tsk) +#endif + #ifdef CONFIG_THREAD_INFO_IN_TASK # define INIT_TASK_TI(tsk) \ .thread_info = INIT_THREAD_INFO(tsk), \ @@ -210,6 +219,12 @@ extern struct cred init_cred; # define INIT_TASK_TI(tsk) #endif +#ifdef CONFIG_SECURITY +#define INIT_TASK_SECURITY .security = NULL, +#else +#define INIT_TASK_SECURITY +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. 
Base=0, limit=0x1fffff (=2MB) @@ -288,6 +303,8 @@ extern struct cred init_cred; INIT_VTIME(tsk) \ INIT_NUMA_BALANCING(tsk) \ INIT_KASAN(tsk) \ + INIT_LIVEPATCH(tsk) \ + INIT_TASK_SECURITY \ } diff --git a/include/linux/input/eeti_ts.h b/include/linux/input/eeti_ts.h deleted file mode 100644 index 16625d799b6fc1ca85f89d5d9f81415332c8e86e..0000000000000000000000000000000000000000 --- a/include/linux/input/eeti_ts.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef LINUX_INPUT_EETI_TS_H -#define LINUX_INPUT_EETI_TS_H - -struct eeti_ts_platform_data { - int irq_gpio; - unsigned int irq_active_high; -}; - -#endif /* LINUX_INPUT_EETI_TS_H */ - diff --git a/include/linux/input/matrix_keypad.h b/include/linux/input/matrix_keypad.h index 37b04a0fdea4e004a63c053bfb3c90d5de3612f1..6174733a57ebc60b68fe674d1c271bb8af6d9d51 100644 --- a/include/linux/input/matrix_keypad.h +++ b/include/linux/input/matrix_keypad.h @@ -49,6 +49,8 @@ struct matrix_keymap_data { * @wakeup: controls whether the device should be set up as wakeup * source * @no_autorepeat: disable key autorepeat + * @drive_inactive_cols: drive inactive columns during scan, rather than + * making them inputs. * * This structure represents platform-specific data that use used by * matrix_keypad driver to perform proper initialization. @@ -73,6 +75,7 @@ struct matrix_keypad_platform_data { bool active_low; bool wakeup; bool no_autorepeat; + bool drive_inactive_cols; }; int matrix_keypad_build_keymap(const struct matrix_keymap_data *keymap_data, diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h index c573a52ae440e83894709328820bacb79391d15c..485a5b48f0380460fa0b46243e34aa6c83796c07 100644 --- a/include/linux/intel-iommu.h +++ b/include/linux/intel-iommu.h @@ -30,6 +30,8 @@ #include #include #include +#include + #include #include @@ -72,24 +74,8 @@ #define OFFSET_STRIDE (9) -#ifdef CONFIG_64BIT #define dmar_readq(a) readq(a) #define dmar_writeq(a,v) writeq(v,a) -#else -static inline u64 dmar_readq(void __iomem *addr) -{ - u32 lo, hi; - lo = readl(addr); - hi = readl(addr + 4); - return (((u64) hi) << 32) + lo; -} - -static inline void dmar_writeq(void __iomem *addr, u64 val) -{ - writel((u32)val, addr); - writel((u32)(val >> 32), addr + 4); -} -#endif #define DMAR_VER_MAJOR(v) (((v) & 0xf0) >> 4) #define DMAR_VER_MINOR(v) ((v) & 0x0f) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 53144e78a3691d2d0ea46b48319c5af337f0fda2..a6fba4804672858b87430b81c0af52378bd7ed89 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -155,7 +155,7 @@ extern int __must_check request_percpu_irq(unsigned int irq, irq_handler_t handler, const char *devname, void __percpu *percpu_dev_id); -extern void free_irq(unsigned int, void *); +extern const void *free_irq(unsigned int, void *); extern void free_percpu_irq(unsigned int, void __percpu *); struct device; diff --git a/include/linux/io.h b/include/linux/io.h index 82ef36eac8a16a8fc2b7b021f345910ac04b04b4..2195d9ea4aaae0c054f04aab2da7cff851d2b997 100644 --- a/include/linux/io.h +++ b/include/linux/io.h @@ -90,6 +90,27 @@ void devm_memunmap(struct device *dev, void *addr); void *__devm_memremap_pages(struct device *dev, struct resource *res); +#ifdef CONFIG_PCI +/* + * The PCI specifications (Rev 3.0, 3.2.5 "Transaction Ordering and + * Posting") mandate non-posted configuration transactions. 
There is + * no ioremap API in the kernel that can guarantee non-posted write + * semantics across arches so provide a default implementation for + * mapping PCI config space that defaults to ioremap_nocache(); arches + * should override it if they have memory mapping implementations that + * guarantee non-posted write semantics to make the memory mapping + * compliant with the PCI specification. + */ +#ifndef pci_remap_cfgspace +#define pci_remap_cfgspace pci_remap_cfgspace +static inline void __iomem *pci_remap_cfgspace(phys_addr_t offset, + size_t size) +{ + return ioremap_nocache(offset, size); +} +#endif +#endif + /* * Some systems do not have legacy ISA devices. * /dev/port is not a valid interface on these systems. diff --git a/include/linux/iomap.h b/include/linux/iomap.h index 7291810067eb636dfa622b8b385ac45d7a520395..f753e788da3109cc85cd07ac44b2f7e47fcde9b6 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -41,6 +41,7 @@ struct iomap { u16 type; /* type of mapping */ u16 flags; /* flags for mapping */ struct block_device *bdev; /* block device for I/O */ + struct dax_device *dax_dev; /* dax_dev for dax operations */ }; /* diff --git a/include/linux/iommu.h b/include/linux/iommu.h index 2e4de0deee531adbd7c1cd8ff1d25e0d0eb98d47..2cb54adc4a334aa3f3a1732c35eaf9749d64b9e8 100644 --- a/include/linux/iommu.h +++ b/include/linux/iommu.h @@ -19,12 +19,12 @@ #ifndef __LINUX_IOMMU_H #define __LINUX_IOMMU_H +#include +#include +#include #include #include #include -#include -#include -#include #define IOMMU_READ (1 << 0) #define IOMMU_WRITE (1 << 1) @@ -32,10 +32,13 @@ #define IOMMU_NOEXEC (1 << 3) #define IOMMU_MMIO (1 << 4) /* e.g. things like MSI doorbells */ /* - * This is to make the IOMMU API setup privileged - * mapppings accessible by the master only at higher - * privileged execution level and inaccessible at - * less privileged levels. + * Where the bus hardware includes a privilege level as part of its access type + * markings, and certain devices are capable of issuing transactions marked as + * either 'supervisor' or 'user', the IOMMU_PRIV flag requests that the other + * given permission flags only apply to accesses at the higher privilege level, + * and that unprivileged transactions should have as little access as possible. + * This would usually imply the same permissions as kernel mappings on the CPU, + * if the IOMMU page table format is equivalent. */ #define IOMMU_PRIV (1 << 5) @@ -336,46 +339,9 @@ extern int iommu_domain_window_enable(struct iommu_domain *domain, u32 wnd_nr, phys_addr_t offset, u64 size, int prot); extern void iommu_domain_window_disable(struct iommu_domain *domain, u32 wnd_nr); -/** - * report_iommu_fault() - report about an IOMMU fault to the IOMMU framework - * @domain: the iommu domain where the fault has happened - * @dev: the device where the fault has happened - * @iova: the faulting address - * @flags: mmu fault flags (e.g. IOMMU_FAULT_READ/IOMMU_FAULT_WRITE/...) - * - * This function should be called by the low-level IOMMU implementations - * whenever IOMMU faults happen, to allow high-level users, that are - * interested in such events, to know about them.
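pci_remap_cfgspace() above uses the define-to-its-own-name convention so an architecture header included earlier can pre-empt the generic fallback. A compilable toy of the same convention (demo_remap_cfgspace and ARCH_HAS_STRICT_CFGSPACE are invented):

#include <stdio.h>

#ifdef ARCH_HAS_STRICT_CFGSPACE      /* imagine an arch header defines this */
#define demo_remap_cfgspace demo_remap_cfgspace
static const char *demo_remap_cfgspace(void)
{
        return "arch-specific non-posted mapping";
}
#endif

/* Generic fallback, same shape as the io.h hunk: defining the macro to
 * its own name is what makes the #ifndef test work. */
#ifndef demo_remap_cfgspace
#define demo_remap_cfgspace demo_remap_cfgspace
static const char *demo_remap_cfgspace(void)
{
        return "default ioremap_nocache()-style mapping";
}
#endif

int main(void)
{
        puts(demo_remap_cfgspace());
        return 0;
}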
- * - * This event may be useful for several possible use cases: - * - mere logging of the event - * - dynamic TLB/PTE loading - * - if restarting of the faulting device is required - * - * Returns 0 on success and an appropriate error code otherwise (if dynamic - * PTE/TLB loading will one day be supported, implementations will be able - * to tell whether it succeeded or not according to this return value). - * - * Specifically, -ENOSYS is returned if a fault handler isn't installed - * (though fault handlers can also return -ENOSYS, in case they want to - * elicit the default behavior of the IOMMU drivers). - */ -static inline int report_iommu_fault(struct iommu_domain *domain, - struct device *dev, unsigned long iova, int flags) -{ - int ret = -ENOSYS; - - /* - * if upper layers showed interest and installed a fault handler, - * invoke it. - */ - if (domain->handler) - ret = domain->handler(domain, dev, iova, flags, - domain->handler_token); - trace_io_page_fault(dev, iova, flags); - return ret; -} +extern int report_iommu_fault(struct iommu_domain *domain, struct device *dev, + unsigned long iova, int flags); static inline size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova, struct scatterlist *sg, diff --git a/include/linux/iova.h b/include/linux/iova.h index f27bb2c62fca5cf19a74f3f03c64b8cb9036bd19..e0a892ae45c0834a82d42e06a6bfab65b424c899 100644 --- a/include/linux/iova.h +++ b/include/linux/iova.h @@ -82,6 +82,7 @@ static inline unsigned long iova_pfn(struct iova_domain *iovad, dma_addr_t iova) return iova >> iova_shift(iovad); } +#if IS_ENABLED(CONFIG_IOMMU_IOVA) int iova_cache_get(void); void iova_cache_put(void); @@ -106,5 +107,95 @@ void put_iova_domain(struct iova_domain *iovad); struct iova *split_and_remove_iova(struct iova_domain *iovad, struct iova *iova, unsigned long pfn_lo, unsigned long pfn_hi); void free_cpu_cached_iovas(unsigned int cpu, struct iova_domain *iovad); +#else +static inline int iova_cache_get(void) +{ + return -ENOTSUPP; +} + +static inline void iova_cache_put(void) +{ +} + +static inline struct iova *alloc_iova_mem(void) +{ + return NULL; +} + +static inline void free_iova_mem(struct iova *iova) +{ +} + +static inline void free_iova(struct iova_domain *iovad, unsigned long pfn) +{ +} + +static inline void __free_iova(struct iova_domain *iovad, struct iova *iova) +{ +} + +static inline struct iova *alloc_iova(struct iova_domain *iovad, + unsigned long size, + unsigned long limit_pfn, + bool size_aligned) +{ + return NULL; +} + +static inline void free_iova_fast(struct iova_domain *iovad, + unsigned long pfn, + unsigned long size) +{ +} + +static inline unsigned long alloc_iova_fast(struct iova_domain *iovad, + unsigned long size, + unsigned long limit_pfn) +{ + return 0; +} + +static inline struct iova *reserve_iova(struct iova_domain *iovad, + unsigned long pfn_lo, + unsigned long pfn_hi) +{ + return NULL; +} + +static inline void copy_reserved_iova(struct iova_domain *from, + struct iova_domain *to) +{ +} + +static inline void init_iova_domain(struct iova_domain *iovad, + unsigned long granule, + unsigned long start_pfn, + unsigned long pfn_32bit) +{ +} + +static inline struct iova *find_iova(struct iova_domain *iovad, + unsigned long pfn) +{ + return NULL; +} + +static inline void put_iova_domain(struct iova_domain *iovad) +{ +} + +static inline struct iova *split_and_remove_iova(struct iova_domain *iovad, + struct iova *iova, + unsigned long pfn_lo, + unsigned long pfn_hi) +{ + return NULL; +} + +static inline void 
free_cpu_cached_iovas(unsigned int cpu, + struct iova_domain *iovad) +{ +} +#endif #endif diff --git a/include/linux/ipc.h b/include/linux/ipc.h index 9d84942ae2e577835a338b8a352f9c93eabe148b..71fd92d81b266edf1781c542de550159bc33949a 100644 --- a/include/linux/ipc.h +++ b/include/linux/ipc.h @@ -8,8 +8,7 @@ #define IPCMNI 32768 /* <= MAX_INT limit for ipc arrays (including sysctl changes) */ /* used by in-kernel data structures */ -struct kern_ipc_perm -{ +struct kern_ipc_perm { spinlock_t lock; bool deleted; int id; @@ -18,9 +17,9 @@ struct kern_ipc_perm kgid_t gid; kuid_t cuid; kgid_t cgid; - umode_t mode; + umode_t mode; unsigned long seq; void *security; -}; +} ____cacheline_aligned_in_smp; #endif /* _LINUX_IPC_H */ diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 71be5b330d21305af23f8f7e1779988930755ed8..e1b442996f810529a755533270b6d21c350fbd5a 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -37,6 +37,7 @@ struct ipv6_devconf { __s32 accept_ra_rtr_pref; __s32 rtr_probe_interval; #ifdef CONFIG_IPV6_ROUTE_INFO + __s32 accept_ra_rt_info_min_plen; __s32 accept_ra_rt_info_max_plen; #endif #endif @@ -70,6 +71,7 @@ struct ipv6_devconf { #endif __u32 enhanced_dad; __u32 addr_gen_mode; + __s32 disable_policy; struct ctl_table_header *sysctl_header; }; diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h index 97cbca19430d82aa2b4c835db1edf2d6620517ba..fffb91202bc9366a523002cc61fc074d12f84725 100644 --- a/include/linux/irqchip/arm-gic-v3.h +++ b/include/linux/irqchip/arm-gic-v3.h @@ -132,6 +132,9 @@ #define GIC_BASER_SHAREABILITY(reg, type) \ (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT) +/* encode a size field of width @w containing @n - 1 units */ +#define GIC_ENCODE_SZ(n, w) (((unsigned long)(n) - 1) & GENMASK_ULL(((w) - 1), 0)) + #define GICR_PROPBASER_SHAREABILITY_SHIFT (10) #define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7) #define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56) @@ -156,6 +159,8 @@ #define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb) #define GICR_PROPBASER_IDBITS_MASK (0x1f) +#define GICR_PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 12)) +#define GICR_PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(51, 16)) #define GICR_PENDBASER_SHAREABILITY_SHIFT (10) #define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7) @@ -232,12 +237,18 @@ #define GITS_CTLR_QUIESCENT (1U << 31) #define GITS_TYPER_PLPIS (1UL << 0) +#define GITS_TYPER_ITT_ENTRY_SIZE_SHIFT 4 #define GITS_TYPER_IDBITS_SHIFT 8 #define GITS_TYPER_DEVBITS_SHIFT 13 #define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1) #define GITS_TYPER_PTA (1UL << 19) #define GITS_TYPER_HWCOLLCNT_SHIFT 24 +#define GITS_IIDR_REV_SHIFT 12 +#define GITS_IIDR_REV_MASK (0xf << GITS_IIDR_REV_SHIFT) +#define GITS_IIDR_REV(r) (((r) >> GITS_IIDR_REV_SHIFT) & 0xf) +#define GITS_IIDR_PRODUCTID_SHIFT 24 + #define GITS_CBASER_VALID (1ULL << 63) #define GITS_CBASER_SHAREABILITY_SHIFT (10) #define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59) @@ -290,6 +301,7 @@ #define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7) #define GITS_BASER_ENTRY_SIZE_SHIFT (48) #define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0x1f) + 1) +#define GITS_BASER_ENTRY_SIZE_MASK GENMASK_ULL(52, 48) #define GITS_BASER_SHAREABILITY_SHIFT (10) #define GITS_BASER_InnerShareable \ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) @@ -337,9 +349,11 @@ #define E_ITS_INT_UNMAPPED_INTERRUPT 0x010307 #define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507 #define 
E_ITS_MAPD_DEVICE_OOR 0x010801 +#define E_ITS_MAPD_ITTSIZE_OOR 0x010802 #define E_ITS_MAPC_PROCNUM_OOR 0x010902 #define E_ITS_MAPC_COLLECTION_OOR 0x010903 #define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04 +#define E_ITS_MAPTI_ID_OOR 0x010a05 #define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06 #define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07 #define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09 diff --git a/include/linux/irqchip/mips-gic.h b/include/linux/irqchip/mips-gic.h index 7b49c71c968ba0935a5482afb71055107563542b..2b0e56619e5377064e9dfe7e701c09b36c200bf2 100644 --- a/include/linux/irqchip/mips-gic.h +++ b/include/linux/irqchip/mips-gic.h @@ -258,7 +258,6 @@ extern unsigned int gic_present; extern void gic_init(unsigned long gic_base_addr, unsigned long gic_addrspace_size, unsigned int cpu_vec, unsigned int irqbase); -extern void gic_clocksource_init(unsigned int); extern u64 gic_read_count(void); extern unsigned int gic_get_count_width(void); extern u64 gic_read_compare(void); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index dfaa1f4dcb0c54911c16a8683a38dabbfb4dcb0f..606b6bce3a5bb062d12b35d84bf8a022cbe0abe8 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -491,6 +491,8 @@ struct jbd2_journal_handle unsigned long h_start_jiffies; unsigned int h_requested_credits; + + unsigned int saved_alloc_context; }; diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index 624215cebee5235a3375549f62e026b3c7e41f74..36872fbb815d72203e14582e3dab6ba5051ee8a7 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -1,6 +1,7 @@ #ifndef _LINUX_JIFFIES_H #define _LINUX_JIFFIES_H +#include #include #include #include @@ -63,19 +64,13 @@ extern int register_refined_jiffies(long clock_tick_rate); /* TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) -/* some arch's have a small-data section that can be accessed register-relative - * but that can only take up to, say, 4-byte variables. jiffies being part of - * an 8-byte variable may not be correctly accessed unless we force the issue - */ -#define __jiffy_data __attribute__((section(".data"))) - /* * The 64-bit value is not atomic - you MUST NOT read it * without sampling the sequence number in jiffies_lock. * get_jiffies_64() will do this for you as appropriate. 
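The kbuild.h change above wraps the asm-offsets markers in .ascii directives. Reduced to something you can run through gcc -S (GNU C; struct foo and FOO_B are invented; my reading of the motivation is that the bare "->" lines were not valid assembly, which matters for stricter assemblers such as clang's integrated one):

struct foo { int a; long b; };

/* The new form of the marker, taken from the hunk: the value is
 * emitted as an immediate operand inside an .ascii string. */
#define DEFINE(sym, val) \
        asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val))

void asm_offsets(void)
{
        DEFINE(FOO_B, __builtin_offsetof(struct foo, b));
}

int main(void)
{
        return 0;                /* asm_offsets() is compiled, never run */
}

/* gcc -S on this file yields a line like
 *      .ascii "->FOO_B $8 __builtin_offsetof(struct foo, b)"
 * which kbuild's sed script rewrites into a "#define FOO_B 8" line. */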
*/ -extern u64 __jiffy_data jiffies_64; -extern unsigned long volatile __jiffy_data jiffies; +extern u64 __cacheline_aligned_in_smp jiffies_64; +extern unsigned long volatile __cacheline_aligned_in_smp jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); diff --git a/include/linux/kbuild.h b/include/linux/kbuild.h index 22a72198c14b7e338f4f10ed334f7c73be8cfcae..4e80f3a9ad58abc815926b701a8fbc83c153b5a9 100644 --- a/include/linux/kbuild.h +++ b/include/linux/kbuild.h @@ -2,14 +2,14 @@ #define __LINUX_KBUILD_H #define DEFINE(sym, val) \ - asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + asm volatile("\n.ascii \"->" #sym " %0 " #val "\"" : : "i" (val)) -#define BLANK() asm volatile("\n->" : : ) +#define BLANK() asm volatile("\n.ascii \"->\"" : : ) #define OFFSET(sym, str, mem) \ DEFINE(sym, offsetof(struct str, mem)) #define COMMENT(x) \ - asm volatile("\n->#" x) + asm volatile("\n.ascii \"->#" x "\"") #endif diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 4c26dc3a8295a9fb2910dcb18946ee86f54ceac8..13bc08aba70470dfa4ab5a5112e11035ac3fbc99 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -47,6 +47,7 @@ /* @a is a power of 2 value */ #define ALIGN(x, a) __ALIGN_KERNEL((x), (a)) +#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a)) #define __ALIGN_MASK(x, mask) __ALIGN_KERNEL_MASK((x), (mask)) #define PTR_ALIGN(p, a) ((typeof(p))ALIGN((unsigned long)(p), (a))) #define IS_ALIGNED(x, a) (((x) & ((typeof(x))(a) - 1)) == 0) @@ -438,6 +439,7 @@ extern int get_option(char **str, int *pint); extern char *get_options(const char *str, int nints, int *ints); extern unsigned long long memparse(const char *ptr, char **retptr); extern bool parse_option_str(const char *str, const char *option); +extern char *next_arg(char *args, char **param, char **val); extern int core_kernel_text(unsigned long addr); extern int core_kernel_data(unsigned long addr); diff --git a/include/linux/kexec.h b/include/linux/kexec.h index d419d0e51fe55172e5cc5b42bb4c8c2fd9694141..c9481ebcbc0cab09429bde237f21e41cc7dcb061 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -14,17 +14,15 @@ #if !defined(__ASSEMBLY__) +#include #include #include #ifdef CONFIG_KEXEC_CORE #include -#include #include #include -#include -#include #include #include @@ -62,19 +60,15 @@ #define KEXEC_CRASH_MEM_ALIGN PAGE_SIZE #endif -#define KEXEC_NOTE_HEAD_BYTES ALIGN(sizeof(struct elf_note), 4) -#define KEXEC_CORE_NOTE_NAME "CORE" -#define KEXEC_CORE_NOTE_NAME_BYTES ALIGN(sizeof(KEXEC_CORE_NOTE_NAME), 4) -#define KEXEC_CORE_NOTE_DESC_BYTES ALIGN(sizeof(struct elf_prstatus), 4) +#define KEXEC_CORE_NOTE_NAME CRASH_CORE_NOTE_NAME + /* * The per-cpu notes area is a list of notes terminated by a "NULL" * note header. For kdump, the code in vmcore.c runs in the context * of the second kernel to combine them into one note. 
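ALIGN_DOWN(), added to kernel.h above, is the rounding-down counterpart of ALIGN(). A small self-checking example of the arithmetic, using userspace copies of the kernel macros:

#include <assert.h>

/* Userspace copies of the kernel macros; a must be a power of two. */
#define __ALIGN_KERNEL_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define __ALIGN_KERNEL(x, a) __ALIGN_KERNEL_MASK(x, (__typeof__(x))(a) - 1)
#define ALIGN(x, a)      __ALIGN_KERNEL((x), (a))
#define ALIGN_DOWN(x, a) __ALIGN_KERNEL((x) - ((a) - 1), (a))

int main(void)
{
        assert(ALIGN(13UL, 4) == 16);       /* round up   */
        assert(ALIGN_DOWN(13UL, 4) == 12);  /* round down */
        assert(ALIGN(16UL, 4) == 16);       /* aligned values are fixpoints */
        assert(ALIGN_DOWN(16UL, 4) == 16);
        return 0;
}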
*/ #ifndef KEXEC_NOTE_BYTES -#define KEXEC_NOTE_BYTES ( (KEXEC_NOTE_HEAD_BYTES * 2) + \ - KEXEC_CORE_NOTE_NAME_BYTES + \ - KEXEC_CORE_NOTE_DESC_BYTES ) +#define KEXEC_NOTE_BYTES CRASH_CORE_NOTE_BYTES #endif /* @@ -256,33 +250,6 @@ extern void crash_kexec(struct pt_regs *); int kexec_should_crash(struct task_struct *); int kexec_crash_loaded(void); void crash_save_cpu(struct pt_regs *regs, int cpu); -void crash_save_vmcoreinfo(void); -void arch_crash_save_vmcoreinfo(void); -__printf(1, 2) -void vmcoreinfo_append_str(const char *fmt, ...); -phys_addr_t paddr_vmcoreinfo_note(void); - -#define VMCOREINFO_OSRELEASE(value) \ - vmcoreinfo_append_str("OSRELEASE=%s\n", value) -#define VMCOREINFO_PAGESIZE(value) \ - vmcoreinfo_append_str("PAGESIZE=%ld\n", value) -#define VMCOREINFO_SYMBOL(name) \ - vmcoreinfo_append_str("SYMBOL(%s)=%lx\n", #name, (unsigned long)&name) -#define VMCOREINFO_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(name)) -#define VMCOREINFO_STRUCT_SIZE(name) \ - vmcoreinfo_append_str("SIZE(%s)=%lu\n", #name, \ - (unsigned long)sizeof(struct name)) -#define VMCOREINFO_OFFSET(name, field) \ - vmcoreinfo_append_str("OFFSET(%s.%s)=%lu\n", #name, #field, \ - (unsigned long)offsetof(struct name, field)) -#define VMCOREINFO_LENGTH(name, value) \ - vmcoreinfo_append_str("LENGTH(%s)=%lu\n", #name, (unsigned long)value) -#define VMCOREINFO_NUMBER(name) \ - vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) -#define VMCOREINFO_CONFIG(name) \ - vmcoreinfo_append_str("CONFIG_%s=y\n", #name) extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; @@ -303,31 +270,15 @@ extern int kexec_load_disabled; #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ KEXEC_FILE_NO_INITRAMFS) -#define VMCOREINFO_BYTES (4096) -#define VMCOREINFO_NOTE_NAME "VMCOREINFO" -#define VMCOREINFO_NOTE_NAME_BYTES ALIGN(sizeof(VMCOREINFO_NOTE_NAME), 4) -#define VMCOREINFO_NOTE_SIZE (KEXEC_NOTE_HEAD_BYTES*2 + VMCOREINFO_BYTES \ - + VMCOREINFO_NOTE_NAME_BYTES) - /* Location of a reserved region to hold the crash kernel. 
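The VMCOREINFO_* macros deleted above (they move to the new crash_core infrastructure rather than disappearing) all funnel into vmcoreinfo_append_str(), which builds the "KEY=value" text that tools like makedumpfile later parse out of the ELF note. A toy reimplementation of that funneling (buffer size and the PG_lru value are invented):

#include <stdarg.h>
#include <stdio.h>
#include <string.h>

static char note[4096];                  /* ~ VMCOREINFO_BYTES */

static void vmcoreinfo_append_str(const char *fmt, ...)
{
        size_t used = strlen(note);
        va_list ap;

        va_start(ap, fmt);
        vsnprintf(note + used, sizeof(note) - used, fmt, ap);
        va_end(ap);
}

#define VMCOREINFO_OSRELEASE(value) \
        vmcoreinfo_append_str("OSRELEASE=%s\n", value)
#define VMCOREINFO_NUMBER(name) \
        vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name)

int main(void)
{
        long PG_lru = 4;                 /* stand-in for the real flag bit */

        VMCOREINFO_OSRELEASE("4.12.0-demo");
        VMCOREINFO_NUMBER(PG_lru);
        fputs(note, stdout);
        return 0;
}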
*/ extern struct resource crashk_res; extern struct resource crashk_low_res; -typedef u32 note_buf_t[KEXEC_NOTE_BYTES/4]; extern note_buf_t __percpu *crash_notes; -extern u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -extern size_t vmcoreinfo_size; -extern size_t vmcoreinfo_max_size; /* flag to track if kexec reboot is in progress */ extern bool kexec_in_progress; -int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); -int parse_crashkernel_high(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); -int parse_crashkernel_low(char *cmdline, unsigned long long system_ram, - unsigned long long *crash_size, unsigned long long *crash_base); int crash_shrink_memory(unsigned long new_size); size_t crash_get_memory_size(void); void crash_free_reserved_phys_range(unsigned long begin, unsigned long end); diff --git a/include/linux/key-type.h b/include/linux/key-type.h index eaee981c55584bc0aa8ee72365449e376713a536..8496cf64575c679b49a0eeedf0c0d3c4d1e2dd1c 100644 --- a/include/linux/key-type.h +++ b/include/linux/key-type.h @@ -147,6 +147,14 @@ struct key_type { */ request_key_actor_t request_key; + /* Look up a keyring access restriction (optional) + * + * - NULL is a valid return value (meaning the requested restriction + * is known but will never block addition of a key) + * - should return -EINVAL if the restriction is unknown + */ + struct key_restriction *(*lookup_restriction)(const char *params); + /* internal fields */ struct list_head link; /* link in types list */ struct lock_class_key lock_class; /* key->sem lock class */ diff --git a/include/linux/key.h b/include/linux/key.h index e45212f2777e36f531efe35ad7d56fcf67105c0b..0c9b93b0d1f7cd6133ea87515d3a83fb25f8258b 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef __KERNEL__ #include @@ -126,6 +127,17 @@ static inline bool is_key_possessed(const key_ref_t key_ref) return (unsigned long) key_ref & 1UL; } +typedef int (*key_restrict_link_func_t)(struct key *dest_keyring, + const struct key_type *type, + const union key_payload *payload, + struct key *restriction_key); + +struct key_restriction { + key_restrict_link_func_t check; + struct key *key; + struct key_type *keytype; +}; + /*****************************************************************************/ /* * authentication token / access credential / keyring @@ -135,7 +147,7 @@ static inline bool is_key_possessed(const key_ref_t key_ref) * - Kerberos TGTs and tickets */ struct key { - atomic_t usage; /* number of references */ + refcount_t usage; /* number of references */ key_serial_t serial; /* key serial number */ union { struct list_head graveyard_link; @@ -205,18 +217,17 @@ struct key { }; /* This is set on a keyring to restrict the addition of a link to a key - * to it. If this method isn't provided then it is assumed that the + * to it. If this structure isn't provided then it is assumed that the * keyring is open to any addition. It is ignored for non-keyring - * keys. + * keys. Only set this value using keyring_restrict(), keyring_alloc(), + * or key_alloc(). * * This is intended for use with rings of trusted keys whereby addition * to the keyring needs to be controlled. KEY_ALLOC_BYPASS_RESTRICTION * overrides this, allowing the kernel to add extra keys without * restriction. 
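The key.h hunk above turns the restriction from a bare function pointer into struct key_restriction and widens the check to four arguments. A sketch of a checker with the new signature, in the spirit of restrict_link_reject() (the types here are reduced stand-ins, not the real kernel definitions):

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct key;                              /* opaque for the demo */
struct key_type { const char *name; };
union key_payload { void *data[4]; };

typedef int (*key_restrict_link_func_t)(struct key *dest_keyring,
                                        const struct key_type *type,
                                        const union key_payload *payload,
                                        struct key *restriction_key);

struct key_restriction {
        key_restrict_link_func_t check;
        struct key *key;
        struct key_type *keytype;
};

/* Refuse every link, as restrict_link_reject() does; the added
 * restriction_key argument lets a real checker validate the candidate
 * against a stored trust anchor instead of global state only. */
static int demo_restrict_reject(struct key *dest_keyring,
                                const struct key_type *type,
                                const union key_payload *payload,
                                struct key *restriction_key)
{
        (void)dest_keyring; (void)type; (void)payload; (void)restriction_key;
        return -EPERM;
}

int main(void)
{
        struct key_restriction res = { .check = demo_restrict_reject };

        printf("check -> %d\n", res.check(NULL, NULL, NULL, res.key));
        return 0;
}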
*/ - int (*restrict_link)(struct key *keyring, - const struct key_type *type, - const union key_payload *payload); + struct key_restriction *restrict_link; }; extern struct key *key_alloc(struct key_type *type, @@ -225,9 +236,7 @@ extern struct key *key_alloc(struct key_type *type, const struct cred *cred, key_perm_t perm, unsigned long flags, - int (*restrict_link)(struct key *, - const struct key_type *, - const union key_payload *)); + struct key_restriction *restrict_link); #define KEY_ALLOC_IN_QUOTA 0x0000 /* add to quota, reject if would overrun */ @@ -242,7 +251,7 @@ extern void key_put(struct key *key); static inline struct key *__key_get(struct key *key) { - atomic_inc(&key->usage); + refcount_inc(&key->usage); return key; } @@ -303,14 +312,13 @@ extern struct key *keyring_alloc(const char *description, kuid_t uid, kgid_t gid const struct cred *cred, key_perm_t perm, unsigned long flags, - int (*restrict_link)(struct key *, - const struct key_type *, - const union key_payload *), + struct key_restriction *restrict_link, struct key *dest); extern int restrict_link_reject(struct key *keyring, const struct key_type *type, - const union key_payload *payload); + const union key_payload *payload, + struct key *restriction_key); extern int keyring_clear(struct key *keyring); @@ -321,6 +329,9 @@ extern key_ref_t keyring_search(key_ref_t keyring, extern int keyring_add_key(struct key *keyring, struct key *key); +extern int keyring_restrict(key_ref_t keyring, const char *type, + const char *restriction); + extern struct key *key_lookup(key_serial_t id); static inline key_serial_t key_serial(const struct key *key) diff --git a/include/linux/kobject.h b/include/linux/kobject.h index e6284591599ec5361b9abcffb2f731512b76eec9..ca85cb80e99a60e33df79e2ed0d8815953d1d21d 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -108,6 +108,8 @@ extern int __must_check kobject_rename(struct kobject *, const char *new_name); extern int __must_check kobject_move(struct kobject *, struct kobject *); extern struct kobject *kobject_get(struct kobject *kobj); +extern struct kobject * __must_check kobject_get_unless_zero( + struct kobject *kobj); extern void kobject_put(struct kobject *kobj); extern const void *kobject_namespace(struct kobject *kobj); diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index c328e4f7dcadb4276bddcfbcc97caa323c48a0a3..30f90c1a0aaf923d2227ae5ba5323ea5da728005 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -267,6 +267,8 @@ extern int arch_init_kprobes(void); extern void show_registers(struct pt_regs *regs); extern void kprobes_inc_nmissed_count(struct kprobe *p); extern bool arch_within_kprobe_blacklist(unsigned long addr); +extern bool arch_function_offset_within_entry(unsigned long offset); +extern bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset); extern bool within_kprobe_blacklist(unsigned long addr); @@ -379,6 +381,7 @@ static inline struct kprobe_ctlblk *get_kprobe_ctlblk(void) return this_cpu_ptr(&kprobe_ctlblk); } +kprobe_opcode_t *kprobe_lookup_name(const char *name, unsigned int offset); int register_kprobe(struct kprobe *p); void unregister_kprobe(struct kprobe *p); int register_kprobes(struct kprobe **kps, int num); diff --git a/include/linux/kref.h b/include/linux/kref.h index f4156f88f5575dceb23a78807a12e5fde037c701..29220724bf1cb43df4cc6840f4016fe7ba30d54c 100644 --- a/include/linux/kref.h +++ b/include/linux/kref.h @@ -66,8 +66,6 @@ static inline void kref_get(struct 
kref *kref) */ static inline int kref_put(struct kref *kref, void (*release)(struct kref *kref)) { - WARN_ON(release == NULL); - if (refcount_dec_and_test(&kref->refcount)) { release(kref); return 1; @@ -79,8 +77,6 @@ static inline int kref_put_mutex(struct kref *kref, void (*release)(struct kref *kref), struct mutex *lock) { - WARN_ON(release == NULL); - if (refcount_dec_and_mutex_lock(&kref->refcount, lock)) { release(kref); return 1; @@ -92,8 +88,6 @@ static inline int kref_put_lock(struct kref *kref, void (*release)(struct kref *kref), spinlock_t *lock) { - WARN_ON(release == NULL); - if (refcount_dec_and_lock(&kref->refcount, lock)) { release(kref); return 1; diff --git a/include/linux/ksm.h b/include/linux/ksm.h index e1cfda4bee588d726e2cfe9089ccc20baa031864..78b44a024eaae8e9a9a0e2c448b4b3e2b7f48719 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -61,7 +61,7 @@ static inline void set_page_stable_node(struct page *page, struct page *ksm_might_need_to_copy(struct page *page, struct vm_area_struct *vma, unsigned long address); -int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); void ksm_migrate_page(struct page *newpage, struct page *oldpage); #else /* !CONFIG_KSM */ @@ -94,10 +94,9 @@ static inline int page_referenced_ksm(struct page *page, return 0; } -static inline int rmap_walk_ksm(struct page *page, +static inline void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { - return 0; } static inline void ksm_migrate_page(struct page *newpage, struct page *oldpage) diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index d0250744507a284138d0e7e702bb64e1d205e769..8c0664309815ff95b0308b379e863125866f31f6 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -115,14 +115,17 @@ static inline bool is_error_page(struct page *page) return IS_ERR(page); } +#define KVM_REQUEST_MASK GENMASK(7,0) +#define KVM_REQUEST_NO_WAKEUP BIT(8) +#define KVM_REQUEST_WAIT BIT(9) /* * Architecture-independent vcpu->requests bit members * Bits 4-7 are reserved for more arch-independent bits. */ -#define KVM_REQ_TLB_FLUSH 0 -#define KVM_REQ_MMU_RELOAD 1 -#define KVM_REQ_PENDING_TIMER 2 -#define KVM_REQ_UNHALT 3 +#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_PENDING_TIMER 2 +#define KVM_REQ_UNHALT 3 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -268,6 +271,12 @@ struct kvm_vcpu { static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { + /* + * The memory barrier ensures a previous write to vcpu->requests cannot + * be reordered with the read of vcpu->mode. It pairs with the general + * memory barrier following the write of vcpu->mode in VCPU RUN. 
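The new KVM_REQUEST_MASK encoding above packs behaviour flags into the upper bits of a request number; only the low byte ever reaches the vcpu->requests bitmap. A self-checking userspace copy of the encoding:

#include <assert.h>

#define BIT(n)                (1UL << (n))
#define KVM_REQUEST_MASK      0xffUL     /* GENMASK(7,0) */
#define KVM_REQUEST_NO_WAKEUP BIT(8)
#define KVM_REQUEST_WAIT      BIT(9)

#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP)

int main(void)
{
        unsigned long requests = 0;
        int req = KVM_REQ_TLB_FLUSH;

        /* kvm_make_request() masks the flags off before touching the bitmap */
        requests |= BIT(req & KVM_REQUEST_MASK);
        assert(requests == BIT(0));

        /* the kicking code still sees how the request wants delivering */
        assert(req & KVM_REQUEST_WAIT);
        assert(req & KVM_REQUEST_NO_WAKEUP);
        return 0;
}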
+ */ + smp_mb__before_atomic(); return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE); } @@ -375,8 +384,6 @@ struct kvm { struct mutex slots_lock; struct mm_struct *mm; /* userspace tied to this vm */ struct kvm_memslots *memslots[KVM_ADDRESS_SPACE_NUM]; - struct srcu_struct srcu; - struct srcu_struct irq_srcu; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; /* @@ -403,7 +410,7 @@ struct kvm { struct kvm_vm_stat stat; struct kvm_arch arch; refcount_t users_count; -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; spinlock_t ring_lock; struct list_head coalesced_zones; @@ -429,6 +436,8 @@ struct kvm { struct list_head devices; struct dentry *debugfs_dentry; struct kvm_stat_data **debugfs_stat_data; + struct srcu_struct srcu; + struct srcu_struct irq_srcu; }; #define kvm_err(fmt, ...) \ @@ -490,6 +499,17 @@ static inline struct kvm_vcpu *kvm_get_vcpu_by_id(struct kvm *kvm, int id) return NULL; } +static inline int kvm_vcpu_get_idx(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu *tmp; + int idx; + + kvm_for_each_vcpu(idx, tmp, vcpu->kvm) + if (tmp == vcpu) + return idx; + BUG(); +} + #define kvm_for_each_memslot(memslot, slots) \ for (memslot = &slots->memslots[0]; \ memslot < slots->memslots + KVM_MEM_SLOTS_NUM && memslot->npages;\ @@ -502,10 +522,10 @@ int __must_check vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC -void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); +void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); void kvm_arch_post_irq_routing_update(struct kvm *kvm); #else -static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { } static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) @@ -641,18 +661,18 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); -int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len); -int kvm_vcpu_write_guest_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); -int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len); -int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - gpa_t gpa, unsigned long len); +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); +int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, int offset, unsigned long len); +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa, unsigned long len); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); @@ -682,7 +702,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); void 
kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); -void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); +bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); @@ -767,8 +787,6 @@ void kvm_arch_check_processor_compat(void *rtn); int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu); int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu); -void *kvm_kvzalloc(unsigned long size); - #ifndef __KVM_HAVE_ARCH_VM_ALLOC static inline struct kvm *kvm_arch_alloc_vm(void) { @@ -877,22 +895,6 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot); -#else -static inline int kvm_iommu_map_pages(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - return 0; -} - -static inline void kvm_iommu_unmap_pages(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ -} -#endif - /* * search_memslots() and __gfn_to_memslot() are here because they are * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. @@ -1025,6 +1027,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) #define KVM_MAX_IRQ_ROUTES 1024 #endif +bool kvm_arch_can_set_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, @@ -1092,13 +1095,23 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) * caller. Paired with the smp_mb__after_atomic in kvm_check_request. 
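The comment being added above pairs an smp_wmb() in kvm_make_request() with smp_mb__after_atomic() in kvm_check_request(). A C11 sketch of that publish/consume shape, with atomic fences standing in for the kernel barriers (single-threaded here; the ordering only matters cross-CPU):

#include <stdatomic.h>
#include <stdbool.h>

static atomic_ulong requests;
static unsigned long payload;

/* Producer: publish the data, fence, then set the request bit. */
static void make_request(int req, unsigned long data)
{
        payload = data;
        atomic_thread_fence(memory_order_release); /* ~ smp_wmb() */
        atomic_fetch_or(&requests, 1UL << req);    /* ~ set_bit() */
}

/* Consumer: test and clear the bit, fence, then read stable data. */
static bool check_request(int req, unsigned long *data)
{
        if (!(atomic_load(&requests) & (1UL << req)))
                return false;
        atomic_fetch_and(&requests, ~(1UL << req)); /* ~ clear_bit() */
        atomic_thread_fence(memory_order_acquire);  /* ~ smp_mb__after_atomic() */
        *data = payload;
        return true;
}

int main(void)
{
        unsigned long data = 0;

        make_request(3, 0xabcd);
        return (check_request(3, &data) && data == 0xabcd) ? 0 : 1;
}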
*/ smp_wmb(); - set_bit(req, &vcpu->requests); + set_bit(req & KVM_REQUEST_MASK, &vcpu->requests); +} + +static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) +{ + return test_bit(req & KVM_REQUEST_MASK, &vcpu->requests); +} + +static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu) +{ + clear_bit(req & KVM_REQUEST_MASK, &vcpu->requests); } static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) { - if (test_bit(req, &vcpu->requests)) { - clear_bit(req, &vcpu->requests); + if (kvm_test_request(req, vcpu)) { + kvm_clear_request(req, vcpu); /* * Ensure the rest of the request is visible to kvm_check_request's @@ -1165,7 +1178,6 @@ int kvm_register_device_ops(struct kvm_device_ops *ops, u32 type); void kvm_unregister_device_ops(u32 type); extern struct kvm_device_ops kvm_mpic_ops; -extern struct kvm_device_ops kvm_xics_ops; extern struct kvm_device_ops kvm_arm_vgic_v2_ops; extern struct kvm_device_ops kvm_arm_vgic_v3_ops; diff --git a/include/linux/leds-pca9532.h b/include/linux/leds-pca9532.h index d215b45611809ddc0c0d1e6d51b622b0db91cdbb..5e240b2b4d581b29c792de8ce8d6bff9ad045c91 100644 --- a/include/linux/leds-pca9532.h +++ b/include/linux/leds-pca9532.h @@ -22,7 +22,8 @@ enum pca9532_state { PCA9532_OFF = 0x0, PCA9532_ON = 0x1, PCA9532_PWM0 = 0x2, - PCA9532_PWM1 = 0x3 + PCA9532_PWM1 = 0x3, + PCA9532_KEEP = 0xff, }; struct pca9532_led { @@ -44,4 +45,3 @@ struct pca9532_platform_data { }; #endif /* __LINUX_PCA9532_H */ - diff --git a/include/linux/leds.h b/include/linux/leds.h index 38c0bd7ca1074af234d516275791d05f945ce1f0..64c56d454f7df9f864a5242ce4212df586f66886 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -122,10 +122,16 @@ struct led_classdev { struct mutex led_access; }; -extern int led_classdev_register(struct device *parent, - struct led_classdev *led_cdev); -extern int devm_led_classdev_register(struct device *parent, - struct led_classdev *led_cdev); +extern int of_led_classdev_register(struct device *parent, + struct device_node *np, + struct led_classdev *led_cdev); +#define led_classdev_register(parent, led_cdev) \ + of_led_classdev_register(parent, NULL, led_cdev) +extern int devm_of_led_classdev_register(struct device *parent, + struct device_node *np, + struct led_classdev *led_cdev); +#define devm_led_classdev_register(parent, led_cdev) \ + devm_of_led_classdev_register(parent, NULL, led_cdev) extern void led_classdev_unregister(struct led_classdev *led_cdev); extern void devm_led_classdev_unregister(struct device *parent, struct led_classdev *led_cdev); diff --git a/include/linux/libnvdimm.h b/include/linux/libnvdimm.h index 77e7af32543f698d0413286dc8702d7124bcf81b..6c807017128d4645d423b6ded0737c452fa3a3e6 100644 --- a/include/linux/libnvdimm.h +++ b/include/linux/libnvdimm.h @@ -20,9 +20,11 @@ enum { /* when a dimm supports both PMEM and BLK access a label is required */ - NDD_ALIASING = 1 << 0, + NDD_ALIASING = 0, /* unarmed memory devices may not persist writes */ - NDD_UNARMED = 1 << 1, + NDD_UNARMED = 1, + /* locked memory devices should not be accessed */ + NDD_LOCKED = 2, /* need to set a limit somewhere, but yes, this is likely overkill */ ND_IOCTL_MAX_BUFLEN = SZ_4M, @@ -120,7 +122,7 @@ static inline struct nd_blk_region_desc *to_blk_region_desc( } int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); -void nvdimm_clear_from_poison_list(struct nvdimm_bus *nvdimm_bus, +void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start, unsigned int len); struct nvdimm_bus 
*nvdimm_bus_register(struct device *parent, struct nvdimm_bus_descriptor *nfit_desc); diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index ca45e4a088a91eb929a208de5ffb061b33c9ff2c..7dfa56ebbc6d0f1bd4e215c93d5e642b3231334b 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -56,7 +56,6 @@ typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, u8 *); typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct ppa_addr *, int, int); typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); typedef void (nvm_destroy_dma_pool_fn)(void *); typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, @@ -70,7 +69,6 @@ struct nvm_dev_ops { nvm_op_set_bb_fn *set_bb_tbl; nvm_submit_io_fn *submit_io; - nvm_erase_blk_fn *erase_block; nvm_create_dma_pool_fn *create_dma_pool; nvm_destroy_dma_pool_fn *destroy_dma_pool; @@ -125,7 +123,7 @@ enum { /* NAND Access Modes */ NVM_IO_SUSPEND = 0x80, NVM_IO_SLC_MODE = 0x100, - NVM_IO_SCRAMBLE_DISABLE = 0x200, + NVM_IO_SCRAMBLE_ENABLE = 0x200, /* Block Types */ NVM_BLK_T_FREE = 0x0, @@ -438,7 +436,8 @@ static inline int ppa_cmp_blk(struct ppa_addr ppa1, struct ppa_addr ppa2) typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *); +typedef void *(nvm_tgt_init_fn)(struct nvm_tgt_dev *, struct gendisk *, + int flags); typedef void (nvm_tgt_exit_fn)(void *); typedef int (nvm_tgt_sysfs_init_fn)(struct gendisk *); typedef void (nvm_tgt_sysfs_exit_fn)(struct gendisk *); @@ -479,10 +478,10 @@ extern int nvm_set_tgt_bb_tbl(struct nvm_tgt_dev *, struct ppa_addr *, int, int); extern int nvm_max_phys_sects(struct nvm_tgt_dev *); extern int nvm_submit_io(struct nvm_tgt_dev *, struct nvm_rq *); -extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, +extern int nvm_erase_sync(struct nvm_tgt_dev *, struct ppa_addr *, int); +extern int nvm_set_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *, const struct ppa_addr *, int, int); -extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); -extern int nvm_erase_blk(struct nvm_tgt_dev *, struct ppa_addr *, int); +extern void nvm_free_rqd_ppalist(struct nvm_tgt_dev *, struct nvm_rq *); extern int nvm_get_l2p_tbl(struct nvm_tgt_dev *, u64, u32, nvm_l2p_update_fn *, void *); extern int nvm_get_area(struct nvm_tgt_dev *, sector_t *, sector_t); diff --git a/include/linux/livepatch.h b/include/linux/livepatch.h index 9072f04db616ee2467333411fdd287a0fe6b3d84..194991ef93476743d0a2c57d13de101ba1b9ac12 100644 --- a/include/linux/livepatch.h +++ b/include/linux/livepatch.h @@ -23,15 +23,16 @@ #include #include +#include #if IS_ENABLED(CONFIG_LIVEPATCH) #include -enum klp_state { - KLP_DISABLED, - KLP_ENABLED -}; +/* task patch states */ +#define KLP_UNDEFINED -1 +#define KLP_UNPATCHED 0 +#define KLP_PATCHED 1 /** * struct klp_func - function structure for live patching @@ -39,10 +40,29 @@ enum klp_state { * @new_func: pointer to the patched function code * @old_sympos: a hint indicating which symbol position the old function * can be found (optional) + * @immediate: patch the func immediately, bypassing safety mechanisms * @old_addr: the address of the function being patched * @kobj: kobject for sysfs resources - * @state: tracks 
function-level patch application state * @stack_node: list node for klp_ops func_stack list + * @old_size: size of the old function + * @new_size: size of the new function + * @patched: the func has been added to the klp_ops list + * @transition: the func is currently being applied or reverted + * + * The patched and transition variables define the func's patching state. When + * patching, a func is always in one of the following states: + * + * patched=0 transition=0: unpatched + * patched=0 transition=1: unpatched, temporary starting state + * patched=1 transition=1: patched, may be visible to some tasks + * patched=1 transition=0: patched, visible to all tasks + * + * And when unpatching, it goes in the reverse order: + * + * patched=1 transition=0: patched, visible to all tasks + * patched=1 transition=1: patched, may be visible to some tasks + * patched=0 transition=1: unpatched, temporary ending state + * patched=0 transition=0: unpatched */ struct klp_func { /* external */ @@ -56,12 +76,15 @@ struct klp_func { * in kallsyms for the given object is used. */ unsigned long old_sympos; + bool immediate; /* internal */ unsigned long old_addr; struct kobject kobj; - enum klp_state state; struct list_head stack_node; + unsigned long old_size, new_size; + bool patched; + bool transition; }; /** @@ -70,8 +93,8 @@ struct klp_func { * @funcs: function entries for functions to be patched in the object * @kobj: kobject for sysfs resources * @mod: kernel module associated with the patched object - * (NULL for vmlinux) - * @state: tracks object-level patch application state + * (NULL for vmlinux) + * @patched: the object's funcs have been added to the klp_ops list */ struct klp_object { /* external */ @@ -81,26 +104,30 @@ struct klp_object { /* internal */ struct kobject kobj; struct module *mod; - enum klp_state state; + bool patched; }; /** * struct klp_patch - patch structure for live patching * @mod: reference to the live patch module * @objs: object entries for kernel objects to be patched + * @immediate: patch all funcs immediately, bypassing safety mechanisms * @list: list node for global list of registered patches * @kobj: kobject for sysfs resources - * @state: tracks patch-level application state + * @enabled: the patch is enabled (but operation may be incomplete) + * @finish: for waiting till it is safe to remove the patch module */ struct klp_patch { /* external */ struct module *mod; struct klp_object *objs; + bool immediate; /* internal */ struct list_head list; struct kobject kobj; - enum klp_state state; + bool enabled; + struct completion finish; }; #define klp_for_each_object(patch, obj) \ @@ -123,10 +150,27 @@ void arch_klp_init_object_loaded(struct klp_patch *patch, int klp_module_coming(struct module *mod); void klp_module_going(struct module *mod); +void klp_copy_process(struct task_struct *child); +void klp_update_patch_state(struct task_struct *task); + +static inline bool klp_patch_pending(struct task_struct *task) +{ + return test_tsk_thread_flag(task, TIF_PATCH_PENDING); +} + +static inline bool klp_have_reliable_stack(void) +{ + return IS_ENABLED(CONFIG_STACKTRACE) && + IS_ENABLED(CONFIG_HAVE_RELIABLE_STACKTRACE); +} + #else /* !CONFIG_LIVEPATCH */ static inline int klp_module_coming(struct module *mod) { return 0; } -static inline void klp_module_going(struct module *mod) { } +static inline void klp_module_going(struct module *mod) {} +static inline bool klp_patch_pending(struct task_struct *task) { return false; } +static inline void klp_update_patch_state(struct 
task_struct *task) {} +static inline void klp_copy_process(struct task_struct *child) {} #endif /* CONFIG_LIVEPATCH */ diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index 140edab644462051f23d813514fe9a60404aa0e7..05728396a1a18f701f5bba8075be5301e6a89e4e 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -18,6 +18,7 @@ /* Dummy declarations */ struct svc_rqst; +struct rpc_task; /* * This is the set of functions for lockd->nfsd communication @@ -43,6 +44,7 @@ struct nlmclnt_initdata { u32 nfs_version; int noresvport; struct net *net; + const struct nlmclnt_operations *nlmclnt_ops; }; /* @@ -52,8 +54,26 @@ struct nlmclnt_initdata { extern struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init); extern void nlmclnt_done(struct nlm_host *host); -extern int nlmclnt_proc(struct nlm_host *host, int cmd, - struct file_lock *fl); +/* + * NLM client operations provide a means to modify RPC processing of NLM + * requests. Callbacks receive a pointer to data passed into the call to + * nlmclnt_proc(). + */ +struct nlmclnt_operations { + /* Called on successful allocation of nlm_rqst, use for allocation or + * reference counting. */ + void (*nlmclnt_alloc_call)(void *); + + /* Called in rpc_task_prepare for unlock. A return value of true + * indicates the callback has put the task to sleep on a waitqueue + * and NLM should not call rpc_call_start(). */ + bool (*nlmclnt_unlock_prepare)(struct rpc_task*, void *); + + /* Called when the nlm_rqst is freed, callbacks should clean up here */ + void (*nlmclnt_release_call)(void *); +}; + +extern int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl, void *data); extern int lockd_up(struct net *net); extern void lockd_down(struct net *net); diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index b37dee3acaba4c152fc2c0c928f4f312c74cc778..41f7b6a04d6914841a155d4734d308f6b4cc8d00 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -69,6 +69,7 @@ struct nlm_host { char *h_addrbuf; /* address eyecatcher */ struct net *net; /* host net */ char nodename[UNX_MAXNODENAME + 1]; + const struct nlmclnt_operations *h_nlmclnt_ops; /* Callback ops for NLM users */ }; /* @@ -142,6 +143,7 @@ struct nlm_rqst { struct nlm_block * a_block; unsigned int a_retries; /* Retry count */ u8 a_owner[NLMCLNT_OHSIZE]; + void * a_callback_data; /* sent to nlmclnt_operations callbacks */ }; /* diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 1e327bb80838d86202276a63fa715686fb6eff7e..fffe49f188e6d22dfb53ad86e7c53da44a287974 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -361,6 +361,8 @@ static inline void lock_set_subclass(struct lockdep_map *lock, lock_set_class(lock, lock->name, lock->key, subclass, ip); } +extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); + extern void lockdep_set_current_reclaim_state(gfp_t gfp_mask); extern void lockdep_clear_current_reclaim_state(void); extern void lockdep_trace_alloc(gfp_t mask); @@ -411,6 +413,7 @@ static inline void lockdep_on(void) # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, n, i) do { } while (0) +# define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_set_current_reclaim_state(g) do { } while (0) diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index 
e29d4c62a3c8e268695d9963c2250f3eaf375399..080f34e66017a8299d10bc4276b1acfe8a1f94cc 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -533,8 +533,13 @@ * manual page for definitions of the @clone_flags. * @clone_flags contains the flags indicating what should be shared. * Return 0 if permission is granted. + * @task_alloc: + * @task task being allocated. + * @clone_flags contains the flags indicating what should be shared. + * Handle allocation of task-related resources. + * Returns a zero on success, negative values on failure. * @task_free: - * @task task being freed + * @task task about to be freed. * Handle release of task-related resources. (Note that this can be called * from interrupt context.) * @cred_alloc_blank: @@ -630,10 +635,19 @@ * Check permission before getting the ioprio value of @p. * @p contains the task_struct of process. * Return 0 if permission is granted. + * @task_prlimit: + * Check permission before getting and/or setting the resource limits of + * another task. + * @cred points to the cred structure for the current task. + * @tcred points to the cred structure for the target task. + * @flags contains the LSM_PRLIMIT_* flag bits indicating whether the + * resource limits are being read, modified, or both. + * Return 0 if permission is granted. * @task_setrlimit: - * Check permission before setting the resource limits of the current - * process for @resource to @new_rlim. The old resource limit values can - * be examined by dereferencing (current->signal->rlim + resource). + * Check permission before setting the resource limits of process @p + * for @resource to @new_rlim. The old resource limit values can + * be examined by dereferencing (p->signal->rlim + resource). + * @p points to the task_struct for the target task's group leader. * @resource contains the resource whose limit is being set. * @new_rlim contains the new limits for @resource. * Return 0 if permission is granted. @@ -1473,6 +1487,7 @@ union security_list_options { int (*file_open)(struct file *file, const struct cred *cred); int (*task_create)(unsigned long clone_flags); + int (*task_alloc)(struct task_struct *task, unsigned long clone_flags); void (*task_free)(struct task_struct *task); int (*cred_alloc_blank)(struct cred *cred, gfp_t gfp); void (*cred_free)(struct cred *cred); @@ -1494,6 +1509,8 @@ union security_list_options { int (*task_setnice)(struct task_struct *p, int nice); int (*task_setioprio)(struct task_struct *p, int ioprio); int (*task_getioprio)(struct task_struct *p); + int (*task_prlimit)(const struct cred *cred, const struct cred *tcred, + unsigned int flags); int (*task_setrlimit)(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim); int (*task_setscheduler)(struct task_struct *p); @@ -1737,6 +1754,7 @@ struct security_hook_heads { struct list_head file_receive; struct list_head file_open; struct list_head task_create; + struct list_head task_alloc; struct list_head task_free; struct list_head cred_alloc_blank; struct list_head cred_free; @@ -1755,6 +1773,7 @@ struct security_hook_heads { struct list_head task_setnice; struct list_head task_setioprio; struct list_head task_getioprio; + struct list_head task_prlimit; struct list_head task_setrlimit; struct list_head task_setscheduler; struct list_head task_getscheduler; @@ -1908,6 +1927,13 @@ static inline void security_delete_hooks(struct security_hook_list *hooks, } #endif /* CONFIG_SECURITY_SELINUX_DISABLE */ +/* Currently required to handle SELinux runtime hook disable. 
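task_alloc, documented above, gives every security module a veto at task allocation time. A toy model of the hook walk and its first-error-wins convention (the demo_* modules and the array are invented; the kernel iterates per-hook list_heads rather than an array):

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct task_struct;                      /* opaque for the demo */

typedef int (*task_alloc_hook)(struct task_struct *task,
                               unsigned long clone_flags);

static int demo_yama_task_alloc(struct task_struct *task, unsigned long flags)
{
        (void)task; (void)flags;
        return 0;                        /* allow */
}

static int demo_blob_task_alloc(struct task_struct *task, unsigned long flags)
{
        (void)task; (void)flags;
        return -ENOMEM;                  /* pretend a per-task blob failed */
}

static task_alloc_hook hooks[] = {
        demo_yama_task_alloc,
        demo_blob_task_alloc,
};

/* First non-zero return wins and fails the fork with that error. */
static int security_task_alloc_demo(struct task_struct *task,
                                    unsigned long clone_flags)
{
        for (size_t i = 0; i < sizeof(hooks) / sizeof(hooks[0]); i++) {
                int rc = hooks[i](task, clone_flags);

                if (rc)
                        return rc;
        }
        return 0;
}

int main(void)
{
        printf("security_task_alloc() -> %d\n",
               security_task_alloc_demo(NULL, 0));
        return 0;
}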
*/ +#ifdef CONFIG_SECURITY_WRITABLE_HOOKS +#define __lsm_ro_after_init +#else +#define __lsm_ro_after_init __ro_after_init +#endif /* CONFIG_SECURITY_WRITABLE_HOOKS */ + extern int __init security_module_enable(const char *module); extern void __init capability_add_hooks(void); #ifdef CONFIG_SECURITY_YAMA diff --git a/include/linux/mailbox/brcm-message.h b/include/linux/mailbox/brcm-message.h index 6b55c938b4015f23ae2f3deee224817902a958c8..c20b4843fc2d60ab9a0249ff7d08799a6bcab1d8 100644 --- a/include/linux/mailbox/brcm-message.h +++ b/include/linux/mailbox/brcm-message.h @@ -16,6 +16,7 @@ enum brcm_message_type { BRCM_MESSAGE_UNKNOWN = 0, + BRCM_MESSAGE_BATCH, BRCM_MESSAGE_SPU, BRCM_MESSAGE_SBA, BRCM_MESSAGE_MAX, @@ -23,23 +24,28 @@ enum brcm_message_type { struct brcm_sba_command { u64 cmd; + u64 *cmd_dma; + dma_addr_t cmd_dma_addr; #define BRCM_SBA_CMD_TYPE_A BIT(0) #define BRCM_SBA_CMD_TYPE_B BIT(1) #define BRCM_SBA_CMD_TYPE_C BIT(2) #define BRCM_SBA_CMD_HAS_RESP BIT(3) #define BRCM_SBA_CMD_HAS_OUTPUT BIT(4) u64 flags; - dma_addr_t input; - size_t input_len; dma_addr_t resp; size_t resp_len; - dma_addr_t output; - size_t output_len; + dma_addr_t data; + size_t data_len; }; struct brcm_message { enum brcm_message_type type; union { + struct { + struct brcm_message *msgs; + unsigned int msgs_queued; + unsigned int msgs_count; + } batch; struct { struct scatterlist *src; struct scatterlist *dst; diff --git a/include/linux/memblock.h b/include/linux/memblock.h index bdfc65af4152047c7b34f9b459f095a6d675cd11..4ce24a3762627be20e805da3eaab26ba1bc47578 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -93,6 +93,7 @@ int memblock_mark_hotplug(phys_addr_t base, phys_addr_t size); int memblock_clear_hotplug(phys_addr_t base, phys_addr_t size); int memblock_mark_mirror(phys_addr_t base, phys_addr_t size); int memblock_mark_nomap(phys_addr_t base, phys_addr_t size); +int memblock_clear_nomap(phys_addr_t base, phys_addr_t size); ulong choose_memblock_flags(void); /* Low level functions */ @@ -335,6 +336,7 @@ phys_addr_t memblock_mem_size(unsigned long limit_pfn); phys_addr_t memblock_start_of_DRAM(void); phys_addr_t memblock_end_of_DRAM(void); void memblock_enforce_memory_limit(phys_addr_t memory_limit); +void memblock_cap_memory_range(phys_addr_t base, phys_addr_t size); void memblock_mem_limit_remove_map(phys_addr_t limit); bool memblock_is_memory(phys_addr_t addr); int memblock_is_map_memory(phys_addr_t addr); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bb7250c45cb8356b03df2d4b9f1325dc6e30069b..899949bbb2f9362182e7fd28dc34833cca7de78e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -35,48 +35,43 @@ struct page; struct mm_struct; struct kmem_cache; -/* - * The corresponding mem_cgroup_stat_names is defined in mm/memcontrol.c, - * These two lists should keep in accord with each other. - */ -enum mem_cgroup_stat_index { - /* - * For MEM_CONTAINER_TYPE_ALL, usage = pagecache + rss. 
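The memcontrol.h rework above renumbers the cgroup-private counters to start right after the generic node counters, so a single array (and one update helper) can index both ranges. A compilable miniature of that layering (the enum values are invented stand-ins):

#include <stdio.h>

enum node_stat_item { NR_FILE_PAGES, NR_ANON_MAPPED, NR_VM_NODE_STAT_ITEMS };

/* Cgroup-private items continue the numbering, as in the hunk above. */
enum memcg_stat_item {
        MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS,
        MEMCG_RSS,
        MEMCG_SWAP,
        MEMCG_NR_STAT,
};

static long stat_array[MEMCG_NR_STAT];   /* one array serves both ranges */

static void mod_memcg_state_demo(int idx, int val)
{
        stat_array[idx] += val;
}

int main(void)
{
        mod_memcg_state_demo(NR_FILE_PAGES, 2);  /* generic node index */
        mod_memcg_state_demo(MEMCG_RSS, 1);      /* cgroup-private index */
        printf("file=%ld rss=%ld\n", stat_array[NR_FILE_PAGES],
               stat_array[MEMCG_RSS]);
        return 0;
}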
- */ - MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ - MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ - MEM_CGROUP_STAT_RSS_HUGE, /* # of pages charged as anon huge */ - MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_DIRTY, /* # of dirty pages in page cache */ - MEM_CGROUP_STAT_WRITEBACK, /* # of pages under writeback */ - MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ - MEM_CGROUP_STAT_NSTATS, - /* default hierarchy stats */ - MEMCG_KERNEL_STACK_KB = MEM_CGROUP_STAT_NSTATS, +/* Cgroup-specific page state, on top of universal node page state */ +enum memcg_stat_item { + MEMCG_CACHE = NR_VM_NODE_STAT_ITEMS, + MEMCG_RSS, + MEMCG_RSS_HUGE, + MEMCG_SWAP, + MEMCG_SOCK, + /* XXX: why are these zone and not node counters? */ + MEMCG_KERNEL_STACK_KB, MEMCG_SLAB_RECLAIMABLE, MEMCG_SLAB_UNRECLAIMABLE, - MEMCG_SOCK, MEMCG_NR_STAT, }; +/* Cgroup-specific events, on top of universal VM events */ +enum memcg_event_item { + MEMCG_LOW = NR_VM_EVENT_ITEMS, + MEMCG_HIGH, + MEMCG_MAX, + MEMCG_OOM, + MEMCG_NR_EVENTS, +}; + struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; int priority; unsigned int generation; }; -enum mem_cgroup_events_index { - MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */ - MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */ - MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */ - MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */ - MEM_CGROUP_EVENTS_NSTATS, - /* default hierarchy events */ - MEMCG_LOW = MEM_CGROUP_EVENTS_NSTATS, - MEMCG_HIGH, - MEMCG_MAX, - MEMCG_OOM, - MEMCG_NR_EVENTS, +#ifdef CONFIG_MEMCG + +#define MEM_CGROUP_ID_SHIFT 16 +#define MEM_CGROUP_ID_MAX USHRT_MAX + +struct mem_cgroup_id { + int id; + atomic_t ref; }; /* @@ -92,16 +87,6 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -#ifdef CONFIG_MEMCG - -#define MEM_CGROUP_ID_SHIFT 16 -#define MEM_CGROUP_ID_MAX USHRT_MAX - -struct mem_cgroup_id { - int id; - atomic_t ref; -}; - struct mem_cgroup_stat_cpu { long count[MEMCG_NR_STAT]; unsigned long events[MEMCG_NR_EVENTS]; @@ -283,17 +268,10 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -/** - * mem_cgroup_events - count memory events against a cgroup - * @memcg: the memory cgroup - * @idx: the event index - * @nr: the number of events to account for - */ -static inline void mem_cgroup_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx, - unsigned int nr) +static inline void mem_cgroup_event(struct mem_cgroup *memcg, + enum memcg_event_item event) { - this_cpu_add(memcg->stat->events[idx], nr); + this_cpu_inc(memcg->stat->events[event]); cgroup_file_notify(&memcg->events_file); } @@ -494,8 +472,42 @@ extern int do_swap_account; void lock_page_memcg(struct page *page); void unlock_page_memcg(struct page *page); +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + long val = 0; + int cpu; + + for_each_possible_cpu(cpu) + val += per_cpu(memcg->stat->count[idx], cpu); + + if (val < 0) + val = 0; + + return val; +} + +static inline void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, int val) +{ + if (!mem_cgroup_disabled()) + this_cpu_add(memcg->stat->count[idx], val); +} + +static inline void inc_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + mod_memcg_state(memcg, idx, 1); +} + +static inline void dec_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + mod_memcg_state(memcg, idx, -1); +} + /** - * 
mem_cgroup_update_page_stat - update page state statistics + * mod_memcg_page_state - update page state statistics * @page: the page * @idx: page state item to account * @val: number of pages (positive or negative) @@ -506,28 +518,28 @@ void unlock_page_memcg(struct page *page); * * lock_page(page) or lock_page_memcg(page) * if (TestClearPageState(page)) - * mem_cgroup_update_page_stat(page, state, -1); + * mod_memcg_page_state(page, state, -1); * unlock_page(page) or unlock_page_memcg(page) + * + * Kernel pages are an exception to this, since they'll never move. */ -static inline void mem_cgroup_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, int val) +static inline void mod_memcg_page_state(struct page *page, + enum memcg_stat_item idx, int val) { - VM_BUG_ON(!(rcu_read_lock_held() || PageLocked(page))); - if (page->mem_cgroup) - this_cpu_add(page->mem_cgroup->stat->count[idx], val); + mod_memcg_state(page->mem_cgroup, idx, val); } -static inline void mem_cgroup_inc_page_stat(struct page *page, - enum mem_cgroup_stat_index idx) +static inline void inc_memcg_page_state(struct page *page, + enum memcg_stat_item idx) { - mem_cgroup_update_page_stat(page, idx, 1); + mod_memcg_page_state(page, idx, 1); } -static inline void mem_cgroup_dec_page_stat(struct page *page, - enum mem_cgroup_stat_index idx) +static inline void dec_memcg_page_state(struct page *page, + enum memcg_stat_item idx) { - mem_cgroup_update_page_stat(page, idx, -1); + mod_memcg_page_state(page, idx, -1); } unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, @@ -544,20 +556,8 @@ static inline void mem_cgroup_count_vm_event(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!memcg)) - goto out; - - switch (idx) { - case PGFAULT: - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]); - break; - case PGMAJFAULT: - this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]); - break; - default: - BUG(); - } -out: + if (likely(memcg)) + this_cpu_inc(memcg->stat->events[idx]); rcu_read_unlock(); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -576,9 +576,8 @@ static inline bool mem_cgroup_disabled(void) return true; } -static inline void mem_cgroup_events(struct mem_cgroup *memcg, - enum mem_cgroup_events_index idx, - unsigned int nr) +static inline void mem_cgroup_event(struct mem_cgroup *memcg, + enum memcg_event_item event) { } @@ -740,19 +739,41 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } -static inline void mem_cgroup_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, - int nr) +static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ + return 0; +} + +static inline void mod_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx, + int nr) +{ +} + +static inline void inc_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ +} + +static inline void dec_memcg_state(struct mem_cgroup *memcg, + enum memcg_stat_item idx) +{ +} + +static inline void mod_memcg_page_state(struct page *page, + enum memcg_stat_item idx, + int nr) { } -static inline void mem_cgroup_inc_page_stat(struct page *page, - enum mem_cgroup_stat_index idx) +static inline void inc_memcg_page_state(struct page *page, + enum memcg_stat_item idx) { } -static inline void mem_cgroup_dec_page_stat(struct page *page, - enum mem_cgroup_stat_index idx) +static inline void dec_memcg_page_state(struct page *page, + enum memcg_stat_item idx) { } @@ 
-872,7 +893,7 @@ static inline int memcg_cache_id(struct mem_cgroup *memcg) * @val: number of pages (positive or negative) */ static inline void memcg_kmem_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, int val) + enum memcg_stat_item idx, int val) { if (memcg_kmem_enabled() && page->mem_cgroup) this_cpu_add(page->mem_cgroup->stat->count[idx], val); @@ -901,7 +922,7 @@ static inline void memcg_put_cache_ids(void) } static inline void memcg_kmem_update_page_stat(struct page *page, - enum mem_cgroup_stat_index idx, int val) + enum memcg_stat_item idx, int val) { } #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ diff --git a/include/linux/mfd/arizona/pdata.h b/include/linux/mfd/arizona/pdata.h index 64faeeff698c2275bab11c9680f4665126a50494..bfeecf179895b09edfb2ed5489e6293ca5a100d7 100644 --- a/include/linux/mfd/arizona/pdata.h +++ b/include/linux/mfd/arizona/pdata.h @@ -12,6 +12,8 @@ #define _ARIZONA_PDATA_H #include +#include +#include #define ARIZONA_GPN_DIR_MASK 0x8000 /* GPN_DIR */ #define ARIZONA_GPN_DIR_SHIFT 15 /* GPN_DIR */ @@ -76,13 +78,12 @@ struct arizona_micd_range { struct arizona_pdata { int reset; /** GPIO controlling /RESET, if any */ - int ldoena; /** GPIO controlling LODENA, if any */ /** Regulator configuration for MICVDD */ - struct regulator_init_data *micvdd; + struct arizona_micsupp_pdata micvdd; /** Regulator configuration for LDO1 */ - struct regulator_init_data *ldo1; + struct arizona_ldo1_pdata ldo1; /** If a direct 32kHz clock is provided on an MCLK specify it here */ int clk32k_src; diff --git a/include/linux/mfd/axp20x.h b/include/linux/mfd/axp20x.h index 0d9a1ff383933e375dedf820f7ba5ef7d55494c7..cde56cfe8446b17b9696b2e1182c13cb29f9e727 100644 --- a/include/linux/mfd/axp20x.h +++ b/include/linux/mfd/axp20x.h @@ -20,6 +20,7 @@ enum axp20x_variants { AXP221_ID, AXP223_ID, AXP288_ID, + AXP803_ID, AXP806_ID, AXP809_ID, NR_AXP20X_VARIANTS, @@ -228,13 +229,13 @@ enum axp20x_variants { #define AXP20X_OCV_MAX 0xf /* AXP22X specific registers */ -#define AXP22X_PMIC_ADC_H 0x56 -#define AXP22X_PMIC_ADC_L 0x57 +#define AXP22X_PMIC_TEMP_H 0x56 +#define AXP22X_PMIC_TEMP_L 0x57 #define AXP22X_TS_ADC_H 0x58 #define AXP22X_TS_ADC_L 0x59 #define AXP22X_BATLOW_THRES1 0xe6 -/* AXP288 specific registers */ +/* AXP288/AXP803 specific registers */ #define AXP288_POWER_REASON 0x02 #define AXP288_BC_GLOBAL 0x2c #define AXP288_BC_VBUS_CNTL 0x2d @@ -475,6 +476,43 @@ enum axp288_irqs { AXP288_IRQ_BC_USB_CHNG, }; +enum axp803_irqs { + AXP803_IRQ_ACIN_OVER_V = 1, + AXP803_IRQ_ACIN_PLUGIN, + AXP803_IRQ_ACIN_REMOVAL, + AXP803_IRQ_VBUS_OVER_V, + AXP803_IRQ_VBUS_PLUGIN, + AXP803_IRQ_VBUS_REMOVAL, + AXP803_IRQ_BATT_PLUGIN, + AXP803_IRQ_BATT_REMOVAL, + AXP803_IRQ_BATT_ENT_ACT_MODE, + AXP803_IRQ_BATT_EXIT_ACT_MODE, + AXP803_IRQ_CHARG, + AXP803_IRQ_CHARG_DONE, + AXP803_IRQ_BATT_CHG_TEMP_HIGH, + AXP803_IRQ_BATT_CHG_TEMP_HIGH_END, + AXP803_IRQ_BATT_CHG_TEMP_LOW, + AXP803_IRQ_BATT_CHG_TEMP_LOW_END, + AXP803_IRQ_BATT_ACT_TEMP_HIGH, + AXP803_IRQ_BATT_ACT_TEMP_HIGH_END, + AXP803_IRQ_BATT_ACT_TEMP_LOW, + AXP803_IRQ_BATT_ACT_TEMP_LOW_END, + AXP803_IRQ_DIE_TEMP_HIGH, + AXP803_IRQ_GPADC, + AXP803_IRQ_LOW_PWR_LVL1, + AXP803_IRQ_LOW_PWR_LVL2, + AXP803_IRQ_TIMER, + AXP803_IRQ_PEK_RIS_EDGE, + AXP803_IRQ_PEK_FAL_EDGE, + AXP803_IRQ_PEK_SHORT, + AXP803_IRQ_PEK_LONG, + AXP803_IRQ_PEK_OVER_OFF, + AXP803_IRQ_GPIO1_INPUT, + AXP803_IRQ_GPIO0_INPUT, + AXP803_IRQ_BC_USB_CHNG, + AXP803_IRQ_MV_CHNG, +}; + enum axp806_irqs { AXP806_IRQ_DIE_TEMP_HIGH_LV1, AXP806_IRQ_DIE_TEMP_HIGH_LV2, diff --git 
a/include/linux/mfd/cros_ec.h b/include/linux/mfd/cros_ec.h index 3eef9fb9968ae730716a79bc9e3aef8be4e2e650..28baee63eaf62f11dd31417db8aed7ba34a5a63b 100644 --- a/include/linux/mfd/cros_ec.h +++ b/include/linux/mfd/cros_ec.h @@ -305,4 +305,22 @@ extern struct attribute_group cros_ec_attr_group; extern struct attribute_group cros_ec_lightbar_attr_group; extern struct attribute_group cros_ec_vbc_attr_group; +/* ACPI GPE handler */ +#ifdef CONFIG_ACPI + +int cros_ec_acpi_install_gpe_handler(struct device *dev); +void cros_ec_acpi_remove_gpe_handler(void); +void cros_ec_acpi_clear_gpe(void); + +#else /* CONFIG_ACPI */ + +static inline int cros_ec_acpi_install_gpe_handler(struct device *dev) +{ + return -ENODEV; +} +static inline void cros_ec_acpi_remove_gpe_handler(void) {} +static inline void cros_ec_acpi_clear_gpe(void) {} + +#endif /* CONFIG_ACPI */ + #endif /* __LINUX_MFD_CROS_EC_H */ diff --git a/include/linux/mfd/cros_ec_commands.h b/include/linux/mfd/cros_ec_commands.h index f1ef6388c2337e55310a3dd6b8244f792022580c..c93e7e0300efa6507cd96925c2130378a0573fca 100644 --- a/include/linux/mfd/cros_ec_commands.h +++ b/include/linux/mfd/cros_ec_commands.h @@ -2041,6 +2041,9 @@ enum ec_mkbp_event { /* The state of the switches have changed. */ EC_MKBP_EVENT_SWITCH = 4, + /* EC sent a sysrq command */ + EC_MKBP_EVENT_SYSRQ = 6, + /* Number of MKBP events */ EC_MKBP_EVENT_COUNT, }; @@ -2053,6 +2056,7 @@ union ec_response_get_next_data { uint32_t buttons; uint32_t switches; + uint32_t sysrq; } __packed; struct ec_response_get_next_event { diff --git a/include/linux/mfd/da9062/core.h b/include/linux/mfd/da9062/core.h index 376ba84366a09628b93dc6dc173d3e606326ecef..74d33a01ddae1241eb3595f2fbf240ac623f846b 100644 --- a/include/linux/mfd/da9062/core.h +++ b/include/linux/mfd/da9062/core.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2015 Dialog Semiconductor Ltd. + * Copyright (C) 2015-2017 Dialog Semiconductor * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -18,7 +18,31 @@ #include #include -/* Interrupts */ +enum da9062_compatible_types { + COMPAT_TYPE_DA9061 = 1, + COMPAT_TYPE_DA9062, +}; + +enum da9061_irqs { + /* IRQ A */ + DA9061_IRQ_ONKEY, + DA9061_IRQ_WDG_WARN, + DA9061_IRQ_SEQ_RDY, + /* IRQ B*/ + DA9061_IRQ_TEMP, + DA9061_IRQ_LDO_LIM, + DA9061_IRQ_DVC_RDY, + DA9061_IRQ_VDD_WARN, + /* IRQ C */ + DA9061_IRQ_GPI0, + DA9061_IRQ_GPI1, + DA9061_IRQ_GPI2, + DA9061_IRQ_GPI3, + DA9061_IRQ_GPI4, + + DA9061_NUM_IRQ, +}; + enum da9062_irqs { /* IRQ A */ DA9062_IRQ_ONKEY, @@ -45,6 +69,7 @@ struct da9062 { struct device *dev; struct regmap *regmap; struct regmap_irq_chip_data *regmap_irq; + enum da9062_compatible_types chip_type; }; #endif /* __MFD_DA9062_CORE_H__ */ diff --git a/include/linux/mfd/da9062/registers.h b/include/linux/mfd/da9062/registers.h index 97790d1b02c5e23984095354a57e85fd42d9a53c..18d576aed90282dabe7ac6caf29ac843d8df09a2 100644 --- a/include/linux/mfd/da9062/registers.h +++ b/include/linux/mfd/da9062/registers.h @@ -1,6 +1,5 @@ /* - * registers.h - REGISTERS H for DA9062 - * Copyright (C) 2015 Dialog Semiconductor Ltd. 
+ * Copyright (C) 2015-2017 Dialog Semiconductor * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -18,6 +17,8 @@ #define DA9062_PMIC_DEVICE_ID 0x62 #define DA9062_PMIC_VARIANT_MRC_AA 0x01 +#define DA9062_PMIC_VARIANT_VRC_DA9061 0x01 +#define DA9062_PMIC_VARIANT_VRC_DA9062 0x02 #define DA9062_I2C_PAGE_SEL_SHIFT 1 diff --git a/include/linux/mfd/intel_bxtwc.h b/include/linux/mfd/intel_bxtwc.h deleted file mode 100644 index 1a0ee9d6efe9df4cfe88af60bdd8bb3608401bdf..0000000000000000000000000000000000000000 --- a/include/linux/mfd/intel_bxtwc.h +++ /dev/null @@ -1,69 +0,0 @@ -/* - * intel_bxtwc.h - Header file for Intel Broxton Whiskey Cove PMIC - * - * Copyright (C) 2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - */ - -#include - -#ifndef __INTEL_BXTWC_H__ -#define __INTEL_BXTWC_H__ - -/* BXT WC devices */ -#define BXTWC_DEVICE1_ADDR 0x4E -#define BXTWC_DEVICE2_ADDR 0x4F -#define BXTWC_DEVICE3_ADDR 0x5E - -/* device1 Registers */ -#define BXTWC_CHIPID 0x4E00 -#define BXTWC_CHIPVER 0x4E01 - -#define BXTWC_SCHGRIRQ0_ADDR 0x5E1A -#define BXTWC_CHGRCTRL0_ADDR 0x5E16 -#define BXTWC_CHGRCTRL1_ADDR 0x5E17 -#define BXTWC_CHGRCTRL2_ADDR 0x5E18 -#define BXTWC_CHGRSTATUS_ADDR 0x5E19 -#define BXTWC_THRMBATZONE_ADDR 0x4F22 - -#define BXTWC_USBPATH_ADDR 0x5E19 -#define BXTWC_USBPHYCTRL_ADDR 0x5E07 -#define BXTWC_USBIDCTRL_ADDR 0x5E05 -#define BXTWC_USBIDEN_MASK 0x01 -#define BXTWC_USBIDSTAT_ADDR 0x00FF -#define BXTWC_USBSRCDETSTATUS_ADDR 0x5E29 - -#define BXTWC_DBGUSBBC1_ADDR 0x5FE0 -#define BXTWC_DBGUSBBC2_ADDR 0x5FE1 -#define BXTWC_DBGUSBBCSTAT_ADDR 0x5FE2 - -#define BXTWC_WAKESRC_ADDR 0x4E22 -#define BXTWC_WAKESRC2_ADDR 0x4EE5 -#define BXTWC_CHRTTADDR_ADDR 0x5E22 -#define BXTWC_CHRTTDATA_ADDR 0x5E23 - -#define BXTWC_STHRMIRQ0_ADDR 0x4F19 -#define WC_MTHRMIRQ1_ADDR 0x4E12 -#define WC_STHRMIRQ1_ADDR 0x4F1A -#define WC_STHRMIRQ2_ADDR 0x4F1B - -#define BXTWC_THRMZN0H_ADDR 0x4F44 -#define BXTWC_THRMZN0L_ADDR 0x4F45 -#define BXTWC_THRMZN1H_ADDR 0x4F46 -#define BXTWC_THRMZN1L_ADDR 0x4F47 -#define BXTWC_THRMZN2H_ADDR 0x4F48 -#define BXTWC_THRMZN2L_ADDR 0x4F49 -#define BXTWC_THRMZN3H_ADDR 0x4F4A -#define BXTWC_THRMZN3L_ADDR 0x4F4B -#define BXTWC_THRMZN4H_ADDR 0x4F4C -#define BXTWC_THRMZN4L_ADDR 0x4F4D - -#endif diff --git a/include/linux/mfd/intel_soc_pmic_bxtwc.h b/include/linux/mfd/intel_soc_pmic_bxtwc.h new file mode 100644 index 0000000000000000000000000000000000000000..0c351bc85d2d86de2c980a550c3cb76ab28ad814 --- /dev/null +++ b/include/linux/mfd/intel_soc_pmic_bxtwc.h @@ -0,0 +1,67 @@ +/* + * Header file for Intel Broxton Whiskey Cove PMIC + * + * Copyright (C) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef __INTEL_BXTWC_H__ +#define __INTEL_BXTWC_H__ + +/* BXT WC devices */ +#define BXTWC_DEVICE1_ADDR 0x4E +#define BXTWC_DEVICE2_ADDR 0x4F +#define BXTWC_DEVICE3_ADDR 0x5E + +/* device1 Registers */ +#define BXTWC_CHIPID 0x4E00 +#define BXTWC_CHIPVER 0x4E01 + +#define BXTWC_SCHGRIRQ0_ADDR 0x5E1A +#define BXTWC_CHGRCTRL0_ADDR 0x5E16 +#define BXTWC_CHGRCTRL1_ADDR 0x5E17 +#define BXTWC_CHGRCTRL2_ADDR 0x5E18 +#define BXTWC_CHGRSTATUS_ADDR 0x5E19 +#define BXTWC_THRMBATZONE_ADDR 0x4F22 + +#define BXTWC_USBPATH_ADDR 0x5E19 +#define BXTWC_USBPHYCTRL_ADDR 0x5E07 +#define BXTWC_USBIDCTRL_ADDR 0x5E05 +#define BXTWC_USBIDEN_MASK 0x01 +#define BXTWC_USBIDSTAT_ADDR 0x00FF +#define BXTWC_USBSRCDETSTATUS_ADDR 0x5E29 + +#define BXTWC_DBGUSBBC1_ADDR 0x5FE0 +#define BXTWC_DBGUSBBC2_ADDR 0x5FE1 +#define BXTWC_DBGUSBBCSTAT_ADDR 0x5FE2 + +#define BXTWC_WAKESRC_ADDR 0x4E22 +#define BXTWC_WAKESRC2_ADDR 0x4EE5 +#define BXTWC_CHRTTADDR_ADDR 0x5E22 +#define BXTWC_CHRTTDATA_ADDR 0x5E23 + +#define BXTWC_STHRMIRQ0_ADDR 0x4F19 +#define WC_MTHRMIRQ1_ADDR 0x4E12 +#define WC_STHRMIRQ1_ADDR 0x4F1A +#define WC_STHRMIRQ2_ADDR 0x4F1B + +#define BXTWC_THRMZN0H_ADDR 0x4F44 +#define BXTWC_THRMZN0L_ADDR 0x4F45 +#define BXTWC_THRMZN1H_ADDR 0x4F46 +#define BXTWC_THRMZN1L_ADDR 0x4F47 +#define BXTWC_THRMZN2H_ADDR 0x4F48 +#define BXTWC_THRMZN2L_ADDR 0x4F49 +#define BXTWC_THRMZN3H_ADDR 0x4F4A +#define BXTWC_THRMZN3L_ADDR 0x4F4B +#define BXTWC_THRMZN4H_ADDR 0x4F4C +#define BXTWC_THRMZN4L_ADDR 0x4F4D + +#endif diff --git a/include/linux/mfd/motorola-cpcap.h b/include/linux/mfd/motorola-cpcap.h index b4031c2b2214fc2f5417f14756e14f6cd5ff63d2..aefc49cb7ba9955ebb8b7115103e43a2de89fff5 100644 --- a/include/linux/mfd/motorola-cpcap.h +++ b/include/linux/mfd/motorola-cpcap.h @@ -14,6 +14,9 @@ * published by the Free Software Foundation. */ +#include +#include + #define CPCAP_VENDOR_ST 0 #define CPCAP_VENDOR_TI 1 @@ -290,3 +293,5 @@ static inline int cpcap_get_vendor(struct device *dev, return 0; } + +extern int cpcap_sense_virq(struct regmap *regmap, int virq); diff --git a/include/linux/mfd/mxs-lradc.h b/include/linux/mfd/mxs-lradc.h new file mode 100644 index 0000000000000000000000000000000000000000..661a4521f72396fa58a3b2321d41c181b00b99e3 --- /dev/null +++ b/include/linux/mfd/mxs-lradc.h @@ -0,0 +1,187 @@ +/* + * Freescale MXS Low Resolution Analog-to-Digital Converter driver + * + * Copyright (c) 2012 DENX Software Engineering, GmbH. + * Copyright (c) 2016 Ksenija Stanojevic + * + * Author: Marek Vasut + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ */ + +#ifndef __MFD_MXS_LRADC_H +#define __MFD_MXS_LRADC_H + +#include +#include +#include + +#define LRADC_MAX_DELAY_CHANS 4 +#define LRADC_MAX_MAPPED_CHANS 8 +#define LRADC_MAX_TOTAL_CHANS 16 + +#define LRADC_DELAY_TIMER_HZ 2000 + +#define LRADC_CTRL0 0x00 +# define LRADC_CTRL0_MX28_TOUCH_DETECT_ENABLE BIT(23) +# define LRADC_CTRL0_MX28_TOUCH_SCREEN_TYPE BIT(22) +# define LRADC_CTRL0_MX28_YNNSW /* YM */ BIT(21) +# define LRADC_CTRL0_MX28_YPNSW /* YP */ BIT(20) +# define LRADC_CTRL0_MX28_YPPSW /* YP */ BIT(19) +# define LRADC_CTRL0_MX28_XNNSW /* XM */ BIT(18) +# define LRADC_CTRL0_MX28_XNPSW /* XM */ BIT(17) +# define LRADC_CTRL0_MX28_XPPSW /* XP */ BIT(16) + +# define LRADC_CTRL0_MX23_TOUCH_DETECT_ENABLE BIT(20) +# define LRADC_CTRL0_MX23_YM BIT(19) +# define LRADC_CTRL0_MX23_XM BIT(18) +# define LRADC_CTRL0_MX23_YP BIT(17) +# define LRADC_CTRL0_MX23_XP BIT(16) + +# define LRADC_CTRL0_MX28_PLATE_MASK \ + (LRADC_CTRL0_MX28_TOUCH_DETECT_ENABLE | \ + LRADC_CTRL0_MX28_YNNSW | LRADC_CTRL0_MX28_YPNSW | \ + LRADC_CTRL0_MX28_YPPSW | LRADC_CTRL0_MX28_XNNSW | \ + LRADC_CTRL0_MX28_XNPSW | LRADC_CTRL0_MX28_XPPSW) + +# define LRADC_CTRL0_MX23_PLATE_MASK \ + (LRADC_CTRL0_MX23_TOUCH_DETECT_ENABLE | \ + LRADC_CTRL0_MX23_YM | LRADC_CTRL0_MX23_XM | \ + LRADC_CTRL0_MX23_YP | LRADC_CTRL0_MX23_XP) + +#define LRADC_CTRL1 0x10 +#define LRADC_CTRL1_TOUCH_DETECT_IRQ_EN BIT(24) +#define LRADC_CTRL1_LRADC_IRQ_EN(n) (1 << ((n) + 16)) +#define LRADC_CTRL1_MX28_LRADC_IRQ_EN_MASK (0x1fff << 16) +#define LRADC_CTRL1_MX23_LRADC_IRQ_EN_MASK (0x01ff << 16) +#define LRADC_CTRL1_LRADC_IRQ_EN_OFFSET 16 +#define LRADC_CTRL1_TOUCH_DETECT_IRQ BIT(8) +#define LRADC_CTRL1_LRADC_IRQ(n) BIT(n) +#define LRADC_CTRL1_MX28_LRADC_IRQ_MASK 0x1fff +#define LRADC_CTRL1_MX23_LRADC_IRQ_MASK 0x01ff +#define LRADC_CTRL1_LRADC_IRQ_OFFSET 0 + +#define LRADC_CTRL2 0x20 +#define LRADC_CTRL2_DIVIDE_BY_TWO_OFFSET 24 +#define LRADC_CTRL2_TEMPSENSE_PWD BIT(15) + +#define LRADC_STATUS 0x40 +#define LRADC_STATUS_TOUCH_DETECT_RAW BIT(0) + +#define LRADC_CH(n) (0x50 + (0x10 * (n))) +#define LRADC_CH_ACCUMULATE BIT(29) +#define LRADC_CH_NUM_SAMPLES_MASK (0x1f << 24) +#define LRADC_CH_NUM_SAMPLES_OFFSET 24 +#define LRADC_CH_NUM_SAMPLES(x) \ + ((x) << LRADC_CH_NUM_SAMPLES_OFFSET) +#define LRADC_CH_VALUE_MASK 0x3ffff +#define LRADC_CH_VALUE_OFFSET 0 + +#define LRADC_DELAY(n) (0xd0 + (0x10 * (n))) +#define LRADC_DELAY_TRIGGER_LRADCS_MASK (0xffUL << 24) +#define LRADC_DELAY_TRIGGER_LRADCS_OFFSET 24 +#define LRADC_DELAY_TRIGGER(x) \ + (((x) << LRADC_DELAY_TRIGGER_LRADCS_OFFSET) & \ + LRADC_DELAY_TRIGGER_LRADCS_MASK) +#define LRADC_DELAY_KICK BIT(20) +#define LRADC_DELAY_TRIGGER_DELAYS_MASK (0xf << 16) +#define LRADC_DELAY_TRIGGER_DELAYS_OFFSET 16 +#define LRADC_DELAY_TRIGGER_DELAYS(x) \ + (((x) << LRADC_DELAY_TRIGGER_DELAYS_OFFSET) & \ + LRADC_DELAY_TRIGGER_DELAYS_MASK) +#define LRADC_DELAY_LOOP_COUNT_MASK (0x1f << 11) +#define LRADC_DELAY_LOOP_COUNT_OFFSET 11 +#define LRADC_DELAY_LOOP(x) \ + (((x) << LRADC_DELAY_LOOP_COUNT_OFFSET) & \ + LRADC_DELAY_LOOP_COUNT_MASK) +#define LRADC_DELAY_DELAY_MASK 0x7ff +#define LRADC_DELAY_DELAY_OFFSET 0 +#define LRADC_DELAY_DELAY(x) \ + (((x) << LRADC_DELAY_DELAY_OFFSET) & \ + LRADC_DELAY_DELAY_MASK) + +#define LRADC_CTRL4 0x140 +#define LRADC_CTRL4_LRADCSELECT_MASK(n) (0xf << ((n) * 4)) +#define LRADC_CTRL4_LRADCSELECT_OFFSET(n) ((n) * 4) +#define LRADC_CTRL4_LRADCSELECT(n, x) \ + (((x) << LRADC_CTRL4_LRADCSELECT_OFFSET(n)) & \ + LRADC_CTRL4_LRADCSELECT_MASK(n)) + +#define LRADC_RESOLUTION 12 +#define 
LRADC_SINGLE_SAMPLE_MASK ((1 << LRADC_RESOLUTION) - 1) + +#define BUFFER_VCHANS_LIMITED 0x3f +#define BUFFER_VCHANS_ALL 0xff + + /* + * Certain LRADC channels are shared between touchscreen + * and/or touch-buttons and generic LRADC block. Therefore when using + * either of these, these channels are not available for regular + * sampling. The shared channels are as follows: + * + * CH0 -- Touch button #0 + * CH1 -- Touch button #1 + * CH2 -- Touch screen XPUL + * CH3 -- Touch screen YPLL + * CH4 -- Touch screen XNUL + * CH5 -- Touch screen YNLR + * CH6 -- Touch screen WIPER (5-wire only) + * + * The bit fields below represent which parts of the LRADC block are + * switched into a special mode of operation. These channels cannot + * be sampled as regular LRADC channels. The driver will refuse any + * attempt to sample these channels. + */ +#define CHAN_MASK_TOUCHBUTTON (BIT(1) | BIT(0)) +#define CHAN_MASK_TOUCHSCREEN_4WIRE (0xf << 2) +#define CHAN_MASK_TOUCHSCREEN_5WIRE (0x1f << 2) + +enum mxs_lradc_id { + IMX23_LRADC, + IMX28_LRADC, +}; + +enum mxs_lradc_ts_wires { + MXS_LRADC_TOUCHSCREEN_NONE = 0, + MXS_LRADC_TOUCHSCREEN_4WIRE, + MXS_LRADC_TOUCHSCREEN_5WIRE, +}; + +/** + * struct mxs_lradc + * @soc: soc type (IMX23 or IMX28) + * @clk: 2 kHz clock for delay units + * @buffer_vchans: channels that can be used during buffered capture + * @touchscreen_wire: touchscreen type (4-wire or 5-wire) + * @use_touchbutton: button state (on or off) + */ +struct mxs_lradc { + enum mxs_lradc_id soc; + struct clk *clk; + u8 buffer_vchans; + + enum mxs_lradc_ts_wires touchscreen_wire; + bool use_touchbutton; +}; + +static inline u32 mxs_lradc_irq_mask(struct mxs_lradc *lradc) +{ + switch (lradc->soc) { + case IMX23_LRADC: + return LRADC_CTRL1_MX23_LRADC_IRQ_MASK; + case IMX28_LRADC: + return LRADC_CTRL1_MX28_LRADC_IRQ_MASK; + default: + return 0; + } +} + +#endif /* __MXS_LRADC_H */ diff --git a/include/linux/mfd/stm32-timers.h b/include/linux/mfd/stm32-timers.h index d0300045f04ab84efabd0a78e1af19d22b0e773f..4a0abbc10ef6c2f4cd5c6ce0dbf7902024f55e05 100644 --- a/include/linux/mfd/stm32-timers.h +++ b/include/linux/mfd/stm32-timers.h @@ -21,6 +21,7 @@ #define TIM_CCMR1 0x18 /* Capt/Comp 1 Mode Reg */ #define TIM_CCMR2 0x1C /* Capt/Comp 2 Mode Reg */ #define TIM_CCER 0x20 /* Capt/Comp Enable Reg */ +#define TIM_CNT 0x24 /* Counter */ #define TIM_PSC 0x28 /* Prescaler */ #define TIM_ARR 0x2c /* Auto-Reload Register */ #define TIM_CCR1 0x34 /* Capt/Comp Register 1 */ @@ -30,6 +31,7 @@ #define TIM_BDTR 0x44 /* Break and Dead-Time Reg */ #define TIM_CR1_CEN BIT(0) /* Counter Enable */ +#define TIM_CR1_DIR BIT(4) /* Counter Direction */ #define TIM_CR1_ARPE BIT(7) /* Auto-reload Preload Ena */ #define TIM_CR2_MMS (BIT(4) | BIT(5) | BIT(6)) /* Master mode selection */ #define TIM_SMCR_SMS (BIT(0) | BIT(1) | BIT(2)) /* Slave mode selection */ diff --git a/include/linux/mfd/sun4i-gpadc.h b/include/linux/mfd/sun4i-gpadc.h index d7a29f246d647fb6c7be225bc47fb42f2e295c8d..139872c2e0fe0ffe9517d8f13db65efcd5bc42ff 100644 --- a/include/linux/mfd/sun4i-gpadc.h +++ b/include/linux/mfd/sun4i-gpadc.h @@ -28,6 +28,7 @@ #define SUN4I_GPADC_CTRL1_TP_MODE_EN BIT(4) #define SUN4I_GPADC_CTRL1_TP_ADC_SELECT BIT(3) #define SUN4I_GPADC_CTRL1_ADC_CHAN_SELECT(x) (GENMASK(2, 0) & (x)) +#define SUN4I_GPADC_CTRL1_ADC_CHAN_MASK GENMASK(2, 0) /* TP_CTRL1 bits for sun6i SOCs */ #define SUN6I_GPADC_CTRL1_TOUCH_PAN_CALI_EN BIT(7) @@ -35,6 +36,11 @@ #define SUN6I_GPADC_CTRL1_TP_MODE_EN BIT(5) #define SUN6I_GPADC_CTRL1_TP_ADC_SELECT BIT(4)
#define SUN6I_GPADC_CTRL1_ADC_CHAN_SELECT(x) (GENMASK(3, 0) & BIT(x)) +#define SUN6I_GPADC_CTRL1_ADC_CHAN_MASK GENMASK(3, 0) + +/* TP_CTRL1 bits for sun8i SoCs */ +#define SUN8I_GPADC_CTRL1_CHOP_TEMP_EN BIT(8) +#define SUN8I_GPADC_CTRL1_GPADC_CALI_EN BIT(7) #define SUN4I_GPADC_CTRL2 0x08 diff --git a/include/linux/mfd/syscon/atmel-smc.h b/include/linux/mfd/syscon/atmel-smc.h index be6ebe64eebe1964573a2b963004a8055dff7abf..afa2661698004fed5792374ac68d8d8dd9bb0c1f 100644 --- a/include/linux/mfd/syscon/atmel-smc.h +++ b/include/linux/mfd/syscon/atmel-smc.h @@ -17,157 +17,92 @@ #include #include -#define AT91SAM9_SMC_GENERIC 0x00 -#define AT91SAM9_SMC_GENERIC_BLK_SZ 0x10 - -#define SAMA5_SMC_GENERIC 0x600 -#define SAMA5_SMC_GENERIC_BLK_SZ 0x14 - -#define AT91SAM9_SMC_SETUP(o) ((o) + 0x00) -#define AT91SAM9_SMC_NWESETUP(x) (x) -#define AT91SAM9_SMC_NCS_WRSETUP(x) ((x) << 8) -#define AT91SAM9_SMC_NRDSETUP(x) ((x) << 16) -#define AT91SAM9_SMC_NCS_NRDSETUP(x) ((x) << 24) - -#define AT91SAM9_SMC_PULSE(o) ((o) + 0x04) -#define AT91SAM9_SMC_NWEPULSE(x) (x) -#define AT91SAM9_SMC_NCS_WRPULSE(x) ((x) << 8) -#define AT91SAM9_SMC_NRDPULSE(x) ((x) << 16) -#define AT91SAM9_SMC_NCS_NRDPULSE(x) ((x) << 24) - -#define AT91SAM9_SMC_CYCLE(o) ((o) + 0x08) -#define AT91SAM9_SMC_NWECYCLE(x) (x) -#define AT91SAM9_SMC_NRDCYCLE(x) ((x) << 16) - -#define AT91SAM9_SMC_MODE(o) ((o) + 0x0c) -#define SAMA5_SMC_MODE(o) ((o) + 0x10) -#define AT91_SMC_READMODE BIT(0) -#define AT91_SMC_READMODE_NCS (0 << 0) -#define AT91_SMC_READMODE_NRD (1 << 0) -#define AT91_SMC_WRITEMODE BIT(1) -#define AT91_SMC_WRITEMODE_NCS (0 << 1) -#define AT91_SMC_WRITEMODE_NWE (1 << 1) -#define AT91_SMC_EXNWMODE GENMASK(5, 4) -#define AT91_SMC_EXNWMODE_DISABLE (0 << 4) -#define AT91_SMC_EXNWMODE_FROZEN (2 << 4) -#define AT91_SMC_EXNWMODE_READY (3 << 4) -#define AT91_SMC_BAT BIT(8) -#define AT91_SMC_BAT_SELECT (0 << 8) -#define AT91_SMC_BAT_WRITE (1 << 8) -#define AT91_SMC_DBW GENMASK(13, 12) -#define AT91_SMC_DBW_8 (0 << 12) -#define AT91_SMC_DBW_16 (1 << 12) -#define AT91_SMC_DBW_32 (2 << 12) -#define AT91_SMC_TDF GENMASK(19, 16) -#define AT91_SMC_TDF_(x) ((((x) - 1) << 16) & AT91_SMC_TDF) -#define AT91_SMC_TDF_MAX 16 -#define AT91_SMC_TDFMODE_OPTIMIZED BIT(20) -#define AT91_SMC_PMEN BIT(24) -#define AT91_SMC_PS GENMASK(29, 28) -#define AT91_SMC_PS_4 (0 << 28) -#define AT91_SMC_PS_8 (1 << 28) -#define AT91_SMC_PS_16 (2 << 28) -#define AT91_SMC_PS_32 (3 << 28) - - -/* - * This function converts a setup timing expressed in nanoseconds into an - * encoded value that can be written in the SMC_SETUP register. - * - * The following formula is described in atmel datasheets (section - * "SMC Setup Register"): - * - * setup length = (128* SETUP[5] + SETUP[4:0]) - * - * where setup length is the timing expressed in cycles. - */ -static inline u32 at91sam9_smc_setup_ns_to_cycles(unsigned int clk_rate, - u32 timing_ns) -{ - u32 clk_period = DIV_ROUND_UP(NSEC_PER_SEC, clk_rate); - u32 coded_cycles = 0; - u32 cycles; - - cycles = DIV_ROUND_UP(timing_ns, clk_period); - if (cycles / 32) { - coded_cycles |= 1 << 5; - if (cycles < 128) - cycles = 0; - } - - coded_cycles |= cycles % 32; - - return coded_cycles; -} - -/* - * This function converts a pulse timing expressed in nanoseconds into an - * encoded value that can be written in the SMC_PULSE register. - * - * The following formula is described in atmel datasheets (section - * "SMC Pulse Register"): - * - * pulse length = (256* PULSE[6] + PULSE[5:0]) - * - * where pulse length is the timing expressed in cycles. 
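The open-coded ns-to-cycles converters removed above are superseded by the struct-based chip-select API added just below (struct atmel_smc_cs_conf plus its helper functions). A minimal sketch of how a caller might use it follows; the regmap handle, the chip-select number and every timing value are placeholder assumptions rather than values taken from this patch:

#include <linux/mfd/syscon/atmel-smc.h>

static int smc_cs_conf_sketch(struct regmap *regmap)
{
	struct atmel_smc_cs_conf conf;
	int ret;

	/* Start from an all-zero configuration. */
	atmel_smc_cs_conf_init(&conf);

	/* Timings are in clock cycles; the shift macro selects the field. */
	ret = atmel_smc_cs_conf_set_setup(&conf, ATMEL_SMC_NWE_SHIFT, 4);
	if (ret)
		return ret;

	ret = atmel_smc_cs_conf_set_pulse(&conf, ATMEL_SMC_NWE_SHIFT, 10);
	if (ret)
		return ret;

	ret = atmel_smc_cs_conf_set_cycle(&conf, ATMEL_SMC_NWE_SHIFT, 16);
	if (ret)
		return ret;

	/* Mode bits are plain masks and are OR-ed in directly. */
	conf.mode |= ATMEL_SMC_MODE_READMODE_NRD | ATMEL_SMC_MODE_WRITEMODE_NWE;

	/* Commit the whole configuration for chip select 3. */
	atmel_smc_cs_conf_apply(regmap, 3, &conf);

	return 0;
}

On HSMC-based SoCs the same conf would be pushed through atmel_hsmc_cs_conf_apply() instead, which also consumes the extra timings word.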
+#define ATMEL_SMC_SETUP(cs) (((cs) * 0x10)) +#define ATMEL_HSMC_SETUP(cs) (0x600 + ((cs) * 0x14)) +#define ATMEL_SMC_PULSE(cs) (((cs) * 0x10) + 0x4) +#define ATMEL_HSMC_PULSE(cs) (0x600 + ((cs) * 0x14) + 0x4) +#define ATMEL_SMC_CYCLE(cs) (((cs) * 0x10) + 0x8) +#define ATMEL_HSMC_CYCLE(cs) (0x600 + ((cs) * 0x14) + 0x8) +#define ATMEL_SMC_NWE_SHIFT 0 +#define ATMEL_SMC_NCS_WR_SHIFT 8 +#define ATMEL_SMC_NRD_SHIFT 16 +#define ATMEL_SMC_NCS_RD_SHIFT 24 + +#define ATMEL_SMC_MODE(cs) (((cs) * 0x10) + 0xc) +#define ATMEL_HSMC_MODE(cs) (0x600 + ((cs) * 0x14) + 0x10) +#define ATMEL_SMC_MODE_READMODE_MASK BIT(0) +#define ATMEL_SMC_MODE_READMODE_NCS (0 << 0) +#define ATMEL_SMC_MODE_READMODE_NRD (1 << 0) +#define ATMEL_SMC_MODE_WRITEMODE_MASK BIT(1) +#define ATMEL_SMC_MODE_WRITEMODE_NCS (0 << 1) +#define ATMEL_SMC_MODE_WRITEMODE_NWE (1 << 1) +#define ATMEL_SMC_MODE_EXNWMODE_MASK GENMASK(5, 4) +#define ATMEL_SMC_MODE_EXNWMODE_DISABLE (0 << 4) +#define ATMEL_SMC_MODE_EXNWMODE_FROZEN (2 << 4) +#define ATMEL_SMC_MODE_EXNWMODE_READY (3 << 4) +#define ATMEL_SMC_MODE_BAT_MASK BIT(8) +#define ATMEL_SMC_MODE_BAT_SELECT (0 << 8) +#define ATMEL_SMC_MODE_BAT_WRITE (1 << 8) +#define ATMEL_SMC_MODE_DBW_MASK GENMASK(13, 12) +#define ATMEL_SMC_MODE_DBW_8 (0 << 12) +#define ATMEL_SMC_MODE_DBW_16 (1 << 12) +#define ATMEL_SMC_MODE_DBW_32 (2 << 12) +#define ATMEL_SMC_MODE_TDF_MASK GENMASK(19, 16) +#define ATMEL_SMC_MODE_TDF(x) (((x) - 1) << 16) +#define ATMEL_SMC_MODE_TDF_MAX 16 +#define ATMEL_SMC_MODE_TDF_MIN 1 +#define ATMEL_SMC_MODE_TDFMODE_OPTIMIZED BIT(20) +#define ATMEL_SMC_MODE_PMEN BIT(24) +#define ATMEL_SMC_MODE_PS_MASK GENMASK(29, 28) +#define ATMEL_SMC_MODE_PS_4 (0 << 28) +#define ATMEL_SMC_MODE_PS_8 (1 << 28) +#define ATMEL_SMC_MODE_PS_16 (2 << 28) +#define ATMEL_SMC_MODE_PS_32 (3 << 28) + +#define ATMEL_HSMC_TIMINGS(cs) (0x600 + ((cs) * 0x14) + 0xc) +#define ATMEL_HSMC_TIMINGS_OCMS BIT(12) +#define ATMEL_HSMC_TIMINGS_RBNSEL(x) ((x) << 28) +#define ATMEL_HSMC_TIMINGS_NFSEL BIT(31) +#define ATMEL_HSMC_TIMINGS_TCLR_SHIFT 0 +#define ATMEL_HSMC_TIMINGS_TADL_SHIFT 4 +#define ATMEL_HSMC_TIMINGS_TAR_SHIFT 8 +#define ATMEL_HSMC_TIMINGS_TRR_SHIFT 16 +#define ATMEL_HSMC_TIMINGS_TWB_SHIFT 24 + +/** + * struct atmel_smc_cs_conf - SMC CS config as described in the datasheet. + * @setup: NCS/NWE/NRD setup timings (not applicable to at91rm9200) + * @pulse: NCS/NWE/NRD pulse timings (not applicable to at91rm9200) + * @cycle: NWE/NRD cycle timings (not applicable to at91rm9200) + * @timings: advanced NAND-related timings (only applicable to HSMC) + * @mode: all kinds of config parameters (see the field definitions above). + * The mode fields are different on at91rm9200
- */ -static inline u32 at91sam9_smc_cycle_ns_to_cycles(unsigned int clk_rate, - u32 timing_ns) -{ - u32 clk_period = DIV_ROUND_UP(NSEC_PER_SEC, clk_rate); - u32 coded_cycles = 0; - u32 cycles; - - cycles = DIV_ROUND_UP(timing_ns, clk_period); - if (cycles / 128) { - coded_cycles = cycles / 256; - cycles %= 256; - if (cycles >= 128) { - coded_cycles++; - cycles = 0; - } - - if (coded_cycles > 0x3) { - coded_cycles = 0x3; - cycles = 0x7f; - } - - coded_cycles <<= 7; - } - - coded_cycles |= cycles % 128; - - return coded_cycles; -} +struct atmel_smc_cs_conf { + u32 setup; + u32 pulse; + u32 cycle; + u32 timings; + u32 mode; +}; + +void atmel_smc_cs_conf_init(struct atmel_smc_cs_conf *conf); +int atmel_smc_cs_conf_set_timing(struct atmel_smc_cs_conf *conf, + unsigned int shift, + unsigned int ncycles); +int atmel_smc_cs_conf_set_setup(struct atmel_smc_cs_conf *conf, + unsigned int shift, unsigned int ncycles); +int atmel_smc_cs_conf_set_pulse(struct atmel_smc_cs_conf *conf, + unsigned int shift, unsigned int ncycles); +int atmel_smc_cs_conf_set_cycle(struct atmel_smc_cs_conf *conf, + unsigned int shift, unsigned int ncycles); +void atmel_smc_cs_conf_apply(struct regmap *regmap, int cs, + const struct atmel_smc_cs_conf *conf); +void atmel_hsmc_cs_conf_apply(struct regmap *regmap, int cs, + const struct atmel_smc_cs_conf *conf); +void atmel_smc_cs_conf_get(struct regmap *regmap, int cs, + struct atmel_smc_cs_conf *conf); +void atmel_hsmc_cs_conf_get(struct regmap *regmap, int cs, + struct atmel_smc_cs_conf *conf); #endif /* _LINUX_MFD_SYSCON_ATMEL_SMC_H_ */ diff --git a/include/linux/mfd/syscon/exynos5-pmu.h b/include/linux/mfd/syscon/exynos5-pmu.h index c28ff21ca4d207c484c196eda1986ce8b4458b9b..b4942a32b81df8e9b4b21b92f68e408a6f80d4c2 100644 --- a/include/linux/mfd/syscon/exynos5-pmu.h +++ b/include/linux/mfd/syscon/exynos5-pmu.h @@ -12,41 +12,8 @@ #ifndef _LINUX_MFD_SYSCON_PMU_EXYNOS5_H_ #define _LINUX_MFD_SYSCON_PMU_EXYNOS5_H_ -/* Exynos5 PMU register definitions */ -#define EXYNOS5_HDMI_PHY_CONTROL (0x700) -#define EXYNOS5_USBDRD_PHY_CONTROL (0x704) - -/* Exynos5250 specific register definitions */ -#define EXYNOS5_USBHOST_PHY_CONTROL (0x708) -#define EXYNOS5_EFNAND_PHY_CONTROL (0x70c) -#define EXYNOS5_MIPI_PHY0_CONTROL (0x710) -#define EXYNOS5_MIPI_PHY1_CONTROL (0x714) -#define EXYNOS5_ADC_PHY_CONTROL (0x718) -#define EXYNOS5_MTCADC_PHY_CONTROL (0x71c) -#define EXYNOS5_DPTX_PHY_CONTROL (0x720) -#define EXYNOS5_SATA_PHY_CONTROL (0x724) - -/* Exynos5420 specific register definitions */ -#define EXYNOS5420_USBDRD1_PHY_CONTROL (0x708) -#define EXYNOS5420_USBHOST_PHY_CONTROL (0x70c) -#define EXYNOS5420_MIPI_PHY0_CONTROL (0x714) -#define EXYNOS5420_MIPI_PHY1_CONTROL (0x718) -#define EXYNOS5420_MIPI_PHY2_CONTROL (0x71c) -#define EXYNOS5420_ADC_PHY_CONTROL (0x720) -#define EXYNOS5420_MTCADC_PHY_CONTROL (0x724) -#define EXYNOS5420_DPTX_PHY_CONTROL (0x728) - -/* Exynos5433 specific register definitions */ -#define EXYNOS5433_USBHOST30_PHY_CONTROL (0x728) -#define EXYNOS5433_MIPI_PHY0_CONTROL (0x710) -#define EXYNOS5433_MIPI_PHY1_CONTROL (0x714) -#define EXYNOS5433_MIPI_PHY2_CONTROL (0x718) - #define EXYNOS5_PHY_ENABLE BIT(0) #define EXYNOS5_MIPI_PHY_S_RESETN BIT(1) #define EXYNOS5_MIPI_PHY_M_RESETN BIT(2) -#define EXYNOS5433_PAD_RETENTION_AUD_OPTION (0x3028) -#define EXYNOS5433_PAD_INITIATE_WAKEUP_FROM_LOWPWR BIT(28) - #endif /* _LINUX_MFD_SYSCON_PMU_EXYNOS5_H_ */ diff --git a/include/linux/mfd/syscon/imx7-iomuxc-gpr.h b/include/linux/mfd/syscon/imx7-iomuxc-gpr.h index 
4585d6105d68af19f72f1fd90663b670d45a8f8e..abbd52466573f9994ea25eab5e6d2b12c00f564e 100644 --- a/include/linux/mfd/syscon/imx7-iomuxc-gpr.h +++ b/include/linux/mfd/syscon/imx7-iomuxc-gpr.h @@ -44,4 +44,8 @@ #define IMX7D_GPR5_CSI_MUX_CONTROL_MIPI (0x1 << 4) +#define IMX7D_GPR12_PCIE_PHY_REFCLK_SEL BIT(5) + +#define IMX7D_GPR22_PCIE_PHY_PLL_LOCKED BIT(31) + #endif /* __LINUX_IMX7_IOMUXC_GPR_H */ diff --git a/include/linux/mfd/ti-lmu-register.h b/include/linux/mfd/ti-lmu-register.h new file mode 100644 index 0000000000000000000000000000000000000000..2125c7c02818b3c5c30eff67111e6e512ccfb065 --- /dev/null +++ b/include/linux/mfd/ti-lmu-register.h @@ -0,0 +1,280 @@ +/* + * TI LMU (Lighting Management Unit) Device Register Map + * + * Copyright 2017 Texas Instruments + * + * Author: Milo Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MFD_TI_LMU_REGISTER_H__ +#define __MFD_TI_LMU_REGISTER_H__ + +#include + +/* LM3532 */ +#define LM3532_REG_OUTPUT_CFG 0x10 +#define LM3532_ILED1_CFG_MASK 0x03 +#define LM3532_ILED2_CFG_MASK 0x0C +#define LM3532_ILED3_CFG_MASK 0x30 +#define LM3532_ILED1_CFG_SHIFT 0 +#define LM3532_ILED2_CFG_SHIFT 2 +#define LM3532_ILED3_CFG_SHIFT 4 + +#define LM3532_REG_RAMPUP 0x12 +#define LM3532_REG_RAMPDN LM3532_REG_RAMPUP +#define LM3532_RAMPUP_MASK 0x07 +#define LM3532_RAMPUP_SHIFT 0 +#define LM3532_RAMPDN_MASK 0x38 +#define LM3532_RAMPDN_SHIFT 3 + +#define LM3532_REG_ENABLE 0x1D + +#define LM3532_REG_PWM_A_CFG 0x13 +#define LM3532_PWM_A_MASK 0x05 /* zone 0 */ +#define LM3532_PWM_ZONE_0 BIT(2) + +#define LM3532_REG_PWM_B_CFG 0x14 +#define LM3532_PWM_B_MASK 0x09 /* zone 1 */ +#define LM3532_PWM_ZONE_1 BIT(3) + +#define LM3532_REG_PWM_C_CFG 0x15 +#define LM3532_PWM_C_MASK 0x11 /* zone 2 */ +#define LM3532_PWM_ZONE_2 BIT(4) + +#define LM3532_REG_ZONE_CFG_A 0x16 +#define LM3532_REG_ZONE_CFG_B 0x18 +#define LM3532_REG_ZONE_CFG_C 0x1A +#define LM3532_ZONE_MASK (BIT(2) | BIT(3) | BIT(4)) +#define LM3532_ZONE_0 0 +#define LM3532_ZONE_1 BIT(2) +#define LM3532_ZONE_2 BIT(3) + +#define LM3532_REG_BRT_A 0x70 /* zone 0 */ +#define LM3532_REG_BRT_B 0x76 /* zone 1 */ +#define LM3532_REG_BRT_C 0x7C /* zone 2 */ + +#define LM3532_MAX_REG 0x7E + +/* LM3631 */ +#define LM3631_REG_DEVCTRL 0x00 +#define LM3631_LCD_EN_MASK BIT(1) +#define LM3631_BL_EN_MASK BIT(0) + +#define LM3631_REG_BRT_LSB 0x01 +#define LM3631_REG_BRT_MSB 0x02 + +#define LM3631_REG_BL_CFG 0x06 +#define LM3631_BL_CHANNEL_MASK BIT(3) +#define LM3631_BL_DUAL_CHANNEL 0 +#define LM3631_BL_SINGLE_CHANNEL BIT(3) +#define LM3631_MAP_MASK BIT(5) +#define LM3631_EXPONENTIAL_MAP 0 + +#define LM3631_REG_BRT_MODE 0x08 +#define LM3631_MODE_MASK (BIT(1) | BIT(2) | BIT(3)) +#define LM3631_DEFAULT_MODE (BIT(1) | BIT(3)) + +#define LM3631_REG_SLOPE 0x09 +#define LM3631_SLOPE_MASK 0xF0 +#define LM3631_SLOPE_SHIFT 4 + +#define LM3631_REG_LDO_CTRL1 0x0A +#define LM3631_EN_OREF_MASK BIT(0) +#define LM3631_EN_VNEG_MASK BIT(1) +#define LM3631_EN_VPOS_MASK BIT(2) + +#define LM3631_REG_LDO_CTRL2 0x0B +#define LM3631_EN_CONT_MASK BIT(0) + +#define LM3631_REG_VOUT_CONT 0x0C +#define LM3631_VOUT_CONT_MASK (BIT(6) | BIT(7)) + +#define LM3631_REG_VOUT_BOOST 0x0C +#define LM3631_REG_VOUT_POS 0x0D +#define LM3631_REG_VOUT_NEG 0x0E +#define LM3631_REG_VOUT_OREF 0x0F +#define LM3631_VOUT_MASK 0x3F + +#define LM3631_REG_ENTIME_VCONT 0x0B +#define LM3631_ENTIME_CONT_MASK 0x70 + +#define 
LM3631_REG_ENTIME_VOREF 0x0F +#define LM3631_REG_ENTIME_VPOS 0x10 +#define LM3631_REG_ENTIME_VNEG 0x11 +#define LM3631_ENTIME_MASK 0xF0 +#define LM3631_ENTIME_SHIFT 4 + +#define LM3631_MAX_REG 0x16 + +/* LM3632 */ +#define LM3632_REG_CONFIG1 0x02 +#define LM3632_OVP_MASK (BIT(5) | BIT(6) | BIT(7)) +#define LM3632_OVP_25V BIT(6) + +#define LM3632_REG_CONFIG2 0x03 +#define LM3632_SWFREQ_MASK BIT(7) +#define LM3632_SWFREQ_1MHZ BIT(7) + +#define LM3632_REG_BRT_LSB 0x04 +#define LM3632_REG_BRT_MSB 0x05 + +#define LM3632_REG_IO_CTRL 0x09 +#define LM3632_PWM_MASK BIT(6) +#define LM3632_I2C_MODE 0 +#define LM3632_PWM_MODE BIT(6) + +#define LM3632_REG_ENABLE 0x0A +#define LM3632_BL_EN_MASK BIT(0) +#define LM3632_BL_CHANNEL_MASK (BIT(3) | BIT(4)) +#define LM3632_BL_SINGLE_CHANNEL BIT(4) +#define LM3632_BL_DUAL_CHANNEL BIT(3) + +#define LM3632_REG_BIAS_CONFIG 0x0C +#define LM3632_EXT_EN_MASK BIT(0) +#define LM3632_EN_VNEG_MASK BIT(1) +#define LM3632_EN_VPOS_MASK BIT(2) + +#define LM3632_REG_VOUT_BOOST 0x0D +#define LM3632_REG_VOUT_POS 0x0E +#define LM3632_REG_VOUT_NEG 0x0F +#define LM3632_VOUT_MASK 0x3F + +#define LM3632_MAX_REG 0x10 + +/* LM3633 */ +#define LM3633_REG_HVLED_OUTPUT_CFG 0x10 +#define LM3633_HVLED1_CFG_MASK BIT(0) +#define LM3633_HVLED2_CFG_MASK BIT(1) +#define LM3633_HVLED3_CFG_MASK BIT(2) +#define LM3633_HVLED1_CFG_SHIFT 0 +#define LM3633_HVLED2_CFG_SHIFT 1 +#define LM3633_HVLED3_CFG_SHIFT 2 + +#define LM3633_REG_BANK_SEL 0x11 + +#define LM3633_REG_BL0_RAMP 0x12 +#define LM3633_REG_BL1_RAMP 0x13 +#define LM3633_BL_RAMPUP_MASK 0xF0 +#define LM3633_BL_RAMPUP_SHIFT 4 +#define LM3633_BL_RAMPDN_MASK 0x0F +#define LM3633_BL_RAMPDN_SHIFT 0 + +#define LM3633_REG_BL_RAMP_CONF 0x1B +#define LM3633_BL_RAMP_MASK 0x0F +#define LM3633_BL_RAMP_EACH 0x05 + +#define LM3633_REG_PTN0_RAMP 0x1C +#define LM3633_REG_PTN1_RAMP 0x1D +#define LM3633_PTN_RAMPUP_MASK 0x70 +#define LM3633_PTN_RAMPUP_SHIFT 4 +#define LM3633_PTN_RAMPDN_MASK 0x07 +#define LM3633_PTN_RAMPDN_SHIFT 0 + +#define LM3633_REG_LED_MAPPING_MODE 0x1F +#define LM3633_LED_EXPONENTIAL BIT(1) + +#define LM3633_REG_IMAX_HVLED_A 0x20 +#define LM3633_REG_IMAX_HVLED_B 0x21 +#define LM3633_REG_IMAX_LVLED_BASE 0x22 + +#define LM3633_REG_BL_FEEDBACK_ENABLE 0x28 + +#define LM3633_REG_ENABLE 0x2B +#define LM3633_LED_BANK_OFFSET 2 + +#define LM3633_REG_PATTERN 0x2C + +#define LM3633_REG_BOOST_CFG 0x2D +#define LM3633_OVP_MASK (BIT(1) | BIT(2)) +#define LM3633_OVP_40V 0x6 + +#define LM3633_REG_PWM_CFG 0x2F +#define LM3633_PWM_A_MASK BIT(0) +#define LM3633_PWM_B_MASK BIT(1) + +#define LM3633_REG_BRT_HVLED_A_LSB 0x40 +#define LM3633_REG_BRT_HVLED_A_MSB 0x41 +#define LM3633_REG_BRT_HVLED_B_LSB 0x42 +#define LM3633_REG_BRT_HVLED_B_MSB 0x43 + +#define LM3633_REG_BRT_LVLED_BASE 0x44 + +#define LM3633_REG_PTN_DELAY 0x50 + +#define LM3633_REG_PTN_LOWTIME 0x51 + +#define LM3633_REG_PTN_HIGHTIME 0x52 + +#define LM3633_REG_PTN_LOWBRT 0x53 + +#define LM3633_REG_PTN_HIGHBRT LM3633_REG_BRT_LVLED_BASE + +#define LM3633_REG_BL_OPEN_FAULT_STATUS 0xB0 + +#define LM3633_REG_BL_SHORT_FAULT_STATUS 0xB2 + +#define LM3633_REG_MONITOR_ENABLE 0xB4 + +#define LM3633_MAX_REG 0xB4 + +/* LM3695 */ +#define LM3695_REG_GP 0x10 +#define LM3695_BL_CHANNEL_MASK BIT(3) +#define LM3695_BL_DUAL_CHANNEL 0 +#define LM3695_BL_SINGLE_CHANNEL BIT(3) +#define LM3695_BRT_RW_MASK BIT(2) +#define LM3695_BL_EN_MASK BIT(0) + +#define LM3695_REG_BRT_LSB 0x13 +#define LM3695_REG_BRT_MSB 0x14 + +#define LM3695_MAX_REG 0x14 + +/* LM3697 */ +#define LM3697_REG_HVLED_OUTPUT_CFG 0x10 +#define 
LM3697_HVLED1_CFG_MASK BIT(0) +#define LM3697_HVLED2_CFG_MASK BIT(1) +#define LM3697_HVLED3_CFG_MASK BIT(2) +#define LM3697_HVLED1_CFG_SHIFT 0 +#define LM3697_HVLED2_CFG_SHIFT 1 +#define LM3697_HVLED3_CFG_SHIFT 2 + +#define LM3697_REG_BL0_RAMP 0x11 +#define LM3697_REG_BL1_RAMP 0x12 +#define LM3697_RAMPUP_MASK 0xF0 +#define LM3697_RAMPUP_SHIFT 4 +#define LM3697_RAMPDN_MASK 0x0F +#define LM3697_RAMPDN_SHIFT 0 + +#define LM3697_REG_RAMP_CONF 0x14 +#define LM3697_RAMP_MASK 0x0F +#define LM3697_RAMP_EACH 0x05 + +#define LM3697_REG_PWM_CFG 0x1C +#define LM3697_PWM_A_MASK BIT(0) +#define LM3697_PWM_B_MASK BIT(1) + +#define LM3697_REG_IMAX_A 0x17 +#define LM3697_REG_IMAX_B 0x18 + +#define LM3697_REG_FEEDBACK_ENABLE 0x19 + +#define LM3697_REG_BRT_A_LSB 0x20 +#define LM3697_REG_BRT_A_MSB 0x21 +#define LM3697_REG_BRT_B_LSB 0x22 +#define LM3697_REG_BRT_B_MSB 0x23 + +#define LM3697_REG_ENABLE 0x24 + +#define LM3697_REG_OPEN_FAULT_STATUS 0xB0 + +#define LM3697_REG_SHORT_FAULT_STATUS 0xB2 + +#define LM3697_REG_MONITOR_ENABLE 0xB4 + +#define LM3697_MAX_REG 0xB4 +#endif diff --git a/include/linux/mfd/ti-lmu.h b/include/linux/mfd/ti-lmu.h new file mode 100644 index 0000000000000000000000000000000000000000..09d5f30384e56666607be51b2e95b02ba54afd45 --- /dev/null +++ b/include/linux/mfd/ti-lmu.h @@ -0,0 +1,87 @@ +/* + * TI LMU (Lighting Management Unit) Devices + * + * Copyright 2017 Texas Instruments + * + * Author: Milo Kim + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __MFD_TI_LMU_H__ +#define __MFD_TI_LMU_H__ + +#include +#include +#include + +/* Notifier event */ +#define LMU_EVENT_MONITOR_DONE 0x01 + +enum ti_lmu_id { + LM3532, + LM3631, + LM3632, + LM3633, + LM3695, + LM3697, + LMU_MAX_ID, +}; + +enum ti_lmu_max_current { + LMU_IMAX_5mA, + LMU_IMAX_6mA, + LMU_IMAX_7mA = 0x03, + LMU_IMAX_8mA, + LMU_IMAX_9mA, + LMU_IMAX_10mA = 0x07, + LMU_IMAX_11mA, + LMU_IMAX_12mA, + LMU_IMAX_13mA, + LMU_IMAX_14mA, + LMU_IMAX_15mA = 0x0D, + LMU_IMAX_16mA, + LMU_IMAX_17mA, + LMU_IMAX_18mA, + LMU_IMAX_19mA, + LMU_IMAX_20mA = 0x13, + LMU_IMAX_21mA, + LMU_IMAX_22mA, + LMU_IMAX_23mA = 0x17, + LMU_IMAX_24mA, + LMU_IMAX_25mA, + LMU_IMAX_26mA, + LMU_IMAX_27mA = 0x1C, + LMU_IMAX_28mA, + LMU_IMAX_29mA, + LMU_IMAX_30mA, +}; + +enum lm363x_regulator_id { + LM3631_BOOST, /* Boost output */ + LM3631_LDO_CONT, /* Display panel controller */ + LM3631_LDO_OREF, /* Gamma reference */ + LM3631_LDO_POS, /* Positive display bias output */ + LM3631_LDO_NEG, /* Negative display bias output */ + LM3632_BOOST, /* Boost output */ + LM3632_LDO_POS, /* Positive display bias output */ + LM3632_LDO_NEG, /* Negative display bias output */ +}; + +/** + * struct ti_lmu + * + * @dev: Parent device pointer + * @regmap: Used for I2C communication when accessing registers + * @en_gpio: GPIO for HWEN pin [Optional] + * @notifier: Notifier for reporting hwmon event + */ +struct ti_lmu { + struct device *dev; + struct regmap *regmap; + int en_gpio; + struct blocking_notifier_head notifier; +}; +#endif diff --git a/include/linux/mfd/wm831x/core.h b/include/linux/mfd/wm831x/core.h index 76c22648436f22dad8ee59cec7f69ee984322a47..b49fa67612f19ba52bf8a3e518f18f15298d3811 100644 --- a/include/linux/mfd/wm831x/core.h +++ b/include/linux/mfd/wm831x/core.h @@ -21,6 +21,8 @@ #include #include #include +#include +#include /* * Register values.
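Returning to the ti-lmu.h addition above: the blocking notifier head embedded in struct ti_lmu lets the MFD core fan LMU_EVENT_MONITOR_DONE out to its child drivers. Below is a hedged sketch of a subscriber; the function names, the callback body and the assumption that the parent MFD device keeps the ti_lmu as its drvdata are illustrative, not taken from the patch:

#include <linux/mfd/ti-lmu.h>
#include <linux/notifier.h>
#include <linux/platform_device.h>

static int lmu_monitor_done_cb(struct notifier_block *nb,
			       unsigned long action, void *unused)
{
	if (action == LMU_EVENT_MONITOR_DONE) {
		/* e.g. read the open/short fault status registers now */
	}

	return NOTIFY_OK;
}

static struct notifier_block lmu_monitor_nb = {
	.notifier_call = lmu_monitor_done_cb,
};

/* Called from a child driver's probe(); the parent owns the ti_lmu. */
static int lmu_child_subscribe(struct platform_device *pdev)
{
	struct ti_lmu *lmu = dev_get_drvdata(pdev->dev.parent);

	return blocking_notifier_chain_register(&lmu->notifier,
						&lmu_monitor_nb);
}

The matching blocking_notifier_chain_unregister() call belongs in the child's remove path.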
@@ -367,6 +369,9 @@ struct wm831x { struct regmap *regmap; + struct wm831x_pdata pdata; + enum wm831x_parent type; + int irq; /* Our chip IRQ */ struct mutex irq_lock; struct irq_domain *irq_domain; @@ -412,7 +417,7 @@ int wm831x_set_bits(struct wm831x *wm831x, unsigned short reg, int wm831x_bulk_read(struct wm831x *wm831x, unsigned short reg, int count, u16 *buf); -int wm831x_device_init(struct wm831x *wm831x, unsigned long id, int irq); +int wm831x_device_init(struct wm831x *wm831x, int irq); void wm831x_device_exit(struct wm831x *wm831x); int wm831x_device_suspend(struct wm831x *wm831x); void wm831x_device_shutdown(struct wm831x *wm831x); @@ -427,4 +432,6 @@ static inline int wm831x_irq(struct wm831x *wm831x, int irq) extern struct regmap_config wm831x_regmap_config; +extern const struct of_device_id wm831x_of_match[]; + #endif diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h deleted file mode 100644 index e11f4d9f1c2e0b19a7b00e204dac73f53fc64bea..0000000000000000000000000000000000000000 --- a/include/linux/mg_disk.h +++ /dev/null @@ -1,45 +0,0 @@ -/* - * include/linux/mg_disk.c - * - * Private data for mflash platform driver - * - * (c) 2008 mGine Co.,LTD - * (c) 2008 unsik Kim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef __MG_DISK_H__ -#define __MG_DISK_H__ - -/* name for platform device */ -#define MG_DEV_NAME "mg_disk" - -/* names of GPIO resource */ -#define MG_RST_PIN "mg_rst" -/* except MG_BOOT_DEV, reset-out pin should be assigned */ -#define MG_RSTOUT_PIN "mg_rstout" - -/* device attribution */ -/* use mflash as boot device */ -#define MG_BOOT_DEV (1 << 0) -/* use mflash as storage device */ -#define MG_STORAGE_DEV (1 << 1) -/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */ -#define MG_STORAGE_DEV_SKIP_RST (1 << 2) - -/* private driver data */ -struct mg_drv_data { - /* disk resource */ - u32 use_polling; - - /* device attribution */ - u32 dev_attr; - - /* internally used */ - void *host; -}; - -#endif diff --git a/include/linux/migrate.h b/include/linux/migrate.h index fa76b516fa473bdfd803d09e0206923153613d65..48e24844b3c5074c8bf8c26dcf2be11a32c657e0 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -33,8 +33,9 @@ extern char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION extern void putback_movable_pages(struct list_head *l); -extern int migrate_page(struct address_space *, - struct page *, struct page *, enum migrate_mode); +extern int migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, + enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); extern int isolate_movable_page(struct page *page, isolate_mode_t mode); diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 1beb1ec2fbdf339b34affc69508a5f5462b409b0..d5bed0875d309ce95025dbcab502f51815ca431b 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -108,7 +108,7 @@ enum { MLX4_MFUNC_EQE_MASK = (MLX4_MFUNC_MAX_EQES - 1) }; -/* Driver supports 3 diffrent device methods to manage traffic steering: +/* Driver supports 3 different device methods to manage traffic steering: * -device managed - High level API for ib and eth flow steering. FW is * managing flow steering tables. 
* - B0 steering mode - Common low level API for ib and (if supported) eth. @@ -1011,8 +1011,7 @@ struct mlx4_mad_ifc { #define mlx4_foreach_ib_transport_port(port, dev) \ for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \ - ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) || \ - ((dev)->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)) + ((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_ETH)) #define MLX4_INVALID_SLAVE_ID 0xFF #define MLX4_SINK_COUNTER_INDEX(dev) (dev->caps.max_counters - 1) diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 2fcff6b4503f6a4824bea50c189b072ef6c486cb..bcdf739ee41a2cf38fe537e48172e9ca9053e3b9 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -540,6 +540,7 @@ struct mlx5_fc_stats { struct workqueue_struct *wq; struct delayed_work work; unsigned long next_query; + unsigned long sampling_interval; /* jiffies */ }; struct mlx5_eswitch; @@ -728,6 +729,7 @@ struct mlx5e_resources { u32 pdn; struct mlx5_td td; struct mlx5_core_mkey mkey; + struct mlx5_sq_bfreg bfreg; }; struct mlx5_core_dev { @@ -890,12 +892,7 @@ static inline u16 cmdif_rev(struct mlx5_core_dev *dev) static inline void *mlx5_vzalloc(unsigned long size) { - void *rtn; - - rtn = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!rtn) - rtn = vzalloc(size); - return rtn; + return kvzalloc(size, GFP_KERNEL); } static inline u32 mlx5_base_mkey(const u32 key) @@ -1100,6 +1097,25 @@ struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev); struct mlx5_uars_page *mlx5_get_uars_page(struct mlx5_core_dev *mdev); void mlx5_put_uars_page(struct mlx5_core_dev *mdev, struct mlx5_uars_page *up); +#ifndef CONFIG_MLX5_CORE_IPOIB +static inline +struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, + struct ib_device *ibdev, + const char *name, + void (*setup)(struct net_device *)) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void mlx5_rdma_netdev_free(struct net_device *netdev) {} +#else +struct net_device *mlx5_rdma_netdev_alloc(struct mlx5_core_dev *mdev, + struct ib_device *ibdev, + const char *name, + void (*setup)(struct net_device *)); +void mlx5_rdma_netdev_free(struct net_device *netdev); +#endif /* CONFIG_MLX5_CORE_IPOIB */ + struct mlx5_profile { u64 mask; u8 log_max_qp; diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h index 949b24b6c4794ce14909d779b7dbfd2534aa53db..1b166d2e19c57829279faa20a88796fabb5d3f5e 100644 --- a/include/linux/mlx5/fs.h +++ b/include/linux/mlx5/fs.h @@ -104,12 +104,18 @@ mlx5_create_auto_grouped_flow_table(struct mlx5_flow_namespace *ns, u32 level, u32 flags); +struct mlx5_flow_table_attr { + int prio; + int max_fte; + u32 level; + u32 flags; + u32 underlay_qpn; +}; + struct mlx5_flow_table * mlx5_create_flow_table(struct mlx5_flow_namespace *ns, - int prio, - int num_flow_table_entries, - u32 level, - u32 flags); + struct mlx5_flow_table_attr *ft_attr); + struct mlx5_flow_table * mlx5_create_vport_flow_table(struct mlx5_flow_namespace *ns, int prio, @@ -134,8 +140,13 @@ struct mlx5_flow_act { u32 action; u32 flow_tag; u32 encap_id; + u32 modify_id; }; +#define MLX5_DECLARE_FLOW_ACT(name) \ + struct mlx5_flow_act name = {MLX5_FLOW_CONTEXT_ACTION_FWD_DEST,\ + MLX5_FS_DEFAULT_FLOW_TAG, 0, 0} + /* Single destination per rule. * Group ID is implied by the match criteria. 
*/ @@ -156,5 +167,4 @@ struct mlx5_fc *mlx5_fc_create(struct mlx5_core_dev *dev, bool aging); void mlx5_fc_destroy(struct mlx5_core_dev *dev, struct mlx5_fc *counter); void mlx5_fc_query_cached(struct mlx5_fc *counter, u64 *bytes, u64 *packets, u64 *lastuse); - #endif diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 838242697541a28fdda4d90bf7b604e25f3bfba2..32de0724b40009adc2a802dcc5abafbb5505c0b1 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -227,6 +227,8 @@ enum { MLX5_CMD_OP_MODIFY_FLOW_TABLE = 0x93c, MLX5_CMD_OP_ALLOC_ENCAP_HEADER = 0x93d, MLX5_CMD_OP_DEALLOC_ENCAP_HEADER = 0x93e, + MLX5_CMD_OP_ALLOC_MODIFY_HEADER_CONTEXT = 0x940, + MLX5_CMD_OP_DEALLOC_MODIFY_HEADER_CONTEXT = 0x941, MLX5_CMD_OP_MAX }; @@ -234,7 +236,7 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 outer_dmac[0x1]; u8 outer_smac[0x1]; u8 outer_ether_type[0x1]; - u8 reserved_at_3[0x1]; + u8 outer_ip_version[0x1]; u8 outer_first_prio[0x1]; u8 outer_first_cfi[0x1]; u8 outer_first_vid[0x1]; @@ -263,7 +265,7 @@ struct mlx5_ifc_flow_table_fields_supported_bits { u8 inner_dmac[0x1]; u8 inner_smac[0x1]; u8 inner_ether_type[0x1]; - u8 reserved_at_23[0x1]; + u8 inner_ip_version[0x1]; u8 inner_first_prio[0x1]; u8 inner_first_cfi[0x1]; u8 inner_first_vid[0x1]; @@ -302,7 +304,8 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reserved_at_20[0x2]; u8 log_max_ft_size[0x6]; - u8 reserved_at_28[0x10]; + u8 log_max_modify_header_context[0x8]; + u8 max_modify_header_actions[0x8]; u8 max_ft_level[0x8]; u8 reserved_at_40[0x20]; @@ -368,7 +371,7 @@ struct mlx5_ifc_fte_match_set_lyr_2_4_bits { u8 cvlan_tag[0x1]; u8 svlan_tag[0x1]; u8 frag[0x1]; - u8 reserved_at_93[0x4]; + u8 ip_version[0x4]; u8 tcp_flags[0x9]; u8 tcp_sport[0x10]; @@ -869,7 +872,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 compact_address_vector[0x1]; u8 striding_rq[0x1]; - u8 reserved_at_202[0x2]; + u8 reserved_at_202[0x1]; + u8 ipoib_enhanced_offloads[0x1]; u8 ipoib_basic_offloads[0x1]; u8 reserved_at_205[0xa]; u8 drain_sigerr[0x1]; @@ -1452,7 +1456,9 @@ struct mlx5_ifc_ib_port_cntrs_grp_data_layout_bits { u8 vl_15_dropped[0x10]; - u8 reserved_at_a0[0xa0]; + u8 reserved_at_a0[0x80]; + + u8 port_xmit_wait[0x20]; }; struct mlx5_ifc_eth_per_traffic_grp_data_layout_bits { @@ -2190,6 +2196,7 @@ enum { MLX5_FLOW_CONTEXT_ACTION_COUNT = 0x8, MLX5_FLOW_CONTEXT_ACTION_ENCAP = 0x10, MLX5_FLOW_CONTEXT_ACTION_DECAP = 0x20, + MLX5_FLOW_CONTEXT_ACTION_MOD_HDR = 0x40, }; struct mlx5_ifc_flow_context_bits { @@ -2211,7 +2218,9 @@ struct mlx5_ifc_flow_context_bits { u8 encap_id[0x20]; - u8 reserved_at_e0[0x120]; + u8 modify_header_id[0x20]; + + u8 reserved_at_100[0x100]; struct mlx5_ifc_fte_match_param_bits match_value; @@ -2287,7 +2296,9 @@ struct mlx5_ifc_tisc_bits { u8 reserved_at_120[0x8]; u8 transport_domain[0x18]; - u8 reserved_at_140[0x3c0]; + u8 reserved_at_140[0x8]; + u8 underlay_qpn[0x18]; + u8 reserved_at_160[0x3a0]; }; enum { @@ -4534,6 +4545,109 @@ struct mlx5_ifc_dealloc_encap_header_in_bits { u8 reserved_60[0x20]; }; +struct mlx5_ifc_set_action_in_bits { + u8 action_type[0x4]; + u8 field[0xc]; + u8 reserved_at_10[0x3]; + u8 offset[0x5]; + u8 reserved_at_18[0x3]; + u8 length[0x5]; + + u8 data[0x20]; +}; + +struct mlx5_ifc_add_action_in_bits { + u8 action_type[0x4]; + u8 field[0xc]; + u8 reserved_at_10[0x10]; + + u8 data[0x20]; +}; + +union mlx5_ifc_set_action_in_add_action_in_auto_bits { + struct mlx5_ifc_set_action_in_bits set_action_in; + struct mlx5_ifc_add_action_in_bits add_action_in; + u8 
reserved_at_0[0x40]; +}; + +enum { + MLX5_ACTION_TYPE_SET = 0x1, + MLX5_ACTION_TYPE_ADD = 0x2, +}; + +enum { + MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16 = 0x1, + MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0 = 0x2, + MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE = 0x3, + MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16 = 0x4, + MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0 = 0x5, + MLX5_ACTION_IN_FIELD_OUT_IP_DSCP = 0x6, + MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS = 0x7, + MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT = 0x8, + MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT = 0x9, + MLX5_ACTION_IN_FIELD_OUT_IP_TTL = 0xa, + MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT = 0xb, + MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT = 0xc, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96 = 0xd, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64 = 0xe, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32 = 0xf, + MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0 = 0x10, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96 = 0x11, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64 = 0x12, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32 = 0x13, + MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0 = 0x14, + MLX5_ACTION_IN_FIELD_OUT_SIPV4 = 0x15, + MLX5_ACTION_IN_FIELD_OUT_DIPV4 = 0x16, +}; + +struct mlx5_ifc_alloc_modify_header_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 modify_header_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_alloc_modify_header_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x20]; + + u8 table_type[0x8]; + u8 reserved_at_68[0x10]; + u8 num_of_actions[0x8]; + + union mlx5_ifc_set_action_in_add_action_in_auto_bits actions[0]; +}; + +struct mlx5_ifc_dealloc_modify_header_context_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_dealloc_modify_header_context_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 modify_header_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + struct mlx5_ifc_query_dct_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -4623,17 +4737,17 @@ struct mlx5_ifc_query_cong_statistics_out_bits { u8 reserved_at_40[0x40]; - u8 cur_flows[0x20]; + u8 rp_cur_flows[0x20]; u8 sum_flows[0x20]; - u8 cnp_ignored_high[0x20]; + u8 rp_cnp_ignored_high[0x20]; - u8 cnp_ignored_low[0x20]; + u8 rp_cnp_ignored_low[0x20]; - u8 cnp_handled_high[0x20]; + u8 rp_cnp_handled_high[0x20]; - u8 cnp_handled_low[0x20]; + u8 rp_cnp_handled_low[0x20]; u8 reserved_at_140[0x100]; @@ -4643,13 +4757,13 @@ struct mlx5_ifc_query_cong_statistics_out_bits { u8 accumulators_period[0x20]; - u8 ecn_marked_roce_packets_high[0x20]; + u8 np_ecn_marked_roce_packets_high[0x20]; - u8 ecn_marked_roce_packets_low[0x20]; + u8 np_ecn_marked_roce_packets_low[0x20]; - u8 cnps_sent_high[0x20]; + u8 np_cnp_sent_high[0x20]; - u8 cnps_sent_low[0x20]; + u8 np_cnp_sent_low[0x20]; u8 reserved_at_320[0x560]; }; @@ -5013,6 +5127,7 @@ struct mlx5_ifc_modify_rq_out_bits { enum { MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_VSD = 1ULL << 1, + MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_SCATTER_FCS = 1ULL << 2, MLX5_MODIFY_RQ_IN_MODIFY_BITMASK_RQ_COUNTER_SET_ID = 1ULL << 3, }; @@ -8108,7 +8223,9 @@ struct mlx5_ifc_set_flow_table_root_in_bits { u8 reserved_at_a0[0x8]; u8 table_id[0x18]; - u8 reserved_at_c0[0x140]; + u8 reserved_at_c0[0x8]; + u8 underlay_qpn[0x18]; + u8 reserved_at_e0[0x120]; }; enum { diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index 3096370fe8319ec76f823b920ded745a10ebb6fd..bef80d0a0e30bcfb446abf14417789a34eb28af8 100644 --- 
a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -295,6 +295,16 @@ struct mlx5_av { u8 rgid[16]; }; +struct mlx5_ib_ah { + struct ib_ah ibah; + struct mlx5_av av; +}; + +static inline struct mlx5_ib_ah *to_mah(struct ib_ah *ibah) +{ + return container_of(ibah, struct mlx5_ib_ah, ibah); +} + struct mlx5_wqe_datagram_seg { struct mlx5_av av; }; diff --git a/include/linux/mm.h b/include/linux/mm.h index 00a8fa7e366a0320210941ca39dd53fed97d4e2c..7cb17c6b97de38b1e8d55ca6d8c90b8415a9c3fa 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -432,6 +432,10 @@ static inline int pud_devmap(pud_t pud) { return 0; } +static inline int pgd_devmap(pgd_t pgd) +{ + return 0; +} #endif /* @@ -514,6 +518,28 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif +extern void *kvmalloc_node(size_t size, gfp_t flags, int node); +static inline void *kvmalloc(size_t size, gfp_t flags) +{ + return kvmalloc_node(size, flags, NUMA_NO_NODE); +} +static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) +{ + return kvmalloc_node(size, flags | __GFP_ZERO, node); +} +static inline void *kvzalloc(size_t size, gfp_t flags) +{ + return kvmalloc(size, flags | __GFP_ZERO); +} + +static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) +{ + if (size != 0 && n > SIZE_MAX / size) + return NULL; + + return kvmalloc(n * size, flags); +} + extern void kvfree(const void *addr); static inline atomic_t *compound_mapcount_ptr(struct page *page) @@ -758,19 +784,11 @@ static inline enum zone_type page_zonenum(const struct page *page) } #ifdef CONFIG_ZONE_DEVICE -void get_zone_device_page(struct page *page); -void put_zone_device_page(struct page *page); static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } #else -static inline void get_zone_device_page(struct page *page) -{ -} -static inline void put_zone_device_page(struct page *page) -{ -} static inline bool is_zone_device_page(const struct page *page) { return false; @@ -786,9 +804,6 @@ static inline void get_page(struct page *page) */ VM_BUG_ON_PAGE(page_ref_count(page) <= 0, page); page_ref_inc(page); - - if (unlikely(is_zone_device_page(page))) - get_zone_device_page(page); } static inline void put_page(struct page *page) @@ -797,9 +812,6 @@ static inline void put_page(struct page *page) if (put_page_testzero(page)) __put_page(page); - - if (unlikely(is_zone_device_page(page))) - put_zone_device_page(page); } #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) @@ -2497,7 +2509,6 @@ extern long copy_huge_page_from_user(struct page *dst_page, #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ extern struct page_ext_operations debug_guardpage_ops; -extern struct page_ext_operations page_poisoning_ops; #ifdef CONFIG_DEBUG_PAGEALLOC extern unsigned int _debug_guardpage_minorder; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index f60f45fe226fcad85590b1eda3fafb1d170e943a..45cdb27791a33ff8d494549ce92f080a93434889 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -367,6 +367,11 @@ struct mm_struct { #endif unsigned long mmap_base; /* base of mmap area */ unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */ +#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES + /* Base addresses for compatible mmap() */ + unsigned long mmap_compat_base; + unsigned long mmap_compat_legacy_base; +#endif unsigned long task_size; /* size of task vm space */ unsigned long highest_vm_end; /* highest vma end address 
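The kvmalloc() helpers added to mm.h above consolidate the common "try kmalloc(), fall back to vmalloc()" pattern behind one call, with kvmalloc_array() adding the same multiplication-overflow guard as kmalloc_array(). A minimal usage sketch, assuming hypothetical alloc_table()/free_table() helpers that are not part of this patch:

#include <linux/mm.h>
#include <linux/slab.h>

struct entry {
	u64 key;
	u64 val;
};

/* Overflow-checked n * size allocation; the result may be slab- or
 * vmalloc-backed, so it must be released with kvfree(), never kfree(). */
static struct entry *alloc_table(size_t nentries)
{
	return kvmalloc_array(nentries, sizeof(struct entry), GFP_KERNEL);
}

static void free_table(struct entry *tab)
{
	kvfree(tab);
}

Note that kvzalloc() and kvzalloc_node() simply OR in __GFP_ZERO, so zeroing behaviour matches kzalloc().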
*/ pgd_t * pgd; diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 77e61e0a216a2728dd5cfecf55299402ac03c384..aad015e0152b7f1d32f92c500825b723498d1be9 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -89,6 +89,7 @@ struct mmc_ext_csd { unsigned int boot_ro_lock; /* ro lock support */ bool boot_ro_lockable; bool ffu_capable; /* Firmware upgrade support */ + bool cmdq_en; /* Command Queue enabled */ bool cmdq_support; /* Command Queue supported */ unsigned int cmdq_depth; /* Command Queue depth */ #define MMC_FIRMWARE_LEN 8 @@ -208,6 +209,7 @@ struct sdio_cis { struct mmc_host; struct sdio_func; struct sdio_func_tuple; +struct mmc_queue_req; #define SDIO_MAX_FUNCS 7 @@ -267,6 +269,8 @@ struct mmc_card { #define MMC_QUIRK_TRIM_BROKEN (1<<12) /* Skip trim */ #define MMC_QUIRK_BROKEN_HPI (1<<13) /* Disable broken HPI support */ + bool reenable_cmdq; /* Re-enable Command Queue */ + unsigned int erase_size; /* erase size in sectors */ unsigned int erase_shift; /* if erase unit is power 2 */ unsigned int pref_erase; /* in sectors */ @@ -300,6 +304,10 @@ struct mmc_card { struct dentry *debugfs_root; struct mmc_part part[MMC_NUM_PHY_PARTITION]; /* physical partitions */ unsigned int nr_parts; + + struct mmc_queue_req *mqrq; /* Shared queue structure */ + unsigned int bouncesz; /* Bounce buffer size */ + int qdepth; /* Shared queue depth */ }; static inline bool mmc_large_sector(struct mmc_card *card) @@ -307,6 +315,8 @@ static inline bool mmc_large_sector(struct mmc_card *card) return card->ext_csd.data_sector_size == 4096; } +bool mmc_card_is_blockaddr(struct mmc_card *card); + #define mmc_card_mmc(c) ((c)->type == MMC_TYPE_MMC) #define mmc_card_sd(c) ((c)->type == MMC_TYPE_SD) #define mmc_card_sdio(c) ((c)->type == MMC_TYPE_SDIO) diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 83f1c4a9f03b7e36b4cb9ed23346d443c2cb4f4c..21385ac0c9b1c9cebea155571bbbb9047af50685 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -17,6 +17,7 @@ #include #include #include +#include struct mmc_ios { unsigned int clock; /* clock rate */ @@ -499,6 +500,11 @@ static inline bool mmc_can_retune(struct mmc_host *host) return host->can_retune == 1; } +static inline enum dma_data_direction mmc_get_dma_dir(struct mmc_data *data) +{ + return data->flags & MMC_DATA_WRITE ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE; +} + int mmc_send_tuning(struct mmc_host *host, u32 opcode, int *cmd_error); int mmc_abort_tuning(struct mmc_host *host, u32 opcode); diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h index aab032a6ae6124de63b7934f49d0dc564b0d2dc3..97ca105347a6c5e608297ae1a3562925dc6f6834 100644 --- a/include/linux/mmc/sdio_func.h +++ b/include/linux/mmc/sdio_func.h @@ -53,7 +53,7 @@ struct sdio_func { unsigned int state; /* function state */ #define SDIO_STATE_PRESENT (1<<0) /* present in sysfs */ - u8 tmpbuf[4]; /* DMA:able scratch buffer */ + u8 *tmpbuf; /* DMA:able scratch buffer */ unsigned num_info; /* number of info strings */ const char **info; /* info strings */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8e02b3750fe0f6e18afeb8cfc096705f23405728..ebaccd4e7d8cdc5f5ef13fed1475a8b202d93628 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -35,7 +35,7 @@ */ #define PAGE_ALLOC_COSTLY_ORDER 3 -enum { +enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, @@ -74,6 +74,11 @@ extern char * const migratetype_names[MIGRATE_TYPES]; # define is_migrate_cma_page(_page) false #endif +static inline bool is_migrate_movable(int mt) +{ + return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; +} + #define for_each_migratetype_order(order, type) \ for (order = 0; order < MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) @@ -149,7 +154,6 @@ enum node_stat_item { NR_UNEVICTABLE, /* " " " " " */ NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ - NR_PAGES_SCANNED, /* pages scanned since last reclaim */ WORKINGSET_REFAULT, WORKINGSET_ACTIVATE, WORKINGSET_NODERECLAIM, @@ -226,6 +230,8 @@ struct lruvec { struct zone_reclaim_stat reclaim_stat; /* Evictions & activations on the inactive file list */ atomic_long_t inactive_age; + /* Refaults at the time of last reclaim cycle */ + unsigned long refaults; #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif @@ -630,6 +636,8 @@ typedef struct pglist_data { int kswapd_order; enum zone_type kswapd_classzone_idx; + int kswapd_failures; /* Number of 'reclaimed == 0' runs */ + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_classzone_idx; diff --git a/include/linux/mod_devicetable.h b/include/linux/mod_devicetable.h index 8850fcaf50dba05d9440b958c5d95cd9330e727a..566fda587fcf7a76af1c6a01e84ce33336606f3f 100644 --- a/include/linux/mod_devicetable.h +++ b/include/linux/mod_devicetable.h @@ -428,6 +428,16 @@ struct i2c_device_id { kernel_ulong_t driver_data; /* Data private to the driver */ }; +/* pci_epf */ + +#define PCI_EPF_NAME_SIZE 20 +#define PCI_EPF_MODULE_PREFIX "pci_epf:" + +struct pci_epf_device_id { + char name[PCI_EPF_NAME_SIZE]; + kernel_ulong_t driver_data; +}; + /* spi */ #define SPI_NAME_SIZE 32 diff --git a/include/linux/module.h b/include/linux/module.h index 0297c5cd7cdfc6692a400cac545d1a07637ab8b2..21f56393602f2000b901238cd9a7911e23e04448 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -493,6 +493,7 @@ static inline int module_is_live(struct module *mod) struct module *__module_text_address(unsigned long addr); struct module *__module_address(unsigned long addr); bool is_module_address(unsigned long addr); +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr); bool is_module_percpu_address(unsigned long addr); bool is_module_text_address(unsigned long addr); @@ -582,7 +583,7 @@ extern bool 
try_module_get(struct module *module); extern void module_put(struct module *module); #else /*!CONFIG_MODULE_UNLOAD*/ -static inline int try_module_get(struct module *module) { return !module || module_is_live(module); } @@ -660,6 +661,11 @@ static inline bool is_module_percpu_address(unsigned long addr) return false; } +static inline bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) +{ + return false; +} + static inline bool is_module_text_address(unsigned long addr) { return false; @@ -674,9 +680,9 @@ static inline void __module_get(struct module *module) { } -static inline int try_module_get(struct module *module) +static inline bool try_module_get(struct module *module) { - return 1; + return true; } static inline void module_put(struct module *module) diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 52666d90ca9452251c539383ecc7b038ba9afbea..6be1949ebcdff2dd2dcb5270618c7a56a42fdc40 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -60,9 +60,11 @@ struct kernel_param_ops { * Flags available for kernel_param * * UNSAFE - the parameter is dangerous and setting it will taint the kernel + * HWPARAM - Hardware param not permitted in lockdown mode */ enum { - KERNEL_PARAM_FL_UNSAFE = (1 << 0) + KERNEL_PARAM_FL_UNSAFE = (1 << 0), + KERNEL_PARAM_FL_HWPARAM = (1 << 1), }; struct kernel_param { @@ -451,6 +453,67 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp); perm, -1, 0); \ __MODULE_PARM_TYPE(name, "array of " #type) +enum hwparam_type { + hwparam_ioport, /* Module parameter configures an I/O port */ + hwparam_iomem, /* Module parameter configures an I/O mem address */ + hwparam_ioport_or_iomem, /* Module parameter could be either, depending on other option */ + hwparam_irq, /* Module parameter configures an IRQ */ + hwparam_dma, /* Module parameter configures a DMA channel */ + hwparam_dma_addr, /* Module parameter configures a DMA buffer address */ + hwparam_other, /* Module parameter configures some other value */ +}; + +/** + * module_param_hw_named - A parameter representing a hw parameter + * @name: a valid C identifier which is the parameter name. + * @value: the actual lvalue to alter. + * @type: the type of the parameter + * @hwtype: what the value represents (enum hwparam_type) + * @perm: visibility in sysfs. + * + * Usually it's a good idea to have variable names and user-exposed names the + * same, but that's harder if the variable must be non-static or is inside a + * structure. This allows exposure under a different name. + */ +#define module_param_hw_named(name, value, type, hwtype, perm) \ + param_check_##type(name, &(value)); \ + __module_param_call(MODULE_PARAM_PREFIX, name, \ + &param_ops_##type, &value, \ + perm, -1, \ + KERNEL_PARAM_FL_HWPARAM | (hwparam_##hwtype & 0)); \ + __MODULE_PARM_TYPE(name, #type) + +#define module_param_hw(name, type, hwtype, perm) \ + module_param_hw_named(name, name, type, hwtype, perm) + +/** + * module_param_hw_array - A parameter representing an array of hw parameters + * @name: the name of the array variable + * @type: the type, as per module_param() + * @hwtype: what the value represents (enum hwparam_type) + * @nump: optional pointer filled in with the number written + * @perm: visibility in sysfs + * + * Input and output are as comma-separated values. Commas inside values + * don't work properly (e.g. an array of charp). 
+ * + * ARRAY_SIZE(@name) is used to determine the number of elements in the + * array, so the definition must be visible. + */ +#define module_param_hw_array(name, type, hwtype, nump, perm) \ + param_check_##type(name, &(name)[0]); \ + static const struct kparam_array __param_arr_##name \ + = { .max = ARRAY_SIZE(name), .num = nump, \ + .ops = &param_ops_##type, \ + .elemsize = sizeof(name[0]), .elem = name }; \ + __module_param_call(MODULE_PARAM_PREFIX, name, \ + &param_array_ops, \ + .arr = &__param_arr_##name, \ + perm, -1, \ + KERNEL_PARAM_FL_HWPARAM | (hwparam_##hwtype & 0)); \ + __MODULE_PARM_TYPE(name, "array of " #type) + + extern const struct kernel_param_ops param_array_ops; extern const struct kernel_param_ops param_ops_string; diff --git a/include/linux/mpls.h b/include/linux/mpls.h index 9999145bc19007105db13aef9d473296f2f0c5a9..384fb22b6c436e357919656232342c0d538b8de3 100644 --- a/include/linux/mpls.h +++ b/include/linux/mpls.h @@ -3,4 +3,9 @@ #include +#define MPLS_TTL_MASK (MPLS_LS_TTL_MASK >> MPLS_LS_TTL_SHIFT) +#define MPLS_BOS_MASK (MPLS_LS_S_MASK >> MPLS_LS_S_SHIFT) +#define MPLS_TC_MASK (MPLS_LS_TC_MASK >> MPLS_LS_TC_SHIFT) +#define MPLS_LABEL_MASK (MPLS_LS_LABEL_MASK >> MPLS_LS_LABEL_SHIFT) + #endif /* _LINUX_MPLS_H */ diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index eebdc63cf6af94a5ea9a5c4703c416f87e0079a8..f8a2ef239c60adf6ce49f5ebcdbf576db0bd434d 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -334,11 +334,6 @@ struct mtd_info { int (*_get_device) (struct mtd_info *mtd); void (*_put_device) (struct mtd_info *mtd); - /* Backing device capabilities for this device * - provides mmap capabilities */ - struct backing_dev_info *backing_dev_info; - struct notifier_block reboot_notifier; /* default mode before reboot */ /* ECC status information */ @@ -393,7 +388,7 @@ static inline void mtd_set_of_node(struct mtd_info *mtd, static inline struct device_node *mtd_get_of_node(struct mtd_info *mtd) { - return mtd->dev.of_node; + return dev_of_node(&mtd->dev); } static inline int mtd_oobavail(struct mtd_info *mtd, struct mtd_oob_ops *ops) diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 9591e0fbe5bd76a1c47e2deb4292469e0a2cf8e6..8f67b15816836a1127fddae062c1ad76774e4534 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -366,26 +366,6 @@ struct onfi_ext_param_page { */ } __packed; -struct nand_onfi_vendor_micron { - u8 two_plane_read; - u8 read_cache; - u8 read_unique_id; - u8 dq_imped; - u8 dq_imped_num_settings; - u8 dq_imped_feat_addr; - u8 rb_pulldown_strength; - u8 rb_pulldown_strength_feat_addr; - u8 rb_pulldown_strength_num_settings; - u8 otp_mode; - u8 otp_page_start; - u8 otp_data_prot_addr; - u8 otp_num_pages; - u8 otp_feat_addr; - u8 read_retry_options; - u8 reserved[72]; - u8 param_revision; -} __packed; - struct jedec_ecc_info { u8 ecc_bits; u8 codeword_size; @@ -464,6 +444,17 @@ struct nand_jedec_params { __le16 crc; } __packed; +/** + * struct nand_id - NAND id structure + * @data: buffer containing the id bytes. Currently 8 bytes large, but can + * be extended if required. + * @len: ID length. + */ +struct nand_id { + u8 data[8]; + int len; +}; + /** * struct nand_hw_control - Control structure for hardware controller (e.g ECC generator) shared among independent devices * @lock: protection lock @@ -525,7 +516,7 @@ static inline void nand_hw_control_init(struct nand_hw_control *nfc) * out-of-band data). 
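The module_param_hw*() macros defined above are drop-in replacements for module_param*() that additionally set KERNEL_PARAM_FL_HWPARAM, so the core can recognise parameters that hand a driver raw hardware resources; the `(hwparam_##hwtype & 0)` term contributes nothing to the flags and serves only as a compile-time check that the hwtype token is a valid enum hwparam_type member. A conversion sketch for a legacy-style driver, with hypothetical io/irq/dma parameters:

#include <linux/moduleparam.h>

static unsigned long io = 0x300;	/* I/O port base */
static int irq = 5;			/* interrupt line */
static int dma[2] = { -1, -1 };		/* optional DMA channels */

/* Same load-time and sysfs behaviour as module_param()/module_param_array(),
 * plus the KERNEL_PARAM_FL_HWPARAM flag on each parameter. */
module_param_hw(io, ulong, ioport, 0444);
module_param_hw(irq, int, irq, 0444);
module_param_hw_array(dma, int, dma, NULL, 0444);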
* @read_page: function to read a page according to the ECC generator * requirements; returns maximum number of bitflips corrected in - * any single ECC step, 0 if bitflips uncorrectable, -EIO hw error + * any single ECC step, -EIO hw error * @read_subpage: function to read parts of the page covered by ECC; * returns same as read_page() * @write_subpage: function to write parts of the page covered by ECC. @@ -720,6 +711,20 @@ nand_get_sdr_timings(const struct nand_data_interface *conf) return &conf->timings.sdr; } +/** + * struct nand_manufacturer_ops - NAND Manufacturer operations + * @detect: detect the NAND memory organization and capabilities + * @init: initialize all vendor specific fields (like the ->read_retry() + * implementation) if any. + * @cleanup: the ->init() function may have allocated resources, ->cleanup() + * is here to let vendor specific code release those resources. + */ +struct nand_manufacturer_ops { + void (*detect)(struct nand_chip *chip); + int (*init)(struct nand_chip *chip); + void (*cleanup)(struct nand_chip *chip); +}; + /** * struct nand_chip - NAND Private Flash Chip Data * @mtd: MTD device registered to the MTD framework @@ -750,6 +755,7 @@ nand_get_sdr_timings(const struct nand_data_interface *conf) * setting the read-retry mode. Mostly needed for MLC NAND. * @ecc: [BOARDSPECIFIC] ECC control structure * @buffers: buffer structure for read/write + * @buf_align: minimum buffer alignment required by a platform * @hwcontrol: platform-specific hardware control structure * @erase: [REPLACEABLE] erase function * @scan_bbt: [REPLACEABLE] function to scan bad block table @@ -793,6 +799,7 @@ nand_get_sdr_timings(const struct nand_data_interface *conf) * @pagebuf_bitflips: [INTERN] holds the bitflip count for the page which is * currently in data_buf. * @subpagesize: [INTERN] holds the subpagesize + * @id: [INTERN] holds NAND ID * @onfi_version: [INTERN] holds the chip ONFI version (BCD encoded), * non 0 if ONFI supported. * @jedec_version: [INTERN] holds the chip JEDEC version (BCD encoded), @@ -822,7 +829,7 @@ nand_get_sdr_timings(const struct nand_data_interface *conf) * @errstat: [OPTIONAL] hardware specific function to perform * additional error status checks (determine if errors are * correctable). 
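The new struct nand_manufacturer_ops moves vendor-specific detection and initialisation out of the NAND core and behind per-manufacturer hooks (the toshiba/samsung/hynix/micron/amd/macronix tables exported further down). A sketch of the shape such a table takes, using a hypothetical "foo" vendor rather than any real implementation:

#include <linux/mtd/nand.h>
#include <linux/slab.h>

static void foo_nand_detect(struct nand_chip *chip)
{
	/* Inspect chip->id.data[] and fill in anything the generic
	 * tables could not; the default decoder is exported for reuse. */
	nand_decode_ext_id(chip);
}

static int foo_nand_init(struct nand_chip *chip)
{
	/* Install vendor hooks (e.g. a ->setup_read_retry implementation)
	 * and stash private state with nand_set_manufacturer_data(). */
	return 0;
}

static void foo_nand_cleanup(struct nand_chip *chip)
{
	/* Release whatever ->init() allocated. */
	kfree(nand_get_manufacturer_data(chip));
}

const struct nand_manufacturer_ops foo_nand_manuf_ops = {
	.detect = foo_nand_detect,
	.init = foo_nand_init,
	.cleanup = foo_nand_cleanup,
};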
- * @write_page: [REPLACEABLE] High-level page write function + * @manufacturer: [INTERN] Contains manufacturer information */ struct nand_chip { @@ -847,9 +854,6 @@ struct nand_chip { int (*scan_bbt)(struct mtd_info *mtd); int (*errstat)(struct mtd_info *mtd, struct nand_chip *this, int state, int status, int page); - int (*write_page)(struct mtd_info *mtd, struct nand_chip *chip, - uint32_t offset, int data_len, const uint8_t *buf, - int oob_required, int page, int cached, int raw); int (*onfi_set_features)(struct mtd_info *mtd, struct nand_chip *chip, int feature_addr, uint8_t *subfeature_para); int (*onfi_get_features)(struct mtd_info *mtd, struct nand_chip *chip, @@ -881,6 +885,7 @@ struct nand_chip { int badblockpos; int badblockbits; + struct nand_id id; int onfi_version; int jedec_version; union { @@ -901,6 +906,7 @@ struct nand_chip { struct nand_ecc_ctrl ecc; struct nand_buffers *buffers; + unsigned long buf_align; struct nand_hw_control hwcontrol; uint8_t *bbt; @@ -910,6 +916,11 @@ struct nand_chip { struct nand_bbt_descr *badblock_pattern; void *priv; + + struct { + const struct nand_manufacturer *desc; + void *priv; + } manufacturer; }; extern const struct mtd_ooblayout_ops nand_ooblayout_sp_ops; @@ -946,6 +957,17 @@ static inline void nand_set_controller_data(struct nand_chip *chip, void *priv) chip->priv = priv; } +static inline void nand_set_manufacturer_data(struct nand_chip *chip, + void *priv) +{ + chip->manufacturer.priv = priv; +} + +static inline void *nand_get_manufacturer_data(struct nand_chip *chip) +{ + return chip->manufacturer.priv; +} + /* * NAND Flash Manufacturer ID Codes */ @@ -1049,17 +1071,33 @@ struct nand_flash_dev { }; /** - * struct nand_manufacturers - NAND Flash Manufacturer ID Structure + * struct nand_manufacturer - NAND Flash Manufacturer structure * @name: Manufacturer name * @id: manufacturer ID code of device. + * @ops: manufacturer operations */ -struct nand_manufacturers { +struct nand_manufacturer { int id; char *name; + const struct nand_manufacturer_ops *ops; }; +const struct nand_manufacturer *nand_get_manufacturer(u8 id); + +static inline const char * +nand_manufacturer_name(const struct nand_manufacturer *manufacturer) +{ + return manufacturer ? 
manufacturer->name : "Unknown"; +} + extern struct nand_flash_dev nand_flash_ids[]; -extern struct nand_manufacturers nand_manuf_ids[]; + +extern const struct nand_manufacturer_ops toshiba_nand_manuf_ops; +extern const struct nand_manufacturer_ops samsung_nand_manuf_ops; +extern const struct nand_manufacturer_ops hynix_nand_manuf_ops; +extern const struct nand_manufacturer_ops micron_nand_manuf_ops; +extern const struct nand_manufacturer_ops amd_nand_manuf_ops; +extern const struct nand_manufacturer_ops macronix_nand_manuf_ops; int nand_default_bbt(struct mtd_info *mtd); int nand_markbad_bbt(struct mtd_info *mtd, loff_t offs); @@ -1226,4 +1264,6 @@ int nand_reset(struct nand_chip *chip, int chipnr); /* Free resources held by the NAND device */ void nand_cleanup(struct nand_chip *chip); +/* Default extended ID decoding function */ +void nand_decode_ext_id(struct nand_chip *chip); #endif /* __LINUX_MTD_NAND_H */ diff --git a/include/linux/namei.h b/include/linux/namei.h index f29abda31e6dc8b32376a86c6c52e9e9ca6da0a0..8b4794e83196db62997ef5aea81886c3ad800bbd 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -44,6 +44,7 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; #define LOOKUP_JUMPED 0x1000 #define LOOKUP_ROOT 0x2000 #define LOOKUP_EMPTY 0x4000 +#define LOOKUP_DOWN 0x8000 extern int path_pts(struct path *path); diff --git a/include/linux/nd.h b/include/linux/nd.h index fa66aeed441a5882a1dcad4ad7a8c89faca1abf3..194b8e002ea74d9e0cef5ec184cf23833c0b6215 100644 --- a/include/linux/nd.h +++ b/include/linux/nd.h @@ -48,7 +48,7 @@ struct nd_namespace_common { struct device dev; struct device *claim; int (*rw_bytes)(struct nd_namespace_common *, resource_size_t offset, - void *buf, size_t size, int rw); + void *buf, size_t size, int rw, unsigned long flags); }; static inline struct nd_namespace_common *to_ndns(struct device *dev) @@ -134,9 +134,10 @@ static inline struct nd_namespace_blk *to_nd_namespace_blk(const struct device * * @buf is up-to-date upon return from this routine. */ static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns, - resource_size_t offset, void *buf, size_t size) + resource_size_t offset, void *buf, size_t size, + unsigned long flags) { - return ndns->rw_bytes(ndns, offset, buf, size, READ); + return ndns->rw_bytes(ndns, offset, buf, size, READ, flags); } /** @@ -152,9 +153,10 @@ static inline int nvdimm_read_bytes(struct nd_namespace_common *ndns, * to media is handled internal to the @ndns driver, if at all. */ static inline int nvdimm_write_bytes(struct nd_namespace_common *ndns, - resource_size_t offset, void *buf, size_t size) + resource_size_t offset, void *buf, size_t size, + unsigned long flags) { - return ndns->rw_bytes(ndns, offset, buf, size, WRITE); + return ndns->rw_bytes(ndns, offset, buf, size, WRITE, flags); } #define MODULE_ALIAS_ND_DEVICE(type) \ diff --git a/include/linux/net.h b/include/linux/net.h index 0620f5e18c96b706b70fb71005b29008a11fffb1..abcfa46a2bd9a6f9eb242dacb199070a05d61014 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -298,6 +298,9 @@ int kernel_sendpage(struct socket *sock, struct page *page, int offset, int kernel_sock_ioctl(struct socket *sock, int cmd, unsigned long arg); int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how); +/* Routine returns the IP overhead imposed by a (caller-protected) socket. 
*/ +u32 kernel_sock_ip_overhead(struct sock *sk); + #define MODULE_ALIAS_NETPROTO(proto) \ MODULE_ALIAS("net-pf-" __stringify(proto)) diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 9a0419594e842ca00a5ecfca53823b38bad207bb..1d4737cffc7116c898b42787680b2e9b0dd46812 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -54,8 +54,9 @@ enum { */ NETIF_F_GSO_TUNNEL_REMCSUM_BIT, /* ... TUNNEL with TSO & REMCSUM */ NETIF_F_GSO_SCTP_BIT, /* ... SCTP fragmentation */ + NETIF_F_GSO_ESP_BIT, /* ... ESP with TSO */ /**/NETIF_F_GSO_LAST = /* last bit, see GSO_MASK */ - NETIF_F_GSO_SCTP_BIT, + NETIF_F_GSO_ESP_BIT, NETIF_F_FCOE_CRC_BIT, /* FCoE CRC32 */ NETIF_F_SCTP_CRC_BIT, /* SCTP checksum offload */ @@ -73,6 +74,8 @@ enum { NETIF_F_HW_L2FW_DOFFLOAD_BIT, /* Allow L2 Forwarding in Hardware */ NETIF_F_HW_TC_BIT, /* Offload TC infrastructure */ + NETIF_F_HW_ESP_BIT, /* Hardware ESP transformation offload */ + NETIF_F_HW_ESP_TX_CSUM_BIT, /* ESP with TX checksum offload */ /* * Add your fresh new feature above and remember to update @@ -129,11 +132,14 @@ enum { #define NETIF_F_GSO_PARTIAL __NETIF_F(GSO_PARTIAL) #define NETIF_F_GSO_TUNNEL_REMCSUM __NETIF_F(GSO_TUNNEL_REMCSUM) #define NETIF_F_GSO_SCTP __NETIF_F(GSO_SCTP) +#define NETIF_F_GSO_ESP __NETIF_F(GSO_ESP) #define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER) #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) #define NETIF_F_HW_L2FW_DOFFLOAD __NETIF_F(HW_L2FW_DOFFLOAD) #define NETIF_F_HW_TC __NETIF_F(HW_TC) +#define NETIF_F_HW_ESP __NETIF_F(HW_ESP) +#define NETIF_F_HW_ESP_TX_CSUM __NETIF_F(HW_ESP_TX_CSUM) #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 97456b2539e46d6232dda804f6a434db6fd7134f..9c23bd2efb56393e7c616d42d427101c3f5f51d2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -41,7 +41,6 @@ #include #include -#include #ifdef CONFIG_DCB #include #endif @@ -57,6 +56,8 @@ struct netpoll_info; struct device; struct phy_device; +struct dsa_switch_tree; + /* 802.11 specific */ struct wireless_dev; /* 802.15.4 specific */ @@ -236,8 +237,7 @@ struct netdev_hw_addr_list { netdev_hw_addr_list_for_each(ha, &(dev)->mc) struct hh_cache { - u16 hh_len; - u16 __pad; + unsigned int hh_len; seqlock_t hh_lock; /* cached hardware header; allow for machine alignment needs. 
*/ @@ -786,11 +786,11 @@ struct tc_cls_u32_offload; struct tc_to_netdev { unsigned int type; union { - u8 tc; struct tc_cls_u32_offload *cls_u32; struct tc_cls_flower_offload *cls_flower; struct tc_cls_matchall_offload *cls_mall; struct tc_cls_bpf_offload *cls_bpf; + struct tc_mqprio_qopt *mqprio; }; bool egress_dev; }; @@ -813,16 +813,31 @@ enum xdp_netdev_command { XDP_QUERY_PROG, }; +struct netlink_ext_ack; + struct netdev_xdp { enum xdp_netdev_command command; union { /* XDP_SETUP_PROG */ - struct bpf_prog *prog; + struct { + struct bpf_prog *prog; + struct netlink_ext_ack *extack; + }; /* XDP_QUERY_PROG */ bool prog_attached; }; }; +#ifdef CONFIG_XFRM_OFFLOAD +struct xfrmdev_ops { + int (*xdo_dev_state_add) (struct xfrm_state *x); + void (*xdo_dev_state_delete) (struct xfrm_state *x); + void (*xdo_dev_state_free) (struct xfrm_state *x); + bool (*xdo_dev_offload_ok) (struct sk_buff *skb, + struct xfrm_state *x); +}; +#endif + /* * This structure defines the management hooks for network devices. * The following hooks can be defined; unless noted otherwise, they are @@ -1696,6 +1711,10 @@ struct net_device { const struct ndisc_ops *ndisc_ops; #endif +#ifdef CONFIG_XFRM + const struct xfrmdev_ops *xfrmdev_ops; +#endif + const struct header_ops *header_ops; unsigned int flags; @@ -1715,7 +1734,7 @@ struct net_device { unsigned int max_mtu; unsigned short type; unsigned short hard_header_len; - unsigned short min_header_len; + unsigned char min_header_len; unsigned short needed_headroom; unsigned short needed_tailroom; @@ -1776,6 +1795,7 @@ struct net_device { unsigned int real_num_rx_queues; #endif + struct bpf_prog __rcu *xdp_prog; unsigned long gro_flush_timeout; rx_handler_func_t __rcu *rx_handler; void __rcu *rx_handler_data; @@ -1894,6 +1914,13 @@ struct net_device { }; #define to_net_dev(d) container_of(d, struct net_device, dev) +static inline bool netif_elide_gro(const struct net_device *dev) +{ + if (!(dev->features & NETIF_F_GRO) || dev->xdp_prog) + return true; + return false; +} + #define NETDEV_ALIGN 32 static inline @@ -2004,15 +2031,6 @@ void dev_net_set(struct net_device *dev, struct net *net) write_pnet(&dev->nd_net, net); } -static inline bool netdev_uses_dsa(struct net_device *dev) -{ -#if IS_ENABLED(CONFIG_NET_DSA) - if (dev->dsa_ptr != NULL) - return dsa_uses_tagged_protocol(dev->dsa_ptr); -#endif - return false; -} - /** * netdev_priv - access network device private data * @dev: network device @@ -3278,7 +3296,8 @@ int dev_get_phys_port_id(struct net_device *dev, int dev_get_phys_port_name(struct net_device *dev, char *name, size_t len); int dev_change_proto_down(struct net_device *dev, bool proto_down); -int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags); +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, u32 flags); struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev); struct sk_buff *dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, struct netdev_queue *txq, int *ret); @@ -3305,6 +3324,7 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev, void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); extern int netdev_budget; +extern unsigned int netdev_budget_usecs; /* Called by rtnetlink.c:rtnl_unlock() */ void netdev_run_todo(void); @@ -3394,10 +3414,10 @@ static inline void netif_dormant_off(struct net_device *dev) } /** - * netif_dormant - test if carrier present + * netif_dormant - test if device is dormant * @dev: network device * - 
* Check if carrier is present on device + * Check if device is dormant. */ static inline bool netif_dormant(const struct net_device *dev) { @@ -4078,6 +4098,7 @@ static inline bool net_gso_ok(netdev_features_t features, int gso_type) BUILD_BUG_ON(SKB_GSO_PARTIAL != (NETIF_F_GSO_PARTIAL >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_TUNNEL_REMCSUM != (NETIF_F_GSO_TUNNEL_REMCSUM >> NETIF_F_GSO_SHIFT)); BUILD_BUG_ON(SKB_GSO_SCTP != (NETIF_F_GSO_SCTP >> NETIF_F_GSO_SHIFT)); + BUILD_BUG_ON(SKB_GSO_ESP != (NETIF_F_GSO_ESP >> NETIF_F_GSO_SHIFT)); return (features & feature) == feature; } @@ -4180,6 +4201,11 @@ static inline bool netif_is_ovs_master(const struct net_device *dev) return dev->priv_flags & IFF_OPENVSWITCH; } +static inline bool netif_is_ovs_port(const struct net_device *dev) +{ + return dev->priv_flags & IFF_OVS_DATAPATH; +} + static inline bool netif_is_team_master(const struct net_device *dev) { return dev->priv_flags & IFF_TEAM; diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h index 1b49209dd5c7cd74644624007ce57df107c3744b..996711d8a7b4b536990c05d698243cdbb3413e44 100644 --- a/include/linux/netfilter/nfnetlink.h +++ b/include/linux/netfilter/nfnetlink.h @@ -41,6 +41,11 @@ int nfnetlink_set_err(struct net *net, u32 portid, u32 group, int error); int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, int flags); +static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type) +{ + return subsys << 8 | msg_type; +} + void nfnl_lock(__u8 subsys_id); void nfnl_unlock(__u8 subsys_id); #ifdef CONFIG_PROVE_LOCKING diff --git a/include/linux/netfilter_bridge/ebtables.h b/include/linux/netfilter_bridge/ebtables.h index 984b2112c77b642f75e3f121387e955aa3f22b93..a30efb437e6d1cfa42a75ec72997bf12b9cdf59d 100644 --- a/include/linux/netfilter_bridge/ebtables.h +++ b/include/linux/netfilter_bridge/ebtables.h @@ -109,8 +109,10 @@ struct ebt_table { #define EBT_ALIGN(s) (((s) + (__alignof__(struct _xt_align)-1)) & \ ~(__alignof__(struct _xt_align)-1)) extern struct ebt_table *ebt_register_table(struct net *net, - const struct ebt_table *table); -extern void ebt_unregister_table(struct net *net, struct ebt_table *table); + const struct ebt_table *table, + const struct nf_hook_ops *); +extern void ebt_unregister_table(struct net *net, struct ebt_table *table, + const struct nf_hook_ops *); extern unsigned int ebt_do_table(struct sk_buff *skb, const struct nf_hook_state *state, struct ebt_table *table); diff --git a/include/linux/netlink.h b/include/linux/netlink.h index da14ab61f3634cae5c9ec1636c2774a6f388083d..5fff5ba5964e25e3aa5814509d08943409d471b5 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -62,11 +62,47 @@ netlink_kernel_create(struct net *net, int unit, struct netlink_kernel_cfg *cfg) return __netlink_kernel_create(net, unit, THIS_MODULE, cfg); } +/* this can be increased when necessary - don't expose to userland */ +#define NETLINK_MAX_COOKIE_LEN 20 + +/** + * struct netlink_ext_ack - netlink extended ACK report struct + * @_msg: message string to report - don't access directly, use + * %NL_SET_ERR_MSG + * @bad_attr: attribute with error + * @cookie: cookie data to return to userspace (for success) + * @cookie_len: actual cookie data length + */ +struct netlink_ext_ack { + const char *_msg; + const struct nlattr *bad_attr; + u8 cookie[NETLINK_MAX_COOKIE_LEN]; + u8 cookie_len; +}; + +/* Always use this macro, this allows later putting the + * message into a separate section or such for things + * like translation or listing 
all possible messages. + * Currently string formatting is not supported (due + * to the lack of an output buffer.) + */ +#define NL_SET_ERR_MSG(extack, msg) do { \ + static const char __msg[] = (msg); \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) \ + __extack->_msg = __msg; \ +} while (0) + +#define NL_SET_ERR_MSG_MOD(extack, msg) \ + NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) + extern void netlink_kernel_release(struct sock *sk); extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups); extern int netlink_change_ngroups(struct sock *sk, unsigned int groups); extern void __netlink_clear_multicast_users(struct sock *sk, unsigned int group); -extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err); +extern void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, + const struct netlink_ext_ack *extack); extern int netlink_has_listeners(struct sock *sk, unsigned int group); extern int netlink_unicast(struct sock *ssk, struct sk_buff *skb, __u32 portid, int nonblock); diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 287f341610864f745e44fb4cbfb4ad44a26f4ea5..bb0eb2c9acca7d19ddf14c65480a9ee27715c561 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -76,6 +76,7 @@ struct nfs_open_context { #define NFS_CONTEXT_ERROR_WRITE (0) #define NFS_CONTEXT_RESEND_WRITES (1) #define NFS_CONTEXT_BAD (2) +#define NFS_CONTEXT_UNLOCK (3) int error; struct list_head list; @@ -499,24 +500,12 @@ extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned */ extern int nfs_sync_inode(struct inode *inode); extern int nfs_wb_all(struct inode *inode); -extern int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder); +extern int nfs_wb_page(struct inode *inode, struct page *page); extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); -extern struct nfs_commit_data *nfs_commitdata_alloc(void); +extern struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail); extern void nfs_commit_free(struct nfs_commit_data *data); -static inline int -nfs_wb_launder_page(struct inode *inode, struct page *page) -{ - return nfs_wb_single_page(inode, page, true); -} - -static inline int -nfs_wb_page(struct inode *inode, struct page *page) -{ - return nfs_wb_single_page(inode, page, false); -} - static inline int nfs_have_writebacks(struct inode *inode) { diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index b34097c6784864193e67053f8fa8e5bbbc092071..e418a1096662c387f1b96e834d90983c22e5106f 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -133,7 +133,6 @@ struct nfs_server { struct rpc_clnt * client_acl; /* ACL RPC client handle */ struct nlm_host *nlm_host; /* NLM client handle */ struct nfs_iostats __percpu *io_stats; /* I/O statistics */ - struct backing_dev_info backing_dev_info; atomic_long_t writeback; /* number of writeback pages */ int flags; /* various flags */ unsigned int caps; /* server capabilities */ @@ -222,6 +221,7 @@ struct nfs_server { u32 mountd_version; unsigned short mountd_port; unsigned short mountd_protocol; + struct rpc_wait_queue uoc_rpcwaitq; }; /* Server capabilities */ diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h index 957049f72290d4b6c328845535c664aea4afc7c0..247cc3d3498f241b19f57b486685986b81d95e56 100644 --- a/include/linux/nfs_page.h +++ b/include/linux/nfs_page.h @@ -64,7 +64,6 @@ struct nfs_pageio_ops { }; struct 
nfs_rw_ops { - const fmode_t rw_mode; struct nfs_pgio_header *(*rw_alloc_header)(void); void (*rw_free_header)(struct nfs_pgio_header *); int (*rw_done)(struct rpc_task *, struct nfs_pgio_header *, @@ -124,7 +123,8 @@ extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, const struct nfs_pgio_completion_ops *compl_ops, const struct nfs_rw_ops *rw_ops, size_t bsize, - int how); + int how, + gfp_t gfp_flags); extern int nfs_pageio_add_request(struct nfs_pageio_descriptor *, struct nfs_page *); extern int nfs_pageio_resend(struct nfs_pageio_descriptor *, @@ -141,6 +141,7 @@ extern int nfs_page_group_lock(struct nfs_page *, bool); extern void nfs_page_group_lock_wait(struct nfs_page *); extern void nfs_page_group_unlock(struct nfs_page *); extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); +extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *); /* * Lock the page of an asynchronous request diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 348f7c158084b8de71c2a0a2f6941b0db7bf2c3b..b28c83475ee8e9106c5f567551e4307156dfe465 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1383,6 +1383,7 @@ struct nfs42_copy_res { struct nfs42_write_res write_res; bool consecutive; bool synchronous; + struct nfs_commitres commit_res; }; struct nfs42_seek_args { @@ -1427,6 +1428,7 @@ struct nfs_pgio_header { struct list_head pages; struct nfs_page *req; struct nfs_writeverf verf; /* Used for writes */ + fmode_t rw_mode; struct pnfs_layout_segment *lseg; loff_t io_start; const struct rpc_call_ops *mds_ops; @@ -1550,6 +1552,7 @@ struct nfs_rpc_ops { const struct inode_operations *dir_inode_ops; const struct inode_operations *file_inode_ops; const struct file_operations *file_ops; + const struct nlmclnt_operations *nlmclnt_ops; int (*getroot) (struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); diff --git a/include/linux/nvme-fc-driver.h b/include/linux/nvme-fc-driver.h index f21471f7ee407a80534047004ecd784ebdf347d5..0db37158a61d4e4e5af0d16d14c07510ef660330 100644 --- a/include/linux/nvme-fc-driver.h +++ b/include/linux/nvme-fc-driver.h @@ -137,9 +137,9 @@ enum nvmefc_fcp_datadir { * transferred. Should equal payload_length on success. * @rcv_rsplen: length, in bytes, of the FCP RSP IU received. * @status: Completion status of the FCP operation. must be 0 upon success, - * NVME_SC_FC_xxx value upon failure. Note: this is NOT a - * reflection of the NVME CQE completion status. Only the status - * of the FCP operation at the NVME-FC level. + * negative errno value upon failure (ex: -EIO). Note: this is + * NOT a reflection of the NVME CQE completion status. Only the + * status of the FCP operation at the NVME-FC level. */ struct nvmefc_fcp_req { void *cmdaddr; @@ -533,9 +533,6 @@ enum { * rsp as well */ NVMET_FCOP_RSP = 4, /* send rsp frame */ - NVMET_FCOP_ABORT = 5, /* abort exchange via ABTS */ - NVMET_FCOP_BA_ACC = 6, /* send BA_ACC */ - NVMET_FCOP_BA_RJT = 7, /* send BA_RJT */ }; /** @@ -572,8 +569,6 @@ enum { * upon compeletion of the operation. The nvmet-fc layer will also set a * private pointer for its own use in the done routine. * - * Note: the LLDD must never fail a NVMET_FCOP_ABORT request !! - * * Values set by the NVMET-FC layer prior to calling the LLDD fcp_op * entrypoint. * @op: Indicates the FCP IU operation to perform (see NVMET_FCOP_xxx) @@ -655,6 +650,22 @@ enum { * on. The transport should pick a cpu to schedule the work * on. 
*/ + NVMET_FCTGTFEAT_CMD_IN_ISR = (1 << 2), + /* Bit 2: When 0, the LLDD is calling the cmd rcv handler + * in a non-isr context, allowing the transport to finish + * op completion in the calling context. When 1, the LLDD + * is calling the cmd rcv handler in an ISR context, + * requiring the transport to transition to a workqueue + * for op completion. + */ + NVMET_FCTGTFEAT_OPDONE_IN_ISR = (1 << 3), + /* Bit 3: When 0, the LLDD is calling the op done handler + * in a non-isr context, allowing the transport to finish + * op completion in the calling context. When 1, the LLDD + * is calling the op done handler in an ISR context, + * requiring the transport to transition to a workqueue + * for op completion. + */ }; @@ -725,12 +736,12 @@ struct nvmet_fc_target_port { * be freed/released. * Entrypoint is Mandatory. * - * @fcp_op: Called to perform a data transfer, transmit a response, or - * abort an FCP opertion. The nvmefc_tgt_fcp_req structure is the same - * LLDD-supplied exchange structure specified in the - * nvmet_fc_rcv_fcp_req() call made when the FCP CMD IU was received. - * The op field in the structure shall indicate the operation for - * the LLDD to perform relative to the io. + * @fcp_op: Called to perform a data transfer or transmit a response. + * The nvmefc_tgt_fcp_req structure is the same LLDD-supplied + * exchange structure specified in the nvmet_fc_rcv_fcp_req() call + * made when the FCP CMD IU was received. The op field in the + * structure shall indicate the operation for the LLDD to perform + * relative to the io. * NVMET_FCOP_READDATA operation: the LLDD is to send the * payload data (described by sglist) to the host in 1 or * more FC sequences (preferrably 1). Note: the fc-nvme layer @@ -752,29 +763,31 @@ struct nvmet_fc_target_port { * successfully, the LLDD is to update the nvmefc_tgt_fcp_req * transferred_length field and may subsequently transmit the * FCP_RSP iu payload (described by rspbuf, rspdma, rsplen). - * The LLDD is to await FCP_CONF reception to confirm the RSP - * reception by the host. The LLDD may retramsit the FCP_RSP iu - * if necessary per FC-NVME. Upon reception of FCP_CONF, or upon - * FCP_CONF failure, the LLDD is to set the nvmefc_tgt_fcp_req - * fcp_error field and consider the operation complete.. + * If FCP_CONF is supported, the LLDD is to await FCP_CONF + * reception to confirm the RSP reception by the host. The LLDD + * may retransmit the FCP_RSP iu if necessary per FC-NVME. Upon + * transmission of the FCP_RSP iu if FCP_CONF is not supported, + * or upon success/failure of FCP_CONF if it is supported, the + * LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and + * consider the operation complete. * NVMET_FCOP_RSP: the LLDD is to transmit the FCP_RSP iu payload - * (described by rspbuf, rspdma, rsplen). The LLDD is to await - * FCP_CONF reception to confirm the RSP reception by the host. - * The LLDD may retramsit the FCP_RSP iu if necessary per FC-NVME. - * Upon reception of FCP_CONF, or upon FCP_CONF failure, the + * (described by rspbuf, rspdma, rsplen). If FCP_CONF is + * supported, the LLDD is to await FCP_CONF reception to confirm + * the RSP reception by the host. The LLDD may retransmit the + * FCP_RSP iu if FCP_CONF is not received per FC-NVME. Upon + * transmission of the FCP_RSP iu if FCP_CONF is not supported, + * or upon success/failure of FCP_CONF if it is supported, the * LLDD is to set the nvmefc_tgt_fcp_req fcp_error field and - * consider the operation complete.. 
- NVMET_FCOP_ABORT: the LLDD is to terminate the exchange - * corresponding to the fcp operation. The LLDD shall send - * ABTS and follow FC exchange abort-multi rules, including - * ABTS retries and possible logout. + * consider the operation complete. * Upon completing the indicated operation, the LLDD is to set the * status fields for the operation (tranferred_length and fcp_error - * status) in the request, then all the "done" routine - * indicated in the fcp request. Upon return from the "done" - * routine for either a NVMET_FCOP_RSP or NVMET_FCOP_ABORT operation - * the fc-nvme layer will not longer reference the fcp request, - * allowing the LLDD to free/release the fcp request. + * status) in the request, then call the "done" routine + * indicated in the fcp request. After the operation completes, + * regardless of whether the FCP_RSP iu was successfully transmitted, + * the LLDD-supplied exchange structure must remain valid until the + * transport calls the fcp_req_release() callback to return ownership + * of the exchange structure back to the LLDD so that it may be used + * for another fcp command. * Note: when calling the done routine for READDATA or WRITEDATA * operations, the fc-nvme layer may immediate convert, in the same * thread and before returning to the LLDD, the fcp operation to @@ -786,6 +799,22 @@ struct nvmet_fc_target_port { * Returns 0 on success, - on failure (Ex: -EIO) * Entrypoint is Mandatory. * + * @fcp_abort: Called by the transport to abort an active command. + * The command may be in-between operations (nothing active in LLDD) + * or may have an active WRITEDATA operation pending. The LLDD is to + * initiate the ABTS process for the command and return from the + * callback. The ABTS does not need to be complete on the command. + * The fcp_abort callback inherently cannot fail. After the + * fcp_abort() callback completes, the transport will wait for any + * outstanding operation (if there was one) to complete, then will + * call the fcp_req_release() callback to return the command's + * exchange context back to the LLDD so that it may be used + * for another fcp command. + * + * @fcp_req_release: Called by the transport to return a nvmefc_tgt_fcp_req + * to the LLDD after all operations on the fcp operation are complete. + * This may be due to the command completing or upon completion of + * abort cleanup. + * + * @max_hw_queues: indicates the maximum number of hw queues the LLDD * supports for cpu affinitization. * Value is Mandatory. Must be at least 1. 
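With NVMET_FCOP_ABORT removed from the op set, aborts now flow through the two new mandatory entrypoints described above, and exchange ownership only returns to the LLDD via fcp_req_release(). A skeleton of the shape an LLDD implementation might take (hypothetical foo_ driver; the actual ABTS machinery is HBA-specific):

#include <linux/nvme-fc-driver.h>

static void foo_fcp_abort(struct nvmet_fc_target_port *tgtport,
			  struct nvmefc_tgt_fcp_req *fcpreq)
{
	/* Start ABTS for the exchange and return; this callback cannot
	 * fail and need not wait for the abort to finish. */
}

static void foo_fcp_req_release(struct nvmet_fc_target_port *tgtport,
				struct nvmefc_tgt_fcp_req *fcpreq)
{
	/* The transport is done with the exchange context; it may now
	 * be recycled for another FCP command. */
}

static struct nvmet_fc_target_template foo_fc_tgt_template = {
	/* .xmt_ls_rsp, .fcp_op, queue/sgl limits, etc. omitted */
	.fcp_abort	 = foo_fcp_abort,
	.fcp_req_release = foo_fcp_req_release,
};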
@@ -820,7 +849,11 @@ struct nvmet_fc_target_template { int (*xmt_ls_rsp)(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_ls_req *tls_req); int (*fcp_op)(struct nvmet_fc_target_port *tgtport, - struct nvmefc_tgt_fcp_req *); + struct nvmefc_tgt_fcp_req *fcpreq); + void (*fcp_abort)(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *fcpreq); + void (*fcp_req_release)(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *fcpreq); u32 max_hw_queues; u16 max_sgl_segments; @@ -848,4 +881,7 @@ int nvmet_fc_rcv_fcp_req(struct nvmet_fc_target_port *tgtport, struct nvmefc_tgt_fcp_req *fcpreq, void *cmdiubuf, u32 cmdiubuf_len); +void nvmet_fc_rcv_fcp_abort(struct nvmet_fc_target_port *tgtport, + struct nvmefc_tgt_fcp_req *fcpreq); + #endif /* _NVME_FC_DRIVER_H */ diff --git a/include/linux/nvme-fc.h b/include/linux/nvme-fc.h index 4b45226bd604c5630fe73015e3a9876ced1cef2e..e997c4a49a8884e3b1167a830e7fdf0431365a3d 100644 --- a/include/linux/nvme-fc.h +++ b/include/linux/nvme-fc.h @@ -16,8 +16,7 @@ */ /* - * This file contains definitions relative to FC-NVME r1.11 and a few - * newer items + * This file contains definitions relative to FC-NVME r1.14 (16-020vB). */ #ifndef _NVME_FC_H @@ -47,8 +46,15 @@ struct nvme_fc_cmd_iu { #define NVME_FC_SIZEOF_ZEROS_RSP 12 +enum { + FCNVME_SC_SUCCESS = 0, + FCNVME_SC_INVALID_FIELD = 1, + FCNVME_SC_INVALID_CONNID = 2, +}; + struct nvme_fc_ersp_iu { - __u8 rsvd0[2]; + __u8 status_code; + __u8 rsvd1; __be16 iu_len; __be32 rsn; __be32 xfrd_len; @@ -58,7 +64,7 @@ struct nvme_fc_ersp_iu { }; -/* FC-NVME r1.03/16-119v0 NVME Link Services */ +/* FC-NVME Link Services */ enum { FCNVME_LS_RSVD = 0, FCNVME_LS_RJT = 1, @@ -68,7 +74,7 @@ enum { FCNVME_LS_DISCONNECT = 5, }; -/* FC-NVME r1.03/16-119v0 NVME Link Service Descriptors */ +/* FC-NVME Link Service Descriptors */ enum { FCNVME_LSDESC_RSVD = 0x0, FCNVME_LSDESC_RQST = 0x1, @@ -92,7 +98,6 @@ static inline __be32 fcnvme_lsdesc_len(size_t sz) return cpu_to_be32(sz - (2 * sizeof(u32))); } - struct fcnvme_ls_rqst_w0 { u8 ls_cmd; /* FCNVME_LS_xxx */ u8 zeros[3]; @@ -106,8 +111,53 @@ struct fcnvme_lsdesc_rqst { __be32 rsvd12; }; +/* FC-NVME LS RJT reason_code values */ +enum fcnvme_ls_rjt_reason { + FCNVME_RJT_RC_NONE = 0, + /* no reason - not to be sent */ + + FCNVME_RJT_RC_INVAL = 0x01, + /* invalid NVMe_LS command code */ + + FCNVME_RJT_RC_LOGIC = 0x03, + /* logical error */ + + FCNVME_RJT_RC_UNAB = 0x09, + /* unable to perform command request */ + + FCNVME_RJT_RC_UNSUP = 0x0b, + /* command not supported */ + + FCNVME_RJT_RC_INPROG = 0x0e, + /* command already in progress */ + FCNVME_RJT_RC_INV_ASSOC = 0x40, + /* Invalid Association ID*/ + FCNVME_RJT_RC_INV_CONN = 0x41, + /* Invalid Connection ID*/ + + FCNVME_RJT_RC_VENDOR = 0xff, + /* vendor specific error */ +}; + +/* FC-NVME LS RJT reason_explanation values */ +enum fcnvme_ls_rjt_explan { + FCNVME_RJT_EXP_NONE = 0x00, + /* No additional explanation */ + + FCNVME_RJT_EXP_OXID_RXID = 0x17, + /* invalid OX_ID-RX_ID combination */ + + FCNVME_RJT_EXP_INSUF_RES = 0x29, + /* insufficient resources */ + + FCNVME_RJT_EXP_UNAB_DATA = 0x2a, + /* unable to supply requested data */ + + FCNVME_RJT_EXP_INV_LEN = 0x2d, + /* Invalid payload length */ +}; /* FCNVME_LSDESC_RJT */ struct fcnvme_lsdesc_rjt { @@ -119,15 +169,15 @@ struct fcnvme_lsdesc_rjt { * Reject reason and explanaction codes are generic * to ELs's from LS-3. 
*/ - u8 reason_code; - u8 reason_explanation; + u8 reason_code; /* fcnvme_ls_rjt_reason */ + u8 reason_explanation; /* fcnvme_ls_rjt_explan */ u8 vendor; __be32 rsvd12; }; -#define FCNVME_ASSOC_HOSTID_LEN 64 +#define FCNVME_ASSOC_HOSTID_LEN 16 #define FCNVME_ASSOC_HOSTNQN_LEN 256 #define FCNVME_ASSOC_SUBNQN_LEN 256 diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 9061780b141ff44c58a0ab5d5429dfbba8f1d2fb..b625bacf37efaabd84e8735b2b304401cdb195fd 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -245,6 +245,7 @@ enum { NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3, NVME_CTRL_VWC_PRESENT = 1 << 0, NVME_CTRL_OACS_SEC_SUPP = 1 << 0, + NVME_CTRL_OACS_DBBUF_SUPP = 1 << 7, }; struct nvme_lbaf { @@ -603,6 +604,7 @@ enum nvme_admin_opcode { nvme_admin_download_fw = 0x11, nvme_admin_ns_attach = 0x15, nvme_admin_keep_alive = 0x18, + nvme_admin_dbbuf = 0x7C, nvme_admin_format_nvm = 0x80, nvme_admin_security_send = 0x81, nvme_admin_security_recv = 0x82, @@ -874,6 +876,16 @@ struct nvmf_property_get_command { __u8 resv4[16]; }; +struct nvme_dbbuf { + __u8 opcode; + __u8 flags; + __u16 command_id; + __u32 rsvd1[5]; + __le64 prp1; + __le64 prp2; + __u32 rsvd12[6]; +}; + struct nvme_command { union { struct nvme_common_command common; @@ -893,6 +905,7 @@ struct nvme_command { struct nvmf_connect_command connect; struct nvmf_property_set_command prop_set; struct nvmf_property_get_command prop_get; + struct nvme_dbbuf dbbuf; }; }; diff --git a/include/linux/of.h b/include/linux/of.h index 21e6323de0f3b7868a2c3a44471f92cdc6cba176..50fcdb54087fa02dcacc10820077771981f7c445 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -159,6 +159,8 @@ static inline struct device_node *to_of_node(struct fwnode_handle *fwnode) container_of(fwnode, struct device_node, fwnode) : NULL; } +#define of_fwnode_handle(node) (&(node)->fwnode) + static inline bool of_have_populated_dt(void) { return of_root != NULL; @@ -292,6 +294,9 @@ extern int of_property_count_elems_of_size(const struct device_node *np, extern int of_property_read_u32_index(const struct device_node *np, const char *propname, u32 index, u32 *out_value); +extern int of_property_read_u64_index(const struct device_node *np, + const char *propname, + u32 index, u64 *out_value); extern int of_property_read_variable_u8_array(const struct device_node *np, const char *propname, u8 *out_values, size_t sz_min, size_t sz_max); @@ -602,6 +607,8 @@ static inline struct device_node *of_find_node_with_property( return NULL; } +#define of_fwnode_handle(node) NULL + static inline bool of_have_populated_dt(void) { return false; diff --git a/include/linux/of_device.h b/include/linux/of_device.h index c12dace043f3c840a8bb4d4540e06f897eeca8cb..b4ad8b4f85065dae43ab10be81ac27851b4802e7 100644 --- a/include/linux/of_device.h +++ b/include/linux/of_device.h @@ -34,8 +34,7 @@ extern void of_device_unregister(struct platform_device *ofdev); extern const void *of_device_get_match_data(const struct device *dev); -extern ssize_t of_device_get_modalias(struct device *dev, - char *str, ssize_t len); +extern ssize_t of_device_modalias(struct device *dev, char *str, ssize_t len); extern int of_device_request_module(struct device *dev); extern void of_device_uevent(struct device *dev, struct kobj_uevent_env *env); @@ -55,7 +54,8 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) return of_node_get(cpu_dev->of_node); } -void of_dma_configure(struct device *dev, struct device_node *np); +int of_dma_configure(struct device *dev, struct device_node *np); 
+void of_dma_deconfigure(struct device *dev); #else /* CONFIG_OF */ static inline int of_driver_match_device(struct device *dev, @@ -72,8 +72,8 @@ static inline const void *of_device_get_match_data(const struct device *dev) return NULL; } -static inline int of_device_get_modalias(struct device *dev, - char *str, ssize_t len) +static inline int of_device_modalias(struct device *dev, + char *str, ssize_t len) { return -ENODEV; } @@ -103,7 +103,12 @@ static inline struct device_node *of_cpu_device_node_get(int cpu) { return NULL; } -static inline void of_dma_configure(struct device *dev, struct device_node *np) + +static inline int of_dma_configure(struct device *dev, struct device_node *np) +{ + return 0; +} +static inline void of_dma_deconfigure(struct device *dev) {} #endif /* CONFIG_OF */ diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 271b3fdf00709e2752115e0d371cfa397791eec4..1dfbfd0d8040cb56820e43f51fc9bf58b7d117e8 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -54,6 +54,11 @@ extern char __dtb_end[]; extern int of_scan_flat_dt(int (*it)(unsigned long node, const char *uname, int depth, void *data), void *data); +extern int of_scan_flat_dt_subnodes(unsigned long node, + int (*it)(unsigned long node, + const char *uname, + void *data), + void *data); extern int of_get_flat_dt_subnode_by_name(unsigned long node, const char *uname); extern const void *of_get_flat_dt_prop(unsigned long node, const char *name, @@ -62,6 +67,7 @@ extern int of_flat_dt_is_compatible(unsigned long node, const char *name); extern int of_flat_dt_match(unsigned long node, const char *const *matches); extern unsigned long of_get_flat_dt_root(void); extern int of_get_flat_dt_size(void); +extern uint32_t of_get_flat_dt_phandle(unsigned long node); extern int early_init_dt_scan_chosen(unsigned long node, const char *uname, int depth, void *data); diff --git a/include/linux/of_gpio.h b/include/linux/of_gpio.h index 3f87ea5b8bee24d0e5cfa78517cfaf338a2bf051..1e089d5a182beb076d9021e41ab9672820dcfb82 100644 --- a/include/linux/of_gpio.h +++ b/include/linux/of_gpio.h @@ -30,6 +30,7 @@ struct device_node; enum of_gpio_flags { OF_GPIO_ACTIVE_LOW = 0x1, OF_GPIO_SINGLE_ENDED = 0x2, + OF_GPIO_OPEN_DRAIN = 0x4, }; #ifdef CONFIG_OF_GPIO diff --git a/include/linux/of_irq.h b/include/linux/of_irq.h index 1e0deb8e849458ca179c1df4734bb24f0e4cd748..ec6b11deb77357444757691ec87d71d94b1b8953 100644 --- a/include/linux/of_irq.h +++ b/include/linux/of_irq.h @@ -8,7 +8,7 @@ #include #include -typedef int (*of_irq_init_cb_t)(struct device_node *, struct device_node *); +typedef int const (*of_irq_init_cb_t)(struct device_node *, struct device_node *); /* * Workarounds only applied to 32bit powermac machines diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index a58cca8bcb29d8c9bdcda71538cfa8f25c913916..ba35ba5204871a69b5d5522f170432d24aa32e96 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -12,7 +12,7 @@ #include #include -#ifdef CONFIG_OF +#if IS_ENABLED(CONFIG_OF_MDIO) extern int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np); extern struct phy_device *of_phy_find_device(struct device_node *phy_np); extern struct phy_device *of_phy_connect(struct net_device *dev, @@ -32,7 +32,7 @@ extern int of_phy_register_fixed_link(struct device_node *np); extern void of_phy_deregister_fixed_link(struct device_node *np); extern bool of_phy_is_fixed_link(struct device_node *np); -#else /* CONFIG_OF */ +#else /* CONFIG_OF_MDIO */ static inline int 
of_mdiobus_register(struct mii_bus *mdio, struct device_node *np) { /* diff --git a/include/linux/of_pci.h b/include/linux/of_pci.h index 0e0974eceb803b64cee8f8f67a7bde55d58f8f4a..518c8d20647a560bf2f679b62b80364b040ba9ae 100644 --- a/include/linux/of_pci.h +++ b/include/linux/of_pci.h @@ -85,15 +85,4 @@ static inline int of_pci_get_host_bridge_resources(struct device_node *dev, } #endif -#if defined(CONFIG_OF) && defined(CONFIG_PCI_MSI) -int of_pci_msi_chip_add(struct msi_controller *chip); -void of_pci_msi_chip_remove(struct msi_controller *chip); -struct msi_controller *of_pci_find_msi_chip_by_node(struct device_node *of_node); -#else -static inline int of_pci_msi_chip_add(struct msi_controller *chip) { return -EINVAL; } -static inline void of_pci_msi_chip_remove(struct msi_controller *chip) { } -static inline struct msi_controller * -of_pci_find_msi_chip_by_node(struct device_node *of_node) { return NULL; } -#endif - #endif diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 047d64706f2a298157cf0d54305aa8e49f6f7837..d4cd2014fa6f5a18f03f1b0babfccb21fa3ac55f 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -33,10 +33,7 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, - int migratetype); -int move_freepages(struct zone *zone, - struct page *start_page, struct page *end_page, - int migratetype); + int migratetype, int *num_movable); /* * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 84943e8057ef85a646a8f43155805fd8fa2273d8..316a19f6b635db760570a7e7b66af2a03b697e5e 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -148,7 +148,7 @@ static inline int page_cache_get_speculative(struct page *page) #ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic()); + VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif /* * Preempt must be disabled here - we rely on rcu_read_lock doing @@ -186,7 +186,7 @@ static inline int page_cache_add_speculative(struct page *page, int count) #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) # ifdef CONFIG_PREEMPT_COUNT - VM_BUG_ON(!in_atomic()); + VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif VM_BUG_ON_PAGE(page_count(page) == 0, page); page_ref_add(page, count); diff --git a/include/linux/pci-ecam.h b/include/linux/pci-ecam.h index f0d2b94512701ccdb6c77d5b37c7ff4f49ea1d3e..809c2f1873ac2a04c6ebb262407196302d00ea46 100644 --- a/include/linux/pci-ecam.h +++ b/include/linux/pci-ecam.h @@ -16,6 +16,7 @@ #ifndef DRIVERS_PCI_ECAM_H #define DRIVERS_PCI_ECAM_H +#include #include #include @@ -68,7 +69,7 @@ extern struct pci_ecam_ops xgene_v1_pcie_ecam_ops; /* APM X-Gene PCIe v1 */ extern struct pci_ecam_ops xgene_v2_pcie_ecam_ops; /* APM X-Gene PCIe v2.x */ #endif -#ifdef CONFIG_PCI_HOST_GENERIC +#ifdef CONFIG_PCI_HOST_COMMON /* for DT-based PCI controllers that support ECAM */ int pci_host_common_probe(struct platform_device *pdev, struct pci_ecam_ops *ops); diff --git a/include/linux/pci-ep-cfs.h b/include/linux/pci-ep-cfs.h new file mode 100644 index 0000000000000000000000000000000000000000..263b89ea5705028c663c7792fd49380b17323386 --- /dev/null +++ b/include/linux/pci-ep-cfs.h @@ -0,0 +1,41 @@ +/** + * PCI Endpoint ConfigFS header file + * + * Copyright (C) 2017 Texas Instruments + * 
Author: Kishon Vijay Abraham I + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 of + * the License as published by the Free Software Foundation. + */ + +#ifndef __LINUX_PCI_EP_CFS_H +#define __LINUX_PCI_EP_CFS_H + +#include + +#ifdef CONFIG_PCI_ENDPOINT_CONFIGFS +struct config_group *pci_ep_cfs_add_epc_group(const char *name); +void pci_ep_cfs_remove_epc_group(struct config_group *group); +struct config_group *pci_ep_cfs_add_epf_group(const char *name); +void pci_ep_cfs_remove_epf_group(struct config_group *group); +#else +static inline struct config_group *pci_ep_cfs_add_epc_group(const char *name) +{ + return 0; +} + +static inline void pci_ep_cfs_remove_epc_group(struct config_group *group) +{ +} + +static inline struct config_group *pci_ep_cfs_add_epf_group(const char *name) +{ + return 0; +} + +static inline void pci_ep_cfs_remove_epf_group(struct config_group *group) +{ +} +#endif +#endif /* __LINUX_PCI_EP_CFS_H */ diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h new file mode 100644 index 0000000000000000000000000000000000000000..af5edbf3eea3c2d1767bfaf660634ec63729b54f --- /dev/null +++ b/include/linux/pci-epc.h @@ -0,0 +1,144 @@ +/** + * PCI Endpoint *Controller* (EPC) header file + * + * Copyright (C) 2017 Texas Instruments + * Author: Kishon Vijay Abraham I + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 of + * the License as published by the Free Software Foundation. + */ + +#ifndef __LINUX_PCI_EPC_H +#define __LINUX_PCI_EPC_H + +#include + +struct pci_epc; + +enum pci_epc_irq_type { + PCI_EPC_IRQ_UNKNOWN, + PCI_EPC_IRQ_LEGACY, + PCI_EPC_IRQ_MSI, +}; + +/** + * struct pci_epc_ops - set of function pointers for performing EPC operations + * @write_header: ops to populate configuration space header + * @set_bar: ops to configure the BAR + * @clear_bar: ops to reset the BAR + * @map_addr: ops to map CPU address to PCI address + * @unmap_addr: ops to unmap CPU address and PCI address + * @set_msi: ops to set the requested number of MSI interrupts in the MSI + * capability register + * @get_msi: ops to get the number of MSI interrupts allocated by the RC from + * the MSI capability register + * @raise_irq: ops to raise a legacy or MSI interrupt + * @start: ops to start the PCI link + * @stop: ops to stop the PCI link + * @owner: the module owner containing the ops + */ +struct pci_epc_ops { + int (*write_header)(struct pci_epc *pci_epc, + struct pci_epf_header *hdr); + int (*set_bar)(struct pci_epc *epc, enum pci_barno bar, + dma_addr_t bar_phys, size_t size, int flags); + void (*clear_bar)(struct pci_epc *epc, enum pci_barno bar); + int (*map_addr)(struct pci_epc *epc, phys_addr_t addr, + u64 pci_addr, size_t size); + void (*unmap_addr)(struct pci_epc *epc, phys_addr_t addr); + int (*set_msi)(struct pci_epc *epc, u8 interrupts); + int (*get_msi)(struct pci_epc *epc); + int (*raise_irq)(struct pci_epc *pci_epc, + enum pci_epc_irq_type type, u8 interrupt_num); + int (*start)(struct pci_epc *epc); + void (*stop)(struct pci_epc *epc); + struct module *owner; +}; + +/** + * struct pci_epc_mem - address space of the endpoint controller + * @phys_base: physical base address of the PCI address space + * @size: the size of the PCI address space + * @bitmap: bitmap to manage the PCI address space + * @pages: number of bits representing the address region + */ +struct pci_epc_mem { + 
phys_addr_t phys_base; + size_t size; + unsigned long *bitmap; + int pages; +}; + +/** + * struct pci_epc - represents the PCI EPC device + * @dev: PCI EPC device + * @pci_epf: list of endpoint functions present in this EPC device + * @ops: function pointers for performing endpoint operations + * @mem: address space of the endpoint controller + * @max_functions: max number of functions that can be configured in this EPC + * @group: configfs group representing the PCI EPC device + * @lock: spinlock to protect pci_epc ops + */ +struct pci_epc { + struct device dev; + struct list_head pci_epf; + const struct pci_epc_ops *ops; + struct pci_epc_mem *mem; + u8 max_functions; + struct config_group *group; + /* spinlock to protect against concurrent access of EP controller */ + spinlock_t lock; +}; + +#define to_pci_epc(device) container_of((device), struct pci_epc, dev) + +#define pci_epc_create(dev, ops) \ + __pci_epc_create((dev), (ops), THIS_MODULE) +#define devm_pci_epc_create(dev, ops) \ + __devm_pci_epc_create((dev), (ops), THIS_MODULE) + +static inline void epc_set_drvdata(struct pci_epc *epc, void *data) +{ + dev_set_drvdata(&epc->dev, data); +} + +static inline void *epc_get_drvdata(struct pci_epc *epc) +{ + return dev_get_drvdata(&epc->dev); +} + +struct pci_epc * +__devm_pci_epc_create(struct device *dev, const struct pci_epc_ops *ops, + struct module *owner); +struct pci_epc * +__pci_epc_create(struct device *dev, const struct pci_epc_ops *ops, + struct module *owner); +void devm_pci_epc_destroy(struct device *dev, struct pci_epc *epc); +void pci_epc_destroy(struct pci_epc *epc); +int pci_epc_add_epf(struct pci_epc *epc, struct pci_epf *epf); +void pci_epc_linkup(struct pci_epc *epc); +void pci_epc_remove_epf(struct pci_epc *epc, struct pci_epf *epf); +int pci_epc_write_header(struct pci_epc *epc, struct pci_epf_header *hdr); +int pci_epc_set_bar(struct pci_epc *epc, enum pci_barno bar, + dma_addr_t bar_phys, size_t size, int flags); +void pci_epc_clear_bar(struct pci_epc *epc, int bar); +int pci_epc_map_addr(struct pci_epc *epc, phys_addr_t phys_addr, + u64 pci_addr, size_t size); +void pci_epc_unmap_addr(struct pci_epc *epc, phys_addr_t phys_addr); +int pci_epc_set_msi(struct pci_epc *epc, u8 interrupts); +int pci_epc_get_msi(struct pci_epc *epc); +int pci_epc_raise_irq(struct pci_epc *epc, enum pci_epc_irq_type type, + u8 interrupt_num); +int pci_epc_start(struct pci_epc *epc); +void pci_epc_stop(struct pci_epc *epc); +struct pci_epc *pci_epc_get(const char *epc_name); +void pci_epc_put(struct pci_epc *epc); + +int pci_epc_mem_init(struct pci_epc *epc, phys_addr_t phys_addr, size_t size); +void pci_epc_mem_exit(struct pci_epc *epc); +void __iomem *pci_epc_mem_alloc_addr(struct pci_epc *epc, + phys_addr_t *phys_addr, size_t size); +void pci_epc_mem_free_addr(struct pci_epc *epc, phys_addr_t phys_addr, + void __iomem *virt_addr, size_t size); +#endif /* __LINUX_PCI_EPC_H */ diff --git a/include/linux/pci-epf.h b/include/linux/pci-epf.h new file mode 100644 index 0000000000000000000000000000000000000000..0d529cb9014398df67893a72efb7f2f171111345 --- /dev/null +++ b/include/linux/pci-epf.h @@ -0,0 +1,162 @@ +/** + * PCI Endpoint *Function* (EPF) header file + * + * Copyright (C) 2017 Texas Instruments + * Author: Kishon Vijay Abraham I + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 of + * the License as published by the Free Software Foundation. 
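The controller-side (EPC) API above pairs with the endpoint-function (EPF) declarations in the new pci-epf.h that follows. A hedged sketch of a minimal EPF driver against those declarations; the example_* names and the ID values are purely illustrative:

#include <linux/module.h>
#include <linux/pci-epc.h>
#include <linux/pci-epf.h>

static struct pci_epf_header example_header = {
	.vendorid	= 0x104c,	/* illustrative IDs only */
	.deviceid	= 0xb500,
	.baseclass_code	= 0xff,
};

static int example_epf_bind(struct pci_epf *epf)
{
	epf->header = &example_header;

	/* program the standard config-space header via the bound controller */
	return pci_epc_write_header(epf->epc, epf->header);
}

static void example_epf_unbind(struct pci_epf *epf)
{
}

static struct pci_epf_ops example_epf_ops = {
	.bind	= example_epf_bind,
	.unbind	= example_epf_unbind,
};

static struct pci_epf_driver example_epf_driver = {
	.driver	= {
		.name	= "pci_epf_example",
	},
	.ops	= &example_epf_ops,
};

static int __init example_epf_init(void)
{
	return pci_epf_register_driver(&example_epf_driver);
}
module_init(example_epf_init);

static void __exit example_epf_exit(void)
{
	pci_epf_unregister_driver(&example_epf_driver);
}
module_exit(example_epf_exit);

The pci_epf_register_driver() wrapper declared later in this file's diff supplies THIS_MODULE for ownership, so the sketch does not set .owner by hand.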
+ */ + +#ifndef __LINUX_PCI_EPF_H +#define __LINUX_PCI_EPF_H + +#include +#include + +struct pci_epf; + +enum pci_interrupt_pin { + PCI_INTERRUPT_UNKNOWN, + PCI_INTERRUPT_INTA, + PCI_INTERRUPT_INTB, + PCI_INTERRUPT_INTC, + PCI_INTERRUPT_INTD, +}; + +enum pci_barno { + BAR_0, + BAR_1, + BAR_2, + BAR_3, + BAR_4, + BAR_5, +}; + +/** + * struct pci_epf_header - represents standard configuration header + * @vendorid: identifies device manufacturer + * @deviceid: identifies a particular device + * @revid: specifies a device-specific revision identifier + * @progif_code: identifies a specific register-level programming interface + * @subclass_code: identifies more specifically the function of the device + * @baseclass_code: broadly classifies the type of function the device performs + * @cache_line_size: specifies the system cacheline size in units of DWORDs + * @subsys_vendor_id: vendor of the add-in card or subsystem + * @subsys_id: id specific to vendor + * @interrupt_pin: interrupt pin the device (or device function) uses + */ +struct pci_epf_header { + u16 vendorid; + u16 deviceid; + u8 revid; + u8 progif_code; + u8 subclass_code; + u8 baseclass_code; + u8 cache_line_size; + u16 subsys_vendor_id; + u16 subsys_id; + enum pci_interrupt_pin interrupt_pin; +}; + +/** + * struct pci_epf_ops - set of function pointers for performing EPF operations + * @bind: ops to perform when an EPC device has been bound to an EPF device + * @unbind: ops to perform when a binding has been lost between an EPC device + * and an EPF device + * @linkup: ops to perform when the EPC device has established a connection with + * a host system + */ +struct pci_epf_ops { + int (*bind)(struct pci_epf *epf); + void (*unbind)(struct pci_epf *epf); + void (*linkup)(struct pci_epf *epf); +}; + +/** + * struct pci_epf_driver - represents the PCI EPF driver + * @probe: ops to perform when a new EPF device has been bound to the EPF driver + * @remove: ops to perform when the binding between the EPF device and EPF + * driver is broken + * @driver: PCI EPF driver + * @ops: set of function pointers for performing EPF operations + * @owner: the owner of the module that registers the PCI EPF driver + * @group: configfs group corresponding to the PCI EPF driver + * @id_table: identifies EPF devices for probing + */ +struct pci_epf_driver { + int (*probe)(struct pci_epf *epf); + int (*remove)(struct pci_epf *epf); + + struct device_driver driver; + struct pci_epf_ops *ops; + struct module *owner; + struct config_group *group; + const struct pci_epf_device_id *id_table; +}; + +#define to_pci_epf_driver(drv) (container_of((drv), struct pci_epf_driver, \ + driver)) + +/** + * struct pci_epf_bar - represents the BAR of EPF device + * @phys_addr: physical address that should be mapped to the BAR + * @size: the size of the address space present in BAR + */ +struct pci_epf_bar { + dma_addr_t phys_addr; + size_t size; +}; + +/** + * struct pci_epf - represents the PCI EPF device + * @dev: the PCI EPF device + * @name: the name of the PCI EPF device + * @header: represents standard configuration header + * @bar: represents the BAR of EPF device + * @msi_interrupts: number of MSI interrupts required by this function + * @func_no: unique function number within this endpoint device + * @epc: the EPC device to which this EPF device is bound + * @driver: the EPF driver to which this EPF device is bound + * @list: to add pci_epf as a list of PCI endpoint functions to pci_epc + */ +struct pci_epf { + struct device dev; + const char *name; + struct
pci_epf_header *header; + struct pci_epf_bar bar[6]; + u8 msi_interrupts; + u8 func_no; + + struct pci_epc *epc; + struct pci_epf_driver *driver; + struct list_head list; +}; + +#define to_pci_epf(epf_dev) container_of((epf_dev), struct pci_epf, dev) + +#define pci_epf_register_driver(driver) \ + __pci_epf_register_driver((driver), THIS_MODULE) + +static inline void epf_set_drvdata(struct pci_epf *epf, void *data) +{ + dev_set_drvdata(&epf->dev, data); +} + +static inline void *epf_get_drvdata(struct pci_epf *epf) +{ + return dev_get_drvdata(&epf->dev); +} + +struct pci_epf *pci_epf_create(const char *name); +void pci_epf_destroy(struct pci_epf *epf); +int __pci_epf_register_driver(struct pci_epf_driver *driver, + struct module *owner); +void pci_epf_unregister_driver(struct pci_epf_driver *driver); +void *pci_epf_alloc_space(struct pci_epf *epf, size_t size, enum pci_barno bar); +void pci_epf_free_space(struct pci_epf *epf, void *addr, enum pci_barno bar); +int pci_epf_bind(struct pci_epf *epf); +void pci_epf_unbind(struct pci_epf *epf); +void pci_epf_linkup(struct pci_epf *epf); +#endif /* __LINUX_PCI_EPF_H */ diff --git a/include/linux/pci.h b/include/linux/pci.h index 5948cfdc984e2f849507fa1d0fb8f2bed3c01cfa..33c2b0b77429d09aaa31cb9735458db5762e08a7 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -178,6 +179,10 @@ enum pci_dev_flags { PCI_DEV_FLAGS_NO_PM_RESET = (__force pci_dev_flags_t) (1 << 7), /* Get VPD from function 0 VPD */ PCI_DEV_FLAGS_VPD_REF_F0 = (__force pci_dev_flags_t) (1 << 8), + /* a non-root bridge where translation occurs, stop alias search here */ + PCI_DEV_FLAGS_BRIDGE_XLATE_ROOT = (__force pci_dev_flags_t) (1 << 9), + /* Do not use FLR even if device advertises PCI_AF_CAP */ + PCI_DEV_FLAGS_NO_FLR_RESET = (__force pci_dev_flags_t) (1 << 10), }; enum pci_irq_reroute_variant { @@ -397,6 +402,8 @@ struct pci_dev { phys_addr_t rom; /* Physical address of ROM if it's not from the BAR */ size_t romlen; /* Length of ROM if it's not from the BAR */ char *driver_override; /* Driver name to force a match */ + + unsigned long priv_flags; /* Private flags for the pci driver */ }; static inline struct pci_dev *pci_physfn(struct pci_dev *dev) @@ -941,32 +948,12 @@ int pci_generic_config_write32(struct pci_bus *bus, unsigned int devfn, struct pci_ops *pci_bus_set_ops(struct pci_bus *bus, struct pci_ops *ops); -static inline int pci_read_config_byte(const struct pci_dev *dev, int where, u8 *val) -{ - return pci_bus_read_config_byte(dev->bus, dev->devfn, where, val); -} -static inline int pci_read_config_word(const struct pci_dev *dev, int where, u16 *val) -{ - return pci_bus_read_config_word(dev->bus, dev->devfn, where, val); -} -static inline int pci_read_config_dword(const struct pci_dev *dev, int where, - u32 *val) -{ - return pci_bus_read_config_dword(dev->bus, dev->devfn, where, val); -} -static inline int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val) -{ - return pci_bus_write_config_byte(dev->bus, dev->devfn, where, val); -} -static inline int pci_write_config_word(const struct pci_dev *dev, int where, u16 val) -{ - return pci_bus_write_config_word(dev->bus, dev->devfn, where, val); -} -static inline int pci_write_config_dword(const struct pci_dev *dev, int where, - u32 val) -{ - return pci_bus_write_config_dword(dev->bus, dev->devfn, where, val); -} +int pci_read_config_byte(const struct pci_dev *dev, int where, u8 *val); +int pci_read_config_word(const 
struct pci_dev *dev, int where, u16 *val); +int pci_read_config_dword(const struct pci_dev *dev, int where, u32 *val); +int pci_write_config_byte(const struct pci_dev *dev, int where, u8 val); +int pci_write_config_word(const struct pci_dev *dev, int where, u16 val); +int pci_write_config_dword(const struct pci_dev *dev, int where, u32 val); int pcie_capability_read_word(struct pci_dev *dev, int pos, u16 *val); int pcie_capability_read_dword(struct pci_dev *dev, int pos, u32 *val); @@ -1053,6 +1040,7 @@ int pcie_get_mps(struct pci_dev *dev); int pcie_set_mps(struct pci_dev *dev, int mps); int pcie_get_minimum_link(struct pci_dev *dev, enum pci_bus_speed *speed, enum pcie_link_width *width); +void pcie_flr(struct pci_dev *dev); int __pci_reset_function(struct pci_dev *dev); int __pci_reset_function_locked(struct pci_dev *dev); int pci_reset_function(struct pci_dev *dev); @@ -1073,6 +1061,11 @@ int pci_select_bars(struct pci_dev *dev, unsigned long flags); bool pci_device_is_present(struct pci_dev *pdev); void pci_ignore_hotplug(struct pci_dev *dev); +int __printf(6, 7) pci_request_irq(struct pci_dev *dev, unsigned int nr, + irq_handler_t handler, irq_handler_t thread_fn, void *dev_id, + const char *fmt, ...); +void pci_free_irq(struct pci_dev *dev, unsigned int nr, void *dev_id); + /* ROM control related routines */ int pci_enable_rom(struct pci_dev *pdev); void pci_disable_rom(struct pci_dev *pdev); @@ -1200,6 +1193,11 @@ unsigned long pci_address_to_pio(phys_addr_t addr); phys_addr_t pci_pio_to_address(unsigned long pio); int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr); void pci_unmap_iospace(struct resource *res); +void __iomem *devm_pci_remap_cfgspace(struct device *dev, + resource_size_t offset, + resource_size_t size); +void __iomem *devm_pci_remap_cfg_resource(struct device *dev, + struct resource *res); static inline pci_bus_addr_t pci_bus_address(struct pci_dev *pdev, int bar) { @@ -1298,11 +1296,8 @@ struct msix_entry { #ifdef CONFIG_PCI_MSI int pci_msi_vec_count(struct pci_dev *dev); -void pci_msi_shutdown(struct pci_dev *dev); void pci_disable_msi(struct pci_dev *dev); int pci_msix_vec_count(struct pci_dev *dev); -int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec); -void pci_msix_shutdown(struct pci_dev *dev); void pci_disable_msix(struct pci_dev *dev); void pci_restore_msi_state(struct pci_dev *dev); int pci_msi_enabled(void); @@ -1328,13 +1323,8 @@ int pci_irq_get_node(struct pci_dev *pdev, int vec); #else static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; } -static inline void pci_msi_shutdown(struct pci_dev *dev) { } static inline void pci_disable_msi(struct pci_dev *dev) { } static inline int pci_msix_vec_count(struct pci_dev *dev) { return -ENOSYS; } -static inline int pci_enable_msix(struct pci_dev *dev, - struct msix_entry *entries, int nvec) -{ return -ENOSYS; } -static inline void pci_msix_shutdown(struct pci_dev *dev) { } static inline void pci_disable_msix(struct pci_dev *dev) { } static inline void pci_restore_msi_state(struct pci_dev *dev) { } static inline int pci_msi_enabled(void) { return 0; } @@ -1627,6 +1617,36 @@ static inline int pci_get_new_domain_nr(void) { return -ENOSYS; } #include +/* These two functions provide almost identical functionality. Depending + * on the architecture, one will be implemented as a wrapper around the + * other (in drivers/pci/mmap.c).
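Of the helpers added above, pci_request_irq()/pci_free_irq() give drivers a printf-style way to attach a handler to one of the vectors obtained through pci_alloc_irq_vectors(). A hedged sketch; the handler and the naming scheme are hypothetical:

#include <linux/interrupt.h>
#include <linux/pci.h>

static irqreturn_t example_queue_isr(int irq, void *data)
{
	/* data is the dev_id cookie handed to pci_request_irq() */
	return IRQ_HANDLED;
}

static int example_setup_queue_irq(struct pci_dev *pdev, unsigned int nr,
				   void *queue)
{
	return pci_request_irq(pdev, nr, example_queue_isr, NULL, queue,
			       "example-%s-q%u", pci_name(pdev), nr);
}

The matching release is pci_free_irq(pdev, nr, queue), with the same dev_id cookie.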
+ * + * pci_mmap_resource_range() maps a specific BAR, and vm->vm_pgoff + * is expected to be an offset within that region. + * + * pci_mmap_page_range() is the legacy architecture-specific interface, + * which accepts a "user visible" resource address converted by + * pci_resource_to_user(), as used in the legacy mmap() interface in + * /proc/bus/pci/. + */ +int pci_mmap_resource_range(struct pci_dev *dev, int bar, + struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine); +int pci_mmap_page_range(struct pci_dev *pdev, int bar, + struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine); + +#ifndef arch_can_pci_mmap_wc +#define arch_can_pci_mmap_wc() 0 +#endif + +#ifndef arch_can_pci_mmap_io +#define arch_can_pci_mmap_io() 0 +#define pci_iobar_pfn(pdev, bar, vma) (-EINVAL) +#else +int pci_iobar_pfn(struct pci_dev *pdev, int bar, struct vm_area_struct *vma); +#endif + #ifndef pci_root_bus_fwnode #define pci_root_bus_fwnode(bus) NULL #endif diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index a4f77feecbb00f84fac0295a2b2745928e9e0472..5f6b71d15393a4d6df153589cee33e42d42ebb26 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -862,6 +862,8 @@ #define PCI_DEVICE_ID_TI_X620 0xac8d #define PCI_DEVICE_ID_TI_X420 0xac8e #define PCI_DEVICE_ID_TI_XX20_FM 0xac8f +#define PCI_DEVICE_ID_TI_DRA74x 0xb500 +#define PCI_DEVICE_ID_TI_DRA72x 0xb501 #define PCI_VENDOR_ID_SONY 0x104d diff --git a/include/linux/pe.h b/include/linux/pe.h index e170b95e763bbdf78d04c36372a2c9b4e45c0e1a..143ce75be5f01549ac60f6a8e91e8e614d5f3fe1 100644 --- a/include/linux/pe.h +++ b/include/linux/pe.h @@ -23,34 +23,6 @@ #define MZ_MAGIC 0x5a4d /* "MZ" */ -struct mz_hdr { - uint16_t magic; /* MZ_MAGIC */ - uint16_t lbsize; /* size of last used block */ - uint16_t blocks; /* pages in file, 0x3 */ - uint16_t relocs; /* relocations */ - uint16_t hdrsize; /* header size in "paragraphs" */ - uint16_t min_extra_pps; /* .bss */ - uint16_t max_extra_pps; /* runtime limit for the arena size */ - uint16_t ss; /* relative stack segment */ - uint16_t sp; /* initial %sp register */ - uint16_t checksum; /* word checksum */ - uint16_t ip; /* initial %ip register */ - uint16_t cs; /* initial %cs relative to load segment */ - uint16_t reloc_table_offset; /* offset of the first relocation */ - uint16_t overlay_num; /* overlay number. set to 0. 
*/ - uint16_t reserved0[4]; /* reserved */ - uint16_t oem_id; /* oem identifier */ - uint16_t oem_info; /* oem specific */ - uint16_t reserved1[10]; /* reserved */ - uint32_t peaddr; /* address of pe header */ - char message[64]; /* message to print */ -}; - -struct mz_reloc { - uint16_t offset; - uint16_t segment; -}; - #define PE_MAGIC 0x00004550 /* "PE\0\0" */ #define PE_OPT_MAGIC_PE32 0x010b #define PE_OPT_MAGIC_PE32_ROM 0x0107 @@ -62,6 +34,7 @@ struct mz_reloc { #define IMAGE_FILE_MACHINE_AMD64 0x8664 #define IMAGE_FILE_MACHINE_ARM 0x01c0 #define IMAGE_FILE_MACHINE_ARMV7 0x01c4 +#define IMAGE_FILE_MACHINE_ARM64 0xaa64 #define IMAGE_FILE_MACHINE_EBC 0x0ebc #define IMAGE_FILE_MACHINE_I386 0x014c #define IMAGE_FILE_MACHINE_IA64 0x0200 @@ -98,17 +71,6 @@ struct mz_reloc { #define IMAGE_FILE_UP_SYSTEM_ONLY 0x4000 #define IMAGE_FILE_BYTES_REVERSED_HI 0x8000 -struct pe_hdr { - uint32_t magic; /* PE magic */ - uint16_t machine; /* machine type */ - uint16_t sections; /* number of sections */ - uint32_t timestamp; /* time_t */ - uint32_t symbol_table; /* symbol table offset */ - uint32_t symbols; /* number of symbols */ - uint16_t opt_hdr_size; /* size of optional header */ - uint16_t flags; /* flags */ -}; - #define IMAGE_FILE_OPT_ROM_MAGIC 0x107 #define IMAGE_FILE_OPT_PE32_MAGIC 0x10b #define IMAGE_FILE_OPT_PE32_PLUS_MAGIC 0x20b @@ -134,6 +96,95 @@ struct pe_hdr { #define IMAGE_DLLCHARACTERISTICS_WDM_DRIVER 0x2000 #define IMAGE_DLLCHARACTERISTICS_TERMINAL_SERVER_AWARE 0x8000 +/* they actually defined 0x00000000 as well, but I think we'll skip that one. */ +#define IMAGE_SCN_RESERVED_0 0x00000001 +#define IMAGE_SCN_RESERVED_1 0x00000002 +#define IMAGE_SCN_RESERVED_2 0x00000004 +#define IMAGE_SCN_TYPE_NO_PAD 0x00000008 /* don't pad - obsolete */ +#define IMAGE_SCN_RESERVED_3 0x00000010 +#define IMAGE_SCN_CNT_CODE 0x00000020 /* .text */ +#define IMAGE_SCN_CNT_INITIALIZED_DATA 0x00000040 /* .data */ +#define IMAGE_SCN_CNT_UNINITIALIZED_DATA 0x00000080 /* .bss */ +#define IMAGE_SCN_LNK_OTHER 0x00000100 /* reserved */ +#define IMAGE_SCN_LNK_INFO 0x00000200 /* .drectve comments */ +#define IMAGE_SCN_RESERVED_4 0x00000400 +#define IMAGE_SCN_LNK_REMOVE 0x00000800 /* .o only - scn to be rm'd*/ +#define IMAGE_SCN_LNK_COMDAT 0x00001000 /* .o only - COMDAT data */ +#define IMAGE_SCN_RESERVED_5 0x00002000 /* spec omits this */ +#define IMAGE_SCN_RESERVED_6 0x00004000 /* spec omits this */ +#define IMAGE_SCN_GPREL 0x00008000 /* global pointer referenced data */ +/* spec lists 0x20000 twice, I suspect they meant 0x10000 for one of them */ +#define IMAGE_SCN_MEM_PURGEABLE 0x00010000 /* reserved for "future" use */ +#define IMAGE_SCN_16BIT 0x00020000 /* reserved for "future" use */ +#define IMAGE_SCN_LOCKED 0x00040000 /* reserved for "future" use */ +#define IMAGE_SCN_PRELOAD 0x00080000 /* reserved for "future" use */ +/* and here they just stuck a 1-byte integer in the middle of a bitfield */ +#define IMAGE_SCN_ALIGN_1BYTES 0x00100000 /* it does what it says on the box */ +#define IMAGE_SCN_ALIGN_2BYTES 0x00200000 +#define IMAGE_SCN_ALIGN_4BYTES 0x00300000 +#define IMAGE_SCN_ALIGN_8BYTES 0x00400000 +#define IMAGE_SCN_ALIGN_16BYTES 0x00500000 +#define IMAGE_SCN_ALIGN_32BYTES 0x00600000 +#define IMAGE_SCN_ALIGN_64BYTES 0x00700000 +#define IMAGE_SCN_ALIGN_128BYTES 0x00800000 +#define IMAGE_SCN_ALIGN_256BYTES 0x00900000 +#define IMAGE_SCN_ALIGN_512BYTES 0x00a00000 +#define IMAGE_SCN_ALIGN_1024BYTES 0x00b00000 +#define IMAGE_SCN_ALIGN_2048BYTES 0x00c00000 +#define IMAGE_SCN_ALIGN_4096BYTES 0x00d00000 +#define 
IMAGE_SCN_ALIGN_8192BYTES 0x00e00000 +#define IMAGE_SCN_LNK_NRELOC_OVFL 0x01000000 /* extended relocations */ +#define IMAGE_SCN_MEM_DISCARDABLE 0x02000000 /* scn can be discarded */ +#define IMAGE_SCN_MEM_NOT_CACHED 0x04000000 /* cannot be cached */ +#define IMAGE_SCN_MEM_NOT_PAGED 0x08000000 /* not pageable */ +#define IMAGE_SCN_MEM_SHARED 0x10000000 /* can be shared */ +#define IMAGE_SCN_MEM_EXECUTE 0x20000000 /* can be executed as code */ +#define IMAGE_SCN_MEM_READ 0x40000000 /* readable */ +#define IMAGE_SCN_MEM_WRITE 0x80000000 /* writeable */ + +#define IMAGE_DEBUG_TYPE_CODEVIEW 2 + +#ifndef __ASSEMBLY__ + +struct mz_hdr { + uint16_t magic; /* MZ_MAGIC */ + uint16_t lbsize; /* size of last used block */ + uint16_t blocks; /* pages in file, 0x3 */ + uint16_t relocs; /* relocations */ + uint16_t hdrsize; /* header size in "paragraphs" */ + uint16_t min_extra_pps; /* .bss */ + uint16_t max_extra_pps; /* runtime limit for the arena size */ + uint16_t ss; /* relative stack segment */ + uint16_t sp; /* initial %sp register */ + uint16_t checksum; /* word checksum */ + uint16_t ip; /* initial %ip register */ + uint16_t cs; /* initial %cs relative to load segment */ + uint16_t reloc_table_offset; /* offset of the first relocation */ + uint16_t overlay_num; /* overlay number. set to 0. */ + uint16_t reserved0[4]; /* reserved */ + uint16_t oem_id; /* oem identifier */ + uint16_t oem_info; /* oem specific */ + uint16_t reserved1[10]; /* reserved */ + uint32_t peaddr; /* address of pe header */ + char message[64]; /* message to print */ +}; + +struct mz_reloc { + uint16_t offset; + uint16_t segment; +}; + +struct pe_hdr { + uint32_t magic; /* PE magic */ + uint16_t machine; /* machine type */ + uint16_t sections; /* number of sections */ + uint32_t timestamp; /* time_t */ + uint32_t symbol_table; /* symbol table offset */ + uint32_t symbols; /* number of symbols */ + uint16_t opt_hdr_size; /* size of optional header */ + uint16_t flags; /* flags */ +}; + /* the fact that pe32 isn't padded where pe32+ is 64-bit means union won't * work right. vomit. */ struct pe32_opt_hdr { @@ -243,52 +294,6 @@ struct section_header { uint32_t flags; }; -/* they actually defined 0x00000000 as well, but I think we'll skip that one. 
*/ -#define IMAGE_SCN_RESERVED_0 0x00000001 -#define IMAGE_SCN_RESERVED_1 0x00000002 -#define IMAGE_SCN_RESERVED_2 0x00000004 -#define IMAGE_SCN_TYPE_NO_PAD 0x00000008 /* don't pad - obsolete */ -#define IMAGE_SCN_RESERVED_3 0x00000010 -#define IMAGE_SCN_CNT_CODE 0x00000020 /* .text */ -#define IMAGE_SCN_CNT_INITIALIZED_DATA 0x00000040 /* .data */ -#define IMAGE_SCN_CNT_UNINITIALIZED_DATA 0x00000080 /* .bss */ -#define IMAGE_SCN_LNK_OTHER 0x00000100 /* reserved */ -#define IMAGE_SCN_LNK_INFO 0x00000200 /* .drectve comments */ -#define IMAGE_SCN_RESERVED_4 0x00000400 -#define IMAGE_SCN_LNK_REMOVE 0x00000800 /* .o only - scn to be rm'd*/ -#define IMAGE_SCN_LNK_COMDAT 0x00001000 /* .o only - COMDAT data */ -#define IMAGE_SCN_RESERVED_5 0x00002000 /* spec omits this */ -#define IMAGE_SCN_RESERVED_6 0x00004000 /* spec omits this */ -#define IMAGE_SCN_GPREL 0x00008000 /* global pointer referenced data */ -/* spec lists 0x20000 twice, I suspect they meant 0x10000 for one of them */ -#define IMAGE_SCN_MEM_PURGEABLE 0x00010000 /* reserved for "future" use */ -#define IMAGE_SCN_16BIT 0x00020000 /* reserved for "future" use */ -#define IMAGE_SCN_LOCKED 0x00040000 /* reserved for "future" use */ -#define IMAGE_SCN_PRELOAD 0x00080000 /* reserved for "future" use */ -/* and here they just stuck a 1-byte integer in the middle of a bitfield */ -#define IMAGE_SCN_ALIGN_1BYTES 0x00100000 /* it does what it says on the box */ -#define IMAGE_SCN_ALIGN_2BYTES 0x00200000 -#define IMAGE_SCN_ALIGN_4BYTES 0x00300000 -#define IMAGE_SCN_ALIGN_8BYTES 0x00400000 -#define IMAGE_SCN_ALIGN_16BYTES 0x00500000 -#define IMAGE_SCN_ALIGN_32BYTES 0x00600000 -#define IMAGE_SCN_ALIGN_64BYTES 0x00700000 -#define IMAGE_SCN_ALIGN_128BYTES 0x00800000 -#define IMAGE_SCN_ALIGN_256BYTES 0x00900000 -#define IMAGE_SCN_ALIGN_512BYTES 0x00a00000 -#define IMAGE_SCN_ALIGN_1024BYTES 0x00b00000 -#define IMAGE_SCN_ALIGN_2048BYTES 0x00c00000 -#define IMAGE_SCN_ALIGN_4096BYTES 0x00d00000 -#define IMAGE_SCN_ALIGN_8192BYTES 0x00e00000 -#define IMAGE_SCN_LNK_NRELOC_OVFL 0x01000000 /* extended relocations */ -#define IMAGE_SCN_MEM_DISCARDABLE 0x02000000 /* scn can be discarded */ -#define IMAGE_SCN_MEM_NOT_CACHED 0x04000000 /* cannot be cached */ -#define IMAGE_SCN_MEM_NOT_PAGED 0x08000000 /* not pageable */ -#define IMAGE_SCN_MEM_SHARED 0x10000000 /* can be shared */ -#define IMAGE_SCN_MEM_EXECUTE 0x20000000 /* can be executed as code */ -#define IMAGE_SCN_MEM_READ 0x40000000 /* readable */ -#define IMAGE_SCN_MEM_WRITE 0x80000000 /* writeable */ - enum x64_coff_reloc_type { IMAGE_REL_AMD64_ABSOLUTE = 0, IMAGE_REL_AMD64_ADDR64, @@ -445,4 +450,6 @@ struct win_certificate { uint16_t cert_type; }; +#endif /* !__ASSEMBLY__ */ + #endif /* __LINUX_PE_H */ diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h index 3a481a49546ef1c85d8f88bf7668f8a2c8e8c0f1..c13dceb87b60adbe36c8ae2c7ba21824f33343dd 100644 --- a/include/linux/percpu-refcount.h +++ b/include/linux/percpu-refcount.h @@ -99,6 +99,7 @@ int __must_check percpu_ref_init(struct percpu_ref *ref, void percpu_ref_exit(struct percpu_ref *ref); void percpu_ref_switch_to_atomic(struct percpu_ref *ref, percpu_ref_func_t *confirm_switch); +void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref); void percpu_ref_switch_to_percpu(struct percpu_ref *ref); void percpu_ref_kill_and_confirm(struct percpu_ref *ref, percpu_ref_func_t *confirm_kill); diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 
56939d3f6e531baa21d7770e5ee8f256f3957c86..491b3f5a5f8a86ee9347affdb1b08013757235e9 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -110,6 +110,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, #endif extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); +extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); extern bool is_kernel_percpu_address(unsigned long addr); #if !defined(CONFIG_SMP) || !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 8462da2660899153552df914268ddaadd448fc2d..1360dd6d5e617a0fee746a0d530129e66530a0fb 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -75,6 +75,8 @@ struct pmu_hw_events { * already have to allocate this struct per cpu. */ struct arm_pmu *percpu_pmu; + + int irq; }; enum armpmu_attr_groups { @@ -88,7 +90,6 @@ struct arm_pmu { struct pmu pmu; cpumask_t active_irqs; cpumask_t supported_cpus; - int *irq_affinity; char *name; irqreturn_t (*handle_irq)(int irq_num, void *dev); void (*enable)(struct perf_event *event); @@ -104,12 +105,8 @@ struct arm_pmu { void (*start)(struct arm_pmu *); void (*stop)(struct arm_pmu *); void (*reset)(void *); - int (*request_irq)(struct arm_pmu *, irq_handler_t handler); - void (*free_irq)(struct arm_pmu *); int (*map_event)(struct perf_event *event); int num_events; - atomic_t active_events; - struct mutex reserve_mutex; u64 max_period; bool secure_access; /* 32-bit ARM only */ #define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40 @@ -120,6 +117,9 @@ struct arm_pmu { struct notifier_block cpu_pm_nb; /* the attr_groups array must be NULL-terminated */ const struct attribute_group *attr_groups[ARMPMU_NR_ATTR_GROUPS + 1]; + + /* Only to be used by ACPI probing code */ + unsigned long acpi_cpuid; }; #define to_arm_pmu(p) (container_of(p, struct arm_pmu, pmu)) @@ -135,10 +135,12 @@ int armpmu_map_event(struct perf_event *event, [PERF_COUNT_HW_CACHE_RESULT_MAX], u32 raw_event_mask); +typedef int (*armpmu_init_fn)(struct arm_pmu *); + struct pmu_probe_info { unsigned int cpuid; unsigned int mask; - int (*init)(struct arm_pmu *); + armpmu_init_fn init; }; #define PMU_PROBE(_cpuid, _mask, _fn) \ @@ -160,6 +162,21 @@ int arm_pmu_device_probe(struct platform_device *pdev, const struct of_device_id *of_table, const struct pmu_probe_info *probe_table); +#ifdef CONFIG_ACPI +int arm_pmu_acpi_probe(armpmu_init_fn init_fn); +#else +static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; } +#endif + +/* Internal functions only for core arm_pmu code */ +struct arm_pmu *armpmu_alloc(void); +void armpmu_free(struct arm_pmu *pmu); +int armpmu_register(struct arm_pmu *pmu); +int armpmu_request_irqs(struct arm_pmu *armpmu); +void armpmu_free_irqs(struct arm_pmu *armpmu); +int armpmu_request_irq(struct arm_pmu *armpmu, int cpu); +void armpmu_free_irq(struct arm_pmu *armpmu, int cpu); + #define ARMV8_PMU_PDEV_NAME "armv8-pmu" #endif /* CONFIG_ARM_PMU */ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 000fdb211c7d7e2c8fc7351a99dbf6b58fbcb135..24a635887f28df99379d8b683fd94154c7888a69 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -165,6 +165,13 @@ struct hw_perf_event { struct list_head bp_list; }; #endif + struct { /* amd_iommu */ + u8 iommu_bank; + u8 iommu_cntr; + u16 padding; + u64 conf; + u64 conf1; + }; }; /* * If the event is a per task event, this will point to the task in @@ -801,6 +808,7 @@ struct 
perf_output_handle { struct ring_buffer *rb; unsigned long wakeup; unsigned long size; + u64 aux_flags; union { void *addr; unsigned long head; @@ -849,10 +857,11 @@ perf_cgroup_from_task(struct task_struct *task, struct perf_event_context *ctx) extern void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event); extern void perf_aux_output_end(struct perf_output_handle *handle, - unsigned long size, bool truncated); + unsigned long size); extern int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size); extern void *perf_get_aux(struct perf_output_handle *handle); +extern void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags); extern int perf_pmu_register(struct pmu *pmu, const char *name, int type); extern void perf_pmu_unregister(struct pmu *pmu); @@ -1112,6 +1121,7 @@ extern int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks extern void perf_event_exec(void); extern void perf_event_comm(struct task_struct *tsk, bool exec); +extern void perf_event_namespaces(struct task_struct *tsk); extern void perf_event_fork(struct task_struct *tsk); /* Callchains */ @@ -1267,8 +1277,8 @@ static inline void * perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *event) { return NULL; } static inline void -perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, - bool truncated) { } +perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) + { } static inline int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) { return -EINVAL; } @@ -1315,6 +1325,7 @@ static inline int perf_unregister_guest_info_callbacks static inline void perf_event_mmap(struct vm_area_struct *vma) { } static inline void perf_event_exec(void) { } static inline void perf_event_comm(struct task_struct *tsk, bool exec) { } +static inline void perf_event_namespaces(struct task_struct *tsk) { } static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } diff --git a/include/linux/phy.h b/include/linux/phy.h index 43a774873aa96d4af64d0cdebb579be572a6658a..e76e4adbc7c728efdbadae302af8259d5d55dc05 100644 --- a/include/linux/phy.h +++ b/include/linux/phy.h @@ -217,6 +217,13 @@ struct mii_bus { * matching its address */ int irq[PHY_MAX_ADDR]; + + /* GPIO reset pulse width in microseconds */ + int reset_delay_us; + /* Number of reset GPIOs */ + int num_reset_gpios; + /* Array of RESET GPIO descriptors */ + struct gpio_desc **reset_gpiod; }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) @@ -587,23 +594,29 @@ struct phy_driver { */ void (*link_change_notify)(struct phy_device *dev); - /* A function provided by a phy specific driver to override the - * the PHY driver framework support for reading a MMD register - * from the PHY. If not supported, return -1. This function is - * optional for PHY specific drivers, if not provided then the - * default MMD read function is used by the PHY framework. + /* + * Phy specific driver override for reading a MMD register. + * This function is optional for PHY specific drivers. When + * not provided, the default MMD read function will be used + * by phy_read_mmd(), which will use either a direct read for + * Clause 45 PHYs or an indirect read for Clause 22 PHYs. + * devnum is the MMD device number within the PHY device, + * regnum is the register within the selected MMD device. 
*/ - int (*read_mmd_indirect)(struct phy_device *dev, int ptrad, - int devnum, int regnum); - - /* A function provided by a phy specific driver to override the - * the PHY driver framework support for writing a MMD register - * from the PHY. This function is optional for PHY specific drivers, - * if not provided then the default MMD read function is used by - * the PHY framework. + int (*read_mmd)(struct phy_device *dev, int devnum, u16 regnum); + + /* + * Phy specific driver override for writing a MMD register. + * This function is optional for PHY specific drivers. When + * not provided, the default MMD write function will be used + * by phy_write_mmd(), which will use either a direct write for + * Clause 45 PHYs, or an indirect write for Clause 22 PHYs. + * devnum is the MMD device number within the PHY device, + * regnum is the register within the selected MMD device. + * val is the value to be written. */ - void (*write_mmd_indirect)(struct phy_device *dev, int ptrad, - int devnum, int regnum, u32 val); + int (*write_mmd)(struct phy_device *dev, int devnum, u16 regnum, + u16 val); /* Get the size and type of the eeprom contained within a plug-in * module */ @@ -651,25 +664,7 @@ struct phy_fixup { * * Same rules as for phy_read(); */ -static inline int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum) -{ - if (!phydev->is_c45) - return -EOPNOTSUPP; - - return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, - MII_ADDR_C45 | (devad << 16) | (regnum & 0xffff)); -} - -/** - * phy_read_mmd_indirect - reads data from the MMD registers - * @phydev: The PHY device bus - * @prtad: MMD Address - * @addr: PHY address on the MII bus - * - * Description: it reads data from the MMD registers (clause 22 to access to - * clause 45) of the specified phy address. - */ -int phy_read_mmd_indirect(struct phy_device *phydev, int prtad, int devad); +int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); /** * phy_read - Convenience function for reading a given PHY register @@ -752,35 +747,29 @@ static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev) * * Same rules as for phy_write(); */ -static inline int phy_write_mmd(struct phy_device *phydev, int devad, - u32 regnum, u16 val) -{ - if (!phydev->is_c45) - return -EOPNOTSUPP; - - regnum = MII_ADDR_C45 | ((devad & 0x1f) << 16) | (regnum & 0xffff); - - return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); -} - -/** - * phy_write_mmd_indirect - writes data to the MMD registers - * @phydev: The PHY device - * @prtad: MMD Address - * @devad: MMD DEVAD - * @data: data to write in the MMD register - * - * Description: Write data from the MMD registers of the specified - * phy address. 
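With the *_indirect helpers removed here, phy_read_mmd() above and phy_write_mmd() just below become the single entry points for MMD access, using direct Clause 45 addressing when the PHY supports it and the Clause 22 indirect registers otherwise. A hedged read-modify-write sketch using standard constants from linux/mdio.h:

#include <linux/mdio.h>
#include <linux/phy.h>

static int example_enable_eee_1000t(struct phy_device *phydev)
{
	int val = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_AN_EEE_ADV);

	if (val < 0)
		return val;

	/* advertise EEE for 1000BASE-T in the autonegotiation MMD */
	return phy_write_mmd(phydev, MDIO_MMD_AN, MDIO_AN_EEE_ADV,
			     val | MDIO_EEE_1000T);
}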
- */ -void phy_write_mmd_indirect(struct phy_device *phydev, int prtad, - int devad, u32 data); +int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); struct phy_device *phy_device_create(struct mii_bus *bus, int addr, int phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); +#if IS_ENABLED(CONFIG_PHYLIB) struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); +void phy_device_free(struct phy_device *phydev); +#else +static inline +struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45) +{ + return NULL; +} + +static inline int phy_device_register(struct phy_device *phy) +{ + return 0; +} + +static inline void phy_device_free(struct phy_device *phydev) { } +#endif /* CONFIG_PHYLIB */ void phy_device_remove(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); int phy_suspend(struct phy_device *phydev); @@ -852,6 +841,7 @@ void phy_change_work(struct work_struct *work); void phy_mac_interrupt(struct phy_device *phydev, int new_link); void phy_start_machine(struct phy_device *phydev); void phy_stop_machine(struct phy_device *phydev); +void phy_trigger_machine(struct phy_device *phydev, bool sync); int phy_ethtool_sset(struct phy_device *phydev, struct ethtool_cmd *cmd); int phy_ethtool_gset(struct phy_device *phydev, struct ethtool_cmd *cmd); int phy_ethtool_ksettings_get(struct phy_device *phydev, @@ -861,7 +851,6 @@ int phy_ethtool_ksettings_set(struct phy_device *phydev, int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd); int phy_start_interrupts(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); -void phy_device_free(struct phy_device *phydev); int phy_set_max_speed(struct phy_device *phydev, u32 max_speed); int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask, @@ -888,8 +877,10 @@ int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); +#if IS_ENABLED(CONFIG_PHYLIB) int __init mdio_bus_init(void); void mdio_bus_exit(void); +#endif extern struct bus_type mdio_bus_type; @@ -900,7 +891,7 @@ struct mdio_board_info { const void *platform_data; }; -#if IS_ENABLED(CONFIG_PHYLIB) +#if IS_ENABLED(CONFIG_MDIO_DEVICE) int mdiobus_register_board_info(const struct mdio_board_info *info, unsigned int n); #else diff --git a/include/linux/pinctrl/pinconf-generic.h b/include/linux/pinctrl/pinconf-generic.h index 7620eb127cffc5edbc475457732a042bac357055..279e3c5326e3a4e65bf6f86556ea984a69d0ca9c 100644 --- a/include/linux/pinctrl/pinconf-generic.h +++ b/include/linux/pinctrl/pinconf-generic.h @@ -42,6 +42,8 @@ * @PIN_CONFIG_BIAS_PULL_UP: the pin will be pulled up (usually with high * impedance to VDD). If the argument is != 0 pull-up is enabled, * if it is 0, pull-up is total, i.e. the pin is connected to VDD. + * @PIN_CONFIG_BIDIRECTIONAL: the pin will be configured to allow simultaneous + * input and output operations. * @PIN_CONFIG_DRIVE_OPEN_DRAIN: the pin will be driven with open drain (open * collector) which means it is usually wired with other output ports * which are then pulled up with an external resistor. 
Setting this @@ -96,6 +98,7 @@ enum pin_config_param { PIN_CONFIG_BIAS_PULL_DOWN, PIN_CONFIG_BIAS_PULL_PIN_DEFAULT, PIN_CONFIG_BIAS_PULL_UP, + PIN_CONFIG_BIDIRECTIONAL, PIN_CONFIG_DRIVE_OPEN_DRAIN, PIN_CONFIG_DRIVE_OPEN_SOURCE, PIN_CONFIG_DRIVE_PUSH_PULL, diff --git a/include/linux/platform_data/iommu-omap.h b/include/linux/platform_data/iommu-omap.h index 0496d171700a767b13b89e8e73c50b043fcf6c64..e8b12dbf617024f488f078f746e87ce32eec839a 100644 --- a/include/linux/platform_data/iommu-omap.h +++ b/include/linux/platform_data/iommu-omap.h @@ -12,28 +12,8 @@ #include -#define MMU_REG_SIZE 256 - -/** - * struct iommu_arch_data - omap iommu private data - * @name: name of the iommu device - * @iommu_dev: handle of the iommu device - * - * This is an omap iommu private data object, which binds an iommu user - * to its iommu device. This object should be placed at the iommu user's - * dev_archdata so generic IOMMU API can be used without having to - * utilize omap-specific plumbing anymore. - */ -struct omap_iommu_arch_data { - const char *name; - struct omap_iommu *iommu_dev; -}; - struct iommu_platform_data { - const char *name; const char *reset_name; - int nr_tlb_entries; - int (*assert_reset)(struct platform_device *pdev, const char *name); int (*deassert_reset)(struct platform_device *pdev, const char *name); }; diff --git a/include/linux/platform_data/isl9305.h b/include/linux/platform_data/isl9305.h index 1419133fa69e9be9ad07e55690fb8abeff966e32..4ac1a070af0a1967f59c02e3b18363ba83e7552f 100644 --- a/include/linux/platform_data/isl9305.h +++ b/include/linux/platform_data/isl9305.h @@ -24,7 +24,7 @@ struct regulator_init_data; struct isl9305_pdata { - struct regulator_init_data *init_data[ISL9305_MAX_REGULATOR]; + struct regulator_init_data *init_data[ISL9305_MAX_REGULATOR + 1]; }; #endif diff --git a/include/linux/platform_data/itco_wdt.h b/include/linux/platform_data/itco_wdt.h index f16542c77ff794ad13a1e81a54a8d1d57b5d1c94..0e95527edf253db9ab55496f4e404743b6c04b35 100644 --- a/include/linux/platform_data/itco_wdt.h +++ b/include/linux/platform_data/itco_wdt.h @@ -14,6 +14,10 @@ struct itco_wdt_platform_data { char name[32]; unsigned int version; + /* private data to be passed to update_no_reboot_bit API */ + void *no_reboot_priv; + /* pointer for platform specific no reboot update function */ + int (*update_no_reboot_bit)(void *priv, bool set); }; #endif /* _ITCO_WDT_H_ */
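The two itco_wdt fields added above let platform glue own the chipset's NO_REBOOT bit instead of having the watchdog driver poke registers it does not control. A hedged sketch of a provider; the register layout and all EXAMPLE_*/example_* names are invented for illustration:

#include <linux/bitops.h>
#include <linux/io.h>
#include <linux/platform_data/itco_wdt.h>

#define EXAMPLE_NO_REBOOT	BIT(4)	/* hypothetical control bit */

static int example_update_no_reboot_bit(void *priv, bool set)
{
	void __iomem *reg = priv;	/* mapped platform control register */
	u32 val = readl(reg);

	if (set)
		val |= EXAMPLE_NO_REBOOT;
	else
		val &= ~EXAMPLE_NO_REBOOT;
	writel(val, reg);
	return 0;
}

static struct itco_wdt_platform_data example_wdt_pdata = {
	.name			= "Example TCO Timer",
	.version		= 2,
	/* .no_reboot_priv is pointed at the mapped register at probe time */
	.update_no_reboot_bit	= example_update_no_reboot_bit,
};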
- */ - -#ifndef _PN544_H_ -#define _PN544_H_ - -#include - -enum { - NFC_GPIO_ENABLE, - NFC_GPIO_FW_RESET, - NFC_GPIO_IRQ -}; - -/* board config */ -struct pn544_nfc_platform_data { - int (*request_resources) (struct i2c_client *client); - void (*free_resources) (void); - void (*enable) (int fw); - int (*test) (void); - void (*disable) (void); - int (*get_gpio)(int type); -}; - -#endif /* _PN544_H_ */ diff --git a/include/linux/platform_data/st21nfca.h b/include/linux/platform_data/st21nfca.h deleted file mode 100644 index cc2bdafb0c69cfe61d1ff99f69423b87d2322eee..0000000000000000000000000000000000000000 --- a/include/linux/platform_data/st21nfca.h +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Driver include for the ST21NFCA NFC chip. - * - * Copyright (C) 2014 STMicroelectronics SAS. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ - -#ifndef _ST21NFCA_HCI_H_ -#define _ST21NFCA_HCI_H_ - -#include - -#define ST21NFCA_HCI_DRIVER_NAME "st21nfca_hci" - -struct st21nfca_nfc_platform_data { - unsigned int gpio_ena; - unsigned int irq_polarity; - bool is_ese_present; - bool is_uicc_present; -}; - -#endif /* _ST21NFCA_HCI_H_ */ diff --git a/include/linux/platform_data/video-imxfb.h b/include/linux/platform_data/video-imxfb.h index a5c0a71ec9147692f6a4dc995b71375f977bb71c..cf9348b376ac50f99944c5bc0762afb3e0b65daf 100644 --- a/include/linux/platform_data/video-imxfb.h +++ b/include/linux/platform_data/video-imxfb.h @@ -50,6 +50,7 @@ struct imx_fb_videomode { struct fb_videomode mode; u32 pcr; + bool aus_mode; unsigned char bpp; }; diff --git a/include/linux/pm_domain.h b/include/linux/pm_domain.h index 5339ed5bd6f9d04af80cd7da068a776517d9c4ff..b7803a251044a0eb89142b7a32452bef61f2594b 100644 --- a/include/linux/pm_domain.h +++ b/include/linux/pm_domain.h @@ -20,6 +20,7 @@ /* Defines used for the flags field in the struct generic_pm_domain */ #define GENPD_FLAG_PM_CLK (1U << 0) /* PM domain uses PM clk */ #define GENPD_FLAG_IRQ_SAFE (1U << 1) /* PM domain operates in atomic */ +#define GENPD_FLAG_ALWAYS_ON (1U << 2) /* PM domain is always powered on */ enum gpd_status { GPD_STATE_ACTIVE = 0, /* PM domain is active */ @@ -117,6 +118,7 @@ struct generic_pm_domain_data { struct pm_domain_data base; struct gpd_timing_data td; struct notifier_block nb; + void *data; }; #ifdef CONFIG_PM_GENERIC_DOMAINS diff --git a/include/linux/pm_wakeup.h b/include/linux/pm_wakeup.h index a3447932df1ff0a00f417c847052a83a407f4631..4c2cba7ec1d44edec6881ed7ac1e80ee27312c44 100644 --- a/include/linux/pm_wakeup.h +++ b/include/linux/pm_wakeup.h @@ -106,8 +106,8 @@ extern void __pm_stay_awake(struct wakeup_source *ws); extern void pm_stay_awake(struct device *dev); extern void __pm_relax(struct wakeup_source *ws); extern void pm_relax(struct device *dev); -extern void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec); -extern void pm_wakeup_event(struct device *dev, unsigned int msec); +extern void pm_wakeup_ws_event(struct wakeup_source *ws, unsigned int msec, bool hard); 
+extern void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard); #else /* !CONFIG_PM_SLEEP */ @@ -182,9 +182,11 @@ static inline void __pm_relax(struct wakeup_source *ws) {} static inline void pm_relax(struct device *dev) {} -static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) {} +static inline void pm_wakeup_ws_event(struct wakeup_source *ws, + unsigned int msec, bool hard) {} -static inline void pm_wakeup_event(struct device *dev, unsigned int msec) {} +static inline void pm_wakeup_dev_event(struct device *dev, unsigned int msec, + bool hard) {} #endif /* !CONFIG_PM_SLEEP */ @@ -201,4 +203,19 @@ static inline void wakeup_source_trash(struct wakeup_source *ws) wakeup_source_drop(ws); } +static inline void __pm_wakeup_event(struct wakeup_source *ws, unsigned int msec) +{ + return pm_wakeup_ws_event(ws, msec, false); +} + +static inline void pm_wakeup_event(struct device *dev, unsigned int msec) +{ + return pm_wakeup_dev_event(dev, msec, false); +} + +static inline void pm_wakeup_hard_event(struct device *dev) +{ + return pm_wakeup_dev_event(dev, 0, true); +} + #endif /* _LINUX_PM_WAKEUP_H */ diff --git a/include/linux/pmem.h b/include/linux/pmem.h index e856c2cb0fe86da91d55e2766b687b66ad02cc1e..71ecf3d46aacb210edfed793d5ab0ece06671d72 100644 --- a/include/linux/pmem.h +++ b/include/linux/pmem.h @@ -31,12 +31,6 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) BUG(); } -static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) -{ - BUG(); - return -EFAULT; -} - static inline size_t arch_copy_from_iter_pmem(void *addr, size_t bytes, struct iov_iter *i) { @@ -65,23 +59,6 @@ static inline bool arch_has_pmem_api(void) return IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API); } -/* - * memcpy_from_pmem - read from persistent memory with error handling - * @dst: destination buffer - * @src: source buffer - * @size: transfer length - * - * Returns 0 on success negative error code on failure. - */ -static inline int memcpy_from_pmem(void *dst, void const *src, size_t size) -{ - if (arch_has_pmem_api()) - return arch_memcpy_from_pmem(dst, src, size); - else - memcpy(dst, src, size); - return 0; -} - /** * memcpy_to_pmem - copy data to persistent memory * @dst: destination buffer for the copy diff --git a/include/linux/poll.h b/include/linux/poll.h index a46d6755035e3c086bea24a1bad5a37e5bd0f8b6..75ffc5729e4c654bdccd142df3d3fbac0018d6e4 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -98,64 +98,8 @@ extern int poll_schedule_timeout(struct poll_wqueues *pwq, int state, ktime_t *expires, unsigned long slack); extern u64 select_estimate_accuracy(struct timespec64 *tv); - -static inline int poll_schedule(struct poll_wqueues *pwq, int state) -{ - return poll_schedule_timeout(pwq, state, NULL, 0); -} - -/* - * Scalable version of the fd_set. - */ - -typedef struct { - unsigned long *in, *out, *ex; - unsigned long *res_in, *res_out, *res_ex; -} fd_set_bits; - -/* - * How many longwords for "nr" bits? - */ -#define FDS_BITPERLONG (8*sizeof(long)) -#define FDS_LONGS(nr) (((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG) -#define FDS_BYTES(nr) (FDS_LONGS(nr)*sizeof(long)) - -/* - * We do a VERIFY_WRITE here even though we are only reading this time: - * we'll write to it eventually.. - * - * Use "unsigned long" accesses to let user-mode fd_set's be long-aligned. 
- */ -static inline -int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) -{ - nr = FDS_BYTES(nr); - if (ufdset) - return copy_from_user(fdset, ufdset, nr) ? -EFAULT : 0; - - memset(fdset, 0, nr); - return 0; -} - -static inline unsigned long __must_check -set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset) -{ - if (ufdset) - return __copy_to_user(ufdset, fdset, FDS_BYTES(nr)); - return 0; -} - -static inline -void zero_fd_set(unsigned long nr, unsigned long *fdset) -{ - memset(fdset, 0, FDS_BYTES(nr)); -} - #define MAX_INT64_SECONDS (((s64)(~((u64)0)>>1)/HZ)-1) -extern int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time); -extern int do_sys_poll(struct pollfd __user * ufds, unsigned int nfds, - struct timespec64 *end_time); extern int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec64 *end_time); diff --git a/include/linux/posix-clock.h b/include/linux/posix-clock.h index 34c4498b800fc7b288bedceb375a7788d375f4d4..83b22ae9ae12a27a2855f0559cd227036a2accf6 100644 --- a/include/linux/posix-clock.h +++ b/include/linux/posix-clock.h @@ -59,23 +59,23 @@ struct posix_clock_operations { int (*clock_adjtime)(struct posix_clock *pc, struct timex *tx); - int (*clock_gettime)(struct posix_clock *pc, struct timespec *ts); + int (*clock_gettime)(struct posix_clock *pc, struct timespec64 *ts); - int (*clock_getres) (struct posix_clock *pc, struct timespec *ts); + int (*clock_getres) (struct posix_clock *pc, struct timespec64 *ts); int (*clock_settime)(struct posix_clock *pc, - const struct timespec *ts); + const struct timespec64 *ts); int (*timer_create) (struct posix_clock *pc, struct k_itimer *kit); int (*timer_delete) (struct posix_clock *pc, struct k_itimer *kit); void (*timer_gettime)(struct posix_clock *pc, - struct k_itimer *kit, struct itimerspec *tsp); + struct k_itimer *kit, struct itimerspec64 *tsp); int (*timer_settime)(struct posix_clock *pc, struct k_itimer *kit, int flags, - struct itimerspec *tsp, struct itimerspec *old); + struct itimerspec64 *tsp, struct itimerspec64 *old); /* * Optional character device methods: */ diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index 64aa189efe21d6792b07f2b9005fcf5345b2e98c..8c1e43ab14a97e8645b7f981730938cb4e257b70 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -87,22 +87,22 @@ struct k_itimer { }; struct k_clock { - int (*clock_getres) (const clockid_t which_clock, struct timespec *tp); + int (*clock_getres) (const clockid_t which_clock, struct timespec64 *tp); int (*clock_set) (const clockid_t which_clock, - const struct timespec *tp); - int (*clock_get) (const clockid_t which_clock, struct timespec * tp); + const struct timespec64 *tp); + int (*clock_get) (const clockid_t which_clock, struct timespec64 *tp); int (*clock_adj) (const clockid_t which_clock, struct timex *tx); int (*timer_create) (struct k_itimer *timer); int (*nsleep) (const clockid_t which_clock, int flags, - struct timespec *, struct timespec __user *); + struct timespec64 *, struct timespec __user *); long (*nsleep_restart) (struct restart_block *restart_block); - int (*timer_set) (struct k_itimer * timr, int flags, - struct itimerspec * new_setting, - struct itimerspec * old_setting); - int (*timer_del) (struct k_itimer * timr); + int (*timer_set) (struct k_itimer *timr, int flags, + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting); + int (*timer_del) (struct k_itimer *timr); #define TIMER_RETRY 
1 - void (*timer_get) (struct k_itimer * timr, - struct itimerspec * cur_setting); + void (*timer_get) (struct k_itimer *timr, + struct itimerspec64 *cur_setting); }; extern struct k_clock clock_posix_cpu; diff --git a/include/linux/power/bq24190_charger.h b/include/linux/power/bq24190_charger.h deleted file mode 100644 index 9f0283721cbcbdae0d4379672469944780935598..0000000000000000000000000000000000000000 --- a/include/linux/power/bq24190_charger.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Platform data for the TI bq24190 battery charger driver. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef _BQ24190_CHARGER_H_ -#define _BQ24190_CHARGER_H_ - -struct bq24190_platform_data { - unsigned int gpio_int; /* GPIO pin that's connected to INT# */ -}; - -#endif diff --git a/include/linux/power/max17042_battery.h b/include/linux/power/max17042_battery.h index 522757ac9cd4d9975a5fe8bbbf144efb54551cd4..a7ed29baf44a6ce6c4ee01ba381655884090d6f2 100644 --- a/include/linux/power/max17042_battery.h +++ b/include/linux/power/max17042_battery.h @@ -24,8 +24,15 @@ #define __MAX17042_BATTERY_H_ #define MAX17042_STATUS_BattAbsent (1 << 3) -#define MAX17042_BATTERY_FULL (100) +#define MAX17042_BATTERY_FULL (95) /* Recommended FullSOCThr value */ #define MAX17042_DEFAULT_SNS_RESISTOR (10000) +#define MAX17042_DEFAULT_VMIN (3000) +#define MAX17042_DEFAULT_VMAX (4500) /* LiHV cell max */ +#define MAX17042_DEFAULT_TEMP_MIN (0) /* For sys without temp sensor */ +#define MAX17042_DEFAULT_TEMP_MAX (700) /* 70 degrees Celsius */ + +/* Consider RepCap which is less than 10 units below FullCAP full */ +#define MAX17042_FULL_THRESHOLD 10 #define MAX17042_CHARACTERIZATION_DATA_SIZE 48 diff --git a/include/linux/printk.h b/include/linux/printk.h index 571257e0f53dc669c4f59df19923bd50dfd31a59..e10f274683222358f892096c3bda5fbf7e896015 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -198,7 +198,7 @@ extern void wake_up_klogd(void); char *log_buf_addr_get(void); u32 log_buf_len_get(void); -void log_buf_kexec_setup(void); +void log_buf_vmcoreinfo_setup(void); void __init setup_log_buf(int early); __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); @@ -246,7 +246,7 @@ static inline u32 log_buf_len_get(void) return 0; } -static inline void log_buf_kexec_setup(void) +static inline void log_buf_vmcoreinfo_setup(void) { } diff --git a/include/linux/proc_ns.h b/include/linux/proc_ns.h index 12cb8bd81d2d12b734c83059fb0b0759ee0c92c3..58ab28d81fc2ecd4f78f37a81ac07a56d984f2e5 100644 --- a/include/linux/proc_ns.h +++ b/include/linux/proc_ns.h @@ -14,6 +14,7 @@ struct inode; struct proc_ns_operations { const char *name; + const char *real_ns_name; int type; struct ns_common *(*get)(struct task_struct *task); void (*put)(struct ns_common *ns); @@ -26,6 +27,7 @@ extern const struct proc_ns_operations netns_operations; extern const struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; extern const struct proc_ns_operations pidns_operations; +extern const struct proc_ns_operations pidns_for_children_operations; extern const struct proc_ns_operations userns_operations; extern const struct proc_ns_operations mntns_operations; extern const struct proc_ns_operations cgroupns_operations; diff --git a/include/linux/property.h b/include/linux/property.h index
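The posix-clock and posix-timers hunks above are part of the y2038 conversion: every callback that used to take struct timespec / struct itimerspec now takes the 64-bit timespec64 / itimerspec64 variants. As a minimal sketch (not part of this patch) of a backend written against the new posix_clock_operations signature, where my_read_ns() stands in for a hypothetical hardware counter read:

static int my_clock_gettime(struct posix_clock *pc, struct timespec64 *ts)
{
	u64 ns = my_read_ns();		/* hypothetical counter read */

	*ts = ns_to_timespec64(ns);	/* 64-bit seconds, safe past 2038 */
	return 0;
}

static struct posix_clock_operations my_clock_ops = {
	.owner		= THIS_MODULE,
	.clock_gettime	= my_clock_gettime,
};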
64e3a9c6d95fed6662f75290486ef0c01d245f28..2f482616a2f22e2e26192a0a47a1c1b22d2bf015 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -33,6 +33,8 @@ enum dev_dma_attr { DEV_DMA_COHERENT, }; +struct fwnode_handle *dev_fwnode(struct device *dev); + bool device_property_present(struct device *dev, const char *propname); int device_property_read_u8_array(struct device *dev, const char *propname, u8 *val, size_t nval); @@ -70,6 +72,15 @@ int fwnode_property_read_string(struct fwnode_handle *fwnode, int fwnode_property_match_string(struct fwnode_handle *fwnode, const char *propname, const char *string); +struct fwnode_handle *fwnode_get_parent(struct fwnode_handle *fwnode); +struct fwnode_handle *fwnode_get_next_parent(struct fwnode_handle *fwnode); +struct fwnode_handle *fwnode_get_next_child_node(struct fwnode_handle *fwnode, + struct fwnode_handle *child); + +#define fwnode_for_each_child_node(fwnode, child) \ + for (child = fwnode_get_next_child_node(fwnode, NULL); child; \ + child = fwnode_get_next_child_node(fwnode, child)) + struct fwnode_handle *device_get_next_child_node(struct device *dev, struct fwnode_handle *child); @@ -77,9 +88,12 @@ struct fwnode_handle *device_get_next_child_node(struct device *dev, for (child = device_get_next_child_node(dev, NULL); child; \ child = device_get_next_child_node(dev, child)) +struct fwnode_handle *fwnode_get_named_child_node(struct fwnode_handle *fwnode, + const char *childname); struct fwnode_handle *device_get_named_child_node(struct device *dev, const char *childname); +void fwnode_handle_get(struct fwnode_handle *fwnode); void fwnode_handle_put(struct fwnode_handle *fwnode); unsigned int device_get_child_node_count(struct device *dev); @@ -258,4 +272,16 @@ int device_get_phy_mode(struct device *dev); void *device_get_mac_address(struct device *dev, char *addr, int alen); +struct fwnode_handle *fwnode_graph_get_next_endpoint( + struct fwnode_handle *fwnode, struct fwnode_handle *prev); +struct fwnode_handle *fwnode_graph_get_remote_port_parent( + struct fwnode_handle *fwnode); +struct fwnode_handle *fwnode_graph_get_remote_port( + struct fwnode_handle *fwnode); +struct fwnode_handle *fwnode_graph_get_remote_endpoint( + struct fwnode_handle *fwnode); + +int fwnode_graph_parse_endpoint(struct fwnode_handle *fwnode, + struct fwnode_endpoint *endpoint); + #endif /* _LINUX_PROPERTY_H_ */ diff --git a/include/linux/pstore.h b/include/linux/pstore.h index 0da29cae009b186a9010736b2c9b84210ff7b3a2..e2233f50f4284826d2462ea900a2f430c8db8019 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -30,7 +30,9 @@ #include #include -/* types */ +struct module; + +/* pstore record types (see fs/pstore/inode.c for filename templates) */ enum pstore_type_id { PSTORE_TYPE_DMESG = 0, PSTORE_TYPE_MCE = 1, @@ -45,42 +47,146 @@ enum pstore_type_id { PSTORE_TYPE_UNKNOWN = 255 }; -struct module; +struct pstore_info; +/** + * struct pstore_record - details of a pstore record entry + * @psi: pstore backend driver information + * @type: pstore record type + * @id: per-type unique identifier for record + * @time: timestamp of the record + * @buf: pointer to record contents + * @size: size of @buf + * @ecc_notice_size: + * ECC information for @buf + * + * Valid for PSTORE_TYPE_DMESG @type: + * + * @count: Oops count since boot + * @reason: kdump reason for notification + * @part: position in a multipart record + * @compressed: whether the buffer is compressed + * + */ +struct pstore_record { + struct pstore_info *psi; + enum pstore_type_id 
type; + u64 id; + struct timespec time; + char *buf; + ssize_t size; + ssize_t ecc_notice_size; + int count; + enum kmsg_dump_reason reason; + unsigned int part; + bool compressed; +}; + +/** + * struct pstore_info - backend pstore driver structure + * + * @owner: module which is responsible for this backend driver + * @name: name of the backend driver + * + * @buf_lock: spinlock to serialize access to @buf + * @buf: preallocated crash dump buffer + * @bufsize: size of @buf available for crash dump writes + * + * @read_mutex: serializes @open, @read, @close, and @erase callbacks + * @flags: bitfield of frontends the backend can accept writes for + * @data: backend-private pointer passed back during callbacks + * + * Callbacks: + * + * @open: + * Notify backend that pstore is starting a full read of backend + * records. Followed by one or more @read calls, and a final @close. + * + * @psi: in: pointer to the struct pstore_info for the backend + * + * Returns 0 on success, and non-zero on error. + * + * @close: + * Notify backend that pstore has finished a full read of backend + * records. Always preceded by an @open call and one or more @read + * calls. + * + * @psi: in: pointer to the struct pstore_info for the backend + * + * Returns 0 on success, and non-zero on error. (Though pstore will + * ignore the error.) + * + * @read: + * Read next available backend record. Called after a successful + * @open. + * + * @record: + * pointer to record to populate. @buf should be allocated + * by the backend and filled. At least @type and @id should + * be populated, since these are used when creating pstorefs + * file names. + * + * Returns record size on success, zero when no more records are + * available, or negative on error. + * + * @write: + * A newly generated record needs to be written to backend storage. + * + * @record: + * pointer to record metadata. When @type is PSTORE_TYPE_DMESG, + * @buf will be pointing to the preallocated @psi.buf, since + * memory allocation may be broken during an Oops. Regardless, + * @buf must be processed or copied before returning. The + * backend is also expected to write @id with something that + * can help identify this record to a future @erase callback. + * + * Returns 0 on success, and non-zero on error. + * + * @write_user: + * Perform a frontend write to a backend record, using a specified + * buffer that is coming directly from userspace, instead of the + * @record @buf. + * + * @record: pointer to record metadata. + * @buf: pointer to userspace contents to write to backend + * + * Returns 0 on success, and non-zero on error. + * + * @erase: + * Delete a record from backend storage. Different backends + * identify records differently, so entire original record is + * passed back to assist in identification of what the backend + * should remove from storage. + * + * @record: pointer to record metadata. + * + * Returns 0 on success, and non-zero on error.
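To make the new ops concrete: each callback that previously took a long list of type/id/buf/size arguments now receives a single struct pstore_record. A hedged sketch of a backend @write hook under the new API follows; my_store_blob() is a hypothetical storage primitive, not something this patch provides:

static int my_pstore_write(struct pstore_record *record)
{
	/* This sketch only persists crash dumps. */
	if (record->type != PSTORE_TYPE_DMESG)
		return -EINVAL;

	/*
	 * @buf aliases the preallocated psi->buf for DMESG records, so
	 * copy it out before returning, and record an @id that a later
	 * @erase callback can use to find this entry again.
	 */
	record->id = my_store_blob(record->buf, record->size);
	return 0;
}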
+ * + */ struct pstore_info { struct module *owner; char *name; - spinlock_t buf_lock; /* serialize access to 'buf' */ + + spinlock_t buf_lock; char *buf; size_t bufsize; - struct mutex read_mutex; /* serialize open/read/close */ + + struct mutex read_mutex; + int flags; + void *data; + int (*open)(struct pstore_info *psi); int (*close)(struct pstore_info *psi); - ssize_t (*read)(u64 *id, enum pstore_type_id *type, - int *count, struct timespec *time, char **buf, - bool *compressed, ssize_t *ecc_notice_size, - struct pstore_info *psi); - int (*write)(enum pstore_type_id type, - enum kmsg_dump_reason reason, u64 *id, - unsigned int part, int count, bool compressed, - size_t size, struct pstore_info *psi); - int (*write_buf)(enum pstore_type_id type, - enum kmsg_dump_reason reason, u64 *id, - unsigned int part, const char *buf, bool compressed, - size_t size, struct pstore_info *psi); - int (*write_buf_user)(enum pstore_type_id type, - enum kmsg_dump_reason reason, u64 *id, - unsigned int part, const char __user *buf, - bool compressed, size_t size, struct pstore_info *psi); - int (*erase)(enum pstore_type_id type, u64 id, - int count, struct timespec time, - struct pstore_info *psi); - void *data; + ssize_t (*read)(struct pstore_record *record); + int (*write)(struct pstore_record *record); + int (*write_user)(struct pstore_record *record, + const char __user *buf); + int (*erase)(struct pstore_record *record); }; +/* Supported frontends */ #define PSTORE_FLAGS_DMESG (1 << 0) -#define PSTORE_FLAGS_FRAGILE PSTORE_FLAGS_DMESG #define PSTORE_FLAGS_CONSOLE (1 << 1) #define PSTORE_FLAGS_FTRACE (1 << 2) #define PSTORE_FLAGS_PMSG (1 << 3) diff --git a/include/linux/ptr_ring.h b/include/linux/ptr_ring.h index 6c70444da3b9d3e5c31f04193b96986ff0e4b04b..6b2e0dd88569b13c66ef445609b2f12419fdd3ac 100644 --- a/include/linux/ptr_ring.h +++ b/include/linux/ptr_ring.h @@ -34,11 +34,13 @@ struct ptr_ring { int producer ____cacheline_aligned_in_smp; spinlock_t producer_lock; - int consumer ____cacheline_aligned_in_smp; + int consumer_head ____cacheline_aligned_in_smp; /* next valid entry */ + int consumer_tail; /* next entry to invalidate */ spinlock_t consumer_lock; /* Shared consumer/producer data */ /* Read-only by both the producer and the consumer */ int size ____cacheline_aligned_in_smp; /* max entries in queue */ + int batch; /* number of entries to consume in a batch */ void **queue; }; @@ -170,7 +172,7 @@ static inline int ptr_ring_produce_bh(struct ptr_ring *r, void *ptr) static inline void *__ptr_ring_peek(struct ptr_ring *r) { if (likely(r->size)) - return r->queue[r->consumer]; + return r->queue[r->consumer_head]; return NULL; } @@ -231,9 +233,38 @@ static inline bool ptr_ring_empty_bh(struct ptr_ring *r) /* Must only be called after __ptr_ring_peek returned !NULL */ static inline void __ptr_ring_discard_one(struct ptr_ring *r) { - r->queue[r->consumer++] = NULL; - if (unlikely(r->consumer >= r->size)) - r->consumer = 0; + /* Fundamentally, what we want to do is update consumer + * index and zero out the entry so producer can reuse it. + * Doing it naively at each consume would be as simple as: + * r->queue[r->consumer++] = NULL; + * if (unlikely(r->consumer >= r->size)) + * r->consumer = 0; + * but that is suboptimal when the ring is full as producer is writing + * out new entries in the same cache line. Defer these updates until a + * batch of entries has been consumed. 
+ */ + int head = r->consumer_head++; + + /* Once we have processed enough entries invalidate them in + * the ring all at once so producer can reuse their space in the ring. + * We also do this when we reach end of the ring - not mandatory + * but helps keep the implementation simple. + */ + if (unlikely(r->consumer_head - r->consumer_tail >= r->batch || + r->consumer_head >= r->size)) { + /* Zero out entries in the reverse order: this way we touch the + * cache line that producer might currently be reading the last; + * producer won't make progress and touch other cache lines + * besides the first one until we write out all entries. + */ + while (likely(head >= r->consumer_tail)) + r->queue[head--] = NULL; + r->consumer_tail = r->consumer_head; + } + if (unlikely(r->consumer_head >= r->size)) { + r->consumer_head = 0; + r->consumer_tail = 0; + } } static inline void *__ptr_ring_consume(struct ptr_ring *r) @@ -345,14 +376,27 @@ static inline void **__ptr_ring_init_queue_alloc(int size, gfp_t gfp) return kzalloc(ALIGN(size * sizeof(void *), SMP_CACHE_BYTES), gfp); } +static inline void __ptr_ring_set_size(struct ptr_ring *r, int size) +{ + r->size = size; + r->batch = SMP_CACHE_BYTES * 2 / sizeof(*(r->queue)); + /* We need to set batch at least to 1 to make logic + * in __ptr_ring_discard_one work correctly. + * Batching too much (because ring is small) would cause a lot of + * burstiness. Needs tuning, for now disable batching. + */ + if (r->batch > r->size / 2 || !r->batch) + r->batch = 1; +} + static inline int ptr_ring_init(struct ptr_ring *r, int size, gfp_t gfp) { r->queue = __ptr_ring_init_queue_alloc(size, gfp); if (!r->queue) return -ENOMEM; - r->size = size; - r->producer = r->consumer = 0; + __ptr_ring_set_size(r, size); + r->producer = r->consumer_head = r->consumer_tail = 0; spin_lock_init(&r->producer_lock); spin_lock_init(&r->consumer_lock); @@ -373,9 +417,10 @@ static inline void **__ptr_ring_swap_queue(struct ptr_ring *r, void **queue, else if (destroy) destroy(ptr); - r->size = size; + __ptr_ring_set_size(r, size); r->producer = producer; - r->consumer = 0; + r->consumer_head = 0; + r->consumer_tail = 0; old = r->queue; r->queue = queue; diff --git a/include/linux/qcom_scm.h b/include/linux/qcom_scm.h index d32f6f1a5225f38de2ff8456a09b236028eb1de5..e5380471c2cd2a42edb8d0bbeb231d8b7320ddf0 100644 --- a/include/linux/qcom_scm.h +++ b/include/linux/qcom_scm.h @@ -40,6 +40,9 @@ extern int qcom_scm_pas_shutdown(u32 peripheral); extern void qcom_scm_cpu_power_down(u32 flags); extern u32 qcom_scm_get_version(void); extern int qcom_scm_set_remote_state(u32 state, u32 id); +extern int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare); +extern int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size); +extern int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare); #else static inline int qcom_scm_set_cold_boot_addr(void *entry, const cpumask_t *cpus) @@ -67,5 +70,8 @@ static inline void qcom_scm_cpu_power_down(u32 flags) {} static inline u32 qcom_scm_get_version(void) { return 0; } static inline u32 qcom_scm_set_remote_state(u32 state,u32 id) { return -ENODEV; } +static inline int qcom_scm_restore_sec_cfg(u32 device_id, u32 spare) { return -ENODEV; } +static inline int qcom_scm_iommu_secure_ptbl_size(u32 spare, size_t *size) { return -ENODEV; } +static inline int qcom_scm_iommu_secure_ptbl_init(u64 addr, u32 size, u32 spare) { return -ENODEV; } #endif #endif diff --git a/include/linux/qed/common_hsi.h b/include/linux/qed/common_hsi.h index 
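To put numbers on the ptr_ring batching above (illustrative arithmetic, assuming 64-byte cache lines and 8-byte pointers): __ptr_ring_set_size() computes batch = SMP_CACHE_BYTES * 2 / sizeof(void *) = 16, so __ptr_ring_discard_one() NULLs consumed slots sixteen at a time, in reverse order, rather than dirtying the producer's cache line on every consume; rings smaller than 32 entries fall back to batch = 1. A minimal init sketch:

static int example_ring_setup(struct ptr_ring *r)
{
	/* 1024 slots: batch becomes 16 under the assumptions above. */
	return ptr_ring_init(r, 1024, GFP_KERNEL);
}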
52966b9bfde3740b506664bf2329596cf23695ef..fbab6e0514f07bf0f4a9ac481cb712c58d113b7f 100644 --- a/include/linux/qed/common_hsi.h +++ b/include/linux/qed/common_hsi.h @@ -100,8 +100,8 @@ #define MAX_NUM_LL2_TX_STATS_COUNTERS 32 #define FW_MAJOR_VERSION 8 -#define FW_MINOR_VERSION 10 -#define FW_REVISION_VERSION 10 +#define FW_MINOR_VERSION 15 +#define FW_REVISION_VERSION 3 #define FW_ENGINEERING_VERSION 0 /***********************/ @@ -187,6 +187,9 @@ /* DEMS */ #define DQ_DEMS_LEGACY 0 +#define DQ_DEMS_TOE_MORE_TO_SEND 3 +#define DQ_DEMS_TOE_LOCAL_ADV_WND 4 +#define DQ_DEMS_ROCE_CQ_CONS 7 /* XCM agg val selection */ #define DQ_XCM_AGG_VAL_SEL_WORD2 0 @@ -214,6 +217,9 @@ #define DQ_XCM_ISCSI_MORE_TO_SEND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG3 #define DQ_XCM_ISCSI_EXP_STAT_SN_CMD DQ_XCM_AGG_VAL_SEL_REG6 #define DQ_XCM_ROCE_SQ_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_TOE_TX_BD_PROD_CMD DQ_XCM_AGG_VAL_SEL_WORD4 +#define DQ_XCM_TOE_MORE_TO_SEND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG3 +#define DQ_XCM_TOE_LOCAL_ADV_WND_SEQ_CMD DQ_XCM_AGG_VAL_SEL_REG4 /* UCM agg val selection (HW) */ #define DQ_UCM_AGG_VAL_SEL_WORD0 0 @@ -269,6 +275,8 @@ #define DQ_XCM_ISCSI_DQ_FLUSH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) #define DQ_XCM_ISCSI_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) #define DQ_XCM_ISCSI_PROC_ONLY_CLEANUP_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF23) +#define DQ_XCM_TOE_DQ_FLUSH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF19) +#define DQ_XCM_TOE_SLOW_PATH_CMD BIT(DQ_XCM_AGG_FLG_SHIFT_CF22) /* UCM agg counter flag selection (HW) */ #define DQ_UCM_AGG_FLG_SHIFT_CF0 0 @@ -285,6 +293,9 @@ #define DQ_UCM_ETH_PMD_RX_ARM_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF5) #define DQ_UCM_ROCE_CQ_ARM_SE_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF4) #define DQ_UCM_ROCE_CQ_ARM_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF5) +#define DQ_UCM_TOE_TIMER_STOP_ALL_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF3) +#define DQ_UCM_TOE_SLOW_PATH_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF4) +#define DQ_UCM_TOE_DQ_CF_CMD BIT(DQ_UCM_AGG_FLG_SHIFT_CF5) /* TCM agg counter flag selection (HW) */ #define DQ_TCM_AGG_FLG_SHIFT_CF0 0 @@ -301,6 +312,9 @@ #define DQ_TCM_FCOE_TIMER_STOP_ALL_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF3) #define DQ_TCM_ISCSI_FLUSH_Q0_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF1) #define DQ_TCM_ISCSI_TIMER_STOP_ALL_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF3) +#define DQ_TCM_TOE_FLUSH_Q0_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF1) +#define DQ_TCM_TOE_TIMER_STOP_ALL_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF3) +#define DQ_TCM_IWARP_POST_RQ_CF_CMD BIT(DQ_TCM_AGG_FLG_SHIFT_CF1) /* PWM address mapping */ #define DQ_PWM_OFFSET_DPM_BASE 0x0 @@ -689,6 +703,16 @@ struct iscsi_eqe_data { #define ISCSI_EQE_DATA_RESERVED0_SHIFT 7 }; +struct rdma_eqe_destroy_qp { + __le32 cid; + u8 reserved[4]; +}; + +union rdma_eqe_data { + struct regpair async_handle; + struct rdma_eqe_destroy_qp rdma_destroy_qp_data; +}; + struct malicious_vf_eqe_data { u8 vf_id; u8 err_id; @@ -705,9 +729,9 @@ union event_ring_data { u8 bytes[8]; struct vf_pf_channel_eqe_data vf_pf_channel; struct iscsi_eqe_data iscsi_info; + union rdma_eqe_data rdma_data; struct malicious_vf_eqe_data malicious_vf; struct initial_cleanup_eqe_data vf_init_cleanup; - struct regpair roce_handle; }; /* Event Ring Entry */ diff --git a/include/linux/qed/eth_common.h b/include/linux/qed/eth_common.h index 4b402fb0eaad5fdf7bd29221d95596f092c0e945..34d93eb5bfba346019ba1d2c9014ab8a2fa5fd8f 100644 --- a/include/linux/qed/eth_common.h +++ b/include/linux/qed/eth_common.h @@ -49,6 +49,9 @@ #define ETH_RX_CQE_PAGE_SIZE_BYTES 4096 #define ETH_RX_NUM_NEXT_PAGE_BDS 2 +#define ETH_MAX_TUNN_LSO_INNER_IPV4_OFFSET 253 +#define 
ETH_MAX_TUNN_LSO_INNER_IPV6_OFFSET 251 + #define ETH_TX_MIN_BDS_PER_NON_LSO_PKT 1 #define ETH_TX_MAX_BDS_PER_NON_LSO_PACKET 18 #define ETH_TX_MAX_BDS_PER_LSO_PACKET 255 diff --git a/include/linux/qed/fcoe_common.h b/include/linux/qed/fcoe_common.h index 2e417a45c5f7028ba1fc2b6201fc828f1fef4247..947a635d04bb57ff15a61f1ee82c41ae27c1d4e5 100644 --- a/include/linux/qed/fcoe_common.h +++ b/include/linux/qed/fcoe_common.h @@ -109,13 +109,6 @@ struct fcoe_conn_terminate_ramrod_data { struct regpair terminate_params_addr; }; -struct fcoe_fast_sgl_ctx { - struct regpair sgl_start_addr; - __le32 sgl_byte_offset; - __le16 task_reuse_cnt; - __le16 init_offset_in_first_sge; -}; - struct fcoe_slow_sgl_ctx { struct regpair base_sgl_addr; __le16 curr_sge_off; @@ -124,23 +117,16 @@ struct fcoe_slow_sgl_ctx { __le16 reserved; }; -struct fcoe_sge { - struct regpair sge_addr; - __le16 size; - __le16 reserved0; - u8 reserved1[3]; - u8 is_valid_sge; -}; - -union fcoe_data_desc_ctx { - struct fcoe_fast_sgl_ctx fast; - struct fcoe_slow_sgl_ctx slow; - struct fcoe_sge single_sge; -}; - union fcoe_dix_desc_ctx { struct fcoe_slow_sgl_ctx dix_sgl; - struct fcoe_sge cached_dix_sge; + struct scsi_sge cached_dix_sge; +}; + +struct fcoe_fast_sgl_ctx { + struct regpair sgl_start_addr; + __le32 sgl_byte_offset; + __le16 task_reuse_cnt; + __le16 init_offset_in_first_sge; }; struct fcoe_fcp_cmd_payload { @@ -172,57 +158,6 @@ enum fcoe_mode_type { MAX_FCOE_MODE_TYPE }; -struct fcoe_mstorm_fcoe_task_st_ctx_fp { - __le16 flags; -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_FP_RSRV0_MASK 0x7FFF -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_FP_RSRV0_SHIFT 0 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_FP_MP_INCLUDE_FC_HEADER_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_FP_MP_INCLUDE_FC_HEADER_SHIFT 15 - __le16 difDataResidue; - __le16 parent_id; - __le16 single_sge_saved_offset; - __le32 data_2_trns_rem; - __le32 offset_in_io; - union fcoe_dix_desc_ctx dix_desc; - union fcoe_data_desc_ctx data_desc; -}; - -struct fcoe_mstorm_fcoe_task_st_ctx_non_fp { - __le16 flags; -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_HOST_INTERFACE_MASK 0x3 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_HOST_INTERFACE_SHIFT 0 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIF_TO_PEER_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIF_TO_PEER_SHIFT 2 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_VALIDATE_DIX_APP_TAG_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_VALIDATE_DIX_APP_TAG_SHIFT 3 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_INTERVAL_SIZE_LOG_MASK 0xF -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_INTERVAL_SIZE_LOG_SHIFT 4 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIX_BLOCK_SIZE_MASK 0x3 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIX_BLOCK_SIZE_SHIFT 8 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RESERVED_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RESERVED_SHIFT 10 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_HAS_FIRST_PACKET_ARRIVED_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_HAS_FIRST_PACKET_ARRIVED_SHIFT 11 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_VALIDATE_DIX_REF_TAG_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_VALIDATE_DIX_REF_TAG_SHIFT 12 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIX_CACHED_SGE_FLG_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIX_CACHED_SGE_FLG_SHIFT 13 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_OFFSET_IN_IO_VALID_MASK 0x1 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_OFFSET_IN_IO_VALID_SHIFT 14 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIF_SUPPORTED_MASK 0x1 -#define 
FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_DIF_SUPPORTED_SHIFT 15 - u8 tx_rx_sgl_mode; -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_TX_SGL_MODE_MASK 0x7 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_TX_SGL_MODE_SHIFT 0 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RX_SGL_MODE_MASK 0x7 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RX_SGL_MODE_SHIFT 3 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RSRV1_MASK 0x3 -#define FCOE_MSTORM_FCOE_TASK_ST_CTX_NON_FP_RSRV1_SHIFT 6 - u8 rsrv2; - __le32 num_prm_zero_read; - struct regpair rsp_buf_addr; -}; - struct fcoe_rx_stat { struct regpair fcoe_rx_byte_cnt; struct regpair fcoe_rx_data_pkt_cnt; @@ -236,16 +171,6 @@ struct fcoe_rx_stat { __le32 rsrv; }; -enum fcoe_sgl_mode { - FCOE_SLOW_SGL, - FCOE_SINGLE_FAST_SGE, - FCOE_2_FAST_SGE, - FCOE_3_FAST_SGE, - FCOE_4_FAST_SGE, - FCOE_MUL_FAST_SGES, - MAX_FCOE_SGL_MODE -}; - struct fcoe_stat_ramrod_data { struct regpair stat_params_addr; }; @@ -328,22 +253,24 @@ union fcoe_tx_info_union_ctx { struct ystorm_fcoe_task_st_ctx { u8 task_type; u8 sgl_mode; -#define YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_MASK 0x7 +#define YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_MASK 0x1 #define YSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_SHIFT 0 -#define YSTORM_FCOE_TASK_ST_CTX_RSRV_MASK 0x1F -#define YSTORM_FCOE_TASK_ST_CTX_RSRV_SHIFT 3 +#define YSTORM_FCOE_TASK_ST_CTX_RSRV_MASK 0x7F +#define YSTORM_FCOE_TASK_ST_CTX_RSRV_SHIFT 1 u8 cached_dix_sge; u8 expect_first_xfer; __le32 num_pbf_zero_write; union protection_info_union_ctx protection_info_union; __le32 data_2_trns_rem; + struct scsi_sgl_params sgl_params; + u8 reserved1[12]; union fcoe_tx_info_union_ctx tx_info_union; union fcoe_dix_desc_ctx dix_desc; - union fcoe_data_desc_ctx data_desc; + struct scsi_cached_sges data_desc; __le16 ox_id; __le16 rx_id; __le32 task_rety_identifier; - __le32 reserved1[2]; + u8 reserved2[8]; }; struct ystorm_fcoe_task_ag_ctx { @@ -484,22 +411,22 @@ struct tstorm_fcoe_task_ag_ctx { struct fcoe_tstorm_fcoe_task_st_ctx_read_write { union fcoe_cleanup_addr_exp_ro_union cleanup_addr_exp_ro_union; __le16 flags; -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_MASK 0x7 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_MASK 0x1 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RX_SGL_MODE_SHIFT 0 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_MASK 0x1 -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_SHIFT 3 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_EXP_FIRST_FRAME_SHIFT 1 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_MASK 0x1 -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_SHIFT 4 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_ACTIVE_SHIFT 2 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_MASK 0x1 -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_SHIFT 5 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SEQ_TIMEOUT_SHIFT 3 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_MASK 0x1 -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_SHIFT 6 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_SINGLE_PKT_IN_EX_SHIFT 4 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_MASK 0x1 -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_SHIFT 7 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_OOO_RX_SEQ_STAT_SHIFT 5 #define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_MASK 0x3 -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_SHIFT 8 -#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_MASK 0x3F -#define 
FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_SHIFT 10 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_CQ_ADD_ADV_SHIFT 6 +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_MASK 0xFF +#define FCOE_TSTORM_FCOE_TASK_ST_CTX_READ_WRITE_RSRV1_SHIFT 8 __le16 seq_cnt; u8 seq_id; u8 ooo_rx_seq_id; @@ -582,8 +509,34 @@ struct mstorm_fcoe_task_ag_ctx { }; struct mstorm_fcoe_task_st_ctx { - struct fcoe_mstorm_fcoe_task_st_ctx_non_fp non_fp; - struct fcoe_mstorm_fcoe_task_st_ctx_fp fp; + struct regpair rsp_buf_addr; + __le32 rsrv[2]; + struct scsi_sgl_params sgl_params; + __le32 data_2_trns_rem; + __le32 data_buffer_offset; + __le16 parent_id; + __le16 flags; +#define MSTORM_FCOE_TASK_ST_CTX_INTERVAL_SIZE_LOG_MASK 0xF +#define MSTORM_FCOE_TASK_ST_CTX_INTERVAL_SIZE_LOG_SHIFT 0 +#define MSTORM_FCOE_TASK_ST_CTX_HOST_INTERFACE_MASK 0x3 +#define MSTORM_FCOE_TASK_ST_CTX_HOST_INTERFACE_SHIFT 4 +#define MSTORM_FCOE_TASK_ST_CTX_DIF_TO_PEER_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_DIF_TO_PEER_SHIFT 6 +#define MSTORM_FCOE_TASK_ST_CTX_MP_INCLUDE_FC_HEADER_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_MP_INCLUDE_FC_HEADER_SHIFT 7 +#define MSTORM_FCOE_TASK_ST_CTX_DIX_BLOCK_SIZE_MASK 0x3 +#define MSTORM_FCOE_TASK_ST_CTX_DIX_BLOCK_SIZE_SHIFT 8 +#define MSTORM_FCOE_TASK_ST_CTX_VALIDATE_DIX_REF_TAG_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_VALIDATE_DIX_REF_TAG_SHIFT 10 +#define MSTORM_FCOE_TASK_ST_CTX_DIX_CACHED_SGE_FLG_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_DIX_CACHED_SGE_FLG_SHIFT 11 +#define MSTORM_FCOE_TASK_ST_CTX_DIF_SUPPORTED_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_DIF_SUPPORTED_SHIFT 12 +#define MSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_MASK 0x1 +#define MSTORM_FCOE_TASK_ST_CTX_TX_SGL_MODE_SHIFT 13 +#define MSTORM_FCOE_TASK_ST_CTX_RESERVED_MASK 0x3 +#define MSTORM_FCOE_TASK_ST_CTX_RESERVED_SHIFT 14 + struct scsi_cached_sges data_desc; }; struct ustorm_fcoe_task_ag_ctx { @@ -646,6 +599,7 @@ struct ustorm_fcoe_task_ag_ctx { struct fcoe_task_context { struct ystorm_fcoe_task_st_ctx ystorm_st_context; + struct regpair ystorm_st_padding[2]; struct tdif_task_context tdif_context; struct ystorm_fcoe_task_ag_ctx ystorm_ag_context; struct tstorm_fcoe_task_ag_ctx tstorm_ag_context; @@ -668,20 +622,20 @@ struct fcoe_tx_stat { struct fcoe_wqe { __le16 task_id; __le16 flags; -#define FCOE_WQE_REQ_TYPE_MASK 0xF -#define FCOE_WQE_REQ_TYPE_SHIFT 0 -#define FCOE_WQE_SGL_MODE_MASK 0x7 -#define FCOE_WQE_SGL_MODE_SHIFT 4 -#define FCOE_WQE_CONTINUATION_MASK 0x1 -#define FCOE_WQE_CONTINUATION_SHIFT 7 -#define FCOE_WQE_INVALIDATE_PTU_MASK 0x1 -#define FCOE_WQE_INVALIDATE_PTU_SHIFT 8 -#define FCOE_WQE_SUPER_IO_MASK 0x1 -#define FCOE_WQE_SUPER_IO_SHIFT 9 -#define FCOE_WQE_SEND_AUTO_RSP_MASK 0x1 -#define FCOE_WQE_SEND_AUTO_RSP_SHIFT 10 -#define FCOE_WQE_RESERVED0_MASK 0x1F -#define FCOE_WQE_RESERVED0_SHIFT 11 +#define FCOE_WQE_REQ_TYPE_MASK 0xF +#define FCOE_WQE_REQ_TYPE_SHIFT 0 +#define FCOE_WQE_SGL_MODE_MASK 0x1 +#define FCOE_WQE_SGL_MODE_SHIFT 4 +#define FCOE_WQE_CONTINUATION_MASK 0x1 +#define FCOE_WQE_CONTINUATION_SHIFT 5 +#define FCOE_WQE_SEND_AUTO_RSP_MASK 0x1 +#define FCOE_WQE_SEND_AUTO_RSP_SHIFT 6 +#define FCOE_WQE_RESERVED_MASK 0x1 +#define FCOE_WQE_RESERVED_SHIFT 7 +#define FCOE_WQE_NUM_SGES_MASK 0xF +#define FCOE_WQE_NUM_SGES_SHIFT 8 +#define FCOE_WQE_RESERVED1_MASK 0xF +#define FCOE_WQE_RESERVED1_SHIFT 12 union fcoe_additional_info_union additional_info_union; }; diff --git a/include/linux/qed/iscsi_common.h b/include/linux/qed/iscsi_common.h index 
4c5747babcf63ff32b8db664146df40545d986d2..69949f8e354b0447c7950884bd205622a4653ab2 100644 --- a/include/linux/qed/iscsi_common.h +++ b/include/linux/qed/iscsi_common.h @@ -39,17 +39,9 @@ /* iSCSI HSI constants */ #define ISCSI_DEFAULT_MTU (1500) -/* Current iSCSI HSI version number composed of two fields (16 bit) */ -#define ISCSI_HSI_MAJOR_VERSION (0) -#define ISCSI_HSI_MINOR_VERSION (0) - /* KWQ (kernel work queue) layer codes */ #define ISCSI_SLOW_PATH_LAYER_CODE (6) -/* CQE completion status */ -#define ISCSI_EQE_COMPLETION_SUCCESS (0x0) -#define ISCSI_EQE_RST_CONN_RCVD (0x1) - /* iSCSI parameter defaults */ #define ISCSI_DEFAULT_HEADER_DIGEST (0) #define ISCSI_DEFAULT_DATA_DIGEST (0) @@ -68,6 +60,10 @@ #define ISCSI_MIN_VAL_MAX_OUTSTANDING_R2T (1) #define ISCSI_MAX_VAL_MAX_OUTSTANDING_R2T (0xff) +#define ISCSI_AHS_CNTL_SIZE 4 + +#define ISCSI_WQE_NUM_SGES_SLOWIO (0xf) + /* iSCSI reserved params */ #define ISCSI_ITT_ALL_ONES (0xffffffff) #define ISCSI_TTT_ALL_ONES (0xffffffff) @@ -173,19 +169,6 @@ struct iscsi_async_msg_hdr { __le32 reserved7; }; -struct iscsi_sge { - struct regpair sge_addr; - __le16 sge_len; - __le16 reserved0; - __le32 reserved1; -}; - -struct iscsi_cached_sge_ctx { - struct iscsi_sge sge; - struct regpair reserved; - __le32 dsgl_curr_offset[2]; -}; - struct iscsi_cmd_hdr { __le16 reserved1; u8 flags_attr; @@ -229,8 +212,13 @@ struct iscsi_common_hdr { #define ISCSI_COMMON_HDR_DATA_SEG_LEN_SHIFT 0 #define ISCSI_COMMON_HDR_TOTAL_AHS_LEN_MASK 0xFF #define ISCSI_COMMON_HDR_TOTAL_AHS_LEN_SHIFT 24 - __le32 lun_reserved[4]; - __le32 data[6]; + struct regpair lun_reserved; + __le32 itt; + __le32 ttt; + __le32 cmdstat_sn; + __le32 exp_statcmd_sn; + __le32 max_cmd_sn; + __le32 data[3]; }; struct iscsi_conn_offload_params { @@ -246,8 +234,10 @@ struct iscsi_conn_offload_params { #define ISCSI_CONN_OFFLOAD_PARAMS_TCP_ON_CHIP_1B_SHIFT 0 #define ISCSI_CONN_OFFLOAD_PARAMS_TARGET_MODE_MASK 0x1 #define ISCSI_CONN_OFFLOAD_PARAMS_TARGET_MODE_SHIFT 1 -#define ISCSI_CONN_OFFLOAD_PARAMS_RESERVED1_MASK 0x3F -#define ISCSI_CONN_OFFLOAD_PARAMS_RESERVED1_SHIFT 2 +#define ISCSI_CONN_OFFLOAD_PARAMS_RESTRICTED_MODE_MASK 0x1 +#define ISCSI_CONN_OFFLOAD_PARAMS_RESTRICTED_MODE_SHIFT 2 +#define ISCSI_CONN_OFFLOAD_PARAMS_RESERVED1_MASK 0x1F +#define ISCSI_CONN_OFFLOAD_PARAMS_RESERVED1_SHIFT 3 u8 pbl_page_size_log; u8 pbe_page_size_log; u8 default_cq; @@ -278,8 +268,12 @@ struct iscsi_conn_update_ramrod_params { #define ISCSI_CONN_UPDATE_RAMROD_PARAMS_INITIAL_R2T_SHIFT 2 #define ISCSI_CONN_UPDATE_RAMROD_PARAMS_IMMEDIATE_DATA_MASK 0x1 #define ISCSI_CONN_UPDATE_RAMROD_PARAMS_IMMEDIATE_DATA_SHIFT 3 -#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_RESERVED1_MASK 0xF -#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_RESERVED1_SHIFT 4 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_BLOCK_SIZE_MASK 0x1 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_BLOCK_SIZE_SHIFT 4 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_ON_HOST_EN_MASK 0x1 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_DIF_ON_HOST_EN_SHIFT 5 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_RESERVED1_MASK 0x3 +#define ISCSI_CONN_UPDATE_RAMROD_PARAMS_RESERVED1_SHIFT 6 u8 reserved0[3]; __le32 max_seq_size; __le32 max_send_pdu_length; @@ -312,7 +306,7 @@ struct iscsi_ext_cdb_cmd_hdr { __le32 expected_transfer_length; __le32 cmd_sn; __le32 exp_stat_sn; - struct iscsi_sge cdb_sge; + struct scsi_sge cdb_sge; }; struct iscsi_login_req_hdr { @@ -519,8 +513,8 @@ struct iscsi_logout_response_hdr { __le32 exp_cmd_sn; __le32 max_cmd_sn; __le32 reserved4; - __le16 time2retain; - __le16 time2wait; 
+ __le16 time_2_retain; + __le16 time_2_wait; __le32 reserved5[1]; }; @@ -602,7 +596,7 @@ struct iscsi_tmf_response_hdr { #define ISCSI_TMF_RESPONSE_HDR_TOTAL_AHS_LEN_SHIFT 24 struct regpair reserved0; __le32 itt; - __le32 rtt; + __le32 reserved1; __le32 stat_sn; __le32 exp_cmd_sn; __le32 max_cmd_sn; @@ -641,7 +635,7 @@ struct iscsi_reject_hdr { #define ISCSI_REJECT_HDR_TOTAL_AHS_LEN_MASK 0xFF #define ISCSI_REJECT_HDR_TOTAL_AHS_LEN_SHIFT 24 struct regpair reserved0; - __le32 reserved1; + __le32 all_ones; __le32 reserved2; __le32 stat_sn; __le32 exp_cmd_sn; @@ -688,7 +682,9 @@ struct iscsi_cqe_solicited { __le16 itid; u8 task_type; u8 fw_dbg_field; - __le32 reserved1[2]; + u8 caused_conn_err; + u8 reserved0[3]; + __le32 reserved1[1]; union iscsi_task_hdr iscsi_hdr; }; @@ -727,35 +723,6 @@ enum iscsi_cqe_unsolicited_type { MAX_ISCSI_CQE_UNSOLICITED_TYPE }; -struct iscsi_virt_sgl_ctx { - struct regpair sgl_base; - struct regpair dsgl_base; - __le32 sgl_initial_offset; - __le32 dsgl_initial_offset; - __le32 dsgl_curr_offset[2]; -}; - -struct iscsi_sgl_var_params { - u8 sgl_ptr; - u8 dsgl_ptr; - __le16 sge_offset; - __le16 dsge_offset; -}; - -struct iscsi_phys_sgl_ctx { - struct regpair sgl_base; - struct regpair dsgl_base; - u8 sgl_size; - u8 dsgl_size; - __le16 reserved; - struct iscsi_sgl_var_params var_params[2]; -}; - -union iscsi_data_desc_ctx { - struct iscsi_virt_sgl_ctx virt_sgl; - struct iscsi_phys_sgl_ctx phys_sgl; - struct iscsi_cached_sge_ctx cached_sge; -}; struct iscsi_debug_modes { u8 flags; @@ -771,8 +738,10 @@ struct iscsi_debug_modes { #define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_REJECT_OR_ASYNC_SHIFT 4 #define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_NOP_MASK 0x1 #define ISCSI_DEBUG_MODES_ASSERT_IF_RECV_NOP_SHIFT 5 -#define ISCSI_DEBUG_MODES_RESERVED0_MASK 0x3 -#define ISCSI_DEBUG_MODES_RESERVED0_SHIFT 6 +#define ISCSI_DEBUG_MODES_ASSERT_IF_DATA_DIGEST_ERROR_MASK 0x1 +#define ISCSI_DEBUG_MODES_ASSERT_IF_DATA_DIGEST_ERROR_SHIFT 6 +#define ISCSI_DEBUG_MODES_ASSERT_IF_DIF_ERROR_MASK 0x1 +#define ISCSI_DEBUG_MODES_ASSERT_IF_DIF_ERROR_SHIFT 7 }; struct iscsi_dif_flags { @@ -806,7 +775,6 @@ enum iscsi_eqe_opcode { ISCSI_EVENT_TYPE_ASYN_FIN_WAIT2, ISCSI_EVENT_TYPE_ISCSI_CONN_ERROR, ISCSI_EVENT_TYPE_TCP_CONN_ERROR, - ISCSI_EVENT_TYPE_ASYN_DELETE_OOO_ISLES, MAX_ISCSI_EQE_OPCODE }; @@ -856,31 +824,11 @@ enum iscsi_error_types { ISCSI_CONN_ERROR_PROTOCOL_ERR_DIF_TX, ISCSI_CONN_ERROR_SENSE_DATA_LENGTH, ISCSI_CONN_ERROR_DATA_PLACEMENT_ERROR, + ISCSI_CONN_ERROR_INVALID_ITT, ISCSI_ERROR_UNKNOWN, MAX_ISCSI_ERROR_TYPES }; -struct iscsi_mflags { - u8 mflags; -#define ISCSI_MFLAGS_SLOW_IO_MASK 0x1 -#define ISCSI_MFLAGS_SLOW_IO_SHIFT 0 -#define ISCSI_MFLAGS_SINGLE_SGE_MASK 0x1 -#define ISCSI_MFLAGS_SINGLE_SGE_SHIFT 1 -#define ISCSI_MFLAGS_RESERVED_MASK 0x3F -#define ISCSI_MFLAGS_RESERVED_SHIFT 2 -}; - -struct iscsi_sgl { - struct regpair sgl_addr; - __le16 updated_sge_size; - __le16 updated_sge_offset; - __le32 byte_offset; -}; - -union iscsi_mstorm_sgl { - struct iscsi_sgl sgl_struct; - struct iscsi_sge single_sge; -}; enum iscsi_ramrod_cmd_id { ISCSI_RAMROD_CMD_ID_UNUSED = 0, @@ -896,10 +844,10 @@ enum iscsi_ramrod_cmd_id { struct iscsi_reg1 { __le32 reg1_map; -#define ISCSI_REG1_NUM_FAST_SGES_MASK 0x7 -#define ISCSI_REG1_NUM_FAST_SGES_SHIFT 0 -#define ISCSI_REG1_RESERVED1_MASK 0x1FFFFFFF -#define ISCSI_REG1_RESERVED1_SHIFT 3 +#define ISCSI_REG1_NUM_SGES_MASK 0xF +#define ISCSI_REG1_NUM_SGES_SHIFT 0 +#define ISCSI_REG1_RESERVED1_MASK 0xFFFFFFF +#define ISCSI_REG1_RESERVED1_SHIFT 4 }; union iscsi_seq_num 
{ @@ -967,22 +915,33 @@ struct iscsi_spe_func_init { }; struct ystorm_iscsi_task_state { - union iscsi_data_desc_ctx sgl_ctx_union; - __le32 buffer_offset[2]; - __le16 bytes_nxt_dif; - __le16 rxmit_bytes_nxt_dif; - union iscsi_seq_num seq_num_union; - u8 dif_bytes_leftover; - u8 rxmit_dif_bytes_leftover; - __le16 reuse_count; - struct iscsi_dif_flags dif_flags; - u8 local_comp; + struct scsi_cached_sges data_desc; + struct scsi_sgl_params sgl_params; __le32 exp_r2t_sn; - __le32 sgl_offset[2]; + __le32 buffer_offset; + union iscsi_seq_num seq_num; + struct iscsi_dif_flags dif_flags; + u8 flags; +#define YSTORM_ISCSI_TASK_STATE_LOCAL_COMP_MASK 0x1 +#define YSTORM_ISCSI_TASK_STATE_LOCAL_COMP_SHIFT 0 +#define YSTORM_ISCSI_TASK_STATE_SLOW_IO_MASK 0x1 +#define YSTORM_ISCSI_TASK_STATE_SLOW_IO_SHIFT 1 +#define YSTORM_ISCSI_TASK_STATE_RESERVED0_MASK 0x3F +#define YSTORM_ISCSI_TASK_STATE_RESERVED0_SHIFT 2 +}; + +struct ystorm_iscsi_task_rxmit_opt { + __le32 fast_rxmit_sge_offset; + __le32 scan_start_buffer_offset; + __le32 fast_rxmit_buffer_offset; + u8 scan_start_sgl_index; + u8 fast_rxmit_sgl_index; + __le16 reserved; }; struct ystorm_iscsi_task_st_ctx { struct ystorm_iscsi_task_state state; + struct ystorm_iscsi_task_rxmit_opt rxmit_opt; union iscsi_task_hdr pdu_hdr; }; @@ -1152,25 +1111,16 @@ struct ustorm_iscsi_task_ag_ctx { }; struct mstorm_iscsi_task_st_ctx { - union iscsi_mstorm_sgl sgl_union; - struct iscsi_dif_flags dif_flags; - struct iscsi_mflags flags; - u8 sgl_size; - u8 host_sge_index; - __le16 dix_cur_sge_offset; - __le16 dix_cur_sge_size; - __le32 data_offset_rtid; - u8 dif_offset; - u8 dix_sgl_size; - u8 dix_sge_index; + struct scsi_cached_sges data_desc; + struct scsi_sgl_params sgl_params; + __le32 rem_task_size; + __le32 data_buffer_offset; u8 task_type; + struct iscsi_dif_flags dif_flags; + u8 reserved0[2]; struct regpair sense_db; - struct regpair dix_sgl_cur_sge; - __le32 rem_task_size; - __le16 reuse_count; - __le16 dif_data_residue; - u8 reserved0[4]; - __le32 reserved1[1]; + __le32 expected_itt; + __le32 reserved1; }; struct ustorm_iscsi_task_st_ctx { @@ -1184,7 +1134,7 @@ struct ustorm_iscsi_task_st_ctx { #define USTORM_ISCSI_TASK_ST_CTX_AHS_EXIST_SHIFT 0 #define USTORM_ISCSI_TASK_ST_CTX_RESERVED1_MASK 0x7F #define USTORM_ISCSI_TASK_ST_CTX_RESERVED1_SHIFT 1 - u8 reserved2; + struct iscsi_dif_flags dif_flags; __le16 reserved3; __le32 reserved4; __le32 reserved5; @@ -1207,10 +1157,10 @@ struct ustorm_iscsi_task_st_ctx { #define USTORM_ISCSI_TASK_ST_CTX_LOCAL_COMP_SHIFT 2 #define USTORM_ISCSI_TASK_ST_CTX_Q0_R2TQE_WRITE_MASK 0x1 #define USTORM_ISCSI_TASK_ST_CTX_Q0_R2TQE_WRITE_SHIFT 3 -#define USTORM_ISCSI_TASK_ST_CTX_TOTALDATAACKED_DONE_MASK 0x1 -#define USTORM_ISCSI_TASK_ST_CTX_TOTALDATAACKED_DONE_SHIFT 4 -#define USTORM_ISCSI_TASK_ST_CTX_HQSCANNED_DONE_MASK 0x1 -#define USTORM_ISCSI_TASK_ST_CTX_HQSCANNED_DONE_SHIFT 5 +#define USTORM_ISCSI_TASK_ST_CTX_TOTAL_DATA_ACKED_DONE_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_TOTAL_DATA_ACKED_DONE_SHIFT 4 +#define USTORM_ISCSI_TASK_ST_CTX_HQ_SCANNED_DONE_MASK 0x1 +#define USTORM_ISCSI_TASK_ST_CTX_HQ_SCANNED_DONE_SHIFT 5 #define USTORM_ISCSI_TASK_ST_CTX_R2T2RECV_DONE_MASK 0x1 #define USTORM_ISCSI_TASK_ST_CTX_R2T2RECV_DONE_SHIFT 6 #define USTORM_ISCSI_TASK_ST_CTX_RESERVED0_MASK 0x1 @@ -1220,7 +1170,6 @@ struct ustorm_iscsi_task_st_ctx { struct iscsi_task_context { struct ystorm_iscsi_task_st_ctx ystorm_st_context; - struct regpair ystorm_st_padding[2]; struct ystorm_iscsi_task_ag_ctx ystorm_ag_context; struct regpair 
ystorm_ag_padding[2]; struct tdif_task_context tdif_context; @@ -1272,32 +1221,22 @@ struct iscsi_uhqe { #define ISCSI_UHQE_TASK_ID_LO_SHIFT 24 }; -struct iscsi_wqe_field { - __le32 contlen_cdbsize_field; -#define ISCSI_WQE_FIELD_CONT_LEN_MASK 0xFFFFFF -#define ISCSI_WQE_FIELD_CONT_LEN_SHIFT 0 -#define ISCSI_WQE_FIELD_CDB_SIZE_MASK 0xFF -#define ISCSI_WQE_FIELD_CDB_SIZE_SHIFT 24 -}; - -union iscsi_wqe_field_union { - struct iscsi_wqe_field cont_field; - __le32 prev_tid; -}; struct iscsi_wqe { __le16 task_id; u8 flags; #define ISCSI_WQE_WQE_TYPE_MASK 0x7 #define ISCSI_WQE_WQE_TYPE_SHIFT 0 -#define ISCSI_WQE_NUM_FAST_SGES_MASK 0x7 -#define ISCSI_WQE_NUM_FAST_SGES_SHIFT 3 -#define ISCSI_WQE_PTU_INVALIDATE_MASK 0x1 -#define ISCSI_WQE_PTU_INVALIDATE_SHIFT 6 +#define ISCSI_WQE_NUM_SGES_MASK 0xF +#define ISCSI_WQE_NUM_SGES_SHIFT 3 #define ISCSI_WQE_RESPONSE_MASK 0x1 #define ISCSI_WQE_RESPONSE_SHIFT 7 struct iscsi_dif_flags prot_flags; - union iscsi_wqe_field_union cont_prevtid_union; + __le32 contlen_cdbsize; +#define ISCSI_WQE_CONT_LEN_MASK 0xFFFFFF +#define ISCSI_WQE_CONT_LEN_SHIFT 0 +#define ISCSI_WQE_CDB_SIZE_MASK 0xFF +#define ISCSI_WQE_CDB_SIZE_SHIFT 24 }; enum iscsi_wqe_type { @@ -1318,17 +1257,15 @@ struct iscsi_xhqe { u8 total_ahs_length; u8 opcode; u8 flags; -#define ISCSI_XHQE_NUM_FAST_SGES_MASK 0x7 -#define ISCSI_XHQE_NUM_FAST_SGES_SHIFT 0 -#define ISCSI_XHQE_FINAL_MASK 0x1 -#define ISCSI_XHQE_FINAL_SHIFT 3 -#define ISCSI_XHQE_SUPER_IO_MASK 0x1 -#define ISCSI_XHQE_SUPER_IO_SHIFT 4 -#define ISCSI_XHQE_STATUS_BIT_MASK 0x1 -#define ISCSI_XHQE_STATUS_BIT_SHIFT 5 -#define ISCSI_XHQE_RESERVED_MASK 0x3 -#define ISCSI_XHQE_RESERVED_SHIFT 6 - union iscsi_seq_num seq_num_union; +#define ISCSI_XHQE_FINAL_MASK 0x1 +#define ISCSI_XHQE_FINAL_SHIFT 0 +#define ISCSI_XHQE_STATUS_BIT_MASK 0x1 +#define ISCSI_XHQE_STATUS_BIT_SHIFT 1 +#define ISCSI_XHQE_NUM_SGES_MASK 0xF +#define ISCSI_XHQE_NUM_SGES_SHIFT 2 +#define ISCSI_XHQE_RESERVED0_MASK 0x3 +#define ISCSI_XHQE_RESERVED0_SHIFT 6 + union iscsi_seq_num seq_num; __le16 reserved1; }; diff --git a/include/linux/qed/qed_eth_if.h b/include/linux/qed/qed_eth_if.h index 4cd1f0ccfa367a5215dd121ca22efa6267e7c51e..d66d16a559e19e37516abe08939fa9e0a712aa60 100644 --- a/include/linux/qed/qed_eth_if.h +++ b/include/linux/qed/qed_eth_if.h @@ -158,15 +158,27 @@ struct qed_tunn_params { struct qed_eth_cb_ops { struct qed_common_cb_ops common; void (*force_mac) (void *dev, u8 *mac, bool forced); + void (*ports_update)(void *dev, u16 vxlan_port, u16 geneve_port); }; #define QED_MAX_PHC_DRIFT_PPB 291666666 enum qed_ptp_filter_type { - QED_PTP_FILTER_L2, - QED_PTP_FILTER_IPV4, - QED_PTP_FILTER_IPV4_IPV6, - QED_PTP_FILTER_L2_IPV4_IPV6 + QED_PTP_FILTER_NONE, + QED_PTP_FILTER_ALL, + QED_PTP_FILTER_V1_L4_EVENT, + QED_PTP_FILTER_V1_L4_GEN, + QED_PTP_FILTER_V2_L4_EVENT, + QED_PTP_FILTER_V2_L4_GEN, + QED_PTP_FILTER_V2_L2_EVENT, + QED_PTP_FILTER_V2_L2_GEN, + QED_PTP_FILTER_V2_EVENT, + QED_PTP_FILTER_V2_GEN +}; + +enum qed_ptp_hwtstamp_tx_type { + QED_PTP_HWTSTAMP_TX_OFF, + QED_PTP_HWTSTAMP_TX_ON, }; #ifdef CONFIG_DCB @@ -229,8 +241,8 @@ struct qed_eth_dcbnl_ops { #endif struct qed_eth_ptp_ops { - int (*hwtstamp_tx_on)(struct qed_dev *); - int (*cfg_rx_filters)(struct qed_dev *, enum qed_ptp_filter_type); + int (*cfg_filters)(struct qed_dev *, enum qed_ptp_filter_type, + enum qed_ptp_hwtstamp_tx_type); int (*read_rx_ts)(struct qed_dev *, u64 *); int (*read_tx_ts)(struct qed_dev *, u64 *); int (*read_cc)(struct qed_dev *, u64 *); @@ -301,6 +313,14 @@ struct qed_eth_ops { int 
(*tunn_config)(struct qed_dev *cdev, struct qed_tunn_params *params); + + int (*ntuple_filter_config)(struct qed_dev *cdev, void *cookie, + dma_addr_t mapping, u16 length, + u16 vport_id, u16 rx_queue_id, + bool add_filter); + + int (*configure_arfs_searcher)(struct qed_dev *cdev, + bool en_searcher); }; const struct qed_eth_ops *qed_get_eth_ops(void); diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index fde56c436f7177e27173656f4fa4480752351c6d..c70ac13a97e66e9ded8a81f2716228619f55a98c 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -144,6 +144,7 @@ struct qed_dcbx_operational_params { bool enabled; bool ieee; bool cee; + bool local; u32 err; }; @@ -178,6 +179,12 @@ struct qed_eth_pf_params { * to update_pf_params routine invoked before slowpath start */ u16 num_cons; + + /* To enable arfs, previous to HW-init a positive number needs to be + * set [as filters require allocated searcher ILT memory]. + * This will set the maximal number of configured steering-filters. + */ + u32 num_arfs_filters; }; struct qed_fcoe_pf_params { @@ -263,7 +270,6 @@ struct qed_rdma_pf_params { * the doorbell BAR). */ u32 min_dpis; /* number of requested DPIs */ - u32 num_mrs; /* number of requested memory regions */ u32 num_qps; /* number of requested Queue Pairs */ u32 num_srqs; /* number of requested SRQ */ u8 roce_edpm_mode; /* see QED_ROCE_EDPM_MODE_ENABLE */ @@ -300,6 +306,11 @@ struct qed_sb_info { struct qed_dev *cdev; }; +enum qed_dev_type { + QED_DEV_TYPE_BB, + QED_DEV_TYPE_AH, +}; + struct qed_dev_info { unsigned long pci_mem_start; unsigned long pci_mem_end; @@ -325,6 +336,13 @@ struct qed_dev_info { u16 mtu; bool wol_support; + + enum qed_dev_type dev_type; + + /* Output parameters for qede */ + bool vxlan_enable; + bool gre_enable; + bool geneve_enable; }; enum qed_sb_type { @@ -421,6 +439,7 @@ struct qed_int_info { }; struct qed_common_cb_ops { + void (*arfs_filter_op)(void *dev, void *fltr, u8 fw_rc); void (*link_update)(void *dev, struct qed_link_output *link); void (*dcbx_aen)(void *dev, struct qed_dcbx_get *get, u32 mib_type); @@ -616,7 +635,7 @@ struct qed_common_ops { * @return 0 on success, error otherwise. 
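The new qed hooks above wire up accelerated RFS: the PF reserves searcher ILT memory by setting num_arfs_filters before HW init, enables the searcher once, and then adds or removes individual ntuple filters, with firmware completions reported back through the common arfs_filter_op() callback. A rough calling-side sketch, where struct example_dev and its fields are hypothetical stand-ins for a driver's private data:

static int example_arfs_add(struct example_dev *edev, void *fltr,
			    dma_addr_t map, u16 len, u16 rxq)
{
	int rc;

	rc = edev->ops->configure_arfs_searcher(edev->cdev, true);
	if (rc)
		return rc;

	/* Firmware result arrives later via common.arfs_filter_op(). */
	return edev->ops->ntuple_filter_config(edev->cdev, fltr, map, len,
					       0 /* vport */, rxq, true);
}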
*/ int (*set_coalesce)(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, - u8 qid, u16 sb_id); + u16 qid, u16 sb_id); /** * @brief set_led - Configure LED mode @@ -752,7 +771,7 @@ enum qed_mf_mode { QED_MF_NPAR, }; -struct qed_eth_stats { +struct qed_eth_stats_common { u64 no_buff_discards; u64 packet_too_big_discard; u64 ttl0_discard; @@ -784,11 +803,6 @@ struct qed_eth_stats { u64 rx_256_to_511_byte_packets; u64 rx_512_to_1023_byte_packets; u64 rx_1024_to_1518_byte_packets; - u64 rx_1519_to_1522_byte_packets; - u64 rx_1519_to_2047_byte_packets; - u64 rx_2048_to_4095_byte_packets; - u64 rx_4096_to_9216_byte_packets; - u64 rx_9217_to_16383_byte_packets; u64 rx_crc_errors; u64 rx_mac_crtl_frames; u64 rx_pause_frames; @@ -805,14 +819,8 @@ struct qed_eth_stats { u64 tx_256_to_511_byte_packets; u64 tx_512_to_1023_byte_packets; u64 tx_1024_to_1518_byte_packets; - u64 tx_1519_to_2047_byte_packets; - u64 tx_2048_to_4095_byte_packets; - u64 tx_4096_to_9216_byte_packets; - u64 tx_9217_to_16383_byte_packets; u64 tx_pause_frames; u64 tx_pfc_frames; - u64 tx_lpi_entry_count; - u64 tx_total_collisions; u64 brb_truncates; u64 brb_discards; u64 rx_mac_bytes; @@ -827,6 +835,34 @@ struct qed_eth_stats { u64 tx_mac_ctrl_frames; }; +struct qed_eth_stats_bb { + u64 rx_1519_to_1522_byte_packets; + u64 rx_1519_to_2047_byte_packets; + u64 rx_2048_to_4095_byte_packets; + u64 rx_4096_to_9216_byte_packets; + u64 rx_9217_to_16383_byte_packets; + u64 tx_1519_to_2047_byte_packets; + u64 tx_2048_to_4095_byte_packets; + u64 tx_4096_to_9216_byte_packets; + u64 tx_9217_to_16383_byte_packets; + u64 tx_lpi_entry_count; + u64 tx_total_collisions; +}; + +struct qed_eth_stats_ah { + u64 rx_1519_to_max_byte_packets; + u64 tx_1519_to_max_byte_packets; +}; + +struct qed_eth_stats { + struct qed_eth_stats_common common; + + union { + struct qed_eth_stats_bb bb; + struct qed_eth_stats_ah ah; + }; +}; + #define QED_SB_IDX 0x0002 #define RX_PI 0 diff --git a/include/linux/qed/qed_iscsi_if.h b/include/linux/qed/qed_iscsi_if.h index f70bb81b8b6acfda1701ffcd073163db05a82770..3414649133d2e44fe36b9c12e12f42ba6970ed9d 100644 --- a/include/linux/qed/qed_iscsi_if.h +++ b/include/linux/qed/qed_iscsi_if.h @@ -67,6 +67,8 @@ struct qed_dev_iscsi_info { void __iomem *primary_dbq_rq_addr; void __iomem *secondary_bdq_rq_addr; + + u8 num_cqs; }; struct qed_iscsi_id_params { diff --git a/include/linux/qed/qed_roce_if.h b/include/linux/qed/qed_roce_if.h index f742d4312c9d96f1498554edee09654e065e9e74..cbb2ff0ce4bc34c5179a5ad77167ac0b721f6a30 100644 --- a/include/linux/qed/qed_roce_if.h +++ b/include/linux/qed/qed_roce_if.h @@ -240,6 +240,7 @@ struct qed_rdma_add_user_out_params { u64 dpi_addr; u64 dpi_phys_addr; u32 dpi_size; + u16 wid_count; }; enum roce_mode { @@ -533,6 +534,7 @@ enum qed_rdma_type { struct qed_dev_rdma_info { struct qed_dev_info common; enum qed_rdma_type rdma_type; + u8 user_dpm_enabled; }; struct qed_rdma_ops { diff --git a/include/linux/qed/rdma_common.h b/include/linux/qed/rdma_common.h index f773aa5e746ff47bb886aa19f568a24108dd0d1e..72c770f9f6669a5169f1780f8cae524bc6c3e0b4 100644 --- a/include/linux/qed/rdma_common.h +++ b/include/linux/qed/rdma_common.h @@ -52,7 +52,8 @@ #define RDMA_MAX_PDS (64 * 1024) #define RDMA_NUM_STATISTIC_COUNTERS MAX_NUM_VPORTS -#define RDMA_NUM_STATISTIC_COUNTERS_BB MAX_NUM_VPORTS_BB +#define RDMA_NUM_STATISTIC_COUNTERS_K2 MAX_NUM_VPORTS_K2 +#define RDMA_NUM_STATISTIC_COUNTERS_BB MAX_NUM_VPORTS_BB #define RDMA_TASK_TYPE (PROTOCOLID_ROCE) diff --git a/include/linux/qed/roce_common.h 
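With qed_eth_stats split as above, the shared counters move under .common while the chip-specific tail counters live in an anonymous union, so a caller has to key off the new dev_type before touching the bb/ah members. An illustrative accessor (a sketch, not from this patch):

static u64 example_oversize_rx(struct qed_dev_info *info,
			       struct qed_eth_stats *stats)
{
	/* The union members are only valid for the matching ASIC. */
	if (info->dev_type == QED_DEV_TYPE_BB)
		return stats->bb.rx_9217_to_16383_byte_packets;
	return stats->ah.rx_1519_to_max_byte_packets;
}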
b/include/linux/qed/roce_common.h index bad02df213dfccd11cd25fa1b1e0decaf17f92c6..866f063026dedc6540d87d595bcacf9071f14681 100644 --- a/include/linux/qed/roce_common.h +++ b/include/linux/qed/roce_common.h @@ -38,4 +38,21 @@ #define ROCE_MAX_QPS (32 * 1024) +enum roce_async_events_type { + ROCE_ASYNC_EVENT_NONE = 0, + ROCE_ASYNC_EVENT_COMM_EST = 1, + ROCE_ASYNC_EVENT_SQ_DRAINED, + ROCE_ASYNC_EVENT_SRQ_LIMIT, + ROCE_ASYNC_EVENT_LAST_WQE_REACHED, + ROCE_ASYNC_EVENT_CQ_ERR, + ROCE_ASYNC_EVENT_LOCAL_INVALID_REQUEST_ERR, + ROCE_ASYNC_EVENT_LOCAL_CATASTROPHIC_ERR, + ROCE_ASYNC_EVENT_LOCAL_ACCESS_ERR, + ROCE_ASYNC_EVENT_QP_CATASTROPHIC_ERR, + ROCE_ASYNC_EVENT_CQ_OVERFLOW_ERR, + ROCE_ASYNC_EVENT_SRQ_EMPTY, + ROCE_ASYNC_EVENT_DESTROY_QP_DONE, + MAX_ROCE_ASYNC_EVENTS_TYPE +}; + #endif /* __ROCE_COMMON__ */ diff --git a/include/linux/qed/storage_common.h b/include/linux/qed/storage_common.h index 03f3e37ab059d5e4b48aa2b7f80366016b27fdf5..08df82a096b62de80e51f34122f4809f0eb6d27e 100644 --- a/include/linux/qed/storage_common.h +++ b/include/linux/qed/storage_common.h @@ -40,6 +40,8 @@ #define BDQ_ID_IMM_DATA (1) #define BDQ_NUM_IDS (2) +#define SCSI_NUM_SGES_SLOW_SGL_THR 8 + #define BDQ_MAX_EXTERNAL_RING_SIZE (1 << 15) struct scsi_bd { @@ -52,6 +54,16 @@ struct scsi_bdq_ram_drv_data { __le16 reserved0[3]; }; +struct scsi_sge { + struct regpair sge_addr; + __le32 sge_len; + __le32 reserved; +}; + +struct scsi_cached_sges { + struct scsi_sge sge[4]; +}; + struct scsi_drv_cmdq { __le16 cmdq_cons; __le16 reserved0; @@ -99,11 +111,19 @@ struct scsi_ram_per_bdq_resource_drv_data { struct scsi_bdq_ram_drv_data drv_data_per_bdq_id[BDQ_NUM_IDS]; }; -struct scsi_sge { - struct regpair sge_addr; - __le16 sge_len; - __le16 reserved0; - __le32 reserved1; +enum scsi_sgl_mode { + SCSI_TX_SLOW_SGL, + SCSI_FAST_SGL, + MAX_SCSI_SGL_MODE +}; + +struct scsi_sgl_params { + struct regpair sgl_addr; + __le32 sgl_total_length; + __le32 sge_offset; + __le16 sgl_num_sges; + u8 sgl_index; + u8 reserved; }; struct scsi_terminate_extra_params { diff --git a/include/linux/qed/tcp_common.h b/include/linux/qed/tcp_common.h index 46fe7856f1b22c828474257ceedf03c182958ec5..a5e843268f0e9431eacd07ad5cc74e10be690b0d 100644 --- a/include/linux/qed/tcp_common.h +++ b/include/linux/qed/tcp_common.h @@ -173,6 +173,7 @@ enum tcp_seg_placement_event { TCP_EVENT_ADD_ISLE_RIGHT, TCP_EVENT_ADD_ISLE_LEFT, TCP_EVENT_JOIN, + TCP_EVENT_DELETE_ISLES, TCP_EVENT_NOP, MAX_TCP_SEG_PLACEMENT_EVENT }; diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 799a63d0e1a823e38c0c54f52e5e297cde9bb60c..9c6f768b7d32f66db0b05e0b5500f4b71ecebde7 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -162,7 +162,6 @@ static inline bool sb_has_quota_active(struct super_block *sb, int type) * Operations supported for diskquotas. 
*/ extern const struct dquot_operations dquot_operations; -extern const struct quotactl_ops dquot_quotactl_ops; extern const struct quotactl_ops dquot_quotactl_sysfile_ops; #else diff --git a/include/linux/ras.h b/include/linux/ras.h index 2aceeafd6fe51fb34f5b99a4c506f92e3aa0353c..ffb147185e8df8ae62e4566d7da1b666fc290e1a 100644 --- a/include/linux/ras.h +++ b/include/linux/ras.h @@ -1,14 +1,25 @@ #ifndef __RAS_H__ #define __RAS_H__ +#include + #ifdef CONFIG_DEBUG_FS int ras_userspace_consumers(void); void ras_debugfs_init(void); int ras_add_daemon_trace(void); #else static inline int ras_userspace_consumers(void) { return 0; } -static inline void ras_debugfs_init(void) { return; } +static inline void ras_debugfs_init(void) { } static inline int ras_add_daemon_trace(void) { return 0; } #endif +#ifdef CONFIG_RAS_CEC +void __init cec_init(void); +int __init parse_cec_param(char *str); +int cec_add_elem(u64 pfn); +#else +static inline void __init cec_init(void) { } +static inline int cec_add_elem(u64 pfn) { return -ENODEV; } #endif + +#endif /* __RAS_H__ */ diff --git a/include/linux/rcu_node_tree.h b/include/linux/rcu_node_tree.h new file mode 100644 index 0000000000000000000000000000000000000000..4b766b61e1a026c226bb86186a72009b8074929f --- /dev/null +++ b/include/linux/rcu_node_tree.h @@ -0,0 +1,99 @@ +/* + * RCU node combining tree definitions. These are used to compute + * global attributes while avoiding common-case global contention. A key + * property that these computations rely on is a tournament-style approach + * where only one of the tasks contending a lower level in the tree need + * advance to the next higher level. If properly configured, this allows + * unlimited scalability while maintaining a constant level of contention + * on the root node. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Author: Paul E. McKenney + */ + +#ifndef __LINUX_RCU_NODE_TREE_H +#define __LINUX_RCU_NODE_TREE_H + +/* + * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and + * CONFIG_RCU_FANOUT_LEAF. + * In theory, it should be possible to add more levels straightforwardly. + * In practice, this did work well going from three levels to four. + * Of course, your mileage may vary. 
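The ras.h hunk above exposes the new Corrected Errors Collector entry points, with -ENODEV stubs when CONFIG_RAS_CEC is off, so callers can attempt collection unconditionally. A hedged usage sketch, where report_corrected_error() is a hypothetical fallback reporter:

static void example_handle_ce(u64 pfn)
{
	/*
	 * Non-zero means the CEC did not absorb this PFN (or is not
	 * built in), so fall back to normal error reporting.
	 */
	if (cec_add_elem(pfn))
		report_corrected_error(pfn);
}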
+ */ + +#ifdef CONFIG_RCU_FANOUT +#define RCU_FANOUT CONFIG_RCU_FANOUT +#else /* #ifdef CONFIG_RCU_FANOUT */ +# ifdef CONFIG_64BIT +# define RCU_FANOUT 64 +# else +# define RCU_FANOUT 32 +# endif +#endif /* #else #ifdef CONFIG_RCU_FANOUT */ + +#ifdef CONFIG_RCU_FANOUT_LEAF +#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF +#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ +#define RCU_FANOUT_LEAF 16 +#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ + +#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) +#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) +#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) +#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT_1 +# define RCU_NUM_LVLS 1 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_NODES NUM_RCU_LVL_0 +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } +# define RCU_NODE_NAME_INIT { "rcu_node_0" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } +#elif NR_CPUS <= RCU_FANOUT_2 +# define RCU_NUM_LVLS 2 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } +#elif NR_CPUS <= RCU_FANOUT_3 +# define RCU_NUM_LVLS 3 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } +#elif NR_CPUS <= RCU_FANOUT_4 +# define RCU_NUM_LVLS 4 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) +# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) +# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) +# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } +# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } +# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } +#else +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ + +#endif /* __LINUX_RCU_NODE_TREE_H */ diff --git a/include/linux/rcu_segcblist.h b/include/linux/rcu_segcblist.h new file mode 100644 index 0000000000000000000000000000000000000000..ba4d2621d9cabccc8130ad0b7576dfac5930f3aa --- /dev/null +++ b/include/linux/rcu_segcblist.h @@ -0,0 +1,90 @@ +/* + * RCU segmented callback lists + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. 
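A worked instance of the level selection above (arithmetic only, using the documented defaults): a 64-bit build with NR_CPUS = 4096, RCU_FANOUT = 64 and RCU_FANOUT_LEAF = 16 gives RCU_FANOUT_1 = 16, RCU_FANOUT_2 = 1024 and RCU_FANOUT_3 = 65536, so the NR_CPUS <= RCU_FANOUT_3 branch selects RCU_NUM_LVLS = 3, with NUM_RCU_LVL_1 = DIV_ROUND_UP(4096, 1024) = 4, NUM_RCU_LVL_2 = DIV_ROUND_UP(4096, 16) = 256, and NUM_RCU_NODES = 1 + 4 + 256 = 261.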
+ * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney + */ + +#ifndef __INCLUDE_LINUX_RCU_SEGCBLIST_H +#define __INCLUDE_LINUX_RCU_SEGCBLIST_H + +/* Simple unsegmented callback lists. */ +struct rcu_cblist { + struct rcu_head *head; + struct rcu_head **tail; + long len; + long len_lazy; +}; + +#define RCU_CBLIST_INITIALIZER(n) { .head = NULL, .tail = &n.head } + +/* Complicated segmented callback lists. ;-) */ + +/* + * Index values for segments in rcu_segcblist structure. + * + * The segments are as follows: + * + * [head, *tails[RCU_DONE_TAIL]): + * Callbacks whose grace period has elapsed, and thus can be invoked. + * [*tails[RCU_DONE_TAIL], *tails[RCU_WAIT_TAIL]): + * Callbacks waiting for the current GP from the current CPU's viewpoint. + * [*tails[RCU_WAIT_TAIL], *tails[RCU_NEXT_READY_TAIL]): + * Callbacks that arrived before the next GP started, again from + * the current CPU's viewpoint. These can be handled by the next GP. + * [*tails[RCU_NEXT_READY_TAIL], *tails[RCU_NEXT_TAIL]): + * Callbacks that might have arrived after the next GP started. + * There is some uncertainty as to when a given GP starts and + * ends, but a CPU knows the exact times if it is the one starting + * or ending the GP. Other CPUs know that the previous GP ends + * before the next one starts. + * + * Note that RCU_WAIT_TAIL cannot be empty unless RCU_NEXT_READY_TAIL is also + * empty. + * + * The ->gp_seq[] array contains the grace-period number at which the + * corresponding segment of callbacks will be ready to invoke. A given + * element of this array is meaningful only when the corresponding segment + * is non-empty, and it is never valid for RCU_DONE_TAIL (whose callbacks + * are already ready to invoke) or for RCU_NEXT_TAIL (whose callbacks have + * not yet been assigned a grace-period number). + */ +#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ +#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ +#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ +#define RCU_NEXT_TAIL 3 +#define RCU_CBLIST_NSEGS 4 + +struct rcu_segcblist { + struct rcu_head *head; + struct rcu_head **tails[RCU_CBLIST_NSEGS]; + unsigned long gp_seq[RCU_CBLIST_NSEGS]; + long len; + long len_lazy; +}; + +#define RCU_SEGCBLIST_INITIALIZER(n) \ +{ \ + .head = NULL, \ + .tails[RCU_DONE_TAIL] = &n.head, \ + .tails[RCU_WAIT_TAIL] = &n.head, \ + .tails[RCU_NEXT_READY_TAIL] = &n.head, \ + .tails[RCU_NEXT_TAIL] = &n.head, \ +} + +#endif /* __INCLUDE_LINUX_RCU_SEGCBLIST_H */ diff --git a/include/linux/rculist.h b/include/linux/rculist.h index 4f7a9561b8c415d069505dab632b922903d87d2e..b1fd8bf85fdc430eaaa2195cd6dc18417bb64585 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -509,7 +509,8 @@ static inline void hlist_add_tail_rcu(struct hlist_node *n, { struct hlist_node *i, *last = NULL; - for (i = hlist_first_rcu(h); i; i = hlist_next_rcu(i)) + /* Note: write side code, so rcu accessors are not needed. 
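To make the segment layout concrete, here is a minimal illustrative helper (a sketch of my own, not part of the patch) that walks only the done segment, relying solely on the invariant documented above that the ready-to-invoke callbacks occupy [head, *tails[RCU_DONE_TAIL]):

/* Count the callbacks that are already safe to invoke (sketch only). */
static long rcu_segcblist_count_done(struct rcu_segcblist *rsclp)
{
	struct rcu_head *rhp;
	long n = 0;

	for (rhp = rsclp->head; rhp != *rsclp->tails[RCU_DONE_TAIL]; rhp = rhp->next)
		n++;
	return n;
}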
*/ + for (i = h->first; i; i = i->next) last = i; if (last) { diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index de88b33c0974877bdaac25c0759f8f4d0ea3808b..e1e5d002fdb93a537110b68886656caf6c049fc4 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -97,6 +97,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long secs, unsigned long c_old, unsigned long c); +bool rcu_irq_enter_disabled(void); #else static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, @@ -113,6 +114,10 @@ static inline void rcutorture_record_test_transition(void) static inline void rcutorture_record_progress(unsigned long vernum) { } +static inline bool rcu_irq_enter_disabled(void) +{ + return false; +} #ifdef CONFIG_RCU_TRACE void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, @@ -363,15 +368,20 @@ static inline void rcu_init_nohz(void) #ifdef CONFIG_TASKS_RCU #define TASKS_RCU(x) x extern struct srcu_struct tasks_rcu_exit_srcu; -#define rcu_note_voluntary_context_switch(t) \ +#define rcu_note_voluntary_context_switch_lite(t) \ do { \ - rcu_all_qs(); \ if (READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) +#define rcu_note_voluntary_context_switch(t) \ + do { \ + rcu_all_qs(); \ + rcu_note_voluntary_context_switch_lite(t); \ + } while (0) #else /* #ifdef CONFIG_TASKS_RCU */ #define TASKS_RCU(x) do { } while (0) -#define rcu_note_voluntary_context_switch(t) rcu_all_qs() +#define rcu_note_voluntary_context_switch_lite(t) do { } while (0) +#define rcu_note_voluntary_context_switch(t) rcu_all_qs() #endif /* #else #ifdef CONFIG_TASKS_RCU */ /** @@ -1127,11 +1137,11 @@ do { \ * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ -#ifdef CONFIG_PPC +#ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ -#else /* #ifdef CONFIG_PPC */ +#else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) -#endif /* #else #ifdef CONFIG_PPC */ +#endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #endif /* __LINUX_RCUPDATE_H */ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index b452953e21c8ae0b311a29d2af46a2a4d3036ce7..74d9c3a1feeec494b693501c6ec976c9dd487186 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -33,6 +33,11 @@ static inline int rcu_dynticks_snap(struct rcu_dynticks *rdtp) return 0; } +static inline bool rcu_eqs_special_set(int cpu) +{ + return false; /* Never flag non-existent other CPUs! 
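For the smp_mb__after_unlock_lock() change above, the intended call site looks roughly like the hedged sketch below (the two locks are hypothetical). On CONFIG_ARCH_WEAK_RELEASE_ACQUIRE architectures the macro supplies the missing full barrier; elsewhere it compiles away:

	spin_unlock(&old_lock);
	spin_lock(&new_lock);
	smp_mb__after_unlock_lock();	/* the pair now orders like a full smp_mb() */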
*/ +} + static inline unsigned long get_state_synchronize_rcu(void) { return 0; @@ -87,10 +92,11 @@ static inline void kfree_call_rcu(struct rcu_head *head, call_rcu(head, func); } -static inline void rcu_note_context_switch(void) -{ - rcu_sched_qs(); -} +#define rcu_note_context_switch(preempt) \ + do { \ + rcu_sched_qs(); \ + rcu_note_voluntary_context_switch_lite(current); \ + } while (0) /* * Take advantage of the fact that there is only one CPU, which @@ -212,14 +218,14 @@ static inline void exit_rcu(void) { } -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) extern int rcu_scheduler_active __read_mostly; void rcu_scheduler_starting(void); -#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#else /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ static inline void rcu_scheduler_starting(void) { } -#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +#endif /* #else #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) @@ -237,6 +243,10 @@ static inline bool rcu_is_watching(void) #endif /* #else defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ +static inline void rcu_request_urgent_qs_task(struct task_struct *t) +{ +} + static inline void rcu_all_qs(void) { barrier(); /* Avoid RCU read-side critical sections leaking across. */ diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 63a4e4cf40a54315705cec804c8f065bdcf2cdbb..0bacb6b2af6973681b3724a9e4d4311daf730c30 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -30,7 +30,7 @@ #ifndef __LINUX_RCUTREE_H #define __LINUX_RCUTREE_H -void rcu_note_context_switch(void); +void rcu_note_context_switch(bool preempt); int rcu_needs_cpu(u64 basem, u64 *nextevt); void rcu_cpu_stall_reset(void); @@ -41,7 +41,7 @@ void rcu_cpu_stall_reset(void); */ static inline void rcu_virt_note_context_switch(int cpu) { - rcu_note_context_switch(); + rcu_note_context_switch(false); } void synchronize_rcu_bh(void); @@ -108,6 +108,7 @@ void rcu_scheduler_starting(void); extern int rcu_scheduler_active __read_mostly; bool rcu_is_watching(void); +void rcu_request_urgent_qs_task(struct task_struct *t); void rcu_all_qs(void); diff --git a/include/linux/refcount.h b/include/linux/refcount.h index 0023fee4bbbcb4f38c4d3f6f9181e57261c4188a..b34aa649d204887d723569d00da544e0eba463a7 100644 --- a/include/linux/refcount.h +++ b/include/linux/refcount.h @@ -6,17 +6,36 @@ #include #include +/** + * refcount_t - variant of atomic_t specialized for reference counts + * @refs: atomic_t counter field + * + * The counter saturates at UINT_MAX and will not move once + * there. This avoids wrapping the counter and causing 'spurious' + * use-after-free bugs. 
+ */ typedef struct refcount_struct { atomic_t refs; } refcount_t; #define REFCOUNT_INIT(n) { .refs = ATOMIC_INIT(n), } +/** + * refcount_set - set a refcount's value + * @r: the refcount + * @n: value to which the refcount will be set + */ static inline void refcount_set(refcount_t *r, unsigned int n) { atomic_set(&r->refs, n); } +/** + * refcount_read - get a refcount's value + * @r: the refcount + * + * Return: the refcount's value + */ static inline unsigned int refcount_read(const refcount_t *r) { return atomic_read(&r->refs); diff --git a/include/linux/regulator/arizona-ldo1.h b/include/linux/regulator/arizona-ldo1.h new file mode 100644 index 0000000000000000000000000000000000000000..c685f1277c639187afdbead1bce86a2a681566a8 --- /dev/null +++ b/include/linux/regulator/arizona-ldo1.h @@ -0,0 +1,24 @@ +/* + * Platform data for Arizona LDO1 regulator + * + * Copyright 2017 Cirrus Logic + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef ARIZONA_LDO1_H +#define ARIZONA_LDO1_H + +struct regulator_init_data; + +struct arizona_ldo1_pdata { + /** GPIO controlling LDOENA, if any */ + int ldoena; + + /** Regulator configuration for LDO1 */ + const struct regulator_init_data *init_data; +}; + +#endif diff --git a/include/linux/regulator/arizona-micsupp.h b/include/linux/regulator/arizona-micsupp.h new file mode 100644 index 0000000000000000000000000000000000000000..616842619c0084928c4aaee5dbc4f441cfc60c6a --- /dev/null +++ b/include/linux/regulator/arizona-micsupp.h @@ -0,0 +1,21 @@ +/* + * Platform data for Arizona micsupp regulator + * + * Copyright 2017 Cirrus Logic + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
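A minimal usage sketch for the accessors above (struct foo is hypothetical; only refcount_set() and refcount_read() from this header are used):

struct foo {
	refcount_t refs;
};

static void foo_init(struct foo *f)
{
	refcount_set(&f->refs, 1);		/* creator holds the initial reference */
	WARN_ON(refcount_read(&f->refs) != 1);
}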
+ */ + +#ifndef ARIZONA_MICSUPP_H +#define ARIZONA_MICSUPP_H + +struct regulator_init_data; + +struct arizona_micsupp_pdata { + /** Regulator configuration for micsupp */ + const struct regulator_init_data *init_data; +}; + +#endif diff --git a/include/linux/regulator/consumer.h b/include/linux/regulator/consumer.h index ea0fffa5faebb1688ecdb6166db8c7f4d2836c40..df176d7c2b87c00356a11b383a61185e7f3e47ae 100644 --- a/include/linux/regulator/consumer.h +++ b/include/linux/regulator/consumer.h @@ -119,6 +119,7 @@ struct regmap; #define REGULATOR_EVENT_ABORT_VOLTAGE_CHANGE 0x200 #define REGULATOR_EVENT_PRE_DISABLE 0x400 #define REGULATOR_EVENT_ABORT_DISABLE 0x800 +#define REGULATOR_EVENT_ENABLE 0x1000 /* * Regulator errors that can be queried using regulator_get_error_flags diff --git a/include/linux/regulator/driver.h b/include/linux/regulator/driver.h index dac8e7b16bc679e8550ee7781cbda5194791c535..94417b4226bd88a42d5d53c845266ee97ec30879 100644 --- a/include/linux/regulator/driver.h +++ b/include/linux/regulator/driver.h @@ -292,6 +292,14 @@ enum regulator_type { * set_active_discharge * @active_discharge_reg: Register for control when using regmap * set_active_discharge + * @soft_start_reg: Register for control when using regmap set_soft_start + * @soft_start_mask: Mask for control when using regmap set_soft_start + * @soft_start_val_on: Enabling value for control when using regmap + * set_soft_start + * @pull_down_reg: Register for control when using regmap set_pull_down + * @pull_down_mask: Mask for control when using regmap set_pull_down + * @pull_down_val_on: Enabling value for control when using regmap + * set_pull_down * * @enable_time: Time taken for initial enable of regulator (in uS). * @off_on_delay: guard time (in uS), before re-enabling a regulator @@ -345,6 +353,12 @@ struct regulator_desc { unsigned int active_discharge_off; unsigned int active_discharge_mask; unsigned int active_discharge_reg; + unsigned int soft_start_reg; + unsigned int soft_start_mask; + unsigned int soft_start_val_on; + unsigned int pull_down_reg; + unsigned int pull_down_mask; + unsigned int pull_down_val_on; unsigned int enable_time; @@ -429,6 +443,8 @@ struct regulator_dev { struct regulator_enable_gpio *ena_pin; unsigned int ena_gpio_state:1; + unsigned int is_switch:1; + /* time when this regulator was disabled last time */ unsigned long last_off_jiffy; }; @@ -476,6 +492,8 @@ int regulator_set_voltage_time_sel(struct regulator_dev *rdev, unsigned int new_selector); int regulator_set_bypass_regmap(struct regulator_dev *rdev, bool enable); int regulator_get_bypass_regmap(struct regulator_dev *rdev, bool *enable); +int regulator_set_soft_start_regmap(struct regulator_dev *rdev); +int regulator_set_pull_down_regmap(struct regulator_dev *rdev); int regulator_set_active_discharge_regmap(struct regulator_dev *rdev, bool enable); diff --git a/include/linux/regulator/machine.h b/include/linux/regulator/machine.h index c9f795e9a2ee26aaf562e9a97a2fe2f963a2f054..117699d1f7df4f72491bd65eccec39a160acf646 100644 --- a/include/linux/regulator/machine.h +++ b/include/linux/regulator/machine.h @@ -108,6 +108,8 @@ struct regulator_state { * @initial_state: Suspend state to set by default. * @initial_mode: Mode to set at startup. * @ramp_delay: Time to settle down after voltage change (unit: uV/us) + * @settling_time: Time to settle down after voltage change when voltage + * change is non-linear (unit: microseconds). * @active_discharge: Enable/disable active discharge. 
The enum * regulator_active_discharge values are used for * initialisation. @@ -149,6 +151,7 @@ struct regulation_constraints { unsigned int initial_mode; unsigned int ramp_delay; + unsigned int settling_time; unsigned int enable_time; unsigned int active_discharge; diff --git a/include/linux/regulator/pfuze100.h b/include/linux/regulator/pfuze100.h index 70c6c66c5bcf16cc4be324a1aaf40b1981e56413..e0ccf46f66cf34ae44296de9410fa107270aff4e 100644 --- a/include/linux/regulator/pfuze100.h +++ b/include/linux/regulator/pfuze100.h @@ -48,6 +48,7 @@ #define PFUZE200_VGEN4 10 #define PFUZE200_VGEN5 11 #define PFUZE200_VGEN6 12 +#define PFUZE200_COIN 13 #define PFUZE3000_SW1A 0 #define PFUZE3000_SW1B 1 diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h index 092292b6675e2cf08b1138410a8488bc87986495..7d56a7ea2b2e8cdcf471da6aaa6ac0be3c51acaa 100644 --- a/include/linux/rhashtable.h +++ b/include/linux/rhashtable.h @@ -49,6 +49,21 @@ /* Base bits plus 1 bit for nulls marker */ #define RHT_HASH_RESERVED_SPACE (RHT_BASE_BITS + 1) +/* Maximum chain length before rehash + * + * The maximum (not average) chain length grows with the size of the hash + * table, at a rate of (log N)/(log log N). + * + * The value of 16 is selected so that even if the hash table grew to + * 2^32 you would not expect the maximum chain length to exceed it + * unless we are under attack (or extremely unlucky). + * + * As this limit is only to detect attacks, we don't need to set it to a + * lower value as you'd need the chain length to vastly exceed 16 to have + * any real effect on the system. + */ +#define RHT_ELASTICITY 16u + struct rhash_head { struct rhash_head __rcu *next; }; @@ -110,29 +125,25 @@ struct rhashtable; * @key_len: Length of key * @key_offset: Offset of key in struct to be hashed * @head_offset: Offset of rhash_head in struct to be hashed - * @insecure_max_entries: Maximum number of entries (may be exceeded) * @max_size: Maximum size while expanding * @min_size: Minimum size while shrinking - * @nulls_base: Base value to generate nulls marker - * @insecure_elasticity: Set to true to disable chain length checks - * @automatic_shrinking: Enable automatic shrinking of tables * @locks_mul: Number of bucket locks to allocate per cpu (default: 128) + * @automatic_shrinking: Enable automatic shrinking of tables + * @nulls_base: Base value to generate nulls marker * @hashfn: Hash function (default: jhash2 if !(key_len % 4), or jhash) * @obj_hashfn: Function to hash object * @obj_cmpfn: Function to compare key with object */ struct rhashtable_params { - size_t nelem_hint; - size_t key_len; - size_t key_offset; - size_t head_offset; - unsigned int insecure_max_entries; + u16 nelem_hint; + u16 key_len; + u16 key_offset; + u16 head_offset; unsigned int max_size; - unsigned int min_size; - u32 nulls_base; - bool insecure_elasticity; + u16 min_size; bool automatic_shrinking; - size_t locks_mul; + u8 locks_mul; + u32 nulls_base; rht_hashfn_t hashfn; rht_obj_hashfn_t obj_hashfn; rht_obj_cmpfn_t obj_cmpfn; @@ -143,8 +154,8 @@ struct rhashtable_params { * @tbl: Bucket table * @nelems: Number of elements in table * @key_len: Key length for hashfn - * @elasticity: Maximum chain length before rehash * @p: Configuration parameters + * @max_elems: Maximum number of elements in table * @rhlist: True if this is an rhltable * @run_work: Deferred worker to expand/shrink asynchronously * @mutex: Mutex to protect current/future table swapping @@ -154,8 +165,8 @@ struct rhashtable { struct bucket_table __rcu *tbl; atomic_t 
nelems; unsigned int key_len; - unsigned int elasticity; struct rhashtable_params p; + unsigned int max_elems; bool rhlist; struct work_struct run_work; struct mutex mutex; @@ -318,8 +329,7 @@ static inline bool rht_grow_above_100(const struct rhashtable *ht, static inline bool rht_grow_above_max(const struct rhashtable *ht, const struct bucket_table *tbl) { - return ht->p.insecure_max_entries && - atomic_read(&ht->nelems) >= ht->p.insecure_max_entries; + return atomic_read(&ht->nelems) >= ht->max_elems; } /* The bucket lock is selected based on the hash and protects mutations @@ -726,7 +736,7 @@ static inline void *__rhashtable_insert_fast( return rhashtable_insert_slow(ht, key, obj); } - elasticity = ht->elasticity; + elasticity = RHT_ELASTICITY; pprev = rht_bucket_insert(ht, tbl, hash); data = ERR_PTR(-ENOMEM); if (!pprev) @@ -915,6 +925,28 @@ static inline int rhashtable_lookup_insert_fast( return ret == NULL ? 0 : -EEXIST; } +/** + * rhashtable_lookup_get_insert_fast - lookup and insert object into hash table + * @ht: hash table + * @obj: pointer to hash head inside object + * @params: hash table parameters + * + * Just like rhashtable_lookup_insert_fast(), but this function returns the + * object if it exists, NULL if it did not and the insertion was successful, + * and an ERR_PTR otherwise. + */ +static inline void *rhashtable_lookup_get_insert_fast( + struct rhashtable *ht, struct rhash_head *obj, + const struct rhashtable_params params) +{ + const char *key = rht_obj(ht, obj); + + BUG_ON(ht->p.obj_hashfn); + + return __rhashtable_insert_fast(ht, key + ht->p.key_offset, obj, params, + false); +} + /** * rhashtable_lookup_insert_key - search and insert object to hash table * with explicit key diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index b6d4568795a78839f075c43681de5da1af0d0674..ee9b461af095389f7dd238b10e6cff120bcb773d 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -185,7 +185,7 @@ size_t ring_buffer_page_len(void *page); void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu); -void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); +void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, size_t len, int cpu, int full); diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 8c89e902df3e7ad35ecbff3009e2d88d06026b2a..43ef2c30cb0f59f83c05d23eea4a8aad87edefd1 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -83,19 +83,17 @@ struct anon_vma_chain { }; enum ttu_flags { - TTU_UNMAP = 1, /* unmap mode */ - TTU_MIGRATION = 2, /* migration mode */ - TTU_MUNLOCK = 4, /* munlock mode */ - TTU_LZFREE = 8, /* lazy free mode */ - TTU_SPLIT_HUGE_PMD = 16, /* split huge PMD if any */ - - TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ - TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ - TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ - TTU_BATCH_FLUSH = (1 << 11), /* Batch TLB flushes where possible + TTU_MIGRATION = 0x1, /* migration mode */ + TTU_MUNLOCK = 0x2, /* munlock mode */ + + TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ + TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ + TTU_IGNORE_ACCESS = 0x10, /* don't age */ + TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */ + TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ - TTU_RMAP_LOCKED = (1 << 12) /* do not grab rmap 
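The three-way return convention of rhashtable_lookup_get_insert_fast() supports the usual insert-unless-present pattern; a hedged sketch (ht, params, the object type and its node member are hypothetical):

	struct foo *old;

	old = rhashtable_lookup_get_insert_fast(&ht, &new->node, params);
	if (IS_ERR(old))
		return PTR_ERR(old);	/* e.g. -ENOMEM */
	if (old) {
		kfree(new);		/* lost the race: keep the existing object */
		new = old;
	}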
lock: + TTU_RMAP_LOCKED = 0x80 /* do not grab rmap lock: * caller holds it */ }; @@ -193,9 +191,7 @@ static inline void page_dup_rmap(struct page *page, bool compound) int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); -#define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) - -int try_to_unmap(struct page *, enum ttu_flags flags); +bool try_to_unmap(struct page *, enum ttu_flags flags); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) @@ -239,7 +235,7 @@ int page_mkclean(struct page *); * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ -int try_to_munlock(struct page *); +void try_to_munlock(struct page *); void remove_migration_ptes(struct page *old, struct page *new, bool locked); @@ -261,15 +257,19 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); */ struct rmap_walk_control { void *arg; - int (*rmap_one)(struct page *page, struct vm_area_struct *vma, + /* + * Return false if page table scanning in rmap_walk should be stopped. + * Otherwise, return true. + */ + bool (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); struct anon_vma *(*anon_lock)(struct page *page); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; -int rmap_walk(struct page *page, struct rmap_walk_control *rwc); -int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk(struct page *page, struct rmap_walk_control *rwc); +void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ @@ -285,7 +285,7 @@ static inline int page_referenced(struct page *page, int is_locked, return 0; } -#define try_to_unmap(page, refs) SWAP_FAIL +#define try_to_unmap(page, refs) false static inline int page_mkclean(struct page *page) { @@ -295,13 +295,4 @@ static inline int page_mkclean(struct page *page) #endif /* CONFIG_MMU */ -/* - * Return values of try_to_unmap - */ -#define SWAP_SUCCESS 0 -#define SWAP_AGAIN 1 -#define SWAP_FAIL 2 -#define SWAP_MLOCK 3 -#define SWAP_LZFREE 4 - #endif /* _LINUX_RMAP_H */ diff --git a/include/linux/rodata_test.h b/include/linux/rodata_test.h index ea05f6c514139c39e1c52abd7d45c7337d764152..84766bcdd01f934a632af261f998882cb46eeb27 100644 --- a/include/linux/rodata_test.h +++ b/include/linux/rodata_test.h @@ -14,7 +14,6 @@ #define _RODATA_TEST_H #ifdef CONFIG_DEBUG_RODATA_TEST -extern const int rodata_test_data; void rodata_test(void); #else static inline void rodata_test(void) {} diff --git a/include/linux/rpmsg/qcom_smd.h b/include/linux/rpmsg/qcom_smd.h index 8ec8b6439b25edb956bb06bcf116c190a46ce62c..f27917e0a10114f9c72e228c42b6f60efd51b541 100644 --- a/include/linux/rpmsg/qcom_smd.h +++ b/include/linux/rpmsg/qcom_smd.h @@ -6,7 +6,7 @@ struct qcom_smd_edge; -#if IS_ENABLED(CONFIG_RPMSG_QCOM_SMD) || IS_ENABLED(CONFIG_QCOM_SMD) +#if IS_ENABLED(CONFIG_RPMSG_QCOM_SMD) struct qcom_smd_edge *qcom_smd_register_edge(struct device *parent, struct device_node *node); diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h index d4e0a204c118c7244e7e5d167cc3d0429832de01..a1904aadbc45004ba18c8db06f184a164908cd46 100644 --- a/include/linux/sbitmap.h +++ b/include/linux/sbitmap.h @@ -175,6 +175,25 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth); */ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin); +/** + * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap, + * limiting the depth used from 
each word. + * @sb: Bitmap to allocate from. + * @alloc_hint: Hint for where to start searching for a free bit. + * @shallow_depth: The maximum number of bits to allocate from a single word. + * + * This rather specific operation allows for having multiple users with + * different allocation limits. E.g., there can be a high-priority class that + * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow() + * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority + * class can only allocate half of the total bits in the bitmap, preventing it + * from starving out the high-priority class. + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, + unsigned long shallow_depth); + /** * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. * @sb: Bitmap to check. @@ -325,6 +344,19 @@ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); */ int __sbitmap_queue_get(struct sbitmap_queue *sbq); +/** + * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct + * sbitmap_queue, limiting the depth used from each word, with preemption + * already disabled. + * @sbq: Bitmap queue to allocate from. + * @shallow_depth: The maximum number of bits to allocate from a single word. + * See sbitmap_get_shallow(). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, + unsigned int shallow_depth); + /** * sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue. @@ -345,6 +377,29 @@ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, return nr; } +/** + * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct + * sbitmap_queue, limiting the depth used from each word. + * @sbq: Bitmap queue to allocate from. + * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to + * sbitmap_queue_clear()). + * @shallow_depth: The maximum number of bits to allocate from a single word. + * See sbitmap_get_shallow(). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, + unsigned int *cpu, + unsigned int shallow_depth) +{ + int nr; + + *cpu = get_cpu(); + nr = __sbitmap_queue_get_shallow(sbq, shallow_depth); + put_cpu(); + return nr; +} + /** * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a * &struct sbitmap_queue. 
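Putting the shallow-depth example from the comments above into code (illustrative sketch; sbq is a hypothetical, already-initialized sbitmap_queue):

	unsigned int cpu;
	int nr;

	/* Low-priority path: allowed at most half of each word, per the doc above. */
	nr = sbitmap_queue_get_shallow(&sbq, &cpu, 1U << (sbq.sb.shift - 1));
	if (nr >= 0) {
		/* ... use tag 'nr' ... */
		sbitmap_queue_clear(&sbq, nr, cpu);
	}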
diff --git a/include/linux/sched.h b/include/linux/sched.h index 4cf9a59a4d08ed181f30d7cefa56db92f48408d2..2b69fc65020131bed57e41d4b228ed0c59692a5c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -186,7 +186,7 @@ extern long io_schedule_timeout(long timeout); extern void io_schedule(void); /** - * struct prev_cputime - snaphsot of system and user cputime + * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode * @stime: time spent in system mode * @lock: protects the above two fields @@ -779,6 +779,8 @@ struct task_struct { /* PI waiters blocked on a rt_mutex held by this task: */ struct rb_root pi_waiters; struct rb_node *pi_waiters_leftmost; + /* Updated under owner's pi_lock and rq lock */ + struct task_struct *pi_top_task; /* Deadlock detection and priority inheritance handling: */ struct rt_mutex_waiter *pi_blocked_on; #endif @@ -1041,6 +1043,13 @@ struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* A live task holds one reference: */ atomic_t stack_refcount; +#endif +#ifdef CONFIG_LIVEPATCH + int patch_state; +#endif +#ifdef CONFIG_SECURITY + /* Used by LSM modules for access restriction: */ + void *security; #endif /* CPU-specific state of this task: */ struct thread_struct thread; @@ -1215,9 +1224,9 @@ extern struct pid *cad_pid; #define PF_USED_ASYNC 0x00004000 /* Used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* This thread should not be frozen */ #define PF_FROZEN 0x00010000 /* Frozen for system suspend */ -#define PF_FSTRANS 0x00020000 /* Inside a filesystem transaction */ -#define PF_KSWAPD 0x00040000 /* I am kswapd */ -#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */ +#define PF_KSWAPD 0x00020000 /* I am kswapd */ +#define PF_MEMALLOC_NOFS 0x00040000 /* All allocation requests will inherit GFP_NOFS */ +#define PF_MEMALLOC_NOIO 0x00080000 /* All allocation requests will inherit GFP_NOIO */ #define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */ #define PF_KTHREAD 0x00200000 /* I am a kernel thread */ #define PF_RANDOMIZE 0x00400000 /* Randomize virtual address space */ @@ -1260,7 +1269,6 @@ extern struct pid *cad_pid; #define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. 
*/
#define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */
#define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */
-#define PFA_LMK_WAITING 3 /* Lowmemorykiller is waiting */
#define TASK_PFA_TEST(name, func) \
@@ -1286,14 +1294,11 @@ TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
TASK_PFA_SET(SPREAD_SLAB, spread_slab)
TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
-TASK_PFA_TEST(LMK_WAITING, lmk_waiting)
-TASK_PFA_SET(LMK_WAITING, lmk_waiting)
-
static inline void
-tsk_restore_flags(struct task_struct *task, unsigned long orig_flags, unsigned long flags)
+current_restore_flags(unsigned long orig_flags, unsigned long flags)
{
- task->flags &= ~flags;
- task->flags |= orig_flags & flags;
+ current->flags &= ~flags;
+ current->flags |= orig_flags & flags;
}
extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 830953ebb391fa80e39af4c4f61f1cdd766a3040..2b24a6974847664165bb323aa00c8fae6f3506b0 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -149,13 +149,21 @@ static inline bool in_vfork(struct task_struct *tsk)
 return ret;
}
-/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
- * __GFP_FS is also cleared as it implies __GFP_IO.
+/*
+ * Applies per-task gfp context to the given allocation flags.
+ * PF_MEMALLOC_NOIO implies GFP_NOIO
+ * PF_MEMALLOC_NOFS implies GFP_NOFS
 */
-static inline gfp_t memalloc_noio_flags(gfp_t flags)
+static inline gfp_t current_gfp_context(gfp_t flags)
{
+ /*
+ * NOIO implies both NOIO and NOFS and it is the weaker context,
+ * so always make sure it takes precedence.
+ */
 if (unlikely(current->flags & PF_MEMALLOC_NOIO))
 flags &= ~(__GFP_IO | __GFP_FS);
+ else if (unlikely(current->flags & PF_MEMALLOC_NOFS))
+ flags &= ~__GFP_FS;
 return flags;
}
@@ -171,4 +179,28 @@ static inline void memalloc_noio_restore(unsigned int flags)
 current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
}
+static inline unsigned int memalloc_nofs_save(void)
+{
+ unsigned int flags = current->flags & PF_MEMALLOC_NOFS;
+ current->flags |= PF_MEMALLOC_NOFS;
+ return flags;
+}
+
+static inline void memalloc_nofs_restore(unsigned int flags)
+{
+ current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags;
+}
+
+static inline unsigned int memalloc_noreclaim_save(void)
+{
+ unsigned int flags = current->flags & PF_MEMALLOC;
+ current->flags |= PF_MEMALLOC;
+ return flags;
+}
+
+static inline void memalloc_noreclaim_restore(unsigned int flags)
+{
+ current->flags = (current->flags & ~PF_MEMALLOC) | flags;
+}
+
#endif /* _LINUX_SCHED_MM_H */
diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h
index 3bd668414f61a593e9f40ff8364a3dce63d78d01..f93329aba31a9ebf814600447b4abc107827dd17 100644
--- a/include/linux/sched/rt.h
+++ b/include/linux/sched/rt.h
@@ -18,27 +18,20 @@ static inline int rt_task(struct task_struct *p)
}
#ifdef CONFIG_RT_MUTEXES
-extern int rt_mutex_getprio(struct task_struct *p);
-extern void rt_mutex_setprio(struct task_struct *p, int prio);
-extern int rt_mutex_get_effective_prio(struct task_struct *task, int newprio);
-extern struct task_struct *rt_mutex_get_top_task(struct task_struct *task);
+/*
+ * Must hold either p->pi_lock or task_rq(p)->lock.
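The save/restore pairs above delimit scopes rather than single call sites; a brief illustrative sketch of the NOFS variant (the allocation shown is hypothetical):

	unsigned int nofs_flags;

	nofs_flags = memalloc_nofs_save();
	/* Allocations in this scope behave as if GFP_NOFS had been requested. */
	page = alloc_page(GFP_KERNEL);
	memalloc_nofs_restore(nofs_flags);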
+ */ +static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) +{ + return p->pi_top_task; +} +extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { return tsk->pi_blocked_on != NULL; } #else -static inline int rt_mutex_getprio(struct task_struct *p) -{ - return p->normal_prio; -} - -static inline int rt_mutex_get_effective_prio(struct task_struct *task, - int newprio) -{ - return newprio; -} - static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 2cf446704cd4fce8c380d36df074f8822787b2e2..c06d63b3a58389b92483bb6a0f24f427324ac000 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -293,7 +293,6 @@ extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *, const struct cred *, u32); extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); -extern int kill_proc_info(int, struct siginfo *, pid_t); extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int, struct task_struct *); diff --git a/include/linux/security.h b/include/linux/security.h index 96899fad701696ce9212055a5a4e049cfcf0f933..af675b57664592148e23691a5a3c15ade42ddd0f 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -133,6 +133,10 @@ extern unsigned long dac_mmap_min_addr; /* setfsuid or setfsgid, id0 == fsuid or fsgid */ #define LSM_SETID_FS 8 +/* Flags for security_task_prlimit(). 
*/ +#define LSM_PRLIMIT_READ 1 +#define LSM_PRLIMIT_WRITE 2 + /* forward declares to avoid warnings */ struct sched_param; struct request_sock; @@ -304,6 +308,7 @@ int security_file_send_sigiotask(struct task_struct *tsk, int security_file_receive(struct file *file); int security_file_open(struct file *file, const struct cred *cred); int security_task_create(unsigned long clone_flags); +int security_task_alloc(struct task_struct *task, unsigned long clone_flags); void security_task_free(struct task_struct *task); int security_cred_alloc_blank(struct cred *cred, gfp_t gfp); void security_cred_free(struct cred *cred); @@ -324,6 +329,8 @@ void security_task_getsecid(struct task_struct *p, u32 *secid); int security_task_setnice(struct task_struct *p, int nice); int security_task_setioprio(struct task_struct *p, int ioprio); int security_task_getioprio(struct task_struct *p); +int security_task_prlimit(const struct cred *cred, const struct cred *tcred, + unsigned int flags); int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim); int security_task_setscheduler(struct task_struct *p); @@ -855,6 +862,12 @@ static inline int security_task_create(unsigned long clone_flags) return 0; } +static inline int security_task_alloc(struct task_struct *task, + unsigned long clone_flags) +{ + return 0; +} + static inline void security_task_free(struct task_struct *task) { } @@ -949,6 +962,13 @@ static inline int security_task_getioprio(struct task_struct *p) return 0; } +static inline int security_task_prlimit(const struct cred *cred, + const struct cred *tcred, + unsigned int flags) +{ + return 0; +} + static inline int security_task_setrlimit(struct task_struct *p, unsigned int resource, struct rlimit *new_rlim) diff --git a/include/linux/sem.h b/include/linux/sem.h index 4fc222f8755d82113deba19eea85e8c47a844818..9edec926e9d968a3900ec0f683b039a6cf6f6ef7 100644 --- a/include/linux/sem.h +++ b/include/linux/sem.h @@ -10,8 +10,7 @@ struct task_struct; /* One sem_array data structure for each set of semaphores in the system. */ struct sem_array { - struct kern_ipc_perm ____cacheline_aligned_in_smp - sem_perm; /* permissions .. see ipc.h */ + struct kern_ipc_perm sem_perm; /* permissions .. see ipc.h */ time_t sem_ctime; /* last change time */ struct sem *sem_base; /* ptr to first semaphore in array */ struct list_head pending_alter; /* pending operations */ diff --git a/include/linux/serdev.h b/include/linux/serdev.h index 9519da6253a83bac113219c41ecafda95be3a029..cda76c6506ca411c42c5e1cdf6d287263d97dde6 100644 --- a/include/linux/serdev.h +++ b/include/linux/serdev.h @@ -15,6 +15,8 @@ #include #include +#include +#include struct serdev_controller; struct serdev_device; @@ -39,12 +41,16 @@ struct serdev_device_ops { * @nr: Device number on serdev bus. * @ctrl: serdev controller managing this device. * @ops: Device operations. 
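A plausible caller-side use of the new hook and flags (hedged sketch; old_rlim, new_rlim and tcred follow the prlimit64 convention and are hypothetical here):

	unsigned int flags = 0;

	if (old_rlim)
		flags |= LSM_PRLIMIT_READ;	/* caller reads the current limit */
	if (new_rlim)
		flags |= LSM_PRLIMIT_WRITE;	/* caller modifies the limit */
	retval = security_task_prlimit(current_cred(), tcred, flags);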
+ * @write_comp: Completion used by serdev_device_write() internally
+ * @write_lock: Lock to serialize access when writing data
 */
struct serdev_device {
 struct device dev;
 int nr;
 struct serdev_controller *ctrl;
 const struct serdev_device_ops *ops;
+ struct completion write_comp;
+ struct mutex write_lock;
};
static inline struct serdev_device *to_serdev_device(struct device *d)
@@ -81,6 +87,9 @@ struct serdev_controller_ops {
 void (*close)(struct serdev_controller *);
 void (*set_flow_control)(struct serdev_controller *, bool);
 unsigned int (*set_baudrate)(struct serdev_controller *, unsigned int);
+ void (*wait_until_sent)(struct serdev_controller *, long);
+ int (*get_tiocm)(struct serdev_controller *);
+ int (*set_tiocm)(struct serdev_controller *, unsigned int, unsigned int);
};
/**
@@ -165,7 +174,7 @@ static inline void serdev_controller_write_wakeup(struct serdev_controller *ctrl
 if (!serdev || !serdev->ops->write_wakeup)
 return;
- serdev->ops->write_wakeup(ctrl->serdev);
+ serdev->ops->write_wakeup(serdev);
}
static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl,
@@ -177,7 +186,7 @@ static inline int serdev_controller_receive_buf(struct serdev_controller *ctrl,
 if (!serdev || !serdev->ops->receive_buf)
 return -EINVAL;
- return serdev->ops->receive_buf(ctrl->serdev, data, count);
+ return serdev->ops->receive_buf(serdev, data, count);
}
#if IS_ENABLED(CONFIG_SERIAL_DEV_BUS)
@@ -186,7 +195,11 @@ int serdev_device_open(struct serdev_device *);
void serdev_device_close(struct serdev_device *);
unsigned int serdev_device_set_baudrate(struct serdev_device *, unsigned int);
void serdev_device_set_flow_control(struct serdev_device *, bool);
-int serdev_device_write_buf(struct serdev_device *, const unsigned char *, size_t);
+void serdev_device_wait_until_sent(struct serdev_device *, long);
+int serdev_device_get_tiocm(struct serdev_device *);
+int serdev_device_set_tiocm(struct serdev_device *, int, int);
+void serdev_device_write_wakeup(struct serdev_device *);
+int serdev_device_write(struct serdev_device *, const unsigned char *, size_t, unsigned long);
void serdev_device_write_flush(struct serdev_device *);
int serdev_device_write_room(struct serdev_device *);
@@ -223,7 +236,17 @@ static inline unsigned int serdev_device_set_baudrate(struct serdev_device *sdev
 return 0;
}
static inline void serdev_device_set_flow_control(struct serdev_device *sdev, bool enable) {}
-static inline int serdev_device_write_buf(struct serdev_device *sdev, const unsigned char *buf, size_t count)
+static inline void serdev_device_wait_until_sent(struct serdev_device *sdev, long timeout) {}
+static inline int serdev_device_get_tiocm(struct serdev_device *serdev)
+{
+ return -ENOTSUPP;
+}
+static inline int serdev_device_set_tiocm(struct serdev_device *serdev, int set, int clear)
+{
+ return -ENOTSUPP;
+}
+static inline int serdev_device_write(struct serdev_device *sdev, const unsigned char *buf,
+ size_t count, unsigned long timeout)
{
 return -ENODEV;
}
@@ -238,6 +261,36 @@ static inline int serdev_device_write_room(struct serdev_device *sdev)
#endif /* CONFIG_SERIAL_DEV_BUS */
+static inline bool serdev_device_get_cts(struct serdev_device *serdev)
+{
+ int status = serdev_device_get_tiocm(serdev);
+ return !!(status & TIOCM_CTS);
+}
+
+static inline int serdev_device_wait_for_cts(struct serdev_device *serdev, bool state, int timeout_ms)
+{
+ unsigned long timeout;
+ bool signal;
+
+ timeout = jiffies + msecs_to_jiffies(timeout_ms);
+ while (time_is_after_jiffies(timeout)) {
+ signal =
serdev_device_get_cts(serdev); + if (signal == state) + return 0; + usleep_range(1000, 2000); + } + + return -ETIMEDOUT; +} + +static inline int serdev_device_set_rts(struct serdev_device *serdev, bool enable) +{ + if (enable) + return serdev_device_set_tiocm(serdev, TIOCM_RTS, 0); + else + return serdev_device_set_tiocm(serdev, 0, TIOCM_RTS); +} + /* * serdev hooks into TTY core */ @@ -259,4 +312,11 @@ static inline struct device *serdev_tty_port_register(struct tty_port *port, static inline void serdev_tty_port_unregister(struct tty_port *port) {} #endif /* CONFIG_SERIAL_DEV_CTRL_TTYPORT */ +static inline int serdev_device_write_buf(struct serdev_device *serdev, + const unsigned char *data, + size_t count) +{ + return serdev_device_write(serdev, data, count, 0); +} + #endif /*_LINUX_SERDEV_H */ diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 58484fb35cc869b63fe0a1dec5fa39f1b9eace84..64d892f1e5cd833bd30003e6f64c99da3a73f5f7 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -247,6 +247,7 @@ struct uart_port { unsigned char suspended; unsigned char irq_wake; unsigned char unused[2]; + const char *name; /* port name */ struct attribute_group *attr_group; /* port specific attributes */ const struct attribute_group **tty_groups; /* all attributes (serial core use only) */ struct serial_rs485 rs485; diff --git a/include/linux/serio.h b/include/linux/serio.h index c733cff44e18a74f3949032d1413356df0a9e226..138a5efe863a3cb569de62fe00f93bbe38051279 100644 --- a/include/linux/serio.h +++ b/include/linux/serio.h @@ -77,6 +77,7 @@ struct serio_driver { irqreturn_t (*interrupt)(struct serio *, unsigned char, unsigned int); int (*connect)(struct serio *, struct serio_driver *drv); int (*reconnect)(struct serio *); + int (*fast_reconnect)(struct serio *); void (*disconnect)(struct serio *); void (*cleanup)(struct serio *); diff --git a/include/linux/signal.h b/include/linux/signal.h index 94ad6eea9550352d8e717a90cf50b462b238bfa9..1f5a16620693a644281d6e108c256de0800f977a 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -390,10 +390,6 @@ int unhandled_signal(struct task_struct *tsk, int sig); #define sig_kernel_ignore(sig) siginmask(sig, SIG_KERNEL_IGNORE_MASK) #define sig_kernel_stop(sig) siginmask(sig, SIG_KERNEL_STOP_MASK) -#define sig_user_defined(t, signr) \ - (((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_DFL) && \ - ((t)->sighand->action[(signr)-1].sa.sa_handler != SIG_IGN)) - #define sig_fatal(t, signr) \ (!siginmask(signr, SIG_KERNEL_IGNORE_MASK|SIG_KERNEL_STOP_MASK) && \ (t)->sighand->action[(signr)-1].sa.sa_handler == SIG_DFL) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c776abd86937f52b002773728b5611746f199281..a098d95b3d84d0246adc88651af9978d6810fb01 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -413,14 +413,15 @@ struct ubuf_info { * the end of the header data, ie. at skb->end. */ struct skb_shared_info { + unsigned short _unused; unsigned char nr_frags; __u8 tx_flags; unsigned short gso_size; /* Warning: this field is not always filled in (UFO)! 
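Together, the new helpers allow a simple modem-control handshake; an illustrative sketch (the driver context around serdev is hypothetical):

	/* Raise RTS, then give the peer up to 100ms to assert CTS. */
	serdev_device_set_rts(serdev, true);
	if (serdev_device_wait_for_cts(serdev, true, 100) < 0)
		dev_err(&serdev->dev, "peer did not assert CTS\n");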
*/ unsigned short gso_segs; - unsigned short gso_type; struct sk_buff *frag_list; struct skb_shared_hwtstamps hwtstamps; + unsigned int gso_type; u32 tskey; __be32 ip6_frag_id; @@ -491,6 +492,8 @@ enum { SKB_GSO_TUNNEL_REMCSUM = 1 << 14, SKB_GSO_SCTP = 1 << 15, + + SKB_GSO_ESP = 1 << 16, }; #if BITS_PER_LONG > 32 @@ -3113,7 +3116,7 @@ struct sk_buff *pskb_extract(struct sk_buff *skb, int off, int to_copy, static inline int memcpy_from_msg(void *data, struct msghdr *msg, int len) { - return copy_from_iter(data, len, &msg->msg_iter) == len ? 0 : -EFAULT; + return copy_from_iter_full(data, len, &msg->msg_iter) ? 0 : -EFAULT; } static inline int memcpy_to_msg(struct msghdr *msg, void *data, int len) diff --git a/include/linux/slab.h b/include/linux/slab.h index 3c37a8c5192159c88c892779ffa71c17d23b7736..04a7f7993e678d6454b6efaa608db3b88ea78eec 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -28,7 +28,7 @@ #define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ #define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ /* - * SLAB_DESTROY_BY_RCU - **WARNING** READ THIS! + * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * * This delays freeing the SLAB page by a grace period, it does _NOT_ * delay object freeing. This means that if you do kmem_cache_free() @@ -61,8 +61,10 @@ * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. + * + * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ -#define SLAB_DESTROY_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ +#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ #define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ #define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ diff --git a/include/linux/smp.h b/include/linux/smp.h index 8e0cb7a0f836faadd10ee917f8fe2c76d8965fa6..68123c1fe54918c051292eb5ba3427df09f31c2f 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus; extern void __init setup_nr_cpu_ids(void); extern void __init smp_init(void); +extern int __boot_cpu_id; + +static inline int get_boot_cpu_id(void) +{ + return __boot_cpu_id; +} + #else /* !SMP */ static inline void smp_send_stop(void) { } @@ -158,6 +165,11 @@ static inline void smp_init(void) { up_late_init(); } static inline void smp_init(void) { } #endif +static inline int get_boot_cpu_id(void) +{ + return 0; +} + #endif /* !SMP */ /* diff --git a/include/linux/soc/qcom/smd.h b/include/linux/soc/qcom/smd.h deleted file mode 100644 index f148e0ffbec75f443af5bdb23e8ca318b5743786..0000000000000000000000000000000000000000 --- a/include/linux/soc/qcom/smd.h +++ /dev/null @@ -1,139 +0,0 @@ -#ifndef __QCOM_SMD_H__ -#define __QCOM_SMD_H__ - -#include -#include - -struct qcom_smd; -struct qcom_smd_channel; -struct qcom_smd_lookup; - -/** - * struct qcom_smd_id - struct used for matching a smd device - * @name: name of the channel - */ -struct qcom_smd_id { - char name[20]; -}; - -/** - * struct qcom_smd_device - smd device struct - * @dev: the device struct - * @channel: handle to the smd channel for this device - */ -struct qcom_smd_device { - struct device dev; - struct qcom_smd_channel *channel; -}; - -typedef int (*qcom_smd_cb_t)(struct qcom_smd_channel *, const void *, size_t); - -/** - * struct qcom_smd_driver - smd driver struct - * @driver: underlying device driver - * @smd_match_table: static channel 
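The lock-and-recheck discipline that SLAB_TYPESAFE_BY_RCU demands looks roughly like the hedged sketch below (obj, its lock and its key field are hypothetical):

	rcu_read_lock();
	obj = lookup_object(key);		/* may find a recycled object */
	if (obj) {
		spin_lock(&obj->lock);
		if (obj->key != key) {		/* freed and reused: back off */
			spin_unlock(&obj->lock);
			obj = NULL;
		}
	}
	rcu_read_unlock();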
match table
- * @probe: invoked when the smd channel is found
- * @remove: invoked when the smd channel is closed
- * @callback: invoked when an inbound message is received on the channel,
- * should return 0 on success or -EBUSY if the data cannot be
- * consumed at this time
- */
-struct qcom_smd_driver {
- struct device_driver driver;
- const struct qcom_smd_id *smd_match_table;
-
- int (*probe)(struct qcom_smd_device *dev);
- void (*remove)(struct qcom_smd_device *dev);
- qcom_smd_cb_t callback;
-};
-
-#if IS_ENABLED(CONFIG_QCOM_SMD)
-
-int qcom_smd_driver_register(struct qcom_smd_driver *drv);
-void qcom_smd_driver_unregister(struct qcom_smd_driver *drv);
-
-struct qcom_smd_channel *qcom_smd_open_channel(struct qcom_smd_channel *channel,
- const char *name,
- qcom_smd_cb_t cb);
-void qcom_smd_close_channel(struct qcom_smd_channel *channel);
-void *qcom_smd_get_drvdata(struct qcom_smd_channel *channel);
-void qcom_smd_set_drvdata(struct qcom_smd_channel *channel, void *data);
-int qcom_smd_send(struct qcom_smd_channel *channel, const void *data, int len);
-
-
-struct qcom_smd_edge *qcom_smd_register_edge(struct device *parent,
- struct device_node *node);
-int qcom_smd_unregister_edge(struct qcom_smd_edge *edge);
-
-#else
-
-static inline int qcom_smd_driver_register(struct qcom_smd_driver *drv)
-{
- return -ENXIO;
-}
-
-static inline void qcom_smd_driver_unregister(struct qcom_smd_driver *drv)
-{
- /* This shouldn't be possible */
- WARN_ON(1);
-}
-
-static inline struct qcom_smd_channel *
-qcom_smd_open_channel(struct qcom_smd_channel *channel,
- const char *name,
- qcom_smd_cb_t cb)
-{
- /* This shouldn't be possible */
- WARN_ON(1);
- return NULL;
-}
-
-static inline void qcom_smd_close_channel(struct qcom_smd_channel *channel)
-{
- /* This shouldn't be possible */
- WARN_ON(1);
-}
-
-static inline void *qcom_smd_get_drvdata(struct qcom_smd_channel *channel)
-{
- /* This shouldn't be possible */
- WARN_ON(1);
- return NULL;
-}
-
-static inline void qcom_smd_set_drvdata(struct qcom_smd_channel *channel, void *data)
-{
- /* This shouldn't be possible */
- WARN_ON(1);
-}
-
-static inline int qcom_smd_send(struct qcom_smd_channel *channel,
- const void *data, int len)
-{
- /* This shouldn't be possible */
- WARN_ON(1);
- return -ENXIO;
-}
-
-static inline struct qcom_smd_edge *
-qcom_smd_register_edge(struct device *parent,
- struct device_node *node)
-{
- return ERR_PTR(-ENXIO);
-}
-
-static inline int qcom_smd_unregister_edge(struct qcom_smd_edge *edge)
-{
- /* This shouldn't be possible */
- WARN_ON(1);
- return -ENXIO;
-}
-
-#endif
-
-#define module_qcom_smd_driver(__smd_driver) \
- module_driver(__smd_driver, qcom_smd_driver_register, \
- qcom_smd_driver_unregister)
-
-
-#endif
diff --git a/include/linux/soc/qcom/wcnss_ctrl.h b/include/linux/soc/qcom/wcnss_ctrl.h
index eab64976a73b0e1aa2c15de6a06ae65e1333de99..a4dd4d7c711dc000fc3f1fe93a794dbff245b418 100644
--- a/include/linux/soc/qcom/wcnss_ctrl.h
+++ b/include/linux/soc/qcom/wcnss_ctrl.h
@@ -1,16 +1,19 @@
#ifndef __WCNSS_CTRL_H__
#define __WCNSS_CTRL_H__
-#include
+#include
#if IS_ENABLED(CONFIG_QCOM_WCNSS_CTRL)
-struct qcom_smd_channel *qcom_wcnss_open_channel(void *wcnss, const char *name, qcom_smd_cb_t cb);
+struct rpmsg_endpoint *qcom_wcnss_open_channel(void *wcnss, const char *name,
+ rpmsg_rx_cb_t cb, void *priv);
#else
-static inline struct qcom_smd_channel*
-qcom_wcnss_open_channel(void *wcnss, const char *name, qcom_smd_cb_t cb)
+static inline struct rpmsg_endpoint *qcom_wcnss_open_channel(void *wcnss,
+ const char *name,
+
rpmsg_rx_cb_t cb, + void *priv) { WARN_ON(1); return ERR_PTR(-ENXIO); diff --git a/include/linux/soc/samsung/exynos-regs-pmu.h b/include/linux/soc/samsung/exynos-regs-pmu.h index 49df0a01a2cc203ef2e8ea50560e637e012dac98..bebdde5dccd69ff53112f12464ffa14ba7c971b7 100644 --- a/include/linux/soc/samsung/exynos-regs-pmu.h +++ b/include/linux/soc/samsung/exynos-regs-pmu.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2010-2012 Samsung Electronics Co., Ltd. + * Copyright (c) 2010-2015 Samsung Electronics Co., Ltd. * http://www.samsung.com * * EXYNOS - Power management unit definition @@ -50,6 +50,14 @@ #define S5P_WAKEUP_MASK 0x0608 #define S5P_WAKEUP_MASK2 0x0614 +/* MIPI_PHYn_CONTROL, valid for Exynos3250, Exynos4, Exynos5250 and Exynos5433 */ +#define EXYNOS4_MIPI_PHY_CONTROL(n) (0x0710 + (n) * 4) +/* Phy enable bit, common for all phy registers, not only MIPI */ +#define EXYNOS4_PHY_ENABLE (1 << 0) +#define EXYNOS4_MIPI_PHY_SRESETN (1 << 1) +#define EXYNOS4_MIPI_PHY_MRESETN (1 << 2) +#define EXYNOS4_MIPI_PHY_RESET_MASK (3 << 1) + #define S5P_INFORM0 0x0800 #define S5P_INFORM1 0x0804 #define S5P_INFORM5 0x0814 @@ -342,6 +350,8 @@ #define EXYNOS5_AUTO_WDTRESET_DISABLE 0x0408 #define EXYNOS5_MASK_WDTRESET_REQUEST 0x040C +#define EXYNOS5_USBDRD_PHY_CONTROL 0x0704 +#define EXYNOS5_DPTX_PHY_CONTROL 0x0720 #define EXYNOS5_USE_RETENTION BIT(4) #define EXYNOS5_SYS_WDTRESET (1 << 20) @@ -495,6 +505,9 @@ #define EXYNOS5420_KFC_CORE_RESET(_nr) \ ((EXYNOS5420_KFC_CORE_RESET0 | EXYNOS5420_KFC_ETM_RESET0) << (_nr)) +#define EXYNOS5420_USBDRD1_PHY_CONTROL 0x0708 +#define EXYNOS5420_MIPI_PHY_CONTROL(n) (0x0714 + (n) * 4) +#define EXYNOS5420_DPTX_PHY_CONTROL 0x0728 #define EXYNOS5420_ARM_CORE2_SYS_PWR_REG 0x1020 #define EXYNOS5420_DIS_IRQ_ARM_CORE2_LOCAL_SYS_PWR_REG 0x1024 #define EXYNOS5420_DIS_IRQ_ARM_CORE2_CENTRAL_SYS_PWR_REG 0x1028 @@ -632,6 +645,7 @@ | EXYNOS5420_KFC_USE_STANDBY_WFI3) /* For EXYNOS5433 */ +#define EXYNOS5433_USBHOST30_PHY_CONTROL (0x0728) #define EXYNOS5433_PAD_RETENTION_AUD_OPTION (0x3028) #define EXYNOS5433_PAD_RETENTION_MMC2_OPTION (0x30C8) #define EXYNOS5433_PAD_RETENTION_TOP_OPTION (0x3108) diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h index a0596ca0e80ac77aeb0afa29648532ef51a5deae..a2f8109bb215751427e99b3c026badfe7113b875 100644 --- a/include/linux/sock_diag.h +++ b/include/linux/sock_diag.h @@ -24,6 +24,7 @@ void sock_diag_unregister(const struct sock_diag_handler *h); void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh)); +u64 sock_gen_cookie(struct sock *sk); int sock_diag_check_cookie(struct sock *sk, const __u32 *cookie); void sock_diag_save_cookie(struct sock *sk, __u32 *cookie); diff --git a/include/linux/spi/spi.h b/include/linux/spi/spi.h index 75c6bd0ac605b2024509985214cdc3d451c6ab53..935bd2854ff19b2ab64e6d3786bc4ab53f7bc59c 100644 --- a/include/linux/spi/spi.h +++ b/include/linux/spi/spi.h @@ -23,6 +23,7 @@ #include struct dma_chan; +struct property_entry; struct spi_master; struct spi_transfer; struct spi_flash_read_message; @@ -375,6 +376,8 @@ static inline void spi_unregister_driver(struct spi_driver *sdrv) * @unprepare_message: undo any work done by prepare_message(). * @spi_flash_read: to support spi-controller hardwares that provide * accelerated interface to read from flash devices. + * @spi_flash_can_dma: analogous to can_dma() interface, but for + * controllers implementing spi_flash_read. 
* @flash_read_supported: spi device supports flash read * @cs_gpios: Array of GPIOs to use as chip select lines; one per CS * number. Any individual value may be -ENOENT for CS lines that @@ -538,6 +541,8 @@ struct spi_master { struct spi_message *message); int (*spi_flash_read)(struct spi_device *spi, struct spi_flash_read_message *msg); + bool (*spi_flash_can_dma)(struct spi_device *spi, + struct spi_flash_read_message *msg); bool (*flash_read_supported)(struct spi_device *spi); /* @@ -891,7 +896,7 @@ static inline struct spi_message *spi_message_alloc(unsigned ntrans, gfp_t flags unsigned i; struct spi_transfer *t = (struct spi_transfer *)(m + 1); - INIT_LIST_HEAD(&m->transfers); + spi_message_init_no_memset(m); for (i = 0; i < ntrans; i++, t++) spi_message_add_tail(t, m); } @@ -1209,6 +1214,7 @@ int spi_flash_read(struct spi_device *spi, * @modalias: Initializes spi_device.modalias; identifies the driver. * @platform_data: Initializes spi_device.platform_data; the particular * data stored there is driver-specific. + * @properties: Additional device properties for the device. * @controller_data: Initializes spi_device.controller_data; some * controllers need hints about hardware setup, e.g. for DMA. * @irq: Initializes spi_device.irq; depends on how the board is wired. @@ -1241,10 +1247,12 @@ struct spi_board_info { * * platform_data goes to spi_device.dev.platform_data, * controller_data goes to spi_device.controller_data, + * device properties are copied and attached to spi_device, * irq is copied too */ char modalias[SPI_NAME_SIZE]; const void *platform_data; + const struct property_entry *properties; void *controller_data; int irq; diff --git a/include/linux/splice.h b/include/linux/splice.h index 00a21166e268b50f812807bc7e899360cb65febc..db42746bdfea54858ba2fae9fdaf5e4141b478a9 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -20,6 +20,8 @@ #define SPLICE_F_MORE (0x04) /* expect more data */ #define SPLICE_F_GIFT (0x08) /* pages passed in are a gift */ +#define SPLICE_F_ALL (SPLICE_F_MOVE|SPLICE_F_NONBLOCK|SPLICE_F_MORE|SPLICE_F_GIFT) + /* * Passed to the actors */ @@ -55,7 +57,6 @@ struct splice_pipe_desc { struct partial_page *partial; /* pages[] may not be contig */ int nr_pages; /* number of populated pages in map */ unsigned int nr_pages_max; /* pages[] & partial[] arrays size */ - unsigned int flags; /* splice flags */ const struct pipe_buf_operations *ops;/* ops associated with output pipe */ void (*spd_release)(struct splice_pipe_desc *, unsigned int); }; @@ -82,7 +83,6 @@ extern ssize_t splice_direct_to_actor(struct file *, struct splice_desc *, */ extern int splice_grow_spd(const struct pipe_inode_info *, struct splice_pipe_desc *); extern void splice_shrink_spd(struct splice_pipe_desc *); -extern void spd_release_page(struct splice_pipe_desc *, unsigned int); extern const struct pipe_buf_operations page_cache_pipe_buf_ops; extern const struct pipe_buf_operations default_pipe_buf_ops; diff --git a/include/linux/srcu.h b/include/linux/srcu.h index a598cf3ac70ca686ca60c77dbdfb986a554993af..167ad8831aafe092a53f87b982acb45748973337 100644 --- a/include/linux/srcu.h +++ b/include/linux/srcu.h @@ -22,7 +22,7 @@ * Lai Jiangshan * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt + * Documentation/RCU/ *.txt * */ @@ -32,35 +32,9 @@ #include #include #include +#include -struct srcu_array { - unsigned long lock_count[2]; - unsigned long unlock_count[2]; -}; - -struct rcu_batch { - struct rcu_head *head, **tail; -}; - 
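A controller with accelerated flash reads would be expected to wire the new callback alongside the existing ones (hedged sketch; the my_* handlers are hypothetical):

	master->spi_flash_read = my_spi_flash_read;
	master->spi_flash_can_dma = my_spi_flash_can_dma;	/* checked before DMA-mapping msg buffers */
	master->flash_read_supported = my_flash_read_supported;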
-#define RCU_BATCH_INIT(name) { NULL, &(name.head) } - -struct srcu_struct { - unsigned long completed; - struct srcu_array __percpu *per_cpu_ref; - spinlock_t queue_lock; /* protect ->batch_queue, ->running */ - bool running; - /* callbacks just queued */ - struct rcu_batch batch_queue; - /* callbacks try to do the first check_zero */ - struct rcu_batch batch_check0; - /* callbacks done with the first check_zero and the flip */ - struct rcu_batch batch_check1; - struct rcu_batch batch_done; - struct delayed_work work; -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -}; +struct srcu_struct; #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -82,46 +56,15 @@ int init_srcu_struct(struct srcu_struct *sp); #define __SRCU_DEP_MAP_INIT(srcu_name) #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ -void process_srcu(struct work_struct *work); - -#define __SRCU_STRUCT_INIT(name) \ - { \ - .completed = -300, \ - .per_cpu_ref = &name##_srcu_array, \ - .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ - .running = false, \ - .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ - .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ - .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ - .batch_done = RCU_BATCH_INIT(name.batch_done), \ - .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ - __SRCU_DEP_MAP_INIT(name) \ - } - -/* - * Define and initialize a srcu struct at build time. - * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. - * - * Note that although DEFINE_STATIC_SRCU() hides the name from other - * files, the per-CPU variable rules nevertheless require that the - * chosen name be globally unique. These rules also prohibit use of - * DEFINE_STATIC_SRCU() within a function. If these rules are too - * restrictive, declare the srcu_struct manually. For example, in - * each file: - * - * static struct srcu_struct my_srcu; - * - * Then, before the first use of each my_srcu, manually initialize it: - * - * init_srcu_struct(&my_srcu); - * - * See include/linux/percpu-defs.h for the rules on per-CPU variables. - */ -#define __DEFINE_SRCU(name, is_static) \ - static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ - is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) -#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) -#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) +#ifdef CONFIG_TINY_SRCU +#include +#elif defined(CONFIG_TREE_SRCU) +#include +#elif defined(CONFIG_CLASSIC_SRCU) +#include +#else +#error "Unknown SRCU implementation specified to kernel configuration" +#endif /** * call_srcu() - Queue a callback for invocation after an SRCU grace period @@ -147,9 +90,6 @@ void cleanup_srcu_struct(struct srcu_struct *sp); int __srcu_read_lock(struct srcu_struct *sp) __acquires(sp); void __srcu_read_unlock(struct srcu_struct *sp, int idx) __releases(sp); void synchronize_srcu(struct srcu_struct *sp); -void synchronize_srcu_expedited(struct srcu_struct *sp); -unsigned long srcu_batches_completed(struct srcu_struct *sp); -void srcu_barrier(struct srcu_struct *sp); #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/include/linux/srcuclassic.h b/include/linux/srcuclassic.h new file mode 100644 index 0000000000000000000000000000000000000000..5753f73222629c1340d363eba4e50510e437c575 --- /dev/null +++ b/include/linux/srcuclassic.h @@ -0,0 +1,115 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * classic v4.11 variant. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_CLASSIC_H +#define _LINUX_SRCU_CLASSIC_H + +struct srcu_array { + unsigned long lock_count[2]; + unsigned long unlock_count[2]; +}; + +struct rcu_batch { + struct rcu_head *head, **tail; +}; + +#define RCU_BATCH_INIT(name) { NULL, &(name.head) } + +struct srcu_struct { + unsigned long completed; + struct srcu_array __percpu *per_cpu_ref; + spinlock_t queue_lock; /* protect ->batch_queue, ->running */ + bool running; + /* callbacks just queued */ + struct rcu_batch batch_queue; + /* callbacks try to do the first check_zero */ + struct rcu_batch batch_check0; + /* callbacks done with the first check_zero and the flip */ + struct rcu_batch batch_check1; + struct rcu_batch batch_done; + struct delayed_work work; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +void process_srcu(struct work_struct *work); + +#define __SRCU_STRUCT_INIT(name) \ + { \ + .completed = -300, \ + .per_cpu_ref = &name##_srcu_array, \ + .queue_lock = __SPIN_LOCK_UNLOCKED(name.queue_lock), \ + .running = false, \ + .batch_queue = RCU_BATCH_INIT(name.batch_queue), \ + .batch_check0 = RCU_BATCH_INIT(name.batch_check0), \ + .batch_check1 = RCU_BATCH_INIT(name.batch_check1), \ + .batch_done = RCU_BATCH_INIT(name.batch_done), \ + .work = __DELAYED_WORK_INITIALIZER(name.work, process_srcu, 0),\ + __SRCU_DEP_MAP_INIT(name) \ + } + +/* + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. 
+ */ +#define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_array, name##_srcu_array);\ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) + +void synchronize_srcu_expedited(struct srcu_struct *sp); +void srcu_barrier(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); + +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->completed; + *gpnum = *completed; + if (sp->batch_queue.head || sp->batch_check0.head || sp->batch_check1.head) + (*gpnum)++; +} + +#endif diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h new file mode 100644 index 0000000000000000000000000000000000000000..42311ee0334fde629280f6b92cdc065853e3ba14 --- /dev/null +++ b/include/linux/srcutiny.h @@ -0,0 +1,93 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tiny variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_TINY_H +#define _LINUX_SRCU_TINY_H + +#include <linux/swait.h> + +struct srcu_struct { + int srcu_lock_nesting[2]; /* srcu_read_lock() nesting depth. */ + struct swait_queue_head srcu_wq; + /* Last srcu_read_unlock() wakes GP. */ + unsigned long srcu_gp_seq; /* GP seq # for callback tagging. */ + struct rcu_segcblist srcu_cblist; + /* Pending SRCU callbacks. */ + int srcu_idx; /* Current reader array element. */ + bool srcu_gp_running; /* GP workqueue running? */ + bool srcu_gp_waiting; /* GP waiting for readers? */ + struct work_struct srcu_work; /* For driving grace periods. */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +void srcu_drive_gp(struct work_struct *wp); + +#define __SRCU_STRUCT_INIT(name) \ +{ \ + .srcu_wq = __SWAIT_QUEUE_HEAD_INITIALIZER(name.srcu_wq), \ + .srcu_cblist = RCU_SEGCBLIST_INITIALIZER(name.srcu_cblist), \ + .srcu_work = __WORK_INITIALIZER(name.srcu_work, srcu_drive_gp), \ + __SRCU_DEP_MAP_INIT(name) \ +} + +/* + * This odd _STATIC_ arrangement is needed for API compatibility with + * Tree SRCU, which needs some per-CPU data.
+ */ +#define DEFINE_SRCU(name) \ + struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_STATIC_SRCU(name) \ + static struct srcu_struct name = __SRCU_STRUCT_INIT(name) + +void synchronize_srcu(struct srcu_struct *sp); + +static inline void synchronize_srcu_expedited(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} + +static inline void srcu_barrier(struct srcu_struct *sp) +{ + synchronize_srcu(sp); +} + +static inline unsigned long srcu_batches_completed(struct srcu_struct *sp) +{ + return 0; +} + +static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, + unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = sp->srcu_gp_seq; + *gpnum = *completed; +} + +#endif diff --git a/include/linux/srcutree.h b/include/linux/srcutree.h new file mode 100644 index 0000000000000000000000000000000000000000..32e86d85fd11f54bd973129a6428c4c09c104ace --- /dev/null +++ b/include/linux/srcutree.h @@ -0,0 +1,150 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tree variant. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#ifndef _LINUX_SRCU_TREE_H +#define _LINUX_SRCU_TREE_H + +#include +#include + +struct srcu_node; +struct srcu_struct; + +/* + * Per-CPU structure feeding into leaf srcu_node, similar in function + * to rcu_node. + */ +struct srcu_data { + /* Read-side state. */ + unsigned long srcu_lock_count[2]; /* Locks per CPU. */ + unsigned long srcu_unlock_count[2]; /* Unlocks per CPU. */ + + /* Update-side state. */ + spinlock_t lock ____cacheline_internodealigned_in_smp; + struct rcu_segcblist srcu_cblist; /* List of callbacks.*/ + unsigned long srcu_gp_seq_needed; /* Furthest future GP needed. */ + unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ + bool srcu_cblist_invoking; /* Invoking these CBs? */ + struct delayed_work work; /* Context for CB invoking. */ + struct rcu_head srcu_barrier_head; /* For srcu_barrier() use. */ + struct srcu_node *mynode; /* Leaf srcu_node. */ + unsigned long grpmask; /* Mask for leaf srcu_node */ + /* ->srcu_data_have_cbs[]. */ + int cpu; + struct srcu_struct *sp; +}; + +/* + * Node in SRCU combining tree, similar in function to rcu_data. + */ +struct srcu_node { + spinlock_t lock; + unsigned long srcu_have_cbs[4]; /* GP seq for children */ + /* having CBs, but only */ + /* is > ->srcu_gq_seq. */ + unsigned long srcu_data_have_cbs[4]; /* Which srcu_data structs */ + /* have CBs for given GP? */ + unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ + struct srcu_node *srcu_parent; /* Next up in tree. */ + int grplo; /* Least CPU for node. */ + int grphi; /* Biggest CPU for node. 
*/ +}; + +/* + * Per-SRCU-domain structure, similar in function to rcu_state. + */ +struct srcu_struct { + struct srcu_node node[NUM_RCU_NODES]; /* Combining tree. */ + struct srcu_node *level[RCU_NUM_LVLS + 1]; + /* First node at each level. */ + struct mutex srcu_cb_mutex; /* Serialize CB preparation. */ + spinlock_t gp_lock; /* protect ->srcu_cblist */ + struct mutex srcu_gp_mutex; /* Serialize GP work. */ + unsigned int srcu_idx; /* Current rdr array element. */ + unsigned long srcu_gp_seq; /* Grace-period seq #. */ + unsigned long srcu_gp_seq_needed; /* Latest gp_seq needed. */ + unsigned long srcu_gp_seq_needed_exp; /* Furthest future exp GP. */ + unsigned long srcu_last_gp_end; /* Last GP end timestamp (ns) */ + struct srcu_data __percpu *sda; /* Per-CPU srcu_data array. */ + unsigned long srcu_barrier_seq; /* srcu_barrier seq #. */ + struct mutex srcu_barrier_mutex; /* Serialize barrier ops. */ + struct completion srcu_barrier_completion; + /* Awaken barrier rq at end. */ + atomic_t srcu_barrier_cpu_cnt; /* # CPUs not yet posting a */ + /* callback for the barrier */ + /* operation. */ + struct delayed_work work; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ +}; + +/* Values for state variable (bottom bits of ->srcu_gp_seq). */ +#define SRCU_STATE_IDLE 0 +#define SRCU_STATE_SCAN1 1 +#define SRCU_STATE_SCAN2 2 + +void process_srcu(struct work_struct *work); + +#define __SRCU_STRUCT_INIT(name) \ + { \ + .sda = &name##_srcu_data, \ + .gp_lock = __SPIN_LOCK_UNLOCKED(name.gp_lock), \ + .srcu_gp_seq_needed = 0 - 1, \ + __SRCU_DEP_MAP_INIT(name) \ + } + +/* + * Define and initialize a srcu struct at build time. + * Do -not- call init_srcu_struct() nor cleanup_srcu_struct() on it. + * + * Note that although DEFINE_STATIC_SRCU() hides the name from other + * files, the per-CPU variable rules nevertheless require that the + * chosen name be globally unique. These rules also prohibit use of + * DEFINE_STATIC_SRCU() within a function. If these rules are too + * restrictive, declare the srcu_struct manually. For example, in + * each file: + * + * static struct srcu_struct my_srcu; + * + * Then, before the first use of each my_srcu, manually initialize it: + * + * init_srcu_struct(&my_srcu); + * + * See include/linux/percpu-defs.h for the rules on per-CPU variables. 
+ */ +#define __DEFINE_SRCU(name, is_static) \ + static DEFINE_PER_CPU(struct srcu_data, name##_srcu_data);\ + is_static struct srcu_struct name = __SRCU_STRUCT_INIT(name) +#define DEFINE_SRCU(name) __DEFINE_SRCU(name, /* not static */) +#define DEFINE_STATIC_SRCU(name) __DEFINE_SRCU(name, static) + +void synchronize_srcu_expedited(struct srcu_struct *sp); +void srcu_barrier(struct srcu_struct *sp); +unsigned long srcu_batches_completed(struct srcu_struct *sp); + +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed); + +#endif diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h index 0a34489a46b62a4c526e27b39b53d3d571db6727..4205f71a5f0e99e9900ec47de977aed8f6453aaf 100644 --- a/include/linux/stacktrace.h +++ b/include/linux/stacktrace.h @@ -18,6 +18,8 @@ extern void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace); extern void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace); +extern int save_stack_trace_tsk_reliable(struct task_struct *tsk, + struct stack_trace *trace); extern void print_stack_trace(struct stack_trace *trace, int spaces); extern int snprint_stack_trace(char *buf, size_t size, @@ -29,12 +31,13 @@ extern void save_stack_trace_user(struct stack_trace *trace); # define save_stack_trace_user(trace) do { } while (0) #endif -#else +#else /* !CONFIG_STACKTRACE */ # define save_stack_trace(trace) do { } while (0) # define save_stack_trace_tsk(tsk, trace) do { } while (0) # define save_stack_trace_user(trace) do { } while (0) # define print_stack_trace(trace, spaces) do { } while (0) # define snprint_stack_trace(buf, size, trace, spaces) do { } while (0) -#endif +# define save_stack_trace_tsk_reliable(tsk, trace) ({ -ENOSYS; }) +#endif /* CONFIG_STACKTRACE */ -#endif +#endif /* __LINUX_STACKTRACE_H */ diff --git a/include/linux/stmmac.h b/include/linux/stmmac.h index fc273e9d5f67625b9ddf611746d1a09ff555e4ab..3921cb9dfadb9b4a9bbe30854ccada75725b8005 100644 --- a/include/linux/stmmac.h +++ b/include/linux/stmmac.h @@ -28,6 +28,9 @@ #include +#define MTL_MAX_RX_QUEUES 8 +#define MTL_MAX_TX_QUEUES 8 + #define STMMAC_RX_COE_NONE 0 #define STMMAC_RX_COE_TYPE1 1 #define STMMAC_RX_COE_TYPE2 2 @@ -44,6 +47,18 @@ #define STMMAC_CSR_150_250M 0x4 /* MDC = clk_scr_i/102 */ #define STMMAC_CSR_250_300M 0x5 /* MDC = clk_scr_i/122 */ +/* MTL algorithms identifiers */ +#define MTL_TX_ALGORITHM_WRR 0x0 +#define MTL_TX_ALGORITHM_WFQ 0x1 +#define MTL_TX_ALGORITHM_DWRR 0x2 +#define MTL_TX_ALGORITHM_SP 0x3 +#define MTL_RX_ALGORITHM_SP 0x4 +#define MTL_RX_ALGORITHM_WSP 0x5 + +/* RX/TX Queue Mode */ +#define MTL_QUEUE_AVB 0x0 +#define MTL_QUEUE_DCB 0x1 + /* The MDC clock could be set higher than the IEEE 802.3 * specified frequency limit 0f 2.5 MHz, by programming a clock divider * of value different than the above defined values. 
The resultant MDIO @@ -109,6 +124,26 @@ struct stmmac_axi { bool axi_rb; }; +struct stmmac_rxq_cfg { + u8 mode_to_use; + u8 chan; + u8 pkt_route; + bool use_prio; + u32 prio; +}; + +struct stmmac_txq_cfg { + u8 weight; + u8 mode_to_use; + /* Credit Base Shaper parameters */ + u32 send_slope; + u32 idle_slope; + u32 high_credit; + u32 low_credit; + bool use_prio; + u32 prio; +}; + struct plat_stmmacenet_data { int bus_id; int phy_addr; @@ -133,6 +168,12 @@ struct plat_stmmacenet_data { int unicast_filter_entries; int tx_fifo_size; int rx_fifo_size; + u8 rx_queues_to_use; + u8 tx_queues_to_use; + u8 rx_sched_algorithm; + u8 tx_sched_algorithm; + struct stmmac_rxq_cfg rx_queues_cfg[MTL_MAX_RX_QUEUES]; + struct stmmac_txq_cfg tx_queues_cfg[MTL_MAX_TX_QUEUES]; void (*fix_mac_speed)(void *priv, unsigned int speed); int (*init)(struct platform_device *pdev, void *priv); void (*exit)(struct platform_device *pdev, void *priv); diff --git a/include/linux/string.h b/include/linux/string.h index 26b6f6a66f835b5f3fcb44da978e3d75b57054ec..537918f8a98eec239a8f16c7ae64cadcbc847789 100644 --- a/include/linux/string.h +++ b/include/linux/string.h @@ -114,6 +114,14 @@ extern int memcmp(const void *,const void *,__kernel_size_t); #ifndef __HAVE_ARCH_MEMCHR extern void * memchr(const void *,int,__kernel_size_t); #endif +#ifndef __HAVE_ARCH_MEMCPY_MCSAFE +static inline __must_check int memcpy_mcsafe(void *dst, const void *src, + size_t cnt) +{ + memcpy(dst, src, cnt); + return 0; +} +#endif void *memchr_inv(const void *s, int c, size_t n); char *strreplace(char *s, char old, char new); @@ -135,6 +143,16 @@ static inline int strtobool(const char *s, bool *res) } int match_string(const char * const *array, size_t n, const char *string); +int __sysfs_match_string(const char * const *array, size_t n, const char *s); + +/** + * sysfs_match_string - matches given string in an array + * @_a: array of strings + * @_s: string to match with + * + * Helper for __sysfs_match_string(). Calculates the size of @a automatically. + */ +#define sysfs_match_string(_a, _s) __sysfs_match_string(_a, ARRAY_SIZE(_a), _s) #ifdef CONFIG_BINARY_PRINTF int vbin_printf(u32 *bin_buf, size_t size, const char *fmt, va_list args); diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h index 245fc59b73247d744682c128bfcae1270e146c26..b7e85b341a54f5498e8a2095f3aef84cf5a0fe63 100644 --- a/include/linux/sunrpc/rpc_rdma.h +++ b/include/linux/sunrpc/rpc_rdma.h @@ -143,6 +143,9 @@ enum rpcrdma_proc { #define rdma_done cpu_to_be32(RDMA_DONE) #define rdma_error cpu_to_be32(RDMA_ERROR) +#define err_vers cpu_to_be32(ERR_VERS) +#define err_chunk cpu_to_be32(ERR_CHUNK) + /* * Private extension to RPC-over-RDMA Version One. * Message passed during RDMA-CM connection set-up. 
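The sysfs_match_string() helper added to string.h above is intended to replace the open-coded sysfs_streq() loops in sysfs store() callbacks. A hedged sketch of such a callback; the device, attribute, and mode names are hypothetical:

#include <linux/device.h>
#include <linux/string.h>

static const char * const my_modes[] = { "auto", "manual", "off" };

static ssize_t mode_store(struct device *dev, struct device_attribute *attr,
			  const char *buf, size_t count)
{
	int idx = sysfs_match_string(my_modes, buf);

	if (idx < 0)
		return idx;	/* no entry matched: propagate -EINVAL */

	/* ... apply my_modes[idx] to the (hypothetical) device ... */
	return count;
}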
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h index e770abeed32d7117c4f2d363f9d7370a60d2c55f..94631026f79c56f022976a85dcde92379507e87c 100644 --- a/include/linux/sunrpc/svc.h +++ b/include/linux/sunrpc/svc.h @@ -336,8 +336,7 @@ xdr_argsize_check(struct svc_rqst *rqstp, __be32 *p) { char *cp = (char *)p; struct kvec *vec = &rqstp->rq_arg.head[0]; - return cp >= (char*)vec->iov_base - && cp <= (char*)vec->iov_base + vec->iov_len; + return cp == (char *)vec->iov_base + vec->iov_len; } static inline int @@ -474,6 +473,7 @@ void svc_pool_map_put(void); struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int, struct svc_serv_ops *); int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int); +int svc_set_num_threads_sync(struct svc_serv *, struct svc_pool *, int); int svc_pool_stats_open(struct svc_serv *serv, struct file *file); void svc_destroy(struct svc_serv *); void svc_shutdown_net(struct svc_serv *, struct net *); diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index b105f73e3ca26355b2ee8b32651b48526a945899..f3787d800ba46b7cdae49e19584fe64cbdff2ef3 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -48,6 +48,12 @@ #include #define SVCRDMA_DEBUG +/* Default and maximum inline threshold sizes */ +enum { + RPCRDMA_DEF_INLINE_THRESH = 4096, + RPCRDMA_MAX_INLINE_THRESH = 65536 +}; + /* RPC/RDMA parameters and stats */ extern unsigned int svcrdma_ord; extern unsigned int svcrdma_max_requests; @@ -85,27 +91,11 @@ struct svc_rdma_op_ctxt { enum dma_data_direction direction; int count; unsigned int mapped_sges; - struct ib_sge sge[RPCSVC_MAXPAGES]; + struct ib_send_wr send_wr; + struct ib_sge sge[1 + RPCRDMA_MAX_INLINE_THRESH / PAGE_SIZE]; struct page *pages[RPCSVC_MAXPAGES]; }; -/* - * NFS_ requests are mapped on the client side by the chunk lists in - * the RPCRDMA header. During the fetching of the RPC from the client - * and the writing of the reply to the client, the memory in the - * client and the memory in the server must be mapped as contiguous - * vaddr/len for access by the hardware. These data strucures keep - * these mappings. - * - * For an RDMA_WRITE, the 'sge' maps the RPC REPLY. For RDMA_READ, the - * 'sge' in the svc_rdma_req_map maps the server side RPC reply and the - * 'ch' field maps the read-list of the RPCRDMA header to the 'sge' - * mapping of the reply. 
- */ -struct svc_rdma_chunk_sge { - int start; /* sge no for this chunk */ - int count; /* sge count for this chunk */ -}; struct svc_rdma_fastreg_mr { struct ib_mr *mr; struct scatterlist *sg; @@ -114,15 +104,7 @@ struct svc_rdma_fastreg_mr { enum dma_data_direction direction; struct list_head frmr_list; }; -struct svc_rdma_req_map { - struct list_head free; - unsigned long count; - union { - struct kvec sge[RPCSVC_MAXPAGES]; - struct svc_rdma_chunk_sge ch[RPCSVC_MAXPAGES]; - unsigned long lkey[RPCSVC_MAXPAGES]; - }; -}; + #define RDMACTXT_F_LAST_CTXT 2 #define SVCRDMA_DEVCAP_FAST_REG 1 /* fast mr registration */ @@ -144,14 +126,15 @@ struct svcxprt_rdma { u32 sc_max_requests; /* Max requests */ u32 sc_max_bc_requests;/* Backward credits */ int sc_max_req_size; /* Size of each RQ WR buf */ + u8 sc_port_num; struct ib_pd *sc_pd; spinlock_t sc_ctxt_lock; struct list_head sc_ctxts; int sc_ctxt_used; - spinlock_t sc_map_lock; - struct list_head sc_maps; + spinlock_t sc_rw_ctxt_lock; + struct list_head sc_rw_ctxts; struct list_head sc_rq_dto_q; spinlock_t sc_rq_dto_lock; @@ -181,9 +164,7 @@ struct svcxprt_rdma { /* The default ORD value is based on two outstanding full-size writes with a * page size of 4k, or 32k * 2 ops / 4k = 16 outstanding RDMA_READ. */ #define RPCRDMA_ORD (64/4) -#define RPCRDMA_SQ_DEPTH_MULT 8 #define RPCRDMA_MAX_REQUESTS 32 -#define RPCRDMA_MAX_REQ_SIZE 4096 /* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our * current NFSv4.1 implementation supports one backchannel slot. @@ -201,19 +182,11 @@ static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma, /* svc_rdma_backchannel.c */ extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, - struct rpcrdma_msg *rmsgp, + __be32 *rdma_resp, struct xdr_buf *rcvbuf); /* svc_rdma_marshal.c */ extern int svc_rdma_xdr_decode_req(struct xdr_buf *); -extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, - struct rpcrdma_msg *, - enum rpcrdma_errcode, __be32 *); -extern void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *, int); -extern void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *, int); -extern void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *, int, - __be32, __be64, u32); -extern unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp); /* svc_rdma_recvfrom.c */ extern int svc_rdma_recvfrom(struct svc_rqst *); @@ -224,16 +197,25 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, struct svc_rdma_op_ctxt *, int *, u32 *, u32, u32, u64, bool); +/* svc_rdma_rw.c */ +extern void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma); +extern int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, + __be32 *wr_ch, struct xdr_buf *xdr); +extern int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, + __be32 *rp_ch, bool writelist, + struct xdr_buf *xdr); + /* svc_rdma_sendto.c */ -extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, - struct svc_rdma_req_map *, bool); +extern int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + __be32 *rdma_resp, unsigned int len); +extern int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + int num_sge, u32 inv_rkey); extern int svc_rdma_sendto(struct svc_rqst *); -extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, - int); /* svc_rdma_transport.c */ extern void svc_rdma_wc_send(struct ib_cq *, struct ib_wc *); -extern void svc_rdma_wc_write(struct ib_cq *, struct ib_wc *); extern void 
svc_rdma_wc_reg(struct ib_cq *, struct ib_wc *); extern void svc_rdma_wc_read(struct ib_cq *, struct ib_wc *); extern void svc_rdma_wc_inv(struct ib_cq *, struct ib_wc *); @@ -244,9 +226,6 @@ extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); -extern struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *); -extern void svc_rdma_put_req_map(struct svcxprt_rdma *, - struct svc_rdma_req_map *); extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *); extern void svc_rdma_put_frmr(struct svcxprt_rdma *, struct svc_rdma_fastreg_mr *); diff --git a/include/linux/suspend.h b/include/linux/suspend.h index d9718378a8bee0b327d08c2e80a6fd3b5490b967..0b1cf32edfd7ba1c456252124e23c68450d5bcc3 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -189,6 +189,8 @@ struct platform_suspend_ops { struct platform_freeze_ops { int (*begin)(void); int (*prepare)(void); + void (*wake)(void); + void (*sync)(void); void (*restore)(void); void (*end)(void); }; @@ -428,7 +430,8 @@ extern unsigned int pm_wakeup_irq; extern bool pm_wakeup_pending(void); extern void pm_system_wakeup(void); -extern void pm_wakeup_clear(void); +extern void pm_system_cancel_wakeup(void); +extern void pm_wakeup_clear(bool reset); extern void pm_system_irq_wakeup(unsigned int irq_number); extern bool pm_get_wakeup_count(unsigned int *count, bool block); extern bool pm_save_wakeup_count(unsigned int count); @@ -478,7 +481,7 @@ static inline int unregister_pm_notifier(struct notifier_block *nb) static inline bool pm_wakeup_pending(void) { return false; } static inline void pm_system_wakeup(void) {} -static inline void pm_wakeup_clear(void) {} +static inline void pm_wakeup_clear(bool reset) {} static inline void pm_system_irq_wakeup(unsigned int irq_number) {} static inline void lock_system_sleep(void) {} diff --git a/include/linux/swap.h b/include/linux/swap.h index 45e91dd6716d89b0ffa8f2d97481d12732d20173..ba5882419a7dbcebe0cbf7edee276346429d8b79 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -279,7 +279,7 @@ extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); -extern void deactivate_page(struct page *page); +extern void mark_page_lazyfree(struct page *page); extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); @@ -411,9 +411,6 @@ struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); -extern int get_swap_slots(int n, swp_entry_t *slots); -extern void swapcache_free_batch(swp_entry_t *entries, int n); - #else /* CONFIG_SWAP */ #define swap_address_space(entry) (NULL) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index b7e82049fec754c7f2c88d3b50fd513395958660..80d07816def05bf0f7662c46d56387830dec3d57 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -180,7 +180,6 @@ extern void setup_sysctl_set(struct ctl_table_set *p, int (*is_seen)(struct ctl_table_set *)); extern void retire_sysctl_set(struct ctl_table_set *set); -void register_sysctl_root(struct ctl_table_root *root); struct ctl_table_header *__register_sysctl_table( struct ctl_table_set 
*set, const char *path, struct ctl_table *table); diff --git a/include/linux/t10-pi.h b/include/linux/t10-pi.h index 9fba9dd33544087dc5dfc55733ec3e4cdb9a317f..9375d23a24e7aba97504d743e77d3b78f4d72493 100644 --- a/include/linux/t10-pi.h +++ b/include/linux/t10-pi.h @@ -34,9 +34,9 @@ struct t10_pi_tuple { }; -extern struct blk_integrity_profile t10_pi_type1_crc; -extern struct blk_integrity_profile t10_pi_type1_ip; -extern struct blk_integrity_profile t10_pi_type3_crc; -extern struct blk_integrity_profile t10_pi_type3_ip; +extern const struct blk_integrity_profile t10_pi_type1_crc; +extern const struct blk_integrity_profile t10_pi_type1_ip; +extern const struct blk_integrity_profile t10_pi_type3_crc; +extern const struct blk_integrity_profile t10_pi_type3_ip; #endif diff --git a/include/linux/tcp.h b/include/linux/tcp.h index cfc2d9506ce8077af1ec92eb7086fd52ce4fe1ac..b6d5adcee8fcb611de202993623cc80274d262e4 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -233,12 +233,14 @@ struct tcp_sock { u8 syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ + syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ save_syn:1, /* Save headers of SYN packet */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */ u32 tlp_high_seq; /* snd_nxt at the time of TLP retransmit. */ /* RTT measurement */ + struct skb_mstamp tcp_mstamp; /* most recent packet received/sent */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ @@ -331,16 +333,16 @@ struct tcp_sock { /* Receiver side RTT estimation */ struct { - u32 rtt; - u32 seq; - u32 time; + u32 rtt_us; + u32 seq; + struct skb_mstamp time; } rcv_rtt_est; /* Receiver queue space */ struct { - int space; - u32 seq; - u32 time; + int space; + u32 seq; + struct skb_mstamp time; } rcvq_space; /* TCP-specific MTU probe information. */ diff --git a/include/linux/tee_drv.h b/include/linux/tee_drv.h new file mode 100644 index 0000000000000000000000000000000000000000..0f175b8f6456c1d5ad14538deb66b9c3a9f3c5c7 --- /dev/null +++ b/include/linux/tee_drv.h @@ -0,0 +1,277 @@ +/* + * Copyright (c) 2015-2016, Linaro Limited + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#ifndef __TEE_DRV_H +#define __TEE_DRV_H + +#include +#include +#include +#include + +/* + * The file describes the API provided by the generic TEE driver to the + * specific TEE driver. 
+ */ + +#define TEE_SHM_MAPPED 0x1 /* Memory mapped by the kernel */ +#define TEE_SHM_DMA_BUF 0x2 /* Memory with dma-buf handle */ + +struct tee_device; +struct tee_shm; +struct tee_shm_pool; + +/** + * struct tee_context - driver specific context on file pointer data + * @teedev: pointer to this driver's struct tee_device + * @list_shm: List of shared memory objects owned by this context + * @data: driver specific context data, managed by the driver + */ +struct tee_context { + struct tee_device *teedev; + struct list_head list_shm; + void *data; +}; + +struct tee_param_memref { + size_t shm_offs; + size_t size; + struct tee_shm *shm; +}; + +struct tee_param_value { + u64 a; + u64 b; + u64 c; +}; + +struct tee_param { + u64 attr; + union { + struct tee_param_memref memref; + struct tee_param_value value; + } u; +}; + +/** + * struct tee_driver_ops - driver operations vtable + * @get_version: returns version of driver + * @open: called when the device file is opened + * @release: release this open file + * @open_session: open a new session + * @close_session: close a session + * @invoke_func: invoke a trusted function + * @cancel_req: request cancel of an ongoing invoke or open + * @supp_recv: called for supplicant to get a command + * @supp_send: called for supplicant to send a response + */ +struct tee_driver_ops { + void (*get_version)(struct tee_device *teedev, + struct tee_ioctl_version_data *vers); + int (*open)(struct tee_context *ctx); + void (*release)(struct tee_context *ctx); + int (*open_session)(struct tee_context *ctx, + struct tee_ioctl_open_session_arg *arg, + struct tee_param *param); + int (*close_session)(struct tee_context *ctx, u32 session); + int (*invoke_func)(struct tee_context *ctx, + struct tee_ioctl_invoke_arg *arg, + struct tee_param *param); + int (*cancel_req)(struct tee_context *ctx, u32 cancel_id, u32 session); + int (*supp_recv)(struct tee_context *ctx, u32 *func, u32 *num_params, + struct tee_param *param); + int (*supp_send)(struct tee_context *ctx, u32 ret, u32 num_params, + struct tee_param *param); +}; + +/** + * struct tee_desc - Describes the TEE driver to the subsystem + * @name: name of driver + * @ops: driver operations vtable + * @owner: module providing the driver + * @flags: Extra properties of driver, defined by TEE_DESC_* below + */ +#define TEE_DESC_PRIVILEGED 0x1 +struct tee_desc { + const char *name; + const struct tee_driver_ops *ops; + struct module *owner; + u32 flags; +}; + +/** + * tee_device_alloc() - Allocate a new struct tee_device instance + * @teedesc: Descriptor for this driver + * @dev: Parent device for this device + * @pool: Shared memory pool, NULL if not used + * @driver_data: Private driver data for this device + * + * Allocates a new struct tee_device instance. The device is + * removed by tee_device_unregister(). + * + * @returns a pointer to a 'struct tee_device' or an ERR_PTR on failure + */ +struct tee_device *tee_device_alloc(const struct tee_desc *teedesc, + struct device *dev, + struct tee_shm_pool *pool, + void *driver_data); + +/** + * tee_device_register() - Registers a TEE device + * @teedev: Device to register + * + * tee_device_unregister() needs to be called to remove the @teedev if + * this function fails. + * + * @returns < 0 on failure + */ +int tee_device_register(struct tee_device *teedev); + +/** + * tee_device_unregister() - Removes a TEE device + * @teedev: Device to unregister + * + * This function should be called to remove the @teedev even if + * tee_device_register() hasn't been called yet.
Does nothing if + * @teedev is NULL. + */ +void tee_device_unregister(struct tee_device *teedev); + +/** + * struct tee_shm_pool_mem_info - holds information needed to create a shared + * memory pool + * @vaddr: Virtual address of start of pool + * @paddr: Physical address of start of pool + * @size: Size in bytes of the pool + */ +struct tee_shm_pool_mem_info { + unsigned long vaddr; + phys_addr_t paddr; + size_t size; +}; + +/** + * tee_shm_pool_alloc_res_mem() - Create a shared memory pool from reserved + * memory range + * @priv_info: Information for driver private shared memory pool + * @dmabuf_info: Information for dma-buf shared memory pool + * + * Start and end of pools must be page aligned. + * + * Allocation with the flag TEE_SHM_DMA_BUF set will use the range supplied + * in @dmabuf_info, others will use the range provided by @priv_info. + * + * @returns pointer to a 'struct tee_shm_pool' or an ERR_PTR on failure. + */ +struct tee_shm_pool * +tee_shm_pool_alloc_res_mem(struct tee_shm_pool_mem_info *priv_info, + struct tee_shm_pool_mem_info *dmabuf_info); + +/** + * tee_shm_pool_free() - Free a shared memory pool + * @pool: The shared memory pool to free + * + * There must be no remaining shared memory allocated from this pool when + * this function is called. + */ +void tee_shm_pool_free(struct tee_shm_pool *pool); + +/** + * tee_get_drvdata() - Return driver_data pointer + * @returns the driver_data pointer supplied to tee_device_alloc(). + */ +void *tee_get_drvdata(struct tee_device *teedev); + +/** + * tee_shm_alloc() - Allocate shared memory + * @ctx: Context that allocates the shared memory + * @size: Requested size of shared memory + * @flags: Flags setting properties for the requested shared memory. + * + * Memory allocated as global shared memory is automatically freed when the + * TEE file pointer is closed. The @flags field uses the bits defined by + * TEE_SHM_* above. TEE_SHM_MAPPED must currently always be set. If + * TEE_SHM_DMA_BUF is set, global shared memory will be allocated and associated + * with a dma-buf handle, else driver private memory.
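Taken together, tee_shm_pool_alloc_res_mem(), tee_device_alloc() and tee_device_register() give a specific TEE driver a compact probe path. A hedged sketch; all my_* names are hypothetical and the pool-info setup is elided:

#include <linux/platform_device.h>
#include <linux/tee_drv.h>

static void my_get_version(struct tee_device *teedev,
			   struct tee_ioctl_version_data *vers)
{
	/* ... fill in *vers ... */
}

static const struct tee_driver_ops my_tee_ops = {
	.get_version	= my_get_version,
	/* .open, .release, .open_session, etc. omitted for brevity */
};

static const struct tee_desc my_tee_desc = {
	.name	= "my-tee",
	.ops	= &my_tee_ops,
	.owner	= THIS_MODULE,
};

static int my_tee_probe(struct platform_device *pdev)
{
	struct tee_shm_pool_mem_info priv_info, dmabuf_info;
	struct tee_shm_pool *pool;
	struct tee_device *teedev;
	int rc;

	/* ... fill priv_info/dmabuf_info from a reserved-memory region ... */
	pool = tee_shm_pool_alloc_res_mem(&priv_info, &dmabuf_info);
	if (IS_ERR(pool))
		return PTR_ERR(pool);

	teedev = tee_device_alloc(&my_tee_desc, &pdev->dev, pool, NULL);
	if (IS_ERR(teedev)) {
		tee_shm_pool_free(pool);
		return PTR_ERR(teedev);
	}

	rc = tee_device_register(teedev);
	if (rc) {
		tee_device_unregister(teedev);	/* safe on this path, per above */
		tee_shm_pool_free(pool);	/* no allocations exist yet */
	}
	return rc;
}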
+ * + * @returns a pointer to 'struct tee_shm' + */ +struct tee_shm *tee_shm_alloc(struct tee_context *ctx, size_t size, u32 flags); + +/** + * tee_shm_free() - Free shared memory + * @shm: Handle to shared memory to free + */ +void tee_shm_free(struct tee_shm *shm); + +/** + * tee_shm_put() - Decrease reference count on a shared memory handle + * @shm: Shared memory handle + */ +void tee_shm_put(struct tee_shm *shm); + +/** + * tee_shm_va2pa() - Get physical address of a virtual address + * @shm: Shared memory handle + * @va: Virtual address to translate + * @pa: Returned physical address + * @returns 0 on success and < 0 on failure + */ +int tee_shm_va2pa(struct tee_shm *shm, void *va, phys_addr_t *pa); + +/** + * tee_shm_pa2va() - Get virtual address of a physical address + * @shm: Shared memory handle + * @pa: Physical address to translate + * @va: Returned virtual address + * @returns 0 on success and < 0 on failure + */ +int tee_shm_pa2va(struct tee_shm *shm, phys_addr_t pa, void **va); + +/** + * tee_shm_get_va() - Get virtual address of a shared memory plus an offset + * @shm: Shared memory handle + * @offs: Offset from start of this shared memory + * @returns virtual address of the shared memory + offs if offs is within + * the bounds of this shared memory, else an ERR_PTR + */ +void *tee_shm_get_va(struct tee_shm *shm, size_t offs); + +/** + * tee_shm_get_pa() - Get physical address of a shared memory plus an offset + * @shm: Shared memory handle + * @offs: Offset from start of this shared memory + * @pa: Physical address to return + * @returns 0 if offs is within the bounds of this shared memory, else an + * error code. + */ +int tee_shm_get_pa(struct tee_shm *shm, size_t offs, phys_addr_t *pa); + +/** + * tee_shm_get_id() - Get id of a shared memory object + * @shm: Shared memory handle + * @returns id + */ +int tee_shm_get_id(struct tee_shm *shm); + +/** + * tee_shm_get_from_id() - Find shared memory object and increase reference + * count + * @ctx: Context owning the shared memory + * @id: Id of shared memory object + * @returns a pointer to 'struct tee_shm' on success or an ERR_PTR on failure + */ +struct tee_shm *tee_shm_get_from_id(struct tee_context *ctx, int id); + +#endif /*__TEE_DRV_H*/ diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 58373875e8eec2aee668e5fdac4526796ebe78c8..d7d3ea637dd0a500875cbe64c00eb5e0188acd41 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -22,6 +22,18 @@ #endif #include + +/* + * For per-arch arch_within_stack_frames() implementations, defined in + * asm/thread_info.h.
+ */ +enum { + BAD_STACK = -1, + NOT_STACK = 0, + GOOD_FRAME, + GOOD_STACK, +}; + #include #ifdef __KERNEL__ @@ -101,6 +113,10 @@ static inline void check_object_size(const void *ptr, unsigned long n, { } #endif /* CONFIG_HARDENED_USERCOPY */ +#ifndef arch_setup_new_exec +static inline void arch_setup_new_exec(void) { } +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */ diff --git a/include/linux/tick.h b/include/linux/tick.h index a04fea19676f30e07282231b6c14b4ce9ab40f1d..fe01e68bf520c792f8c06989dd3e811baf8e98b5 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -117,6 +117,7 @@ extern void tick_nohz_idle_enter(void); extern void tick_nohz_idle_exit(void); extern void tick_nohz_irq_exit(void); extern ktime_t tick_nohz_get_sleep_length(void); +extern unsigned long tick_nohz_get_idle_calls(void); extern u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time); extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); #else /* !CONFIG_NO_HZ_COMMON */ diff --git a/include/linux/time.h b/include/linux/time.h index 23f0f5ce30909d7349a354086980e2ca9988744c..c0543f5f25de471594330d84d605c6f20ef4c565 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -151,9 +151,6 @@ static inline bool timespec_inject_offset_valid(const struct timespec *ts) return true; } -#define CURRENT_TIME (current_kernel_time()) -#define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) - /* Some architectures do not supply their own clocksource. * This is mainly the case in architectures that get their * inter-tick times by reading the counter on their interval diff --git a/include/linux/timekeeping.h b/include/linux/timekeeping.h index b598cbc7b576847a4e3f58ebbc2c76a24f149540..ddc229ff6d1ee1dbd3e535d7c0a2f16225c29f47 100644 --- a/include/linux/timekeeping.h +++ b/include/linux/timekeeping.h @@ -19,21 +19,6 @@ extern void do_gettimeofday(struct timeval *tv); extern int do_settimeofday64(const struct timespec64 *ts); extern int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz); -static inline int do_sys_settimeofday(const struct timespec *tv, - const struct timezone *tz) -{ - struct timespec64 ts64; - - if (!tv) - return do_sys_settimeofday64(NULL, tz); - - if (!timespec_valid(tv)) - return -EINVAL; - - ts64 = timespec_to_timespec64(*tv); - return do_sys_settimeofday64(&ts64, tz); -} - /* * Kernel time accessors */ @@ -273,6 +258,11 @@ static inline void timekeeping_clocktai(struct timespec *ts) *ts = ktime_to_timespec(ktime_get_clocktai()); } +static inline void timekeeping_clocktai64(struct timespec64 *ts) +{ + *ts = ktime_to_timespec64(ktime_get_clocktai()); +} + /* * RTC specific */ diff --git a/include/linux/tpm.h b/include/linux/tpm.h index da158f06e0b2fec35f10e2faea57d806696e4d20..5a090f5ab3356f5f26bb21581d9a3df8ea488aa0 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -48,7 +48,8 @@ struct tpm_class_ops { u8 (*status) (struct tpm_chip *chip); bool (*update_timeouts)(struct tpm_chip *chip, unsigned long *timeout_cap); - + int (*request_locality)(struct tpm_chip *chip, int loc); + void (*relinquish_locality)(struct tpm_chip *chip, int loc); }; #if defined(CONFIG_TCG_TPM) || defined(CONFIG_TCG_TPM_MODULE) diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 0af63c4381b9799151317bafc1a800815d85876b..a556805eff8a8ebb5390dd6c9f025d85608cbbb9 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -138,16 +138,7 @@ enum print_line_t { TRACE_TYPE_NO_CONSUME = 3 /* Handled but 
ask to not consume */ }; -/* - * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq - * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function - * simplifies those functions and keeps them in sync. - */ -static inline enum print_line_t trace_handle_return(struct trace_seq *s) -{ - return trace_seq_has_overflowed(s) ? - TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED; -} +enum print_line_t trace_handle_return(struct trace_seq *s); void tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index f72fcfe0e66a80a80ae59a175e21abfba5b5ca42..cc48cb2ce20965a412037bfe3a8565991fcb9611 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -128,7 +128,7 @@ extern void syscall_unregfunc(void); * as "(void *, void)". The DECLARE_TRACE_NOARGS() will pass in just * "void *data", where as the DECLARE_TRACE() will pass in "void *data, proto". */ -#define __DO_TRACE(tp, proto, args, cond, prercu, postrcu) \ +#define __DO_TRACE(tp, proto, args, cond, rcucheck) \ do { \ struct tracepoint_func *it_func_ptr; \ void *it_func; \ @@ -136,7 +136,11 @@ extern void syscall_unregfunc(void); \ if (!(cond)) \ return; \ - prercu; \ + if (rcucheck) { \ + if (WARN_ON_ONCE(rcu_irq_enter_disabled())) \ + return; \ + rcu_irq_enter_irqson(); \ + } \ rcu_read_lock_sched_notrace(); \ it_func_ptr = rcu_dereference_sched((tp)->funcs); \ if (it_func_ptr) { \ @@ -147,20 +151,19 @@ extern void syscall_unregfunc(void); } while ((++it_func_ptr)->func); \ } \ rcu_read_unlock_sched_notrace(); \ - postrcu; \ + if (rcucheck) \ + rcu_irq_exit_irqson(); \ } while (0) #ifndef MODULE -#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) \ +#define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) \ static inline void trace_##name##_rcuidle(proto) \ { \ if (static_key_false(&__tracepoint_##name.key)) \ __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ - TP_CONDITION(cond), \ - rcu_irq_enter_irqson(), \ - rcu_irq_exit_irqson()); \ + TP_CONDITION(cond), 1); \ } #else #define __DECLARE_TRACE_RCU(name, proto, args, cond, data_proto, data_args) @@ -186,7 +189,7 @@ extern void syscall_unregfunc(void); __DO_TRACE(&__tracepoint_##name, \ TP_PROTO(data_proto), \ TP_ARGS(data_args), \ - TP_CONDITION(cond),,); \ + TP_CONDITION(cond), 0); \ if (IS_ENABLED(CONFIG_LOCKDEP) && (cond)) { \ rcu_read_lock_sched_notrace(); \ rcu_dereference_sched(__tracepoint_##name.funcs);\ diff --git a/include/linux/tty.h b/include/linux/tty.h index 1017e904c0a3f90507183570b0897199b4ace563..d07cd2105a6c6a3bdcac20c918622c7ad4ea0840 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -390,7 +390,6 @@ static inline bool tty_throttled(struct tty_struct *tty) } #ifdef CONFIG_TTY -extern void console_init(void); extern void tty_kref_put(struct tty_struct *tty); extern struct pid *tty_get_pgrp(struct tty_struct *tty); extern void tty_vhangup_self(void); @@ -402,8 +401,6 @@ extern struct tty_struct *get_current_tty(void); extern int __init tty_init(void); extern const char *tty_name(const struct tty_struct *tty); #else -static inline void console_init(void) -{ } static inline void tty_kref_put(struct tty_struct *tty) { } static inline struct pid *tty_get_pgrp(struct tty_struct *tty) @@ -478,9 +475,13 @@ extern int tty_do_resize(struct tty_struct *tty, struct winsize *ws); extern int is_current_pgrp_orphaned(void); extern void tty_hangup(struct tty_struct *tty); extern 
void tty_vhangup(struct tty_struct *tty); +extern void tty_vhangup_session(struct tty_struct *tty); extern int tty_hung_up_p(struct file *filp); extern void do_SAK(struct tty_struct *tty); extern void __do_SAK(struct tty_struct *tty); +extern void tty_open_proc_set_tty(struct file *filp, struct tty_struct *tty); +extern int tty_signal_session_leader(struct tty_struct *tty, int exit_session); +extern void session_clear_tty(struct pid *session); extern void no_tty(void); extern void tty_buffer_free_all(struct tty_port *port); extern void tty_buffer_flush(struct tty_struct *tty, struct tty_ldisc *ld); @@ -528,6 +529,8 @@ extern void tty_ldisc_flush(struct tty_struct *tty); extern long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg); extern int tty_mode_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg); +extern long tty_jobctrl_ioctl(struct tty_struct *tty, struct tty_struct *real_tty, + struct file *file, unsigned int cmd, unsigned long arg); extern int tty_perform_flush(struct tty_struct *tty, unsigned long arg); extern void tty_default_fops(struct file_operations *fops); extern struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx); @@ -669,7 +672,11 @@ extern int tty_ldisc_receive_buf(struct tty_ldisc *ld, const unsigned char *p, /* n_tty.c */ extern void n_tty_inherit_ops(struct tty_ldisc_ops *ops); +#ifdef CONFIG_TTY extern void __init n_tty_init(void); +#else +static inline void n_tty_init(void) { } +#endif /* tty_audit.c */ #ifdef CONFIG_AUDIT diff --git a/include/linux/types.h b/include/linux/types.h index 1e7bd24848fcb54cae26bdd10597f6ea80afd46f..258099a4ed82b1b3dd8199b7c905491c935a73a7 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -209,7 +209,7 @@ struct ustat { * naturally due ABI requirements, but some architectures (like CRIS) have * weird ABI and we need to ask it explicitly. * - * The alignment is required to guarantee that bits 0 and 1 of @next will be + * The alignment is required to guarantee that bit 0 of @next will be * clear under normal conditions -- as long as we use call_rcu(), * call_rcu_bh(), call_rcu_sched(), or call_srcu() to queue callback. * diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index f30c187ed785366231e318a7beab151e0bba64b6..201418d5e15c2b92dc7d7bc73b54f469eb488a35 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -2,8 +2,199 @@ #define __LINUX_UACCESS_H__ #include +#include +#include + +#define VERIFY_READ 0 +#define VERIFY_WRITE 1 + +#define uaccess_kernel() segment_eq(get_fs(), KERNEL_DS) + #include +/* + * Architectures should provide two primitives (raw_copy_{to,from}_user()) + * and get rid of their private instances of copy_{to,from}_user() and + * __copy_{to,from}_user{,_inatomic}(). + * + * raw_copy_{to,from}_user(to, from, size) should copy up to size bytes and + * return the amount left to copy. They should assume that access_ok() has + * already been checked (and succeeded); they should *not* zero-pad anything. + * No KASAN or object size checks either - those belong here. + * + * Both of these functions should attempt to copy size bytes starting at from + * into the area starting at to. They must not fetch or store anything + * outside of those areas. Return value must be between 0 (everything + * copied successfully) and size (nothing copied). 
+ * + * If raw_copy_{to,from}_user(to, from, size) returns N, size - N bytes starting + * at to must become equal to the bytes fetched from the corresponding area + * starting at from. All data past to + size - N must be left unmodified. + * + * If copying succeeds, the return value must be 0. If some data cannot be + * fetched, it is permitted to copy less than had been requested; the only + * hard requirement is that not storing anything at all (i.e. returning size) + * should happen only when nothing could be copied. In other words, you don't + * have to squeeze as much as possible - it is allowed, but not necessary. + * + * For raw_copy_from_user(), the destination (to) always points to kernel + * memory and no faults on store should happen. Interpretation of from is + * affected by set_fs(). For raw_copy_to_user() it's the other way round. + * + * Both can be inlined - it's up to architectures whether they want to bother + * with that. They should not be used directly; they are used to implement + * the 6 functions (copy_{to,from}_user(), __copy_{to,from}_user{,_inatomic}()) + * that are used instead. Out of those, __... ones are inlined. Plain + * copy_{to,from}_user() might or might not be inlined. If you want them + * inlined, have asm/uaccess.h define INLINE_COPY_{TO,FROM}_USER. + * + * NOTE: only copy_from_user() zero-pads the destination in case of short copy. + * Neither __copy_from_user() nor __copy_from_user_inatomic() zero anything + * at all; their callers absolutely must check the return value. + * + * Biarch ones should also provide raw_copy_in_user() - similar to the above, + * but both source and destination are __user pointers (affected by set_fs() + * as usual) and both source and destination can trigger faults. + */ + +static __always_inline unsigned long +__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) +{ + kasan_check_write(to, n); + check_object_size(to, n, false); + return raw_copy_from_user(to, from, n); +} + +static __always_inline unsigned long +__copy_from_user(void *to, const void __user *from, unsigned long n) +{ + might_fault(); + kasan_check_write(to, n); + check_object_size(to, n, false); + return raw_copy_from_user(to, from, n); +} + +/** + * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking. + * @to: Destination address, in user space. + * @from: Source address, in kernel space. + * @n: Number of bytes to copy. + * + * Context: User context only. + * + * Copy data from kernel space to user space. Caller must check + * the specified block with access_ok() before calling this function. + * The caller should also make sure the user space address is pinned, + * so that the access does not fault and sleep.
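In practice, the contract above reduces to one rule for callers: a nonzero return is the number of bytes left uncopied, and drivers normally translate it into -EFAULT. A minimal sketch; my_params and the handler are hypothetical:

#include <linux/types.h>
#include <linux/uaccess.h>

struct my_params {			/* hypothetical parameter block */
	u32 flags;
	u64 addr;
};

static long my_ioctl_set_params(void __user *argp)
{
	struct my_params p;

	if (copy_from_user(&p, argp, sizeof(p)))
		return -EFAULT;	/* short copy; p was zero-padded anyway */

	/* ... validate and apply p ... */
	return 0;
}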
+ */ +static __always_inline unsigned long +__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n) +{ + kasan_check_read(from, n); + check_object_size(from, n, true); + return raw_copy_to_user(to, from, n); +} + +static __always_inline unsigned long +__copy_to_user(void __user *to, const void *from, unsigned long n) +{ + might_fault(); + kasan_check_read(from, n); + check_object_size(from, n, true); + return raw_copy_to_user(to, from, n); +} + +#ifdef INLINE_COPY_FROM_USER +static inline unsigned long +_copy_from_user(void *to, const void __user *from, unsigned long n) +{ + unsigned long res = n; + if (likely(access_ok(VERIFY_READ, from, n))) + res = raw_copy_from_user(to, from, n); + if (unlikely(res)) + memset(to + (n - res), 0, res); + return res; +} +#else +extern unsigned long +_copy_from_user(void *, const void __user *, unsigned long); +#endif + +#ifdef INLINE_COPY_TO_USER +static inline unsigned long +_copy_to_user(void __user *to, const void *from, unsigned long n) +{ + if (access_ok(VERIFY_WRITE, to, n)) + n = raw_copy_to_user(to, from, n); + return n; +} +#else +extern unsigned long +_copy_to_user(void __user *, const void *, unsigned long); +#endif + +extern void __compiletime_error("usercopy buffer size is too small") +__bad_copy_user(void); + +static inline void copy_user_overflow(int size, unsigned long count) +{ + WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); +} + +static __always_inline unsigned long __must_check +copy_from_user(void *to, const void __user *from, unsigned long n) +{ + int sz = __compiletime_object_size(to); + + might_fault(); + kasan_check_write(to, n); + + if (likely(sz < 0 || sz >= n)) { + check_object_size(to, n, false); + n = _copy_from_user(to, from, n); + } else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); + else + __bad_copy_user(); + + return n; +} + +static __always_inline unsigned long __must_check +copy_to_user(void __user *to, const void *from, unsigned long n) +{ + int sz = __compiletime_object_size(from); + + kasan_check_read(from, n); + might_fault(); + + if (likely(sz < 0 || sz >= n)) { + check_object_size(from, n, true); + n = _copy_to_user(to, from, n); + } else if (!__builtin_constant_p(n)) + copy_user_overflow(sz, n); + else + __bad_copy_user(); + + return n; +} +#ifdef CONFIG_COMPAT +static __always_inline unsigned long __must_check +__copy_in_user(void __user *to, const void *from, unsigned long n) +{ + might_fault(); + return raw_copy_in_user(to, from, n); +} +static __always_inline unsigned long __must_check +copy_in_user(void __user *to, const void *from, unsigned long n) +{ + might_fault(); + if (access_ok(VERIFY_WRITE, to, n) && access_ok(VERIFY_READ, from, n)) + n = raw_copy_in_user(to, from, n); + return n; +} +#endif + static __always_inline void pagefault_disabled_inc(void) { current->pagefault_disabled++; @@ -12,7 +203,6 @@ static __always_inline void pagefault_disabled_inc(void) static __always_inline void pagefault_disabled_dec(void) { current->pagefault_disabled--; - WARN_ON(current->pagefault_disabled < 0); } /* @@ -67,12 +257,6 @@ static inline unsigned long __copy_from_user_inatomic_nocache(void *to, return __copy_from_user_inatomic(to, from, n); } -static inline unsigned long __copy_from_user_nocache(void *to, - const void __user *from, unsigned long n) -{ - return __copy_from_user(to, from, n); -} - #endif /* ARCH_HAS_NOCACHE_UACCESS */ /* diff --git a/include/linux/udp.h b/include/linux/udp.h index 
c0f530809d1f3db7323e51a52224eb49d8f97da0..6cb4061a720d2df5e5f9467de8269529195ce827 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -115,6 +115,6 @@ static inline bool udp_get_no_check6_rx(struct sock *sk) #define udp_portaddr_for_each_entry_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node) -#define IS_UDPLITE(__sk) (udp_sk(__sk)->pcflag) +#define IS_UDPLITE(__sk) (__sk->sk_protocol == IPPROTO_UDPLITE) #endif /* _LINUX_UDP_H */ diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 32c0e83d62397355e1b6bb804c1d0381f56daca1..3c85c81b00279708eb6f816c153e9e4e2f735c1e 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -23,11 +23,13 @@ struct uio_map; /** * struct uio_mem - description of a UIO memory region * @name: name of the memory region for identification - * @addr: address of the device's memory (phys_addr is used since - * addr can be logical, virtual, or physical & phys_addr_t - * should always be large enough to handle any of the - * address types) - * @size: size of IO + * @addr: address of the device's memory rounded to page + * size (phys_addr is used since addr can be + * logical, virtual, or physical & phys_addr_t + * should always be large enough to handle any of + * the address types) + * @offs: offset of device memory within the page + * @size: size of IO (multiple of page size) * @memtype: type of memory addr points to * @internal_addr: ioremap-ped version of addr, for driver internal use * @map: for use by the UIO core only. @@ -35,6 +37,7 @@ struct uio_map; struct uio_mem { const char *name; phys_addr_t addr; + unsigned long offs; resource_size_t size; int memtype; void __iomem *internal_addr; diff --git a/include/linux/usb.h b/include/linux/usb.h index 7e68259360dec55f4578f6203615b60faed5227c..cb9fbd54386e286f58b6332490a8bd8dcb78471e 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -99,6 +99,76 @@ enum usb_interface_condition { USB_INTERFACE_UNBINDING, }; +int __must_check +usb_find_common_endpoints(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_in, + struct usb_endpoint_descriptor **bulk_out, + struct usb_endpoint_descriptor **int_in, + struct usb_endpoint_descriptor **int_out); + +int __must_check +usb_find_common_endpoints_reverse(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_in, + struct usb_endpoint_descriptor **bulk_out, + struct usb_endpoint_descriptor **int_in, + struct usb_endpoint_descriptor **int_out); + +static inline int __must_check +usb_find_bulk_in_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_in) +{ + return usb_find_common_endpoints(alt, bulk_in, NULL, NULL, NULL); +} + +static inline int __must_check +usb_find_bulk_out_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_out) +{ + return usb_find_common_endpoints(alt, NULL, bulk_out, NULL, NULL); +} + +static inline int __must_check +usb_find_int_in_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **int_in) +{ + return usb_find_common_endpoints(alt, NULL, NULL, int_in, NULL); +} + +static inline int __must_check +usb_find_int_out_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **int_out) +{ + return usb_find_common_endpoints(alt, NULL, NULL, NULL, int_out); +} + +static inline int __must_check +usb_find_last_bulk_in_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_in) +{ + return 
usb_find_common_endpoints_reverse(alt, bulk_in, NULL, NULL, NULL); +} + +static inline int __must_check +usb_find_last_bulk_out_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **bulk_out) +{ + return usb_find_common_endpoints_reverse(alt, NULL, bulk_out, NULL, NULL); +} + +static inline int __must_check +usb_find_last_int_in_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **int_in) +{ + return usb_find_common_endpoints_reverse(alt, NULL, NULL, int_in, NULL); +} + +static inline int __must_check +usb_find_last_int_out_endpoint(struct usb_host_interface *alt, + struct usb_endpoint_descriptor **int_out) +{ + return usb_find_common_endpoints_reverse(alt, NULL, NULL, NULL, int_out); +} + /** * struct usb_interface - what usb device drivers talk to * @altsetting: array of interface structures, one for each alternate @@ -248,7 +318,7 @@ void usb_put_intf(struct usb_interface *intf); * struct usb_interface (which persists only as long as its configuration * is installed). The altsetting arrays can be accessed through these * structures at any time, permitting comparison of configurations and - * providing support for the /proc/bus/usb/devices pseudo-file. + * providing support for the /sys/kernel/debug/usb/devices pseudo-file. */ struct usb_interface_cache { unsigned num_altsetting; /* number of alternate settings */ @@ -354,6 +424,7 @@ struct usb_devmap { */ struct usb_bus { struct device *controller; /* host/master side hardware */ + struct device *sysdev; /* as seen from firmware or bus */ int busnum; /* Bus number (in order of reg) */ const char *bus_name; /* stable id (PCI slot_name etc) */ u8 uses_dma; /* Does the host controller use DMA? */ diff --git a/include/linux/usb/composite.h b/include/linux/usb/composite.h index 4616a49a1c2e58c61e3bc602fa214e0be40f8820..f665d2ceac20587a285428202a7ba17de23b5734 100644 --- a/include/linux/usb/composite.h +++ b/include/linux/usb/composite.h @@ -451,6 +451,7 @@ static inline struct usb_composite_driver *to_cdriver( * sure doing that won't hurt too much. * * One notion for how to handle Wireless USB devices involves: + * * (a) a second gadget here, discovery mechanism TBD, but likely * needing separate "register/unregister WUSB gadget" calls; * (b) updates to usb_gadget to include flags "is it wireless", @@ -503,8 +504,9 @@ struct usb_composite_dev { /* protects deactivations and delayed_status counts*/ spinlock_t lock; - unsigned setup_pending:1; - unsigned os_desc_pending:1; + /* public: */ + unsigned int setup_pending:1; + unsigned int os_desc_pending:1; }; extern int usb_string_id(struct usb_composite_dev *c); diff --git a/include/linux/usb/gadget.h b/include/linux/usb/gadget.h index e4516e9ded0f437b73f171678833425d9f29a442..fbc22a39e7bc6898f3cc17cbc49b8b3f458697b0 100644 --- a/include/linux/usb/gadget.h +++ b/include/linux/usb/gadget.h @@ -188,7 +188,7 @@ struct usb_ep_caps { * @caps:The structure describing types and directions supported by endoint. * @maxpacket:The maximum packet size used on this endpoint. The initial * value can sometimes be reduced (hardware allowing), according to - * the endpoint descriptor used to configure the endpoint. + * the endpoint descriptor used to configure the endpoint. * @maxpacket_limit:The maximum packet size value which can be handled by this * endpoint. It's set once by UDC driver when endpoint is initialized, and * should not be changed. Should not be confused with maxpacket. 
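The usb_find_common_endpoints() helpers added to <linux/usb.h> above let drivers stop open-coding endpoint-descriptor walks. A hedged sketch of the intended probe-time usage follows; foo_probe() and its requirement of one bulk-in, one bulk-out and one interrupt-in endpoint are illustrative assumptions, not part of the patch:

```c
#include <linux/usb.h>

/* Hypothetical probe(): bail out early unless the interface provides
 * every endpoint type this driver depends on. */
static int foo_probe(struct usb_interface *intf,
		     const struct usb_device_id *id)
{
	struct usb_host_interface *alt = intf->cur_altsetting;
	struct usb_endpoint_descriptor *bulk_in, *bulk_out, *int_in;
	int ret;

	/* Fills in the first matching endpoint of each requested type;
	 * pass NULL for the types the driver does not care about. */
	ret = usb_find_common_endpoints(alt, &bulk_in, &bulk_out,
					&int_in, NULL);
	if (ret)
		return ret;

	/* ... record usb_endpoint_num()/usb_endpoint_maxp() values ... */
	return 0;
}
```

The _reverse variants walk the descriptors backwards, which is the natural fit when a driver wants the last endpoint of a given type.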
diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 40edf6a8533e51fef324eb52b91d6038f74ab877..a469999a106dbc1869f7c90eccc313a90a6d17b0 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -437,6 +437,9 @@ extern int usb_hcd_alloc_bandwidth(struct usb_device *udev, struct usb_host_interface *new_alt); extern int usb_hcd_get_frame_number(struct usb_device *udev); +struct usb_hcd *__usb_create_hcd(const struct hc_driver *driver, + struct device *sysdev, struct device *dev, const char *bus_name, + struct usb_hcd *primary_hcd); extern struct usb_hcd *usb_create_hcd(const struct hc_driver *driver, struct device *dev, const char *bus_name); extern struct usb_hcd *usb_create_shared_hcd(const struct hc_driver *driver, @@ -453,7 +456,7 @@ extern int usb_hcd_find_raw_port_number(struct usb_hcd *hcd, int port1); struct platform_device; extern void usb_hcd_platform_shutdown(struct platform_device *dev); -#ifdef CONFIG_PCI +#ifdef CONFIG_USB_PCI struct pci_dev; struct pci_device_id; extern int usb_hcd_pci_probe(struct pci_dev *dev, @@ -466,7 +469,7 @@ extern int usb_hcd_amd_remote_wakeup_quirk(struct pci_dev *dev); #ifdef CONFIG_PM extern const struct dev_pm_ops usb_hcd_pci_pm_ops; #endif -#endif /* CONFIG_PCI */ +#endif /* CONFIG_USB_PCI */ /* pci-ish (pdev null is ok) buffer alloc/mapping support */ void usb_init_pool_max(void); diff --git a/include/linux/usb/of.h b/include/linux/usb/of.h index 5ff9032ee1b47b8cb8a7b969e93829ce63d1deb1..4031f47629ecf264113b3b3d402c6a78ce2a955a 100644 --- a/include/linux/usb/of.h +++ b/include/linux/usb/of.h @@ -18,6 +18,7 @@ int of_usb_update_otg_caps(struct device_node *np, struct usb_otg_caps *otg_caps); struct device_node *usb_of_get_child_node(struct device_node *parent, int portnum); +struct device *usb_of_get_companion_dev(struct device *dev); #else static inline enum usb_dr_mode of_usb_get_dr_mode_by_phy(struct device_node *np, int arg0) @@ -38,6 +39,10 @@ static inline struct device_node *usb_of_get_child_node { return NULL; } +static inline struct device *usb_of_get_companion_dev(struct device *dev) +{ + return NULL; +} #endif #if IS_ENABLED(CONFIG_OF) && IS_ENABLED(CONFIG_USB_SUPPORT) diff --git a/include/linux/usb/otg-fsm.h b/include/linux/usb/otg-fsm.h index 7a0350535cb10146ffd4e1936da7dced6bc477ca..a0a8f878503c959442608d09e1f0495a6319e03a 100644 --- a/include/linux/usb/otg-fsm.h +++ b/include/linux/usb/otg-fsm.h @@ -21,21 +21,6 @@ #include #include -#undef VERBOSE - -#ifdef VERBOSE -#define VDBG(fmt, args...) pr_debug("[%s] " fmt , \ - __func__, ## args) -#else -#define VDBG(stuff...) 
do {} while (0) -#endif - -#ifdef VERBOSE -#define MPC_LOC printk("Current Location [%s]:[%d]\n", __FILE__, __LINE__) -#else -#define MPC_LOC do {} while (0) -#endif - #define PROTO_UNDEF (0) #define PROTO_HOST (1) #define PROTO_GADGET (2) diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 704a1ab8240ca124f29c5ce361c871090d28ea5b..e2f0ab07eea5bd4d6b49abb7c563d7816934394d 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -20,7 +20,7 @@ #include /* The maximum number of ports one device can grab at once */ -#define MAX_NUM_PORTS 8 +#define MAX_NUM_PORTS 16 /* parity check flag */ #define RELEVANT_IFLAG(iflag) (iflag & (IGNBRK|BRKINT|IGNPAR|PARMRK|INPCK)) @@ -159,10 +159,10 @@ struct usb_serial { unsigned char minors_reserved:1; unsigned char num_ports; unsigned char num_port_pointers; - char num_interrupt_in; - char num_interrupt_out; - char num_bulk_in; - char num_bulk_out; + unsigned char num_interrupt_in; + unsigned char num_interrupt_out; + unsigned char num_bulk_in; + unsigned char num_bulk_out; struct usb_serial_port *port[MAX_NUM_PORTS]; struct kref kref; struct mutex disc_mutex; @@ -181,6 +181,17 @@ static inline void usb_set_serial_data(struct usb_serial *serial, void *data) serial->private = data; } +struct usb_serial_endpoints { + unsigned char num_bulk_in; + unsigned char num_bulk_out; + unsigned char num_interrupt_in; + unsigned char num_interrupt_out; + struct usb_endpoint_descriptor *bulk_in[MAX_NUM_PORTS]; + struct usb_endpoint_descriptor *bulk_out[MAX_NUM_PORTS]; + struct usb_endpoint_descriptor *interrupt_in[MAX_NUM_PORTS]; + struct usb_endpoint_descriptor *interrupt_out[MAX_NUM_PORTS]; +}; + /** * usb_serial_driver - describes a usb serial driver * @description: pointer to a string that describes this driver. This string @@ -188,12 +199,17 @@ static inline void usb_set_serial_data(struct usb_serial *serial, void *data) * @id_table: pointer to a list of usb_device_id structures that define all * of the devices this structure can support. * @num_ports: the number of different ports this device will have. + * @num_bulk_in: minimum number of bulk-in endpoints + * @num_bulk_out: minimum number of bulk-out endpoints + * @num_interrupt_in: minimum number of interrupt-in endpoints + * @num_interrupt_out: minimum number of interrupt-out endpoints * @bulk_in_size: minimum number of bytes to allocate for bulk-in buffer * (0 = end-point size) * @bulk_out_size: bytes to allocate for bulk-out buffer (0 = end-point size) * @calc_num_ports: pointer to a function to determine how many ports this - * device has dynamically. It will be called after the probe() - * callback is called, but before attach() + * device has dynamically. It can also be used to verify the number of + * endpoints or to modify the port-endpoint mapping. It will be called + * after the probe() callback is called, but before attach(). * @probe: pointer to the driver's probe function. 
 * This will be called when the device is inserted into the system,
 * but before the device has been fully initialized by the usb_serial
@@ -227,19 +243,26 @@ static inline void usb_set_serial_data(struct usb_serial *serial, void *data)
 struct usb_serial_driver {
 	const char *description;
 	const struct usb_device_id *id_table;
-	char num_ports;
 
 	struct list_head	driver_list;
 	struct device_driver	driver;
 	struct usb_driver	*usb_driver;
 	struct usb_dynids	dynids;
 
+	unsigned char		num_ports;
+
+	unsigned char		num_bulk_in;
+	unsigned char		num_bulk_out;
+	unsigned char		num_interrupt_in;
+	unsigned char		num_interrupt_out;
+
 	size_t			bulk_in_size;
 	size_t			bulk_out_size;
 
 	int (*probe)(struct usb_serial *serial, const struct usb_device_id *id);
 	int (*attach)(struct usb_serial *serial);
-	int (*calc_num_ports) (struct usb_serial *serial);
+	int (*calc_num_ports)(struct usb_serial *serial,
+			struct usb_serial_endpoints *epds);
 
 	void (*disconnect)(struct usb_serial *serial);
 	void (*release)(struct usb_serial *serial);
@@ -356,7 +379,6 @@ extern void usb_serial_handle_dcd_change(struct usb_serial_port *usb_port,
 extern int usb_serial_bus_register(struct usb_serial_driver *device);
 extern void usb_serial_bus_deregister(struct usb_serial_driver *device);
 
-extern struct usb_serial_driver usb_serial_generic_device;
 extern struct bus_type usb_serial_bus_type;
 extern struct tty_driver *usb_serial_tty_driver;
 
diff --git a/include/linux/usb/typec.h b/include/linux/usb/typec.h
new file mode 100644
index 0000000000000000000000000000000000000000..ec78204964ab806d1f807515e8976c385c661181
--- /dev/null
+++ b/include/linux/usb/typec.h
@@ -0,0 +1,243 @@
+
+#ifndef __LINUX_USB_TYPEC_H
+#define __LINUX_USB_TYPEC_H
+
+#include
+
+/* XXX: Once we have a header for USB Power Delivery, this belongs there */
+#define ALTMODE_MAX_MODES	6
+
+/* USB Type-C Specification releases */
+#define USB_TYPEC_REV_1_0	0x100 /* 1.0 */
+#define USB_TYPEC_REV_1_1	0x110 /* 1.1 */
+#define USB_TYPEC_REV_1_2	0x120 /* 1.2 */
+
+struct typec_altmode;
+struct typec_partner;
+struct typec_cable;
+struct typec_plug;
+struct typec_port;
+
+struct fwnode_handle;
+
+enum typec_port_type {
+	TYPEC_PORT_DFP,
+	TYPEC_PORT_UFP,
+	TYPEC_PORT_DRP,
+};
+
+enum typec_plug_type {
+	USB_PLUG_NONE,
+	USB_PLUG_TYPE_A,
+	USB_PLUG_TYPE_B,
+	USB_PLUG_TYPE_C,
+	USB_PLUG_CAPTIVE,
+};
+
+enum typec_data_role {
+	TYPEC_DEVICE,
+	TYPEC_HOST,
+};
+
+enum typec_role {
+	TYPEC_SINK,
+	TYPEC_SOURCE,
+};
+
+enum typec_pwr_opmode {
+	TYPEC_PWR_MODE_USB,
+	TYPEC_PWR_MODE_1_5A,
+	TYPEC_PWR_MODE_3_0A,
+	TYPEC_PWR_MODE_PD,
+};
+
+enum typec_accessory {
+	TYPEC_ACCESSORY_NONE,
+	TYPEC_ACCESSORY_AUDIO,
+	TYPEC_ACCESSORY_DEBUG,
+};
+
+#define TYPEC_MAX_ACCESSORY	3
+
+/*
+ * struct usb_pd_identity - USB Power Delivery identity data
+ * @id_header: ID Header VDO
+ * @cert_stat: Cert Stat VDO
+ * @product: Product VDO
+ *
+ * USB power delivery Discover Identity command response data.
+ *
+ * REVISIT: This is USB Power Delivery specific information, so this structure
+ * probably belongs in a USB Power Delivery header file, once we have one.
+ */
+struct usb_pd_identity {
+	u32			id_header;
+	u32			cert_stat;
+	u32			product;
+};
+
+int typec_partner_set_identity(struct typec_partner *partner);
+int typec_cable_set_identity(struct typec_cable *cable);
+
+/*
+ * struct typec_mode_desc - Individual Mode of an Alternate Mode
+ * @index: Index of the Mode within the SVID
+ * @vdo: VDO returned by Discover Modes USB PD command
+ * @desc: Optional human readable description of the mode
+ * @roles: Only for ports. DRP if the mode is available in both roles
+ *
+ * Description of a mode of an Alternate Mode which a connector, cable plug or
+ * partner supports. Every mode will have its own sysfs group. The details are
+ * the VDO returned by the Discover Modes command, a description of the mode,
+ * and an active flag telling whether the mode has been entered or not.
+ */
+struct typec_mode_desc {
+	int			index;
+	u32			vdo;
+	char			*desc;
+	/* Only used with ports */
+	enum typec_port_type	roles;
+};
+
+/*
+ * struct typec_altmode_desc - USB Type-C Alternate Mode Descriptor
+ * @svid: Standard or Vendor ID
+ * @n_modes: Number of modes
+ * @modes: Array of modes supported by the Alternate Mode
+ *
+ * Representation of an Alternate Mode that has an SVID assigned by USB-IF.
+ * The array of modes will list the modes of a particular SVID that are
+ * supported by a connector, partner or cable plug.
+ */
+struct typec_altmode_desc {
+	u16			svid;
+	int			n_modes;
+	struct typec_mode_desc	modes[ALTMODE_MAX_MODES];
+};
+
+struct typec_altmode
+*typec_partner_register_altmode(struct typec_partner *partner,
+				struct typec_altmode_desc *desc);
+struct typec_altmode
+*typec_plug_register_altmode(struct typec_plug *plug,
+			     struct typec_altmode_desc *desc);
+struct typec_altmode
+*typec_port_register_altmode(struct typec_port *port,
+			     struct typec_altmode_desc *desc);
+void typec_unregister_altmode(struct typec_altmode *altmode);
+
+struct typec_port *typec_altmode2port(struct typec_altmode *alt);
+
+void typec_altmode_update_active(struct typec_altmode *alt, int mode,
+				 bool active);
+
+enum typec_plug_index {
+	TYPEC_PLUG_SOP_P,
+	TYPEC_PLUG_SOP_PP,
+};
+
+/*
+ * struct typec_plug_desc - USB Type-C Cable Plug Descriptor
+ * @index: SOP Prime for the plug connected to DFP and SOP Double Prime for the
+ *         plug connected to UFP
+ *
+ * Represents a USB Type-C Cable Plug.
+ */
+struct typec_plug_desc {
+	enum typec_plug_index	index;
+};
+
+/*
+ * struct typec_cable_desc - USB Type-C Cable Descriptor
+ * @type: The plug type from USB PD Cable VDO
+ * @active: Is the cable active or passive
+ * @identity: Result of Discover Identity command
+ *
+ * Represents a USB Type-C cable attached to a USB Type-C port.
+ */
+struct typec_cable_desc {
+	enum typec_plug_type	type;
+	unsigned int		active:1;
+	struct usb_pd_identity	*identity;
+};
+
+/*
+ * struct typec_partner_desc - USB Type-C Partner Descriptor
+ * @usb_pd: USB Power Delivery support
+ * @accessory: Audio, Debug or none.
+ * @identity: Discover Identity command data
+ *
+ * Details about a partner attached to a USB Type-C port. If the @identity
+ * member exists when the partner is registered, a directory named "identity"
+ * is created in sysfs for the partner device.
+ */
+struct typec_partner_desc {
+	unsigned int		usb_pd:1;
+	enum typec_accessory	accessory;
+	struct usb_pd_identity	*identity;
+};
+
+/*
+ * struct typec_capability - USB Type-C Port Capabilities
+ * @type: DFP (Host-only), UFP (Device-only) or DRP (Dual Role)
+ * @revision: USB Type-C Specification release. Binary coded decimal
+ * @pd_revision: USB Power Delivery Specification revision if supported
+ * @prefer_role: Initial role preference
+ * @accessory: Supported Accessory Modes
+ * @fwnode: Optional fwnode of the port
+ * @try_role: Set data role preference for DRP port
+ * @dr_set: Set Data Role
+ * @pr_set: Set Power Role
+ * @vconn_set: Set VCONN Role
+ * @activate_mode: Enter/exit given Alternate Mode
+ *
+ * Static capabilities of a single USB Type-C port.
+ */
+struct typec_capability {
+	enum typec_port_type	type;
+	u16			revision; /* 0120H = "1.2" */
+	u16			pd_revision; /* 0300H = "3.0" */
+	int			prefer_role;
+	enum typec_accessory	accessory[TYPEC_MAX_ACCESSORY];
+
+	struct fwnode_handle	*fwnode;
+
+	int	(*try_role)(const struct typec_capability *,
+			    int role);
+
+	int	(*dr_set)(const struct typec_capability *,
+			  enum typec_data_role);
+	int	(*pr_set)(const struct typec_capability *,
+			  enum typec_role);
+	int	(*vconn_set)(const struct typec_capability *,
+			     enum typec_role);
+
+	int	(*activate_mode)(const struct typec_capability *,
+				 int mode, int activate);
+};
+
+/* Specific to try_role(). Indicates that the user wants to clear the preference. */
+#define TYPEC_NO_PREFERRED_ROLE	(-1)
+
+struct typec_port *typec_register_port(struct device *parent,
+				       const struct typec_capability *cap);
+void typec_unregister_port(struct typec_port *port);
+
+struct typec_partner *typec_register_partner(struct typec_port *port,
+					     struct typec_partner_desc *desc);
+void typec_unregister_partner(struct typec_partner *partner);
+
+struct typec_cable *typec_register_cable(struct typec_port *port,
+					 struct typec_cable_desc *desc);
+void typec_unregister_cable(struct typec_cable *cable);
+
+struct typec_plug *typec_register_plug(struct typec_cable *cable,
+				       struct typec_plug_desc *desc);
+void typec_unregister_plug(struct typec_plug *plug);
+
+void typec_set_data_role(struct typec_port *port, enum typec_data_role role);
+void typec_set_pwr_role(struct typec_port *port, enum typec_role role);
+void typec_set_vconn_role(struct typec_port *port, enum typec_role role);
+void typec_set_pwr_opmode(struct typec_port *port, enum typec_pwr_opmode mode);
+
+#endif /* __LINUX_USB_TYPEC_H */
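To make the registration flow above concrete, here is a hedged sketch of a controller driver bringing up a dual-role port. foo_dr_set() and the chosen capability values are assumptions made for the illustration, and the sketch assumes typec_register_port() reports failure via ERR_PTR(), as is common for such registration calls:

```c
#include <linux/err.h>
#include <linux/usb/typec.h>

/* Hypothetical data-role handler; the actual muxing is hardware specific. */
static int foo_dr_set(const struct typec_capability *cap,
		      enum typec_data_role role)
{
	/* reconfigure the port for TYPEC_HOST or TYPEC_DEVICE here */
	return 0;
}

static const struct typec_capability foo_cap = {
	.type		= TYPEC_PORT_DRP,
	.revision	= USB_TYPEC_REV_1_2,	/* BCD: 0x0120 == "1.2" */
	.prefer_role	= TYPEC_NO_PREFERRED_ROLE,
	.dr_set		= foo_dr_set,
};

static int foo_port_register(struct device *parent,
			     struct typec_port **port)
{
	/* Assumes the usual ERR_PTR() convention for registration calls. */
	*port = typec_register_port(parent, &foo_cap);
	return PTR_ERR_OR_ZERO(*port);
}
```

diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index 6e0ce8c7b8cb5a9fcb985a5a5078f82267d03092..7dffa5624ea62bee992e547c44ed4a86a577b0a7 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -64,6 +64,8 @@ struct usbnet {
 	struct usb_anchor	deferred;
 	struct tasklet_struct	bh;
 
+	struct pcpu_sw_netstats __percpu *stats64;
+
 	struct work_struct	kevent;
 	unsigned long		flags;
 #		define EVENT_TX_HALT	0
@@ -261,10 +263,10 @@ extern void usbnet_pause_rx(struct usbnet *);
 extern void usbnet_resume_rx(struct usbnet *);
 extern void usbnet_purge_paused_rxq(struct usbnet *);
 
-extern int usbnet_get_settings(struct net_device *net,
-			       struct ethtool_cmd *cmd);
-extern int usbnet_set_settings(struct net_device *net,
-			       struct ethtool_cmd *cmd);
+extern int usbnet_get_link_ksettings(struct net_device *net,
+				     struct ethtool_link_ksettings *cmd);
+extern int usbnet_set_link_ksettings(struct net_device *net,
+				     const struct ethtool_link_ksettings *cmd);
 extern u32 usbnet_get_link(struct net_device *net);
 extern u32 usbnet_get_msglevel(struct usbnet *);
 extern void usbnet_set_msglevel(struct usbnet *, u32);
@@ -278,5 +280,7 @@ extern int usbnet_status_start(struct usbnet *dev, gfp_t mem_flags);
 extern void usbnet_status_stop(struct usbnet *dev);
 extern void usbnet_update_max_qlen(struct usbnet *dev);
+extern void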
usbnet_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *stats); #endif /* __LINUX_USB_USBNET_H */ diff --git a/include/linux/usb/xhci-dbgp.h b/include/linux/usb/xhci-dbgp.h new file mode 100644 index 0000000000000000000000000000000000000000..80c1cca1f52976117a4f62928bd56d23fb191f5d --- /dev/null +++ b/include/linux/usb/xhci-dbgp.h @@ -0,0 +1,29 @@ +/* + * Standalone xHCI debug capability driver + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Lu Baolu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __LINUX_XHCI_DBGP_H +#define __LINUX_XHCI_DBGP_H + +#ifdef CONFIG_EARLY_PRINTK_USB_XDBC +int __init early_xdbc_parse_parameter(char *s); +int __init early_xdbc_setup_hardware(void); +void __init early_xdbc_register_console(void); +#else +static inline int __init early_xdbc_setup_hardware(void) +{ + return -ENODEV; +} +static inline void __init early_xdbc_register_console(void) +{ +} +#endif /* CONFIG_EARLY_PRINTK_USB_XDBC */ +#endif /* __LINUX_XHCI_DBGP_H */ diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 7edfbdb55a995d436bf9e999ce202d0ca0bf2550..28b0e965360ff1822a22252b1ecaaab52ca8ab2d 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -44,6 +44,12 @@ int virtqueue_add_inbuf(struct virtqueue *vq, void *data, gfp_t gfp); +int virtqueue_add_inbuf_ctx(struct virtqueue *vq, + struct scatterlist sg[], unsigned int num, + void *data, + void *ctx, + gfp_t gfp); + int virtqueue_add_sgs(struct virtqueue *vq, struct scatterlist *sgs[], unsigned int out_sgs, @@ -59,6 +65,9 @@ bool virtqueue_notify(struct virtqueue *vq); void *virtqueue_get_buf(struct virtqueue *vq, unsigned int *len); +void *virtqueue_get_buf_ctx(struct virtqueue *vq, unsigned int *len, + void **ctx); + void virtqueue_disable_cb(struct virtqueue *vq); bool virtqueue_enable_cb(struct virtqueue *vq); @@ -156,9 +165,13 @@ int virtio_device_restore(struct virtio_device *dev); * @feature_table_legacy: same as feature_table but when working in legacy mode. * @feature_table_size_legacy: number of entries in feature table legacy array. * @probe: the function to call when a device is found. Returns 0 or -errno. + * @scan: optional function to call after successful probe; intended + * for virtio-scsi to invoke a scan. * @remove: the function to call when a device is removed. * @config_changed: optional function to call when the device configuration * changes; may be called in interrupt context. + * @freeze: optional function to call during suspend/hibernation. + * @restore: optional function to call on resume. 
 */
 struct virtio_driver {
	struct device_driver driver;
diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h
index 8355bab175e1d8fb27ac9e0860465ba8afc36076..0133d8a12ccd468514a72fdf8eff64aa7d69551d 100644
--- a/include/linux/virtio_config.h
+++ b/include/linux/virtio_config.h
@@ -72,7 +72,8 @@ struct virtio_config_ops {
 	void (*reset)(struct virtio_device *vdev);
 	int (*find_vqs)(struct virtio_device *, unsigned nvqs,
 			struct virtqueue *vqs[], vq_callback_t *callbacks[],
-			const char * const names[], struct irq_affinity *desc);
+			const char * const names[], const bool *ctx,
+			struct irq_affinity *desc);
 	void (*del_vqs)(struct virtio_device *);
 	u64 (*get_features)(struct virtio_device *vdev);
 	int (*finalize_features)(struct virtio_device *vdev);
@@ -173,12 +174,32 @@ struct virtqueue *virtio_find_single_vq(struct virtio_device *vdev,
 	vq_callback_t *callbacks[] = { c };
 	const char *names[] = { n };
 	struct virtqueue *vq;
-	int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL);
+	int err = vdev->config->find_vqs(vdev, 1, &vq, callbacks, names, NULL,
+					 NULL);
 	if (err < 0)
 		return ERR_PTR(err);
 	return vq;
 }
 
+static inline
+int virtio_find_vqs(struct virtio_device *vdev, unsigned nvqs,
+		    struct virtqueue *vqs[], vq_callback_t *callbacks[],
+		    const char * const names[],
+		    struct irq_affinity *desc)
+{
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, NULL, desc);
+}
+
+static inline
+int virtio_find_vqs_ctx(struct virtio_device *vdev, unsigned nvqs,
+			struct virtqueue *vqs[], vq_callback_t *callbacks[],
+			const char * const names[], const bool *ctx,
+			struct irq_affinity *desc)
+{
+	return vdev->config->find_vqs(vdev, nvqs, vqs, callbacks, names, ctx,
+				      desc);
+}
+
 /**
  * virtio_device_ready - enable vq use in probe function
  * @vdev: the device
diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h
index e8d36938f09a56cfe4729c17b0ee8175b4991bbe..270cfa81830ee4a69bca0c4ccf914a7a94e5b3e1 100644
--- a/include/linux/virtio_ring.h
+++ b/include/linux/virtio_ring.h
@@ -71,6 +71,7 @@ struct virtqueue *vring_create_virtqueue(unsigned int index,
 					 struct virtio_device *vdev,
 					 bool weak_barriers,
 					 bool may_reduce_num,
+					 bool ctx,
 					 bool (*notify)(struct virtqueue *vq),
 					 void (*callback)(struct virtqueue *vq),
 					 const char *name);
@@ -80,6 +81,7 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index,
 					struct vring vring,
 					struct virtio_device *vdev,
 					bool weak_barriers,
+					bool ctx,
 					bool (*notify)(struct virtqueue *),
 					void (*callback)(struct virtqueue *),
 					const char *name);
@@ -93,6 +95,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int index,
 				      unsigned int vring_align,
 				      struct virtio_device *vdev,
 				      bool weak_barriers,
+				      bool ctx,
 				      void *pages,
 				      bool (*notify)(struct virtqueue *vq),
 				      void (*callback)(struct virtqueue *vq),
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index 584f9a647ad4acca191ff6116a47c14da1385fa3..ab13f0743da8ed095edb3be854f5c4430e828728 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -153,5 +153,6 @@ void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt);
 void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt);
 u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
 void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
+void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt);
 
 #endif /* _LINUX_VIRTIO_VSOCK_H */
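The virtio_find_vqs() and virtio_find_vqs_ctx() wrappers above spare drivers the extra ->find_vqs() arguments in the common case. A brief, hypothetical sketch of a device setting up one rx and one tx queue (foo_init_vqs() and the queue names are illustrative only):

```c
#include <linux/virtio.h>
#include <linux/virtio_config.h>

/* Hypothetical virtqueue setup for a two-queue device. */
static int foo_init_vqs(struct virtio_device *vdev, struct virtqueue *vqs[2],
			vq_callback_t *rx_done, vq_callback_t *tx_done)
{
	vq_callback_t *callbacks[] = { rx_done, tx_done };
	static const char * const names[] = { "foo-rx", "foo-tx" };

	/*
	 * No per-buffer context pointers and no IRQ-affinity hint:
	 * virtio_find_vqs() passes ctx == NULL on our behalf.  Use
	 * virtio_find_vqs_ctx() instead when a per-queue bool array
	 * of context flags is needed.
	 */
	return virtio_find_vqs(vdev, 2, vqs, callbacks, names, NULL);
}
```

diff --git a/include/linux/vm_event_item.h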
b/include/linux/vm_event_item.h index a80b7b59cf33418811217faca1b9c6b041dad814..d84ae90ccd5c40de69de9217aa7c4e23e4c26be8 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -25,7 +25,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGALLOC), FOR_ALL_ZONES(ALLOCSTALL), FOR_ALL_ZONES(PGSCAN_SKIP), - PGFREE, PGACTIVATE, PGDEACTIVATE, + PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, PGFAULT, PGMAJFAULT, PGLAZYFREED, PGREFILL, diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index d68edffbf142cec5f8987a7907b6f2e76cefdfd7..2d92dd002abdaf823d1676be3c1ed5315d18342b 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -80,6 +80,17 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); +#ifndef CONFIG_MMU +extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); +static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, + gfp_t flags, void *caller) +{ + return __vmalloc_node_flags(size, node, flags); +} +#else +extern void *__vmalloc_node_flags_caller(unsigned long size, + int node, gfp_t flags, void *caller); +#endif extern void vfree(const void *addr); extern void vfree_atomic(const void *addr); diff --git a/include/linux/vme.h b/include/linux/vme.h index ec5e8bf6118e7b1d99be2a5d0293c9a9babe6b1e..25874da3f2e17965879b44aab7b08fb0ad62af89 100644 --- a/include/linux/vme.h +++ b/include/linux/vme.h @@ -92,7 +92,7 @@ extern struct bus_type vme_bus_type; #define VME_SLOT_ALL -2 /** - * Structure representing a VME device + * struct vme_dev - Structure representing a VME device * @num: The device number * @bridge: Pointer to the bridge device this device is on * @dev: Internal device structure @@ -107,6 +107,16 @@ struct vme_dev { struct list_head bridge_list; }; +/** + * struct vme_driver - Structure representing a VME driver + * @name: Driver name, should be unique among VME drivers and usually the same + * as the module name. + * @match: Callback used to determine whether probe should be run. + * @probe: Callback for device binding, called when new device is detected. + * @remove: Callback, called on device removal. + * @driver: Underlying generic device driver structure. + * @devices: List of VME devices (struct vme_dev) associated with this driver. 
+ */ struct vme_driver { const char *name; int (*match)(struct vme_dev *); diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index bde063cefd047c1bb42ac02a02522ae88863c723..c102ef65cb64a94269b950ff09c47d90bbc2d315 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -608,8 +608,13 @@ static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } +static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +{ + return fn(arg); +} #else long work_on_cpu(int cpu, long (*fn)(void *), void *arg); +long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a3c0cbd7c88824a297cd725e36c931c223c3e106..d5815794416c9b8430c7ca9153e3be3271a76899 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -237,6 +237,7 @@ static inline void inode_attach_wb(struct inode *inode, struct page *page) static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { + WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } diff --git a/include/media/cec-edid.h b/include/media/cec-edid.h deleted file mode 100644 index bdf731ecba1a760438a3f4bcf4ccf6d91b9fb90e..0000000000000000000000000000000000000000 --- a/include/media/cec-edid.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * cec-edid - HDMI Consumer Electronics Control & EDID helpers - * - * Copyright 2016 Cisco Systems, Inc. and/or its affiliates. All rights reserved. - * - * This program is free software; you may redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 of the License. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#ifndef _MEDIA_CEC_EDID_H -#define _MEDIA_CEC_EDID_H - -#include - -#define CEC_PHYS_ADDR_INVALID 0xffff -#define cec_phys_addr_exp(pa) \ - ((pa) >> 12), ((pa) >> 8) & 0xf, ((pa) >> 4) & 0xf, (pa) & 0xf - -/** - * cec_get_edid_phys_addr() - find and return the physical address - * - * @edid: pointer to the EDID data - * @size: size in bytes of the EDID data - * @offset: If not %NULL then the location of the physical address - * bytes in the EDID will be returned here. This is set to 0 - * if there is no physical address found. - * - * Return: the physical address or CEC_PHYS_ADDR_INVALID if there is none. - */ -u16 cec_get_edid_phys_addr(const u8 *edid, unsigned int size, - unsigned int *offset); - -/** - * cec_set_edid_phys_addr() - find and set the physical address - * - * @edid: pointer to the EDID data - * @size: size in bytes of the EDID data - * @phys_addr: the new physical address - * - * This function finds the location of the physical address in the EDID - * and fills in the given physical address and updates the checksum - * at the end of the EDID block. It does nothing if the EDID doesn't - * contain a physical address. 
- */ -void cec_set_edid_phys_addr(u8 *edid, unsigned int size, u16 phys_addr); - -/** - * cec_phys_addr_for_input() - calculate the PA for an input - * - * @phys_addr: the physical address of the parent - * @input: the number of the input port, must be between 1 and 15 - * - * This function calculates a new physical address based on the input - * port number. For example: - * - * PA = 0.0.0.0 and input = 2 becomes 2.0.0.0 - * - * PA = 3.0.0.0 and input = 1 becomes 3.1.0.0 - * - * PA = 3.2.1.0 and input = 5 becomes 3.2.1.5 - * - * PA = 3.2.1.3 and input = 5 becomes f.f.f.f since it maxed out the depth. - * - * Return: the new physical address or CEC_PHYS_ADDR_INVALID. - */ -u16 cec_phys_addr_for_input(u16 phys_addr, u8 input); - -/** - * cec_phys_addr_validate() - validate a physical address from an EDID - * - * @phys_addr: the physical address to validate - * @parent: if not %NULL, then this is filled with the parents PA. - * @port: if not %NULL, then this is filled with the input port. - * - * This validates a physical address as read from an EDID. If the - * PA is invalid (such as 1.0.1.0 since '0' is only allowed at the end), - * then it will return -EINVAL. - * - * The parent PA is passed into %parent and the input port is passed into - * %port. For example: - * - * PA = 0.0.0.0: has parent 0.0.0.0 and input port 0. - * - * PA = 1.0.0.0: has parent 0.0.0.0 and input port 1. - * - * PA = 3.2.0.0: has parent 3.0.0.0 and input port 2. - * - * PA = f.f.f.f: has parent f.f.f.f and input port 0. - * - * Return: 0 if the PA is valid, -EINVAL if not. - */ -int cec_phys_addr_validate(u16 phys_addr, u16 *parent, u16 *port); - -#endif /* _MEDIA_CEC_EDID_H */ diff --git a/include/media/cec-notifier.h b/include/media/cec-notifier.h new file mode 100644 index 0000000000000000000000000000000000000000..eb50ce54b759154b99cf333e177adf8ca229c69b --- /dev/null +++ b/include/media/cec-notifier.h @@ -0,0 +1,111 @@ +/* + * cec-notifier.h - notify CEC drivers of physical address changes + * + * Copyright 2016 Russell King + * Copyright 2016-2017 Cisco Systems, Inc. and/or its affiliates. All rights reserved. + * + * This program is free software; you may redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef LINUX_CEC_NOTIFIER_H +#define LINUX_CEC_NOTIFIER_H + +#include +#include + +struct device; +struct edid; +struct cec_adapter; +struct cec_notifier; + +#ifdef CONFIG_MEDIA_CEC_NOTIFIER + +/** + * cec_notifier_get - find or create a new cec_notifier for the given device. + * @dev: device that sends the events. + * + * If a notifier for device @dev already exists, then increase the refcount + * and return that notifier. + * + * If it doesn't exist, then allocate a new notifier struct and return a + * pointer to that new struct. + * + * Return NULL if the memory could not be allocated. 
+ */
+struct cec_notifier *cec_notifier_get(struct device *dev);
+
+/**
+ * cec_notifier_put - decrease refcount and delete when the refcount reaches 0.
+ * @n: notifier
+ */
+void cec_notifier_put(struct cec_notifier *n);
+
+/**
+ * cec_notifier_set_phys_addr - set a new physical address.
+ * @n: the CEC notifier
+ * @pa: the CEC physical address
+ *
+ * Set a new CEC physical address.
+ */
+void cec_notifier_set_phys_addr(struct cec_notifier *n, u16 pa);
+
+/**
+ * cec_notifier_set_phys_addr_from_edid - parse the PA from the EDID and set it.
+ * @n: the CEC notifier
+ * @edid: the struct edid pointer
+ *
+ * Parses the EDID to obtain the new CEC physical address and sets it.
+ */
+void cec_notifier_set_phys_addr_from_edid(struct cec_notifier *n,
+					  const struct edid *edid);
+
+/**
+ * cec_notifier_register - register a callback with the notifier
+ * @n: the CEC notifier
+ * @adap: the CEC adapter, passed as argument to the callback function
+ * @callback: the callback function
+ */
+void cec_notifier_register(struct cec_notifier *n,
+			   struct cec_adapter *adap,
+			   void (*callback)(struct cec_adapter *adap, u16 pa));
+
+/**
+ * cec_notifier_unregister - unregister the callback from the notifier.
+ * @n: the CEC notifier
+ */
+void cec_notifier_unregister(struct cec_notifier *n);
+
+#else
+static inline struct cec_notifier *cec_notifier_get(struct device *dev)
+{
+	/* A non-NULL pointer is expected on success */
+	return (struct cec_notifier *)0xdeadfeed;
+}
+
+static inline void cec_notifier_put(struct cec_notifier *n)
+{
+}
+
+static inline void cec_notifier_set_phys_addr(struct cec_notifier *n, u16 pa)
+{
+}
+
+static inline void cec_notifier_set_phys_addr_from_edid(struct cec_notifier *n,
+							 const struct edid *edid)
+{
+}
+
+#endif
+
+#endif
diff --git a/include/media/cec.h b/include/media/cec.h
index 96a0aa770d61ddd843fc418f7ae3df4b4df41f74..b8eb895731d561a5f0ff4ec218055581ad6e21ee 100644
--- a/include/media/cec.h
+++ b/include/media/cec.h
@@ -29,7 +29,7 @@
 #include
 #include
 #include
-#include
+#include
 
 /**
  * struct cec_devnode - cec device node
@@ -173,6 +173,10 @@ struct cec_adapter {
 	bool passthrough;
 	struct cec_log_addrs log_addrs;
 
+#ifdef CONFIG_MEDIA_CEC_NOTIFIER
+	struct cec_notifier *notifier;
+#endif
+
 	struct dentry *cec_dir;
 	struct dentry *status_file;
 
@@ -184,6 +188,11 @@ struct cec_adapter {
 	char input_drv[32];
 };
 
+static inline void *cec_get_drvdata(const struct cec_adapter *adap)
+{
+	return adap->priv;
+}
+
 static inline bool cec_has_log_addr(const struct cec_adapter *adap, u8 log_addr)
 {
 	return adap->log_addrs.log_addr_mask & (1 << log_addr);
@@ -194,7 +203,10 @@ static inline bool cec_is_sink(const struct cec_adapter *adap)
 	return adap->phys_addr == 0;
 }
 
-#if IS_ENABLED(CONFIG_MEDIA_CEC_SUPPORT)
+#define cec_phys_addr_exp(pa) \
+	((pa) >> 12), ((pa) >> 8) & 0xf, ((pa) >> 4) & 0xf, (pa) & 0xf
+
+#if IS_ENABLED(CONFIG_CEC_CORE)
 struct cec_adapter *cec_allocate_adapter(const struct cec_adap_ops *ops,
 		void *priv, const char *name, u32 caps, u8 available_las);
 int cec_register_adapter(struct cec_adapter *adap, struct device *parent);
@@ -213,6 +225,86 @@ void cec_transmit_done(struct cec_adapter *adap, u8 status, u8 arb_lost_cnt,
 		       u8 nack_cnt, u8 low_drive_cnt, u8 error_cnt);
 void cec_received_msg(struct cec_adapter *adap, struct cec_msg *msg);
 
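A hedged sketch of how an HDMI transmitter driver is expected to drive the notifier API declared above; foo_connector_update() is a hypothetical hotplug/EDID handler, not an API from this patch:

```c
#include <linux/errno.h>
#include <media/cec-notifier.h>

/* Hypothetical hotplug handler: publish the physical address parsed
 * from the sink's EDID to the associated CEC adapter, if any. */
static int foo_connector_update(struct device *dev,
				const struct edid *edid,
				struct cec_notifier **cached)
{
	struct cec_notifier *n = *cached;

	if (!n) {
		n = cec_notifier_get(dev);	/* refcounted find-or-create */
		if (!n)
			return -ENOMEM;
		*cached = n;
	}

	/* On disconnect a driver would instead report
	 * cec_notifier_set_phys_addr(n, CEC_PHYS_ADDR_INVALID). */
	cec_notifier_set_phys_addr_from_edid(n, edid);
	return 0;
}
```

+/**
+ * cec_get_edid_phys_addr() - find and return the physical address
+ *
+ * @edid: pointer to the EDID data
+ * @size: size in bytes of the EDID data
+ * @offset: If not %NULL then the location of the physical address
+ * bytes in the EDID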
will be returned here. This is set to 0
+ * if there is no physical address found.
+ *
+ * Return: the physical address or CEC_PHYS_ADDR_INVALID if there is none.
+ */
+u16 cec_get_edid_phys_addr(const u8 *edid, unsigned int size,
+			   unsigned int *offset);
+
+/**
+ * cec_set_edid_phys_addr() - find and set the physical address
+ *
+ * @edid: pointer to the EDID data
+ * @size: size in bytes of the EDID data
+ * @phys_addr: the new physical address
+ *
+ * This function finds the location of the physical address in the EDID
+ * and fills in the given physical address and updates the checksum
+ * at the end of the EDID block. It does nothing if the EDID doesn't
+ * contain a physical address.
+ */
+void cec_set_edid_phys_addr(u8 *edid, unsigned int size, u16 phys_addr);
+
+/**
+ * cec_phys_addr_for_input() - calculate the PA for an input
+ *
+ * @phys_addr: the physical address of the parent
+ * @input: the number of the input port, must be between 1 and 15
+ *
+ * This function calculates a new physical address based on the input
+ * port number. For example:
+ *
+ * PA = 0.0.0.0 and input = 2 becomes 2.0.0.0
+ *
+ * PA = 3.0.0.0 and input = 1 becomes 3.1.0.0
+ *
+ * PA = 3.2.1.0 and input = 5 becomes 3.2.1.5
+ *
+ * PA = 3.2.1.3 and input = 5 becomes f.f.f.f since it maxed out the depth.
+ *
+ * Return: the new physical address or CEC_PHYS_ADDR_INVALID.
+ */
+u16 cec_phys_addr_for_input(u16 phys_addr, u8 input);
+
+/**
+ * cec_phys_addr_validate() - validate a physical address from an EDID
+ *
+ * @phys_addr: the physical address to validate
+ * @parent: if not %NULL, then this is filled with the parent's PA.
+ * @port: if not %NULL, then this is filled with the input port.
+ *
+ * This validates a physical address as read from an EDID. If the
+ * PA is invalid (such as 1.0.1.0 since '0' is only allowed at the end),
+ * then it will return -EINVAL.
+ *
+ * The parent PA is passed into %parent and the input port is passed into
+ * %port. For example:
+ *
+ * PA = 0.0.0.0: has parent 0.0.0.0 and input port 0.
+ *
+ * PA = 1.0.0.0: has parent 0.0.0.0 and input port 1.
+ *
+ * PA = 3.2.0.0: has parent 3.0.0.0 and input port 2.
+ *
+ * PA = f.f.f.f: has parent f.f.f.f and input port 0.
+ *
+ * Return: 0 if the PA is valid, -EINVAL if not.
+ */ +int cec_phys_addr_validate(u16 phys_addr, u16 *parent, u16 *port); + +#ifdef CONFIG_MEDIA_CEC_NOTIFIER +void cec_register_cec_notifier(struct cec_adapter *adap, + struct cec_notifier *notifier); +#endif + #else static inline int cec_register_adapter(struct cec_adapter *adap, @@ -234,6 +326,29 @@ static inline void cec_s_phys_addr(struct cec_adapter *adap, u16 phys_addr, { } +static inline u16 cec_get_edid_phys_addr(const u8 *edid, unsigned int size, + unsigned int *offset) +{ + if (offset) + *offset = 0; + return CEC_PHYS_ADDR_INVALID; +} + +static inline void cec_set_edid_phys_addr(u8 *edid, unsigned int size, + u16 phys_addr) +{ +} + +static inline u16 cec_phys_addr_for_input(u16 phys_addr, u8 input) +{ + return CEC_PHYS_ADDR_INVALID; +} + +static inline int cec_phys_addr_validate(u16 phys_addr, u16 *parent, u16 *port) +{ + return 0; +} + #endif #endif /* _MEDIA_CEC_H */ diff --git a/include/media/davinci/vpif_types.h b/include/media/davinci/vpif_types.h index c49c306cba6177fc21e5f9765059a98935bddc9a..385597da20dc3791cfa0c0f29f9b9c53bcc6516d 100644 --- a/include/media/davinci/vpif_types.h +++ b/include/media/davinci/vpif_types.h @@ -53,6 +53,7 @@ struct vpif_display_config { int (*set_clock)(int, int); struct vpif_subdev_info *subdevinfo; int subdev_count; + int i2c_adapter_id; struct vpif_display_chan_config chan_config[VPIF_DISPLAY_MAX_CHANNELS]; const char *card_name; struct v4l2_async_subdev **asd; /* Flat array, arranged in groups */ diff --git a/include/media/rc-map.h b/include/media/rc-map.h index a704749280d2342d10f64fbd01b2b9a90604a045..1a815a572fa184a22cdf5c3240bea59f0514de4d 100644 --- a/include/media/rc-map.h +++ b/include/media/rc-map.h @@ -27,7 +27,8 @@ * @RC_TYPE_NECX: Extended NEC protocol * @RC_TYPE_NEC32: NEC 32 bit protocol * @RC_TYPE_SANYO: Sanyo protocol - * @RC_TYPE_MCE_KBD: RC6-ish MCE keyboard/mouse + * @RC_TYPE_MCIR2_KBD: RC6-ish MCE keyboard + * @RC_TYPE_MCIR2_MSE: RC6-ish MCE mouse * @RC_TYPE_RC6_0: Philips RC6-0-16 protocol * @RC_TYPE_RC6_6A_20: Philips RC6-6A-20 protocol * @RC_TYPE_RC6_6A_24: Philips RC6-6A-24 protocol @@ -51,48 +52,51 @@ enum rc_type { RC_TYPE_NECX = 10, RC_TYPE_NEC32 = 11, RC_TYPE_SANYO = 12, - RC_TYPE_MCE_KBD = 13, - RC_TYPE_RC6_0 = 14, - RC_TYPE_RC6_6A_20 = 15, - RC_TYPE_RC6_6A_24 = 16, - RC_TYPE_RC6_6A_32 = 17, - RC_TYPE_RC6_MCE = 18, - RC_TYPE_SHARP = 19, - RC_TYPE_XMP = 20, - RC_TYPE_CEC = 21, + RC_TYPE_MCIR2_KBD = 13, + RC_TYPE_MCIR2_MSE = 14, + RC_TYPE_RC6_0 = 15, + RC_TYPE_RC6_6A_20 = 16, + RC_TYPE_RC6_6A_24 = 17, + RC_TYPE_RC6_6A_32 = 18, + RC_TYPE_RC6_MCE = 19, + RC_TYPE_SHARP = 20, + RC_TYPE_XMP = 21, + RC_TYPE_CEC = 22, }; #define RC_BIT_NONE 0ULL -#define RC_BIT_UNKNOWN (1ULL << RC_TYPE_UNKNOWN) -#define RC_BIT_OTHER (1ULL << RC_TYPE_OTHER) -#define RC_BIT_RC5 (1ULL << RC_TYPE_RC5) -#define RC_BIT_RC5X_20 (1ULL << RC_TYPE_RC5X_20) -#define RC_BIT_RC5_SZ (1ULL << RC_TYPE_RC5_SZ) -#define RC_BIT_JVC (1ULL << RC_TYPE_JVC) -#define RC_BIT_SONY12 (1ULL << RC_TYPE_SONY12) -#define RC_BIT_SONY15 (1ULL << RC_TYPE_SONY15) -#define RC_BIT_SONY20 (1ULL << RC_TYPE_SONY20) -#define RC_BIT_NEC (1ULL << RC_TYPE_NEC) -#define RC_BIT_NECX (1ULL << RC_TYPE_NECX) -#define RC_BIT_NEC32 (1ULL << RC_TYPE_NEC32) -#define RC_BIT_SANYO (1ULL << RC_TYPE_SANYO) -#define RC_BIT_MCE_KBD (1ULL << RC_TYPE_MCE_KBD) -#define RC_BIT_RC6_0 (1ULL << RC_TYPE_RC6_0) -#define RC_BIT_RC6_6A_20 (1ULL << RC_TYPE_RC6_6A_20) -#define RC_BIT_RC6_6A_24 (1ULL << RC_TYPE_RC6_6A_24) -#define RC_BIT_RC6_6A_32 (1ULL << RC_TYPE_RC6_6A_32) -#define RC_BIT_RC6_MCE (1ULL << 
RC_TYPE_RC6_MCE) -#define RC_BIT_SHARP (1ULL << RC_TYPE_SHARP) -#define RC_BIT_XMP (1ULL << RC_TYPE_XMP) -#define RC_BIT_CEC (1ULL << RC_TYPE_CEC) +#define RC_BIT_UNKNOWN BIT_ULL(RC_TYPE_UNKNOWN) +#define RC_BIT_OTHER BIT_ULL(RC_TYPE_OTHER) +#define RC_BIT_RC5 BIT_ULL(RC_TYPE_RC5) +#define RC_BIT_RC5X_20 BIT_ULL(RC_TYPE_RC5X_20) +#define RC_BIT_RC5_SZ BIT_ULL(RC_TYPE_RC5_SZ) +#define RC_BIT_JVC BIT_ULL(RC_TYPE_JVC) +#define RC_BIT_SONY12 BIT_ULL(RC_TYPE_SONY12) +#define RC_BIT_SONY15 BIT_ULL(RC_TYPE_SONY15) +#define RC_BIT_SONY20 BIT_ULL(RC_TYPE_SONY20) +#define RC_BIT_NEC BIT_ULL(RC_TYPE_NEC) +#define RC_BIT_NECX BIT_ULL(RC_TYPE_NECX) +#define RC_BIT_NEC32 BIT_ULL(RC_TYPE_NEC32) +#define RC_BIT_SANYO BIT_ULL(RC_TYPE_SANYO) +#define RC_BIT_MCIR2_KBD BIT_ULL(RC_TYPE_MCIR2_KBD) +#define RC_BIT_MCIR2_MSE BIT_ULL(RC_TYPE_MCIR2_MSE) +#define RC_BIT_RC6_0 BIT_ULL(RC_TYPE_RC6_0) +#define RC_BIT_RC6_6A_20 BIT_ULL(RC_TYPE_RC6_6A_20) +#define RC_BIT_RC6_6A_24 BIT_ULL(RC_TYPE_RC6_6A_24) +#define RC_BIT_RC6_6A_32 BIT_ULL(RC_TYPE_RC6_6A_32) +#define RC_BIT_RC6_MCE BIT_ULL(RC_TYPE_RC6_MCE) +#define RC_BIT_SHARP BIT_ULL(RC_TYPE_SHARP) +#define RC_BIT_XMP BIT_ULL(RC_TYPE_XMP) +#define RC_BIT_CEC BIT_ULL(RC_TYPE_CEC) #define RC_BIT_ALL (RC_BIT_UNKNOWN | RC_BIT_OTHER | \ RC_BIT_RC5 | RC_BIT_RC5X_20 | RC_BIT_RC5_SZ | \ RC_BIT_JVC | \ RC_BIT_SONY12 | RC_BIT_SONY15 | RC_BIT_SONY20 | \ RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32 | \ - RC_BIT_SANYO | RC_BIT_MCE_KBD | RC_BIT_RC6_0 | \ - RC_BIT_RC6_6A_20 | RC_BIT_RC6_6A_24 | \ + RC_BIT_SANYO | \ + RC_BIT_MCIR2_KBD | RC_BIT_MCIR2_MSE | \ + RC_BIT_RC6_0 | RC_BIT_RC6_6A_20 | RC_BIT_RC6_6A_24 | \ RC_BIT_RC6_6A_32 | RC_BIT_RC6_MCE | RC_BIT_SHARP | \ RC_BIT_XMP | RC_BIT_CEC) /* All rc protocols for which we have decoders */ @@ -101,8 +105,8 @@ enum rc_type { RC_BIT_JVC | \ RC_BIT_SONY12 | RC_BIT_SONY15 | RC_BIT_SONY20 | \ RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32 | \ - RC_BIT_SANYO | RC_BIT_MCE_KBD | RC_BIT_RC6_0 | \ - RC_BIT_RC6_6A_20 | RC_BIT_RC6_6A_24 | \ + RC_BIT_SANYO | RC_BIT_MCIR2_KBD | RC_BIT_MCIR2_MSE | \ + RC_BIT_RC6_0 | RC_BIT_RC6_6A_20 | RC_BIT_RC6_6A_24 | \ RC_BIT_RC6_6A_32 | RC_BIT_RC6_MCE | RC_BIT_SHARP | \ RC_BIT_XMP) @@ -111,7 +115,7 @@ enum rc_type { RC_BIT_JVC | \ RC_BIT_SONY12 | RC_BIT_SONY15 | RC_BIT_SONY20 | \ RC_BIT_NEC | RC_BIT_NECX | RC_BIT_NEC32 | \ - RC_BIT_SANYO | \ + RC_BIT_SANYO | RC_BIT_MCIR2_KBD | RC_BIT_MCIR2_MSE | \ RC_BIT_RC6_0 | RC_BIT_RC6_6A_20 | RC_BIT_RC6_6A_24 | \ RC_BIT_RC6_6A_32 | RC_BIT_RC6_MCE | \ RC_BIT_SHARP) @@ -255,7 +259,6 @@ struct rc_map *rc_map_get(const char *name); #define RC_MAP_KWORLD_PC150U "rc-kworld-pc150u" #define RC_MAP_KWORLD_PLUS_TV_ANALOG "rc-kworld-plus-tv-analog" #define RC_MAP_LEADTEK_Y04G0051 "rc-leadtek-y04g0051" -#define RC_MAP_LIRC "rc-lirc" #define RC_MAP_LME2510 "rc-lme2510" #define RC_MAP_MANLI "rc-manli" #define RC_MAP_MEDION_X10 "rc-medion-x10" diff --git a/include/media/soc_camera.h b/include/media/soc_camera.h index 1a15c3e4efd387ce5641946aaa67183ba5daa3f7..4d8cb0796bc60d637cd69a7381cb93e1318a723d 100644 --- a/include/media/soc_camera.h +++ b/include/media/soc_camera.h @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -55,10 +54,7 @@ struct soc_camera_device { /* Asynchronous subdevice management */ struct soc_camera_async_client *sasc; /* video buffer queue */ - union { - struct videobuf_queue vb_vidq; - struct vb2_queue vb2_vidq; - }; + struct vb2_queue vb2_vidq; }; /* Host supports programmable stride */ @@ -114,11 +110,8 @@ struct soc_camera_host_ops { int 
(*set_liveselection)(struct soc_camera_device *, struct v4l2_selection *); int (*set_fmt)(struct soc_camera_device *, struct v4l2_format *); int (*try_fmt)(struct soc_camera_device *, struct v4l2_format *); - void (*init_videobuf)(struct videobuf_queue *, - struct soc_camera_device *); int (*init_videobuf2)(struct vb2_queue *, struct soc_camera_device *); - int (*reqbufs)(struct soc_camera_device *, struct v4l2_requestbuffers *); int (*querycap)(struct soc_camera_host *, struct v4l2_capability *); int (*set_bus_param)(struct soc_camera_device *); int (*get_parm)(struct soc_camera_device *, struct v4l2_streamparm *); @@ -396,11 +389,6 @@ static inline struct soc_camera_device *soc_camera_from_vb2q(const struct vb2_qu return container_of(vq, struct soc_camera_device, vb2_vidq); } -static inline struct soc_camera_device *soc_camera_from_vbq(const struct videobuf_queue *vq) -{ - return container_of(vq, struct soc_camera_device, vb_vidq); -} - static inline u32 soc_camera_grp_id(const struct soc_camera_device *icd) { return (icd->iface << 8) | (icd->devnum + 1); diff --git a/include/media/tveeprom.h b/include/media/tveeprom.h index c56501ee0484a1a1e0b7c6720499195637756b3d..630bcf3d8885b9e1eaf9da2a77b076c4a13c4d4e 100644 --- a/include/media/tveeprom.h +++ b/include/media/tveeprom.h @@ -94,13 +94,12 @@ struct tveeprom { * of the eeprom previously filled at * @eeprom_data field. * - * @c: I2C client struct * @tvee: Struct to where the eeprom parsed data will be filled; * @eeprom_data: Array with the contents of the eeprom_data. It should * contain 256 bytes filled with the contents of the * eeprom read from the Hauppauge device. */ -void tveeprom_hauppauge_analog(struct i2c_client *c, struct tveeprom *tvee, +void tveeprom_hauppauge_analog(struct tveeprom *tvee, unsigned char *eeprom_data); /** diff --git a/include/media/v4l2-ioctl.h b/include/media/v4l2-ioctl.h index 6cd94e5ee113f0f5314f28e69e136bc9e35a4e9b..bd5312118013d3fbd2d88c698d15139d0b9155da 100644 --- a/include/media/v4l2-ioctl.h +++ b/include/media/v4l2-ioctl.h @@ -44,6 +44,9 @@ struct v4l2_fh; * @vidioc_enum_fmt_sdr_out: pointer to the function that implements * :ref:`VIDIOC_ENUM_FMT ` ioctl logic * for Software Defined Radio output + * @vidioc_enum_fmt_meta_cap: pointer to the function that implements + * :ref:`VIDIOC_ENUM_FMT ` ioctl logic + * for metadata capture * @vidioc_g_fmt_vid_cap: pointer to the function that implements * :ref:`VIDIOC_G_FMT ` ioctl logic for video capture * in single plane mode @@ -74,6 +77,8 @@ struct v4l2_fh; * @vidioc_g_fmt_sdr_out: pointer to the function that implements * :ref:`VIDIOC_G_FMT ` ioctl logic for Software Defined * Radio output + * @vidioc_g_fmt_meta_cap: pointer to the function that implements + * :ref:`VIDIOC_G_FMT ` ioctl logic for metadata capture * @vidioc_s_fmt_vid_cap: pointer to the function that implements * :ref:`VIDIOC_S_FMT ` ioctl logic for video capture * in single plane mode @@ -104,6 +109,8 @@ struct v4l2_fh; * @vidioc_s_fmt_sdr_out: pointer to the function that implements * :ref:`VIDIOC_S_FMT ` ioctl logic for Software Defined * Radio output + * @vidioc_s_fmt_meta_cap: pointer to the function that implements + * :ref:`VIDIOC_S_FMT ` ioctl logic for metadata capture * @vidioc_try_fmt_vid_cap: pointer to the function that implements * :ref:`VIDIOC_TRY_FMT ` ioctl logic for video capture * in single plane mode @@ -136,6 +143,8 @@ struct v4l2_fh; * @vidioc_try_fmt_sdr_out: pointer to the function that implements * :ref:`VIDIOC_TRY_FMT ` ioctl logic for Software Defined * Radio 
output
+ * @vidioc_try_fmt_meta_cap: pointer to the function that implements
+ *	:ref:`VIDIOC_TRY_FMT ` ioctl logic for metadata capture
 * @vidioc_reqbufs: pointer to the function that implements
 *	:ref:`VIDIOC_REQBUFS ` ioctl
 * @vidioc_querybuf: pointer to the function that implements
@@ -306,6 +315,8 @@ struct v4l2_ioctl_ops {
 					    struct v4l2_fmtdesc *f);
 	int (*vidioc_enum_fmt_sdr_out)(struct file *file, void *fh,
 				       struct v4l2_fmtdesc *f);
+	int (*vidioc_enum_fmt_meta_cap)(struct file *file, void *fh,
+					struct v4l2_fmtdesc *f);
 
 	/* VIDIOC_G_FMT handlers */
 	int (*vidioc_g_fmt_vid_cap)(struct file *file, void *fh,
@@ -332,6 +343,8 @@ struct v4l2_ioctl_ops {
 				    struct v4l2_format *f);
 	int (*vidioc_g_fmt_sdr_out)(struct file *file, void *fh,
 				    struct v4l2_format *f);
+	int (*vidioc_g_fmt_meta_cap)(struct file *file, void *fh,
+				     struct v4l2_format *f);
 
 	/* VIDIOC_S_FMT handlers */
 	int (*vidioc_s_fmt_vid_cap)(struct file *file, void *fh,
@@ -358,6 +371,8 @@ struct v4l2_ioctl_ops {
 				    struct v4l2_format *f);
 	int (*vidioc_s_fmt_sdr_out)(struct file *file, void *fh,
 				    struct v4l2_format *f);
+	int (*vidioc_s_fmt_meta_cap)(struct file *file, void *fh,
+				     struct v4l2_format *f);
 
 	/* VIDIOC_TRY_FMT handlers */
 	int (*vidioc_try_fmt_vid_cap)(struct file *file, void *fh,
@@ -384,6 +399,8 @@ struct v4l2_ioctl_ops {
 				    struct v4l2_format *f);
 	int (*vidioc_try_fmt_sdr_out)(struct file *file, void *fh,
 				    struct v4l2_format *f);
+	int (*vidioc_try_fmt_meta_cap)(struct file *file, void *fh,
+				       struct v4l2_format *f);
 
 	/* Buffer handlers */
 	int (*vidioc_reqbufs)(struct file *file, void *fh,
diff --git a/include/media/videobuf2-core.h b/include/media/videobuf2-core.h
index ac5898a55fd967e6c051f8e837f9bdb6425752fb..cb97c224be730f5e3fd7076fd681e66b56df64e5 100644
--- a/include/media/videobuf2-core.h
+++ b/include/media/videobuf2-core.h
@@ -305,16 +305,16 @@ struct vb2_buffer {
 *			buffer in \*num_planes, the size of each plane should be
 *			set in the sizes\[\] array and optional per-plane
 *			allocator specific device in the alloc_devs\[\] array.
-*			When called from VIDIOC_REQBUFS,() \*num_planes == 0,
+*			When called from VIDIOC_REQBUFS(), \*num_planes == 0,
 *			the driver has to use the currently configured format to
 *			determine the plane sizes and \*num_buffers is the total
 *			number of buffers that are being allocated. When called
-*			from VIDIOC_CREATE_BUFS,() \*num_planes != 0 and it
+*			from VIDIOC_CREATE_BUFS(), \*num_planes != 0 and it
 *			describes the requested number of planes and sizes\[\]
-*			contains the requested plane sizes. If either
-*			\*num_planes or the requested sizes are invalid callback
-*			must return %-EINVAL. In this case \*num_buffers are
-*			being allocated additionally to q->num_buffers.
+*			contains the requested plane sizes. In this case
+*			\*num_buffers buffers are allocated in addition to
+*			q->num_buffers. If either \*num_planes or the requested
+*			sizes are invalid, the callback must return %-EINVAL.
* @wait_prepare: release any locks taken while calling vb2 functions; * it is called before an ioctl needs to wait for a new * buffer to arrive; required to avoid a deadlock in diff --git a/include/media/videobuf2-memops.h b/include/media/videobuf2-memops.h index 36565c7acb542b961fcbe38d8b978a6f802f234c..a6ed091b79ce501a183b2fb6cf6051d169f066f7 100644 --- a/include/media/videobuf2-memops.h +++ b/include/media/videobuf2-memops.h @@ -16,6 +16,7 @@ #include #include +#include /** * struct vb2_vmarea_handler - common vma refcount tracking handler @@ -25,7 +26,7 @@ * @arg: argument for @put callback */ struct vb2_vmarea_handler { - atomic_t *refcount; + refcount_t *refcount; void (*put)(void *arg); void *arg; }; diff --git a/include/misc/charlcd.h b/include/misc/charlcd.h new file mode 100644 index 0000000000000000000000000000000000000000..23f61850f3639ae1686f9f83e0bac6e6a5015ba5 --- /dev/null +++ b/include/misc/charlcd.h @@ -0,0 +1,42 @@ +/* + * Character LCD driver for Linux + * + * Copyright (C) 2000-2008, Willy Tarreau + * Copyright (C) 2016-2017 Glider bvba + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +struct charlcd { + const struct charlcd_ops *ops; + const unsigned char *char_conv; /* Optional */ + + int ifwidth; /* 4-bit or 8-bit (default) */ + int height; + int width; + int bwidth; /* Default set by charlcd_alloc() */ + int hwidth; /* Default set by charlcd_alloc() */ + + void *drvdata; /* Set by charlcd_alloc() */ +}; + +struct charlcd_ops { + /* Required */ + void (*write_cmd)(struct charlcd *lcd, int cmd); + void (*write_data)(struct charlcd *lcd, int data); + + /* Optional */ + void (*write_cmd_raw4)(struct charlcd *lcd, int cmd); /* 4-bit only */ + void (*clear_fast)(struct charlcd *lcd); + void (*backlight)(struct charlcd *lcd, int on); +}; + +struct charlcd *charlcd_alloc(unsigned int drvdata_size); + +int charlcd_register(struct charlcd *lcd); +int charlcd_unregister(struct charlcd *lcd); + +void charlcd_poke(struct charlcd *lcd); diff --git a/include/net/6lowpan.h b/include/net/6lowpan.h index 5ab4c9901ccc12979e32d4788c35050717a00d7f..a71378007e61dc94e7f43bf671b275c291376db8 100644 --- a/include/net/6lowpan.h +++ b/include/net/6lowpan.h @@ -198,6 +198,21 @@ static inline void lowpan_iphc_uncompress_eui64_lladdr(struct in6_addr *ipaddr, ipaddr->s6_addr[8] ^= 0x02; } +static inline void lowpan_iphc_uncompress_eui48_lladdr(struct in6_addr *ipaddr, + const void *lladdr) +{ + /* fe:80::XXXX:XXff:feXX:XXXX + * \_________________/ + * hwaddr + */ + ipaddr->s6_addr[0] = 0xFE; + ipaddr->s6_addr[1] = 0x80; + memcpy(&ipaddr->s6_addr[8], lladdr, 3); + ipaddr->s6_addr[11] = 0xFF; + ipaddr->s6_addr[12] = 0xFE; + memcpy(&ipaddr->s6_addr[13], lladdr + 3, 3); +} + #ifdef DEBUG /* print data in line */ static inline void raw_dump_inline(const char *caller, char *msg, diff --git a/include/net/addrconf.h b/include/net/addrconf.h index 17c6fd84e287808eca08503b87a56a5d6fc0cc5c..b43a4eec3ceca4f798bf7513ecd4aa999535c6e7 100644 --- a/include/net/addrconf.h +++ b/include/net/addrconf.h @@ -20,6 +20,8 @@ #define ADDRCONF_TIMER_FUZZ (HZ / 4) #define ADDRCONF_TIMER_FUZZ_MAX (HZ) +#define ADDRCONF_NOTIFY_PRIORITY 0 + #include #include @@ -103,12 +105,24 @@ int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, u32 addr_flags, bool sllao, bool tokenized, __u32 
valid_lft, u32 prefered_lft); +static inline void addrconf_addr_eui48_base(u8 *eui, const char *const addr) +{ + memcpy(eui, addr, 3); + eui[3] = 0xFF; + eui[4] = 0xFE; + memcpy(eui + 5, addr + 3, 3); +} + +static inline void addrconf_addr_eui48(u8 *eui, const char *const addr) +{ + addrconf_addr_eui48_base(eui, addr); + eui[0] ^= 2; +} + static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) { if (dev->addr_len != ETH_ALEN) return -1; - memcpy(eui, dev->dev_addr, 3); - memcpy(eui + 5, dev->dev_addr + 3, 3); /* * The zSeries OSA network cards can be shared among various @@ -123,14 +137,16 @@ static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) * case. Hence the resulting interface identifier has local * scope according to RFC2373. */ + + addrconf_addr_eui48_base(eui, dev->dev_addr); + if (dev->dev_id) { eui[3] = (dev->dev_id >> 8) & 0xFF; eui[4] = dev->dev_id & 0xFF; } else { - eui[3] = 0xFF; - eui[4] = 0xFE; eui[0] ^= 2; } + return 0; } @@ -262,8 +278,8 @@ int register_inet6addr_notifier(struct notifier_block *nb); int unregister_inet6addr_notifier(struct notifier_block *nb); int inet6addr_notifier_call_chain(unsigned long val, void *v); -void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, - struct ipv6_devconf *devconf); +void inet6_netconf_notify_devconf(struct net *net, int event, int type, + int ifindex, struct ipv6_devconf *devconf); /** * __in6_dev_get - get inet6_dev pointer from netdevice diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h index 1061a472a3e35b88a7df587fdeefcdb310c793bc..b5f5187f488cc1e663912ec1e12811d49ecb3fd5 100644 --- a/include/net/af_rxrpc.h +++ b/include/net/af_rxrpc.h @@ -39,7 +39,7 @@ int rxrpc_kernel_send_data(struct socket *, struct rxrpc_call *, struct msghdr *, size_t); int rxrpc_kernel_recv_data(struct socket *, struct rxrpc_call *, void *, size_t, size_t *, bool, u32 *); -void rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, +bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, u32, int, const char *); void rxrpc_kernel_end_call(struct socket *, struct rxrpc_call *); void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h index f32ed9ac181a47c00757596fc3b8c5733426c468..f9fb566e75cfd8ca1531dd733d443f9a6c929c62 100644 --- a/include/net/af_vsock.h +++ b/include/net/af_vsock.h @@ -188,4 +188,17 @@ struct sock *vsock_find_connected_socket(struct sockaddr_vm *src, void vsock_remove_sock(struct vsock_sock *vsk); void vsock_for_each_connected_socket(void (*fn)(struct sock *sk)); +/**** TAP ****/ + +struct vsock_tap { + struct net_device *dev; + struct module *module; + struct list_head list; +}; + +int vsock_init_tap(void); +int vsock_add_tap(struct vsock_tap *vt); +int vsock_remove_tap(struct vsock_tap *vt); +void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque); + #endif /* __AF_VSOCK_H__ */ diff --git a/include/net/bluetooth/l2cap.h b/include/net/bluetooth/l2cap.h index 5ee3c689c86370b4b325309d337611c9b51af5d1..0697fd41308777a4801dcf8728f2ff5aa0210804 100644 --- a/include/net/bluetooth/l2cap.h +++ b/include/net/bluetooth/l2cap.h @@ -282,7 +282,7 @@ struct l2cap_conn_rsp { #define L2CAP_CR_BAD_KEY_SIZE 0x0007 #define L2CAP_CR_ENCRYPTION 0x0008 #define L2CAP_CR_INVALID_SCID 0x0009 -#define L2CAP_CR_SCID_IN_USE 0x0010 +#define L2CAP_CR_SCID_IN_USE 0x000A /* connect/create channel status */ #define L2CAP_CS_NO_INFO 0x0000 diff --git a/include/net/bluetooth/rfcomm.h 
b/include/net/bluetooth/rfcomm.h index 4190af53a46ad1a5f7868f4add65363e5ada9b20..da4acefe39c81abbc11af58c88eb283159d1de26 100644 --- a/include/net/bluetooth/rfcomm.h +++ b/include/net/bluetooth/rfcomm.h @@ -21,6 +21,8 @@ SOFTWARE IS DISCLAIMED. */ +#include + #ifndef __RFCOMM_H #define __RFCOMM_H @@ -174,7 +176,7 @@ struct rfcomm_dlc { struct mutex lock; unsigned long state; unsigned long flags; - atomic_t refcnt; + refcount_t refcnt; u8 dlci; u8 addr; u8 priority; @@ -247,12 +249,12 @@ struct rfcomm_dlc *rfcomm_dlc_exists(bdaddr_t *src, bdaddr_t *dst, u8 channel); static inline void rfcomm_dlc_hold(struct rfcomm_dlc *d) { - atomic_inc(&d->refcnt); + refcount_inc(&d->refcnt); } static inline void rfcomm_dlc_put(struct rfcomm_dlc *d) { - if (atomic_dec_and_test(&d->refcnt)) + if (refcount_dec_and_test(&d->refcnt)) rfcomm_dlc_free(d); } diff --git a/include/net/bonding.h b/include/net/bonding.h index 3c857778a6ca6870f7e7d5604adcd263380e4708..b00508d22e0a6adc37ee4e69187377fea3bc102d 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -153,7 +153,8 @@ struct slave { unsigned long last_link_up; unsigned long last_rx; unsigned long target_last_arp_rx[BOND_MAX_ARP_TARGETS]; - s8 link; /* one of BOND_LINK_XXXX */ + s8 link; /* one of BOND_LINK_XXXX */ + s8 link_new_state; /* one of BOND_LINK_XXXX */ s8 new_link; u8 backup:1, /* indicates backup slave. Value corresponds with BOND_STATE_ACTIVE and BOND_STATE_BACKUP */ @@ -165,7 +166,7 @@ struct slave { u32 link_failure_count; u32 speed; u16 queue_id; - u8 perm_hwaddr[ETH_ALEN]; + u8 perm_hwaddr[MAX_ADDR_LEN]; struct ad_slave_info *ad_info; struct tlb_slave_info tlb_info; #ifdef CONFIG_NET_POLL_CONTROLLER @@ -401,6 +402,16 @@ static inline bool bond_slave_can_tx(struct slave *slave) bond_is_active_slave(slave); } +static inline void bond_hw_addr_copy(u8 *dst, const u8 *src, unsigned int len) +{ + if (len == ETH_ALEN) { + ether_addr_copy(dst, src); + return; + } + + memcpy(dst, src, len); +} + #define BOND_PRI_RESELECT_ALWAYS 0 #define BOND_PRI_RESELECT_BETTER 1 #define BOND_PRI_RESELECT_FAILURE 2 @@ -504,13 +515,17 @@ static inline bool bond_is_slave_inactive(struct slave *slave) return slave->inactive; } -static inline void bond_set_slave_link_state(struct slave *slave, int state, - bool notify) +static inline void bond_propose_link_state(struct slave *slave, int state) { - if (slave->link == state) + slave->link_new_state = state; +} + +static inline void bond_commit_link_state(struct slave *slave, bool notify) +{ + if (slave->link == slave->link_new_state) return; - slave->link = state; + slave->link = slave->link_new_state; if (notify) { bond_queue_slave_event(slave); bond_lower_state_changed(slave); @@ -523,6 +538,13 @@ static inline void bond_set_slave_link_state(struct slave *slave, int state, } } +static inline void bond_set_slave_link_state(struct slave *slave, int state, + bool notify) +{ + bond_propose_link_state(slave, state); + bond_commit_link_state(slave, notify); +} + static inline void bond_slave_link_notify(struct bonding *bond) { struct list_head *iter; @@ -592,6 +614,7 @@ struct bond_vlan_tag *bond_verify_device_path(struct net_device *start_dev, int level); int bond_update_slave_arr(struct bonding *bond, struct slave *skipslave); void bond_slave_arr_work_rearm(struct bonding *bond, unsigned long delay); +void bond_work_init_all(struct bonding *bond); #ifdef CONFIG_PROC_FS void bond_create_proc_entry(struct bonding *bond); diff --git a/include/net/busy_poll.h b/include/net/busy_poll.h index 
c0452de83086e51738a249bea5fa86d6fb121760..8ffd434676b7a270af73534f3dbcc26f370fe215 100644 --- a/include/net/busy_poll.h +++ b/include/net/busy_poll.h @@ -35,83 +35,101 @@ struct napi_struct; extern unsigned int sysctl_net_busy_read __read_mostly; extern unsigned int sysctl_net_busy_poll __read_mostly; +/* 0 - Reserved to indicate value not set + * 1..NR_CPUS - Reserved for sender_cpu + * NR_CPUS+1..~0 - Region available for NAPI IDs + */ +#define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) + static inline bool net_busy_loop_on(void) { return sysctl_net_busy_poll; } -static inline u64 busy_loop_us_clock(void) +static inline bool sk_can_busy_loop(const struct sock *sk) { - return local_clock() >> 10; + return sk->sk_ll_usec && !signal_pending(current); } -static inline unsigned long sk_busy_loop_end_time(struct sock *sk) -{ - return busy_loop_us_clock() + ACCESS_ONCE(sk->sk_ll_usec); -} +bool sk_busy_loop_end(void *p, unsigned long start_time); -/* in poll/select we use the global sysctl_net_ll_poll value */ -static inline unsigned long busy_loop_end_time(void) +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg); + +#else /* CONFIG_NET_RX_BUSY_POLL */ +static inline unsigned long net_busy_loop_on(void) { - return busy_loop_us_clock() + ACCESS_ONCE(sysctl_net_busy_poll); + return 0; } -static inline bool sk_can_busy_loop(const struct sock *sk) +static inline bool sk_can_busy_loop(struct sock *sk) { - return sk->sk_ll_usec && sk->sk_napi_id && !signal_pending(current); + return false; } +#endif /* CONFIG_NET_RX_BUSY_POLL */ -static inline bool busy_loop_timeout(unsigned long end_time) +static inline unsigned long busy_loop_current_time(void) { - unsigned long now = busy_loop_us_clock(); - - return time_after(now, end_time); +#ifdef CONFIG_NET_RX_BUSY_POLL + return (unsigned long)(local_clock() >> 10); +#else + return 0; +#endif } -bool sk_busy_loop(struct sock *sk, int nonblock); - -/* used in the NIC receive handler to mark the skb */ -static inline void skb_mark_napi_id(struct sk_buff *skb, - struct napi_struct *napi) +/* in poll/select we use the global sysctl_net_ll_poll value */ +static inline bool busy_loop_timeout(unsigned long start_time) { - skb->napi_id = napi->napi_id; -} +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned long bp_usec = READ_ONCE(sysctl_net_busy_poll); + if (bp_usec) { + unsigned long end_time = start_time + bp_usec; + unsigned long now = busy_loop_current_time(); -#else /* CONFIG_NET_RX_BUSY_POLL */ -static inline unsigned long net_busy_loop_on(void) -{ - return 0; + return time_after(now, end_time); + } +#endif + return true; } -static inline unsigned long busy_loop_end_time(void) +static inline bool sk_busy_loop_timeout(struct sock *sk, + unsigned long start_time) { - return 0; -} +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned long bp_usec = READ_ONCE(sk->sk_ll_usec); -static inline bool sk_can_busy_loop(struct sock *sk) -{ - return false; -} + if (bp_usec) { + unsigned long end_time = start_time + bp_usec; + unsigned long now = busy_loop_current_time(); -static inline void skb_mark_napi_id(struct sk_buff *skb, - struct napi_struct *napi) -{ + return time_after(now, end_time); + } +#endif + return true; } -static inline bool busy_loop_timeout(unsigned long end_time) +static inline void sk_busy_loop(struct sock *sk, int nonblock) { - return true; +#ifdef CONFIG_NET_RX_BUSY_POLL + unsigned int napi_id = READ_ONCE(sk->sk_napi_id); + + if (napi_id >= MIN_NAPI_ID) + napi_busy_loop(napi_id, nonblock ? 
NULL : sk_busy_loop_end, sk); +#endif } -static inline bool sk_busy_loop(struct sock *sk, int nonblock) +/* used in the NIC receive handler to mark the skb */ +static inline void skb_mark_napi_id(struct sk_buff *skb, + struct napi_struct *napi) { - return false; +#ifdef CONFIG_NET_RX_BUSY_POLL + skb->napi_id = napi->napi_id; +#endif } -#endif /* CONFIG_NET_RX_BUSY_POLL */ - /* used in the protocol hanlder to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index ead1aa6d003ef97b09847e8175b704da31c341d1..b083e6cbae8cb33f3c9d9078bab5468cd80d31f1 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -363,6 +363,8 @@ static inline void wiphy_read_of_freq_limits(struct wiphy *wiphy) /** * struct vif_params - describes virtual interface parameters + * @flags: monitor interface flags, unchanged if 0, otherwise + * %MONITOR_FLAG_CHANGED will be set * @use_4addr: use 4-address frames * @macaddr: address to use for this virtual interface. * If this parameter is set to zero address the driver may @@ -370,13 +372,17 @@ static inline void wiphy_read_of_freq_limits(struct wiphy *wiphy) * This feature is only fully supported by drivers that enable the * %NL80211_FEATURE_MAC_ON_CREATE flag. Others may support creating ** only p2p devices with specified MAC. - * @vht_mumimo_groups: MU-MIMO groupID. used for monitoring only - * packets belonging to that MU-MIMO groupID. + * @vht_mumimo_groups: MU-MIMO groupID, used for monitoring MU-MIMO packets + * belonging to that MU-MIMO groupID; %NULL if not changed + * @vht_mumimo_follow_addr: MU-MIMO follow address, used for monitoring + * MU-MIMO packets going to the specified station; %NULL if not changed */ struct vif_params { + u32 flags; int use_4addr; u8 macaddr[ETH_ALEN]; - u8 vht_mumimo_groups[VHT_MUMIMO_GROUPS_DATA_LEN]; + const u8 *vht_mumimo_groups; + const u8 *vht_mumimo_follow_addr; }; /** @@ -1007,9 +1013,9 @@ enum rate_info_flags { * @RATE_INFO_BW_160: 160 MHz bandwidth */ enum rate_info_bw { + RATE_INFO_BW_20 = 0, RATE_INFO_BW_5, RATE_INFO_BW_10, - RATE_INFO_BW_20, RATE_INFO_BW_40, RATE_INFO_BW_80, RATE_INFO_BW_160, @@ -1211,6 +1217,7 @@ static inline int cfg80211_get_station(struct net_device *dev, * Monitor interface configuration flags. Note that these must be the bits * according to the nl80211 flags. * + * @MONITOR_FLAG_CHANGED: set if the flags were changed * @MONITOR_FLAG_FCSFAIL: pass frames with bad FCS * @MONITOR_FLAG_PLCPFAIL: pass frames with bad PLCP * @MONITOR_FLAG_CONTROL: pass control frames @@ -1219,6 +1226,7 @@ static inline int cfg80211_get_station(struct net_device *dev, * @MONITOR_FLAG_ACTIVE: active monitor, ACKs frames on its MAC address */ enum monitor_flags { + MONITOR_FLAG_CHANGED = 1<<__NL80211_MNTR_FLAG_INVALID, MONITOR_FLAG_FCSFAIL = 1<= 0). 
+ */ +struct cfg80211_connect_resp_params { + int status; + const u8 *bssid; + struct cfg80211_bss *bss; + const u8 *req_ie; + size_t req_ie_len; + const u8 *resp_ie; + size_t resp_ie_len; + const u8 *fils_kek; + size_t fils_kek_len; + bool update_erp_next_seq_num; + u16 fils_erp_next_seq_num; + const u8 *pmk; + size_t pmk_len; + const u8 *pmkid; + enum nl80211_timeout_reason timeout_reason; +}; + +/** + * cfg80211_connect_done - notify cfg80211 of connection result + * + * @dev: network device + * @params: connection response parameters + * @gfp: allocation flags + * + * It should be called by the underlying driver once execution of the connection + * request from connect() has been completed. This is similar to + * cfg80211_connect_bss(), but takes a structure pointer for connection response + * parameters. Only one of the functions among cfg80211_connect_bss(), + * cfg80211_connect_result(), cfg80211_connect_timeout(), + * and cfg80211_connect_done() should be called. + */ +void cfg80211_connect_done(struct net_device *dev, + struct cfg80211_connect_resp_params *params, + gfp_t gfp); + /** * cfg80211_connect_bss - notify cfg80211 of connection result * @@ -5152,13 +5309,31 @@ static inline void cfg80211_testmode_event(struct sk_buff *skb, gfp_t gfp) * It should be called by the underlying driver once execution of the connection * request from connect() has been completed. This is similar to * cfg80211_connect_result(), but with the option of identifying the exact bss - * entry for the connection. Only one of these functions should be called. + * entry for the connection. Only one of the functions among + * cfg80211_connect_bss(), cfg80211_connect_result(), + * cfg80211_connect_timeout(), and cfg80211_connect_done() should be called. */ -void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, - struct cfg80211_bss *bss, const u8 *req_ie, - size_t req_ie_len, const u8 *resp_ie, - size_t resp_ie_len, int status, gfp_t gfp, - enum nl80211_timeout_reason timeout_reason); +static inline void +cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, + struct cfg80211_bss *bss, const u8 *req_ie, + size_t req_ie_len, const u8 *resp_ie, + size_t resp_ie_len, int status, gfp_t gfp, + enum nl80211_timeout_reason timeout_reason) +{ + struct cfg80211_connect_resp_params params; + + memset(&params, 0, sizeof(params)); + params.status = status; + params.bssid = bssid; + params.bss = bss; + params.req_ie = req_ie; + params.req_ie_len = req_ie_len; + params.resp_ie = resp_ie; + params.resp_ie_len = resp_ie_len; + params.timeout_reason = timeout_reason; + + cfg80211_connect_done(dev, &params, gfp); +} /** * cfg80211_connect_result - notify cfg80211 of connection result @@ -5177,7 +5352,8 @@ void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, * It should be called by the underlying driver once execution of the connection * request from connect() has been completed. This is similar to * cfg80211_connect_bss() which allows the exact bss entry to be specified. Only - * one of these functions should be called. + * one of the functions among cfg80211_connect_bss(), cfg80211_connect_result(), + * cfg80211_connect_timeout(), and cfg80211_connect_done() should be called. */ static inline void cfg80211_connect_result(struct net_device *dev, const u8 *bssid, @@ -5204,7 +5380,9 @@ cfg80211_connect_result(struct net_device *dev, const u8 *bssid, * in a sequence where no explicit authentication/association rejection was * received from the AP.
This could happen, e.g., due to not being able to send * out the Authentication or Association Request frame or timing out while - * waiting for the response. + * waiting for the response. Only one of the functions among + * cfg80211_connect_bss(), cfg80211_connect_result(), + * cfg80211_connect_timeout(), and cfg80211_connect_done() should be called. */ static inline void cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid, @@ -5216,51 +5394,46 @@ cfg80211_connect_timeout(struct net_device *dev, const u8 *bssid, } /** - * cfg80211_roamed - notify cfg80211 of roaming + * struct cfg80211_roam_info - driver initiated roaming information * - * @dev: network device * @channel: the channel of the new AP - * @bssid: the BSSID of the new AP + * @bss: entry of bss to which STA got roamed (may be %NULL if %bssid is set) + * @bssid: the BSSID of the new AP (may be %NULL if %bss is set) * @req_ie: association request IEs (may be %NULL) * @req_ie_len: association request IEs length * @resp_ie: association response IEs (may be %NULL) * @resp_ie_len: assoc response IEs length - * @gfp: allocation flags - * - * It should be called by the underlying driver whenever it roamed - * from one AP to another while connected. */ -void cfg80211_roamed(struct net_device *dev, - struct ieee80211_channel *channel, - const u8 *bssid, - const u8 *req_ie, size_t req_ie_len, - const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp); +struct cfg80211_roam_info { + struct ieee80211_channel *channel; + struct cfg80211_bss *bss; + const u8 *bssid; + const u8 *req_ie; + size_t req_ie_len; + const u8 *resp_ie; + size_t resp_ie_len; +}; /** - * cfg80211_roamed_bss - notify cfg80211 of roaming + * cfg80211_roamed - notify cfg80211 of roaming * * @dev: network device - * @bss: entry of bss to which STA got roamed - * @req_ie: association request IEs (maybe be %NULL) - * @req_ie_len: association request IEs length - * @resp_ie: association response IEs (may be %NULL) - * @resp_ie_len: assoc response IEs length + * @info: information about the new BSS. &struct cfg80211_roam_info. * @gfp: allocation flags * - * This is just a wrapper to notify cfg80211 of roaming event with driver - * passing bss to avoid a race in timeout of the bss entry. It should be - * called by the underlying driver whenever it roamed from one AP to another - * while connected. Drivers which have roaming implemented in firmware - * may use this function to avoid a race in bss entry timeout where the bss - * entry of the new AP is seen in the driver, but gets timed out by the time - * it is accessed in __cfg80211_roamed() due to delay in scheduling + * This function may be called with the driver passing either the BSSID of the + * new AP or passing the bss entry to avoid a race in timeout of the bss entry. + * It should be called by the underlying driver whenever it roamed from one AP + * to another while connected. Drivers which have roaming implemented in + * firmware should pass the bss entry to avoid a race in bss entry timeout where + * the bss entry of the new AP is seen in the driver, but gets timed out by the + * time it is accessed in __cfg80211_roamed() due to delay in scheduling * rdev->event_work. In case of any failures, the reference is released - * either in cfg80211_roamed_bss() or in __cfg80211_romed(), Otherwise, - * it will be released while diconneting from the current bss. + * either in cfg80211_roamed() or in __cfg80211_roamed(); otherwise, it will be + * released while disconnecting from the current bss.
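As a usage sketch of the consolidated API, a fullmac driver that used to call cfg80211_roamed_bss() now fills a struct cfg80211_roam_info and makes one call; the bss and IE variables here are placeholders for the driver's own state:

	struct cfg80211_roam_info info = {
		.bss = bss,	/* preferred over .bssid: avoids the bss
				 * entry timeout race described above */
		.req_ie = req_ie,
		.req_ie_len = req_ie_len,
		.resp_ie = resp_ie,
		.resp_ie_len = resp_ie_len,
	};

	cfg80211_roamed(netdev, &info, GFP_KERNEL);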
*/ -void cfg80211_roamed_bss(struct net_device *dev, struct cfg80211_bss *bss, - const u8 *req_ie, size_t req_ie_len, - const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp); +void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, + gfp_t gfp); /** * cfg80211_disconnected - notify cfg80211 that connection was dropped diff --git a/include/net/devlink.h b/include/net/devlink.h index d29e5fc8258216b9d79604bc99b69b66cce3e443..ed7687bbf5d08f1bb5327f3f8c9e3e543e98e7c2 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -25,6 +25,8 @@ struct devlink { struct list_head list; struct list_head port_list; struct list_head sb_list; + struct list_head dpipe_table_list; + struct devlink_dpipe_headers *dpipe_headers; const struct devlink_ops *ops; struct device *dev; possible_net_t _net; @@ -49,6 +51,178 @@ struct devlink_sb_pool_info { enum devlink_sb_threshold_type threshold_type; }; +/** + * struct devlink_dpipe_field - dpipe field object + * @name: field name + * @id: index inside the headers field array + * @bitwidth: bitwidth + * @mapping_type: mapping type + */ +struct devlink_dpipe_field { + const char *name; + unsigned int id; + unsigned int bitwidth; + enum devlink_dpipe_field_mapping_type mapping_type; +}; + +/** + * struct devlink_dpipe_header - dpipe header object + * @name: header name + * @id: index, global/local determined by global bit + * @fields: fields + * @fields_count: number of fields + * @global: indicates if header is shared like most protocol header + * or driver specific + */ +struct devlink_dpipe_header { + const char *name; + unsigned int id; + struct devlink_dpipe_field *fields; + unsigned int fields_count; + bool global; +}; + +/** + * struct devlink_dpipe_match - represents match operation + * @type: type of match + * @header_index: header index (packets can have several headers of same + * type like in case of tunnels) + * @header: header + * @field_id: field index + */ +struct devlink_dpipe_match { + enum devlink_dpipe_match_type type; + unsigned int header_index; + struct devlink_dpipe_header *header; + unsigned int field_id; +}; + +/** + * struct devlink_dpipe_action - represents action operation + * @type: type of action + * @header_index: header index (packets can have several headers of same + * type like in case of tunnels) + * @header: header + * @field_id: field index + */ +struct devlink_dpipe_action { + enum devlink_dpipe_action_type type; + unsigned int header_index; + struct devlink_dpipe_header *header; + unsigned int field_id; +}; + +/** + * struct devlink_dpipe_value - represents value of match/action + * @action: action + * @match: match + * @mapping_value: in case the field has some mapping this value + * specifies the mapping value + * @mapping_valid: specify if mapping value is valid + * @value_size: value size + * @value: value + * @mask: bit mask + */ +struct devlink_dpipe_value { + union { + struct devlink_dpipe_action *action; + struct devlink_dpipe_match *match; + }; + unsigned int mapping_value; + bool mapping_valid; + unsigned int value_size; + void *value; + void *mask; +}; + +/** + * struct devlink_dpipe_entry - table entry object + * @index: index of the entry in the table + * @match_values: match values + * @match_values_count: count of match tuples + * @action_values: actions values + * @action_values_count: count of actions values + * @counter: value of counter + * @counter_valid: Specifies if value is valid from hardware + */ +struct devlink_dpipe_entry { + u64 index; + struct devlink_dpipe_value
*match_values; + unsigned int match_values_count; + struct devlink_dpipe_value *action_values; + unsigned int action_values_count; + u64 counter; + bool counter_valid; +}; + +/** + * struct devlink_dpipe_dump_ctx - context provided to driver in order + * to dump + * @info: info + * @cmd: devlink command + * @skb: skb + * @nest: top attribute + * @hdr: hdr + */ +struct devlink_dpipe_dump_ctx { + struct genl_info *info; + enum devlink_command cmd; + struct sk_buff *skb; + struct nlattr *nest; + void *hdr; +}; + +struct devlink_dpipe_table_ops; + +/** + * struct devlink_dpipe_table - table object + * @priv: private + * @name: table name + * @size: maximum number of entries + * @counters_enabled: indicates if counters are active + * @counter_control_extern: indicates if counter control is in dpipe or + * external tool + * @table_ops: table operations + * @rcu: rcu + */ +struct devlink_dpipe_table { + void *priv; + struct list_head list; + const char *name; + u64 size; + bool counters_enabled; + bool counter_control_extern; + struct devlink_dpipe_table_ops *table_ops; + struct rcu_head rcu; +}; + +/** + * struct devlink_dpipe_table_ops - dpipe_table ops + * @actions_dump - dumps all tables actions + * @matches_dump - dumps all tables matches + * @entries_dump - dumps all active entries in the table + * @counters_set_update - when changing the counter status hardware sync + * maybe needed to allocate/free counter related + * resources + */ +struct devlink_dpipe_table_ops { + int (*actions_dump)(void *priv, struct sk_buff *skb); + int (*matches_dump)(void *priv, struct sk_buff *skb); + int (*entries_dump)(void *priv, bool counters_enabled, + struct devlink_dpipe_dump_ctx *dump_ctx); + int (*counters_set_update)(void *priv, bool enable); +}; + +/** + * struct devlink_dpipe_headers - dpipe headers + * @headers - header array can be shared (global bit) or driver specific + * @headers_count - count of headers + */ +struct devlink_dpipe_headers { + struct devlink_dpipe_header **headers; + unsigned int headers_count; +}; + struct devlink_ops { int (*port_type_set)(struct devlink_port *devlink_port, enum devlink_port_type port_type); @@ -94,6 +268,8 @@ struct devlink_ops { int (*eswitch_mode_set)(struct devlink *devlink, u16 mode); int (*eswitch_inline_mode_get)(struct devlink *devlink, u8 *p_inline_mode); int (*eswitch_inline_mode_set)(struct devlink *devlink, u8 inline_mode); + int (*eswitch_encap_mode_get)(struct devlink *devlink, u8 *p_encap_mode); + int (*eswitch_encap_mode_set)(struct devlink *devlink, u8 encap_mode); }; static inline void *devlink_priv(struct devlink *devlink) @@ -132,6 +308,26 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, u16 egress_pools_count, u16 ingress_tc_count, u16 egress_tc_count); void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index); +int devlink_dpipe_table_register(struct devlink *devlink, + const char *table_name, + struct devlink_dpipe_table_ops *table_ops, + void *priv, u64 size, + bool counter_control_extern); +void devlink_dpipe_table_unregister(struct devlink *devlink, + const char *table_name); +int devlink_dpipe_headers_register(struct devlink *devlink, + struct devlink_dpipe_headers *dpipe_headers); +void devlink_dpipe_headers_unregister(struct devlink *devlink); +bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, + const char *table_name); +int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx); +int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx, + 
struct devlink_dpipe_entry *entry); +int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx); +int devlink_dpipe_action_put(struct sk_buff *skb, + struct devlink_dpipe_action *action); +int devlink_dpipe_match_put(struct sk_buff *skb, + struct devlink_dpipe_match *match); #else @@ -200,6 +396,71 @@ static inline void devlink_sb_unregister(struct devlink *devlink, { } +static inline int +devlink_dpipe_table_register(struct devlink *devlink, + const char *table_name, + struct devlink_dpipe_table_ops *table_ops, + void *priv, u64 size, + bool counter_control_extern) +{ + return 0; +} + +static inline void devlink_dpipe_table_unregister(struct devlink *devlink, + const char *table_name) +{ +} + +static inline int devlink_dpipe_headers_register(struct devlink *devlink, + struct devlink_dpipe_headers * + dpipe_headers) +{ + return 0; +} + +static inline void devlink_dpipe_headers_unregister(struct devlink *devlink) +{ +} + +static inline bool devlink_dpipe_table_counter_enabled(struct devlink *devlink, + const char *table_name) +{ + return false; +} + +static inline int +devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) +{ + return 0; +} + +static inline int +devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx, + struct devlink_dpipe_entry *entry) +{ + return 0; +} + +static inline int +devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx) +{ + return 0; +} + +static inline int +devlink_dpipe_action_put(struct sk_buff *skb, + struct devlink_dpipe_action *action) +{ + return 0; +} + +static inline int +devlink_dpipe_match_put(struct sk_buff *skb, + struct devlink_dpipe_match *match) +{ + return 0; +} + #endif #endif /* _NET_DEVLINK_H_ */ diff --git a/include/net/dsa.h b/include/net/dsa.h index 4e13e695f0251d5c762c3089065f4eeb429033eb..8e24677b1c62a5842f81496d7d18c071d2cf0af6 100644 --- a/include/net/dsa.h +++ b/include/net/dsa.h @@ -19,6 +19,7 @@ #include #include #include +#include struct tc_action; struct phy_device; @@ -31,6 +32,8 @@ enum dsa_tag_protocol { DSA_TAG_PROTO_EDSA, DSA_TAG_PROTO_BRCM, DSA_TAG_PROTO_QCA, + DSA_TAG_PROTO_MTK, + DSA_TAG_PROTO_LAN9303, DSA_TAG_LAST, /* MUST BE LAST */ }; @@ -122,7 +125,7 @@ struct dsa_switch_tree { * protocol to use. */ struct net_device *master_netdev; - int (*rcv)(struct sk_buff *skb, + struct sk_buff * (*rcv)(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); @@ -182,6 +185,7 @@ struct dsa_port { unsigned int ageing_time; u8 stp_state; struct net_device *bridge_dev; + struct devlink_port devlink_port; }; struct dsa_switch { @@ -233,6 +237,13 @@ struct dsa_switch { u32 phys_mii_mask; struct mii_bus *slave_mii_bus; + /* Ageing Time limits in msecs */ + unsigned int ageing_time_min; + unsigned int ageing_time_max; + + /* devlink used to represent this switch device */ + struct devlink *devlink; + /* Dynamically allocated ports, keep last */ size_t num_ports; struct dsa_port ports[]; @@ -248,6 +259,11 @@ static inline bool dsa_is_dsa_port(struct dsa_switch *ds, int p) return !!((ds->dsa_port_mask) & (1 << p)); } +static inline bool dsa_is_normal_port(struct dsa_switch *ds, int p) +{ + return !dsa_is_cpu_port(ds, p) && !dsa_is_dsa_port(ds, p); +} + static inline bool dsa_is_port_initialized(struct dsa_switch *ds, int p) { return ds->enabled_port_mask & (1 << p) && ds->ports[p].netdev; @@ -287,7 +303,7 @@ struct dsa_notifier_bridge_info { struct dsa_switch_ops { /* - * Probing and setup. + * Legacy probing. 
*/ const char *(*probe)(struct device *dsa_dev, struct device *host_dev, int sw_addr, @@ -442,6 +458,14 @@ struct dsa_switch_ops { bool ingress); void (*port_mirror_del)(struct dsa_switch *ds, int port, struct dsa_mall_mirror_tc_entry *mirror); + + /* + * Cross-chip operations + */ + int (*crosschip_bridge_join)(struct dsa_switch *ds, int sw_index, + int port, struct net_device *br); + void (*crosschip_bridge_leave)(struct dsa_switch *ds, int sw_index, + int port, struct net_device *br); }; struct dsa_switch_driver { @@ -449,9 +473,11 @@ struct dsa_switch_driver { const struct dsa_switch_ops *ops; }; +/* Legacy driver registration */ void register_switch_driver(struct dsa_switch_driver *type); void unregister_switch_driver(struct dsa_switch_driver *type); struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev); + struct net_device *dsa_dev_to_net_device(struct device *dev); static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) @@ -459,6 +485,15 @@ static inline bool dsa_uses_tagged_protocol(struct dsa_switch_tree *dst) return dst->rcv != NULL; } +static inline bool netdev_uses_dsa(struct net_device *dev) +{ +#if IS_ENABLED(CONFIG_NET_DSA) + if (dev->dsa_ptr != NULL) + return dsa_uses_tagged_protocol(dev->dsa_ptr); +#endif + return false; +} + struct dsa_switch *dsa_switch_alloc(struct device *dev, size_t n); void dsa_unregister_switch(struct dsa_switch *ds); int dsa_register_switch(struct dsa_switch *ds, struct device *dev); diff --git a/include/net/esp.h b/include/net/esp.h index a43be85aedc44594265695e98452ae491b852d83..c41994d1bfef0bce761227828638d466a01d6ebc 100644 --- a/include/net/esp.h +++ b/include/net/esp.h @@ -10,4 +10,23 @@ static inline struct ip_esp_hdr *ip_esp_hdr(const struct sk_buff *skb) return (struct ip_esp_hdr *)skb_transport_header(skb); } +struct esp_info { + struct ip_esp_hdr *esph; + __be64 seqno; + int tfclen; + int tailen; + int plen; + int clen; + int len; + int nfrags; + __u8 proto; + bool inplace; +}; + +int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); +int esp_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); +int esp_input_done2(struct sk_buff *skb, int err); +int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); +int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp); +int esp6_input_done2(struct sk_buff *skb, int err); #endif diff --git a/include/net/fib_rules.h b/include/net/fib_rules.h index 8dbfdf728cd8ce901b3b05f0e58b4eeee25051fe..76c7300626d675bd70374efe45e5608c65f44f84 100644 --- a/include/net/fib_rules.h +++ b/include/net/fib_rules.h @@ -141,7 +141,10 @@ int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, u32 flags); +bool fib_rule_matchall(const struct fib_rule *rule); -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh); -int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh); +int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); +int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack); #endif diff --git a/include/net/flow.h b/include/net/flow.h index 6984f1913dc124ab97627a4a5ecb2aba9f93e901..bae198b3039e6e66829c2717c1baf2baebbec6a2 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -202,7 +202,7 @@ static inline struct flowi *flowidn_to_flowi(struct flowidn *fldn) typedef 
unsigned long flow_compare_t; -static inline size_t flow_key_size(u16 family) +static inline unsigned int flow_key_size(u16 family) { switch (family) { case AF_INET: diff --git a/include/net/flow_dissector.h b/include/net/flow_dissector.h index ac9703018a3a63bd392aa37f31b1afdf54d69aa0..8d21d448daa9d19ad58a753e373035f2a1be59bb 100644 --- a/include/net/flow_dissector.h +++ b/include/net/flow_dissector.h @@ -41,6 +41,13 @@ struct flow_dissector_key_vlan { u16 padding; }; +struct flow_dissector_key_mpls { + u32 mpls_ttl:8, + mpls_bos:1, + mpls_tc:3, + mpls_label:20; +}; + struct flow_dissector_key_keyid { __be32 keyid; }; @@ -169,6 +176,7 @@ enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */ + FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */ FLOW_DISSECTOR_KEY_MAX, }; diff --git a/include/net/flowcache.h b/include/net/flowcache.h index 9caf3bfc8d2dafcc89064ec54e78682244d71fb7..51eb971e897378e25329b06f80678f522eec2ea6 100644 --- a/include/net/flowcache.h +++ b/include/net/flowcache.h @@ -8,7 +8,7 @@ struct flow_cache_percpu { struct hlist_head *hash_table; - int hash_count; + unsigned int hash_count; u32 hash_rnd; int hash_rnd_recalc; struct tasklet_struct flush_tasklet; @@ -18,8 +18,8 @@ struct flow_cache { u32 hash_shift; struct flow_cache_percpu __percpu *percpu; struct hlist_node node; - int low_watermark; - int high_watermark; + unsigned int low_watermark; + unsigned int high_watermark; struct timer_list rnd_timer; }; #endif /* _NET_FLOWCACHE_H */ diff --git a/include/net/genetlink.h b/include/net/genetlink.h index a34275be360001e89740fe4325a53905683e9778..68b88192b00c6200aa0b8b2628554952d0d79a1c 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -84,6 +84,7 @@ struct nlattr **genl_family_attrbuf(const struct genl_family *family); * @attrs: netlink attributes * @_net: network namespace * @user_ptr: user pointers + * @extack: extended ACK report struct */ struct genl_info { u32 snd_seq; @@ -94,6 +95,7 @@ struct genl_info { struct nlattr ** attrs; possible_net_t _net; void * user_ptr[2]; + struct netlink_ext_ack *extack; }; static inline struct net *genl_info_net(struct genl_info *info) @@ -106,6 +108,16 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net) write_pnet(&info->_net, net); } +#define GENL_SET_ERR_MSG(info, msg) NL_SET_ERR_MSG((info)->extack, msg) + +static inline int genl_err_attr(struct genl_info *info, int err, + struct nlattr *attr) +{ + info->extack->bad_attr = attr; + + return err; +} + /** * struct genl_ops - generic netlink operations * @cmd: command identifier @@ -162,14 +174,16 @@ genlmsg_nlhdr(void *user_hdr, const struct genl_family *family) * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy - * */ + * @extack: extended ACK report struct + */ static inline int genlmsg_parse(const struct nlmsghdr *nlh, const struct genl_family *family, struct nlattr *tb[], int maxtype, - const struct nla_policy *policy) + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { return nlmsg_parse(nlh, family->hdrsize + GENL_HDRLEN, tb, maxtype, - policy); + policy, extack); } /** diff --git a/include/net/ip.h b/include/net/ip.h index bf264a8db1ce3b1b29b9a5cc9732df323154e7e5..821cedcc8e73b68af72d1918242c25d757c63d24 100644 --- a/include/net/ip.h 
+++ b/include/net/ip.h @@ -33,6 +33,8 @@ #include #include +#define IPV4_MAX_PMTU 65535U /* RFC 2675, Section 5.1 */ + struct sock; struct inet_skb_parm { diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9dc2c182a263218ad63ceb326893b4c86c21ff95..f5e625f5336799d88c15df94d8776680dddcfe19 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -84,6 +84,7 @@ struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int ifindex, struct flowi6 *fl6, int flags); +void ip6_route_init_special_entries(void); int ip6_route_init(void); void ip6_route_cleanup(void); diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 1b1cf33cbfb02eaf4eb1eeb92be474076fdeebe4..08fbc7f7d8d70ffc43f93881b0fbeab5b3fe8abb 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -33,6 +33,8 @@ struct __ip6_tnl_parm { __be16 o_flags; __be32 i_key; __be32 o_key; + + __u32 fwmark; }; /* IPv6 tunnel */ diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 368bb4024b78c411d02a842f340f9fa11a9b5f7e..6692c5758b332d468f1e0611ecc4f3e03ae03b2b 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -213,6 +213,11 @@ struct fib_entry_notifier_info { u32 tb_id; }; +struct fib_rule_notifier_info { + struct fib_notifier_info info; /* must be first */ + struct fib_rule *rule; +}; + struct fib_nh_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib_nh *fib_nh; @@ -232,9 +237,21 @@ enum fib_event_type { int register_fib_notifier(struct notifier_block *nb, void (*cb)(struct notifier_block *nb)); int unregister_fib_notifier(struct notifier_block *nb); +int call_fib_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info); int call_fib_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); +void fib_notify(struct net *net, struct notifier_block *nb); +#ifdef CONFIG_IP_MULTIPLE_TABLES +void fib_rules_notify(struct net *net, struct notifier_block *nb); +#else +static inline void fib_rules_notify(struct net *net, struct notifier_block *nb) +{ +} +#endif + struct fib_table { struct hlist_node tb_hlist; u32 tb_id; @@ -299,6 +316,11 @@ static inline int fib_lookup(struct net *net, const struct flowi4 *flp, return err; } +static inline bool fib4_rule_default(const struct fib_rule *rule) +{ + return true; +} + #else /* CONFIG_IP_MULTIPLE_TABLES */ int __net_init fib4_rules_init(struct net *net); void __net_exit fib4_rules_exit(struct net *net); @@ -343,6 +365,8 @@ static inline int fib_lookup(struct net *net, struct flowi4 *flp, return err; } +bool fib4_rule_default(const struct fib_rule *rule); + #endif /* CONFIG_IP_MULTIPLE_TABLES */ /* Exported by fib_frontend.c */ @@ -371,17 +395,13 @@ int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); int fib_sync_down_addr(struct net_device *dev, __be32 local); int fib_sync_up(struct net_device *dev, unsigned int nh_flags); -extern u32 fib_multipath_secret __read_mostly; - -static inline int fib_multipath_hash(__be32 saddr, __be32 daddr) -{ - return jhash_2words((__force u32)saddr, (__force u32)daddr, - fib_multipath_secret) >> 1; -} - +#ifdef CONFIG_IP_ROUTE_MULTIPATH +int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, + const struct sk_buff *skb); +#endif void fib_select_multipath(struct fib_result *res, int hash); void fib_select_path(struct net *net, struct 
fib_result *res, - struct flowi4 *fl4, int mp_hash); + struct flowi4 *fl4, const struct sk_buff *skb); /* Exported by fib_trie.c */ void fib_trie_init(void); diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h index 95056796657cf9b11c24ae1497f1d5c01a9fd178..520809912f0392e84b12cc8ce99d82e3e27f5d78 100644 --- a/include/net/ip_tunnels.h +++ b/include/net/ip_tunnels.h @@ -132,6 +132,7 @@ struct ip_tunnel { unsigned int prl_count; /* # of entries in PRL */ unsigned int ip_tnl_net_id; struct gro_cells gro_cells; + __u32 fwmark; bool collect_md; bool ignore_df; }; @@ -273,9 +274,9 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb, const struct tnl_ptk_info *tpi, struct metadata_dst *tun_dst, bool log_ecn_error); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p); + struct ip_tunnel_parm *p, __u32 fwmark); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p); + struct ip_tunnel_parm *p, __u32 fwmark); void ip_tunnel_setup(struct net_device *dev, unsigned int net_id); struct ip_tunnel_encap_ops { diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 7bdfa7d783639d8b65c18bd7f5a6ea5fa4fbb7da..4f4f786255efe16725cbdea7bb29362ea6023185 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -12,6 +12,8 @@ #include /* for struct list_head */ #include /* for struct rwlock_t */ #include /* for struct atomic_t */ +#include /* for struct refcount_t */ + #include #include #include @@ -525,7 +527,7 @@ struct ip_vs_conn { struct netns_ipvs *ipvs; /* counter and timer */ - atomic_t refcnt; /* reference count */ + refcount_t refcnt; /* reference count */ struct timer_list timer; /* Expiration timer */ volatile unsigned long timeout; /* timeout */ @@ -667,7 +669,7 @@ struct ip_vs_dest { atomic_t conn_flags; /* flags to copy to conn */ atomic_t weight; /* server weight */ - atomic_t refcnt; /* reference counter */ + refcount_t refcnt; /* reference counter */ struct ip_vs_stats stats; /* statistics */ unsigned long idle_start; /* start time, jiffies */ @@ -1211,14 +1213,14 @@ struct ip_vs_conn * ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, */ static inline bool __ip_vs_conn_get(struct ip_vs_conn *cp) { - return atomic_inc_not_zero(&cp->refcnt); + return refcount_inc_not_zero(&cp->refcnt); } /* put back the conn without restarting its timer */ static inline void __ip_vs_conn_put(struct ip_vs_conn *cp) { smp_mb__before_atomic(); - atomic_dec(&cp->refcnt); + refcount_dec(&cp->refcnt); } void ip_vs_conn_put(struct ip_vs_conn *cp); void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport); @@ -1347,8 +1349,6 @@ int ip_vs_protocol_init(void); void ip_vs_protocol_cleanup(void); void ip_vs_protocol_timeout_change(struct netns_ipvs *ipvs, int flags); int *ip_vs_create_timeout_table(int *table, int size); -int ip_vs_set_state_timeout(int *table, int num, const char *const *names, - const char *name, int to); void ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg); @@ -1410,18 +1410,18 @@ void ip_vs_try_bind_dest(struct ip_vs_conn *cp); static inline void ip_vs_dest_hold(struct ip_vs_dest *dest) { - atomic_inc(&dest->refcnt); + refcount_inc(&dest->refcnt); } static inline void ip_vs_dest_put(struct ip_vs_dest *dest) { smp_mb__before_atomic(); - atomic_dec(&dest->refcnt); + refcount_dec(&dest->refcnt); } static inline void ip_vs_dest_put_and_free(struct ip_vs_dest *dest) { - if (atomic_dec_and_test(&dest->refcnt)) 
+ if (refcount_dec_and_test(&dest->refcnt)) kfree(dest); } @@ -1553,13 +1553,9 @@ static inline void ip_vs_notrack(struct sk_buff *skb) enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - if (!ct || !nf_ct_is_untracked(ct)) { - struct nf_conn *untracked; - + if (ct) { nf_conntrack_put(&ct->ct_general); - untracked = nf_ct_untracked_get(); - nf_conntrack_get(&untracked->ct_general); - nf_ct_set(skb, untracked, IP_CT_NEW); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); } #endif } @@ -1618,7 +1614,7 @@ static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp, if (!(cp->flags & IP_VS_CONN_F_NFCT)) return false; ct = nf_ct_get(skb, &ctinfo); - if (ct && !nf_ct_is_untracked(ct)) + if (ct) return true; #endif return false; diff --git a/include/net/mac80211.h b/include/net/mac80211.h index a3bab3c5ecfb302e5df2d9e5d5ec36e2c368ef9b..76ed24a201eb9075672e8d2f55fffe5ae147923d 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -5,7 +5,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007-2010 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2015 - 2016 Intel Deutschland GmbH + * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -299,6 +299,8 @@ struct ieee80211_vif_chanctx_switch { * context had been assigned. * @BSS_CHANGED_OCB: OCB join status changed * @BSS_CHANGED_MU_GROUPS: VHT MU-MIMO group id or user position changed + * @BSS_CHANGED_KEEP_ALIVE: keep alive options (idle period or protected + * keep alive) changed. */ enum ieee80211_bss_change { BSS_CHANGED_ASSOC = 1<<0, @@ -325,6 +327,7 @@ enum ieee80211_bss_change { BSS_CHANGED_BANDWIDTH = 1<<21, BSS_CHANGED_OCB = 1<<22, BSS_CHANGED_MU_GROUPS = 1<<23, + BSS_CHANGED_KEEP_ALIVE = 1<<24, /* when adding here, make sure to change ieee80211_reconfig */ }; @@ -501,6 +504,10 @@ struct ieee80211_mu_group_data { * implies disabled. As with the cfg80211 callback, a change here should * cause an event to be sent indicating where the current value is in * relation to the newly configured threshold. + * @cqm_rssi_low: Connection quality monitor RSSI lower threshold, a zero value + * implies disabled. This is an alternative mechanism to the single + * threshold event and can't be enabled simultaneously with it. + * @cqm_rssi_high: Connection quality monitor RSSI upper threshold. * @cqm_rssi_hyst: Connection quality monitor RSSI hysteresis * @arp_addr_list: List of IPv4 addresses for hardware ARP filtering. The * may filter ARP queries targeted for other addresses than listed here. @@ -529,6 +536,13 @@ struct ieee80211_mu_group_data { * @allow_p2p_go_ps: indication for AP or P2P GO interface, whether it's allowed * to use P2P PS mechanism or not. AP/P2P GO is not allowed to use P2P PS * if it has associated clients without P2P PS support. + * @max_idle_period: the time period during which the station can refrain from + * transmitting frames to its associated AP without being disassociated. + * In units of 1000 TUs. Zero value indicates that the AP did not include + * a (valid) BSS Max Idle Period Element. + * @protected_keep_alive: if set, indicates that the station should send an RSN + * protected frame to the AP to reset the idle timer at the AP for the + * station. 
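A sketch of how a driver's bss_info_changed() handler might consume the new keep-alive fields above; my_fw_set_keep_alive() is a hypothetical firmware hook:

	static void my_bss_info_changed(struct ieee80211_hw *hw,
					struct ieee80211_vif *vif,
					struct ieee80211_bss_conf *info, u32 changed)
	{
		if ((changed & BSS_CHANGED_KEEP_ALIVE) && info->max_idle_period) {
			/* max_idle_period is in units of 1000 TUs; tell the
			 * firmware whether the keep-alive frame must be
			 * RSN-protected. */
			my_fw_set_keep_alive(hw, info->max_idle_period,
					     info->protected_keep_alive);
		}
	}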
*/ struct ieee80211_bss_conf { const u8 *bssid; @@ -553,6 +567,8 @@ struct ieee80211_bss_conf { u16 ht_operation_mode; s32 cqm_rssi_thold; u32 cqm_rssi_hyst; + s32 cqm_rssi_low; + s32 cqm_rssi_high; struct cfg80211_chan_def chandef; struct ieee80211_mu_group_data mu_group; __be32 arp_addr_list[IEEE80211_BSS_ARP_ADDR_LIST_LEN]; @@ -567,6 +583,8 @@ struct ieee80211_bss_conf { enum nl80211_tx_power_setting txpower_type; struct ieee80211_p2p_noa_attr p2p_noa_attr; bool allow_p2p_go_ps; + u16 max_idle_period; + bool protected_keep_alive; }; /** @@ -942,6 +960,19 @@ struct ieee80211_tx_info { }; }; +/** + * struct ieee80211_tx_status - extended tx status info for rate control + * + * @sta: Station that the packet was transmitted for + * @info: Basic tx status information + * @skb: Packet skb (can be NULL if not provided by the driver) + */ +struct ieee80211_tx_status { + struct ieee80211_sta *sta; + struct ieee80211_tx_info *info; + struct sk_buff *skb; +}; + /** * struct ieee80211_scan_ies - descriptors for different blocks of IEs * @@ -1039,16 +1070,8 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * (including FCS) was received. * @RX_FLAG_MACTIME_PLCP_START: The timestamp passed in the RX status (@mactime * field) is valid and contains the time the SYNC preamble was received. - * @RX_FLAG_SHORTPRE: Short preamble was used for this frame - * @RX_FLAG_HT: HT MCS was used and rate_idx is MCS index - * @RX_FLAG_VHT: VHT MCS was used and rate_index is MCS index - * @RX_FLAG_40MHZ: HT40 (40 MHz) was used - * @RX_FLAG_SHORT_GI: Short guard interval was used * @RX_FLAG_NO_SIGNAL_VAL: The signal strength value is not present. * Valid only for data frames (mainly A-MPDU) - * @RX_FLAG_HT_GF: This frame was received in a HT-greenfield transmission, if - * the driver fills this value it should add %IEEE80211_RADIOTAP_MCS_HAVE_FMT - * to hw.radiotap_mcs_details to advertise that fact * @RX_FLAG_AMPDU_DETAILS: A-MPDU details are known, in particular the reference * number (@ampdu_reference) must be populated and be a distinct number for * each A-MPDU @@ -1061,7 +1084,6 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * is stored in the @ampdu_delimiter_crc field) * @RX_FLAG_MIC_STRIPPED: The mic was stripped of this packet. Decryption was * done by the hardware - * @RX_FLAG_LDPC: LDPC was used * @RX_FLAG_ONLY_MONITOR: Report frame only to monitor interfaces without * processing it in any regular way. * This is useful if drivers offload some frames but still want to report @@ -1070,9 +1092,6 @@ ieee80211_tx_info_clear_status(struct ieee80211_tx_info *info) * monitor interfaces. * This is useful if drivers offload some frames but still want to report * them for sniffing purposes. - * @RX_FLAG_STBC_MASK: STBC 2 bit bitmask. 1 - Nss=1, 2 - Nss=2, 3 - Nss=3 - * @RX_FLAG_10MHZ: 10 MHz (half channel) was used - * @RX_FLAG_5MHZ: 5 MHz (quarter channel) was used * @RX_FLAG_AMSDU_MORE: Some drivers may prefer to report separate A-MSDU * subframes instead of a one huge frame for performance reasons. * All, but the last MSDU from an A-MSDU should have this flag set. E.g.
@@ -1100,50 +1119,52 @@ enum mac80211_rx_flags { RX_FLAG_FAILED_FCS_CRC = BIT(5), RX_FLAG_FAILED_PLCP_CRC = BIT(6), RX_FLAG_MACTIME_START = BIT(7), - RX_FLAG_SHORTPRE = BIT(8), - RX_FLAG_HT = BIT(9), - RX_FLAG_40MHZ = BIT(10), - RX_FLAG_SHORT_GI = BIT(11), - RX_FLAG_NO_SIGNAL_VAL = BIT(12), - RX_FLAG_HT_GF = BIT(13), - RX_FLAG_AMPDU_DETAILS = BIT(14), - RX_FLAG_PN_VALIDATED = BIT(15), - RX_FLAG_DUP_VALIDATED = BIT(16), - RX_FLAG_AMPDU_LAST_KNOWN = BIT(17), - RX_FLAG_AMPDU_IS_LAST = BIT(18), - RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(19), - RX_FLAG_AMPDU_DELIM_CRC_KNOWN = BIT(20), - RX_FLAG_MACTIME_END = BIT(21), - RX_FLAG_VHT = BIT(22), - RX_FLAG_LDPC = BIT(23), - RX_FLAG_ONLY_MONITOR = BIT(24), - RX_FLAG_SKIP_MONITOR = BIT(25), - RX_FLAG_STBC_MASK = BIT(26) | BIT(27), - RX_FLAG_10MHZ = BIT(28), - RX_FLAG_5MHZ = BIT(29), - RX_FLAG_AMSDU_MORE = BIT(30), - RX_FLAG_RADIOTAP_VENDOR_DATA = BIT(31), - RX_FLAG_MIC_STRIPPED = BIT_ULL(32), - RX_FLAG_ALLOW_SAME_PN = BIT_ULL(33), - RX_FLAG_ICV_STRIPPED = BIT_ULL(34), + RX_FLAG_NO_SIGNAL_VAL = BIT(8), + RX_FLAG_AMPDU_DETAILS = BIT(9), + RX_FLAG_PN_VALIDATED = BIT(10), + RX_FLAG_DUP_VALIDATED = BIT(11), + RX_FLAG_AMPDU_LAST_KNOWN = BIT(12), + RX_FLAG_AMPDU_IS_LAST = BIT(13), + RX_FLAG_AMPDU_DELIM_CRC_ERROR = BIT(14), + RX_FLAG_AMPDU_DELIM_CRC_KNOWN = BIT(15), + RX_FLAG_MACTIME_END = BIT(16), + RX_FLAG_ONLY_MONITOR = BIT(17), + RX_FLAG_SKIP_MONITOR = BIT(18), + RX_FLAG_AMSDU_MORE = BIT(19), + RX_FLAG_RADIOTAP_VENDOR_DATA = BIT(20), + RX_FLAG_MIC_STRIPPED = BIT(21), + RX_FLAG_ALLOW_SAME_PN = BIT(22), + RX_FLAG_ICV_STRIPPED = BIT(23), }; -#define RX_FLAG_STBC_SHIFT 26 - /** - * enum mac80211_rx_vht_flags - receive VHT flags + * enum mac80211_rx_encoding_flags - MCS & bandwidth flags * - * These flags are used with the @vht_flag member of - * &struct ieee80211_rx_status. - * @RX_VHT_FLAG_80MHZ: 80 MHz was used - * @RX_VHT_FLAG_160MHZ: 160 MHz was used - * @RX_VHT_FLAG_BF: packet was beamformed - */ + * @RX_ENC_FLAG_SHORTPRE: Short preamble was used for this frame + * @RX_ENC_FLAG_SHORT_GI: Short guard interval was used + * @RX_ENC_FLAG_HT_GF: This frame was received in a HT-greenfield transmission, + * if the driver fills this value it should add + * %IEEE80211_RADIOTAP_MCS_HAVE_FMT + * to hw.radiotap_mcs_details to advertise that fact + * @RX_ENC_FLAG_LDPC: LDPC was used + * @RX_ENC_FLAG_STBC_MASK: STBC 2 bit bitmask. 
1 - Nss=1, 2 - Nss=2, 3 - Nss=3 + * @RX_ENC_FLAG_BF: packet was beamformed + */ +enum mac80211_rx_encoding_flags { + RX_ENC_FLAG_SHORTPRE = BIT(0), + RX_ENC_FLAG_SHORT_GI = BIT(2), + RX_ENC_FLAG_HT_GF = BIT(3), + RX_ENC_FLAG_STBC_MASK = BIT(4) | BIT(5), + RX_ENC_FLAG_LDPC = BIT(6), + RX_ENC_FLAG_BF = BIT(7), +}; -enum mac80211_rx_vht_flags { - RX_VHT_FLAG_80MHZ = BIT(0), - RX_VHT_FLAG_160MHZ = BIT(1), - RX_VHT_FLAG_BF = BIT(2), +#define RX_ENC_FLAG_STBC_SHIFT 4 + +enum mac80211_rx_encoding { + RX_ENC_LEGACY = 0, + RX_ENC_HT, + RX_ENC_VHT, }; /** @@ -1173,9 +1194,11 @@ enum mac80211_rx_vht_flags { * @antenna: antenna used * @rate_idx: index of data rate into band's supported rates or MCS index if * HT or VHT is used (%RX_FLAG_HT/%RX_FLAG_VHT) - * @vht_nss: number of streams (VHT only) + * @nss: number of streams (VHT and HE only) * @flag: %RX_FLAG_\* - * @vht_flag: %RX_VHT_FLAG_\* + * @encoding: &enum mac80211_rx_encoding + * @bw: &enum rate_info_bw + * @enc_flags: uses bits from &enum mac80211_rx_encoding_flags * @rx_flags: internal RX flags for mac80211 * @ampdu_reference: A-MPDU reference number, must be a different value for * each A-MPDU but the same for each subframe within one A-MPDU @@ -1186,11 +1209,12 @@ struct ieee80211_rx_status { u64 boottime_ns; u32 device_timestamp; u32 ampdu_reference; - u64 flag; + u32 flag; u16 freq; - u8 vht_flag; + u8 enc_flags; + u8 encoding:2, bw:3; u8 rate_idx; - u8 vht_nss; + u8 nss; u8 rx_flags; u8 band; u8 antenna; @@ -4199,6 +4223,23 @@ void ieee80211_get_tx_rates(struct ieee80211_vif *vif, void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb); +/** + * ieee80211_tx_status_ext - extended transmit status callback + * + * This function can be used as a replacement for ieee80211_tx_status + * in drivers that may want to provide extra information that does not + * fit into &struct ieee80211_tx_info. + * + * Calls to this function for a single hardware must be synchronized + * against each other. Calls to this function, ieee80211_tx_status_ni() + * and ieee80211_tx_status_irqsafe() may not be mixed for a single hardware. 
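On the receive side, the removed RX_FLAG_* rate bits map onto the dedicated encoding fields introduced above. A rough sketch of a driver filling an HT rx status under the new scheme, assuming the usual rx-path context (mcs_index is a hypothetical local):

static void example_fill_rx_status(struct sk_buff *skb, u8 mcs_index)
{
	struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb);

	rx_status->encoding   = RX_ENC_HT;		/* was RX_FLAG_HT */
	rx_status->bw	      = RATE_INFO_BW_40;	/* was RX_FLAG_40MHZ */
	rx_status->enc_flags |= RX_ENC_FLAG_SHORT_GI;	/* was RX_FLAG_SHORT_GI */
	rx_status->rate_idx   = mcs_index;
}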
+ * + * @hw: the hardware the frame was transmitted by + * @status: tx status information + */ +void ieee80211_tx_status_ext(struct ieee80211_hw *hw, + struct ieee80211_tx_status *status); + /** * ieee80211_tx_status_noskb - transmit status callback without skb * @@ -4215,9 +4256,17 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, * (NULL for multicast packets) * @info: tx status information */ -void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, - struct ieee80211_sta *sta, - struct ieee80211_tx_info *info); +static inline void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, + struct ieee80211_sta *sta, + struct ieee80211_tx_info *info) +{ + struct ieee80211_tx_status status = { + .sta = sta, + .info = info, + }; + + ieee80211_tx_status_ext(hw, &status); +} /** * ieee80211_tx_status_ni - transmit status callback (in process context) @@ -5438,9 +5487,6 @@ void ieee80211_stop_rx_ba_session_offl(struct ieee80211_vif *vif, * RTS threshold * @short_preamble: whether mac80211 will request short-preamble transmission * if the selected rate supports it - * @max_rate_idx: user-requested maximum (legacy) rate - * (deprecated; this will be removed once drivers get updated to use - * rate_idx_mask) * @rate_idx_mask: user-requested (legacy) rate mask * @rate_idx_mcs_mask: user-requested MCS rate mask (NULL if not in use) * @bss: whether this frame is sent out in AP or IBSS mode @@ -5452,7 +5498,6 @@ struct ieee80211_tx_rate_control { struct sk_buff *skb; struct ieee80211_tx_rate reported_rate; bool rts, short_preamble; - u8 max_rate_idx; u32 rate_idx_mask; u8 *rate_idx_mcs_mask; bool bss; @@ -5474,10 +5519,9 @@ struct rate_control_ops { void (*free_sta)(void *priv, struct ieee80211_sta *sta, void *priv_sta); - void (*tx_status_noskb)(void *priv, - struct ieee80211_supported_band *sband, - struct ieee80211_sta *sta, void *priv_sta, - struct ieee80211_tx_info *info); + void (*tx_status_ext)(void *priv, + struct ieee80211_supported_band *sband, + void *priv_sta, struct ieee80211_tx_status *st); void (*tx_status)(void *priv, struct ieee80211_supported_band *sband, struct ieee80211_sta *sta, void *priv_sta, struct sk_buff *skb); diff --git a/include/net/mpls_iptunnel.h b/include/net/mpls_iptunnel.h index 179253f9dcfd986ef806331044bc4973f1cc7d6e..9d22bf67ac86623eaaea2c49fbe1140747ce67e8 100644 --- a/include/net/mpls_iptunnel.h +++ b/include/net/mpls_iptunnel.h @@ -14,11 +14,12 @@ #ifndef _NET_MPLS_IPTUNNEL_H #define _NET_MPLS_IPTUNNEL_H 1 -#define MAX_NEW_LABELS 2 - struct mpls_iptunnel_encap { - u32 label[MAX_NEW_LABELS]; u8 labels; + u8 ttl_propagate; + u8 default_ttl; + u8 reserved1; + u32 label[0]; }; static inline struct mpls_iptunnel_encap *mpls_lwtunnel_encap(struct lwtunnel_state *lwtstate) diff --git a/include/net/ndisc.h b/include/net/ndisc.h index 8a0214654b6b10bc480d7e6dc8195555ca58dc9a..1036c902d2c9904ed084fcd5a4d8dc70b7902cbe 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -439,8 +439,10 @@ void ndisc_update(const struct net_device *dev, struct neighbour *neigh, * IGMP */ int igmp6_init(void); +int igmp6_late_init(void); void igmp6_cleanup(void); +void igmp6_late_cleanup(void); int igmp6_event_query(struct sk_buff *skb); diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 5ebf6949116097f60e668b0c2c4c48dd1639e5e8..e4dd3a2140341049fabfbff1ec55406fbe11d5b2 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -314,7 +314,8 @@ static inline struct neighbour *neigh_create(struct neigh_table *tbl, } void neigh_destroy(struct neighbour 
*neigh); int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); -int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags); +int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, + u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); @@ -449,7 +450,7 @@ static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq; - int hh_len; + unsigned int hh_len; do { seq = read_seqbegin(&hh->hh_lock); @@ -458,7 +459,7 @@ static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb /* this is inlined by gcc */ memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD); } else { - int hh_alen = HH_DATA_ALIGN(hh_len); + unsigned int hh_alen = HH_DATA_ALIGN(hh_len); memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); } diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index af8fe8a909dc0ca62e54056cf4be9d0d4ea82477..fe80bb48ab1f0c7b1665a10a9bf2388b32eb5013 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -140,6 +141,9 @@ struct net { #endif #if IS_ENABLED(CONFIG_MPLS) struct netns_mpls mpls; +#endif +#if IS_ENABLED(CONFIG_CAN) + struct netns_can can; #endif struct sock *diag_nlsk; atomic_t fnhe_genid; diff --git a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h index 6ff32815641b21ba0d3c20214806c4f16aa217ca..919e4e8af3272b3d66580e4c60c79bdc39b47616 100644 --- a/include/net/netfilter/ipv4/nf_conntrack_ipv4.h +++ b/include/net/netfilter/ipv4/nf_conntrack_ipv4.h @@ -14,7 +14,6 @@ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4; -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmp; #ifdef CONFIG_NF_CT_PROTO_DCCP extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4; diff --git a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h index c59b82456f89cd421fde702f13c7cfe59f73266a..eaea968f86570db7010011b1af4a3608f2dee8f9 100644 --- a/include/net/netfilter/ipv6/nf_conntrack_ipv6.h +++ b/include/net/netfilter/ipv6/nf_conntrack_ipv6.h @@ -5,7 +5,6 @@ extern struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6; -extern struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6; extern struct nf_conntrack_l4proto nf_conntrack_l4proto_icmpv6; #ifdef CONFIG_NF_CT_PROTO_DCCP extern struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6; diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 19605878da4739d04f0642b20f8641ed8601d2eb..8ece3612d0cd6bf8fe4ae6c347c2cb30da2f553f 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -50,25 +50,6 @@ union nf_conntrack_expect_proto { #define NF_CT_ASSERT(x) #endif -struct nf_conntrack_helper; - -/* Must be kept in sync with the classes defined by helpers */ -#define NF_CT_MAX_EXPECT_CLASSES 4 - -/* 
nf_conn feature for connections that have a helper */ -struct nf_conn_help { - /* Helper. if any */ - struct nf_conntrack_helper __rcu *helper; - - struct hlist_head expectations; - - /* Current number of expected connections */ - u8 expecting[NF_CT_MAX_EXPECT_CLASSES]; - - /* private helper information. */ - char data[]; -}; - #include #include @@ -243,14 +224,6 @@ extern s32 (*nf_ct_nat_offset)(const struct nf_conn *ct, enum ip_conntrack_dir dir, u32 seq); -/* Fake conntrack entry for untracked connections */ -DECLARE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked); -static inline struct nf_conn *nf_ct_untracked_get(void) -{ - return raw_cpu_ptr(&nf_conntrack_untracked); -} -void nf_ct_untracked_status_or(unsigned long bits); - /* Iterate over all conntracks: if iter returns true, it's deleted. */ void nf_ct_iterate_cleanup(struct net *net, int (*iter)(struct nf_conn *i, void *data), @@ -281,11 +254,6 @@ static inline int nf_ct_is_dying(const struct nf_conn *ct) return test_bit(IPS_DYING_BIT, &ct->status); } -static inline int nf_ct_is_untracked(const struct nf_conn *ct) -{ - return test_bit(IPS_UNTRACKED_BIT, &ct->status); -} - /* Packet is received from loopback */ static inline bool nf_is_loopback_packet(const struct sk_buff *skb) { diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h index 84ec7ca5f195db411b7e07c6dce641a286b5ea0a..81d7f8a3094557c3cd518be809c524b4448bbe1b 100644 --- a/include/net/netfilter/nf_conntrack_core.h +++ b/include/net/netfilter/nf_conntrack_core.h @@ -65,7 +65,7 @@ static inline int nf_conntrack_confirm(struct sk_buff *skb) struct nf_conn *ct = (struct nf_conn *)skb_nfct(skb); int ret = NF_ACCEPT; - if (ct && !nf_ct_is_untracked(ct)) { + if (ct) { if (!nf_ct_is_confirmed(ct)) ret = __nf_conntrack_confirm(skb); if (likely(ret == NF_ACCEPT)) diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h index 12d967b587264af71c84d62faca95a4e7ace096f..2a10c6570fccfcd633351a3b881bff4d51e2f45c 100644 --- a/include/net/netfilter/nf_conntrack_ecache.h +++ b/include/net/netfilter/nf_conntrack_ecache.h @@ -20,11 +20,11 @@ enum nf_ct_ecache_state { struct nf_conntrack_ecache { unsigned long cache; /* bitops want long */ - unsigned long missed; /* missed events */ + u16 missed; /* missed events */ u16 ctmask; /* bitmask of ct events to be delivered */ u16 expmask; /* bitmask of expect events to be delivered */ + enum nf_ct_ecache_state state:8;/* ecache state */ u32 portid; /* netlink portid of destroyer */ - enum nf_ct_ecache_state state; /* ecache state */ }; static inline struct nf_conntrack_ecache * diff --git a/include/net/netfilter/nf_conntrack_expect.h b/include/net/netfilter/nf_conntrack_expect.h index 5ed33ea4718ef5a101689432825ebb1a48fcc0d9..2ba54feaccd8d4a6ebccc84fd5e21f45bff665ca 100644 --- a/include/net/netfilter/nf_conntrack_expect.h +++ b/include/net/netfilter/nf_conntrack_expect.h @@ -5,6 +5,8 @@ #ifndef _NF_CONNTRACK_EXPECT_H #define _NF_CONNTRACK_EXPECT_H +#include + #include #include @@ -37,7 +39,7 @@ struct nf_conntrack_expect { struct timer_list timeout; /* Usage count. 
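The expectation use counter just below switches from atomic_t to refcount_t, which saturates and warns on over- and underflow instead of silently wrapping. A sketch of the get/put pattern the conversion yields; the release path is illustrative only:

static inline void example_expect_get(struct nf_conntrack_expect *exp)
{
	refcount_inc(&exp->use);		/* WARNs on overflow */
}

static inline void example_expect_put(struct nf_conntrack_expect *exp)
{
	if (refcount_dec_and_test(&exp->use))	/* true exactly once */
		kfree_rcu(exp, rcu);		/* illustrative release */
}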
*/ - atomic_t use; + refcount_t use; /* Flags */ unsigned int flags; @@ -71,6 +73,7 @@ struct nf_conntrack_expect_policy { }; #define NF_CT_EXPECT_CLASS_DEFAULT 0 +#define NF_CT_EXPECT_MAX_CNT 255 int nf_conntrack_expect_pernet_init(struct net *net); void nf_conntrack_expect_pernet_fini(struct net *net); @@ -102,6 +105,7 @@ static inline void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) void nf_ct_remove_expectations(struct nf_conn *ct); void nf_ct_unexpect_related(struct nf_conntrack_expect *exp); +bool nf_ct_remove_expect(struct nf_conntrack_expect *exp); /* Allocate space for an expectation: this is mandatory before calling nf_ct_expect_related. You will have to call put afterwards. */ diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 1c3035dda31f66a33d50cf2617d3a70288812383..4944bc9153cf5d307c4168079d9eb4b6a2c09f21 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -43,8 +43,8 @@ enum nf_ct_ext_id { /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { struct rcu_head rcu; - u16 offset[NF_CT_EXT_NUM]; - u16 len; + u8 offset[NF_CT_EXT_NUM]; + u8 len; char data[0]; }; @@ -69,12 +69,7 @@ static inline void *__nf_ct_ext_find(const struct nf_conn *ct, u8 id) ((id##_TYPE *)__nf_ct_ext_find((ext), (id))) /* Destroy all relationships */ -void __nf_ct_ext_destroy(struct nf_conn *ct); -static inline void nf_ct_ext_destroy(struct nf_conn *ct) -{ - if (ct->ext) - __nf_ct_ext_destroy(ct); -} +void nf_ct_ext_destroy(struct nf_conn *ct); /* Free operation. If you want to free a object referred from private area, * please implement __nf_ct_ext_free() and call it. @@ -86,15 +81,7 @@ static inline void nf_ct_ext_free(struct nf_conn *ct) } /* Add this type, returns pointer to data or NULL. */ -void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, - size_t var_alloc_len, gfp_t gfp); - -#define nf_ct_ext_add(ct, id, gfp) \ - ((id##_TYPE *)__nf_ct_ext_add_length((ct), (id), 0, (gfp))) -#define nf_ct_ext_add_length(ct, id, len, gfp) \ - ((id##_TYPE *)__nf_ct_ext_add_length((ct), (id), (len), (gfp))) - -#define NF_CT_EXT_F_PREALLOC 0x0001 +void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp); struct nf_ct_ext_type { /* Destroys relationships (can be NULL). */ @@ -102,15 +89,11 @@ struct nf_ct_ext_type { enum nf_ct_ext_id id; - unsigned int flags; - /* Length and min alignment. */ u8 len; u8 align; - /* initial size of nf_ct_ext. */ - u8 alloc_size; }; -int nf_ct_extend_register(struct nf_ct_ext_type *type); -void nf_ct_extend_unregister(struct nf_ct_ext_type *type); +int nf_ct_extend_register(const struct nf_ct_ext_type *type); +void nf_ct_extend_unregister(const struct nf_ct_ext_type *type); #endif /* _NF_CONNTRACK_EXTEND_H */ diff --git a/include/net/netfilter/nf_conntrack_helper.h b/include/net/netfilter/nf_conntrack_helper.h index 1eaac1f4cd6aef2b12cfe51621a5ce6da5c4ba13..e04fa7691e5d6873cd04e94816227d4a41275fa2 100644 --- a/include/net/netfilter/nf_conntrack_helper.h +++ b/include/net/netfilter/nf_conntrack_helper.h @@ -29,9 +29,6 @@ struct nf_conntrack_helper { struct module *me; /* pointer to self */ const struct nf_conntrack_expect_policy *expect_policy; - /* length of internal data, ie. 
sizeof(struct nf_ct_*_master) */ - size_t data_len; - /* Tuple of things we will help (compared against server response) */ struct nf_conntrack_tuple tuple; @@ -49,9 +46,33 @@ struct nf_conntrack_helper { unsigned int expect_class_max; unsigned int flags; - unsigned int queue_num; /* For user-space helpers. */ + + /* For user-space helpers: */ + unsigned int queue_num; + /* length of userspace private data stored in nf_conn_help->data */ + u16 data_len; }; +/* Must be kept in sync with the classes defined by helpers */ +#define NF_CT_MAX_EXPECT_CLASSES 4 + +/* nf_conn feature for connections that have a helper */ +struct nf_conn_help { + /* Helper. if any */ + struct nf_conntrack_helper __rcu *helper; + + struct hlist_head expectations; + + /* Current number of expected connections */ + u8 expecting[NF_CT_MAX_EXPECT_CLASSES]; + + /* private helper information. */ + char data[32] __aligned(8); +}; + +#define NF_CT_HELPER_BUILD_BUG_ON(structsize) \ + BUILD_BUG_ON((structsize) > FIELD_SIZEOF(struct nf_conn_help, data)) + struct nf_conntrack_helper *__nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum); @@ -62,7 +83,7 @@ void nf_ct_helper_init(struct nf_conntrack_helper *helper, u16 l3num, u16 protonum, const char *name, u16 default_port, u16 spec_port, u32 id, const struct nf_conntrack_expect_policy *exp_pol, - u32 expect_class_max, u32 data_len, + u32 expect_class_max, int (*help)(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo), diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index 85e993e278d5e1e7a886e772dd69f5031214410d..7032e044bbe2a364ae0cfdb629f2e0c20ac81e8a 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -58,6 +58,9 @@ struct nf_conntrack_l4proto { unsigned int dataoff, u_int8_t pf, unsigned int hooknum); + /* called by gc worker if table is full */ + bool (*can_early_drop)(const struct nf_conn *ct); + /* Print out the per-protocol part of the tuple. 
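The new can_early_drop() hook above lets the gc worker evict entries once the table is full. A minimal sketch of what such a callback could look like; this is not necessarily the per-protocol policy the series actually implements:

static bool example_can_early_drop(const struct nf_conn *ct)
{
	/* only connections that never became assured are fair game */
	return !test_bit(IPS_ASSURED_BIT, &ct->status);
}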
Return like seq_* */ void (*print_tuple)(struct seq_file *s, const struct nf_conntrack_tuple *); diff --git a/include/net/netfilter/nf_conntrack_synproxy.h b/include/net/netfilter/nf_conntrack_synproxy.h index b0ca402c1f72e20bae492c634f663b8846e788a5..a2fcb5271726d3f5c9e20ccfb521385b59951610 100644 --- a/include/net/netfilter/nf_conntrack_synproxy.h +++ b/include/net/netfilter/nf_conntrack_synproxy.h @@ -52,6 +52,8 @@ struct synproxy_stats { struct synproxy_net { struct nf_conn *tmpl; struct synproxy_stats __percpu *stats; + unsigned int hook_ref4; + unsigned int hook_ref6; }; extern unsigned int synproxy_net_id; diff --git a/include/net/netfilter/nf_conntrack_timeout.h b/include/net/netfilter/nf_conntrack_timeout.h index 5cc5e9e6171a03db471407c708578b4794c22b92..d40b89355fdd345617cf21593922753613190a11 100644 --- a/include/net/netfilter/nf_conntrack_timeout.h +++ b/include/net/netfilter/nf_conntrack_timeout.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include @@ -12,7 +13,7 @@ struct ctnl_timeout { struct list_head head; struct rcu_head rcu_head; - atomic_t refcnt; + refcount_t refcnt; char name[CTNL_TIMEOUT_NAME_MAX]; __u16 l3num; struct nf_conntrack_l4proto *l4proto; diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h index c327a431a6f38103598b82182d463dd728f70443..05c82a1a42679cf336c4ecc2addc30bdaceecbbe 100644 --- a/include/net/netfilter/nf_nat.h +++ b/include/net/netfilter/nf_nat.h @@ -67,7 +67,7 @@ static inline bool nf_nat_oif_changed(unsigned int hooknum, { #if IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV4) || \ IS_ENABLED(CONFIG_NF_NAT_MASQUERADE_IPV6) - return nat->masq_index && hooknum == NF_INET_POST_ROUTING && + return nat && nat->masq_index && hooknum == NF_INET_POST_ROUTING && CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL && nat->masq_index != out->ifindex; #else diff --git a/include/net/netfilter/nf_nat_helper.h b/include/net/netfilter/nf_nat_helper.h index 01bcc6bfbcc9034e399a938a6a6dd83b2bed6ef7..fbfa5acf4f14628f6334a36943bd1ab01844bb6f 100644 --- a/include/net/netfilter/nf_nat_helper.h +++ b/include/net/netfilter/nf_nat_helper.h @@ -7,31 +7,31 @@ struct sk_buff; /* These return true or false. 
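The TCP/UDP payload manglers whose signatures follow switch from int to bool return values. A sketch of a NAT helper call site under the new convention; protoff, match_offset, match_len and new_addr are hypothetical locals from a helper's help() callback:

	if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff,
				      match_offset, match_len,
				      new_addr, strlen(new_addr)))
		return NF_DROP;	/* mangling failed, drop the packet */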
*/ -int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff, unsigned int match_offset, - unsigned int match_len, const char *rep_buffer, - unsigned int rep_len, bool adjust); +bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned int match_offset, + unsigned int match_len, const char *rep_buffer, + unsigned int rep_len, bool adjust); -static inline int nf_nat_mangle_tcp_packet(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff, - unsigned int match_offset, - unsigned int match_len, - const char *rep_buffer, - unsigned int rep_len) +static inline bool nf_nat_mangle_tcp_packet(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len) { return __nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, match_offset, match_len, rep_buffer, rep_len, true); } -int nf_nat_mangle_udp_packet(struct sk_buff *skb, struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff, unsigned int match_offset, - unsigned int match_len, const char *rep_buffer, - unsigned int rep_len); +bool nf_nat_mangle_udp_packet(struct sk_buff *skb, struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int protoff, unsigned int match_offset, + unsigned int match_len, const char *rep_buffer, + unsigned int rep_len); /* Setup NAT on this expected conntrack so it follows master, but goes * to port ct->master->saved_proto. */ diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h index 09948d10e38e0b939b9764caafd7f3b9a9be41d5..4454719ff849fde065f64eba042d7f58cd971078 100644 --- a/include/net/netfilter/nf_queue.h +++ b/include/net/netfilter/nf_queue.h @@ -24,8 +24,7 @@ struct nf_queue_entry { struct nf_queue_handler { int (*outfn)(struct nf_queue_entry *entry, unsigned int queuenum); - void (*nf_hook_drop)(struct net *net, - const struct nf_hook_entry *hooks); + unsigned int (*nf_hook_drop)(struct net *net); }; void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh); diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 0136028652bdb8b3c20813b01b2fa8cfb16ba012..028faec8fc2799b0c176ac88b54fc6700e89f896 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -413,10 +413,11 @@ static inline struct nft_set *nft_set_container_of(const void *priv) return (void *)priv - offsetof(struct nft_set, data); } -struct nft_set *nf_tables_set_lookup(const struct nft_table *table, - const struct nlattr *nla, u8 genmask); -struct nft_set *nf_tables_set_lookup_byid(const struct net *net, - const struct nlattr *nla, u8 genmask); +struct nft_set *nft_set_lookup(const struct net *net, + const struct nft_table *table, + const struct nlattr *nla_set_name, + const struct nlattr *nla_set_id, + u8 genmask); static inline unsigned long nft_set_gc_interval(const struct nft_set *set) { @@ -910,6 +911,11 @@ static inline struct nft_base_chain *nft_base_chain(const struct nft_chain *chai return container_of(chain, struct nft_base_chain, chain); } +static inline bool nft_is_base_chain(const struct nft_chain *chain) +{ + return chain->flags & NFT_BASE_CHAIN; +} + int __nft_release_basechain(struct nft_ctx *ctx); unsigned int nft_do_chain(struct nft_pktinfo *pkt, void *priv); @@ 
-1044,7 +1050,8 @@ struct nft_object_type { unsigned int maxattr; struct module *owner; const struct nla_policy *policy; - int (*init)(const struct nlattr * const tb[], + int (*init)(const struct nft_ctx *ctx, + const struct nlattr *const tb[], struct nft_object *obj); void (*destroy)(struct nft_object *obj); int (*dump)(struct sk_buff *skb, diff --git a/include/net/netfilter/nft_fib.h b/include/net/netfilter/nft_fib.h index 5ceb2205e4e3ed93461ed4a3956b227f99ac9494..381af9469e6ada01e4acffd4efc3ebc88e66a019 100644 --- a/include/net/netfilter/nft_fib.h +++ b/include/net/netfilter/nft_fib.h @@ -32,6 +32,6 @@ void nft_fib6_eval_type(const struct nft_expr *expr, struct nft_regs *regs, void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt); -void nft_fib_store_result(void *reg, enum nft_fib_result r, +void nft_fib_store_result(void *reg, const struct nft_fib *priv, const struct nft_pktinfo *pkt, int index); #endif diff --git a/include/net/netlink.h b/include/net/netlink.h index b239fcd33d8091268fd793fd45cc755264fa592f..01709172b3d38455e7e5510228d6b4a0ad6e94a6 100644 --- a/include/net/netlink.h +++ b/include/net/netlink.h @@ -233,14 +233,17 @@ struct nl_info { }; int netlink_rcv_skb(struct sk_buff *skb, - int (*cb)(struct sk_buff *, struct nlmsghdr *)); + int (*cb)(struct sk_buff *, struct nlmsghdr *, + struct netlink_ext_ack *)); int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags); int nla_validate(const struct nlattr *head, int len, int maxtype, - const struct nla_policy *policy); + const struct nla_policy *policy, + struct netlink_ext_ack *extack); int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, - int len, const struct nla_policy *policy); + int len, const struct nla_policy *policy, + struct netlink_ext_ack *extack); int nla_policy_len(const struct nla_policy *, int); struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype); size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize); @@ -374,18 +377,20 @@ nlmsg_next(const struct nlmsghdr *nlh, int *remaining) * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy + * @extack: extended ACK report struct * * See nla_parse() */ static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, - const struct nla_policy *policy) + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; return nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), policy); + nlmsg_attrlen(nlh, hdrlen), policy, extack); } /** @@ -409,16 +414,19 @@ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, * @hdrlen: length of familiy specific header * @maxtype: maximum attribute type to be expected * @policy: validation policy + * @extack: extended ACK report struct */ static inline int nlmsg_validate(const struct nlmsghdr *nlh, int hdrlen, int maxtype, - const struct nla_policy *policy) + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; return nla_validate(nlmsg_attrdata(nlh, hdrlen), - nlmsg_attrlen(nlh, hdrlen), maxtype, policy); + nlmsg_attrlen(nlh, hdrlen), maxtype, policy, + extack); } /** @@ -739,14 +747,17 @@ nla_find_nested(const struct nlattr *nla, int attrtype) * @maxtype: maximum attribute type 
to be expected * @nla: attribute containing the nested attributes * @policy: validation policy + * @extack: extended ACK report struct * * See nla_parse() */ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, const struct nlattr *nla, - const struct nla_policy *policy) + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { - return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy); + return nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, + extack); } /** @@ -1252,6 +1263,7 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) * @start: container attribute * @maxtype: maximum attribute type to be expected * @policy: validation policy + * @extack: extended ACK report struct * * Validates all attributes in the nested attribute stream against the * specified policy. Attributes with a type exceeding maxtype will be @@ -1260,9 +1272,11 @@ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) * Returns 0 on success or a negative error code. */ static inline int nla_validate_nested(const struct nlattr *start, int maxtype, - const struct nla_policy *policy) + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { - return nla_validate(nla_data(start), nla_len(start), maxtype, policy); + return nla_validate(nla_data(start), nla_len(start), maxtype, policy, + extack); } /** diff --git a/include/net/netns/can.h b/include/net/netns/can.h new file mode 100644 index 0000000000000000000000000000000000000000..b106e6ae2e5b46dc119a322ebf22437e88158962 --- /dev/null +++ b/include/net/netns/can.h @@ -0,0 +1,40 @@ +/* + * can in net namespaces + */ + +#ifndef __NETNS_CAN_H__ +#define __NETNS_CAN_H__ + +#include + +struct dev_rcv_lists; +struct s_stats; +struct s_pstats; + +struct netns_can { +#if IS_ENABLED(CONFIG_PROC_FS) + struct proc_dir_entry *proc_dir; + struct proc_dir_entry *pde_version; + struct proc_dir_entry *pde_stats; + struct proc_dir_entry *pde_reset_stats; + struct proc_dir_entry *pde_rcvlist_all; + struct proc_dir_entry *pde_rcvlist_fil; + struct proc_dir_entry *pde_rcvlist_inv; + struct proc_dir_entry *pde_rcvlist_sff; + struct proc_dir_entry *pde_rcvlist_eff; + struct proc_dir_entry *pde_rcvlist_err; + struct proc_dir_entry *bcmproc_dir; +#endif + + /* receive filters subscribed for 'all' CAN devices */ + struct dev_rcv_lists *can_rx_alldev_list; + spinlock_t can_rcvlists_lock; + struct timer_list can_stattimer;/* timer for statistics update */ + struct s_stats *can_stats; /* packet statistics */ + struct s_pstats *can_pstats; /* receive list statistics */ + + /* CAN GW per-net gateway jobs */ + struct hlist_head cgw_list; +}; + +#endif /* __NETNS_CAN_H__ */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 622d2da27135586d164c228b81e71afb922d5d8c..cd686c4fb32dc5409a08f818d48228bffa6f6778 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -33,7 +33,6 @@ struct inet_timewait_death_row { atomic_t tw_count; struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp; - int sysctl_tw_recycle; int sysctl_max_tw_buckets; }; @@ -96,6 +95,8 @@ struct netns_ipv4 { /* Shall we try to damage output packets if routing dev changes? 
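The nla_parse()/nlmsg_parse() family above now threads a struct netlink_ext_ack through, so attribute rejections can carry a message and offset back to userspace. A sketch of a doit handler using it; the EXAMPLE_* names, examplehdr and example_policy are hypothetical:

static int example_doit(struct sk_buff *skb, struct nlmsghdr *nlh,
			struct netlink_ext_ack *extack)
{
	struct nlattr *tb[EXAMPLE_MAX + 1];
	int err;

	err = nlmsg_parse(nlh, sizeof(struct examplehdr), tb, EXAMPLE_MAX,
			  example_policy, extack);
	if (err < 0)
		return err;	/* extack already points at the bad attr */

	if (!tb[EXAMPLE_ATTR_NAME]) {
		NL_SET_ERR_MSG(extack, "missing name attribute");
		return -EINVAL;
	}
	return 0;
}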
*/ int sysctl_ip_dynaddr; int sysctl_ip_early_demux; + int sysctl_tcp_early_demux; + int sysctl_udp_early_demux; int sysctl_fwmark_reflect; int sysctl_tcp_fwmark_accept; @@ -152,6 +153,7 @@ struct netns_ipv4 { #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH int sysctl_fib_multipath_use_neigh; + int sysctl_fib_multipath_hash_policy; #endif unsigned int fib_seq; /* protected by rtnl_mutex */ diff --git a/include/net/netns/mpls.h b/include/net/netns/mpls.h index d29203651c01700d9406157ef8dd016ea55a4cbb..6608b3693385e771147f78da13afa87cc6355d83 100644 --- a/include/net/netns/mpls.h +++ b/include/net/netns/mpls.h @@ -9,8 +9,11 @@ struct mpls_route; struct ctl_table_header; struct netns_mpls { + int ip_ttl_propagate; + int default_ttl; size_t platform_labels; struct mpls_route __rcu * __rcu *platform_label; + struct ctl_table_header *ctl; }; diff --git a/include/net/nfc/nfc.h b/include/net/nfc/nfc.h index 1a3de8b34ad27a8f3f8cf74ddd3eb6af61ced72e..bbdc73a3239dffbcafe8cf19f5997ddbeb7ada04 100644 --- a/include/net/nfc/nfc.h +++ b/include/net/nfc/nfc.h @@ -27,6 +27,7 @@ #include #include +#define nfc_dbg(dev, fmt, ...) dev_dbg((dev), "NFC: " fmt, ##__VA_ARGS__) #define nfc_info(dev, fmt, ...) dev_info((dev), "NFC: " fmt, ##__VA_ARGS__) #define nfc_err(dev, fmt, ...) dev_err((dev), "NFC: " fmt, ##__VA_ARGS__) diff --git a/include/net/pkt_sched.h b/include/net/pkt_sched.h index f1b76b8e6d2d296177116d0ef0f254d175551cbe..bec46f63f10ced844f8aec2b19bebf8b3dc01167 100644 --- a/include/net/pkt_sched.h +++ b/include/net/pkt_sched.h @@ -92,7 +92,7 @@ int unregister_qdisc(struct Qdisc_ops *qops); void qdisc_get_default(char *id, size_t len); int qdisc_set_default(const char *id); -void qdisc_hash_add(struct Qdisc *q); +void qdisc_hash_add(struct Qdisc *q, bool invisible); void qdisc_hash_del(struct Qdisc *q); struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle); struct Qdisc *qdisc_lookup_class(struct net_device *dev, u32 handle); diff --git a/include/net/protocol.h b/include/net/protocol.h index bf36ca34af7ad255b9eb821cbed0a70abad993f5..65ba335b0e7e66bb7f1b4bd279d31e616e0dd31e 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -40,6 +40,7 @@ /* This is used to register protocols. */ struct net_protocol { void (*early_demux)(struct sk_buff *skb); + void (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, u32 info); unsigned int no_policy:1, @@ -54,7 +55,7 @@ struct net_protocol { #if IS_ENABLED(CONFIG_IPV6) struct inet6_protocol { void (*early_demux)(struct sk_buff *skb); - + void (*early_demux_handler)(struct sk_buff *skb); int (*handler)(struct sk_buff *skb); void (*err_handler)(struct sk_buff *skb, @@ -92,12 +93,12 @@ struct inet_protosw { #define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */ #define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? 
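The early_demux_handler field added above, together with the new tcp/udp early-demux sysctls, is why the inet_protos arrays lose their const just below: the sysctl handler flips the live early_demux pointer while early_demux_handler keeps the protocol's implementation around. Roughly, as a sketch assuming the sysctl plumbing elsewhere in this series:

static void example_set_early_demux(struct net_protocol *ipprot, bool enabled)
{
	/* the handler stays put; only the active pointer is swapped */
	ipprot->early_demux = enabled ? ipprot->early_demux_handler : NULL;
}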
*/ -extern const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS]; +extern struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS]; extern const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS]; extern const struct net_offload __rcu *inet6_offloads[MAX_INET_PROTOS]; #if IS_ENABLED(CONFIG_IPV6) -extern const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS]; +extern struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS]; #endif int inet_add_protocol(const struct net_protocol *prot, unsigned char num); diff --git a/include/net/route.h b/include/net/route.h index c0874c87c173717f2c13c8af06d2482a76190243..2cc0e14c63598ce3d3be88bb04d2fd433d676129 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -113,13 +113,13 @@ struct in_device; int ip_rt_init(void); void rt_cache_flush(struct net *net); void rt_flush_dev(struct net_device *dev); -struct rtable *__ip_route_output_key_hash(struct net *, struct flowi4 *flp, - int mp_hash); +struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *flp, + const struct sk_buff *skb); static inline struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp) { - return __ip_route_output_key_hash(net, flp, -1); + return __ip_route_output_key_hash(net, flp, NULL); } struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h index 106de5f7bf0675e29114cde65b7ea8a4e557c1ce..78fa5fe32947d590351917c6353caa243eb0cf1b 100644 --- a/include/net/rtnetlink.h +++ b/include/net/rtnetlink.h @@ -4,7 +4,8 @@ #include #include -typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *); +typedef int (*rtnl_doit_func)(struct sk_buff *, struct nlmsghdr *, + struct netlink_ext_ack *); typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *); typedef u16 (*rtnl_calcit_func)(struct sk_buff *, struct nlmsghdr *); @@ -158,7 +159,8 @@ struct net_device *rtnl_create_link(struct net *net, const char *ifname, int rtnl_delete_link(struct net_device *dev); int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm); -int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len); +int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, + struct netlink_ext_ack *exterr); #define MODULE_ALIAS_RTNL_LINK(kind) MODULE_ALIAS("rtnl-link-" kind) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index aeec4086afb2446dadb1fb8c54ad54a909634380..22e52093bfda7fe651f4c0aafc95d18e22b5f6fd 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -66,6 +66,7 @@ struct Qdisc { #define TCQ_F_NOPARENT 0x40 /* root of its hierarchy : * qdisc_tree_decrease_qlen() should stop. 
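TCQ_F_INVISIBLE, defined at the top of the next hunk, lets default child qdiscs stay out of ordinary dumps, and qdisc_hash_add() grows a matching bool (see the pkt_sched.h hunk earlier). A one-line sketch of an attach path marking its default child invisible, assuming qdisc_hash_add() sets the flag itself:

	/* in a hypothetical mq-style attach callback: */
	qdisc_hash_add(child, true);	/* true: dumped only on explicit request */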
*/ +#define TCQ_F_INVISIBLE 0x80 /* invisible by default in dump */ u32 limit; const struct Qdisc_ops *ops; struct qdisc_size_table __rcu *stab; @@ -203,14 +204,14 @@ struct tcf_proto_ops { const struct tcf_proto *, struct tcf_result *); int (*init)(struct tcf_proto*); - bool (*destroy)(struct tcf_proto*, bool); + void (*destroy)(struct tcf_proto*); unsigned long (*get)(struct tcf_proto*, u32 handle); int (*change)(struct net *net, struct sk_buff *, struct tcf_proto*, unsigned long, u32 handle, struct nlattr **, unsigned long *, bool); - int (*delete)(struct tcf_proto*, unsigned long); + int (*delete)(struct tcf_proto*, unsigned long, bool*); void (*walk)(struct tcf_proto*, struct tcf_walker *arg); /* rtnetlink specific */ diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h index b6f682ec184a62a1e7b9d2a0ea159293cca12a76..47113f2c4b0a2b6c596d2f28018a1e1941cb6ede 100644 --- a/include/net/sctp/sm.h +++ b/include/net/sctp/sm.h @@ -293,6 +293,22 @@ struct sctp_chunk *sctp_process_strreset_inreq( struct sctp_association *asoc, union sctp_params param, struct sctp_ulpevent **evp); +struct sctp_chunk *sctp_process_strreset_tsnreq( + struct sctp_association *asoc, + union sctp_params param, + struct sctp_ulpevent **evp); +struct sctp_chunk *sctp_process_strreset_addstrm_out( + struct sctp_association *asoc, + union sctp_params param, + struct sctp_ulpevent **evp); +struct sctp_chunk *sctp_process_strreset_addstrm_in( + struct sctp_association *asoc, + union sctp_params param, + struct sctp_ulpevent **evp); +struct sctp_chunk *sctp_process_strreset_resp( + struct sctp_association *asoc, + union sctp_params param, + struct sctp_ulpevent **evp); /* Prototypes for statetable processing. */ diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 138f8615acf0993d8015f6c1d2eee32966dccad0..a8b38e123f9744368cb6c0fa96d28556d44aee14 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -1315,6 +1315,8 @@ struct sctp_inithdr_host { struct sctp_stream_out { __u16 ssn; __u8 state; + __u64 abandoned_unsent[SCTP_PR_INDEX(MAX) + 1]; + __u64 abandoned_sent[SCTP_PR_INDEX(MAX) + 1]; }; struct sctp_stream_in { @@ -1887,6 +1889,7 @@ struct sctp_association { __u32 strreset_outseq; /* Update after receiving response */ __u32 strreset_inseq; /* Update after receiving request */ + __u32 strreset_result[2]; /* save the results of last 2 responses */ struct sctp_chunk *strreset_chunk; /* save request chunk */ diff --git a/include/net/sctp/ulpevent.h b/include/net/sctp/ulpevent.h index 324b5965fc4de505ca98fbbb9aedc0d3a0039742..1060494ac230b80caca57f6963dca2692ae10b9d 100644 --- a/include/net/sctp/ulpevent.h +++ b/include/net/sctp/ulpevent.h @@ -132,6 +132,14 @@ struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event( const struct sctp_association *asoc, __u16 flags, __u16 stream_num, __u16 *stream_list, gfp_t gfp); +struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event( + const struct sctp_association *asoc, __u16 flags, + __u32 local_tsn, __u32 remote_tsn, gfp_t gfp); + +struct sctp_ulpevent *sctp_ulpevent_make_stream_change_event( + const struct sctp_association *asoc, __u16 flags, + __u32 strchange_instrms, __u32 strchange_outstrms, gfp_t gfp); + void sctp_ulpevent_read_sndrcvinfo(const struct sctp_ulpevent *event, struct msghdr *); void sctp_ulpevent_read_rcvinfo(const struct sctp_ulpevent *event, diff --git a/include/net/secure_seq.h b/include/net/secure_seq.h index 0caee631a8364fe6e49ab8cacba864d019be8b47..b94006f6fbdde0d78fe33b9c2d86159e291c30cf 100644 --- 
a/include/net/secure_seq.h +++ b/include/net/secure_seq.h @@ -6,10 +6,12 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport); u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport); -u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport, u32 *tsoff); -u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, - __be16 sport, __be16 dport, u32 *tsoff); +u32 secure_tcp_seq(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport); +u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr); +u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport); +u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr); u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport); u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr, diff --git a/include/net/sock.h b/include/net/sock.h index 03252d53975de7ad0da66d35802738830b0e3367..f33e3d134e0b7f66329f2122d7acc8b396c1787b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -995,7 +995,7 @@ struct smc_hashinfo; struct module; /* - * caches using SLAB_DESTROY_BY_RCU should let .next pointer from nulls nodes + * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes * un-modified. Special care is taken when initializing object to zero. */ static inline void sk_prot_clear_nulls(struct sock *sk, int size) @@ -1783,11 +1783,8 @@ __sk_dst_set(struct sock *sk, struct dst_entry *dst) sk_tx_queue_clear(sk); sk->sk_dst_pending_confirm = 0; - /* - * This can be called while sk is owned by the caller only, - * with no state that can be checked in a rcu_dereference_check() cond - */ - old_dst = rcu_dereference_raw(sk->sk_dst_cache); + old_dst = rcu_dereference_protected(sk->sk_dst_cache, + lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_dst_cache, dst); dst_release(old_dst); } @@ -2242,6 +2239,7 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); +#define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC) static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { @@ -2252,8 +2250,10 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY) __sock_recv_ts_and_drops(msg, sk, skb); - else + else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sk->sk_stamp = skb->tstamp; + else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP)) + sk->sk_stamp = 0; } void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); @@ -2365,6 +2365,8 @@ bool sk_ns_capable(const struct sock *sk, bool sk_capable(const struct sock *sk, int cap); bool sk_net_capable(const struct sock *sk, int cap); +void sk_get_meminfo(const struct sock *sk, u32 *meminfo); + extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; diff --git a/include/net/tc_act/tc_pedit.h b/include/net/tc_act/tc_pedit.h index dfbd6ee0bc7cd196c052e700da43deddb8d1dfef..a46c3f2ace702932dc95c3021e7b13d72a2a4777 100644 --- a/include/net/tc_act/tc_pedit.h +++ b/include/net/tc_act/tc_pedit.h @@ -2,6 +2,7 @@ #define __NET_TC_PED_H #include +#include struct tcf_pedit_key_ex { enum pedit_header_type htype; @@ -17,4 +18,48 @@ struct tcf_pedit { }; #define to_pedit(a) ((struct tcf_pedit *)a) +static inline bool is_tcf_pedit(const struct tc_action *a) +{ +#ifdef CONFIG_NET_CLS_ACT + if (a->ops && 
a->ops->type == TCA_ACT_PEDIT) + return true; +#endif + return false; +} + +static inline int tcf_pedit_nkeys(const struct tc_action *a) +{ + return to_pedit(a)->tcfp_nkeys; +} + +static inline u32 tcf_pedit_htype(const struct tc_action *a, int index) +{ + if (to_pedit(a)->tcfp_keys_ex) + return to_pedit(a)->tcfp_keys_ex[index].htype; + + return TCA_PEDIT_KEY_EX_HDR_TYPE_NETWORK; +} + +static inline u32 tcf_pedit_cmd(const struct tc_action *a, int index) +{ + if (to_pedit(a)->tcfp_keys_ex) + return to_pedit(a)->tcfp_keys_ex[index].cmd; + + return __PEDIT_CMD_MAX; +} + +static inline u32 tcf_pedit_mask(const struct tc_action *a, int index) +{ + return to_pedit(a)->tcfp_keys[index].mask; +} + +static inline u32 tcf_pedit_val(const struct tc_action *a, int index) +{ + return to_pedit(a)->tcfp_keys[index].val; +} + +static inline u32 tcf_pedit_offset(const struct tc_action *a, int index) +{ + return to_pedit(a)->tcfp_keys[index].off; +} #endif /* __NET_TC_PED_H */ diff --git a/include/net/tc_act/tc_vlan.h b/include/net/tc_act/tc_vlan.h index 48cca321ee6c4f3e44f8f546d05d5f32b470ccc1..c2090df944ff53224ef0500c05f62672f0d5e0b3 100644 --- a/include/net/tc_act/tc_vlan.h +++ b/include/net/tc_act/tc_vlan.h @@ -13,9 +13,6 @@ #include #include -#define VLAN_F_POP 0x1 -#define VLAN_F_PUSH 0x2 - struct tcf_vlan { struct tc_action common; int tcfv_action; @@ -49,4 +46,9 @@ static inline __be16 tcf_vlan_push_proto(const struct tc_action *a) return to_vlan(a)->tcfv_push_proto; } +static inline u8 tcf_vlan_push_prio(const struct tc_action *a) +{ + return to_vlan(a)->tcfv_push_prio; +} + #endif /* __NET_TC_VLAN_H */ diff --git a/include/net/tcp.h b/include/net/tcp.h index 6ec4ea652f3f55e53675dbe09f29599af179c41a..38a7427ae902e35973a8b7fa0e95ff602ede0e87 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -78,6 +78,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo); /* Maximal number of ACKs sent quickly to accelerate slow-start. 
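The pedit accessors above exist so drivers can translate a pedit action into hardware header-rewrite rules without reaching into tcf internals. A sketch of an offload loop, with the actual programming step left as a comment:

static int example_offload_pedit(const struct tc_action *a)
{
	int i;

	if (!is_tcf_pedit(a))
		return -EOPNOTSUPP;

	for (i = 0; i < tcf_pedit_nkeys(a); i++) {
		u32 htype = tcf_pedit_htype(a, i);
		u32 cmd   = tcf_pedit_cmd(a, i);
		u32 mask  = tcf_pedit_mask(a, i);
		u32 val   = tcf_pedit_val(a, i);
		u32 off   = tcf_pedit_offset(a, i);

		/* translate (htype, cmd, off, mask, val) into a hw rule */
	}
	return 0;
}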
*/ #define TCP_MAX_QUICKACKS 16U +/* Maximal number of window scale according to RFC1323 */ +#define TCP_MAX_WSCALE 14U + /* urg_data states */ #define TCP_URG_VALID 0x0100 #define TCP_URG_NOTYET 0x0200 @@ -406,11 +409,7 @@ void tcp_clear_retrans(struct tcp_sock *tp); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); -bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, - bool paws_check, bool timestamps); -bool tcp_remember_stamp(struct sock *sk); -bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw); -void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst); +bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_disable_fack(struct tcp_sock *tp); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); @@ -471,7 +470,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst); + struct dst_entry *dst, u32 tsoff); int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, u32 cookie); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); @@ -1005,7 +1004,7 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct skb_mstamp *now, struct rate_sample *rs); + struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); /* These functions determine how the current flow behaves in respect of SACK @@ -1235,10 +1234,12 @@ void tcp_cwnd_restart(struct sock *sk, s32 delta); static inline void tcp_slow_start_after_idle_check(struct sock *sk) { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); s32 delta; - if (!sysctl_tcp_slow_start_after_idle || tp->packets_out) + if (!sysctl_tcp_slow_start_after_idle || tp->packets_out || + ca_ops->cong_control) return; delta = tcp_time_stamp - tp->lsndtime; if (delta > inet_csk(sk)->icsk_rto) @@ -1252,9 +1253,11 @@ void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, static inline int tcp_win_from_space(int space) { - return sysctl_tcp_adv_win_scale<=0 ? - (space>>(-sysctl_tcp_adv_win_scale)) : - space - (space>>sysctl_tcp_adv_win_scale); + int tcp_adv_win_scale = sysctl_tcp_adv_win_scale; + + return tcp_adv_win_scale <= 0 ? + (space>>(-tcp_adv_win_scale)) : + space - (space>>tcp_adv_win_scale); } /* Note: caller must be prepared to deal with negative returns */ @@ -1505,6 +1508,12 @@ struct tcp_fastopen_context { struct rcu_head rcu; }; +extern unsigned int sysctl_tcp_fastopen_blackhole_timeout; +void tcp_fastopen_active_disable(struct sock *sk); +bool tcp_fastopen_active_should_disable(struct sock *sk); +void tcp_fastopen_active_disable_ofo_check(struct sock *sk); +void tcp_fastopen_active_timeout_reset(void); + /* Latencies incurred by various limits for a sender. They are * chronograph-like stats that are mutually exclusive. 
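A worked example of the tcp_win_from_space() logic above, since the sign of tcp_adv_win_scale flips its meaning:

/*
 * With sysctl_tcp_adv_win_scale = 2 and space = 65536:
 *	window = 65536 - (65536 >> 2) = 49152	(3/4 kept for the window)
 * With sysctl_tcp_adv_win_scale = -2:
 *	window = 65536 >> 2 = 16384		(1/4 kept for the window)
 * Reading the sysctl once into a local, as the new code does, keeps the
 * two uses consistent if the value changes mid-call.
 */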
*/ @@ -1814,9 +1823,9 @@ struct tcp_request_sock_ops { __u16 *mss); #endif struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl, - const struct request_sock *req, - bool *strict); - __u32 (*init_seq)(const struct sk_buff *skb, u32 *tsoff); + const struct request_sock *req); + u32 (*init_seq)(const struct sk_buff *skb); + u32 (*init_ts_off)(const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, @@ -1847,10 +1856,9 @@ void tcp_v4_init(void); void tcp_init(void); /* tcp_recovery.c */ -extern void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now); +extern void tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - const struct skb_mstamp *xmit_time, - const struct skb_mstamp *ack_time); + const struct skb_mstamp *xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); /* diff --git a/include/net/udp.h b/include/net/udp.h index c9d8b8e848e05c2e7228f287f88ccdb57b2e10c2..3391dbd739595a76150453c28468ce8bb55530f8 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -372,4 +372,5 @@ void udp_encap_enable(void); #if IS_ENABLED(CONFIG_IPV6) void udpv6_encap_enable(void); #endif + #endif /* _UDP_H */ diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 14d82bf16692a6d0ed66721b1548ba152dbc18de..6793a30c66b1f054516a27229c29037fcdfa2f6f 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -120,6 +120,13 @@ struct xfrm_state_walk { struct xfrm_address_filter *filter; }; +struct xfrm_state_offload { + struct net_device *dev; + unsigned long offload_handle; + unsigned int num_exthdrs; + u8 flags; +}; + /* Full description of state of transformer. 
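The tcp_request_sock_ops reshuffle above splits sequence-number and timestamp-offset generation into separate init_seq and init_ts_off ops. A sketch of IPv4 implementations built on the secure_tcp_seq()/secure_tcp_ts_off() split from the secure_seq.h hunk earlier; the function names are hypothetical:

static u32 example_v4_init_seq(const struct sk_buff *skb)
{
	return secure_tcp_seq(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr,
			      tcp_hdr(skb)->dest, tcp_hdr(skb)->source);
}

static u32 example_v4_init_ts_off(const struct sk_buff *skb)
{
	return secure_tcp_ts_off(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr);
}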
*/ struct xfrm_state { possible_net_t xs_net; @@ -207,6 +214,8 @@ struct xfrm_state { struct xfrm_lifetime_cur curlft; struct tasklet_hrtimer mtimer; + struct xfrm_state_offload xso; + /* used to fix curlft->add_time when changing date */ long saved_tmo; @@ -222,6 +231,8 @@ struct xfrm_state { struct xfrm_mode *inner_mode_iaf; struct xfrm_mode *outer_mode; + const struct xfrm_type_offload *type_offload; + /* Security context */ struct xfrm_sec_ctx *security; @@ -314,12 +325,14 @@ void km_state_expired(struct xfrm_state *x, int hard, u32 portid); int __xfrm_state_delete(struct xfrm_state *x); struct xfrm_state_afinfo { - unsigned int family; - unsigned int proto; - __be16 eth_proto; - struct module *owner; - const struct xfrm_type *type_map[IPPROTO_MAX]; - struct xfrm_mode *mode_map[XFRM_MODE_MAX]; + unsigned int family; + unsigned int proto; + __be16 eth_proto; + struct module *owner; + const struct xfrm_type *type_map[IPPROTO_MAX]; + const struct xfrm_type_offload *type_offload_map[IPPROTO_MAX]; + struct xfrm_mode *mode_map[XFRM_MODE_MAX]; + int (*init_flags)(struct xfrm_state *x); void (*init_tempsel)(struct xfrm_selector *sel, const struct flowi *fl); @@ -380,6 +393,18 @@ struct xfrm_type { int xfrm_register_type(const struct xfrm_type *type, unsigned short family); int xfrm_unregister_type(const struct xfrm_type *type, unsigned short family); +struct xfrm_type_offload { + char *description; + struct module *owner; + u8 proto; + void (*encap)(struct xfrm_state *, struct sk_buff *pskb); + int (*input_tail)(struct xfrm_state *x, struct sk_buff *skb); + int (*xmit)(struct xfrm_state *, struct sk_buff *pskb, netdev_features_t features); +}; + +int xfrm_register_type_offload(const struct xfrm_type_offload *type, unsigned short family); +int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, unsigned short family); + struct xfrm_mode { /* * Remove encapsulation header. @@ -428,6 +453,16 @@ struct xfrm_mode { */ int (*output)(struct xfrm_state *x, struct sk_buff *skb); + /* + * Adjust pointers into the packet and do GSO segmentation. + */ + struct sk_buff *(*gso_segment)(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features); + + /* + * Adjust pointers into the packet when IPsec is done at layer2. 
+ */ + void (*xmit)(struct xfrm_state *x, struct sk_buff *skb); + struct xfrm_state_afinfo *afinfo; struct module *owner; unsigned int encap; @@ -586,7 +621,6 @@ struct xfrm_migrate { struct xfrm_mgr { struct list_head list; - char *id; int (*notify)(struct xfrm_state *x, const struct km_event *c); int (*acquire)(struct xfrm_state *x, struct xfrm_tmpl *, struct xfrm_policy *xp); struct xfrm_policy *(*compile_policy)(struct sock *sk, int opt, u8 *data, int len, int *dir); @@ -817,12 +851,12 @@ static inline void xfrm_state_hold(struct xfrm_state *x) } static inline bool addr_match(const void *token1, const void *token2, - int prefixlen) + unsigned int prefixlen) { const __be32 *a1 = token1; const __be32 *a2 = token2; - int pdw; - int pbi; + unsigned int pdw; + unsigned int pbi; pdw = prefixlen >> 5; /* num of whole u32 in prefix */ pbi = prefixlen & 0x1f; /* num of bits in incomplete u32 in prefix */ @@ -846,9 +880,9 @@ static inline bool addr_match(const void *token1, const void *token2, static inline bool addr4_match(__be32 a1, __be32 a2, u8 prefixlen) { /* C99 6.5.7 (3): u32 << 32 is undefined behaviour */ - if (prefixlen == 0) + if (sizeof(long) == 4 && prefixlen == 0) return true; - return !((a1 ^ a2) & htonl(0xFFFFFFFFu << (32 - prefixlen))); + return !((a1 ^ a2) & htonl(~0UL << (32 - prefixlen))); } static __inline__ @@ -1533,6 +1567,7 @@ struct xfrmk_spdinfo { struct xfrm_state *xfrm_find_acq_byseq(struct net *net, u32 mark, u32 seq); int xfrm_state_delete(struct xfrm_state *x); int xfrm_state_flush(struct net *net, u8 proto, bool task_valid); +int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid); void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si); void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si); u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq); @@ -1615,6 +1650,11 @@ static inline int xfrm4_udp_encap_rcv(struct sock *sk, struct sk_buff *skb) } #endif +struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr, + int family); + struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp); void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type); @@ -1820,6 +1860,61 @@ static inline struct xfrm_offload *xfrm_offload(struct sk_buff *skb) } #endif +#ifdef CONFIG_XFRM_OFFLOAD +void __net_init xfrm_dev_init(void); +int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features); +int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, + struct xfrm_user_offload *xuo); +bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x); + +static inline void xfrm_dev_state_delete(struct xfrm_state *x) +{ + struct xfrm_state_offload *xso = &x->xso; + + if (xso->dev) + xso->dev->xfrmdev_ops->xdo_dev_state_delete(x); +} + +static inline void xfrm_dev_state_free(struct xfrm_state *x) +{ + struct xfrm_state_offload *xso = &x->xso; + struct net_device *dev = xso->dev; + + if (dev && dev->xfrmdev_ops) { + dev->xfrmdev_ops->xdo_dev_state_free(x); + xso->dev = NULL; + dev_put(dev); + } +} +#else +static inline void __net_init xfrm_dev_init(void) +{ +} + +static inline int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features) +{ + return 0; +} + +static inline int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo) +{ + return 0; +} + +static inline void xfrm_dev_state_delete(struct xfrm_state *x) +{ +} + +static inline void xfrm_dev_state_free(struct xfrm_state *x) +{ +} + 
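For the CONFIG_XFRM_OFFLOAD path above, a NIC driver advertises ESP offload by wiring up the xfrmdev_ops that xfrm_dev_state_delete()/xfrm_dev_state_free() dereference. A sketch of the driver side, with callback prototypes elided; xfrmdev_ops and NETIF_F_HW_ESP come from the netdevice.h changes elsewhere in this series:

static const struct xfrmdev_ops example_xfrmdev_ops = {
	.xdo_dev_state_add	= example_xdo_state_add,
	.xdo_dev_state_delete	= example_xdo_state_delete,
	.xdo_dev_state_free	= example_xdo_state_free,
	.xdo_dev_offload_ok	= example_xdo_offload_ok,
};

	/* in the driver's probe path: */
	netdev->xfrmdev_ops = &example_xfrmdev_ops;
	netdev->features |= NETIF_F_HW_ESP;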
+static inline bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) +{ + return false; +} +#endif + static inline int xfrm_mark_get(struct nlattr **attrs, struct xfrm_mark *m) { if (attrs[XFRMA_MARK]) diff --git a/include/rdma/ib.h b/include/rdma/ib.h index 9b4c22a36931884520b4b25e42d3b99c2bd8b733..66dbed0c146d290fb86bd0c14abcdb37945b530f 100644 --- a/include/rdma/ib.h +++ b/include/rdma/ib.h @@ -100,7 +100,7 @@ struct sockaddr_ib { */ static inline bool ib_safe_file_access(struct file *filp) { - return filp->f_cred == current_cred() && segment_eq(get_fs(), USER_DS); + return filp->f_cred == current_cred() && !uaccess_kernel(); } #endif /* _RDMA_IB_H */ diff --git a/include/rdma/ib_cm.h b/include/rdma/ib_cm.h index b49258b16f4ed5610cbb38b29d84f5c617afddb9..7979cb04f529449aaa6d825985e204c5cf7de660 100644 --- a/include/rdma/ib_cm.h +++ b/include/rdma/ib_cm.h @@ -117,8 +117,8 @@ struct ib_cm_req_event_param { u8 port; - struct ib_sa_path_rec *primary_path; - struct ib_sa_path_rec *alternate_path; + struct sa_path_rec *primary_path; + struct sa_path_rec *alternate_path; __be64 remote_ca_guid; u32 remote_qkey; @@ -197,7 +197,7 @@ struct ib_cm_mra_event_param { }; struct ib_cm_lap_event_param { - struct ib_sa_path_rec *alternate_path; + struct sa_path_rec *alternate_path; }; enum ib_cm_apr_status { @@ -363,8 +363,8 @@ struct ib_cm_id *ib_cm_insert_listen(struct ib_device *device, __be64 service_id); struct ib_cm_req_param { - struct ib_sa_path_rec *primary_path; - struct ib_sa_path_rec *alternate_path; + struct sa_path_rec *primary_path; + struct sa_path_rec *alternate_path; __be64 service_id; u32 qp_num; enum ib_qp_type qp_type; @@ -521,7 +521,7 @@ int ib_send_cm_mra(struct ib_cm_id *cm_id, * @private_data_len: Size of the private data buffer, in bytes. 
*/ int ib_send_cm_lap(struct ib_cm_id *cm_id, - struct ib_sa_path_rec *alternate_path, + struct sa_path_rec *alternate_path, const void *private_data, u8 private_data_len); @@ -565,7 +565,7 @@ int ib_send_cm_apr(struct ib_cm_id *cm_id, u8 private_data_len); struct ib_cm_sidr_req_param { - struct ib_sa_path_rec *path; + struct sa_path_rec *path; __be64 service_id; int timeout_ms; const void *private_data; diff --git a/include/rdma/ib_hdrs.h b/include/rdma/ib_hdrs.h index c755325f0831c56b7eca923707e2772c52aa08b9..5519f31f043a45b086e098407c4a15a2a7d02ca4 100644 --- a/include/rdma/ib_hdrs.h +++ b/include/rdma/ib_hdrs.h @@ -74,6 +74,12 @@ #define IB_GRH_FLOW_MASK 0xFFFFF #define IB_GRH_FLOW_SHIFT 0 #define IB_GRH_NEXT_HDR 0x1B +#define IB_FECN_SHIFT 31 +#define IB_FECN_MASK 1 +#define IB_FECN_SMASK BIT(IB_FECN_SHIFT) +#define IB_BECN_SHIFT 30 +#define IB_BECN_MASK 1 +#define IB_BECN_SMASK BIT(IB_BECN_SHIFT) #define IB_AETH_CREDIT_SHIFT 24 #define IB_AETH_CREDIT_MASK 0x1F @@ -181,4 +187,64 @@ static inline void put_ib_ateth_compare(u64 val, struct ib_atomic_eth *ateth) ib_u64_put(val, &ateth->compare_data); } +/* + * 9B/IB Packet Format + */ +#define IB_LNH_MASK 3 +#define IB_SC_MASK 0xf +#define IB_SC_SHIFT 12 +#define IB_SL_MASK 0xf +#define IB_SL_SHIFT 4 + +static inline u8 ib_get_lnh(struct ib_header *hdr) +{ + return (be16_to_cpu(hdr->lrh[0]) & IB_LNH_MASK); +} + +static inline u8 ib_get_sc(struct ib_header *hdr) +{ + return ((be16_to_cpu(hdr->lrh[0]) >> IB_SC_SHIFT) & IB_SC_MASK); +} + +static inline u8 ib_get_sl(struct ib_header *hdr) +{ + return ((be16_to_cpu(hdr->lrh[0]) >> IB_SL_SHIFT) & IB_SL_MASK); +} + +static inline u16 ib_get_dlid(struct ib_header *hdr) +{ + return (be16_to_cpu(hdr->lrh[1])); +} + +static inline u16 ib_get_slid(struct ib_header *hdr) +{ + return (be16_to_cpu(hdr->lrh[3])); +} + +/* + * BTH + */ +#define IB_BTH_OPCODE_MASK 0xff +#define IB_BTH_OPCODE_SHIFT 24 +#define IB_BTH_PAD_MASK 3 +#define IB_BTH_PKEY_MASK 0xffff +#define IB_BTH_PAD_SHIFT 20 + +static inline u8 ib_bth_get_pad(struct ib_other_headers *ohdr) +{ + return ((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_PAD_SHIFT) & + IB_BTH_PAD_MASK); +} + +static inline u16 ib_bth_get_pkey(struct ib_other_headers *ohdr) +{ + return (be32_to_cpu(ohdr->bth[0]) & IB_BTH_PKEY_MASK); +} + +static inline u8 ib_bth_get_opcode(struct ib_other_headers *ohdr) +{ + return ((be32_to_cpu(ohdr->bth[0]) >> IB_BTH_OPCODE_SHIFT) & + IB_BTH_OPCODE_MASK); +} + #endif /* IB_HDRS_H */ diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index 981214b3790cf4226d68bd5010aa87e14506be27..d67b11b7202924c335345d5e0c78d9cc99ea187b 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -262,6 +262,33 @@ struct ib_class_port_info { __be32 trap_qkey; }; +#define OPA_CLASS_PORT_INFO_PR_SUPPORT BIT(26) + +struct opa_class_port_info { + u8 base_version; + u8 class_version; + __be16 cap_mask; + __be32 cap_mask2_resp_time; + + u8 redirect_gid[16]; + __be32 redirect_tc_fl; + __be32 redirect_lid; + __be32 redirect_sl_qp; + __be32 redirect_qkey; + + u8 trap_gid[16]; + __be32 trap_tc_fl; + __be32 trap_lid; + __be32 trap_hl_qp; + __be32 trap_qkey; + + __be16 trap_pkey; + __be16 redirect_pkey; + + u8 trap_sl_rsvd; + u8 reserved[3]; +} __packed; + /** * ib_get_cpi_resp_time - Returns the resp_time value from * cap_mask2_resp_time in ib_class_port_info. 
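/*
 * Editor's note -- illustrative sketch, not part of this patch: the
 * LRH/BTH accessors added to ib_hdrs.h earlier in this patch centralize
 * the usual shift-and-mask header parsing, e.g. in a receive path:
 *
 *	u8  sl     = ib_get_sl(hdr);		// service level
 *	u16 dlid   = ib_get_dlid(hdr);		// destination LID
 *	u8  opcode = ib_bth_get_opcode(ohdr);	// BTH opcode
 *	u16 pkey   = ib_bth_get_pkey(ohdr);	// partition key
 */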
@@ -315,6 +342,17 @@ static inline void ib_set_cpi_capmask2(struct ib_class_port_info *cpi, IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE); } +/** + * opa_get_cpi_capmask2 - Returns the capmask2 value from + * cap_mask2_resp_time in opa_class_port_info. + * @cpi: A struct opa_class_port_info mad. + */ +static inline u32 opa_get_cpi_capmask2(struct opa_class_port_info *cpi) +{ + return (be32_to_cpu(cpi->cap_mask2_resp_time) >> + IB_CLASS_PORT_INFO_RESP_TIME_FIELD_SIZE); +} + struct ib_mad_notice_attr { u8 generic_type; u8 prod_type_msb; @@ -673,7 +711,7 @@ struct ib_mad_agent *ib_register_mad_snoop(struct ib_device *device, * After invoking this routine, MAD services are no longer usable by the * client on the associated QP. */ -int ib_unregister_mad_agent(struct ib_mad_agent *mad_agent); +void ib_unregister_mad_agent(struct ib_mad_agent *mad_agent); /** * ib_post_send_mad - Posts MAD(s) to the send queue of the QP associated diff --git a/include/rdma/ib_marshall.h b/include/rdma/ib_marshall.h index db037205c9e8cb4c5883ba0720f4b30105006d69..68cef3bd50fb97730df5112d4dc88a8c9923f46d 100644 --- a/include/rdma/ib_marshall.h +++ b/include/rdma/ib_marshall.h @@ -42,12 +42,12 @@ void ib_copy_qp_attr_to_user(struct ib_uverbs_qp_attr *dst, struct ib_qp_attr *src); void ib_copy_ah_attr_to_user(struct ib_uverbs_ah_attr *dst, - struct ib_ah_attr *src); + struct rdma_ah_attr *src); void ib_copy_path_rec_to_user(struct ib_user_path_rec *dst, - struct ib_sa_path_rec *src); + struct sa_path_rec *src); -void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst, +void ib_copy_path_rec_from_user(struct sa_path_rec *dst, struct ib_user_path_rec *src); #endif /* IB_USER_MARSHALL_H */ diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index b13419ce99ff5b52d666c2af7a91f58457f58100..36655899ee028d568257f2b2cc8764b136ab0e20 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -80,6 +80,8 @@ enum { IB_OPCODE_UD = 0x60, /* per IBTA 1.3 vol 1 Table 38, A10.3.2 */ IB_OPCODE_CNP = 0x80, + /* Manufacturer specific */ + IB_OPCODE_MSP = 0xe0, /* operations -- just used to define real constants */ IB_OPCODE_SEND_FIRST = 0x00, diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index fd0e53219f93e56b66643578c31f9172846a08c5..f5f70e345318151356e022ce46b623a357031920 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -43,6 +43,8 @@ #include #include +#include +#include enum { IB_SA_CLASS_VERSION = 2, /* IB spec version 1.1/1.2 */ @@ -56,6 +58,7 @@ enum { IB_SA_METHOD_GET_TRACE_TBL = 0x13 }; +#define OPA_SA_CLASS_VERSION 0x80 enum { IB_SA_ATTR_CLASS_PORTINFO = 0x01, IB_SA_ATTR_NOTICE = 0x02, @@ -147,13 +150,45 @@ enum ib_sa_mc_join_states { #define IB_SA_PATH_REC_PACKET_LIFE_TIME IB_SA_COMP_MASK(21) #define IB_SA_PATH_REC_PREFERENCE IB_SA_COMP_MASK(22) -struct ib_sa_path_rec { +enum sa_path_rec_type { + SA_PATH_REC_TYPE_IB, + SA_PATH_REC_TYPE_ROCE_V1, + SA_PATH_REC_TYPE_ROCE_V2, + SA_PATH_REC_TYPE_OPA +}; + +struct sa_path_rec_ib { __be64 service_id; - union ib_gid dgid; - union ib_gid sgid; __be16 dlid; __be16 slid; u8 raw_traffic; +}; + +struct sa_path_rec_roce { + u8 dmac[ETH_ALEN]; + /* ignored in IB */ + int ifindex; + /* ignored in IB */ + struct net *net; +}; + +struct sa_path_rec_opa { + __be64 service_id; + __be32 dlid; + __be32 slid; + u8 raw_traffic; + u8 l2_8B; + u8 l2_10B; + u8 l2_9B; + u8 l2_16B; + u8 qos_type; + u8 qos_priority; +}; + +struct sa_path_rec { + union ib_gid dgid; + union ib_gid sgid; /* reserved */ __be32 flow_label; u8 hop_limit; @@ -170,17 +205,109 @@ struct
ib_sa_path_rec { u8 packet_life_time_selector; u8 packet_life_time; u8 preference; - u8 dmac[ETH_ALEN]; - /* ignored in IB */ - int ifindex; - /* ignored in IB */ - struct net *net; - enum ib_gid_type gid_type; + union { + struct sa_path_rec_ib ib; + struct sa_path_rec_roce roce; + struct sa_path_rec_opa opa; + }; + enum sa_path_rec_type rec_type; }; -static inline struct net_device *ib_get_ndev_from_path(struct ib_sa_path_rec *rec) +static inline enum ib_gid_type + sa_conv_pathrec_to_gid_type(struct sa_path_rec *rec) +{ + switch (rec->rec_type) { + case SA_PATH_REC_TYPE_ROCE_V1: + return IB_GID_TYPE_ROCE; + case SA_PATH_REC_TYPE_ROCE_V2: + return IB_GID_TYPE_ROCE_UDP_ENCAP; + default: + return IB_GID_TYPE_IB; + } +} + +static inline enum sa_path_rec_type + sa_conv_gid_to_pathrec_type(enum ib_gid_type type) +{ + switch (type) { + case IB_GID_TYPE_ROCE: + return SA_PATH_REC_TYPE_ROCE_V1; + case IB_GID_TYPE_ROCE_UDP_ENCAP: + return SA_PATH_REC_TYPE_ROCE_V2; + default: + return SA_PATH_REC_TYPE_IB; + } +} + +static inline void path_conv_opa_to_ib(struct sa_path_rec *ib, + struct sa_path_rec *opa) +{ + if ((be32_to_cpu(opa->opa.dlid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE)) || + (be32_to_cpu(opa->opa.slid) >= + be16_to_cpu(IB_MULTICAST_LID_BASE))) { + /* Create OPA GID and zero out the LID */ + ib->dgid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(opa->opa.dlid)); + ib->dgid.global.subnet_prefix + = opa->dgid.global.subnet_prefix; + ib->sgid.global.interface_id + = OPA_MAKE_ID(be32_to_cpu(opa->opa.slid)); + ib->sgid.global.subnet_prefix + = opa->sgid.global.subnet_prefix; + ib->ib.dlid = 0; + ib->ib.slid = 0; + } else { + ib->ib.dlid = htons(ntohl(opa->opa.dlid)); + ib->ib.slid = htons(ntohl(opa->opa.slid)); + } + ib->ib.service_id = opa->opa.service_id; + ib->ib.raw_traffic = opa->opa.raw_traffic; +} + +static inline void path_conv_ib_to_opa(struct sa_path_rec *opa, + struct sa_path_rec *ib) { - return rec->net ?
dev_get_by_index(rec->net, rec->ifindex) : NULL; + __be32 slid, dlid; + + if ((ib_is_opa_gid(&ib->sgid)) || + (ib_is_opa_gid(&ib->dgid))) { + slid = htonl(opa_get_lid_from_gid(&ib->sgid)); + dlid = htonl(opa_get_lid_from_gid(&ib->dgid)); + } else { + slid = htonl(ntohs(ib->ib.slid)); + dlid = htonl(ntohs(ib->ib.dlid)); + } + opa->opa.slid = slid; + opa->opa.dlid = dlid; + opa->opa.service_id = ib->ib.service_id; + opa->opa.raw_traffic = ib->ib.raw_traffic; +} + +/* Convert from OPA to IB path record */ +static inline void sa_convert_path_opa_to_ib(struct sa_path_rec *dest, + struct sa_path_rec *src) +{ + if (src->rec_type != SA_PATH_REC_TYPE_OPA) + return; + + *dest = *src; + dest->rec_type = SA_PATH_REC_TYPE_IB; + path_conv_opa_to_ib(dest, src); +} + +/* Convert from IB to OPA path record */ +static inline void sa_convert_path_ib_to_opa(struct sa_path_rec *dest, + struct sa_path_rec *src) +{ + if (src->rec_type != SA_PATH_REC_TYPE_IB) + return; + + /* Do a structure copy and overwrite the relevant fields */ + *dest = *src; + dest->rec_type = SA_PATH_REC_TYPE_OPA; + path_conv_ib_to_opa(dest, src); } #define IB_SA_MCMEMBER_REC_MGID IB_SA_COMP_MASK( 0) @@ -322,11 +449,11 @@ void ib_sa_cancel_query(int id, struct ib_sa_query *query); int ib_sa_path_rec_get(struct ib_sa_client *client, struct ib_device *device, u8 port_num, - struct ib_sa_path_rec *rec, + struct sa_path_rec *rec, ib_sa_comp_mask comp_mask, int timeout_ms, gfp_t gfp_mask, void (*callback)(int status, - struct ib_sa_path_rec *resp, + struct sa_path_rec *resp, void *context), void *context, struct ib_sa_query **query); @@ -420,27 +547,27 @@ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, struct net_device *ndev, enum ib_gid_type gid_type, - struct ib_ah_attr *ah_attr); + struct rdma_ah_attr *ah_attr); /** * ib_init_ah_from_path - Initialize address handle attributes based on an SA * path record. */ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, - struct ib_sa_path_rec *rec, - struct ib_ah_attr *ah_attr); + struct sa_path_rec *rec, + struct rdma_ah_attr *ah_attr); /** * ib_sa_pack_path - Convert a path record from struct sa_path_rec * to IB MAD wire format. */ -void ib_sa_pack_path(struct ib_sa_path_rec *rec, void *attribute); +void ib_sa_pack_path(struct sa_path_rec *rec, void *attribute); /** * ib_sa_unpack_path - Convert a path record from MAD format to struct * sa_path_rec.
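 *
 * Editor's note -- illustrative, not part of this patch: a consumer
 * would typically unpack the wire attribute and normalize an OPA
 * record before use ("mad" is a hypothetical received MAD):
 *
 *	struct sa_path_rec opa_rec, ib_rec;
 *
 *	ib_sa_unpack_path(mad->data, &opa_rec);
 *	if (opa_rec.rec_type == SA_PATH_REC_TYPE_OPA)
 *		sa_convert_path_opa_to_ib(&ib_rec, &opa_rec);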
*/ -void ib_sa_unpack_path(void *attribute, struct ib_sa_path_rec *rec); +void ib_sa_unpack_path(void *attribute, struct sa_path_rec *rec); /* Support GuidInfoRecord */ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, @@ -454,14 +581,137 @@ int ib_sa_guid_info_rec_query(struct ib_sa_client *client, void *context, struct ib_sa_query **sa_query); -/* Support get SA ClassPortInfo */ -int ib_sa_classport_info_rec_query(struct ib_sa_client *client, - struct ib_device *device, u8 port_num, - int timeout_ms, gfp_t gfp_mask, - void (*callback)(int status, - struct ib_class_port_info *resp, - void *context), - void *context, - struct ib_sa_query **sa_query); +bool ib_sa_sendonly_fullmem_support(struct ib_sa_client *client, + struct ib_device *device, + u8 port_num); + +static inline bool sa_path_is_roce(struct sa_path_rec *rec) +{ + return ((rec->rec_type == SA_PATH_REC_TYPE_ROCE_V1) || + (rec->rec_type == SA_PATH_REC_TYPE_ROCE_V2)); +} + +static inline void sa_path_set_service_id(struct sa_path_rec *rec, + __be64 service_id) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + rec->ib.service_id = service_id; + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + rec->opa.service_id = service_id; +} + +static inline void sa_path_set_slid(struct sa_path_rec *rec, __be32 slid) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + rec->ib.slid = htons(ntohl(slid)); + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + rec->opa.slid = slid; +} + +static inline void sa_path_set_dlid(struct sa_path_rec *rec, __be32 dlid) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + rec->ib.dlid = htons(ntohl(dlid)); + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + rec->opa.dlid = dlid; +} + +static inline void sa_path_set_raw_traffic(struct sa_path_rec *rec, + u8 raw_traffic) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + rec->ib.raw_traffic = raw_traffic; + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + rec->opa.raw_traffic = raw_traffic; +} + +static inline __be64 sa_path_get_service_id(struct sa_path_rec *rec) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + return rec->ib.service_id; + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + return rec->opa.service_id; + return 0; +} + +static inline __be32 sa_path_get_slid(struct sa_path_rec *rec) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + return htonl(ntohs(rec->ib.slid)); + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + return rec->opa.slid; + return 0; +} + +static inline __be32 sa_path_get_dlid(struct sa_path_rec *rec) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + return htonl(ntohs(rec->ib.dlid)); + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + return rec->opa.dlid; + return 0; +} + +static inline u8 sa_path_get_raw_traffic(struct sa_path_rec *rec) +{ + if (rec->rec_type == SA_PATH_REC_TYPE_IB) + return rec->ib.raw_traffic; + else if (rec->rec_type == SA_PATH_REC_TYPE_OPA) + return rec->opa.raw_traffic; + return 0; +} + +static inline void sa_path_set_dmac(struct sa_path_rec *rec, u8 *dmac) +{ + if (sa_path_is_roce(rec)) + memcpy(rec->roce.dmac, dmac, ETH_ALEN); +} + +static inline void sa_path_set_dmac_zero(struct sa_path_rec *rec) +{ + if (sa_path_is_roce(rec)) + eth_zero_addr(rec->roce.dmac); +} + +static inline void sa_path_set_ifindex(struct sa_path_rec *rec, int ifindex) +{ + if (sa_path_is_roce(rec)) + rec->roce.ifindex = ifindex; +} + +static inline void sa_path_set_ndev(struct sa_path_rec *rec, struct net *net) +{ + if (sa_path_is_roce(rec)) + rec->roce.net = net; +} + +static inline u8 *sa_path_get_dmac(struct sa_path_rec *rec) +{ + 
if (sa_path_is_roce(rec)) + return rec->roce.dmac; + return NULL; +} + +static inline int sa_path_get_ifindex(struct sa_path_rec *rec) +{ + if (sa_path_is_roce(rec)) + return rec->roce.ifindex; + return 0; +} + +static inline struct net *sa_path_get_ndev(struct sa_path_rec *rec) +{ + if (sa_path_is_roce(rec)) + return rec->roce.net; + return NULL; +} + +static inline struct net_device *ib_get_ndev_from_path(struct sa_path_rec *rec) +{ + return sa_path_get_ndev(rec) ? + dev_get_by_index(sa_path_get_ndev(rec), + sa_path_get_ifindex(rec)) + : NULL; +} #endif /* IB_SA_H */ diff --git a/include/rdma/ib_umem.h b/include/rdma/ib_umem.h index 2d83cfd7e6ce20da3b1606cfe4335fada792fb51..23159dd5be184bc5db37f4d34a30653afe8b97fd 100644 --- a/include/rdma/ib_umem.h +++ b/include/rdma/ib_umem.h @@ -44,7 +44,7 @@ struct ib_umem { struct ib_ucontext *context; size_t length; unsigned long address; - int page_size; + int page_shift; int writable; int hugetlb; struct work_struct work; @@ -60,7 +60,7 @@ struct ib_umem { /* Returns the offset of the umem start relative to the first page. */ static inline int ib_umem_offset(struct ib_umem *umem) { - return umem->address & ((unsigned long)umem->page_size - 1); + return umem->address & (BIT(umem->page_shift) - 1); } /* Returns the first page of an ODP umem. */ @@ -72,12 +72,12 @@ static inline unsigned long ib_umem_start(struct ib_umem *umem) /* Returns the address of the page after the last one of an ODP umem. */ static inline unsigned long ib_umem_end(struct ib_umem *umem) { - return PAGE_ALIGN(umem->address + umem->length); + return ALIGN(umem->address + umem->length, BIT(umem->page_shift)); } static inline size_t ib_umem_num_pages(struct ib_umem *umem) { - return (ib_umem_end(umem) - ib_umem_start(umem)) >> PAGE_SHIFT; + return (ib_umem_end(umem) - ib_umem_start(umem)) >> umem->page_shift; } #ifdef CONFIG_INFINIBAND_USER_MEM diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 542cd8b3414c14f5fc65aa85006836fffe7ed42a..fb67554aabd6e0f80c34f8966b035cbb4efba029 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -84,7 +84,8 @@ struct ib_umem_odp { #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING -int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem); +int ib_umem_odp_get(struct ib_ucontext *context, struct ib_umem *umem, + int access); struct ib_umem *ib_alloc_odp_umem(struct ib_ucontext *context, unsigned long addr, size_t size); @@ -154,7 +155,8 @@ static inline int ib_umem_mmu_notifier_retry(struct ib_umem *item, #else /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ static inline int ib_umem_odp_get(struct ib_ucontext *context, - struct ib_umem *umem) + struct ib_umem *umem, + int access) { return -EINVAL; } diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 99e4423eb2b80b142024bed892ddc4a84ac5e576..ba8314ec576844ddb7148364617a768c81acd90f 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -55,12 +55,14 @@ #include #include #include +#include #include #include #include #include #include +#include extern struct workqueue_struct *ib_wq; extern struct workqueue_struct *ib_comp_wq; @@ -224,6 +226,7 @@ enum ib_device_cap_flags { IB_DEVICE_VIRTUAL_FUNCTION = (1ULL << 33), /* Deprecated. Please use IB_RAW_PACKET_CAP_SCATTER_FCS. 
*/ IB_DEVICE_RAW_SCATTER_FCS = (1ULL << 34), + IB_DEVICE_RDMA_NETDEV_OPA_VNIC = (1ULL << 35), }; enum ib_signature_prot_cap { @@ -431,7 +434,8 @@ enum ib_port_speed { IB_SPEED_QDR = 4, IB_SPEED_FDR10 = 8, IB_SPEED_FDR = 16, - IB_SPEED_EDR = 32 + IB_SPEED_EDR = 32, + IB_SPEED_HDR = 64 }; /** @@ -498,6 +502,7 @@ static inline struct rdma_hw_stats *rdma_alloc_hw_stats_struct( /* Address format 0x000FF000 */ #define RDMA_CORE_CAP_AF_IB 0x00001000 #define RDMA_CORE_CAP_ETH_AH 0x00002000 +#define RDMA_CORE_CAP_OPA_AH 0x00004000 /* Protocol 0xFFF00000 */ #define RDMA_CORE_CAP_PROT_IB 0x00100000 @@ -836,15 +841,38 @@ struct ib_mr_status { */ __attribute_const__ enum ib_rate mult_to_ib_rate(int mult); +enum rdma_ah_attr_type { + RDMA_AH_ATTR_TYPE_IB, + RDMA_AH_ATTR_TYPE_ROCE, + RDMA_AH_ATTR_TYPE_OPA, +}; + struct ib_ah_attr { - struct ib_global_route grh; u16 dlid; - u8 sl; u8 src_path_bits; +}; + +struct roce_ah_attr { + u8 dmac[ETH_ALEN]; +}; + +struct opa_ah_attr { + u32 dlid; + u8 src_path_bits; +}; + +struct rdma_ah_attr { + struct ib_global_route grh; + u8 sl; u8 static_rate; - u8 ah_flags; u8 port_num; - u8 dmac[ETH_ALEN]; + u8 ah_flags; + enum rdma_ah_attr_type type; + union { + struct ib_ah_attr ib; + struct roce_ah_attr roce; + struct opa_ah_attr opa; + }; }; enum ib_wc_status { @@ -1163,8 +1191,8 @@ struct ib_qp_attr { u32 dest_qp_num; int qp_access_flags; struct ib_qp_cap cap; - struct ib_ah_attr ah_attr; - struct ib_ah_attr alt_ah_attr; + struct rdma_ah_attr ah_attr; + struct rdma_ah_attr alt_ah_attr; u16 pkey_index; u16 alt_pkey_index; u8 en_sqd_async_notify; @@ -1336,6 +1364,7 @@ enum ib_access_flags { IB_ACCESS_MW_BIND = (1<<4), IB_ZERO_BASED = (1<<5), IB_ACCESS_ON_DEMAND = (1<<6), + IB_ACCESS_HUGETLB = (1<<7), }; /* @@ -1357,6 +1386,17 @@ struct ib_fmr_attr { struct ib_umem; +enum rdma_remove_reason { + /* Userspace requested uobject deletion. Call could fail */ + RDMA_REMOVE_DESTROY, + /* Context deletion. This call should delete the actual object itself */ + RDMA_REMOVE_CLOSE, + /* Driver is being hot-unplugged. 
This call should delete the actual object itself */ + RDMA_REMOVE_DRIVER_REMOVE, + /* Context is being cleaned-up, but commit was just completed */ + RDMA_REMOVE_DURING_CLEANUP, +}; + struct ib_rdmacg_object { #ifdef CONFIG_CGROUP_RDMA struct rdma_cgroup *cg; /* owner rdma cgroup */ @@ -1365,19 +1405,16 @@ struct ib_rdmacg_object { struct ib_ucontext { struct ib_device *device; - struct list_head pd_list; - struct list_head mr_list; - struct list_head mw_list; - struct list_head cq_list; - struct list_head qp_list; - struct list_head srq_list; - struct list_head ah_list; - struct list_head xrcd_list; - struct list_head rule_list; - struct list_head wq_list; - struct list_head rwq_ind_tbl_list; + struct ib_uverbs_file *ufile; int closing; + /* locking the uobjects_list */ + struct mutex uobjects_lock; + struct list_head uobjects; + /* protects cleanup process from other actions */ + struct rw_semaphore cleanup_rwsem; + enum rdma_remove_reason cleanup_reason; + struct pid *tgid; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING struct rb_root umem_tree; @@ -1407,9 +1444,16 @@ struct ib_uobject { struct ib_rdmacg_object cg_obj; /* rdmacg object */ int id; /* index into kernel idr */ struct kref ref; - struct rw_semaphore mutex; /* protects .live */ + atomic_t usecnt; /* protects exclusive access */ struct rcu_head rcu; /* kfree_rcu() overhead */ - int live; + + const struct uverbs_obj_type *type; +}; + +struct ib_uobject_file { + struct ib_uobject uobj; + /* ufile contains the lock between context release and file close */ + struct ib_uverbs_file *ufile; }; struct ib_udata { @@ -1447,6 +1491,7 @@ struct ib_ah { struct ib_device *device; struct ib_pd *pd; struct ib_uobject *uobject; + enum rdma_ah_attr_type type; }; typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); @@ -1662,6 +1707,7 @@ enum ib_flow_spec_type { IB_FLOW_SPEC_INNER = 0x100, /* Actions */ IB_FLOW_SPEC_ACTION_TAG = 0x1000, + IB_FLOW_SPEC_ACTION_DROP = 0x1001, }; #define IB_FLOW_SPEC_LAYER_MASK 0xF0 #define IB_FLOW_SPEC_SUPPORT_LAYERS 8 @@ -1790,6 +1836,11 @@ struct ib_flow_spec_action_tag { u32 tag_id; }; +struct ib_flow_spec_action_drop { + enum ib_flow_spec_type type; + u16 size; +}; + union ib_flow_spec { struct { u32 type; @@ -1802,6 +1853,7 @@ union ib_flow_spec { struct ib_flow_spec_ipv6 ipv6; struct ib_flow_spec_tunnel tunnel; struct ib_flow_spec_action_tag flow_tag; + struct ib_flow_spec_action_drop drop; }; struct ib_flow_attr { @@ -1838,8 +1890,6 @@ enum ib_mad_result { IB_MAD_RESULT_CONSUMED = 1 << 2 /* Packet consumed: stop processing */ }; -#define IB_DEVICE_NAME_MAX 64 - struct ib_port_cache { struct ib_pkey_cache *pkey; struct ib_gid_table *gid; @@ -1862,6 +1912,34 @@ struct ib_port_immutable { u32 max_mad_size; }; +/* rdma netdev type - specifies protocol type */ +enum rdma_netdev_t { + RDMA_NETDEV_OPA_VNIC, + RDMA_NETDEV_IPOIB, +}; + +/** + * struct rdma_netdev - rdma netdev + * For cases where netstack interfacing is required. 
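+ *
+ * Editor's note -- illustrative, not part of this patch: providers
+ * place this structure at the start of their netdev private area, so
+ * the core can always recover it and call back into the driver:
+ *
+ *	struct rdma_netdev *rn = netdev_priv(netdev);
+ *
+ *	rn->send(netdev, skb, ah, dqpn);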
+ */ +struct rdma_netdev { + void *clnt_priv; + struct ib_device *hca; + u8 port_num; + + /* control functions */ + void (*set_id)(struct net_device *netdev, int id); + /* send packet */ + int (*send)(struct net_device *dev, struct sk_buff *skb, + struct ib_ah *address, u32 dqpn); + /* multicast */ + int (*attach_mcast)(struct net_device *dev, struct ib_device *hca, + union ib_gid *gid, u16 mlid, + int set_qkey, u32 qkey); + int (*detach_mcast)(struct net_device *dev, struct ib_device *hca, + union ib_gid *gid, u16 mlid); +}; + struct ib_device { /* Do not access @dma_device directly from ULP nor from HW drivers. */ struct device *dma_device; @@ -1977,12 +2055,12 @@ struct ib_device { struct ib_udata *udata); int (*dealloc_pd)(struct ib_pd *pd); struct ib_ah * (*create_ah)(struct ib_pd *pd, - struct ib_ah_attr *ah_attr, + struct rdma_ah_attr *ah_attr, struct ib_udata *udata); int (*modify_ah)(struct ib_ah *ah, - struct ib_ah_attr *ah_attr); + struct rdma_ah_attr *ah_attr); int (*query_ah)(struct ib_ah *ah, - struct ib_ah_attr *ah_attr); + struct rdma_ah_attr *ah_attr); int (*destroy_ah)(struct ib_ah *ah); struct ib_srq * (*create_srq)(struct ib_pd *pd, struct ib_srq_init_attr *srq_init_attr, @@ -2115,6 +2193,20 @@ struct ib_device { struct ib_rwq_ind_table_init_attr *init_attr, struct ib_udata *udata); int (*destroy_rwq_ind_table)(struct ib_rwq_ind_table *wq_ind_table); + /** + * rdma netdev operations + * + * Driver implementing alloc_rdma_netdev must return -EOPNOTSUPP if it + * doesn't support the specified rdma netdev type. + */ + struct net_device *(*alloc_rdma_netdev)( + struct ib_device *device, + u8 port_num, + enum rdma_netdev_t type, + const char *name, + unsigned char name_assign_type, + void (*setup)(struct net_device *)); + void (*free_rdma_netdev)(struct net_device *netdev); struct module *owner; struct device dev; @@ -2536,6 +2628,21 @@ static inline bool rdma_cap_eth_ah(const struct ib_device *device, u8 port_num) return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_ETH_AH; } +/** + * rdma_cap_opa_ah - Check if the port of device supports + * OPA Address handles + * @device: Device to check + * @port_num: Port number to check + * + * Return: true if we are running on an OPA device which supports + * the extended OPA addressing. + */ +static inline bool rdma_cap_opa_ah(struct ib_device *device, u8 port_num) +{ + return (device->port_immutable[port_num].core_cap_flags & + RDMA_CORE_CAP_OPA_AH) == RDMA_CORE_CAP_OPA_AH; +} + /** * rdma_max_mad_size - Return the max MAD size required by this RDMA Port. * @@ -2636,14 +2743,14 @@ struct ib_pd *__ib_alloc_pd(struct ib_device *device, unsigned int flags, void ib_dealloc_pd(struct ib_pd *pd); /** - * ib_create_ah - Creates an address handle for the given address vector. + * rdma_create_ah - Creates an address handle for the given address vector. * @pd: The protection domain associated with the address handle. * @ah_attr: The attributes of the address vector. * * The address handle is used to reference a local or global destination * in all UD QP post sends. 
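 *
 * Editor's note -- illustrative, not part of this patch: with the new
 * rdma_ah_attr, callers pick the type first and then fill the union
 * through the accessors added at the end of this header:
 *
 *	struct rdma_ah_attr attr = {};
 *
 *	attr.type = rdma_ah_find_type(device, port_num);
 *	rdma_ah_set_dlid(&attr, dlid);
 *	rdma_ah_set_sl(&attr, sl);
 *	rdma_ah_set_port_num(&attr, port_num);
 *	ah = rdma_create_ah(pd, &attr);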
*/ -struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); +struct ib_ah *rdma_create_ah(struct ib_pd *pd, struct rdma_ah_attr *ah_attr); /** * ib_get_gids_from_rdma_hdr - Get sgid and dgid from GRH or IPv4 header @@ -2676,7 +2783,7 @@ int ib_get_rdma_header_version(const union rdma_network_hdr *hdr); */ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, const struct ib_wc *wc, const struct ib_grh *grh, - struct ib_ah_attr *ah_attr); + struct rdma_ah_attr *ah_attr); /** * ib_create_ah_from_wc - Creates an address handle associated with the @@ -2694,28 +2801,28 @@ struct ib_ah *ib_create_ah_from_wc(struct ib_pd *pd, const struct ib_wc *wc, const struct ib_grh *grh, u8 port_num); /** - * ib_modify_ah - Modifies the address vector associated with an address + * rdma_modify_ah - Modifies the address vector associated with an address * handle. * @ah: The address handle to modify. * @ah_attr: The new address vector attributes to associate with the * address handle. */ -int ib_modify_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); +int rdma_modify_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); /** - * ib_query_ah - Queries the address vector associated with an address + * rdma_query_ah - Queries the address vector associated with an address * handle. * @ah: The address handle to query. * @ah_attr: The address vector attributes associated with the address * handle. */ -int ib_query_ah(struct ib_ah *ah, struct ib_ah_attr *ah_attr); +int rdma_query_ah(struct ib_ah *ah, struct rdma_ah_attr *ah_attr); /** - * ib_destroy_ah - Destroys an address handle. + * rdma_destroy_ah - Destroys an address handle. * @ah: The address handle to destroy. */ -int ib_destroy_ah(struct ib_ah *ah); +int rdma_destroy_ah(struct ib_ah *ah); /** * ib_create_srq - Creates a SRQ associated with the specified protection @@ -3380,5 +3487,156 @@ void ib_drain_sq(struct ib_qp *qp); void ib_drain_qp(struct ib_qp *qp); int ib_resolve_eth_dmac(struct ib_device *device, - struct ib_ah_attr *ah_attr); + struct rdma_ah_attr *ah_attr); + +static inline u8 *rdma_ah_retrieve_dmac(struct rdma_ah_attr *attr) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_ROCE) + return attr->roce.dmac; + return NULL; +} + +static inline void rdma_ah_set_dlid(struct rdma_ah_attr *attr, u32 dlid) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_IB) + attr->ib.dlid = (u16)dlid; + else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + attr->opa.dlid = dlid; +} + +static inline u32 rdma_ah_get_dlid(const struct rdma_ah_attr *attr) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_IB) + return attr->ib.dlid; + else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + return attr->opa.dlid; + return 0; +} + +static inline void rdma_ah_set_sl(struct rdma_ah_attr *attr, u8 sl) +{ + attr->sl = sl; +} + +static inline u8 rdma_ah_get_sl(const struct rdma_ah_attr *attr) +{ + return attr->sl; +} + +static inline void rdma_ah_set_path_bits(struct rdma_ah_attr *attr, + u8 src_path_bits) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_IB) + attr->ib.src_path_bits = src_path_bits; + else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + attr->opa.src_path_bits = src_path_bits; +} + +static inline u8 rdma_ah_get_path_bits(const struct rdma_ah_attr *attr) +{ + if (attr->type == RDMA_AH_ATTR_TYPE_IB) + return attr->ib.src_path_bits; + else if (attr->type == RDMA_AH_ATTR_TYPE_OPA) + return attr->opa.src_path_bits; + return 0; +} + +static inline void rdma_ah_set_port_num(struct rdma_ah_attr *attr, u8 port_num) +{ + attr->port_num = port_num; +} + +static inline u8 rdma_ah_get_port_num(const 
struct rdma_ah_attr *attr) +{ + return attr->port_num; +} + +static inline void rdma_ah_set_static_rate(struct rdma_ah_attr *attr, + u8 static_rate) +{ + attr->static_rate = static_rate; +} + +static inline u8 rdma_ah_get_static_rate(const struct rdma_ah_attr *attr) +{ + return attr->static_rate; +} + +static inline void rdma_ah_set_ah_flags(struct rdma_ah_attr *attr, + enum ib_ah_flags flag) +{ + attr->ah_flags = flag; +} + +static inline enum ib_ah_flags + rdma_ah_get_ah_flags(const struct rdma_ah_attr *attr) +{ + return attr->ah_flags; +} + +static inline const struct ib_global_route + *rdma_ah_read_grh(const struct rdma_ah_attr *attr) +{ + return &attr->grh; +} + +/*To retrieve and modify the grh */ +static inline struct ib_global_route + *rdma_ah_retrieve_grh(struct rdma_ah_attr *attr) +{ + return &attr->grh; +} + +static inline void rdma_ah_set_dgid_raw(struct rdma_ah_attr *attr, void *dgid) +{ + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + memcpy(grh->dgid.raw, dgid, sizeof(grh->dgid)); +} + +static inline void rdma_ah_set_subnet_prefix(struct rdma_ah_attr *attr, + __be64 prefix) +{ + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + grh->dgid.global.subnet_prefix = prefix; +} + +static inline void rdma_ah_set_interface_id(struct rdma_ah_attr *attr, + __be64 if_id) +{ + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + grh->dgid.global.interface_id = if_id; +} + +static inline void rdma_ah_set_grh(struct rdma_ah_attr *attr, + union ib_gid *dgid, u32 flow_label, + u8 sgid_index, u8 hop_limit, + u8 traffic_class) +{ + struct ib_global_route *grh = rdma_ah_retrieve_grh(attr); + + attr->ah_flags = IB_AH_GRH; + if (dgid) + grh->dgid = *dgid; + grh->flow_label = flow_label; + grh->sgid_index = sgid_index; + grh->hop_limit = hop_limit; + grh->traffic_class = traffic_class; +} + +/*Get AH type */ +static inline enum rdma_ah_attr_type rdma_ah_find_type(struct ib_device *dev, + u32 port_num) +{ + if ((rdma_protocol_roce(dev, port_num)) || + (rdma_protocol_iwarp(dev, port_num))) + return RDMA_AH_ATTR_TYPE_ROCE; + else if ((rdma_protocol_ib(dev, port_num)) && + (rdma_cap_opa_ah(dev, port_num))) + return RDMA_AH_ATTR_TYPE_OPA; + else + return RDMA_AH_ATTR_TYPE_IB; +} #endif /* IB_VERBS_H */ diff --git a/include/rdma/opa_addr.h b/include/rdma/opa_addr.h new file mode 100644 index 0000000000000000000000000000000000000000..eace28f1555d958a896677977232b814c4642568 --- /dev/null +++ b/include/rdma/opa_addr.h @@ -0,0 +1,79 @@ +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. 
+ * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +#ifndef OPA_ADDR_H +#define OPA_ADDR_H + +#define OPA_SPECIAL_OUI (0x00066AULL) +#define OPA_MAKE_ID(x) (cpu_to_be64(OPA_SPECIAL_OUI << 40 | (x))) + +/** + * ib_is_opa_gid: Returns true if the top 24 bits of the gid + * contain the OPA_SPECIAL_OUI identifier. This identifies that + * the provided gid is a special purpose GID meant to carry + * extended LID information. + * + * @gid: The Global identifier + */ +static inline bool ib_is_opa_gid(union ib_gid *gid) +{ + return ((be64_to_cpu(gid->global.interface_id) >> 40) == + OPA_SPECIAL_OUI); +} + +/** + * opa_get_lid_from_gid: Returns the last 32 bits of the gid. + * OPA devices use one of the gids in the gid table to also + * store the lid. + * + * @gid: The Global identifier + */ +static inline u32 opa_get_lid_from_gid(union ib_gid *gid) +{ + return be64_to_cpu(gid->global.interface_id) & 0xFFFFFFFF; +} +#endif /* OPA_ADDR_H */ diff --git a/include/rdma/opa_port_info.h b/include/rdma/opa_port_info.h index 9303e0e4f508d8c524a34b979da7a359cb15b30c..b4f0ac02f28315afba4b7cb031858d35da887f49 100644 --- a/include/rdma/opa_port_info.h +++ b/include/rdma/opa_port_info.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014 Intel Corporation. All rights reserved. + * Copyright (c) 2014-2017 Intel Corporation. All rights reserved. * * This software is available to you under a choice of one of two * licenses. You may choose to be licensed under the terms of the GNU @@ -127,6 +127,7 @@ #define OPA_LINK_WIDTH_3X 0x0004 #define OPA_LINK_WIDTH_4X 0x0008 +#define OPA_CAP_MASK3_IsEthOnFabricSupported (1 << 13) #define OPA_CAP_MASK3_IsSnoopSupported (1 << 7) #define OPA_CAP_MASK3_IsAsyncSC2VLSupported (1 << 6) #define OPA_CAP_MASK3_IsAddrRangeConfigSupported (1 << 5) diff --git a/include/rdma/opa_vnic.h b/include/rdma/opa_vnic.h new file mode 100644 index 0000000000000000000000000000000000000000..39d6890616a6bf8c93ff98242c0cbabd532845ef --- /dev/null +++ b/include/rdma/opa_vnic.h @@ -0,0 +1,141 @@ +#ifndef _OPA_VNIC_H +#define _OPA_VNIC_H +/* + * Copyright(c) 2017 Intel Corporation. + * + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license.
+ * + * GPL LICENSE SUMMARY + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * - Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * - Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * - Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + */ + +/* + * This file contains Intel Omni-Path (OPA) Virtual Network Interface + * Controller (VNIC) specific declarations. 
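+ *
+ * Editor's note -- illustrative, not part of this patch: the length
+ * macros below describe the 16B encapsulation; a transmit path would
+ * account for it when sizing buffers, roughly:
+ *
+ *	skb = netdev_alloc_skb(netdev, len + OPA_VNIC_HDR_LEN);
+ *	if (skb)
+ *		skb_reserve(skb, OPA_VNIC_HDR_LEN);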
+ */ + +#include + +/* VNIC uses 16B header format */ +#define OPA_VNIC_L2_TYPE 0x2 + +/* 16 header bytes + 2 reserved bytes */ +#define OPA_VNIC_L2_HDR_LEN (16 + 2) + +#define OPA_VNIC_L4_HDR_LEN 2 + +#define OPA_VNIC_HDR_LEN (OPA_VNIC_L2_HDR_LEN + \ + OPA_VNIC_L4_HDR_LEN) + +#define OPA_VNIC_L4_ETHR 0x78 + +#define OPA_VNIC_ICRC_LEN 4 +#define OPA_VNIC_TAIL_LEN 1 +#define OPA_VNIC_ICRC_TAIL_LEN (OPA_VNIC_ICRC_LEN + OPA_VNIC_TAIL_LEN) + +#define OPA_VNIC_SKB_MDATA_LEN 4 +#define OPA_VNIC_SKB_MDATA_ENCAP_ERR 0x1 + +/* opa vnic rdma netdev's private data structure */ +struct opa_vnic_rdma_netdev { + struct rdma_netdev rn; /* keep this first */ + /* followed by device private data */ + char *dev_priv[0]; +}; + +static inline void *opa_vnic_priv(const struct net_device *dev) +{ + struct rdma_netdev *rn = netdev_priv(dev); + + return rn->clnt_priv; +} + +static inline void *opa_vnic_dev_priv(const struct net_device *dev) +{ + struct opa_vnic_rdma_netdev *oparn = netdev_priv(dev); + + return oparn->dev_priv; +} + +/* opa_vnic skb metadata structure */ +struct opa_vnic_skb_mdata { + u8 vl; + u8 entropy; + u8 flags; + u8 rsvd; +} __packed; + +/* OPA VNIC group statistics */ +struct opa_vnic_grp_stats { + u64 unicast; + u64 mcastbcast; + u64 untagged; + u64 vlan; + u64 s_64; + u64 s_65_127; + u64 s_128_255; + u64 s_256_511; + u64 s_512_1023; + u64 s_1024_1518; + u64 s_1519_max; +}; + +struct opa_vnic_stats { + /* standard netdev statistics */ + struct rtnl_link_stats64 netstats; + + /* OPA VNIC statistics */ + struct opa_vnic_grp_stats tx_grp; + struct opa_vnic_grp_stats rx_grp; + u64 tx_dlid_zero; + u64 tx_drop_state; + u64 rx_drop_state; + u64 rx_runt; + u64 rx_oversize; +}; + +static inline bool rdma_cap_opa_vnic(struct ib_device *device) +{ + return !!(device->attrs.device_cap_flags & + IB_DEVICE_RDMA_NETDEV_OPA_VNIC); +} + +#endif /* _OPA_VNIC_H */ diff --git a/include/rdma/rdma_cm.h b/include/rdma/rdma_cm.h index d3968b561f868db47c9c98ed72ca96d44303bb16..3d2eed3c4e7512b6f6d1592d296e7187ba183404 100644 --- a/include/rdma/rdma_cm.h +++ b/include/rdma/rdma_cm.h @@ -85,7 +85,7 @@ struct rdma_addr { struct rdma_route { struct rdma_addr addr; - struct ib_sa_path_rec *path_rec; + struct sa_path_rec *path_rec; int num_paths; }; @@ -106,7 +106,7 @@ struct rdma_conn_param { struct rdma_ud_param { const void *private_data; u8 private_data_len; - struct ib_ah_attr ah_attr; + struct rdma_ah_attr ah_attr; u32 qp_num; u32 qkey; }; diff --git a/include/rdma/rdma_cm_ib.h b/include/rdma/rdma_cm_ib.h index 2389c3b4540498718860ef04dd5297498257b09a..6947a6ba25579f06c5de1357c6d0a094cbb58d41 100644 --- a/include/rdma/rdma_cm_ib.h +++ b/include/rdma/rdma_cm_ib.h @@ -46,7 +46,7 @@ * connection and replaces the call to rdma_resolve_route. */ int rdma_set_ib_paths(struct rdma_cm_id *id, - struct sa_path_rec *path_rec, int num_paths); /* Global qkey for UDP QPs and multicast groups.
*/ #define RDMA_UDP_QKEY 0x01234567 diff --git a/include/rdma/rdma_vt.h b/include/rdma/rdma_vt.h index 8fc1ca7b6f2329fd38f27577de827949a9697df7..4878aaf7bdffd871515bc826470ec77886a1c02c 100644 --- a/include/rdma/rdma_vt.h +++ b/include/rdma/rdma_vt.h @@ -170,7 +170,7 @@ struct rvt_pd { /* Address handle */ struct rvt_ah { struct ib_ah ibah; - struct ib_ah_attr attr; + struct rdma_ah_attr attr; atomic_t refcount; u8 vl; u8 log_pmtu; @@ -311,10 +311,10 @@ struct rvt_driver_provided { unsigned (*free_all_qps)(struct rvt_dev_info *rdi); /* Driver specific AH validation */ - int (*check_ah)(struct ib_device *, struct ib_ah_attr *); + int (*check_ah)(struct ib_device *, struct rdma_ah_attr *); /* Inform the driver a new AH has been created */ - void (*notify_new_ah)(struct ib_device *, struct ib_ah_attr *, + void (*notify_new_ah)(struct ib_device *, struct rdma_ah_attr *, struct rvt_ah *); /* Let the driver pick the next queue pair number*/ @@ -506,7 +506,7 @@ struct rvt_dev_info *rvt_alloc_device(size_t size, int nports); void rvt_dealloc_device(struct rvt_dev_info *rdi); int rvt_register_device(struct rvt_dev_info *rvd); void rvt_unregister_device(struct rvt_dev_info *rvd); -int rvt_check_ah(struct ib_device *ibdev, struct ib_ah_attr *ah_attr); +int rvt_check_ah(struct ib_device *ibdev, struct rdma_ah_attr *ah_attr); int rvt_init_port(struct rvt_dev_info *rdi, struct rvt_ibport *port, int port_index, u16 *pkey_table); int rvt_fast_reg_mr(struct rvt_qp *qp, struct ib_mr *ibmr, u32 key, @@ -516,6 +516,7 @@ int rvt_rkey_ok(struct rvt_qp *qp, struct rvt_sge *sge, u32 len, u64 vaddr, u32 rkey, int acc); int rvt_lkey_ok(struct rvt_lkey_table *rkt, struct rvt_pd *pd, struct rvt_sge *isge, struct ib_sge *sge, int acc); -struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid); +struct rvt_mcast *rvt_mcast_find(struct rvt_ibport *ibp, union ib_gid *mgid, + u16 lid); #endif /* DEF_RDMA_VT_H */ diff --git a/include/rdma/rdmavt_qp.h b/include/rdma/rdmavt_qp.h index f3816396c76acbc9646cd648b8ed2c493d98bdcf..be6472e5b06bd1ed1117e928e9b3d44748dfbfeb 100644 --- a/include/rdma/rdmavt_qp.h +++ b/include/rdma/rdmavt_qp.h @@ -2,7 +2,7 @@ #define DEF_RDMAVT_INCQP_H /* - * Copyright(c) 2016 Intel Corporation. + * Copyright(c) 2016, 2017 Intel Corporation. * * This file is provided under a dual BSD/GPLv2 license. When using or * redistributing this file, you may do so under either license. 
@@ -269,8 +269,8 @@ struct rvt_qp { struct ib_qp ibqp; void *priv; /* Driver private data */ /* read mostly fields above and below */ - struct ib_ah_attr remote_ah_attr; - struct ib_ah_attr alt_ah_attr; + struct rdma_ah_attr remote_ah_attr; + struct rdma_ah_attr alt_ah_attr; struct rvt_qp __rcu *next; /* linked list for QPN hash table */ struct rvt_swqe *s_wq; /* send work queue */ struct rvt_mmap_info *ip; @@ -324,6 +324,7 @@ struct rvt_qp { u8 r_state; /* opcode of last packet received */ u8 r_flags; u8 r_head_ack_queue; /* index into s_ack_queue[] */ + u8 r_adefered; /* deferred ack count */ struct list_head rspwait; /* link for waiting to respond */ @@ -435,9 +436,14 @@ struct rvt_mcast_qp { struct rvt_qp *qp; }; +struct rvt_mcast_addr { + union ib_gid mgid; + u16 lid; +}; + struct rvt_mcast { struct rb_node rb_node; - union ib_gid mgid; + struct rvt_mcast_addr mcast_addr; struct list_head qp_list; wait_queue_head_t wait; atomic_t refcount; @@ -526,7 +532,6 @@ static inline void rvt_qp_wqe_reserve( struct rvt_qp *qp, struct rvt_swqe *wqe) { - wqe->wr.send_flags |= RVT_SEND_RESERVE_USED; atomic_inc(&qp->s_reserved_used); } @@ -550,7 +555,6 @@ static inline void rvt_qp_wqe_unreserve( struct rvt_swqe *wqe) { if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) { - wqe->wr.send_flags &= ~RVT_SEND_RESERVE_USED; atomic_dec(&qp->s_reserved_used); /* ensure no compiler reordering up to s_last change */ smp_mb__after_atomic(); @@ -574,6 +578,7 @@ extern const enum ib_wc_opcode ib_rvt_wc_opcode[]; static inline void rvt_qp_swqe_complete( struct rvt_qp *qp, struct rvt_swqe *wqe, + enum ib_wc_opcode opcode, enum ib_wc_status status) { if (unlikely(wqe->wr.send_flags & RVT_SEND_RESERVE_USED)) @@ -586,7 +591,7 @@ static inline void rvt_qp_swqe_complete( memset(&wc, 0, sizeof(wc)); wc.wr_id = wqe->wr.wr_id; wc.status = status; - wc.opcode = ib_rvt_wc_opcode[wqe->wr.opcode]; + wc.opcode = opcode; wc.qp = &qp->ibqp; wc.byte_len = wqe->length; rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.send_cq), &wc, diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h new file mode 100644 index 0000000000000000000000000000000000000000..7771ce9669521912ddfcc611bf01dad5381dc58c --- /dev/null +++ b/include/rdma/uverbs_std_types.h @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _UVERBS_STD_TYPES__ +#define _UVERBS_STD_TYPES__ + +#include + +extern const struct uverbs_obj_fd_type uverbs_type_attrs_comp_channel; +extern const struct uverbs_obj_idr_type uverbs_type_attrs_cq; +extern const struct uverbs_obj_idr_type uverbs_type_attrs_qp; +extern const struct uverbs_obj_idr_type uverbs_type_attrs_rwq_ind_table; +extern const struct uverbs_obj_idr_type uverbs_type_attrs_wq; +extern const struct uverbs_obj_idr_type uverbs_type_attrs_srq; +extern const struct uverbs_obj_idr_type uverbs_type_attrs_ah; +extern const struct uverbs_obj_idr_type uverbs_type_attrs_flow; +extern const struct uverbs_obj_idr_type uverbs_type_attrs_mr; +extern const struct uverbs_obj_idr_type uverbs_type_attrs_mw; +extern const struct uverbs_obj_idr_type uverbs_type_attrs_pd; +extern const struct uverbs_obj_idr_type uverbs_type_attrs_xrcd; + +static inline struct ib_uobject *__uobj_get(const struct uverbs_obj_type *type, + bool write, + struct ib_ucontext *ucontext, + int id) +{ + return rdma_lookup_get_uobject(type, ucontext, id, write); +} + +#define uobj_get_type(_type) uverbs_type_attrs_##_type.type + +#define uobj_get_read(_type, _id, _ucontext) \ + __uobj_get(&(_type), false, _ucontext, _id) + +#define uobj_get_obj_read(_type, _id, _ucontext) \ +({ \ + struct ib_uobject *uobj = \ + __uobj_get(&uobj_get_type(_type), \ + false, _ucontext, _id); \ + \ + (struct ib_##_type *)(IS_ERR(uobj) ? NULL : uobj->object); \ +}) + +#define uobj_get_write(_type, _id, _ucontext) \ + __uobj_get(&(_type), true, _ucontext, _id) + +static inline void uobj_put_read(struct ib_uobject *uobj) +{ + rdma_lookup_put_uobject(uobj, false); +} + +#define uobj_put_obj_read(_obj) \ + uobj_put_read((_obj)->uobject) + +static inline void uobj_put_write(struct ib_uobject *uobj) +{ + rdma_lookup_put_uobject(uobj, true); +} + +static inline int __must_check uobj_remove_commit(struct ib_uobject *uobj) +{ + return rdma_remove_commit_uobject(uobj); +} + +static inline void uobj_alloc_commit(struct ib_uobject *uobj) +{ + rdma_alloc_commit_uobject(uobj); +} + +static inline void uobj_alloc_abort(struct ib_uobject *uobj) +{ + rdma_alloc_abort_uobject(uobj); +} + +static inline struct ib_uobject *__uobj_alloc(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext) +{ + return rdma_alloc_begin_uobject(type, ucontext); +} + +#define uobj_alloc(_type, ucontext) \ + __uobj_alloc(&(_type), ucontext) + +#endif + diff --git a/include/rdma/uverbs_types.h b/include/rdma/uverbs_types.h new file mode 100644 index 0000000000000000000000000000000000000000..351ea185df44fee1af734ae6cad770f1a5ba6c6d --- /dev/null +++ b/include/rdma/uverbs_types.h @@ -0,0 +1,172 @@ +/* + * Copyright (c) 2017, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _UVERBS_TYPES_ +#define _UVERBS_TYPES_ + +#include +#include + +struct uverbs_obj_type; + +struct uverbs_obj_type_class { + /* + * Get an ib_uobject that corresponds to the given id from ucontext. + * These functions could create or destroy objects if required. + * The action will be finalized only when commit, abort or put fops are + * called. + * The flow of the different actions is: + * [alloc]: Starts with alloc_begin. The handler's logic is then + * executed. If the handler is successful, alloc_commit + * is called and the object is inserted into the repository. + * Once alloc_commit completes the object is visible to + * other threads and userspace. + * Otherwise, alloc_abort is called and the object is + * destroyed. + * [lookup]: Starts with lookup_get which fetches and locks the + * object. After the handler finished using the object, it + * needs to call lookup_put to unlock it. The exclusive + * flag indicates if the object is locked for exclusive + * access. + * [remove]: Starts with lookup_get with exclusive flag set. This + * locks the object for exclusive access. If the handler + * code completed successfully, remove_commit is called + * and the ib_uobject is removed from the context's + * uobjects repository and put. The object itself is + * destroyed as well. Once remove succeeds new krefs to + * the object cannot be acquired by other threads or + * userspace and the hardware driver is removed from the + * object. Other krefs on the object may still exist. + * If the handler code failed, lookup_put should be + * called. This callback is used when the context + * is destroyed as well (process termination, + * reset flow). + */ + struct ib_uobject *(*alloc_begin)(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext); + void (*alloc_commit)(struct ib_uobject *uobj); + void (*alloc_abort)(struct ib_uobject *uobj); + + struct ib_uobject *(*lookup_get)(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext, int id, + bool exclusive); + void (*lookup_put)(struct ib_uobject *uobj, bool exclusive); + /* + * Must be called with the exclusive lock held. If successful uobj is + * invalid on return.
On failure the uobject is left completely + * unchanged. */ + int __must_check (*remove_commit)(struct ib_uobject *uobj, + enum rdma_remove_reason why); + u8 needs_kfree_rcu; +}; + +struct uverbs_obj_type { + const struct uverbs_obj_type_class * const type_class; + size_t obj_size; + unsigned int destroy_order; +}; + +/* + * Object type classes which support a detach state (object is still alive but + * it's not attached to any context) need to make sure: + * (a) no call through to a driver after a detach is called + * (b) detach isn't called concurrently with context_cleanup + */ + +struct uverbs_obj_idr_type { + /* + * In idr based objects, uverbs_obj_type_class points to the generic + * idr operations. In order to specialize the underlying types (e.g. CQ, + * QPs, etc.), we add destroy_object specific callbacks. + */ + struct uverbs_obj_type type; + + /* Free driver resources from the uobject, make the driver uncallable, + * and move the uobject to the detached state. If the object was + * destroyed by the user's request, a failure should leave the uobject + * completely unchanged. + */ + int __must_check (*destroy_object)(struct ib_uobject *uobj, + enum rdma_remove_reason why); +}; + +struct ib_uobject *rdma_lookup_get_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext, + int id, bool exclusive); +void rdma_lookup_put_uobject(struct ib_uobject *uobj, bool exclusive); +struct ib_uobject *rdma_alloc_begin_uobject(const struct uverbs_obj_type *type, + struct ib_ucontext *ucontext); +void rdma_alloc_abort_uobject(struct ib_uobject *uobj); +int __must_check rdma_remove_commit_uobject(struct ib_uobject *uobj); +int rdma_alloc_commit_uobject(struct ib_uobject *uobj); + +struct uverbs_obj_fd_type { + /* + * In fd based objects, uverbs_obj_type_class points to the generic + * fd operations. In order to specialize the underlying types (e.g. + * completion_channel), we use fops, name and flags for fd creation. + * context_closed is called when the context is closed either when + * the driver is removed or the process terminates.
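+ *
+ * Editor's note -- illustrative sketch, not part of this patch; the
+ * handler and fops names below are hypothetical:
+ *
+ *	const struct uverbs_obj_fd_type uverbs_type_attrs_comp_channel = {
+ *		.type = UVERBS_TYPE_ALLOC_FD(sizeof(struct ib_uverbs_completion_event_file), 0),
+ *		.context_closed = hot_unplug_comp_channel,
+ *		.fops = &uverbs_event_fops,
+ *		.name = "[infinibandevent]",
+ *		.flags = O_RDONLY,
+ *	};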
+ */ + struct uverbs_obj_type type; + int (*context_closed)(struct ib_uobject_file *uobj_file, + enum rdma_remove_reason why); + const struct file_operations *fops; + const char *name; + int flags; +}; + +extern const struct uverbs_obj_type_class uverbs_idr_class; +extern const struct uverbs_obj_type_class uverbs_fd_class; + +#define UVERBS_BUILD_BUG_ON(cond) (sizeof(char[1 - 2 * !!(cond)]) - \ + sizeof(char)) +#define UVERBS_TYPE_ALLOC_FD(_size, _order) \ + { \ + .destroy_order = _order, \ + .type_class = &uverbs_fd_class, \ + .obj_size = (_size) + \ + UVERBS_BUILD_BUG_ON((_size) < \ + sizeof(struct ib_uobject_file)),\ + } +#define UVERBS_TYPE_ALLOC_IDR_SZ(_size, _order) \ + { \ + .destroy_order = _order, \ + .type_class = &uverbs_idr_class, \ + .obj_size = (_size) + \ + UVERBS_BUILD_BUG_ON((_size) < \ + sizeof(struct ib_uobject)), \ + } +#define UVERBS_TYPE_ALLOC_IDR(_order) \ + UVERBS_TYPE_ALLOC_IDR_SZ(sizeof(struct ib_uobject), _order) +#endif diff --git a/include/scsi/fc/Kbuild b/include/scsi/fc/Kbuild deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/include/scsi/libfc.h b/include/scsi/libfc.h index da5033dd8cbcab5bff1557452b7a55e888d46c30..2109844be53d132cf13e6b12b5d6ad35cefa10b0 100644 --- a/include/scsi/libfc.h +++ b/include/scsi/libfc.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -321,7 +322,7 @@ struct fc_seq_els_data { */ struct fc_fcp_pkt { spinlock_t scsi_pkt_lock; - atomic_t ref_cnt; + refcount_t ref_cnt; /* SCSI command and data transfer information */ u32 data_len; diff --git a/include/scsi/libiscsi.h b/include/scsi/libiscsi.h index 583875ea136ab228ea14a727581c384f50527211..c9bd935f4fd1ca1ff7cd2ab824b1d0cd0441a143 100644 --- a/include/scsi/libiscsi.h +++ b/include/scsi/libiscsi.h @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -139,7 +140,7 @@ struct iscsi_task { /* state set/tested under session->lock */ int state; - atomic_t refcount; + refcount_t refcount; struct list_head running; /* running cmd list */ void *dd_data; /* driver/transport data */ }; diff --git a/include/scsi/libsas.h b/include/scsi/libsas.h index dae99d7d2bc05a265faddb5a4f953d34f99e4ef1..dd0f72c95abe19bd09bcc7fbd34fb555f3b84e98 100644 --- a/include/scsi/libsas.h +++ b/include/scsi/libsas.h @@ -693,7 +693,6 @@ extern int sas_bios_param(struct scsi_device *, sector_t capacity, int *hsc); extern struct scsi_transport_template * sas_domain_attach_transport(struct sas_domain_function_template *); -extern void sas_domain_release_transport(struct scsi_transport_template *); int sas_discover_root_expander(struct domain_device *); diff --git a/include/scsi/scsi_device.h b/include/scsi/scsi_device.h index 080c7ce9bae8892a43838043d986282a1385283a..05641aebd1811f5141b2bdf63a0f1d82cdace9c1 100644 --- a/include/scsi/scsi_device.h +++ b/include/scsi/scsi_device.h @@ -316,7 +316,7 @@ extern int scsi_unregister_device_handler(struct scsi_device_handler *scsi_dh); void scsi_attach_vpd(struct scsi_device *sdev); extern struct scsi_device *scsi_device_from_queue(struct request_queue *q); -extern int scsi_device_get(struct scsi_device *); +extern int __must_check scsi_device_get(struct scsi_device *); extern void scsi_device_put(struct scsi_device *); extern struct scsi_device *scsi_device_lookup(struct Scsi_Host *, uint, uint, u64); diff --git a/include/scsi/scsi_driver.h b/include/scsi/scsi_driver.h index 
891a658aa8673d03b5ad8ac848f3318662d39707..a5534ccad859d29069bff2b240ccfca4dd0c9567 100644 --- a/include/scsi/scsi_driver.h +++ b/include/scsi/scsi_driver.h @@ -16,6 +16,7 @@ struct scsi_driver { void (*uninit_command)(struct scsi_cmnd *); int (*done)(struct scsi_cmnd *); int (*eh_action)(struct scsi_cmnd *, int); + void (*eh_reset)(struct scsi_cmnd *); }; #define to_scsi_driver(drv) \ container_of((drv), struct scsi_driver, gendrv) diff --git a/include/scsi/scsi_eh.h b/include/scsi/scsi_eh.h index 98d366b5577069201815ac77e0827fa6f32f664b..64d30d80dadb5f91ba73f2b020ede8d1e11e7ded 100644 --- a/include/scsi/scsi_eh.h +++ b/include/scsi/scsi_eh.h @@ -23,14 +23,15 @@ static inline bool scsi_sense_is_deferred(const struct scsi_sense_hdr *sshdr) return ((sshdr->response_code >= 0x70) && (sshdr->response_code & 1)); } -extern int scsi_get_sense_info_fld(const u8 * sense_buffer, int sb_len, - u64 * info_out); +extern bool scsi_get_sense_info_fld(const u8 *sense_buffer, int sb_len, + u64 *info_out); extern int scsi_ioctl_reset(struct scsi_device *, int __user *); struct scsi_eh_save { /* saved state */ int result; + int eh_eflags; enum dma_data_direction data_direction; unsigned underflow; unsigned char cmd_len; diff --git a/include/scsi/scsi_host.h b/include/scsi/scsi_host.h index 3cd8c3bec6384e02328bbb31aafb9aec84f78c37..afb04811b7b9c578f24d8360ace4f85efa07632c 100644 --- a/include/scsi/scsi_host.h +++ b/include/scsi/scsi_host.h @@ -451,11 +451,6 @@ struct scsi_host_template { /* True if the controller does not support WRITE SAME */ unsigned no_write_same:1; - /* - * True if asynchronous aborts are not supported - */ - unsigned no_async_abort:1; - /* * Countdown for host blocking with no commands outstanding. */ diff --git a/include/scsi/scsi_proto.h b/include/scsi/scsi_proto.h index 6ba66e01f6df33e37bee7f90c773e432ce75fc91..ce78ec8e367da1a34d75136339d9b7e2a088abdf 100644 --- a/include/scsi/scsi_proto.h +++ b/include/scsi/scsi_proto.h @@ -112,6 +112,7 @@ #define WRITE_16 0x8a #define READ_ATTRIBUTE 0x8c #define WRITE_ATTRIBUTE 0x8d +#define WRITE_VERIFY_16 0x8e #define VERIFY_16 0x8f #define SYNCHRONIZE_CACHE_16 0x91 #define WRITE_SAME_16 0x93 diff --git a/include/scsi/scsi_request.h b/include/scsi/scsi_request.h index ba0aeb980f7e7cb654171697316f6089b9db01d0..f0c76f9dc28547301d4f67ac76e580560d081989 100644 --- a/include/scsi/scsi_request.h +++ b/include/scsi/scsi_request.h @@ -9,8 +9,10 @@ struct scsi_request { unsigned char __cmd[BLK_MAX_CDB]; unsigned char *cmd; unsigned short cmd_len; + int result; unsigned int sense_len; unsigned int resid_len; /* residual count */ + int retries; void *sense; }; diff --git a/include/scsi/scsi_transport_fc.h b/include/scsi/scsi_transport_fc.h index b21b8aa58c4d6b8890592754bab2439db5769188..6e208bb32c780d9c075903f5f23a9d2e0f4cb68d 100644 --- a/include/scsi/scsi_transport_fc.h +++ b/include/scsi/scsi_transport_fc.h @@ -162,6 +162,7 @@ enum fc_tgtid_binding_type { #define FC_PORT_ROLE_FCP_TARGET 0x01 #define FC_PORT_ROLE_FCP_INITIATOR 0x02 #define FC_PORT_ROLE_IP_PORT 0x04 +#define FC_PORT_ROLE_FCP_DUMMY_INITIATOR 0x08 /* The following are for compatibility */ #define FC_RPORT_ROLE_UNKNOWN FC_PORT_ROLE_UNKNOWN diff --git a/include/scsi/sg.h b/include/scsi/sg.h index 3afec7032448c8927d2cd134111606ee0206ca3d..20bc71c3e0b81acf07af84fea30d6b530427bb70 100644 --- a/include/scsi/sg.h +++ b/include/scsi/sg.h @@ -197,7 +197,6 @@ typedef struct sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ #define SG_DEFAULT_RETRIES 0 /* Defaults, commented if they differ from 
original sg driver */ -#define SG_DEF_FORCE_LOW_DMA 0 /* was 1 -> memory below 16MB on i386 */ #define SG_DEF_FORCE_PACK_ID 0 #define SG_DEF_KEEP_ORPHAN 0 #define SG_DEF_RESERVED_SIZE SG_SCATTER_SZ /* load time option */ diff --git a/include/soc/fsl/qe/immap_qe.h b/include/soc/fsl/qe/immap_qe.h index c76ef30b05ba38ff517972125282c33ecc41b8eb..7baaabd5ec2ced0bf227691bdfea94ba9c53a0e4 100644 --- a/include/soc/fsl/qe/immap_qe.h +++ b/include/soc/fsl/qe/immap_qe.h @@ -464,25 +464,6 @@ struct qe_immap { } __attribute__ ((packed)); extern struct qe_immap __iomem *qe_immr; -extern phys_addr_t get_qe_base(void); - -/* - * Returns the offset within the QE address space of the given pointer. - * - * Note that the QE does not support 36-bit physical addresses, so if - * get_qe_base() returns a number above 4GB, the caller will probably fail. - */ -static inline phys_addr_t immrbar_virt_to_phys(void *address) -{ - void *q = (void *)qe_immr; - - /* Is it a MURAM address? */ - if ((address >= q) && (address < (q + QE_IMMAP_SIZE))) - return get_qe_base() + (address - q); - - /* It's an address returned by kmalloc */ - return virt_to_phys(address); -} #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_IMMAP_QE_H */ diff --git a/include/soc/fsl/qe/qe.h b/include/soc/fsl/qe/qe.h index 70339d7958c02c2d8690509a5b62ef8a844bb7ff..0cd4c11479b1f4c4385c294295fdea4a7d82daa1 100644 --- a/include/soc/fsl/qe/qe.h +++ b/include/soc/fsl/qe/qe.h @@ -243,6 +243,7 @@ static inline int qe_alive_during_sleep(void) #define qe_muram_free cpm_muram_free #define qe_muram_addr cpm_muram_addr #define qe_muram_offset cpm_muram_offset +#define qe_muram_dma cpm_muram_dma #define qe_setbits32(_addr, _v) iowrite32be(ioread32be(_addr) | (_v), (_addr)) #define qe_clrbits32(_addr, _v) iowrite32be(ioread32be(_addr) & ~(_v), (_addr)) diff --git a/include/soc/fsl/qman.h b/include/soc/fsl/qman.h index 3d4df74a96dee6ca09e40d2e79bf9c21bdd90886..d4dfefdee6c10dfb607642e3a0c773144b08cd16 100644 --- a/include/soc/fsl/qman.h +++ b/include/soc/fsl/qman.h @@ -36,8 +36,11 @@ /* Hardware constants */ #define QM_CHANNEL_SWPORTAL0 0 #define QMAN_CHANNEL_POOL1 0x21 +#define QMAN_CHANNEL_CAAM 0x80 #define QMAN_CHANNEL_POOL1_REV3 0x401 +#define QMAN_CHANNEL_CAAM_REV3 0x840 extern u16 qm_channel_pool1; +extern u16 qm_channel_caam; /* Portal processing (interrupt) sources */ #define QM_PIRQ_CSCI 0x00100000 /* Congestion State Change */ @@ -165,6 +168,7 @@ static inline void qm_fd_set_param(struct qm_fd *fd, enum qm_fd_format fmt, #define qm_fd_set_contig_big(fd, len) \ qm_fd_set_param(fd, qm_fd_contig_big, 0, len) #define qm_fd_set_sg_big(fd, len) qm_fd_set_param(fd, qm_fd_sg_big, 0, len) +#define qm_fd_set_compound(fd, len) qm_fd_set_param(fd, qm_fd_compound, 0, len) static inline void qm_fd_clear_fd(struct qm_fd *fd) { @@ -639,6 +643,7 @@ struct qm_mcc_initcgr { #define QM_CGR_WE_MODE 0x0001 #define QMAN_CGR_FLAG_USE_INIT 0x00000001 +#define QMAN_CGR_MODE_FRAME 0x00000001 /* Portal and Frame Queues */ /* Represents a managed portal */ @@ -791,6 +796,84 @@ struct qman_cgr { #define QMAN_INITFQ_FLAG_SCHED 0x00000001 /* schedule rather than park */ #define QMAN_INITFQ_FLAG_LOCAL 0x00000004 /* set dest portal */ +/* + * For qman_volatile_dequeue(); Choose one PRECEDENCE. EXACT is optional. Use + * NUMFRAMES(n) (6-bit) or NUMFRAMES_TILLEMPTY to fill in the frame-count. Use + * FQID(n) to fill in the frame queue ID. 
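To tie the VDQCR option bits above to the qman_volatile_dequeue() and qman_query_fq_np() calls documented later in this hunk, here is a hedged editorial sketch; the frame-queue setup and the -EBUSY policy are invented for illustration:

```c
/* Illustration only -- not part of this patch. */
static int example_drain_fq(struct qman_fq *fq)
{
	struct qm_mcr_queryfq_np np;
	u32 vdqcr;
	int err;

	/* Dequeue up to 8 frames and wait until the command completes. */
	vdqcr = QM_VDQCR_PRECEDENCE_VDQCR | QM_VDQCR_NUMFRAMES_SET(8);
	err = qman_volatile_dequeue(fq, QMAN_VOLATILE_FLAG_WAIT |
				    QMAN_VOLATILE_FLAG_FINISH, vdqcr);
	if (err)
		return err;

	/* Inspect the non-programmable FQD fields, e.g. the frame count. */
	err = qman_query_fq_np(fq, &np);
	if (err)
		return err;

	return qm_mcr_np_get(&np, frm_cnt) ? -EBUSY : 0;
}
```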
+ */ +#define QM_VDQCR_PRECEDENCE_VDQCR 0x0 +#define QM_VDQCR_PRECEDENCE_SDQCR 0x80000000 +#define QM_VDQCR_EXACT 0x40000000 +#define QM_VDQCR_NUMFRAMES_MASK 0x3f000000 +#define QM_VDQCR_NUMFRAMES_SET(n) (((n) & 0x3f) << 24) +#define QM_VDQCR_NUMFRAMES_GET(n) (((n) >> 24) & 0x3f) +#define QM_VDQCR_NUMFRAMES_TILLEMPTY QM_VDQCR_NUMFRAMES_SET(0) + +#define QMAN_VOLATILE_FLAG_WAIT 0x00000001 /* wait if VDQCR is in use */ +#define QMAN_VOLATILE_FLAG_WAIT_INT 0x00000002 /* if wait, interruptible? */ +#define QMAN_VOLATILE_FLAG_FINISH 0x00000004 /* wait till VDQCR completes */ + +/* "Query FQ Non-Programmable Fields" */ +struct qm_mcr_queryfq_np { + u8 verb; + u8 result; + u8 __reserved1; + u8 state; /* QM_MCR_NP_STATE_*** */ + u32 fqd_link; /* 24-bit, _res2[24-31] */ + u16 odp_seq; /* 14-bit, _res3[14-15] */ + u16 orp_nesn; /* 14-bit, _res4[14-15] */ + u16 orp_ea_hseq; /* 15-bit, _res5[15] */ + u16 orp_ea_tseq; /* 15-bit, _res6[15] */ + u32 orp_ea_hptr; /* 24-bit, _res7[24-31] */ + u32 orp_ea_tptr; /* 24-bit, _res8[24-31] */ + u32 pfdr_hptr; /* 24-bit, _res9[24-31] */ + u32 pfdr_tptr; /* 24-bit, _res10[24-31] */ + u8 __reserved2[5]; + u8 is; /* 1-bit, _res12[1-7] */ + u16 ics_surp; + u32 byte_cnt; + u32 frm_cnt; /* 24-bit, _res13[24-31] */ + u32 __reserved3; + u16 ra1_sfdr; /* QM_MCR_NP_RA1_*** */ + u16 ra2_sfdr; /* QM_MCR_NP_RA2_*** */ + u16 __reserved4; + u16 od1_sfdr; /* QM_MCR_NP_OD1_*** */ + u16 od2_sfdr; /* QM_MCR_NP_OD2_*** */ + u16 od3_sfdr; /* QM_MCR_NP_OD3_*** */ +} __packed; + +#define QM_MCR_NP_STATE_FE 0x10 +#define QM_MCR_NP_STATE_R 0x08 +#define QM_MCR_NP_STATE_MASK 0x07 /* Reads FQD::STATE; */ +#define QM_MCR_NP_STATE_OOS 0x00 +#define QM_MCR_NP_STATE_RETIRED 0x01 +#define QM_MCR_NP_STATE_TEN_SCHED 0x02 +#define QM_MCR_NP_STATE_TRU_SCHED 0x03 +#define QM_MCR_NP_STATE_PARKED 0x04 +#define QM_MCR_NP_STATE_ACTIVE 0x05 +#define QM_MCR_NP_PTR_MASK 0x07ff /* for RA[12] & OD[123] */ +#define QM_MCR_NP_RA1_NRA(v) (((v) >> 14) & 0x3) /* FQD::NRA */ +#define QM_MCR_NP_RA2_IT(v) (((v) >> 14) & 0x1) /* FQD::IT */ +#define QM_MCR_NP_OD1_NOD(v) (((v) >> 14) & 0x3) /* FQD::NOD */ +#define QM_MCR_NP_OD3_NPC(v) (((v) >> 14) & 0x3) /* FQD::NPC */ + +enum qm_mcr_queryfq_np_masks { + qm_mcr_fqd_link_mask = BIT(24) - 1, + qm_mcr_odp_seq_mask = BIT(14) - 1, + qm_mcr_orp_nesn_mask = BIT(14) - 1, + qm_mcr_orp_ea_hseq_mask = BIT(15) - 1, + qm_mcr_orp_ea_tseq_mask = BIT(15) - 1, + qm_mcr_orp_ea_hptr_mask = BIT(24) - 1, + qm_mcr_orp_ea_tptr_mask = BIT(24) - 1, + qm_mcr_pfdr_hptr_mask = BIT(24) - 1, + qm_mcr_pfdr_tptr_mask = BIT(24) - 1, + qm_mcr_is_mask = BIT(1) - 1, + qm_mcr_frm_cnt_mask = BIT(24) - 1, +}; + +#define qm_mcr_np_get(np, field) \ + ((np)->field & (qm_mcr_##field##_mask)) + /* Portal Management */ /** * qman_p_irqsource_add - add processing sources to be interrupt-driven @@ -963,6 +1046,25 @@ int qman_retire_fq(struct qman_fq *fq, u32 *flags); */ int qman_oos_fq(struct qman_fq *fq); +/* + * qman_volatile_dequeue - Issue a volatile dequeue command + * @fq: the frame queue object to dequeue from + * @flags: a bit-mask of QMAN_VOLATILE_FLAG_*** options + * @vdqcr: bit mask of QM_VDQCR_*** options, as per qm_dqrr_vdqcr_set() + * + * Attempts to lock access to the portal's VDQCR volatile dequeue functionality. + * The function will block and sleep if QMAN_VOLATILE_FLAG_WAIT is specified and + * the VDQCR is already in use, otherwise returns non-zero for failure. If + * QMAN_VOLATILE_FLAG_FINISH is specified, the function will only return once + * the VDQCR command has finished executing (ie. 
once the callback for the last + * DQRR entry resulting from the VDQCR command has been called). If not using + * the FINISH flag, completion can be determined either by detecting the + * presence of the QM_DQRR_STAT_UNSCHEDULED and QM_DQRR_STAT_DQCR_EXPIRED bits + * in the "stat" parameter passed to the FQ's dequeue callback, or by waiting + * for the QMAN_FQ_STATE_VDQCR bit to disappear. + */ +int qman_volatile_dequeue(struct qman_fq *fq, u32 flags, u32 vdqcr); + /** * qman_enqueue - Enqueue a frame to a frame queue * @fq: the frame queue object to enqueue to @@ -994,6 +1096,13 @@ int qman_alloc_fqid_range(u32 *result, u32 count); */ int qman_release_fqid(u32 fqid); +/** + * qman_query_fq_np - Queries non-programmable FQD fields + * @fq: the frame queue object to be queried + * @np: storage for the queried FQD fields + */ +int qman_query_fq_np(struct qman_fq *fq, struct qm_mcr_queryfq_np *np); + /* Pool-channel management */ /** * qman_alloc_pool_range - Allocate a contiguous range of pool-channel IDs diff --git a/include/soc/tegra/flowctrl.h b/include/soc/tegra/flowctrl.h new file mode 100644 index 0000000000000000000000000000000000000000..8f86aea4024b2f41ad9f28b9cbf39d397940dbac --- /dev/null +++ b/include/soc/tegra/flowctrl.h @@ -0,0 +1,82 @@ +/* + * Functions and macros to control the flowcontroller + * + * Copyright (c) 2010-2012, NVIDIA Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
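The Tegra flow-controller header that begins here declares its accessors (flowctrl_read_cpu_csr() and friends) after the register definitions below. Purely as a hypothetical sketch, assuming the write-one-to-clear behavior of the flag bits that the existing Tegra mach code relies on, a platform idle path might combine them like this (example_halt_cpu_on_irq is an invented name):

```c
/* Illustration only -- not part of this patch. */
static void example_halt_cpu_on_irq(unsigned int cpu)
{
	u32 csr = flowctrl_read_cpu_csr(cpu);

	/* Assumed write-1-to-clear: drop any stale event/interrupt flags. */
	csr |= FLOW_CTRL_CSR_INTR_FLAG | FLOW_CTRL_CSR_EVENT_FLAG;
	flowctrl_write_cpu_csr(cpu, csr);

	/* Park the CPU until an interrupt fires. */
	flowctrl_write_cpu_halt(cpu, FLOW_CTRL_WAITEVENT |
				     FLOW_CTRL_HALT_CPU_IRQ);
}
```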
+ */ + +#ifndef __SOC_TEGRA_FLOWCTRL_H__ +#define __SOC_TEGRA_FLOWCTRL_H__ + +#define FLOW_CTRL_HALT_CPU0_EVENTS 0x0 +#define FLOW_CTRL_WAITEVENT (2 << 29) +#define FLOW_CTRL_WAIT_FOR_INTERRUPT (4 << 29) +#define FLOW_CTRL_JTAG_RESUME (1 << 28) +#define FLOW_CTRL_SCLK_RESUME (1 << 27) +#define FLOW_CTRL_HALT_CPU_IRQ (1 << 10) +#define FLOW_CTRL_HALT_CPU_FIQ (1 << 8) +#define FLOW_CTRL_HALT_LIC_IRQ (1 << 11) +#define FLOW_CTRL_HALT_LIC_FIQ (1 << 10) +#define FLOW_CTRL_HALT_GIC_IRQ (1 << 9) +#define FLOW_CTRL_HALT_GIC_FIQ (1 << 8) +#define FLOW_CTRL_CPU0_CSR 0x8 +#define FLOW_CTRL_CSR_INTR_FLAG (1 << 15) +#define FLOW_CTRL_CSR_EVENT_FLAG (1 << 14) +#define FLOW_CTRL_CSR_ENABLE_EXT_CRAIL (1 << 13) +#define FLOW_CTRL_CSR_ENABLE_EXT_NCPU (1 << 12) +#define FLOW_CTRL_CSR_ENABLE_EXT_MASK ( \ + FLOW_CTRL_CSR_ENABLE_EXT_NCPU | \ + FLOW_CTRL_CSR_ENABLE_EXT_CRAIL) +#define FLOW_CTRL_CSR_ENABLE (1 << 0) +#define FLOW_CTRL_HALT_CPU1_EVENTS 0x14 +#define FLOW_CTRL_CPU1_CSR 0x18 + +#define TEGRA20_FLOW_CTRL_CSR_WFE_CPU0 (1 << 4) +#define TEGRA20_FLOW_CTRL_CSR_WFE_BITMAP (3 << 4) +#define TEGRA20_FLOW_CTRL_CSR_WFI_BITMAP 0 + +#define TEGRA30_FLOW_CTRL_CSR_WFI_CPU0 (1 << 8) +#define TEGRA30_FLOW_CTRL_CSR_WFE_BITMAP (0xF << 4) +#define TEGRA30_FLOW_CTRL_CSR_WFI_BITMAP (0xF << 8) + +#ifndef __ASSEMBLY__ +#ifdef CONFIG_SOC_TEGRA_FLOWCTRL +u32 flowctrl_read_cpu_csr(unsigned int cpuid); +void flowctrl_write_cpu_csr(unsigned int cpuid, u32 value); +void flowctrl_write_cpu_halt(unsigned int cpuid, u32 value); + +void flowctrl_cpu_suspend_enter(unsigned int cpuid); +void flowctrl_cpu_suspend_exit(unsigned int cpuid); +#else +static inline u32 flowctrl_read_cpu_csr(unsigned int cpuid) +{ + return 0; +} + +static inline void flowctrl_write_cpu_csr(unsigned int cpuid, u32 value) +{ +} + +static inline void flowctrl_write_cpu_halt(unsigned int cpuid, u32 value) {} + +static inline void flowctrl_cpu_suspend_enter(unsigned int cpuid) +{ +} + +static inline void flowctrl_cpu_suspend_exit(unsigned int cpuid) +{ +} +#endif /* CONFIG_SOC_TEGRA_FLOWCTRL */ +#endif /* __ASSEMBLY */ +#endif /* __SOC_TEGRA_FLOWCTRL_H__ */ diff --git a/include/soc/tegra/pmc.h b/include/soc/tegra/pmc.h index 2f271d1b9cea6fdc9bb1678077eab3222ea3bf92..1c3982bc558fb1d98ec3c90f6663c1a8189dca8e 100644 --- a/include/soc/tegra/pmc.h +++ b/include/soc/tegra/pmc.h @@ -26,12 +26,6 @@ struct clk; struct reset_control; -#ifdef CONFIG_PM_SLEEP -enum tegra_suspend_mode tegra_pmc_get_suspend_mode(void); -void tegra_pmc_set_suspend_mode(enum tegra_suspend_mode mode); -void tegra_pmc_enter_suspend_mode(enum tegra_suspend_mode mode); -#endif /* CONFIG_PM_SLEEP */ - #ifdef CONFIG_SMP bool tegra_pmc_cpu_is_powered(unsigned int cpuid); int tegra_pmc_cpu_power_on(unsigned int cpuid); @@ -144,7 +138,7 @@ enum tegra_io_pad_voltage { TEGRA_IO_PAD_3300000UV, }; -#ifdef CONFIG_ARCH_TEGRA +#ifdef CONFIG_SOC_TEGRA_PMC int tegra_powergate_is_powered(unsigned int id); int tegra_powergate_power_on(unsigned int id); int tegra_powergate_power_off(unsigned int id); @@ -163,6 +157,11 @@ int tegra_io_pad_get_voltage(enum tegra_io_pad id); /* deprecated, use tegra_io_pad_power_{enable,disable}() instead */ int tegra_io_rail_power_on(unsigned int id); int tegra_io_rail_power_off(unsigned int id); + +enum tegra_suspend_mode tegra_pmc_get_suspend_mode(void); +void tegra_pmc_set_suspend_mode(enum tegra_suspend_mode mode); +void tegra_pmc_enter_suspend_mode(enum tegra_suspend_mode mode); + #else static inline int tegra_powergate_is_powered(unsigned int id) { @@ -221,6 +220,20 @@ static 
inline int tegra_io_rail_power_off(unsigned int id) { return -ENOSYS; } -#endif /* CONFIG_ARCH_TEGRA */ + +static inline enum tegra_suspend_mode tegra_pmc_get_suspend_mode(void) +{ + return TEGRA_SUSPEND_NONE; +} + +static inline void tegra_pmc_set_suspend_mode(enum tegra_suspend_mode mode) +{ +} + +static inline void tegra_pmc_enter_suspend_mode(enum tegra_suspend_mode mode) +{ +} + +#endif /* CONFIG_SOC_TEGRA_PMC */ #endif /* __SOC_TEGRA_PMC_H__ */ diff --git a/include/sound/cs35l35.h b/include/sound/cs35l35.h new file mode 100644 index 0000000000000000000000000000000000000000..29da899e17e4def3dfe72e1b67eabba795f2fa8c --- /dev/null +++ b/include/sound/cs35l35.h @@ -0,0 +1,108 @@ +/* + * linux/sound/cs35l35.h -- Platform data for CS35l35 + * + * Copyright (c) 2016 Cirrus Logic Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __CS35L35_H +#define __CS35L35_H + +struct classh_cfg { + /* + * Class H algorithm control variables. + * You can either have the algorithm run + * automatically or adjust these + * variables yourself for tuning. + * + * If you do not enable the internal algorithm, + * you get a set of mixer controls for + * Class H tuning instead. + * + * Section 4.3 of the datasheet + */ + bool classh_bst_override; + bool classh_algo_enable; + int classh_bst_max_limit; + int classh_mem_depth; + int classh_release_rate; + int classh_headroom; + int classh_wk_fet_disable; + int classh_wk_fet_delay; + int classh_wk_fet_thld; + int classh_vpch_auto; + int classh_vpch_rate; + int classh_vpch_man; +}; + +struct monitor_cfg { + /* + * Signal monitor data: + * highly configurable positioning of the + * monitoring data, and several different + * types of monitored data.
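Interrupting the header briefly for an editorial configuration sketch: the config blocks above and below feed into the cs35l35_platform_data struct that this file completes shortly. All values are invented placeholders rather than recommended settings:

```c
/* Illustration only -- not part of this patch. */
static struct cs35l35_platform_data example_pdata = {
	.stereo		= true,
	.bst_ipk	= 2,		/* placeholder peak-current code */
	.gain_zc	= true,
	.classh_algo	= {
		.classh_algo_enable	= true,
		.classh_mem_depth	= 1,	/* placeholder */
		.classh_headroom	= 10,	/* placeholder */
	},
	.mon_cfg	= {
		.is_present	= true,
		.imon_specs	= true,
		.imon_dpth	= 0,	/* placeholder depth code */
	},
};
```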
+ * + * Section 4.8.2 - 4.8.4 of the datasheet + */ + bool is_present; + bool imon_specs; + bool vmon_specs; + bool vpmon_specs; + bool vbstmon_specs; + bool vpbrstat_specs; + bool zerofill_specs; + u8 imon_dpth; + u8 imon_loc; + u8 imon_frm; + u8 imon_scale; + u8 vmon_dpth; + u8 vmon_loc; + u8 vmon_frm; + u8 vpmon_dpth; + u8 vpmon_loc; + u8 vpmon_frm; + u8 vbstmon_dpth; + u8 vbstmon_loc; + u8 vbstmon_frm; + u8 vpbrstat_dpth; + u8 vpbrstat_loc; + u8 vpbrstat_frm; + u8 zerofill_dpth; + u8 zerofill_loc; + u8 zerofill_frm; +}; + +struct cs35l35_platform_data { + + /* Stereo (2 Device) */ + bool stereo; + /* serial port drive strength */ + int sp_drv_str; + /* serial port drive in unused slots */ + int sp_drv_unused; + /* Boost Power Down with FET */ + bool bst_pdn_fet_on; + /* Boost Voltage : used if ClassH Algo Enabled */ + int bst_vctl; + /* Boost Converter Peak Current CTRL */ + int bst_ipk; + /* Amp Gain Zero Cross */ + bool gain_zc; + /* Audio Input Location */ + int aud_channel; + /* Advisory Input Location */ + int adv_channel; + /* Shared Boost for stereo */ + bool shared_bst; + /* Specifies this amp is using an external boost supply */ + bool ext_bst; + /* ClassH Algorithm */ + struct classh_cfg classh_algo; + /* Monitor Config */ + struct monitor_cfg mon_cfg; +}; + +#endif /* __CS35L35_H */ diff --git a/include/sound/hda_register.h b/include/sound/hda_register.h index 0013063db7f2bfe22faaf7b7f0d66695581254f9..15fc6daf909657ac7237c896cbc2de9b4ff04b18 100644 --- a/include/sound/hda_register.h +++ b/include/sound/hda_register.h @@ -106,8 +106,26 @@ enum { SDI0, SDI1, SDI2, SDI3, SDO0, SDO1, SDO2, SDO3 }; #define AZX_REG_HSW_EM4 0x100c #define AZX_REG_HSW_EM5 0x1010 -/* Skylake/Broxton display HD-A controller Extended Mode registers */ -#define AZX_REG_SKL_EM4L 0x1040 +/* Skylake/Broxton vendor-specific registers */ +#define AZX_REG_VS_EM1 0x1000 +#define AZX_REG_VS_INRC 0x1004 +#define AZX_REG_VS_OUTRC 0x1008 +#define AZX_REG_VS_FIFOTRK 0x100C +#define AZX_REG_VS_FIFOTRK2 0x1010 +#define AZX_REG_VS_EM2 0x1030 +#define AZX_REG_VS_EM3L 0x1038 +#define AZX_REG_VS_EM3U 0x103C +#define AZX_REG_VS_EM4L 0x1040 +#define AZX_REG_VS_EM4U 0x1044 +#define AZX_REG_VS_LTRC 0x1048 +#define AZX_REG_VS_D0I3C 0x104A +#define AZX_REG_VS_PCE 0x104B +#define AZX_REG_VS_L2MAGC 0x1050 +#define AZX_REG_VS_L2LAHPT 0x1054 +#define AZX_REG_VS_SDXDPIB_XBASE 0x1084 +#define AZX_REG_VS_SDXDPIB_XINTERVAL 0x20 +#define AZX_REG_VS_SDXEFIFOS_XBASE 0x1094 +#define AZX_REG_VS_SDXEFIFOS_XINTERVAL 0x20 /* PCI space */ #define AZX_PCIREG_TCSEL 0x44 @@ -243,9 +261,11 @@ enum { SDI0, SDI1, SDI2, SDI3, SDO0, SDO1, SDO2, SDO3 }; #define AZX_REG_ML_LOUTPAY 0x20 #define AZX_REG_ML_LINPAY 0x30 -#define AZX_MLCTL_SPA (1<<16) -#define AZX_MLCTL_CPA 23 - +#define ML_LCTL_SCF_MASK 0xF +#define AZX_MLCTL_SPA (0x1 << 16) +#define AZX_MLCTL_CPA (0x1 << 23) +#define AZX_MLCTL_SPA_SHIFT 16 +#define AZX_MLCTL_CPA_SHIFT 23 /* registers for DMA Resume Capability Structure */ #define AZX_DRSM_CAP_ID 0x5 diff --git a/include/sound/hdaudio.h b/include/sound/hdaudio.h index 56004ec8d4418b00e1d62b62e5b057bf964f81a5..96546b30e90075f394ed5e8be608ddeb1bec519c 100644 --- a/include/sound/hdaudio.h +++ b/include/sound/hdaudio.h @@ -368,24 +368,32 @@ void snd_hdac_bus_free_stream_pages(struct hdac_bus *bus); /* * macros for easy use */ -#define _snd_hdac_chip_write(type, chip, reg, value) \ - ((chip)->io_ops->reg_write ## type(value, (chip)->remap_addr + (reg))) -#define _snd_hdac_chip_read(type, chip, reg) \ - ((chip)->io_ops->reg_read ## 
type((chip)->remap_addr + (reg))) +#define _snd_hdac_chip_writeb(chip, reg, value) \ + ((chip)->io_ops->reg_writeb(value, (chip)->remap_addr + (reg))) +#define _snd_hdac_chip_readb(chip, reg) \ + ((chip)->io_ops->reg_readb((chip)->remap_addr + (reg))) +#define _snd_hdac_chip_writew(chip, reg, value) \ + ((chip)->io_ops->reg_writew(value, (chip)->remap_addr + (reg))) +#define _snd_hdac_chip_readw(chip, reg) \ + ((chip)->io_ops->reg_readw((chip)->remap_addr + (reg))) +#define _snd_hdac_chip_writel(chip, reg, value) \ + ((chip)->io_ops->reg_writel(value, (chip)->remap_addr + (reg))) +#define _snd_hdac_chip_readl(chip, reg) \ + ((chip)->io_ops->reg_readl((chip)->remap_addr + (reg))) /* read/write a register, pass without AZX_REG_ prefix */ #define snd_hdac_chip_writel(chip, reg, value) \ - _snd_hdac_chip_write(l, chip, AZX_REG_ ## reg, value) + _snd_hdac_chip_writel(chip, AZX_REG_ ## reg, value) #define snd_hdac_chip_writew(chip, reg, value) \ - _snd_hdac_chip_write(w, chip, AZX_REG_ ## reg, value) + _snd_hdac_chip_writew(chip, AZX_REG_ ## reg, value) #define snd_hdac_chip_writeb(chip, reg, value) \ - _snd_hdac_chip_write(b, chip, AZX_REG_ ## reg, value) + _snd_hdac_chip_writeb(chip, AZX_REG_ ## reg, value) #define snd_hdac_chip_readl(chip, reg) \ - _snd_hdac_chip_read(l, chip, AZX_REG_ ## reg) + _snd_hdac_chip_readl(chip, AZX_REG_ ## reg) #define snd_hdac_chip_readw(chip, reg) \ - _snd_hdac_chip_read(w, chip, AZX_REG_ ## reg) + _snd_hdac_chip_readw(chip, AZX_REG_ ## reg) #define snd_hdac_chip_readb(chip, reg) \ - _snd_hdac_chip_read(b, chip, AZX_REG_ ## reg) + _snd_hdac_chip_readb(chip, AZX_REG_ ## reg) /* update a register, pass without AZX_REG_ prefix */ #define snd_hdac_chip_updatel(chip, reg, mask, val) \ diff --git a/include/sound/soc.h b/include/sound/soc.h index cdfb55f7aedeac2b0ee6a7f5f7b4bf81da123ddf..5170fd81e1fd0886c465225b72748c44792240d5 100644 --- a/include/sound/soc.h +++ b/include/sound/soc.h @@ -434,6 +434,8 @@ int snd_soc_codec_set_sysclk(struct snd_soc_codec *codec, int clk_id, int source, unsigned int freq, int dir); int snd_soc_codec_set_pll(struct snd_soc_codec *codec, int pll_id, int source, unsigned int freq_in, unsigned int freq_out); +int snd_soc_codec_set_jack(struct snd_soc_codec *codec, + struct snd_soc_jack *jack, void *data); int snd_soc_register_card(struct snd_soc_card *card); int snd_soc_unregister_card(struct snd_soc_card *card); @@ -497,7 +499,15 @@ void snd_soc_runtime_deactivate(struct snd_soc_pcm_runtime *rtd, int stream); int snd_soc_runtime_set_dai_fmt(struct snd_soc_pcm_runtime *rtd, unsigned int dai_fmt); +#ifdef CONFIG_DMI int snd_soc_set_dmi_name(struct snd_soc_card *card, const char *flavour); +#else +static inline int snd_soc_set_dmi_name(struct snd_soc_card *card, + const char *flavour) +{ + return 0; +} +#endif /* Utility functions to get clock rates from various things */ int snd_soc_calc_frame_size(int sample_size, int channels, int tdm_slots); @@ -721,6 +731,7 @@ struct snd_soc_jack_gpio { /* private: */ struct snd_soc_jack *jack; struct delayed_work work; + struct notifier_block pm_notifier; struct gpio_desc *desc; void *data; @@ -812,7 +823,6 @@ struct snd_soc_component { unsigned int ignore_pmdown_time:1; /* pmdown_time is ignored at stop */ unsigned int registered_as_component:1; - unsigned int auxiliary:1; /* for auxiliary component of the card */ unsigned int suspended:1; /* is in suspend PM state */ struct list_head list; @@ -913,6 +923,8 @@ struct snd_soc_codec_driver { int clk_id, int source, unsigned int freq, int dir); int 
(*set_pll)(struct snd_soc_codec *codec, int pll_id, int source, unsigned int freq_in, unsigned int freq_out); + int (*set_jack)(struct snd_soc_codec *codec, + struct snd_soc_jack *jack, void *data); /* codec IO */ struct regmap *(*get_regmap)(struct device *); diff --git a/include/target/target_core_backend.h b/include/target/target_core_backend.h index 1b0f447ce850f015e64dd27e47751fe945cbb2ec..e475531565fdfb6808de54bf9b4c84de7a501584 100644 --- a/include/target/target_core_backend.h +++ b/include/target/target_core_backend.h @@ -10,6 +10,7 @@ * backend module. */ #define TRANSPORT_FLAG_PASSTHROUGH_ALUA 0x2 +#define TRANSPORT_FLAG_PASSTHROUGH_PGR 0x4 struct request_queue; struct scatterlist; diff --git a/include/target/target_core_base.h b/include/target/target_core_base.h index ccfad0e9c2cdbd68f13c809c7ed6414b2c0c97c1..0c1dce2ac6f0239a90d3e86bff2afd3b0f318128 100644 --- a/include/target/target_core_base.h +++ b/include/target/target_core_base.h @@ -664,6 +664,7 @@ struct se_dev_attrib { int pi_prot_format; enum target_prot_type pi_prot_type; enum target_prot_type hw_pi_prot_type; + int pi_prot_verify; int enforce_pr_isids; int force_pr_aptpl; int is_nonrot; diff --git a/include/trace/events/block.h b/include/trace/events/block.h index a88ed13446ff88e200ed642db63badf4c9a4b680..d0dbe60d8a6dd5ccbb89029796fcabd23d970ceb 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -61,7 +61,16 @@ DEFINE_EVENT(block_buffer, block_dirty_buffer, TP_ARGS(bh) ); -DECLARE_EVENT_CLASS(block_rq_with_error, +/** + * block_rq_requeue - place block IO request back on a queue + * @q: queue holding operation + * @rq: block IO operation request + * + * The block operation request @rq is being placed back into queue + * @q. For some reason the request was not completed and needs to be + * put back in the queue. + */ +TRACE_EVENT(block_rq_requeue, TP_PROTO(struct request_queue *q, struct request *rq), @@ -71,7 +80,6 @@ DECLARE_EVENT_CLASS(block_rq_with_error, __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) - __field( int, errors ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), @@ -80,7 +88,6 @@ DECLARE_EVENT_CLASS(block_rq_with_error, __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; __entry->sector = blk_rq_trace_sector(rq); __entry->nr_sector = blk_rq_trace_nr_sectors(rq); - __entry->errors = rq->errors; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); __get_str(cmd)[0] = '\0'; @@ -90,46 +97,13 @@ DECLARE_EVENT_CLASS(block_rq_with_error, MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->errors) -); - -/** - * block_rq_abort - abort block operation request - * @q: queue containing the block operation request - * @rq: block IO operation request - * - * Called immediately after pending block IO operation request @rq in - * queue @q is aborted. The fields in the operation request @rq - * can be examined to determine which device and sectors the pending - * operation would access. - */ -DEFINE_EVENT(block_rq_with_error, block_rq_abort, - - TP_PROTO(struct request_queue *q, struct request *rq), - - TP_ARGS(q, rq) -); - -/** - * block_rq_requeue - place block IO request back on a queue - * @q: queue holding operation - * @rq: block IO operation request - * - * The block operation request @rq is being placed back into queue - * @q. For some reason the request was not completed and needs to be - * put back in the queue. 
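Note that the requeue event above now prints a literal 0 where rq->errors used to be, and, as the next hunk shows, block_rq_complete instead takes the completion status as an explicit argument. A caller-side sketch of the new signature (the wrapper name is hypothetical; real callers are block-layer completion paths):

```c
/* Illustration only -- not part of this patch. */
static void example_trace_completion(struct request *rq, int error,
				     unsigned int nr_bytes)
{
	/* New signature: no request_queue, error passed explicitly. */
	trace_block_rq_complete(rq, error, nr_bytes);
}
```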
- */ -DEFINE_EVENT(block_rq_with_error, block_rq_requeue, - - TP_PROTO(struct request_queue *q, struct request *rq), - - TP_ARGS(q, rq) + __entry->nr_sector, 0) ); /** * block_rq_complete - block IO operation completed by device driver - * @q: queue containing the block operation request * @rq: block operations request + * @error: status code * @nr_bytes: number of completed bytes * * The block_rq_complete tracepoint event indicates that some portion @@ -140,16 +114,15 @@ DEFINE_EVENT(block_rq_with_error, block_rq_requeue, */ TRACE_EVENT(block_rq_complete, - TP_PROTO(struct request_queue *q, struct request *rq, - unsigned int nr_bytes), + TP_PROTO(struct request *rq, int error, unsigned int nr_bytes), - TP_ARGS(q, rq, nr_bytes), + TP_ARGS(rq, error, nr_bytes), TP_STRUCT__entry( __field( dev_t, dev ) __field( sector_t, sector ) __field( unsigned int, nr_sector ) - __field( int, errors ) + __field( int, error ) __array( char, rwbs, RWBS_LEN ) __dynamic_array( char, cmd, 1 ) ), @@ -158,7 +131,7 @@ TRACE_EVENT(block_rq_complete, __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; __entry->sector = blk_rq_pos(rq); __entry->nr_sector = nr_bytes >> 9; - __entry->errors = rq->errors; + __entry->error = error; blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, nr_bytes); __get_str(cmd)[0] = '\0'; @@ -168,7 +141,7 @@ TRACE_EVENT(block_rq_complete, MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, __get_str(cmd), (unsigned long long)__entry->sector, - __entry->nr_sector, __entry->errors) + __entry->nr_sector, __entry->error) ); DECLARE_EVENT_CLASS(block_rq, diff --git a/include/trace/events/bpf.h b/include/trace/events/bpf.h index c3a53fd47ff1a20a322c71e1f9a4fd89c3462ed6..52c8425d144b349988a16366ab69d0366bd0c065 100644 --- a/include/trace/events/bpf.h +++ b/include/trace/events/bpf.h @@ -321,11 +321,14 @@ TRACE_EVENT(bpf_map_next_key, __dynamic_array(u8, key, map->key_size) __dynamic_array(u8, nxt, map->key_size) __field(bool, key_trunc) + __field(bool, key_null) __field(int, ufd) ), TP_fast_assign( - memcpy(__get_dynamic_array(key), key, map->key_size); + if (key) + memcpy(__get_dynamic_array(key), key, map->key_size); + __entry->key_null = !key; memcpy(__get_dynamic_array(nxt), key_next, map->key_size); __entry->type = map->map_type; __entry->key_len = min(map->key_size, 16U); @@ -336,8 +339,9 @@ TRACE_EVENT(bpf_map_next_key, TP_printk("map type=%s ufd=%d key=[%s%s] next=[%s%s]", __print_symbolic(__entry->type, __MAP_TYPE_SYM_TAB), __entry->ufd, - __print_hex(__get_dynamic_array(key), __entry->key_len), - __entry->key_trunc ? " ..." : "", + __entry->key_null ? "NULL" : __print_hex(__get_dynamic_array(key), + __entry->key_len), + __entry->key_trunc && !__entry->key_null ? " ..." : "", __print_hex(__get_dynamic_array(nxt), __entry->key_len), __entry->key_trunc ? " ..." 
: "") ); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index a3c3cab643a9528dd5f8ad50dfac29b53384f1db..e37973526153a83c2d96169b77e507525c53cc53 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -12,6 +12,7 @@ struct btrfs_root; struct btrfs_fs_info; struct btrfs_inode; struct extent_map; +struct btrfs_file_extent_item; struct btrfs_ordered_extent; struct btrfs_delayed_ref_node; struct btrfs_delayed_tree_ref; @@ -24,6 +25,7 @@ struct extent_buffer; struct btrfs_work; struct __btrfs_workqueue; struct btrfs_qgroup_extent_record; +struct btrfs_qgroup; #define show_ref_type(type) \ __print_symbolic(type, \ @@ -54,6 +56,12 @@ struct btrfs_qgroup_extent_record; (obj >= BTRFS_ROOT_TREE_OBJECTID && \ obj <= BTRFS_QUOTA_TREE_OBJECTID)) ? __show_root_type(obj) : "-" +#define show_fi_type(type) \ + __print_symbolic(type, \ + { BTRFS_FILE_EXTENT_INLINE, "INLINE" }, \ + { BTRFS_FILE_EXTENT_REG, "REG" }, \ + { BTRFS_FILE_EXTENT_PREALLOC, "PREALLOC"}) + #define BTRFS_GROUP_FLAGS \ { BTRFS_BLOCK_GROUP_DATA, "DATA"}, \ { BTRFS_BLOCK_GROUP_SYSTEM, "SYSTEM"}, \ @@ -213,7 +221,7 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, __entry->block_start = map->block_start; __entry->block_len = map->block_len; __entry->flags = map->flags; - __entry->refs = atomic_read(&map->refs); + __entry->refs = refcount_read(&map->refs); __entry->compress_type = map->compress_type; ), @@ -232,6 +240,138 @@ TRACE_EVENT_CONDITION(btrfs_get_extent, __entry->refs, __entry->compress_type) ); +/* file extent item */ +DECLARE_EVENT_CLASS(btrfs__file_extent_item_regular, + + TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, + struct btrfs_file_extent_item *fi, u64 start), + + TP_ARGS(bi, l, fi, start), + + TP_STRUCT__entry_btrfs( + __field( u64, root_obj ) + __field( u64, ino ) + __field( loff_t, isize ) + __field( u64, disk_isize ) + __field( u64, num_bytes ) + __field( u64, ram_bytes ) + __field( u64, disk_bytenr ) + __field( u64, disk_num_bytes ) + __field( u64, extent_offset ) + __field( u8, extent_type ) + __field( u8, compression ) + __field( u64, extent_start ) + __field( u64, extent_end ) + ), + + TP_fast_assign_btrfs(bi->root->fs_info, + __entry->root_obj = bi->root->objectid; + __entry->ino = btrfs_ino(bi); + __entry->isize = bi->vfs_inode.i_size; + __entry->disk_isize = bi->disk_i_size; + __entry->num_bytes = btrfs_file_extent_num_bytes(l, fi); + __entry->ram_bytes = btrfs_file_extent_ram_bytes(l, fi); + __entry->disk_bytenr = btrfs_file_extent_disk_bytenr(l, fi); + __entry->disk_num_bytes = btrfs_file_extent_disk_num_bytes(l, fi); + __entry->extent_offset = btrfs_file_extent_offset(l, fi); + __entry->extent_type = btrfs_file_extent_type(l, fi); + __entry->compression = btrfs_file_extent_compression(l, fi); + __entry->extent_start = start; + __entry->extent_end = (start + __entry->num_bytes); + ), + + TP_printk_btrfs( + "root=%llu(%s) inode=%llu size=%llu disk_isize=%llu " + "file extent range=[%llu %llu] " + "(num_bytes=%llu ram_bytes=%llu disk_bytenr=%llu " + "disk_num_bytes=%llu extent_offset=%llu type=%s " + "compression=%u", + show_root_type(__entry->root_obj), __entry->ino, + __entry->isize, + __entry->disk_isize, __entry->extent_start, + __entry->extent_end, __entry->num_bytes, __entry->ram_bytes, + __entry->disk_bytenr, __entry->disk_num_bytes, + __entry->extent_offset, show_fi_type(__entry->extent_type), + __entry->compression) +); + +DECLARE_EVENT_CLASS( + btrfs__file_extent_item_inline, + + TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, + struct 
btrfs_file_extent_item *fi, int slot, u64 start), + + TP_ARGS(bi, l, fi, slot, start), + + TP_STRUCT__entry_btrfs( + __field( u64, root_obj ) + __field( u64, ino ) + __field( loff_t, isize ) + __field( u64, disk_isize ) + __field( u8, extent_type ) + __field( u8, compression ) + __field( u64, extent_start ) + __field( u64, extent_end ) + ), + + TP_fast_assign_btrfs( + bi->root->fs_info, + __entry->root_obj = bi->root->objectid; + __entry->ino = btrfs_ino(bi); + __entry->isize = bi->vfs_inode.i_size; + __entry->disk_isize = bi->disk_i_size; + __entry->extent_type = btrfs_file_extent_type(l, fi); + __entry->compression = btrfs_file_extent_compression(l, fi); + __entry->extent_start = start; + __entry->extent_end = (start + btrfs_file_extent_inline_len(l, slot, fi)); + ), + + TP_printk_btrfs( + "root=%llu(%s) inode=%llu size=%llu disk_isize=%llu " + "file extent range=[%llu %llu] " + "extent_type=%s compression=%u", + show_root_type(__entry->root_obj), __entry->ino, __entry->isize, + __entry->disk_isize, __entry->extent_start, + __entry->extent_end, show_fi_type(__entry->extent_type), + __entry->compression) +); + +DEFINE_EVENT( + btrfs__file_extent_item_regular, btrfs_get_extent_show_fi_regular, + + TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, + struct btrfs_file_extent_item *fi, u64 start), + + TP_ARGS(bi, l, fi, start) +); + +DEFINE_EVENT( + btrfs__file_extent_item_regular, btrfs_truncate_show_fi_regular, + + TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, + struct btrfs_file_extent_item *fi, u64 start), + + TP_ARGS(bi, l, fi, start) +); + +DEFINE_EVENT( + btrfs__file_extent_item_inline, btrfs_get_extent_show_fi_inline, + + TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, + struct btrfs_file_extent_item *fi, int slot, u64 start), + + TP_ARGS(bi, l, fi, slot, start) +); + +DEFINE_EVENT( + btrfs__file_extent_item_inline, btrfs_truncate_show_fi_inline, + + TP_PROTO(struct btrfs_inode *bi, struct extent_buffer *l, + struct btrfs_file_extent_item *fi, int slot, u64 start), + + TP_ARGS(bi, l, fi, slot, start) +); + #define show_ordered_flags(flags) \ __print_flags(flags, "|", \ { (1 << BTRFS_ORDERED_IO_DONE), "IO_DONE" }, \ @@ -275,7 +415,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, __entry->bytes_left = ordered->bytes_left; __entry->flags = ordered->flags; __entry->compress_type = ordered->compress_type; - __entry->refs = atomic_read(&ordered->refs); + __entry->refs = refcount_read(&ordered->refs); __entry->root_objectid = BTRFS_I(inode)->root->root_key.objectid; __entry->truncated_len = ordered->truncated_len; @@ -1475,6 +1615,49 @@ TRACE_EVENT(qgroup_update_counters, __entry->cur_new_count) ); +TRACE_EVENT(qgroup_update_reserve, + + TP_PROTO(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, + s64 diff), + + TP_ARGS(fs_info, qgroup, diff), + + TP_STRUCT__entry_btrfs( + __field( u64, qgid ) + __field( u64, cur_reserved ) + __field( s64, diff ) + ), + + TP_fast_assign_btrfs(fs_info, + __entry->qgid = qgroup->qgroupid; + __entry->cur_reserved = qgroup->reserved; + __entry->diff = diff; + ), + + TP_printk_btrfs("qgid=%llu cur_reserved=%llu diff=%lld", + __entry->qgid, __entry->cur_reserved, __entry->diff) +); + +TRACE_EVENT(qgroup_meta_reserve, + + TP_PROTO(struct btrfs_root *root, s64 diff), + + TP_ARGS(root, diff), + + TP_STRUCT__entry_btrfs( + __field( u64, refroot ) + __field( s64, diff ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->refroot = root->objectid; + __entry->diff = diff; + ), + + TP_printk_btrfs("refroot=%llu(%s) diff=%lld", + 
show_root_type(__entry->refroot), __entry->diff) +); + #endif /* _TRACE_BTRFS_H */ /* This part must be outside protection */ diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 09c71e9aaebf7d7bf83857294b9ec37a13d0920e..dfae175ddebc2340c68e9145d588f57f2f45ebce 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -15,6 +15,7 @@ struct ext4_inode_info; struct mpage_da_data; struct ext4_map_blocks; struct extent_status; +struct ext4_fsmap; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) @@ -2529,6 +2530,79 @@ TRACE_EVENT(ext4_es_shrink, __entry->scan_time, __entry->nr_skipped, __entry->retried) ); +/* fsmap traces */ +DECLARE_EVENT_CLASS(ext4_fsmap_class, + TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, + u64 owner), + TP_ARGS(sb, keydev, agno, bno, len, owner), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(u32, agno) + __field(u64, bno) + __field(u64, len) + __field(u64, owner) + ), + TP_fast_assign( + __entry->dev = sb->s_bdev->bd_dev; + __entry->keydev = new_decode_dev(keydev); + __entry->agno = agno; + __entry->bno = bno; + __entry->len = len; + __entry->owner = owner; + ), + TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->agno, + __entry->bno, + __entry->len, + __entry->owner) +) +#define DEFINE_FSMAP_EVENT(name) \ +DEFINE_EVENT(ext4_fsmap_class, name, \ + TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, \ + u64 owner), \ + TP_ARGS(sb, keydev, agno, bno, len, owner)) +DEFINE_FSMAP_EVENT(ext4_fsmap_low_key); +DEFINE_FSMAP_EVENT(ext4_fsmap_high_key); +DEFINE_FSMAP_EVENT(ext4_fsmap_mapping); + +DECLARE_EVENT_CLASS(ext4_getfsmap_class, + TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), + TP_ARGS(sb, fsmap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(u64, block) + __field(u64, len) + __field(u64, owner) + __field(u64, flags) + ), + TP_fast_assign( + __entry->dev = sb->s_bdev->bd_dev; + __entry->keydev = new_decode_dev(fsmap->fmr_device); + __entry->block = fsmap->fmr_physical; + __entry->len = fsmap->fmr_length; + __entry->owner = fsmap->fmr_owner; + __entry->flags = fsmap->fmr_flags; + ), + TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld flags 0x%llx\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->block, + __entry->len, + __entry->owner, + __entry->flags) +) +#define DEFINE_GETFSMAP_EVENT(name) \ +DEFINE_EVENT(ext4_getfsmap_class, name, \ + TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), \ + TP_ARGS(sb, fsmap)) +DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key); +DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key); +DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index c80fcad0a6c97b6975628fe6dcd236863f295c51..15da88c5c3a4d12ef12ff56a7e6cde8a039920b7 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -15,6 +15,8 @@ TRACE_DEFINE_ENUM(META); TRACE_DEFINE_ENUM(META_FLUSH); TRACE_DEFINE_ENUM(INMEM); TRACE_DEFINE_ENUM(INMEM_DROP); +TRACE_DEFINE_ENUM(INMEM_INVALIDATE); +TRACE_DEFINE_ENUM(INMEM_REVOKE); TRACE_DEFINE_ENUM(IPU); TRACE_DEFINE_ENUM(OPU); TRACE_DEFINE_ENUM(CURSEG_HOT_DATA); @@ -42,6 +44,7 @@ 
TRACE_DEFINE_ENUM(CP_FASTBOOT); TRACE_DEFINE_ENUM(CP_SYNC); TRACE_DEFINE_ENUM(CP_RECOVERY); TRACE_DEFINE_ENUM(CP_DISCARD); +TRACE_DEFINE_ENUM(CP_TRIMMED); #define show_block_type(type) \ __print_symbolic(type, \ @@ -51,12 +54,13 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { META_FLUSH, "META_FLUSH" }, \ { INMEM, "INMEM" }, \ { INMEM_DROP, "INMEM_DROP" }, \ + { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \ { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) -#define F2FS_OP_FLAGS (REQ_RAHEAD | REQ_SYNC | REQ_PREFLUSH | REQ_META |\ - REQ_PRIO) +#define F2FS_OP_FLAGS (REQ_RAHEAD | REQ_SYNC | REQ_META | REQ_PRIO | \ + REQ_PREFLUSH | REQ_FUA) #define F2FS_BIO_FLAG_MASK(t) (t & F2FS_OP_FLAGS) #define show_bio_type(op,op_flags) show_bio_op(op), \ @@ -75,16 +79,13 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { REQ_OP_WRITE_ZEROES, "WRITE_ZEROES" }) #define show_bio_op_flags(flags) \ - __print_symbolic(F2FS_BIO_FLAG_MASK(flags), \ - { REQ_RAHEAD, "(RA)" }, \ - { REQ_SYNC, "(S)" }, \ - { REQ_SYNC | REQ_PRIO, "(SP)" }, \ - { REQ_META, "(M)" }, \ - { REQ_META | REQ_PRIO, "(MP)" }, \ - { REQ_SYNC | REQ_PREFLUSH , "(SF)" }, \ - { REQ_SYNC | REQ_META | REQ_PRIO, "(SMP)" }, \ - { REQ_PREFLUSH | REQ_META | REQ_PRIO, "(FMP)" }, \ - { 0, " \b" }) + __print_flags(F2FS_BIO_FLAG_MASK(flags), "|", \ + { REQ_RAHEAD, "R" }, \ + { REQ_SYNC, "S" }, \ + { REQ_META, "M" }, \ + { REQ_PRIO, "P" }, \ + { REQ_PREFLUSH, "PF" }, \ + { REQ_FUA, "FUA" }) #define show_data_type(type) \ __print_symbolic(type, \ @@ -117,12 +118,13 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { GC_CB, "Cost-Benefit" }) #define show_cpreason(type) \ - __print_symbolic(type, \ + __print_flags(type, "|", \ { CP_UMOUNT, "Umount" }, \ { CP_FASTBOOT, "Fastboot" }, \ { CP_SYNC, "Sync" }, \ { CP_RECOVERY, "Recovery" }, \ - { CP_DISCARD, "Discard" }) + { CP_DISCARD, "Discard" }, \ + { CP_TRIMMED, "Trimmed" }) struct victim_sel_policy; struct f2fs_map_blocks; @@ -769,7 +772,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s%s, type = %s", + "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s", show_dev_ino(__entry), (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, @@ -822,7 +825,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, __entry->size = bio->bi_iter.bi_size; ), - TP_printk("dev = (%d,%d)/(%d,%d), rw = %s%s, %s, sector = %lld, size = %u", + TP_printk("dev = (%d,%d)/(%d,%d), rw = %s(%s), %s, sector = %lld, size = %u", show_dev(__entry->target), show_dev(__entry->dev), show_bio_type(__entry->op, __entry->op_flags), @@ -1126,7 +1129,7 @@ TRACE_EVENT(f2fs_write_checkpoint, __entry->msg) ); -TRACE_EVENT(f2fs_issue_discard, +DECLARE_EVENT_CLASS(f2fs_discard, TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), @@ -1150,6 +1153,20 @@ TRACE_EVENT(f2fs_issue_discard, (unsigned long long)__entry->blklen) ); +DEFINE_EVENT(f2fs_discard, f2fs_queue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + +DEFINE_EVENT(f2fs_discard, f2fs_issue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + TRACE_EVENT(f2fs_issue_reset_zone, TP_PROTO(struct block_device *dev, block_t blkstart), @@ -1174,26 +1191,29 @@ TRACE_EVENT(f2fs_issue_reset_zone, TRACE_EVENT(f2fs_issue_flush, TP_PROTO(struct block_device *dev, unsigned int nobarrier, - unsigned int flush_merge), + 
unsigned int flush_merge, int ret), - TP_ARGS(dev, nobarrier, flush_merge), + TP_ARGS(dev, nobarrier, flush_merge, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, nobarrier) __field(unsigned int, flush_merge) + __field(int, ret) ), TP_fast_assign( __entry->dev = dev->bd_dev; __entry->nobarrier = nobarrier; __entry->flush_merge = flush_merge; + __entry->ret = ret; ), - TP_printk("dev = (%d,%d), %s %s", + TP_printk("dev = (%d,%d), %s %s, ret = %d", show_dev(__entry->dev), __entry->nobarrier ? "skip (nobarrier)" : "issue", - __entry->flush_merge ? " with flush_merge" : "") + __entry->flush_merge ? " with flush_merge" : "", + __entry->ret) ); TRACE_EVENT(f2fs_lookup_extent_tree_start, diff --git a/include/trace/events/fs_dax.h b/include/trace/events/fs_dax.h index c566ddc87f73e5885841204d5ae7d160374545a6..08bb3ed18dcc213dbbdcfad0832a83a8fbb2ed4e 100644 --- a/include/trace/events/fs_dax.h +++ b/include/trace/events/fs_dax.h @@ -150,6 +150,136 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \ DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback); +DECLARE_EVENT_CLASS(dax_pte_fault_class, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), + TP_ARGS(inode, vmf, result), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(pgoff_t, pgoff) + __field(dev_t, dev) + __field(unsigned int, flags) + __field(int, result) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->flags = vmf->flags; + __entry->pgoff = vmf->pgoff; + __entry->result = result; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx pgoff %#lx %s", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __print_flags(__entry->flags, "|", FAULT_FLAG_TRACE), + __entry->address, + __entry->pgoff, + __print_flags(__entry->result, "|", VM_FAULT_RESULT_TRACE) + ) +) + +#define DEFINE_PTE_FAULT_EVENT(name) \ +DEFINE_EVENT(dax_pte_fault_class, name, \ + TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), \ + TP_ARGS(inode, vmf, result)) + +DEFINE_PTE_FAULT_EVENT(dax_pte_fault); +DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); +DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite_no_entry); +DEFINE_PTE_FAULT_EVENT(dax_pfn_mkwrite); +DEFINE_PTE_FAULT_EVENT(dax_load_hole); + +TRACE_EVENT(dax_insert_mapping, + TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry), + TP_ARGS(inode, vmf, radix_entry), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(unsigned long, vm_flags) + __field(unsigned long, address) + __field(void *, radix_entry) + __field(dev_t, dev) + __field(int, write) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->vm_flags = vmf->vma->vm_flags; + __entry->address = vmf->address; + __entry->write = vmf->flags & FAULT_FLAG_WRITE; + __entry->radix_entry = radix_entry; + ), + TP_printk("dev %d:%d ino %#lx %s %s address %#lx radix_entry %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->vm_flags & VM_SHARED ? "shared" : "private", + __entry->write ? 
"write" : "read", + __entry->address, + (unsigned long)__entry->radix_entry + ) +) + +DECLARE_EVENT_CLASS(dax_writeback_range_class, + TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index), + TP_ARGS(inode, start_index, end_index), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(pgoff_t, start_index) + __field(pgoff_t, end_index) + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->start_index = start_index; + __entry->end_index = end_index; + ), + TP_printk("dev %d:%d ino %#lx pgoff %#lx-%#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->start_index, + __entry->end_index + ) +) + +#define DEFINE_WRITEBACK_RANGE_EVENT(name) \ +DEFINE_EVENT(dax_writeback_range_class, name, \ + TP_PROTO(struct inode *inode, pgoff_t start_index, pgoff_t end_index),\ + TP_ARGS(inode, start_index, end_index)) + +DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range); +DEFINE_WRITEBACK_RANGE_EVENT(dax_writeback_range_done); + +TRACE_EVENT(dax_writeback_one, + TP_PROTO(struct inode *inode, pgoff_t pgoff, pgoff_t pglen), + TP_ARGS(inode, pgoff, pglen), + TP_STRUCT__entry( + __field(unsigned long, ino) + __field(pgoff_t, pgoff) + __field(pgoff_t, pglen) + __field(dev_t, dev) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->pgoff = pgoff; + __entry->pglen = pglen; + ), + TP_printk("dev %d:%d ino %#lx pgoff %#lx pglen %#lx", + MAJOR(__entry->dev), + MINOR(__entry->dev), + __entry->ino, + __entry->pgoff, + __entry->pglen + ) +) + #endif /* _TRACE_FS_DAX_H */ /* This part must be outside protection */ diff --git a/include/trace/events/iommu.h b/include/trace/events/iommu.h index 2c7befb10f13e3b0175385e501e40ddedaab9094..99254ed89212b0036c7afecb1fcd318d6cac3b8b 100644 --- a/include/trace/events/iommu.h +++ b/include/trace/events/iommu.h @@ -11,7 +11,6 @@ #define _TRACE_IOMMU_H #include -#include struct device; diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h index 39123c06a5661316a80dd677b43b1e581a17e2e7..29a3d53a401535ef1e602a2dd82d79d66bc68bea 100644 --- a/include/trace/events/rxrpc.h +++ b/include/trace/events/rxrpc.h @@ -683,6 +683,57 @@ TRACE_EVENT(rxrpc_rx_ack, __entry->n_acks) ); +TRACE_EVENT(rxrpc_rx_abort, + TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, + u32 abort_code), + + TP_ARGS(call, serial, abort_code), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_serial_t, serial ) + __field(u32, abort_code ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->serial = serial; + __entry->abort_code = abort_code; + ), + + TP_printk("c=%p ABORT %08x ac=%d", + __entry->call, + __entry->serial, + __entry->abort_code) + ); + +TRACE_EVENT(rxrpc_rx_rwind_change, + TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, + u32 rwind, bool wake), + + TP_ARGS(call, serial, rwind, wake), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_serial_t, serial ) + __field(u32, rwind ) + __field(bool, wake ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->serial = serial; + __entry->rwind = rwind; + __entry->wake = wake; + ), + + TP_printk("c=%p %08x rw=%u%s", + __entry->call, + __entry->serial, + __entry->rwind, + __entry->wake ? 
" wake" : "") + ); + TRACE_EVENT(rxrpc_tx_data, TP_PROTO(struct rxrpc_call *call, rxrpc_seq_t seq, rxrpc_serial_t serial, u8 flags, bool retrans, bool lose), @@ -1087,6 +1138,56 @@ TRACE_EVENT(rxrpc_improper_term, __entry->abort_code) ); +TRACE_EVENT(rxrpc_rx_eproto, + TP_PROTO(struct rxrpc_call *call, rxrpc_serial_t serial, + const char *why), + + TP_ARGS(call, serial, why), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(rxrpc_serial_t, serial ) + __field(const char *, why ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->serial = serial; + __entry->why = why; + ), + + TP_printk("c=%p EPROTO %08x %s", + __entry->call, + __entry->serial, + __entry->why) + ); + +TRACE_EVENT(rxrpc_connect_call, + TP_PROTO(struct rxrpc_call *call), + + TP_ARGS(call), + + TP_STRUCT__entry( + __field(struct rxrpc_call *, call ) + __field(unsigned long, user_call_ID ) + __field(u32, cid ) + __field(u32, call_id ) + ), + + TP_fast_assign( + __entry->call = call; + __entry->user_call_ID = call->user_call_ID; + __entry->cid = call->cid; + __entry->call_id = call->call_id; + ), + + TP_printk("c=%p u=%p %08x:%08x", + __entry->call, + (void *)__entry->user_call_ID, + __entry->cid, + __entry->call_id) + ); + #endif /* _TRACE_RXRPC_H */ /* This part must be outside protection */ diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 9e3ef6c99e4b0db058a312f0405475b44cef8354..ae1409ffe99a00817f02a2cc51771840712fc1c3 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -70,7 +70,7 @@ DECLARE_EVENT_CLASS(sched_wakeup_template, TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; - __entry->prio = p->prio; + __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->success = 1; /* rudiment, kill when possible */ __entry->target_cpu = task_cpu(p); ), @@ -147,6 +147,7 @@ TRACE_EVENT(sched_switch, memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; + /* XXX SCHED_DEADLINE */ ), TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", @@ -181,7 +182,7 @@ TRACE_EVENT(sched_migrate_task, TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; - __entry->prio = p->prio; + __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; ), @@ -206,7 +207,7 @@ DECLARE_EVENT_CLASS(sched_process_template, TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; - __entry->prio = p->prio; + __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", @@ -253,7 +254,7 @@ TRACE_EVENT(sched_process_wait, TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); __entry->pid = pid_nr(pid); - __entry->prio = current->prio; + __entry->prio = current->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", @@ -413,9 +414,9 @@ DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime, */ TRACE_EVENT(sched_pi_setprio, - TP_PROTO(struct task_struct *tsk, int newprio), + TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task), - TP_ARGS(tsk, newprio), + TP_ARGS(tsk, pi_task), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -428,7 +429,8 @@ TRACE_EVENT(sched_pi_setprio, memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->oldprio = tsk->prio; - __entry->newprio = newprio; + __entry->newprio = pi_task ? 
pi_task->prio : tsk->prio; + /* XXX SCHED_DEADLINE bits missing */ ), TP_printk("comm=%s pid=%d oldprio=%d newprio=%d", diff --git a/include/trace/events/thermal.h b/include/trace/events/thermal.h index 2b4a8ff72d0dd54647255f6f8dc5ec8b61981872..6cde5b3514c2793cbaaffd3420c7217feafbad7a 100644 --- a/include/trace/events/thermal.h +++ b/include/trace/events/thermal.h @@ -151,9 +151,9 @@ TRACE_EVENT(thermal_power_cpu_limit, TRACE_EVENT(thermal_power_devfreq_get_power, TP_PROTO(struct thermal_cooling_device *cdev, struct devfreq_dev_status *status, unsigned long freq, - u32 dynamic_power, u32 static_power), + u32 dynamic_power, u32 static_power, u32 power), - TP_ARGS(cdev, status, freq, dynamic_power, static_power), + TP_ARGS(cdev, status, freq, dynamic_power, static_power, power), TP_STRUCT__entry( __string(type, cdev->type ) @@ -161,6 +161,7 @@ TRACE_EVENT(thermal_power_devfreq_get_power, __field(u32, load ) __field(u32, dynamic_power ) __field(u32, static_power ) + __field(u32, power) ), TP_fast_assign( @@ -169,11 +170,13 @@ TRACE_EVENT(thermal_power_devfreq_get_power, __entry->load = (100 * status->busy_time) / status->total_time; __entry->dynamic_power = dynamic_power; __entry->static_power = static_power; + __entry->power = power; ), - TP_printk("type=%s freq=%lu load=%u dynamic_power=%u static_power=%u", + TP_printk("type=%s freq=%lu load=%u dynamic_power=%u static_power=%u power=%u", __get_str(type), __entry->freq, - __entry->load, __entry->dynamic_power, __entry->static_power) + __entry->load, __entry->dynamic_power, __entry->static_power, + __entry->power) ); TRACE_EVENT(thermal_power_devfreq_limit, diff --git a/include/trace/events/v4l2.h b/include/trace/events/v4l2.h index ee7754c6e4a1279e87974f62c8d9c7c3fa3d152c..b3a85b3df53e4eb0fd4dc62219a99c7dac42e01e 100644 --- a/include/trace/events/v4l2.h +++ b/include/trace/events/v4l2.h @@ -29,6 +29,7 @@ EM( V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE, "VIDEO_OUTPUT_MPLANE" ) \ EM( V4L2_BUF_TYPE_SDR_CAPTURE, "SDR_CAPTURE" ) \ EM( V4L2_BUF_TYPE_SDR_OUTPUT, "SDR_OUTPUT" ) \ + EM( V4L2_BUF_TYPE_META_CAPTURE, "META_CAPTURE" ) \ EMe(V4L2_BUF_TYPE_PRIVATE, "PRIVATE" ) SHOW_TYPE diff --git a/include/trace/events/xen.h b/include/trace/events/xen.h index bce990f5a35d77615bb536009940f6146d1e7d82..31acce9019a63e7b4123dd89d9daa7f31111f74a 100644 --- a/include/trace/events/xen.h +++ b/include/trace/events/xen.h @@ -241,21 +241,21 @@ TRACE_EVENT(xen_mmu_set_pud, (int)sizeof(pudval_t) * 2, (unsigned long long)__entry->pudval) ); -TRACE_EVENT(xen_mmu_set_pgd, - TP_PROTO(pgd_t *pgdp, pgd_t *user_pgdp, pgd_t pgdval), - TP_ARGS(pgdp, user_pgdp, pgdval), +TRACE_EVENT(xen_mmu_set_p4d, + TP_PROTO(p4d_t *p4dp, p4d_t *user_p4dp, p4d_t p4dval), + TP_ARGS(p4dp, user_p4dp, p4dval), TP_STRUCT__entry( - __field(pgd_t *, pgdp) - __field(pgd_t *, user_pgdp) - __field(pgdval_t, pgdval) - ), - TP_fast_assign(__entry->pgdp = pgdp; - __entry->user_pgdp = user_pgdp; - __entry->pgdval = pgdval.pgd), - TP_printk("pgdp %p user_pgdp %p pgdval %0*llx (raw %0*llx)", - __entry->pgdp, __entry->user_pgdp, - (int)sizeof(pgdval_t) * 2, (unsigned long long)pgd_val(native_make_pgd(__entry->pgdval)), - (int)sizeof(pgdval_t) * 2, (unsigned long long)__entry->pgdval) + __field(p4d_t *, p4dp) + __field(p4d_t *, user_p4dp) + __field(p4dval_t, p4dval) + ), + TP_fast_assign(__entry->p4dp = p4dp; + __entry->user_p4dp = user_p4dp; + __entry->p4dval = p4d_val(p4dval)), + TP_printk("p4dp %p user_p4dp %p p4dval %0*llx (raw %0*llx)", + __entry->p4dp, __entry->user_p4dp, + (int)sizeof(p4dval_t) * 2, (unsigned 
long long)pgd_val(native_make_pgd(__entry->p4dval)), + (int)sizeof(p4dval_t) * 2, (unsigned long long)__entry->p4dval) ); TRACE_EVENT(xen_mmu_pud_clear, diff --git a/include/uapi/Kbuild b/include/uapi/Kbuild deleted file mode 100644 index 245aa6e05e6adef3f0c0cc4ea2e3a017a2231481..0000000000000000000000000000000000000000 --- a/include/uapi/Kbuild +++ /dev/null @@ -1,15 +0,0 @@ -# UAPI Header export list -# Top-level Makefile calls into asm-$(ARCH) -# List only non-arch directories below - - -header-y += asm-generic/ -header-y += linux/ -header-y += sound/ -header-y += mtd/ -header-y += rdma/ -header-y += video/ -header-y += drm/ -header-y += xen/ -header-y += scsi/ -header-y += misc/ diff --git a/include/uapi/asm-generic/Kbuild b/include/uapi/asm-generic/Kbuild deleted file mode 100644 index b73de7bb7a62ae9eff4d0445ae848c5d9b94999d..0000000000000000000000000000000000000000 --- a/include/uapi/asm-generic/Kbuild +++ /dev/null @@ -1,36 +0,0 @@ -# UAPI Header export list -header-y += auxvec.h -header-y += bitsperlong.h -header-y += errno-base.h -header-y += errno.h -header-y += fcntl.h -header-y += int-l64.h -header-y += int-ll64.h -header-y += ioctl.h -header-y += ioctls.h -header-y += ipcbuf.h -header-y += kvm_para.h -header-y += mman-common.h -header-y += mman.h -header-y += msgbuf.h -header-y += param.h -header-y += poll.h -header-y += posix_types.h -header-y += resource.h -header-y += sembuf.h -header-y += setup.h -header-y += shmbuf.h -header-y += shmparam.h -header-y += siginfo.h -header-y += signal-defs.h -header-y += signal.h -header-y += socket.h -header-y += sockios.h -header-y += stat.h -header-y += statfs.h -header-y += swab.h -header-y += termbits.h -header-y += termios.h -header-y += types.h -header-y += ucontext.h -header-y += unistd.h diff --git a/include/uapi/asm-generic/Kbuild.asm b/include/uapi/asm-generic/Kbuild.asm index fcd50b759217615088767688661c4ece3259b5cf..21381449d98a88f0b5f68942b307128a4269926a 100644 --- a/include/uapi/asm-generic/Kbuild.asm +++ b/include/uapi/asm-generic/Kbuild.asm @@ -1,49 +1,33 @@ -# -# Headers that are optional in usr/include/asm/ -# -opt-header += kvm.h -opt-header += kvm_para.h -opt-header += a.out.h - # # Headers that are mandatory in usr/include/asm/ # -header-y += auxvec.h -header-y += bitsperlong.h -header-y += byteorder.h -header-y += errno.h -header-y += fcntl.h -header-y += ioctl.h -header-y += ioctls.h -header-y += ipcbuf.h -header-y += mman.h -header-y += msgbuf.h -header-y += param.h -header-y += poll.h -header-y += posix_types.h -header-y += ptrace.h -header-y += resource.h -header-y += sembuf.h -header-y += setup.h -header-y += shmbuf.h -header-y += sigcontext.h -header-y += siginfo.h -header-y += signal.h -header-y += socket.h -header-y += sockios.h -header-y += stat.h -header-y += statfs.h -header-y += swab.h -header-y += termbits.h -header-y += termios.h -header-y += types.h -header-y += unistd.h - -header-y += $(foreach hdr,$(opt-header), \ - $(if \ - $(wildcard \ - $(srctree)/arch/$(SRCARCH)/include/uapi/asm/$(hdr) \ - $(srctree)/arch/$(SRCARCH)/include/asm/$(hdr) \ - ), \ - $(hdr) \ - )) +mandatory-y += auxvec.h +mandatory-y += bitsperlong.h +mandatory-y += byteorder.h +mandatory-y += errno.h +mandatory-y += fcntl.h +mandatory-y += ioctl.h +mandatory-y += ioctls.h +mandatory-y += ipcbuf.h +mandatory-y += mman.h +mandatory-y += msgbuf.h +mandatory-y += param.h +mandatory-y += poll.h +mandatory-y += posix_types.h +mandatory-y += ptrace.h +mandatory-y += resource.h +mandatory-y += sembuf.h +mandatory-y += setup.h +mandatory-y 
+= shmbuf.h +mandatory-y += sigcontext.h +mandatory-y += siginfo.h +mandatory-y += signal.h +mandatory-y += socket.h +mandatory-y += sockios.h +mandatory-y += stat.h +mandatory-y += statfs.h +mandatory-y += swab.h +mandatory-y += termbits.h +mandatory-y += termios.h +mandatory-y += types.h +mandatory-y += unistd.h diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h index 2c748ddad5f875711ed66f91eae9bc69b9a41fe0..2b488565599daf73b8942fe6482890830af94dff 100644 --- a/include/uapi/asm-generic/socket.h +++ b/include/uapi/asm-generic/socket.h @@ -94,4 +94,10 @@ #define SCM_TIMESTAMPING_OPT_STATS 54 +#define SO_MEMINFO 55 + +#define SO_INCOMING_NAPI_ID 56 + +#define SO_COOKIE 57 + #endif /* __ASM_GENERIC_SOCKET_H */ diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index a076cf1a3a23be2fbee73dab483e051b37b2370c..061185a5eb51390c68edd38f09ea55284df8ae88 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -194,8 +194,7 @@ __SYSCALL(__NR_quotactl, sys_quotactl) /* fs/readdir.c */ #define __NR_getdents64 61 -#define __ARCH_WANT_COMPAT_SYS_GETDENTS64 -__SC_COMP(__NR_getdents64, sys_getdents64, compat_sys_getdents64) +__SYSCALL(__NR_getdents64, sys_getdents64) /* fs/read_write.c */ #define __NR3264_lseek 62 diff --git a/include/uapi/drm/Kbuild b/include/uapi/drm/Kbuild deleted file mode 100644 index c97addd08f8c9545314e6d3519dc16aeb6f917d5..0000000000000000000000000000000000000000 --- a/include/uapi/drm/Kbuild +++ /dev/null @@ -1,23 +0,0 @@ -# UAPI Header export list -header-y += drm.h -header-y += drm_fourcc.h -header-y += drm_mode.h -header-y += drm_sarea.h -header-y += amdgpu_drm.h -header-y += exynos_drm.h -header-y += i810_drm.h -header-y += i915_drm.h -header-y += mga_drm.h -header-y += nouveau_drm.h -header-y += omap_drm.h -header-y += qxl_drm.h -header-y += r128_drm.h -header-y += radeon_drm.h -header-y += savage_drm.h -header-y += sis_drm.h -header-y += tegra_drm.h -header-y += via_drm.h -header-y += vmwgfx_drm.h -header-y += msm_drm.h -header-y += vc4_drm.h -header-y += virtgpu_drm.h diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h index 516a9f2857307fd672557e2761692fd1b9fbbbb2..6c249e5cfb09d4d74cfe7c18aea7a46208273487 100644 --- a/include/uapi/drm/amdgpu_drm.h +++ b/include/uapi/drm/amdgpu_drm.h @@ -295,7 +295,10 @@ union drm_amdgpu_gem_wait_idle { }; struct drm_amdgpu_wait_cs_in { - /** Command submission handle */ + /* Command submission handle + * handle equals 0 means none to wait for + * handle equals ~0ull means wait for the latest sequence number + */ __u64 handle; /** Absolute timeout to wait */ __u64 timeout; @@ -764,6 +767,25 @@ struct drm_amdgpu_info_device { __u64 cntl_sb_buf_gpu_addr; /* NGG Parameter Cache */ __u64 param_buf_gpu_addr; + __u32 prim_buf_size; + __u32 pos_buf_size; + __u32 cntl_sb_buf_size; + __u32 param_buf_size; + /* wavefront size*/ + __u32 wave_front_size; + /* shader visible vgprs*/ + __u32 num_shader_visible_vgprs; + /* CU per shader array*/ + __u32 num_cu_per_sh; + /* number of tcc blocks*/ + __u32 num_tcc_blocks; + /* gs vgt table depth*/ + __u32 gs_vgt_table_depth; + /* gs primitive buffer depth*/ + __u32 gs_prim_buffer_depth; + /* max gs wavefront per vgt*/ + __u32 max_gs_waves_per_vgt; + __u32 _pad1; }; struct drm_amdgpu_info_hw_ip { diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 995c8f9c692fb36a12056541d944edd7e3590026..55e301047b3e94a4d0eee18c6bb7732cc162b695 100644 --- 
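
The three new SOL_SOCKET options above are read-only getsockopt() values: SO_MEMINFO dumps the socket's memory accounting counters, SO_INCOMING_NAPI_ID reports the NAPI context that delivered the last packet, and SO_COOKIE returns the same 64-bit non-decreasing identifier that the bpf_get_socket_cookie() helper (added further down in this series) exposes to eBPF. A minimal sketch for SO_COOKIE, with a fallback define for libc headers that predate this change:

	#include <stdio.h>
	#include <stdint.h>
	#include <sys/socket.h>

	#ifndef SO_COOKIE
	#define SO_COOKIE 57	/* value added by this patch */
	#endif

	int print_socket_cookie(int fd)
	{
		uint64_t cookie = 0;
		socklen_t len = sizeof(cookie);

		if (getsockopt(fd, SOL_SOCKET, SO_COOKIE, &cookie, &len) < 0)
			return -1;
		printf("socket cookie: %llu\n", (unsigned long long)cookie);
		return 0;
	}
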
a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -306,6 +306,51 @@ extern "C" { */ #define DRM_FORMAT_MOD_VIVANTE_SPLIT_SUPER_TILED fourcc_mod_code(VIVANTE, 4) + +/* NVIDIA Tegra frame buffer modifiers */ + +/* + * Some modifiers take parameters, for example the number of vertical GOBs in + * a block. Reserve the lower 32 bits for parameters + */ +#define __fourcc_mod_tegra_mode_shift 32 +#define fourcc_mod_tegra_code(val, params) \ + fourcc_mod_code(NV, ((((__u64)val) << __fourcc_mod_tegra_mode_shift) | params)) +#define fourcc_mod_tegra_mod(m) \ + (m & ~((1ULL << __fourcc_mod_tegra_mode_shift) - 1)) +#define fourcc_mod_tegra_param(m) \ + (m & ((1ULL << __fourcc_mod_tegra_mode_shift) - 1)) + +/* + * Tegra Tiled Layout, used by Tegra 2, 3 and 4. + * + * Pixels are arranged in simple tiles of 16 x 16 bytes. + */ +#define NV_FORMAT_MOD_TEGRA_TILED fourcc_mod_tegra_code(1, 0) + +/* + * Tegra 16Bx2 Block Linear layout, used by TK1/TX1 + * + * Pixels are arranged in 64x8 Groups Of Bytes (GOBs). GOBs are then stacked + * vertically by a power of 2 (1 to 32 GOBs) to form a block. + * + * Within a GOB, data is ordered as 16B x 2 lines sectors laid in Z-shape. + * + * Parameter 'v' is the log2 encoding of the number of GOBs stacked vertically. + * Valid values are: + * + * 0 == ONE_GOB + * 1 == TWO_GOBS + * 2 == FOUR_GOBS + * 3 == EIGHT_GOBS + * 4 == SIXTEEN_GOBS + * 5 == THIRTYTWO_GOBS + * + * Chapter 20 "Pixel Memory Formats" of the Tegra X1 TRM describes this format + * in full detail. + */ +#define NV_FORMAT_MOD_TEGRA_16BX2_BLOCK(v) fourcc_mod_tegra_code(2, v) + #if defined(__cplusplus) } #endif diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index f8d9fed17ba99418d858ceb57a864b31a00d078a..ca2787d9bf0f87598dc827faf40baf7e92403fba 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -1,493 +1,13 @@ # UAPI Header export list -header-y += android/ -header-y += byteorder/ -header-y += can/ -header-y += caif/ -header-y += dvb/ -header-y += hdlc/ -header-y += hsi/ -header-y += iio/ -header-y += isdn/ -header-y += mmc/ -header-y += nfsd/ -header-y += raid/ -header-y += spi/ -header-y += sunrpc/ -header-y += tc_act/ -header-y += tc_ematch/ -header-y += netfilter/ -header-y += netfilter_arp/ -header-y += netfilter_bridge/ -header-y += netfilter_ipv4/ -header-y += netfilter_ipv6/ -header-y += usb/ -header-y += wimax/ -genhdr-y += version.h - -ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/a.out.h \ - $(srctree)/arch/$(SRCARCH)/include/asm/a.out.h),) -header-y += a.out.h +ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/a.out.h),) +no-export-headers += a.out.h endif -header-y += acct.h -header-y += adb.h -header-y += adfs_fs.h -header-y += affs_hardblocks.h -header-y += agpgart.h -header-y += aio_abi.h -header-y += am437x-vpfe.h -header-y += apm_bios.h -header-y += arcfb.h -header-y += atalk.h -header-y += atmapi.h -header-y += atmarp.h -header-y += atmbr2684.h -header-y += atmclip.h -header-y += atmdev.h -header-y += atm_eni.h -header-y += atm.h -header-y += atm_he.h -header-y += atm_idt77105.h -header-y += atmioc.h -header-y += atmlec.h -header-y += atmmpc.h -header-y += atm_nicstar.h -header-y += atmppp.h -header-y += atmsap.h -header-y += atmsvc.h -header-y += atm_tcp.h -header-y += atm_zatm.h -header-y += audit.h -header-y += auto_fs4.h -header-y += auto_fs.h -header-y += auxvec.h -header-y += ax25.h -header-y += b1lli.h -header-y += batman_adv.h -header-y += baycom.h -header-y += bcm933xx_hcs.h -header-y += 
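
The Tegra macros above split the modifier into a mode selector and a 32-bit parameter field. A small sketch of how the parameterised 16Bx2 block-linear form is composed and decomposed, assuming a drm_fourcc.h (e.g. the libdrm copy) that already carries these additions:

	#include <stdint.h>
	#include <stdio.h>
	#include <drm/drm_fourcc.h>

	int main(void)
	{
		/* Four GOBs stacked vertically -> v = log2(4) = 2. */
		uint64_t mod = NV_FORMAT_MOD_TEGRA_16BX2_BLOCK(2);

		printf("mode bits: 0x%llx\n",
		       (unsigned long long)fourcc_mod_tegra_mod(mod));
		printf("parameter: %llu\n",
		       (unsigned long long)fourcc_mod_tegra_param(mod));
		return 0;
	}
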
bfs_fs.h -header-y += binfmts.h -header-y += blkpg.h -header-y += blktrace_api.h -header-y += blkzoned.h -header-y += bpf_common.h -header-y += bpf_perf_event.h -header-y += bpf.h -header-y += bpqether.h -header-y += bsg.h -header-y += bt-bmc.h -header-y += btrfs.h -header-y += can.h -header-y += capability.h -header-y += capi.h -header-y += cciss_defs.h -header-y += cciss_ioctl.h -header-y += cdrom.h -header-y += cec.h -header-y += cec-funcs.h -header-y += cgroupstats.h -header-y += chio.h -header-y += cm4000_cs.h -header-y += cn_proc.h -header-y += coda.h -header-y += coda_psdev.h -header-y += coff.h -header-y += connector.h -header-y += const.h -header-y += cramfs_fs.h -header-y += cuda.h -header-y += cyclades.h -header-y += cycx_cfm.h -header-y += dcbnl.h -header-y += dccp.h -header-y += devlink.h -header-y += dlmconstants.h -header-y += dlm_device.h -header-y += dlm.h -header-y += dlm_netlink.h -header-y += dlm_plock.h -header-y += dm-ioctl.h -header-y += dm-log-userspace.h -header-y += dma-buf.h -header-y += dn.h -header-y += dqblk_xfs.h -header-y += edd.h -header-y += efs_fs_sb.h -header-y += elfcore.h -header-y += elf-em.h -header-y += elf-fdpic.h -header-y += elf.h -header-y += errno.h -header-y += errqueue.h -header-y += ethtool.h -header-y += eventpoll.h -header-y += fadvise.h -header-y += falloc.h -header-y += fanotify.h -header-y += fb.h -header-y += fcntl.h -header-y += fd.h -header-y += fdreg.h -header-y += fib_rules.h -header-y += fiemap.h -header-y += filter.h -header-y += firewire-cdev.h -header-y += firewire-constants.h -header-y += flat.h -header-y += fou.h -header-y += fs.h -header-y += fsl_hypervisor.h -header-y += fuse.h -header-y += futex.h -header-y += gameport.h -header-y += genetlink.h -header-y += gen_stats.h -header-y += gfs2_ondisk.h -header-y += gigaset_dev.h -header-y += gpio.h -header-y += gsmmux.h -header-y += gtp.h -header-y += hdlcdrv.h -header-y += hdlc.h -header-y += hdreg.h -header-y += hiddev.h -header-y += hid.h -header-y += hidraw.h -header-y += hpet.h -header-y += hsr_netlink.h -header-y += hyperv.h -header-y += hysdn_if.h -header-y += i2c-dev.h -header-y += i2c.h -header-y += i2o-dev.h -header-y += i8k.h -header-y += icmp.h -header-y += icmpv6.h -header-y += if_addr.h -header-y += if_addrlabel.h -header-y += if_alg.h -header-y += if_arcnet.h -header-y += if_arp.h -header-y += if_bonding.h -header-y += if_bridge.h -header-y += if_cablemodem.h -header-y += if_eql.h -header-y += if_ether.h -header-y += if_fc.h -header-y += if_fddi.h -header-y += if_frad.h -header-y += if.h -header-y += if_hippi.h -header-y += if_infiniband.h -header-y += if_link.h -header-y += if_ltalk.h -header-y += if_macsec.h -header-y += if_packet.h -header-y += if_phonet.h -header-y += if_plip.h -header-y += if_ppp.h -header-y += if_pppol2tp.h -header-y += if_pppox.h -header-y += if_slip.h -header-y += if_team.h -header-y += if_tun.h -header-y += if_tunnel.h -header-y += if_vlan.h -header-y += if_x25.h -header-y += ife.h -header-y += igmp.h -header-y += ila.h -header-y += in6.h -header-y += inet_diag.h -header-y += in.h -header-y += inotify.h -header-y += input.h -header-y += input-event-codes.h -header-y += in_route.h -header-y += ioctl.h -header-y += ip6_tunnel.h -header-y += ipc.h -header-y += ip.h -header-y += ipmi.h -header-y += ipmi_msgdefs.h -header-y += ipsec.h -header-y += ipv6.h -header-y += ipv6_route.h -header-y += ip_vs.h -header-y += ipx.h -header-y += irda.h -header-y += irqnr.h -header-y += isdn_divertif.h -header-y += isdn.h -header-y += isdnif.h -header-y += 
isdn_ppp.h -header-y += iso_fs.h -header-y += ivtvfb.h -header-y += ivtv.h -header-y += ixjuser.h -header-y += jffs2.h -header-y += joystick.h -header-y += kcmp.h -header-y += kdev_t.h -header-y += kd.h -header-y += kernelcapi.h -header-y += kernel.h -header-y += kernel-page-flags.h -header-y += kexec.h -header-y += keyboard.h -header-y += keyctl.h - -ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm.h \ - $(srctree)/arch/$(SRCARCH)/include/asm/kvm.h),) -header-y += kvm.h +ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm.h),) +no-export-headers += kvm.h endif - -ifneq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm_para.h \ - $(srctree)/arch/$(SRCARCH)/include/asm/kvm_para.h),) -header-y += kvm_para.h +ifeq ($(wildcard $(srctree)/arch/$(SRCARCH)/include/uapi/asm/kvm_para.h),) +no-export-headers += kvm_para.h endif - -header-y += hw_breakpoint.h -header-y += l2tp.h -header-y += libc-compat.h -header-y += lirc.h -header-y += limits.h -header-y += llc.h -header-y += loop.h -header-y += lp.h -header-y += lwtunnel.h -header-y += magic.h -header-y += major.h -header-y += map_to_7segment.h -header-y += matroxfb.h -header-y += mdio.h -header-y += media.h -header-y += media-bus-format.h -header-y += mei.h -header-y += membarrier.h -header-y += memfd.h -header-y += mempolicy.h -header-y += meye.h -header-y += mic_common.h -header-y += mic_ioctl.h -header-y += mii.h -header-y += minix_fs.h -header-y += mman.h -header-y += mmtimer.h -header-y += mpls.h -header-y += mpls_iptunnel.h -header-y += mqueue.h -header-y += mroute6.h -header-y += mroute.h -header-y += msdos_fs.h -header-y += msg.h -header-y += mtio.h -header-y += nbd.h -header-y += ncp_fs.h -header-y += ncp.h -header-y += ncp_mount.h -header-y += ncp_no.h -header-y += ndctl.h -header-y += neighbour.h -header-y += netconf.h -header-y += netdevice.h -header-y += net_dropmon.h -header-y += netfilter_arp.h -header-y += netfilter_bridge.h -header-y += netfilter_decnet.h -header-y += netfilter.h -header-y += netfilter_ipv4.h -header-y += netfilter_ipv6.h -header-y += net.h -header-y += netlink_diag.h -header-y += netlink.h -header-y += netrom.h -header-y += net_namespace.h -header-y += net_tstamp.h -header-y += nfc.h -header-y += psample.h -header-y += nfs2.h -header-y += nfs3.h -header-y += nfs4.h -header-y += nfs4_mount.h -header-y += nfsacl.h -header-y += nfs_fs.h -header-y += nfs.h -header-y += nfs_idmap.h -header-y += nfs_mount.h -header-y += nl80211.h -header-y += n_r3964.h -header-y += nubus.h -header-y += nvme_ioctl.h -header-y += nvram.h -header-y += omap3isp.h -header-y += omapfb.h -header-y += oom.h -header-y += openvswitch.h -header-y += packet_diag.h -header-y += param.h -header-y += parport.h -header-y += patchkey.h -header-y += pci.h -header-y += pci_regs.h -header-y += perf_event.h -header-y += personality.h -header-y += pfkeyv2.h -header-y += pg.h -header-y += phantom.h -header-y += phonet.h -header-y += pktcdvd.h -header-y += pkt_cls.h -header-y += pkt_sched.h -header-y += pmu.h -header-y += poll.h -header-y += posix_acl.h -header-y += posix_acl_xattr.h -header-y += posix_types.h -header-y += ppdev.h -header-y += ppp-comp.h -header-y += ppp_defs.h -header-y += ppp-ioctl.h -header-y += pps.h -header-y += prctl.h -header-y += psci.h -header-y += ptp_clock.h -header-y += ptrace.h -header-y += qnx4_fs.h -header-y += qnxtypes.h -header-y += quota.h -header-y += radeonfb.h -header-y += random.h -header-y += raw.h -header-y += rds.h -header-y += reboot.h -header-y += reiserfs_fs.h -header-y += 
reiserfs_xattr.h -header-y += resource.h -header-y += rfkill.h -header-y += rio_cm_cdev.h -header-y += rio_mport_cdev.h -header-y += romfs_fs.h -header-y += rose.h -header-y += route.h -header-y += rtc.h -header-y += rtnetlink.h -header-y += scc.h -header-y += sched.h -header-y += scif_ioctl.h -header-y += screen_info.h -header-y += sctp.h -header-y += sdla.h -header-y += seccomp.h -header-y += securebits.h -header-y += seg6_genl.h -header-y += seg6.h -header-y += seg6_hmac.h -header-y += seg6_iptunnel.h -header-y += selinux_netlink.h -header-y += sem.h -header-y += serial_core.h -header-y += serial.h -header-y += serial_reg.h -header-y += serio.h -header-y += shm.h -header-y += signalfd.h -header-y += signal.h -header-y += smiapp.h -header-y += snmp.h -header-y += sock_diag.h -header-y += socket.h -header-y += sockios.h -header-y += sonet.h -header-y += sonypi.h -header-y += soundcard.h -header-y += sound.h -header-y += stat.h -header-y += stddef.h -header-y += string.h -header-y += suspend_ioctls.h -header-y += swab.h -header-y += synclink.h -header-y += sync_file.h -header-y += sysctl.h -header-y += sysinfo.h -header-y += target_core_user.h -header-y += taskstats.h -header-y += tcp.h -header-y += tcp_metrics.h -header-y += telephony.h -header-y += termios.h -header-y += thermal.h -header-y += time.h -header-y += timerfd.h -header-y += times.h -header-y += timex.h -header-y += tiocl.h -header-y += tipc_config.h -header-y += tipc_netlink.h -header-y += tipc.h -header-y += toshiba.h -header-y += tty_flags.h -header-y += tty.h -header-y += types.h -header-y += udf_fs_i.h -header-y += udp.h -header-y += uhid.h -header-y += uinput.h -header-y += uio.h -header-y += uleds.h -header-y += ultrasound.h -header-y += un.h -header-y += unistd.h -header-y += unix_diag.h -header-y += usbdevice_fs.h -header-y += usbip.h -header-y += userio.h -header-y += utime.h -header-y += utsname.h -header-y += uuid.h -header-y += uvcvideo.h -header-y += v4l2-common.h -header-y += v4l2-controls.h -header-y += v4l2-dv-timings.h -header-y += v4l2-mediabus.h -header-y += v4l2-subdev.h -header-y += veth.h -header-y += vfio.h -header-y += vhost.h -header-y += videodev2.h -header-y += virtio_9p.h -header-y += virtio_balloon.h -header-y += virtio_blk.h -header-y += virtio_config.h -header-y += virtio_console.h -header-y += virtio_gpu.h -header-y += virtio_ids.h -header-y += virtio_input.h -header-y += virtio_mmio.h -header-y += virtio_net.h -header-y += virtio_pci.h -header-y += virtio_ring.h -header-y += virtio_rng.h -header-y += virtio_scsi.h -header-y += virtio_types.h -header-y += virtio_vsock.h -header-y += virtio_crypto.h -header-y += vm_sockets.h -header-y += vt.h -header-y += vtpm_proxy.h -header-y += wait.h -header-y += wanrouter.h -header-y += watchdog.h -header-y += wimax.h -header-y += wireless.h -header-y += x25.h -header-y += xattr.h -header-y += xfrm.h -header-y += xilinx-v4l2-controls.h -header-y += zorro.h -header-y += zorro_ids.h -header-y += userfaultfd.h diff --git a/include/uapi/linux/android/Kbuild b/include/uapi/linux/android/Kbuild deleted file mode 100644 index ca011eec252a8c2fd962e24ca6542a771dcf9284..0000000000000000000000000000000000000000 --- a/include/uapi/linux/android/Kbuild +++ /dev/null @@ -1,2 +0,0 @@ -# UAPI Header export list -header-y += binder.h diff --git a/include/uapi/linux/aspeed-lpc-ctrl.h b/include/uapi/linux/aspeed-lpc-ctrl.h new file mode 100644 index 0000000000000000000000000000000000000000..c328c976c684cb0b5add21c26de7c96c273dc787 --- /dev/null +++ 
b/include/uapi/linux/aspeed-lpc-ctrl.h @@ -0,0 +1,61 @@ +/* + * Copyright 2017 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _UAPI_LINUX_ASPEED_LPC_CTRL_H +#define _UAPI_LINUX_ASPEED_LPC_CTRL_H + +#include <linux/ioctl.h> +#include <linux/types.h> + +/* Window types */ +#define ASPEED_LPC_CTRL_WINDOW_FLASH 1 +#define ASPEED_LPC_CTRL_WINDOW_MEMORY 2 + +/* + * This driver provides a window for the host to access a BMC resource + * across the BMC <-> Host LPC bus. + * + * window_type: The BMC resource that the host will access through the + * window. BMC flash and BMC RAM. + * + * window_id: For each window type there may be multiple windows, + * these are referenced by ID. + * + * flags: Reserved for future use, this field is expected to be + * zeroed. + * + * addr: Address on the host LPC bus that the specified window should + * be mapped. This address must be power of two aligned. + * + * offset: Offset into the BMC window that should be mapped to the + * host (at addr). This must be a multiple of size. + * + * size: The size of the mapping. The smallest possible size is 64K. + * This must be power of two aligned. + * + */ + +struct aspeed_lpc_ctrl_mapping { + __u8 window_type; + __u8 window_id; + __u16 flags; + __u32 addr; + __u32 offset; + __u32 size; +}; + +#define __ASPEED_LPC_CTRL_IOCTL_MAGIC 0xb2 + +#define ASPEED_LPC_CTRL_IOCTL_GET_SIZE _IOWR(__ASPEED_LPC_CTRL_IOCTL_MAGIC, \ + 0x00, struct aspeed_lpc_ctrl_mapping) + +#define ASPEED_LPC_CTRL_IOCTL_MAP _IOW(__ASPEED_LPC_CTRL_IOCTL_MAGIC, \ + 0x01, struct aspeed_lpc_ctrl_mapping) + +#endif /* _UAPI_LINUX_ASPEED_LPC_CTRL_H */ diff --git a/include/uapi/linux/bcache.h b/include/uapi/linux/bcache.h index 22b6ad31c706dae59544286faea5808d7b303342..e3bb0635e94ae238481fece501d33b9b6c0ed937 100644 --- a/include/uapi/linux/bcache.h +++ b/include/uapi/linux/bcache.h @@ -5,7 +5,7 @@ * Bcache on disk data structures */ -#include <asm/types.h> +#include <linux/types.h> #define BITMASK(name, type, field, offset, size) \ static inline __u64 name(const type *k) \ diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 0539a0ceef38155835552360667070552ebce641..945a1f5f63c546c5521da5254a80664344ebd0ad 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -81,6 +81,7 @@ enum bpf_cmd { BPF_OBJ_GET, BPF_PROG_ATTACH, BPF_PROG_DETACH, + BPF_PROG_TEST_RUN, }; enum bpf_map_type { @@ -96,6 +97,8 @@ enum bpf_map_type { BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, BPF_MAP_TYPE_LPM_TRIE, + BPF_MAP_TYPE_ARRAY_OF_MAPS, + BPF_MAP_TYPE_HASH_OF_MAPS, }; enum bpf_prog_type { @@ -152,6 +155,7 @@ union bpf_attr { __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ __u32 map_flags; /* prealloc or not */ + __u32 inner_map_fd; /* fd pointing to the inner map */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ @@ -186,6 +190,17 @@ union bpf_attr { __u32 attach_type; __u32 attach_flags; }; + + struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ + __u32 prog_fd; + __u32 retval; + __u32 data_size_in; + __u32 data_size_out; + __aligned_u64 data_in; + __aligned_u64 data_out; + __u32 repeat; + __u32 duration; + } test; } __attribute__((aligned(8))); /* BPF helper function descriptions: @@ -456,6 +471,17 @@ union bpf_attr { * Return: * > 0 length of the string including the trailing NUL 
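
For illustration, a hedged sketch of the intended aspeed-lpc-ctrl ioctl sequence: query the size of a flash window, then ask the driver to map it onto the host LPC bus. The device node path and the host address are illustrative, not part of the ABI:

	#include <fcntl.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/aspeed-lpc-ctrl.h>

	int map_flash_window(void)
	{
		struct aspeed_lpc_ctrl_mapping map;
		int fd = open("/dev/aspeed-lpc-ctrl", O_RDWR); /* illustrative node */

		if (fd < 0)
			return -1;

		memset(&map, 0, sizeof(map));
		map.window_type = ASPEED_LPC_CTRL_WINDOW_FLASH;
		map.window_id = 0;

		/* The driver fills in map.size for this window. */
		if (ioctl(fd, ASPEED_LPC_CTRL_IOCTL_GET_SIZE, &map) < 0)
			return -1;

		/* Map the whole window at an example host LPC address. */
		map.addr = 0x0fc00000;
		map.offset = 0;
		return ioctl(fd, ASPEED_LPC_CTRL_IOCTL_MAP, &map);
	}
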
on success * < 0 error + + * u64 bpf_get_socket_cookie(skb) + * Get the cookie for the socket stored inside sk_buff. + * @skb: pointer to skb + * Return: 8 Bytes non-decreasing number on success or 0 if the socket + * field is missing inside sk_buff + + * u32 bpf_get_socket_uid(skb) + * Get the owner uid of the socket stored inside sk_buff. + * @skb: pointer to skb + * Return: uid of the socket owner on success or overflowuid if failed. */ #define __BPF_FUNC_MAPPER(FN) \ FN(unspec), \ @@ -503,7 +529,9 @@ union bpf_attr { FN(get_numa_node_id), \ FN(skb_change_head), \ FN(xdp_adjust_head), \ - FN(probe_read_str), + FN(probe_read_str), \ + FN(get_socket_cookie), \ + FN(get_socket_uid), /* integer value in 'imm' field of BPF_CALL instruction selects which helper * function eBPF program intends to call @@ -574,6 +602,7 @@ struct __sk_buff { __u32 tc_classid; __u32 data; __u32 data_end; + __u32 napi_id; }; struct bpf_tunnel_key { diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index dcfc3a5a9cb1d20f29bbac00c6ef315006e9d208..a456e5309238bbd78466abee16a0da6ab0248e50 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -291,10 +291,10 @@ struct btrfs_ioctl_feature_flags { struct btrfs_balance_args { __u64 profiles; union { - __le64 usage; + __u64 usage; struct { - __le32 usage_min; - __le32 usage_max; + __u32 usage_min; + __u32 usage_max; }; }; __u64 devid; @@ -324,8 +324,8 @@ struct btrfs_balance_args { * Process chunks that cross stripes_min..stripes_max devices, * BTRFS_BALANCE_ARGS_STRIPES_RANGE */ - __le32 stripes_min; - __le32 stripes_max; + __u32 stripes_min; + __u32 stripes_max; __u64 unused[6]; } __attribute__ ((__packed__)); diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index d5ad15a106a707c13fa77c5733ea96b8e750a5c8..10689e1fdf11d1e232bb3e9dc572a693542dd7b3 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -1,6 +1,9 @@ #ifndef _BTRFS_CTREE_H_ #define _BTRFS_CTREE_H_ +#include <linux/btrfs.h> +#include <linux/types.h> + /* * This header contains the structure definitions and constants used * by file system objects that can be retrieved using diff --git a/include/uapi/linux/byteorder/Kbuild b/include/uapi/linux/byteorder/Kbuild deleted file mode 100644 index 619225b9ff2ef8d610f5ac6160d51e93561e1ef2..0000000000000000000000000000000000000000 --- a/include/uapi/linux/byteorder/Kbuild +++ /dev/null @@ -1,3 +0,0 @@ -# UAPI Header export list -header-y += big_endian.h -header-y += little_endian.h diff --git a/include/uapi/linux/caif/Kbuild b/include/uapi/linux/caif/Kbuild deleted file mode 100644 index 43396612d3a322dae3cb99f00719bfb15001c476..0000000000000000000000000000000000000000 --- a/include/uapi/linux/caif/Kbuild +++ /dev/null @@ -1,3 +0,0 @@ -# UAPI Header export list -header-y += caif_socket.h -header-y += if_caif.h diff --git a/include/uapi/linux/can/Kbuild b/include/uapi/linux/can/Kbuild deleted file mode 100644 index 21c91bf25a298c9090d103e1fe41406e72a7ae20..0000000000000000000000000000000000000000 --- a/include/uapi/linux/can/Kbuild +++ /dev/null @@ -1,6 +0,0 @@ -# UAPI Header export list -header-y += bcm.h -header-y += error.h -header-y += gw.h -header-y += netlink.h -header-y += raw.h diff --git a/include/uapi/linux/can/vxcan.h b/include/uapi/linux/can/vxcan.h new file mode 100644 index 0000000000000000000000000000000000000000..ffb0b7156f7e030b9c65a0b5e96f931fa8187eea --- /dev/null +++ b/include/uapi/linux/can/vxcan.h @@ -0,0 +1,12 @@ +#ifndef _UAPI_CAN_VXCAN_H +#define _UAPI_CAN_VXCAN_H + +enum { 
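
BPF_PROG_TEST_RUN is driven through the new anonymous `test` member of union bpf_attr shown above. A minimal wrapper around the raw bpf(2) syscall might look like the following sketch, assuming installed kernel headers that contain this patch; it is similar in shape to the helper used by the kernel selftests:

	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/bpf.h>

	static int prog_test_run(int prog_fd, void *data, __u32 size_in,
				 void *data_out, __u32 *size_out,
				 __u32 repeat, __u32 *retval, __u32 *duration)
	{
		union bpf_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.test.prog_fd = prog_fd;
		attr.test.data_in = (__aligned_u64)(unsigned long)data;
		attr.test.data_size_in = size_in;
		attr.test.data_out = (__aligned_u64)(unsigned long)data_out;
		attr.test.repeat = repeat;

		if (syscall(__NR_bpf, BPF_PROG_TEST_RUN, &attr, sizeof(attr)))
			return -1;

		if (size_out)
			*size_out = attr.test.data_size_out; /* bytes written back */
		if (retval)
			*retval = attr.test.retval;	/* program return code */
		if (duration)
			*duration = attr.test.duration;	/* mean ns per run */
		return 0;
	}
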
+ VXCAN_INFO_UNSPEC, + VXCAN_INFO_PEER, + + __VXCAN_INFO_MAX +#define VXCAN_INFO_MAX (__VXCAN_INFO_MAX - 1) +}; + +#endif diff --git a/include/uapi/linux/capability.h b/include/uapi/linux/capability.h index 49bc0629539879c8edc3ae83e4b1933bdfc1dcbd..6fe14d001f68f77b98991d8650df0fcc8a6f3988 100644 --- a/include/uapi/linux/capability.h +++ b/include/uapi/linux/capability.h @@ -205,7 +205,7 @@ struct vfs_cap_data { #define CAP_SYS_MODULE 16 /* Allow ioperm/iopl access */ -/* Allow sending USB messages to any device via /proc/bus/usb */ +/* Allow sending USB messages to any device via /dev/bus/usb */ #define CAP_SYS_RAWIO 17 diff --git a/include/uapi/linux/cec.h b/include/uapi/linux/cec.h index 14b6f24b189ef31e59b7cd354dda260f8f98aa28..a0dfe27bc6c7c0d07bf6abca546105b7d94773fb 100644 --- a/include/uapi/linux/cec.h +++ b/include/uapi/linux/cec.h @@ -223,7 +223,7 @@ static inline int cec_msg_status_is_ok(const struct cec_msg *msg) #define CEC_LOG_ADDR_BACKUP_2 13 #define CEC_LOG_ADDR_SPECIFIC 14 #define CEC_LOG_ADDR_UNREGISTERED 15 /* as initiator address */ -#define CEC_LOG_ADDR_BROADCAST 15 /* ad destination address */ +#define CEC_LOG_ADDR_BROADCAST 15 /* as destination address */ /* The logical address types that the CEC device wants to claim */ #define CEC_LOG_ADDR_TYPE_TV 0 diff --git a/include/uapi/linux/cryptouser.h b/include/uapi/linux/cryptouser.h index 11d21fce14d6e08147c94c53b88c66435ac374e8..fdcbb3c29083bb58531f8992bbdd47d5fa0fc23b 100644 --- a/include/uapi/linux/cryptouser.h +++ b/include/uapi/linux/cryptouser.h @@ -18,6 +18,8 @@ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. */ +#include <linux/types.h> + /* Netlink configuration messages. */ enum { CRYPTO_MSG_BASE = 0x10, @@ -31,7 +33,7 @@ enum { #define CRYPTO_MSG_MAX (__CRYPTO_MSG_MAX - 1) #define CRYPTO_NR_MSGTYPES (CRYPTO_MSG_MAX + 1 - CRYPTO_MSG_BASE) -#define CRYPTO_MAX_NAME CRYPTO_MAX_ALG_NAME +#define CRYPTO_MAX_NAME 64 /* Netlink message attributes. */ enum crypto_attr_type_t { @@ -53,9 +55,9 @@ enum crypto_attr_type_t { }; struct crypto_user_alg { - char cru_name[CRYPTO_MAX_ALG_NAME]; - char cru_driver_name[CRYPTO_MAX_ALG_NAME]; - char cru_module_name[CRYPTO_MAX_ALG_NAME]; + char cru_name[CRYPTO_MAX_NAME]; + char cru_driver_name[CRYPTO_MAX_NAME]; + char cru_module_name[CRYPTO_MAX_NAME]; __u32 cru_type; __u32 cru_mask; __u32 cru_refcnt; @@ -73,7 +75,7 @@ struct crypto_user_alg { struct crypto_report_hash { }; struct crypto_report_cipher { - char type[CRYPTO_MAX_ALG_NAME]; + char type[CRYPTO_MAX_NAME]; unsigned int blocksize; unsigned int min_keysize; unsigned int max_keysize; diff --git a/include/uapi/linux/devlink.h b/include/uapi/linux/devlink.h index 0f1f3a12e23c30e511cdb332059e30ee8d3d5efb..b0e807ac53bbfd3b088533381ecbf25a689633b0 100644 --- a/include/uapi/linux/devlink.h +++ b/include/uapi/linux/devlink.h @@ -65,8 +65,12 @@ enum devlink_command { #define DEVLINK_CMD_ESWITCH_MODE_SET /* obsolete, never use this! */ \ DEVLINK_CMD_ESWITCH_SET - /* add new commands above here */ + DEVLINK_CMD_DPIPE_TABLE_GET, + DEVLINK_CMD_DPIPE_ENTRIES_GET, + DEVLINK_CMD_DPIPE_HEADERS_GET, + DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET, + /* add new commands above here */ __DEVLINK_CMD_MAX, DEVLINK_CMD_MAX = __DEVLINK_CMD_MAX - 1 }; @@ -115,6 +119,11 @@ enum devlink_eswitch_inline_mode { DEVLINK_ESWITCH_INLINE_MODE_TRANSPORT, }; +enum devlink_eswitch_encap_mode { + DEVLINK_ESWITCH_ENCAP_MODE_NONE, + DEVLINK_ESWITCH_ENCAP_MODE_BASIC, +}; + enum devlink_attr { /* don't change the order or add anything between, this is ABI! 
*/ DEVLINK_ATTR_UNSPEC, @@ -148,10 +157,73 @@ enum devlink_attr { DEVLINK_ATTR_ESWITCH_MODE, /* u16 */ DEVLINK_ATTR_ESWITCH_INLINE_MODE, /* u8 */ + DEVLINK_ATTR_DPIPE_TABLES, /* nested */ + DEVLINK_ATTR_DPIPE_TABLE, /* nested */ + DEVLINK_ATTR_DPIPE_TABLE_NAME, /* string */ + DEVLINK_ATTR_DPIPE_TABLE_SIZE, /* u64 */ + DEVLINK_ATTR_DPIPE_TABLE_MATCHES, /* nested */ + DEVLINK_ATTR_DPIPE_TABLE_ACTIONS, /* nested */ + DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED, /* u8 */ + + DEVLINK_ATTR_DPIPE_ENTRIES, /* nested */ + DEVLINK_ATTR_DPIPE_ENTRY, /* nested */ + DEVLINK_ATTR_DPIPE_ENTRY_INDEX, /* u64 */ + DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES, /* nested */ + DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES, /* nested */ + DEVLINK_ATTR_DPIPE_ENTRY_COUNTER, /* u64 */ + + DEVLINK_ATTR_DPIPE_MATCH, /* nested */ + DEVLINK_ATTR_DPIPE_MATCH_VALUE, /* nested */ + DEVLINK_ATTR_DPIPE_MATCH_TYPE, /* u32 */ + + DEVLINK_ATTR_DPIPE_ACTION, /* nested */ + DEVLINK_ATTR_DPIPE_ACTION_VALUE, /* nested */ + DEVLINK_ATTR_DPIPE_ACTION_TYPE, /* u32 */ + + DEVLINK_ATTR_DPIPE_VALUE, + DEVLINK_ATTR_DPIPE_VALUE_MASK, + DEVLINK_ATTR_DPIPE_VALUE_MAPPING, /* u32 */ + + DEVLINK_ATTR_DPIPE_HEADERS, /* nested */ + DEVLINK_ATTR_DPIPE_HEADER, /* nested */ + DEVLINK_ATTR_DPIPE_HEADER_NAME, /* string */ + DEVLINK_ATTR_DPIPE_HEADER_ID, /* u32 */ + DEVLINK_ATTR_DPIPE_HEADER_FIELDS, /* nested */ + DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, /* u8 */ + DEVLINK_ATTR_DPIPE_HEADER_INDEX, /* u32 */ + + DEVLINK_ATTR_DPIPE_FIELD, /* nested */ + DEVLINK_ATTR_DPIPE_FIELD_NAME, /* string */ + DEVLINK_ATTR_DPIPE_FIELD_ID, /* u32 */ + DEVLINK_ATTR_DPIPE_FIELD_BITWIDTH, /* u32 */ + DEVLINK_ATTR_DPIPE_FIELD_MAPPING_TYPE, /* u32 */ + + DEVLINK_ATTR_PAD, + + DEVLINK_ATTR_ESWITCH_ENCAP_MODE, /* u8 */ + /* add new attributes above here, update the policy in devlink.c */ __DEVLINK_ATTR_MAX, DEVLINK_ATTR_MAX = __DEVLINK_ATTR_MAX - 1 }; +/* Mapping between internal resource described by the field and system + * structure + */ +enum devlink_dpipe_field_mapping_type { + DEVLINK_DPIPE_FIELD_MAPPING_TYPE_NONE, + DEVLINK_DPIPE_FIELD_MAPPING_TYPE_IFINDEX, +}; + +/* Match type - specify the type of the match */ +enum devlink_dpipe_match_type { + DEVLINK_DPIPE_MATCH_TYPE_FIELD_EXACT, +}; + +/* Action type - specify the action type */ +enum devlink_dpipe_action_type { + DEVLINK_DPIPE_ACTION_TYPE_FIELD_MODIFY, +}; + #endif /* _UAPI_LINUX_DEVLINK_H_ */ diff --git a/include/uapi/linux/dvb/Kbuild b/include/uapi/linux/dvb/Kbuild deleted file mode 100644 index d40942cfc627f43b4b0da73a925d72264f778be0..0000000000000000000000000000000000000000 --- a/include/uapi/linux/dvb/Kbuild +++ /dev/null @@ -1,9 +0,0 @@ -# UAPI Header export list -header-y += audio.h -header-y += ca.h -header-y += dmx.h -header-y += frontend.h -header-y += net.h -header-y += osd.h -header-y += version.h -header-y += video.h diff --git a/include/uapi/linux/elf-em.h b/include/uapi/linux/elf-em.h index cb5d1a51920276d567f7c84b20001bda2f9c76ef..9cd1de954c0aca32718e3f3bd4c95f209cdeb0ae 100644 --- a/include/uapi/linux/elf-em.h +++ b/include/uapi/linux/elf-em.h @@ -42,7 +42,6 @@ #define EM_TILEGX 191 /* Tilera TILE-Gx */ #define EM_BPF 247 /* Linux BPF - in-kernel virtual machine */ #define EM_FRV 0x5441 /* Fujitsu FR-V */ -#define EM_AVR32 0x18ad /* Atmel AVR32 */ /* * This is an interim value that we will use until the committee comes diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index b59ee077a5964a888be764fdb5b817e3e377bcc0..b5280db9ef6a8c27f92ee89ffd0432df525e5039 100644 --- a/include/uapi/linux/elf.h +++ 
b/include/uapi/linux/elf.h @@ -409,6 +409,8 @@ typedef struct elf64_shdr { #define NT_S390_TDB 0x308 /* s390 transaction diagnostic block */ #define NT_S390_VXRS_LOW 0x309 /* s390 vector registers 0-15 upper half */ #define NT_S390_VXRS_HIGH 0x30a /* s390 vector registers 16-31 */ +#define NT_S390_GS_CB 0x30b /* s390 guarded storage registers */ +#define NT_S390_GS_BC 0x30c /* s390 guarded storage broadcast control block */ #define NT_ARM_VFP 0x400 /* ARM VFP/NEON registers */ #define NT_ARM_TLS 0x401 /* ARM TLS register */ #define NT_ARM_HW_BREAK 0x402 /* ARM hardware breakpoint registers */ @@ -417,7 +419,7 @@ typedef struct elf64_shdr { #define NT_METAG_CBUF 0x500 /* Metag catch buffer registers */ #define NT_METAG_RPIPE 0x501 /* Metag read pipeline state */ #define NT_METAG_TLS 0x502 /* Metag TLS pointer */ - +#define NT_ARC_V2 0x600 /* ARCv2 accumulator/extra registers */ /* Note header in a PT_NOTE section */ typedef struct elf32_note { diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h index 3dc91a46e8b8da0b243a12a168bbf205e5a87916..d179d7767f519829a50395f47749360f88cd3d2d 100644 --- a/include/uapi/linux/ethtool.h +++ b/include/uapi/linux/ethtool.h @@ -1487,12 +1487,14 @@ enum ethtool_link_mode_bit_indices { */ /* The forced speed, in units of 1Mb. All values 0 to INT_MAX are legal. */ +/* Update drivers/net/phy/phy.c:phy_speed_to_str() when adding new values */ #define SPEED_10 10 #define SPEED_100 100 #define SPEED_1000 1000 #define SPEED_2500 2500 #define SPEED_5000 5000 #define SPEED_10000 10000 +#define SPEED_14000 14000 #define SPEED_20000 20000 #define SPEED_25000 25000 #define SPEED_40000 40000 diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h index 1c3154913a391fb0f67ade68150fd8af939d791d..f4d5c998cc2bc0b941cc3218a7c512464f450790 100644 --- a/include/uapi/linux/eventpoll.h +++ b/include/uapi/linux/eventpoll.h @@ -26,8 +26,21 @@ #define EPOLL_CTL_DEL 2 #define EPOLL_CTL_MOD 3 +/* Epoll event masks */ +#define EPOLLIN 0x00000001 +#define EPOLLPRI 0x00000002 +#define EPOLLOUT 0x00000004 +#define EPOLLERR 0x00000008 +#define EPOLLHUP 0x00000010 +#define EPOLLRDNORM 0x00000040 +#define EPOLLRDBAND 0x00000080 +#define EPOLLWRNORM 0x00000100 +#define EPOLLWRBAND 0x00000200 +#define EPOLLMSG 0x00000400 +#define EPOLLRDHUP 0x00002000 + /* Set exclusive wakeup mode for the target file descriptor */ -#define EPOLLEXCLUSIVE (1 << 28) +#define EPOLLEXCLUSIVE (1U << 28) /* * Request the handling of system wakeup events so as to prevent system suspends @@ -39,13 +52,13 @@ * * Requires CAP_BLOCK_SUSPEND */ -#define EPOLLWAKEUP (1 << 29) +#define EPOLLWAKEUP (1U << 29) /* Set the One Shot behaviour for the target file descriptor */ -#define EPOLLONESHOT (1 << 30) +#define EPOLLONESHOT (1U << 30) /* Set the Edge Triggered behaviour for the target file descriptor */ -#define EPOLLET (1 << 31) +#define EPOLLET (1U << 31) /* * On x86-64 make the 64bit structure have the same alignment as the diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 048a85e9f0174170bc9cf861a388d08e19d1f36c..24e61a54feaaab505b2d2e74e5462a8638d0c897 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -279,12 +279,25 @@ struct fscrypt_policy { __u8 filenames_encryption_mode; __u8 flags; __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; -} __packed; +}; #define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy) #define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) #define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, 
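
Two things happen in the eventpoll.h hunk above: the event mask bits that userspace previously had to take from sys/epoll.h (or define by hand) are now part of the uapi header, and the four high configuration bits switch from signed to unsigned literals, since `1 << 31` shifts into the sign bit of an int, which is undefined behavior in C. Usage is unchanged; for example, EPOLLEXCLUSIVE still limits thundering-herd wakeups on a shared listening socket:

	#include <sys/epoll.h>

	int watch_listener_exclusive(int epfd, int listen_fd)
	{
		struct epoll_event ev = {
			.events = EPOLLIN | EPOLLEXCLUSIVE,
			.data.fd = listen_fd,
		};

		/* Only one of the waiters sharing listen_fd is woken per event. */
		return epoll_ctl(epfd, EPOLL_CTL_ADD, listen_fd, &ev);
	}
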
struct fscrypt_policy) +/* Parameters for passing an encryption key into the kernel keyring */ +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* Structure that userspace passes to the kernel keyring */ +#define FS_MAX_KEY_SIZE 64 + +struct fscrypt_key { + __u32 mode; + __u8 raw[FS_MAX_KEY_SIZE]; + __u32 size; +}; + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) * diff --git a/include/uapi/linux/fsmap.h b/include/uapi/linux/fsmap.h new file mode 100644 index 0000000000000000000000000000000000000000..7e8e5f0bd6d263c7a9e583a255810f5b85f96441 --- /dev/null +++ b/include/uapi/linux/fsmap.h @@ -0,0 +1,112 @@ +/* + * FS_IOC_GETFSMAP ioctl infrastructure. + * + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + */ +#ifndef _LINUX_FSMAP_H +#define _LINUX_FSMAP_H + +#include <linux/types.h> + +/* + * Structure for FS_IOC_GETFSMAP. + * + * The memory layout for this call are the scalar values defined in + * struct fsmap_head, followed by two struct fsmap that describe + * the lower and upper bound of mappings to return, followed by an + * array of struct fsmap mappings. + * + * fmh_iflags control the output of the call, whereas fmh_oflags report + * on the overall record output. fmh_count should be set to the + * length of the fmh_recs array, and fmh_entries will be set to the + * number of entries filled out during each call. If fmh_count is + * zero, the number of reverse mappings will be returned in + * fmh_entries, though no mappings will be returned. fmh_reserved + * must be set to zero. + * + * The two elements in the fmh_keys array are used to constrain the + * output. The first element in the array should represent the + * lowest disk mapping ("low key") that the user wants to learn + * about. If this value is all zeroes, the filesystem will return + * the first entry it knows about. For a subsequent call, the + * contents of fsmap_head.fmh_recs[fsmap_head.fmh_count - 1] should be + * copied into fmh_keys[0] to have the kernel start where it left off. + * + * The second element in the fmh_keys array should represent the + * highest disk mapping ("high key") that the user wants to learn + * about. If this value is all ones, the filesystem will not stop + * until it runs out of mapping to return or runs out of space in + * fmh_recs. + * + * fmr_device can be either a 32-bit cookie representing a device, or + * a 32-bit dev_t if the FMH_OF_DEV_T flag is set. fmr_physical, + * fmr_offset, and fmr_length are expressed in units of bytes. + * fmr_owner is either an inode number, or a special value if + * FMR_OF_SPECIAL_OWNER is set in fmr_flags. + */ +struct fsmap { + __u32 fmr_device; /* device id */ + __u32 fmr_flags; /* mapping flags */ + __u64 fmr_physical; /* device offset of segment */ + __u64 fmr_owner; /* owner id */ + __u64 fmr_offset; /* file offset of segment */ + __u64 fmr_length; /* length of segment */ + __u64 fmr_reserved[3]; /* must be zero */ +}; + +struct fsmap_head { + __u32 fmh_iflags; /* control flags */ + __u32 fmh_oflags; /* output flags */ + __u32 fmh_count; /* # of entries in array incl. input */ + __u32 fmh_entries; /* # of entries filled in (output). */ + __u64 fmh_reserved[6]; /* must be zero */ + + struct fsmap fmh_keys[2]; /* low and high keys for the mapping search */ + struct fsmap fmh_recs[]; /* returned records */ +}; + +/* Size of an fsmap_head with room for nr records. 
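
struct fscrypt_key travels to the kernel as the payload of a "logon" key whose description carries the "fscrypt:" prefix plus the policy's master_key_descriptor in hex. A sketch using the raw add_key(2) syscall (key type and prefix per this header; the hex descriptor is caller-supplied):

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/fs.h>
	#include <linux/keyctl.h>

	static long load_fscrypt_key(const char *desc_hex,
				     const __u8 *raw, __u32 size)
	{
		struct fscrypt_key key;
		char desc[64];

		memset(&key, 0, sizeof(key));	/* mode field left zero */
		key.size = size;
		memcpy(key.raw, raw, size);

		snprintf(desc, sizeof(desc), "%s%s", FS_KEY_DESC_PREFIX, desc_hex);

		/* "logon" keys can be used by the kernel but not read back. */
		return syscall(__NR_add_key, "logon", desc,
			       &key, sizeof(key), KEY_SPEC_SESSION_KEYRING);
	}
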
*/ +static inline size_t +fsmap_sizeof( + unsigned int nr) +{ + return sizeof(struct fsmap_head) + nr * sizeof(struct fsmap); +} + +/* Start the next fsmap query at the end of the current query results. */ +static inline void +fsmap_advance( + struct fsmap_head *head) +{ + head->fmh_keys[0] = head->fmh_recs[head->fmh_entries - 1]; +} + +/* fmh_iflags values - set by FS_IOC_GETFSMAP caller in the header. */ +/* no flags defined yet */ +#define FMH_IF_VALID 0 + +/* fmh_oflags values - returned in the header segment only. */ +#define FMH_OF_DEV_T 0x1 /* fmr_device values will be dev_t */ + +/* fmr_flags values - returned for each non-header segment */ +#define FMR_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */ +#define FMR_OF_ATTR_FORK 0x2 /* segment = attribute fork */ +#define FMR_OF_EXTENT_MAP 0x4 /* segment = extent map */ +#define FMR_OF_SHARED 0x8 /* segment = shared with another file */ +#define FMR_OF_SPECIAL_OWNER 0x10 /* owner is a special value */ +#define FMR_OF_LAST 0x20 /* segment is the last in the FS */ + +/* Each FS gets to define its own special owner codes. */ +#define FMR_OWNER(type, code) (((__u64)type << 32) | \ + ((__u64)code & 0xFFFFFFFFULL)) +#define FMR_OWNER_TYPE(owner) ((__u32)((__u64)owner >> 32)) +#define FMR_OWNER_CODE(owner) ((__u32)(((__u64)owner & 0xFFFFFFFFULL))) +#define FMR_OWN_FREE FMR_OWNER(0, 1) /* free space */ +#define FMR_OWN_UNKNOWN FMR_OWNER(0, 2) /* unknown owner */ +#define FMR_OWN_METADATA FMR_OWNER(0, 3) /* metadata */ + +#define FS_IOC_GETFSMAP _IOWR('X', 59, struct fsmap_head) + +#endif /* _LINUX_FSMAP_H */ diff --git a/include/uapi/linux/gtp.h b/include/uapi/linux/gtp.h index 72a04a0e8ccef1e5560378af9177a6c47954daaf..57d1edb8efd9bae86d0b11779f59de9d18f35f1a 100644 --- a/include/uapi/linux/gtp.h +++ b/include/uapi/linux/gtp.h @@ -19,7 +19,8 @@ enum gtp_attrs { GTPA_LINK, GTPA_VERSION, GTPA_TID, /* for GTPv0 only */ - GTPA_SGSN_ADDRESS, + GTPA_PEER_ADDRESS, /* Remote GSN peer, either SGSN or GGSN */ +#define GTPA_SGSN_ADDRESS GTPA_PEER_ADDRESS /* maintain legacy attr name */ GTPA_MS_ADDRESS, GTPA_FLOW, GTPA_NET_NS_FD, diff --git a/include/uapi/linux/hdlc/Kbuild b/include/uapi/linux/hdlc/Kbuild deleted file mode 100644 index 8c1d2cb75e330ae605277aa70ec44415a53f3428..0000000000000000000000000000000000000000 --- a/include/uapi/linux/hdlc/Kbuild +++ /dev/null @@ -1,2 +0,0 @@ -# UAPI Header export list -header-y += ioctl.h diff --git a/include/uapi/linux/hsi/Kbuild b/include/uapi/linux/hsi/Kbuild deleted file mode 100644 index a16a00544258c2e99e46fd1632548f79d875075f..0000000000000000000000000000000000000000 --- a/include/uapi/linux/hsi/Kbuild +++ /dev/null @@ -1,2 +0,0 @@ -# UAPI Header export list -header-y += hsi_char.h cs-protocol.h diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h index 4d024d75d64ba8b7831cdb1d61384e47ac14ef33..cf73510b923864fd1b0c20bd3de47ba874a9b855 100644 --- a/include/uapi/linux/if_arp.h +++ b/include/uapi/linux/if_arp.h @@ -95,6 +95,7 @@ #define ARPHRD_IP6GRE 823 /* GRE over IPv6 */ #define ARPHRD_NETLINK 824 /* Netlink header */ #define ARPHRD_6LOWPAN 825 /* IPv6 over LoWPAN */ +#define ARPHRD_VSOCKMON 826 /* Vsock monitor header */ #define ARPHRD_VOID 0xFFFF /* Void type, nothing is known */ #define ARPHRD_NONE 0xFFFE /* zero header length */ diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 320fc1e747ee9623db56fbaf26b2a514b5d5a3d1..8e56ac70e0d1a3536a43aff83370e5b3bd5bb0b4 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -323,6 
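
Taken together, the fsmap header suggests a simple iteration pattern: all-zeroes low key, all-ones high key, then fsmap_advance() after each full batch. A hedged userspace sketch, with only essential error handling:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/fsmap.h>

	int dump_fsmap(int fd)
	{
		unsigned int i, nr = 128;
		struct fsmap_head *head = calloc(1, fsmap_sizeof(nr));

		if (!head)
			return -1;
		head->fmh_count = nr;
		/* Low key stays all zeroes; make the high key all ones. */
		memset(&head->fmh_keys[1], 0xff, sizeof(struct fsmap));

		for (;;) {
			if (ioctl(fd, FS_IOC_GETFSMAP, head) < 0)
				break;
			for (i = 0; i < head->fmh_entries; i++)
				printf("dev %u phys %llu len %llu owner %llu\n",
				       head->fmh_recs[i].fmr_device,
				       (unsigned long long)head->fmh_recs[i].fmr_physical,
				       (unsigned long long)head->fmh_recs[i].fmr_length,
				       (unsigned long long)head->fmh_recs[i].fmr_owner);
			if (head->fmh_entries < nr)
				break;		/* ran out of mappings */
			fsmap_advance(head);	/* resume after the last record */
		}
		free(head);
		return 0;
	}
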
+323,7 @@ enum { IFLA_BRPORT_MCAST_FLOOD, IFLA_BRPORT_MCAST_TO_UCAST, IFLA_BRPORT_VLAN_TUNNEL, + IFLA_BRPORT_BCAST_FLOOD, __IFLA_BRPORT_MAX }; #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) @@ -538,11 +539,18 @@ enum { #define IFLA_PPP_MAX (__IFLA_PPP_MAX - 1) /* GTP section */ + +enum ifla_gtp_role { + GTP_ROLE_GGSN = 0, + GTP_ROLE_SGSN, +}; + enum { IFLA_GTP_UNSPEC, IFLA_GTP_FD0, IFLA_GTP_FD1, IFLA_GTP_PDP_HASHSIZE, + IFLA_GTP_ROLE, __IFLA_GTP_MAX, }; #define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) @@ -880,7 +888,9 @@ enum { /* XDP section */ #define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) -#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST) +#define XDP_FLAGS_SKB_MODE (2U << 0) +#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ + XDP_FLAGS_SKB_MODE) enum { IFLA_XDP_UNSPEC, diff --git a/include/uapi/linux/if_packet.h b/include/uapi/linux/if_packet.h index 9e7edfd8141e5dea69129807f46b89b2b637ead9..4df96a7dd4fae591d834735046911337221c60d3 100644 --- a/include/uapi/linux/if_packet.h +++ b/include/uapi/linux/if_packet.h @@ -66,6 +66,7 @@ struct sockaddr_ll { #define PACKET_FANOUT_CBPF 6 #define PACKET_FANOUT_EBPF 7 #define PACKET_FANOUT_FLAG_ROLLOVER 0x1000 +#define PACKET_FANOUT_FLAG_UNIQUEID 0x2000 #define PACKET_FANOUT_FLAG_DEFRAG 0x8000 struct tpacket_stats { diff --git a/include/uapi/linux/if_tunnel.h b/include/uapi/linux/if_tunnel.h index 92f3c8677523237ea0db55b0269aa0b73b9c2009..6792d1967d3145f3f2092d3b1bf052bf13ce04fb 100644 --- a/include/uapi/linux/if_tunnel.h +++ b/include/uapi/linux/if_tunnel.h @@ -75,6 +75,7 @@ enum { IFLA_IPTUN_ENCAP_SPORT, IFLA_IPTUN_ENCAP_DPORT, IFLA_IPTUN_COLLECT_METADATA, + IFLA_IPTUN_FWMARK, __IFLA_IPTUN_MAX, }; #define IFLA_IPTUN_MAX (__IFLA_IPTUN_MAX - 1) @@ -132,6 +133,7 @@ enum { IFLA_GRE_ENCAP_DPORT, IFLA_GRE_COLLECT_METADATA, IFLA_GRE_IGNORE_DF, + IFLA_GRE_FWMARK, __IFLA_GRE_MAX, }; @@ -147,6 +149,7 @@ enum { IFLA_VTI_OKEY, IFLA_VTI_LOCAL, IFLA_VTI_REMOTE, + IFLA_VTI_FWMARK, __IFLA_VTI_MAX, }; diff --git a/include/uapi/linux/iio/Kbuild b/include/uapi/linux/iio/Kbuild deleted file mode 100644 index 86f76d84c44f41fec38b49b8f1fa4d11dadfd648..0000000000000000000000000000000000000000 --- a/include/uapi/linux/iio/Kbuild +++ /dev/null @@ -1,3 +0,0 @@ -# UAPI Header export list -header-y += events.h -header-y += types.h diff --git a/include/uapi/linux/input-event-codes.h b/include/uapi/linux/input-event-codes.h index 3af60ee69053322b9c6ca72d3120ce9f23da1c80..f5a8d96e1e098543d7cad6af1d7a77b117701f40 100644 --- a/include/uapi/linux/input-event-codes.h +++ b/include/uapi/linux/input-event-codes.h @@ -641,6 +641,7 @@ * e.g. teletext or data broadcast application (MHEG, MHP, HbbTV, etc.) */ #define KEY_DATA 0x277 +#define KEY_ONSCREEN_KEYBOARD 0x278 #define BTN_TRIGGER_HAPPY 0x2c0 #define BTN_TRIGGER_HAPPY1 0x2c0 diff --git a/include/uapi/linux/input.h b/include/uapi/linux/input.h index e794f7bee22fd82db5595d047203b871778cbc3e..f561c0eb7d63645f341f3d75604c075d5e534319 100644 --- a/include/uapi/linux/input.h +++ b/include/uapi/linux/input.h @@ -61,9 +61,14 @@ struct input_id { * Note that input core does not clamp reported values to the * [minimum, maximum] limits, such task is left to userspace. * - * Resolution for main axes (ABS_X, ABS_Y, ABS_Z) is reported in - * units per millimeter (units/mm), resolution for rotational axes - * (ABS_RX, ABS_RY, ABS_RZ) is reported in units per radian. 
+ * The default resolution for main axes (ABS_X, ABS_Y, ABS_Z) + * is reported in units per millimeter (units/mm), resolution + * for rotational axes (ABS_RX, ABS_RY, ABS_RZ) is reported + * in units per radian. + * When INPUT_PROP_ACCELEROMETER is set the resolution changes. + * The main axes (ABS_X, ABS_Y, ABS_Z) are then reported in + * in units per g (units/g) and in units per degree per second + * (units/deg/s) for rotational axes (ABS_RX, ABS_RY, ABS_RZ). */ struct input_absinfo { __s32 value; diff --git a/include/uapi/linux/ipmi.h b/include/uapi/linux/ipmi.h index 7b26a62e570734219a9a493c75c9abb909ceef1a..b9095a27a08a55fdf2e029178e1ce472637bd877 100644 --- a/include/uapi/linux/ipmi.h +++ b/include/uapi/linux/ipmi.h @@ -355,7 +355,7 @@ struct ipmi_cmdspec { #define IPMICTL_REGISTER_FOR_CMD _IOR(IPMI_IOC_MAGIC, 14, \ struct ipmi_cmdspec) /* - * Unregister a regsitered command. error values: + * Unregister a registered command. error values: * - EFAULT - an address supplied was invalid. * - ENOENT - The netfn/cmd was not found registered for this user. */ diff --git a/include/uapi/linux/ipv6.h b/include/uapi/linux/ipv6.h index 8ef9e75e004ebbf951cb50b4b4b17f2c5b907dd4..2ae59178189d7983fb9ed99ae1cd5f40197cfb04 100644 --- a/include/uapi/linux/ipv6.h +++ b/include/uapi/linux/ipv6.h @@ -183,6 +183,8 @@ enum { DEVCONF_SEG6_REQUIRE_HMAC, DEVCONF_ENHANCED_DAD, DEVCONF_ADDR_GEN_MODE, + DEVCONF_DISABLE_POLICY, + DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN, DEVCONF_MAX }; diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h index 85bbb1799df3f93cb1eacbfec497278c2c20c098..d496c02e14bc44327fd3ab6c3faad0c1d7ac1e12 100644 --- a/include/uapi/linux/ipv6_route.h +++ b/include/uapi/linux/ipv6_route.h @@ -35,7 +35,7 @@ #define RTF_PREF(pref) ((pref) << 27) #define RTF_PREF_MASK 0x18000000 -#define RTF_PCPU 0x40000000 +#define RTF_PCPU 0x40000000 /* read-only: can not be set by user */ #define RTF_LOCAL 0x80000000 diff --git a/include/uapi/linux/isdn/Kbuild b/include/uapi/linux/isdn/Kbuild deleted file mode 100644 index 89e52850bf29c400d72541991b4877a9d499581d..0000000000000000000000000000000000000000 --- a/include/uapi/linux/isdn/Kbuild +++ /dev/null @@ -1,2 +0,0 @@ -# UAPI Header export list -header-y += capicmd.h diff --git a/include/uapi/linux/keyctl.h b/include/uapi/linux/keyctl.h index 86eddd6241f36eb21897d65c6bb51f15bbb4ec58..201c6644b2376846f3fccf16bc64d99a5aabdfd4 100644 --- a/include/uapi/linux/keyctl.h +++ b/include/uapi/linux/keyctl.h @@ -60,6 +60,7 @@ #define KEYCTL_INVALIDATE 21 /* invalidate a key */ #define KEYCTL_GET_PERSISTENT 22 /* get a user's persistent keyring */ #define KEYCTL_DH_COMPUTE 23 /* Compute Diffie-Hellman values */ +#define KEYCTL_RESTRICT_KEYRING 29 /* Restrict keys allowed to link to a keyring */ /* keyctl structures */ struct keyctl_dh_params { @@ -68,4 +69,11 @@ struct keyctl_dh_params { __s32 base; }; +struct keyctl_kdf_params { + char *hashname; + char *otherinfo; + __u32 otherinfolen; + __u32 __spare[8]; +}; + #endif /* _LINUX_KEYCTL_H */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f51d5082a377ec1b6f2a062e54fe8d4503702e3b..577429a95ad887ca98dd31d7f54b0a6a6da8b7db 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -702,6 +702,10 @@ struct kvm_ppc_resize_hpt { #define KVM_VM_PPC_HV 1 #define KVM_VM_PPC_PR 2 +/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */ +#define KVM_VM_MIPS_TE 0 +#define KVM_VM_MIPS_VZ 1 + #define KVM_S390_SIE_PAGE_OFFSET 1 /* @@ -883,6 +887,14 @@ struct kvm_ppc_resize_hpt { 
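
struct keyctl_kdf_params feeds the KDF extension of KEYCTL_DH_COMPUTE: the raw Diffie-Hellman shared secret is run through the named hash before being returned to the caller. A sketch of the five-argument keyctl(2) call, assuming a kernel with CONFIG_KEY_DH_OPERATIONS and using the full field layout of struct keyctl_dh_params (only partially visible in the hunk above):

	#include <stdint.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/keyctl.h>

	static long dh_compute_sha256(int32_t priv, int32_t prime, int32_t base,
				      char *out, size_t outlen)
	{
		struct keyctl_dh_params params = {
			.private = priv,	/* key serial numbers */
			.prime = prime,
			.base = base,
		};
		struct keyctl_kdf_params kdf = {
			.hashname = "sha256",	/* any hash the crypto API knows */
		};

		return syscall(__NR_keyctl, KEYCTL_DH_COMPUTE, &params,
			       out, outlen, &kdf);
	}
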
#define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 #define KVM_CAP_IMMEDIATE_EXIT 136 +#define KVM_CAP_MIPS_VZ 137 +#define KVM_CAP_MIPS_TE 138 +#define KVM_CAP_MIPS_64BIT 139 +#define KVM_CAP_S390_GS 140 +#define KVM_CAP_S390_AIS 141 +#define KVM_CAP_SPAPR_TCE_VFIO 142 +#define KVM_CAP_X86_GUEST_MWAIT 143 +#define KVM_CAP_ARM_USER_IRQ 144 #ifdef KVM_CAP_IRQ_ROUTING @@ -1087,6 +1099,7 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP 1 #define KVM_DEV_VFIO_GROUP_ADD 1 #define KVM_DEV_VFIO_GROUP_DEL 2 +#define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE 3 enum kvm_device_type { KVM_DEV_TYPE_FSL_MPIC_20 = 1, @@ -1108,6 +1121,11 @@ enum kvm_device_type { KVM_DEV_TYPE_MAX, }; +struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; +}; + /* * ioctls for VM fds */ @@ -1354,4 +1372,11 @@ struct kvm_assigned_msix_entry { #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) +/* Available with KVM_CAP_ARM_USER_IRQ */ + +/* Bits for run->s.regs.device_irq_level */ +#define KVM_ARM_DEV_EL1_VTIMER (1 << 0) +#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) +#define KVM_ARM_DEV_PMU (1 << 2) + #endif /* __LINUX_KVM_H */ diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index fd19f36b3129278343f37ee5e59dfa280713eb9c..c8aec4b9e73b8c85189eb9b62fa98cf1fb6e7bfa 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -85,6 +85,10 @@ struct nvm_ioctl_create_conf { }; }; +enum { + NVM_TARGET_FACTORY = 1 << 0, /* Init target in factory mode */ +}; + struct nvm_ioctl_create { char dev[DISK_NAME_LEN]; /* open-channel SSD device */ char tgttype[NVM_TTYPE_NAME_MAX]; /* target type name */ diff --git a/include/uapi/linux/mmc/Kbuild b/include/uapi/linux/mmc/Kbuild deleted file mode 100644 index 8c1d2cb75e330ae605277aa70ec44415a53f3428..0000000000000000000000000000000000000000 --- a/include/uapi/linux/mmc/Kbuild +++ /dev/null @@ -1,2 +0,0 @@ -# UAPI Header export list -header-y += ioctl.h diff --git a/include/uapi/linux/mpls_iptunnel.h b/include/uapi/linux/mpls_iptunnel.h index d80a0498f77ed2d4dac3b61a6eef1906eb8b0626..f5e45095b0bb5c17af6515012587d6d805a7d79c 100644 --- a/include/uapi/linux/mpls_iptunnel.h +++ b/include/uapi/linux/mpls_iptunnel.h @@ -16,11 +16,13 @@ /* MPLS tunnel attributes * [RTA_ENCAP] = { * [MPLS_IPTUNNEL_DST] + * [MPLS_IPTUNNEL_TTL] * } */ enum { MPLS_IPTUNNEL_UNSPEC, MPLS_IPTUNNEL_DST, + MPLS_IPTUNNEL_TTL, __MPLS_IPTUNNEL_MAX, }; #define MPLS_IPTUNNEL_MAX (__MPLS_IPTUNNEL_MAX - 1) diff --git a/include/uapi/linux/nbd-netlink.h b/include/uapi/linux/nbd-netlink.h new file mode 100644 index 0000000000000000000000000000000000000000..6f7ca3d63a653a113745aa6e1c6fab228c78f610 --- /dev/null +++ b/include/uapi/linux/nbd-netlink.h @@ -0,0 +1,98 @@ +/* + * Copyright (C) 2017 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
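
All of the KVM_CAP_* values added here are discoverable the usual way, via KVM_CHECK_EXTENSION on the /dev/kvm system fd (or on a VM fd for per-VM capabilities). A short probe, for instance:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	int main(void)
	{
		int kvm = open("/dev/kvm", O_RDWR);

		if (kvm < 0)
			return 1;
		/* Returns 0 if absent, positive if supported. */
		printf("s390 guarded storage: %d\n",
		       ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_S390_GS));
		printf("arm user-space irqs:  %d\n",
		       ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_ARM_USER_IRQ));
		return 0;
	}
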
+ */ +#ifndef _UAPILINUX_NBD_NETLINK_H +#define _UAPILINUX_NBD_NETLINK_H + +#define NBD_GENL_FAMILY_NAME "nbd" +#define NBD_GENL_VERSION 0x1 +#define NBD_GENL_MCAST_GROUP_NAME "nbd_mc_group" + +/* Configuration policy attributes, used for CONNECT */ +enum { + NBD_ATTR_UNSPEC, + NBD_ATTR_INDEX, + NBD_ATTR_SIZE_BYTES, + NBD_ATTR_BLOCK_SIZE_BYTES, + NBD_ATTR_TIMEOUT, + NBD_ATTR_SERVER_FLAGS, + NBD_ATTR_CLIENT_FLAGS, + NBD_ATTR_SOCKETS, + NBD_ATTR_DEAD_CONN_TIMEOUT, + NBD_ATTR_DEVICE_LIST, + __NBD_ATTR_MAX, +}; +#define NBD_ATTR_MAX (__NBD_ATTR_MAX - 1) + +/* + * This is the format for multiple devices with NBD_ATTR_DEVICE_LIST + * + * [NBD_ATTR_DEVICE_LIST] + * [NBD_DEVICE_ITEM] + * [NBD_DEVICE_INDEX] + * [NBD_DEVICE_CONNECTED] + */ +enum { + NBD_DEVICE_ITEM_UNSPEC, + NBD_DEVICE_ITEM, + __NBD_DEVICE_ITEM_MAX, +}; +#define NBD_DEVICE_ITEM_MAX (__NBD_DEVICE_ITEM_MAX - 1) + +enum { + NBD_DEVICE_UNSPEC, + NBD_DEVICE_INDEX, + NBD_DEVICE_CONNECTED, + __NBD_DEVICE_MAX, +}; +#define NBD_DEVICE_ATTR_MAX (__NBD_DEVICE_MAX - 1) + +/* + * This is the format for multiple sockets with NBD_ATTR_SOCKETS + * + * [NBD_ATTR_SOCKETS] + * [NBD_SOCK_ITEM] + * [NBD_SOCK_FD] + * [NBD_SOCK_ITEM] + * [NBD_SOCK_FD] + */ +enum { + NBD_SOCK_ITEM_UNSPEC, + NBD_SOCK_ITEM, + __NBD_SOCK_ITEM_MAX, +}; +#define NBD_SOCK_ITEM_MAX (__NBD_SOCK_ITEM_MAX - 1) + +enum { + NBD_SOCK_UNSPEC, + NBD_SOCK_FD, + __NBD_SOCK_MAX, +}; +#define NBD_SOCK_MAX (__NBD_SOCK_MAX - 1) + +enum { + NBD_CMD_UNSPEC, + NBD_CMD_CONNECT, + NBD_CMD_DISCONNECT, + NBD_CMD_RECONFIGURE, + NBD_CMD_LINK_DEAD, + NBD_CMD_STATUS, + __NBD_CMD_MAX, +}; +#define NBD_CMD_MAX (__NBD_CMD_MAX - 1) + +#endif /* _UAPILINUX_NBD_NETLINK_H */ diff --git a/include/uapi/linux/nbd.h b/include/uapi/linux/nbd.h index c91c642ea9003ed3ec5ba1be70b9899af3847948..155e33f819134a3aecdccf3c139974ef0c54c68c 100644 --- a/include/uapi/linux/nbd.h +++ b/include/uapi/linux/nbd.h @@ -37,7 +37,7 @@ enum { NBD_CMD_TRIM = 4 }; -/* values for flags field */ +/* values for flags field, these are server interaction specific. */ #define NBD_FLAG_HAS_FLAGS (1 << 0) /* nbd-server supports flags */ #define NBD_FLAG_READ_ONLY (1 << 1) /* device is read-only */ #define NBD_FLAG_SEND_FLUSH (1 << 2) /* can flush writeback cache */ @@ -45,6 +45,10 @@ enum { #define NBD_FLAG_SEND_TRIM (1 << 5) /* send trim/discard */ #define NBD_FLAG_CAN_MULTI_CONN (1 << 8) /* Server supports multiple connections per export. */ +/* These are client behavior specific flags. */ +#define NBD_CFLAG_DESTROY_ON_DISCONNECT (1 << 0) /* delete the nbd device on + disconnect. 
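
The generic-netlink family defined above is what nbd-client's netlink mode speaks. As a rough sketch, a disconnect request needs only the family name, the command, and NBD_ATTR_INDEX; this version leans on libnl-3, whose calls are assumptions about that library rather than part of this ABI:

	#include <netlink/netlink.h>
	#include <netlink/genl/genl.h>
	#include <netlink/genl/ctrl.h>
	#include <linux/nbd-netlink.h>

	int nbd_disconnect(int index)
	{
		struct nl_sock *sk = nl_socket_alloc();
		struct nl_msg *msg;
		int family, err;

		if (!sk)
			return -1;
		if (genl_connect(sk) < 0)
			return -1;
		family = genl_ctrl_resolve(sk, NBD_GENL_FAMILY_NAME);
		if (family < 0)
			return -1;

		msg = nlmsg_alloc();
		genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, family, 0, 0,
			    NBD_CMD_DISCONNECT, NBD_GENL_VERSION);
		nla_put_u32(msg, NBD_ATTR_INDEX, index);

		err = nl_send_sync(sk, msg);	/* sends, then waits for the ACK */
		nl_socket_free(sk);
		return err;
	}
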
*/ + /* userspace doesn't need the nbd_device structure */ /* These are sent over the network in the request/reply magic fields */ diff --git a/include/uapi/linux/ndctl.h b/include/uapi/linux/ndctl.h index ede5c6a62164ae182f4495724bf4a518e5b1b93b..7ad3863cb88b5b86766521bcee795e1a794faba3 100644 --- a/include/uapi/linux/ndctl.h +++ b/include/uapi/linux/ndctl.h @@ -169,6 +169,7 @@ enum { enum { ND_ARS_VOLATILE = 1, ND_ARS_PERSISTENT = 2, + ND_CONFIG_LOCKED = 1, }; static inline const char *nvdimm_bus_cmd_name(unsigned cmd) diff --git a/include/uapi/linux/netfilter/Kbuild b/include/uapi/linux/netfilter/Kbuild deleted file mode 100644 index 03f194aeadc51b56d468236c9bba67afbe0b86bd..0000000000000000000000000000000000000000 --- a/include/uapi/linux/netfilter/Kbuild +++ /dev/null @@ -1,89 +0,0 @@ -# UAPI Header export list -header-y += ipset/ -header-y += nf_conntrack_common.h -header-y += nf_conntrack_ftp.h -header-y += nf_conntrack_sctp.h -header-y += nf_conntrack_tcp.h -header-y += nf_conntrack_tuple_common.h -header-y += nf_log.h -header-y += nf_tables.h -header-y += nf_tables_compat.h -header-y += nf_nat.h -header-y += nfnetlink.h -header-y += nfnetlink_acct.h -header-y += nfnetlink_compat.h -header-y += nfnetlink_conntrack.h -header-y += nfnetlink_cthelper.h -header-y += nfnetlink_cttimeout.h -header-y += nfnetlink_log.h -header-y += nfnetlink_queue.h -header-y += x_tables.h -header-y += xt_AUDIT.h -header-y += xt_CHECKSUM.h -header-y += xt_CLASSIFY.h -header-y += xt_CONNMARK.h -header-y += xt_CONNSECMARK.h -header-y += xt_CT.h -header-y += xt_DSCP.h -header-y += xt_HMARK.h -header-y += xt_IDLETIMER.h -header-y += xt_LED.h -header-y += xt_LOG.h -header-y += xt_MARK.h -header-y += xt_NFLOG.h -header-y += xt_NFQUEUE.h -header-y += xt_RATEEST.h -header-y += xt_SECMARK.h -header-y += xt_SYNPROXY.h -header-y += xt_TCPMSS.h -header-y += xt_TCPOPTSTRIP.h -header-y += xt_TEE.h -header-y += xt_TPROXY.h -header-y += xt_addrtype.h -header-y += xt_bpf.h -header-y += xt_cgroup.h -header-y += xt_cluster.h -header-y += xt_comment.h -header-y += xt_connbytes.h -header-y += xt_connlabel.h -header-y += xt_connlimit.h -header-y += xt_connmark.h -header-y += xt_conntrack.h -header-y += xt_cpu.h -header-y += xt_dccp.h -header-y += xt_devgroup.h -header-y += xt_dscp.h -header-y += xt_ecn.h -header-y += xt_esp.h -header-y += xt_hashlimit.h -header-y += xt_helper.h -header-y += xt_ipcomp.h -header-y += xt_iprange.h -header-y += xt_ipvs.h -header-y += xt_l2tp.h -header-y += xt_length.h -header-y += xt_limit.h -header-y += xt_mac.h -header-y += xt_mark.h -header-y += xt_multiport.h -header-y += xt_nfacct.h -header-y += xt_osf.h -header-y += xt_owner.h -header-y += xt_physdev.h -header-y += xt_pkttype.h -header-y += xt_policy.h -header-y += xt_quota.h -header-y += xt_rateest.h -header-y += xt_realm.h -header-y += xt_recent.h -header-y += xt_rpfilter.h -header-y += xt_sctp.h -header-y += xt_set.h -header-y += xt_socket.h -header-y += xt_state.h -header-y += xt_statistic.h -header-y += xt_string.h -header-y += xt_tcpmss.h -header-y += xt_tcpudp.h -header-y += xt_time.h -header-y += xt_u32.h diff --git a/include/uapi/linux/netfilter/ipset/Kbuild b/include/uapi/linux/netfilter/ipset/Kbuild deleted file mode 100644 index d2680423d9abbe0a3e4bd755f0c27e4eb66dacad..0000000000000000000000000000000000000000 --- a/include/uapi/linux/netfilter/ipset/Kbuild +++ /dev/null @@ -1,5 +0,0 @@ -# UAPI Header export list -header-y += ip_set.h -header-y += ip_set_bitmap.h -header-y += ip_set_hash.h -header-y += ip_set_list.h diff --git 
a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index 6a8e33dd4ecbd83b0d254d4cf2486dfe5cacfd34..dc947e59d03a62c5dd6fccf27c1db0a88270ea4f 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -28,12 +28,14 @@ enum ip_conntrack_info { /* only for userspace compatibility */ #ifndef __KERNEL__ IP_CT_NEW_REPLY = IP_CT_NUMBER, +#else + IP_CT_UNTRACKED = 7, #endif }; #define NF_CT_STATE_INVALID_BIT (1 << 0) #define NF_CT_STATE_BIT(ctinfo) (1 << ((ctinfo) % IP_CT_IS_REPLY + 1)) -#define NF_CT_STATE_UNTRACKED_BIT (1 << (IP_CT_NUMBER + 1)) +#define NF_CT_STATE_UNTRACKED_BIT (1 << (IP_CT_UNTRACKED + 1)) /* Bitset representing status of connection. */ enum ip_conntrack_status { @@ -82,10 +84,6 @@ enum ip_conntrack_status { IPS_DYING_BIT = 9, IPS_DYING = (1 << IPS_DYING_BIT), - /* Bits that cannot be altered from userland. */ - IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | - IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING), - /* Connection has fixed timeout. */ IPS_FIXED_TIMEOUT_BIT = 10, IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT), @@ -94,13 +92,22 @@ enum ip_conntrack_status { IPS_TEMPLATE_BIT = 11, IPS_TEMPLATE = (1 << IPS_TEMPLATE_BIT), - /* Conntrack is a fake untracked entry */ + /* Conntrack is a fake untracked entry. Obsolete and not used anymore */ IPS_UNTRACKED_BIT = 12, IPS_UNTRACKED = (1 << IPS_UNTRACKED_BIT), /* Conntrack got a helper explicitly attached via CT target. */ IPS_HELPER_BIT = 13, IPS_HELPER = (1 << IPS_HELPER_BIT), + + /* Be careful here, modifying these bits can make things messy, + * so don't let users modify them directly. + */ + IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | + IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | + IPS_SEQ_ADJUST | IPS_TEMPLATE), + + __IPS_MAX_BIT = 14, }; /* Connection tracking event types */ @@ -117,6 +124,9 @@ enum ip_conntrack_events { IPCT_NATSEQADJ = IPCT_SEQADJ, IPCT_SECMARK, /* new security mark has been set */ IPCT_LABEL, /* new connlabel has been set */ +#ifdef __KERNEL__ + __IPCT_MAX +#endif }; enum ip_conntrack_expect_events { diff --git a/include/uapi/linux/netfilter/nf_tables.h b/include/uapi/linux/netfilter/nf_tables.h index 05215d30fe5c9853b7871e799ccdce4878a04ef1..683f6f88fcacefa0898e3898cd75f31422fc0f9a 100644 --- a/include/uapi/linux/netfilter/nf_tables.h +++ b/include/uapi/linux/netfilter/nf_tables.h @@ -815,6 +815,17 @@ enum nft_rt_keys { NFT_RT_NEXTHOP6, }; +/** + * enum nft_hash_types - nf_tables hash expression types + * + * @NFT_HASH_JENKINS: Jenkins Hash + * @NFT_HASH_SYM: Symmetric Hash + */ +enum nft_hash_types { + NFT_HASH_JENKINS, + NFT_HASH_SYM, +}; + /** * enum nft_hash_attributes - nf_tables hash expression netlink attributes * @@ -824,6 +835,7 @@ enum nft_rt_keys { * @NFTA_HASH_MODULUS: modulus value (NLA_U32) * @NFTA_HASH_SEED: seed value (NLA_U32) * @NFTA_HASH_OFFSET: add this offset value to hash result (NLA_U32) + * @NFTA_HASH_TYPE: hash operation (NLA_U32: nft_hash_types) */ enum nft_hash_attributes { NFTA_HASH_UNSPEC, @@ -833,6 +845,7 @@ enum nft_hash_attributes { NFTA_HASH_MODULUS, NFTA_HASH_SEED, NFTA_HASH_OFFSET, + NFTA_HASH_TYPE, __NFTA_HASH_MAX, }; #define NFTA_HASH_MAX (__NFTA_HASH_MAX - 1) @@ -888,6 +901,7 @@ enum nft_rt_attributes { * @NFT_CT_BYTES: conntrack bytes * @NFT_CT_AVGPKT: conntrack average bytes per packet * @NFT_CT_ZONE: conntrack zone + * @NFT_CT_EVENTMASK: ctnetlink events to be generated for this conntrack */ enum nft_ct_keys { 
NFT_CT_STATE, @@ -908,6 +922,7 @@ enum nft_ct_keys { NFT_CT_BYTES, NFT_CT_AVGPKT, NFT_CT_ZONE, + NFT_CT_EVENTMASK, }; /** @@ -1244,12 +1259,23 @@ enum nft_fib_flags { NFTA_FIB_F_MARK = 1 << 2, /* use skb->mark */ NFTA_FIB_F_IIF = 1 << 3, /* restrict to iif */ NFTA_FIB_F_OIF = 1 << 4, /* restrict to oif */ + NFTA_FIB_F_PRESENT = 1 << 5, /* check existence only */ +}; + +enum nft_ct_helper_attributes { + NFTA_CT_HELPER_UNSPEC, + NFTA_CT_HELPER_NAME, + NFTA_CT_HELPER_L3PROTO, + NFTA_CT_HELPER_L4PROTO, + __NFTA_CT_HELPER_MAX, }; +#define NFTA_CT_HELPER_MAX (__NFTA_CT_HELPER_MAX - 1) #define NFT_OBJECT_UNSPEC 0 #define NFT_OBJECT_COUNTER 1 #define NFT_OBJECT_QUOTA 2 -#define __NFT_OBJECT_MAX 3 +#define NFT_OBJECT_CT_HELPER 3 +#define __NFT_OBJECT_MAX 4 #define NFT_OBJECT_MAX (__NFT_OBJECT_MAX - 1) /** diff --git a/include/uapi/linux/netfilter_arp/Kbuild b/include/uapi/linux/netfilter_arp/Kbuild deleted file mode 100644 index 62d5637cc0ac5c40cbca78d70b1fb3184cd31427..0000000000000000000000000000000000000000 --- a/include/uapi/linux/netfilter_arp/Kbuild +++ /dev/null @@ -1,3 +0,0 @@ -# UAPI Header export list -header-y += arp_tables.h -header-y += arpt_mangle.h diff --git a/include/uapi/linux/netfilter_bridge/Kbuild b/include/uapi/linux/netfilter_bridge/Kbuild deleted file mode 100644 index 0fbad8ef96de9ecd00262800e4dda07564bbb13b..0000000000000000000000000000000000000000 --- a/include/uapi/linux/netfilter_bridge/Kbuild +++ /dev/null @@ -1,18 +0,0 @@ -# UAPI Header export list -header-y += ebt_802_3.h -header-y += ebt_among.h -header-y += ebt_arp.h -header-y += ebt_arpreply.h -header-y += ebt_ip.h -header-y += ebt_ip6.h -header-y += ebt_limit.h -header-y += ebt_log.h -header-y += ebt_mark_m.h -header-y += ebt_mark_t.h -header-y += ebt_nat.h -header-y += ebt_nflog.h -header-y += ebt_pkttype.h -header-y += ebt_redirect.h -header-y += ebt_stp.h -header-y += ebt_vlan.h -header-y += ebtables.h diff --git a/include/uapi/linux/netfilter_ipv4/Kbuild b/include/uapi/linux/netfilter_ipv4/Kbuild deleted file mode 100644 index ecb291df390e584a756e057fbfc8baba9f17f63d..0000000000000000000000000000000000000000 --- a/include/uapi/linux/netfilter_ipv4/Kbuild +++ /dev/null @@ -1,10 +0,0 @@ -# UAPI Header export list -header-y += ip_tables.h -header-y += ipt_CLUSTERIP.h -header-y += ipt_ECN.h -header-y += ipt_LOG.h -header-y += ipt_REJECT.h -header-y += ipt_TTL.h -header-y += ipt_ah.h -header-y += ipt_ecn.h -header-y += ipt_ttl.h diff --git a/include/uapi/linux/netfilter_ipv6/Kbuild b/include/uapi/linux/netfilter_ipv6/Kbuild deleted file mode 100644 index 75a668ca2353ad9abfa07bbbd6e515aa49ec37fb..0000000000000000000000000000000000000000 --- a/include/uapi/linux/netfilter_ipv6/Kbuild +++ /dev/null @@ -1,13 +0,0 @@ -# UAPI Header export list -header-y += ip6_tables.h -header-y += ip6t_HL.h -header-y += ip6t_LOG.h -header-y += ip6t_NPT.h -header-y += ip6t_REJECT.h -header-y += ip6t_ah.h -header-y += ip6t_frag.h -header-y += ip6t_hl.h -header-y += ip6t_ipv6header.h -header-y += ip6t_mh.h -header-y += ip6t_opts.h -header-y += ip6t_rt.h diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h index f3946a27bd07d5164fac6964785c86044f031790..f86127a46cfc80697df711c847d6a07b5a722347 100644 --- a/include/uapi/linux/netlink.h +++ b/include/uapi/linux/netlink.h @@ -50,12 +50,12 @@ struct nlmsghdr { /* Flags values */ -#define NLM_F_REQUEST 1 /* It is request message. 
*/ -#define NLM_F_MULTI 2 /* Multipart message, terminated by NLMSG_DONE */ -#define NLM_F_ACK 4 /* Reply with ack, with zero or error code */ -#define NLM_F_ECHO 8 /* Echo this request */ -#define NLM_F_DUMP_INTR 16 /* Dump was inconsistent due to sequence change */ -#define NLM_F_DUMP_FILTERED 32 /* Dump was filtered as requested */ +#define NLM_F_REQUEST 0x01 /* It is request message. */ +#define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */ +#define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */ +#define NLM_F_ECHO 0x08 /* Echo this request */ +#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */ +#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ /* Modifiers to GET request */ #define NLM_F_ROOT 0x100 /* specify tree root */ @@ -69,6 +69,10 @@ struct nlmsghdr { #define NLM_F_CREATE 0x400 /* Create, if it does not exist */ #define NLM_F_APPEND 0x800 /* Add to end of list */ +/* Flags for ACK message */ +#define NLM_F_CAPPED 0x100 /* request was capped */ +#define NLM_F_ACK_TLVS 0x200 /* extended ACK TLVs were included */ + /* 4.4BSD ADD NLM_F_CREATE|NLM_F_EXCL 4.4BSD CHANGE NLM_F_REPLACE @@ -101,6 +105,37 @@ struct nlmsghdr { struct nlmsgerr { int error; struct nlmsghdr msg; + /* + * followed by the message contents unless NETLINK_CAP_ACK was set + * or the ACK indicates success (error == 0) + * message length is aligned with NLMSG_ALIGN() + */ + /* + * followed by TLVs defined in enum nlmsgerr_attrs + * if NETLINK_EXT_ACK was set + */ +}; + +/** + * enum nlmsgerr_attrs - nlmsgerr attributes + * @NLMSGERR_ATTR_UNUSED: unused + * @NLMSGERR_ATTR_MSG: error message string (string) + * @NLMSGERR_ATTR_OFFS: offset of the invalid attribute in the original + * message, counting from the beginning of the header (u32) + * @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to + * be used - in the success case - to identify a created + * object or operation or similar (binary) + * @__NLMSGERR_ATTR_MAX: number of attributes + * @NLMSGERR_ATTR_MAX: highest attribute number + */ +enum nlmsgerr_attrs { + NLMSGERR_ATTR_UNUSED, + NLMSGERR_ATTR_MSG, + NLMSGERR_ATTR_OFFS, + NLMSGERR_ATTR_COOKIE, + + __NLMSGERR_ATTR_MAX, + NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 }; #define NETLINK_ADD_MEMBERSHIP 1 @@ -115,6 +150,7 @@ struct nlmsgerr { #define NETLINK_LISTEN_ALL_NSID 8 #define NETLINK_LIST_MEMBERSHIPS 9 #define NETLINK_CAP_ACK 10 +#define NETLINK_EXT_ACK 11 struct nl_pktinfo { __u32 group; diff --git a/include/uapi/linux/netlink_diag.h b/include/uapi/linux/netlink_diag.h index 76b4d87c83a863dc7b0e809f5ab318b9656d1dfb..6dcd4de3397b393eb64c9da83e898484af65fda9 100644 --- a/include/uapi/linux/netlink_diag.h +++ b/include/uapi/linux/netlink_diag.h @@ -38,6 +38,7 @@ enum { NETLINK_DIAG_GROUPS, NETLINK_DIAG_RX_RING, NETLINK_DIAG_TX_RING, + NETLINK_DIAG_FLAGS, __NETLINK_DIAG_MAX, }; @@ -52,5 +53,14 @@ enum { /* deprecated since 4.6 */ #define NDIAG_SHOW_RING_CFG 0x00000004 /* show ring configuration */ #endif +#define NDIAG_SHOW_FLAGS 0x00000008 /* show flags of a netlink socket */ + +/* flags */ +#define NDIAG_FLAG_CB_RUNNING 0x00000001 +#define NDIAG_FLAG_PKTINFO 0x00000002 +#define NDIAG_FLAG_BROADCAST_ERROR 0x00000004 +#define NDIAG_FLAG_NO_ENOBUFS 0x00000008 +#define NDIAG_FLAG_LISTEN_ALL_NSID 0x00000010 +#define NDIAG_FLAG_CAP_ACK 0x00000020 #endif diff --git a/include/uapi/linux/nfsd/Kbuild b/include/uapi/linux/nfsd/Kbuild deleted file mode 100644 index
c11bc404053c121fc82604514fd24e6506e1717f..0000000000000000000000000000000000000000 --- a/include/uapi/linux/nfsd/Kbuild +++ /dev/null @@ -1,6 +0,0 @@ -# UAPI Header export list -header-y += cld.h -header-y += debug.h -header-y += export.h -header-y += nfsfh.h -header-y += stats.h diff --git a/include/uapi/linux/nfsd/cld.h b/include/uapi/linux/nfsd/cld.h index f14a9ab06f1f705677a797883bf26a431f1b9c65..ec260274be0ced9fd057cb2d3f78c9c7b6b622e9 100644 --- a/include/uapi/linux/nfsd/cld.h +++ b/include/uapi/linux/nfsd/cld.h @@ -22,6 +22,8 @@ #ifndef _NFSD_CLD_H #define _NFSD_CLD_H +#include <linux/types.h> + /* latest upcall version available */ #define CLD_UPCALL_VERSION 1 @@ -37,18 +39,18 @@ enum cld_command { /* representation of long-form NFSv4 client ID */ struct cld_name { - uint16_t cn_len; /* length of cm_id */ + __u16 cn_len; /* length of cm_id */ unsigned char cn_id[NFS4_OPAQUE_LIMIT]; /* client-provided */ } __attribute__((packed)); /* message struct for communication with userspace */ struct cld_msg { - uint8_t cm_vers; /* upcall version */ - uint8_t cm_cmd; /* upcall command */ - int16_t cm_status; /* return code */ - uint32_t cm_xid; /* transaction id */ + __u8 cm_vers; /* upcall version */ + __u8 cm_cmd; /* upcall command */ + __s16 cm_status; /* return code */ + __u32 cm_xid; /* transaction id */ union { - int64_t cm_gracetime; /* grace period start time */ + __s64 cm_gracetime; /* grace period start time */ struct cld_name cm_name; } __attribute__((packed)) cm_u; } __attribute__((packed)); diff --git a/include/uapi/linux/nl80211.h b/include/uapi/linux/nl80211.h index 5ed257c4cd4eaad8fcbd8dbde4ca88f385629271..b8c44b98f12d4c42d4f88a73049da962be2678b2 100644 --- a/include/uapi/linux/nl80211.h +++ b/include/uapi/linux/nl80211.h @@ -172,6 +172,42 @@ * Multiple such rules can be created. */ +/** + * DOC: FILS shared key authentication offload + * + * FILS shared key authentication offload can be advertized by drivers by + * setting @NL80211_EXT_FEATURE_FILS_SK_OFFLOAD flag. The drivers that support + * FILS shared key authentication offload should be able to construct the + * authentication and association frames for FILS shared key authentication and + * eventually do a key derivation as per IEEE 802.11ai. The below additional + * parameters should be given to driver in %NL80211_CMD_CONNECT. + * %NL80211_ATTR_FILS_ERP_USERNAME - used to construct keyname_nai + * %NL80211_ATTR_FILS_ERP_REALM - used to construct keyname_nai + * %NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used to construct erp message + * %NL80211_ATTR_FILS_ERP_RRK - used to generate the rIK and rMSK + * rIK should be used to generate an authentication tag on the ERP message and + * rMSK should be used to derive a PMKSA. + * rIK, rMSK should be generated and keyname_nai, sequence number should be used + * as specified in IETF RFC 6696. + * + * When FILS shared key authentication is completed, driver needs to provide the + * below additional parameters to userspace. + * %NL80211_ATTR_FILS_KEK - used for key renewal + * %NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM - used in further EAP-RP exchanges + * %NL80211_ATTR_PMKID - used to identify the PMKSA used/generated + * %NL80211_ATTR_PMK - used to update PMKSA cache in userspace + * The PMKSA can be maintained in userspace persistently so that it can be used + * later after reboots or wifi turn off/on also. + * + * %NL80211_ATTR_FILS_CACHE_ID is the cache identifier advertized by a FILS + * capable AP supporting PMK caching. It specifies the scope within which the + * PMKSAs are cached in an ESS.
%NL80211_CMD_SET_PMKSA and + * %NL80211_CMD_DEL_PMKSA are enhanced to allow support for PMKSA caching based + * on FILS cache identifier. Additionally %NL80211_ATTR_PMK is used with + * %NL80211_SET_PMKSA to specify the PMK corresponding to a PMKSA for driver to + * use in a FILS shared key connection with PMKSA caching. + */ + /** * enum nl80211_commands - supported nl80211 commands * @@ -351,7 +387,9 @@ * are used. Extra IEs can also be passed from the userspace by * using the %NL80211_ATTR_IE attribute. The first cycle of the * scheduled scan can be delayed by %NL80211_ATTR_SCHED_SCAN_DELAY - * is supplied. + * is supplied. If the device supports multiple concurrent scheduled + * scans, it will allow such when the caller provides the flag attribute + * %NL80211_ATTR_SCHED_SCAN_MULTI to indicate user-space support for it. * @NL80211_CMD_STOP_SCHED_SCAN: stop a scheduled scan. Returns -ENOENT if * scheduled scan is not running. The caller may assume that as soon * as the call returns, it is safe to start a new scheduled scan again. @@ -370,10 +408,18 @@ * @NL80211_CMD_NEW_SURVEY_RESULTS: survey data notification (as a reply to * NL80211_CMD_GET_SURVEY and on the "scan" multicast group) * - * @NL80211_CMD_SET_PMKSA: Add a PMKSA cache entry, using %NL80211_ATTR_MAC - * (for the BSSID) and %NL80211_ATTR_PMKID. + * @NL80211_CMD_SET_PMKSA: Add a PMKSA cache entry using %NL80211_ATTR_MAC + * (for the BSSID), %NL80211_ATTR_PMKID, and optionally %NL80211_ATTR_PMK + * (PMK is used for PTKSA derivation in case of FILS shared key offload) or + * using %NL80211_ATTR_SSID, %NL80211_ATTR_FILS_CACHE_ID, + * %NL80211_ATTR_PMKID, and %NL80211_ATTR_PMK in case of FILS + * authentication where %NL80211_ATTR_FILS_CACHE_ID is the identifier + * advertized by a FILS capable AP identifying the scope of PMKSA in an + * ESS. * @NL80211_CMD_DEL_PMKSA: Delete a PMKSA cache entry, using %NL80211_ATTR_MAC - * (for the BSSID) and %NL80211_ATTR_PMKID. + * (for the BSSID) and %NL80211_ATTR_PMKID or using %NL80211_ATTR_SSID, + * %NL80211_ATTR_FILS_CACHE_ID, and %NL80211_ATTR_PMKID in case of FILS + * authentication. * @NL80211_CMD_FLUSH_PMKSA: Flush all PMKSA cache entries. * * @NL80211_CMD_REG_CHANGE: indicates to userspace the regulatory domain @@ -2012,6 +2058,36 @@ enum nl80211_commands { * u32 attribute with an &enum nl80211_timeout_reason value. This is used, * e.g., with %NL80211_CMD_CONNECT event. * + * @NL80211_ATTR_FILS_ERP_USERNAME: EAP Re-authentication Protocol (ERP) + * username part of NAI used to refer keys rRK and rIK. This is used with + * %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_ERP_REALM: EAP Re-authentication Protocol (ERP) realm part + * of NAI specifying the domain name of the ER server. This is used with + * %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM: Unsigned 16-bit ERP next sequence number + * to use in ERP messages. This is used in generating the FILS wrapped data + * for FILS authentication and is used with %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_ERP_RRK: ERP re-authentication Root Key (rRK) for the + * NAI specified by %NL80211_ATTR_FILS_ERP_USERNAME and + * %NL80211_ATTR_FILS_ERP_REALM. This is used for generating rIK and rMSK + * from successful FILS authentication and is used with + * %NL80211_CMD_CONNECT. + * + * @NL80211_ATTR_FILS_CACHE_ID: A 2-octet identifier advertized by a FILS AP + * identifying the scope of PMKSAs. This is used with + * @NL80211_CMD_SET_PMKSA and @NL80211_CMD_DEL_PMKSA. 
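+ *
+ * (Editor's note, not part of the original kernel-doc: as a concrete
+ * sketch, a FILS-style cache entry would be installed with
+ * %NL80211_CMD_SET_PMKSA carrying %NL80211_ATTR_SSID,
+ * %NL80211_ATTR_FILS_CACHE_ID (2 octets), %NL80211_ATTR_PMKID (16
+ * octets) and %NL80211_ATTR_PMK, in place of the usual BSSID-keyed
+ * form.)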
+ * + * @NL80211_ATTR_PMK: PMK for the PMKSA identified by %NL80211_ATTR_PMKID. + * This is used with @NL80211_CMD_SET_PMKSA. + * + * @NL80211_ATTR_SCHED_SCAN_MULTI: flag attribute which user-space shall use to + * indicate that it supports multiple active scheduled scan requests. + * @NL80211_ATTR_SCHED_SCAN_MAX_REQS: indicates maximum number of scheduled + * scan request that may be active for the device (u32). + * * @NUM_NL80211_ATTR: total number of nl80211_attrs available * @NL80211_ATTR_MAX: highest attribute number currently defined * @__NL80211_ATTR_AFTER_LAST: internal use @@ -2423,6 +2499,17 @@ enum nl80211_attrs { NL80211_ATTR_TIMEOUT_REASON, + NL80211_ATTR_FILS_ERP_USERNAME, + NL80211_ATTR_FILS_ERP_REALM, + NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM, + NL80211_ATTR_FILS_ERP_RRK, + NL80211_ATTR_FILS_CACHE_ID, + + NL80211_ATTR_PMK, + + NL80211_ATTR_SCHED_SCAN_MULTI, + NL80211_ATTR_SCHED_SCAN_MAX_REQS, + /* add attributes here, update the policy in nl80211.c */ __NL80211_ATTR_AFTER_LAST, @@ -3107,6 +3194,7 @@ enum nl80211_reg_rule_attr { * @__NL80211_SCHED_SCAN_MATCH_ATTR_INVALID: attribute number 0 is reserved * @NL80211_SCHED_SCAN_MATCH_ATTR_SSID: SSID to be used for matching, * only report BSS with matching SSID. + * (This cannot be used together with BSSID.) * @NL80211_SCHED_SCAN_MATCH_ATTR_RSSI: RSSI threshold (in dBm) for reporting a * BSS in scan results. Filtering is turned off if not specified. Note that * if this attribute is in a match set of its own, then it is treated as @@ -3122,6 +3210,8 @@ enum nl80211_reg_rule_attr { * BSS-es in the specified band is to be adjusted before doing * RSSI-based BSS selection. The attribute value is a packed structure * value as specified by &struct nl80211_bss_select_rssi_adjust. + * @NL80211_SCHED_SCAN_MATCH_ATTR_BSSID: BSSID to be used for matching + * (this cannot be used together with SSID). * @NL80211_SCHED_SCAN_MATCH_ATTR_MAX: highest scheduled scan filter * attribute number currently defined * @__NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST: internal use @@ -3133,6 +3223,7 @@ enum nl80211_sched_scan_match_attr { NL80211_SCHED_SCAN_MATCH_ATTR_RSSI, NL80211_SCHED_SCAN_MATCH_ATTR_RELATIVE_RSSI, NL80211_SCHED_SCAN_MATCH_ATTR_RSSI_ADJUST, + NL80211_SCHED_SCAN_MATCH_ATTR_BSSID, /* keep last */ __NL80211_SCHED_SCAN_MATCH_ATTR_AFTER_LAST, @@ -3942,7 +4033,10 @@ enum nl80211_ps_state { * @__NL80211_ATTR_CQM_INVALID: invalid * @NL80211_ATTR_CQM_RSSI_THOLD: RSSI threshold in dBm. This value specifies * the threshold for the RSSI level at which an event will be sent. Zero - * to disable. + * to disable. Alternatively, if %NL80211_EXT_FEATURE_CQM_RSSI_LIST is + * set, multiple values can be supplied as a low-to-high sorted array of + * threshold values in dBm. Events will be sent when the RSSI value + * crosses any of the thresholds. * @NL80211_ATTR_CQM_RSSI_HYST: RSSI hysteresis in dBm. This value specifies * the minimum amount the RSSI level must change after an event before a * new event may be issued (to reduce effects of RSSI oscillation). @@ -4753,6 +4847,11 @@ enum nl80211_feature_flags { * @NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI: The driver supports sched_scan * for reporting BSSs with better RSSI than the current connected BSS * (%NL80211_ATTR_SCHED_SCAN_RELATIVE_RSSI). + * @NL80211_EXT_FEATURE_CQM_RSSI_LIST: With this driver the + * %NL80211_ATTR_CQM_RSSI_THOLD attribute accepts a list of zero or more + * RSSI threshold values to monitor rather than exactly one threshold. 
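+ *
+ * (Editor's note, not part of the original kernel-doc: for example, a
+ * low-to-high sorted list such as { -80, -70, -60 }, passed as three
+ * consecutive s32 values in %NL80211_ATTR_CQM_RSSI_THOLD, would request
+ * an event whenever the measured RSSI crosses any of those three
+ * levels.)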
+ * @NL80211_EXT_FEATURE_FILS_SK_OFFLOAD: Driver SME supports FILS shared key + * authentication with %NL80211_CMD_CONNECT. * * @NUM_NL80211_EXT_FEATURES: number of extended features. * @MAX_NL80211_EXT_FEATURES: highest extended feature index. @@ -4771,6 +4870,8 @@ enum nl80211_ext_feature_index { NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA, NL80211_EXT_FEATURE_MGMT_TX_RANDOM_TA_CONNECTED, NL80211_EXT_FEATURE_SCHED_SCAN_RELATIVE_RSSI, + NL80211_EXT_FEATURE_CQM_RSSI_LIST, + NL80211_EXT_FEATURE_FILS_SK_OFFLOAD, /* add new features before the definition below */ NUM_NL80211_EXT_FEATURES, @@ -4906,12 +5007,17 @@ enum nl80211_smps_mode { * change to the channel status. * @NL80211_RADAR_NOP_FINISHED: The Non-Occupancy Period for this channel is * over, channel becomes usable. + * @NL80211_RADAR_PRE_CAC_EXPIRED: Channel Availability Check done on this + * non-operating channel has expired and is no longer valid. New CAC must + * be done on this channel before starting the operation. This is not + * applicable for ETSI dfs domain where pre-CAC is valid forever. */ enum nl80211_radar_event { NL80211_RADAR_DETECTED, NL80211_RADAR_CAC_FINISHED, NL80211_RADAR_CAC_ABORTED, NL80211_RADAR_NOP_FINISHED, + NL80211_RADAR_PRE_CAC_EXPIRED, }; /** diff --git a/include/uapi/linux/nubus.h b/include/uapi/linux/nubus.h index 77513d2b5638ec9ff6d21272b203927041575e32..ac516064f0ee7882227520a201067e91c648c8e4 100644 --- a/include/uapi/linux/nubus.h +++ b/include/uapi/linux/nubus.h @@ -113,13 +113,15 @@ enum nubus_drhw { NUBUS_DRHW_SIGMA_CLRMAX = 0x0007, /* Sigma Design ColorMax */ NUBUS_DRHW_APPLE_SE30 = 0x0009, /* Apple SE/30 video */ NUBUS_DRHW_APPLE_HRVC = 0x0013, /* Mac II High-Res Video Card */ + NUBUS_DRHW_APPLE_MVC = 0x0014, /* Mac II Monochrome Video Card */ NUBUS_DRHW_APPLE_PVC = 0x0017, /* Mac II Portrait Video Card */ NUBUS_DRHW_APPLE_RBV1 = 0x0018, /* IIci RBV video */ NUBUS_DRHW_APPLE_MDC = 0x0019, /* Macintosh Display Card */ + NUBUS_DRHW_APPLE_VSC = 0x0020, /* Duo MiniDock ViSC framebuffer */ NUBUS_DRHW_APPLE_SONORA = 0x0022, /* Sonora built-in video */ + NUBUS_DRHW_APPLE_JET = 0x0029, /* Jet framebuffer (DuoDock) */ NUBUS_DRHW_APPLE_24AC = 0x002b, /* Mac 24AC Video Card */ NUBUS_DRHW_APPLE_VALKYRIE = 0x002e, - NUBUS_DRHW_APPLE_JET = 0x0029, /* Jet framebuffer (DuoDock) */ NUBUS_DRHW_SMAC_GFX = 0x0105, /* SuperMac GFX */ NUBUS_DRHW_RASTER_CB264 = 0x013B, /* RasterOps ColorBoard 264 */ NUBUS_DRHW_MICRON_XCEED = 0x0146, /* Micron Exceed color */ diff --git a/include/uapi/linux/openvswitch.h b/include/uapi/linux/openvswitch.h index 7f41f7d0000f9f0ee36c274d88ad0d330fa8f5d6..61b7d36dfe34394f7cbed64217a8a9a1e7f55cfe 100644 --- a/include/uapi/linux/openvswitch.h +++ b/include/uapi/linux/openvswitch.h @@ -578,10 +578,25 @@ enum ovs_sample_attr { OVS_SAMPLE_ATTR_PROBABILITY, /* u32 number */ OVS_SAMPLE_ATTR_ACTIONS, /* Nested OVS_ACTION_ATTR_* attributes. */ __OVS_SAMPLE_ATTR_MAX, + +#ifdef __KERNEL__ + OVS_SAMPLE_ATTR_ARG /* struct sample_arg */ +#endif }; #define OVS_SAMPLE_ATTR_MAX (__OVS_SAMPLE_ATTR_MAX - 1) +#ifdef __KERNEL__ +struct sample_arg { + bool exec; /* When true, actions in sample will not + * change flow keys. False otherwise. + */ + u32 probability; /* Same value as + * 'OVS_SAMPLE_ATTR_PROBABILITY'. + */ +}; +#endif + /** * enum ovs_userspace_attr - Attributes for %OVS_ACTION_ATTR_USERSPACE action.
* @OVS_USERSPACE_ATTR_PID: u32 Netlink PID to which the %OVS_PACKET_CMD_ACTION @@ -678,6 +693,17 @@ struct ovs_action_hash { * nothing if the connection is already committed will check that the current * packet is in conntrack entry's original direction. If directionality does * not match, will delete the existing conntrack entry and commit a new one. + * @OVS_CT_ATTR_EVENTMASK: Mask of bits indicating which conntrack event types + * (enum ip_conntrack_events IPCT_*) should be reported. For any bit set to + * zero, the corresponding event type is not generated. Default behavior + * depends on system configuration, but typically all event types are + * generated, hence listening on NFNLGRP_CONNTRACK_UPDATE events may get a lot + * of events. Explicitly passing this attribute allows limiting the updates + * received to the events of interest. The bit 1 << IPCT_NEW, 1 << + * IPCT_RELATED, and 1 << IPCT_DESTROY must be set to ones for those events to + * be received on NFNLGRP_CONNTRACK_NEW and NFNLGRP_CONNTRACK_DESTROY groups, + * respectively. Remaining bits control the changes for which an event is + * delivered on the NFNLGRP_CONNTRACK_UPDATE group. */ enum ovs_ct_attr { OVS_CT_ATTR_UNSPEC, @@ -689,6 +715,7 @@ enum ovs_ct_attr { related connections. */ OVS_CT_ATTR_NAT, /* Nested OVS_NAT_ATTR_* */ OVS_CT_ATTR_FORCE_COMMIT, /* No argument */ + OVS_CT_ATTR_EVENTMASK, /* u32 mask of IPCT_* events. */ __OVS_CT_ATTR_MAX }; diff --git a/include/uapi/linux/pci_regs.h b/include/uapi/linux/pci_regs.h index 634c9c44ed6cb2173174efab81b0826f28c2dcd3..d56bb005100949516c0ab59e964bf1c135899bb2 100644 --- a/include/uapi/linux/pci_regs.h +++ b/include/uapi/linux/pci_regs.h @@ -114,7 +114,7 @@ #define PCI_SUBSYSTEM_ID 0x2e #define PCI_ROM_ADDRESS 0x30 /* Bits 31..11 are address, 10..1 reserved */ #define PCI_ROM_ADDRESS_ENABLE 0x01 -#define PCI_ROM_ADDRESS_MASK (~0x7ffUL) +#define PCI_ROM_ADDRESS_MASK (~0x7ffU) #define PCI_CAPABILITY_LIST 0x34 /* Offset of first capability list entry */ @@ -630,6 +630,7 @@ #define PCI_EXP_DEVCTL2_COMP_TIMEOUT 0x000f /* Completion Timeout Value */ #define PCI_EXP_DEVCTL2_ARI 0x0020 /* Alternative Routing-ID */ #define PCI_EXP_DEVCTL2_ATOMIC_REQ 0x0040 /* Set Atomic requests */ +#define PCI_EXP_DEVCTL2_ATOMIC_EGRESS_BLOCK 0x0080 /* Block atomic egress */ #define PCI_EXP_DEVCTL2_IDO_REQ_EN 0x0100 /* Allow IDO for requests */ #define PCI_EXP_DEVCTL2_IDO_CMP_EN 0x0200 /* Allow IDO for completions */ #define PCI_EXP_DEVCTL2_LTR_EN 0x0400 /* Enable LTR mechanism */ diff --git a/include/uapi/linux/pcitest.h b/include/uapi/linux/pcitest.h new file mode 100644 index 0000000000000000000000000000000000000000..a6aa10c45ad169684923f7b9efcc53345a45b98d --- /dev/null +++ b/include/uapi/linux/pcitest.h @@ -0,0 +1,19 @@ +/** + * pcitest.h - PCI test uapi defines + * + * Copyright (C) 2017 Texas Instruments + * Author: Kishon Vijay Abraham I + * + */ + +#ifndef __UAPI_LINUX_PCITEST_H +#define __UAPI_LINUX_PCITEST_H + +#define PCITEST_BAR _IO('P', 0x1) +#define PCITEST_LEGACY_IRQ _IO('P', 0x2) +#define PCITEST_MSI _IOW('P', 0x3, int) +#define PCITEST_WRITE _IOW('P', 0x4, unsigned long) +#define PCITEST_READ _IOW('P', 0x5, unsigned long) +#define PCITEST_COPY _IOW('P', 0x6, unsigned long) + +#endif /* __UAPI_LINUX_PCITEST_H */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index c66a485a24ac81e324ea53ea17b405a3c73b520a..b1c0b187acfe57da3ece7e91325536edcc9cff0c 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -344,7 +344,8 @@ 
struct perf_event_attr { use_clockid : 1, /* use @clockid for time fields */ context_switch : 1, /* context switch data */ write_backward : 1, /* Write ring buffer from end to beginning */ - __reserved_1 : 36; + namespaces : 1, /* include namespaces data */ + __reserved_1 : 35; union { __u32 wakeup_events; /* wakeup every n events */ @@ -610,6 +611,23 @@ struct perf_event_header { __u16 size; }; +struct perf_ns_link_info { + __u64 dev; + __u64 ino; +}; + +enum { + NET_NS_INDEX = 0, + UTS_NS_INDEX = 1, + IPC_NS_INDEX = 2, + PID_NS_INDEX = 3, + USER_NS_INDEX = 4, + MNT_NS_INDEX = 5, + CGROUP_NS_INDEX = 6, + + NR_NAMESPACES, /* number of available namespaces */ +}; + enum perf_event_type { /* @@ -862,6 +880,18 @@ enum perf_event_type { */ PERF_RECORD_SWITCH_CPU_WIDE = 15, + /* + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * u64 nr_namespaces; + * { u64 dev, inode; } [nr_namespaces]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_NAMESPACES = 16, + PERF_RECORD_MAX, /* non-ABI */ }; @@ -885,12 +915,14 @@ enum perf_callchain_context { */ #define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ #define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ +#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ #define PERF_FLAG_FD_NO_GROUP (1UL << 0) #define PERF_FLAG_FD_OUTPUT (1UL << 1) #define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ #define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ +#if defined(__LITTLE_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { @@ -902,6 +934,21 @@ union perf_mem_data_src { mem_rsvd:31; }; }; +#elif defined(__BIG_ENDIAN_BITFIELD) +union perf_mem_data_src { + __u64 val; + struct { + __u64 mem_rsvd:31, + mem_dtlb:7, /* tlb access */ + mem_lock:2, /* lock instr */ + mem_snoop:5, /* snoop mode */ + mem_lvl:14, /* memory hierarchy level */ + mem_op:5; /* type of opcode */ + }; +}; +#else +#error "Unknown endianness" +#endif /* type of opcode (load/store/prefetch,code) */ #define PERF_MEM_OP_NA 0x01 /* not available */ diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index 7a69f2a4ca0c06a68487ff382c6b84f8acab323b..d613be3b3239e476d728ffa323347ef42e97eaec 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -37,7 +37,20 @@ enum { #define TC_ACT_QUEUED 5 #define TC_ACT_REPEAT 6 #define TC_ACT_REDIRECT 7 -#define TC_ACT_JUMP 0x10000000 + +/* There is a special kind of actions called "extended actions", + * which need a value parameter. These have a local opcode located in + * the highest nibble, starting from 1. The rest of the bits + * are used to carry the value. These two parts together make + * a combined opcode. 
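+ *
+ * (Editor's note, not part of the original comment: with the macros
+ * below, TC_ACT_JUMP is __TC_ACT_EXT(1) == 0x10000000. An action
+ * returning (TC_ACT_JUMP | 3) asks to jump over the next 3 actions;
+ * TC_ACT_EXT_CMP(TC_ACT_JUMP | 3, TC_ACT_JUMP) is then true, and
+ * ((TC_ACT_JUMP | 3) & TC_ACT_EXT_VAL_MASK) recovers the value 3.)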
+ */ +#define __TC_ACT_EXT_SHIFT 28 +#define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT) +#define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1) +#define TC_ACT_EXT_CMP(combined, opcode) \ + (((combined) & (~TC_ACT_EXT_VAL_MASK)) == opcode) + +#define TC_ACT_JUMP __TC_ACT_EXT(1) /* Action type identifiers*/ enum { @@ -432,6 +445,11 @@ enum { TCA_FLOWER_KEY_ARP_THA, /* ETH_ALEN */ TCA_FLOWER_KEY_ARP_THA_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_MPLS_TTL, /* u8 - 8 bits */ + TCA_FLOWER_KEY_MPLS_BOS, /* u8 - 1 bit */ + TCA_FLOWER_KEY_MPLS_TC, /* u8 - 3 bits */ + TCA_FLOWER_KEY_MPLS_LABEL, /* be32 - 20 bits */ + __TCA_FLOWER_MAX, }; diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index df7451d351311cd915fc96752fcaeb0a43b5d112..099bf5528fed30008bfbde3529315be35c0411f9 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -617,6 +617,14 @@ struct tc_drr_stats { #define TC_QOPT_BITMASK 15 #define TC_QOPT_MAX_QUEUE 16 +enum { + TC_MQPRIO_HW_OFFLOAD_NONE, /* no offload requested */ + TC_MQPRIO_HW_OFFLOAD_TCS, /* offload TCs, no queue counts */ + __TC_MQPRIO_HW_OFFLOAD_MAX +}; + +#define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1) + struct tc_mqprio_qopt { __u8 num_tc; __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; diff --git a/include/uapi/linux/pps.h b/include/uapi/linux/pps.h index a9bb1d93451aeb6c5b73ba4c1bd46db053478564..c1cb3825a8bc805e804f88c4e759e77716c6efef 100644 --- a/include/uapi/linux/pps.h +++ b/include/uapi/linux/pps.h @@ -55,6 +55,12 @@ struct pps_ktime { __s32 nsec; __u32 flags; }; + +struct pps_ktime_compat { + __s64 sec; + __s32 nsec; + __u32 flags; +} __attribute__((packed, aligned(4))); #define PPS_TIME_INVALID (1<<0) /* used to specify timeout==NULL */ struct pps_kinfo { @@ -65,6 +71,14 @@ struct pps_kinfo { int current_mode; /* current mode bits */ }; +struct pps_kinfo_compat { + __u32 assert_sequence; /* seq. num. of assert event */ + __u32 clear_sequence; /* seq. num. 
of clear event */ + struct pps_ktime_compat assert_tu; /* time of assert event */ + struct pps_ktime_compat clear_tu; /* time of clear event */ + int current_mode; /* current mode bits */ +}; + struct pps_kparams { int api_version; /* API version # */ int mode; /* mode bits */ @@ -114,6 +128,11 @@ struct pps_fdata { struct pps_ktime timeout; }; +struct pps_fdata_compat { + struct pps_kinfo_compat info; + struct pps_ktime_compat timeout; +}; + struct pps_bind_args { int tsformat; /* format of time stamps */ int edge; /* selected event type */ diff --git a/include/uapi/linux/pr.h b/include/uapi/linux/pr.h index 57d7c0f916b6f8307d318bb2f949da7723ae0cf8..645ef3cf3dd08a7d0494cfb509a30c42c3b053c2 100644 --- a/include/uapi/linux/pr.h +++ b/include/uapi/linux/pr.h @@ -1,6 +1,8 @@ #ifndef _UAPI_PR_H #define _UAPI_PR_H +#include <linux/types.h> + enum pr_type { PR_WRITE_EXCLUSIVE = 1, PR_EXCLUSIVE_ACCESS = 2, diff --git a/include/uapi/linux/qrtr.h b/include/uapi/linux/qrtr.h index 66c0748d26e2b3e9412424bad66011b23ec0f876..9d76c566f66e83b67d4abdc295731031e2362377 100644 --- a/include/uapi/linux/qrtr.h +++ b/include/uapi/linux/qrtr.h @@ -2,6 +2,7 @@ #define _LINUX_QRTR_H #include <linux/socket.h> +#include <linux/types.h> struct sockaddr_qrtr { __kernel_sa_family_t sq_family; diff --git a/include/uapi/linux/raid/Kbuild b/include/uapi/linux/raid/Kbuild deleted file mode 100644 index e2c3d25405d7e20de2dc708088b0399350bd861f..0000000000000000000000000000000000000000 --- a/include/uapi/linux/raid/Kbuild +++ /dev/null @@ -1,3 +0,0 @@ -# UAPI Header export list -header-y += md_p.h -header-y += md_u.h diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 9930f3e9040f76ebc09bfb9446f4399f9e96b98c..d500bd224979d923f15732bd19a5bcde11c8f91f 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -242,10 +242,18 @@ struct mdp_superblock_1 { __le32 chunksize; /* in 512byte sectors */ __le32 raid_disks; - __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts - * NOTE: signed, so bitmap can be before superblock - * only meaningful of feature_map[0] is set. - */ + union { + __le32 bitmap_offset; /* sectors after start of superblock that bitmap starts + * NOTE: signed, so bitmap can be before superblock + * only meaningful of feature_map[0] is set.
+ */ + + /* only meaningful when feature_map[MD_FEATURE_PPL] is set */ + struct { + __le16 offset; /* sectors from start of superblock that ppl starts (signed) */ + __le16 size; /* ppl size in sectors */ + } ppl; + }; /* These are only valid with feature bit '4' */ __le32 new_level; /* new level we are reshaping to */ @@ -318,6 +326,7 @@ struct mdp_superblock_1 { */ #define MD_FEATURE_CLUSTERED 256 /* clustered MD */ #define MD_FEATURE_JOURNAL 512 /* support write cache */ +#define MD_FEATURE_PPL 1024 /* support PPL */ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ |MD_FEATURE_RECOVERY_OFFSET \ |MD_FEATURE_RESHAPE_ACTIVE \ @@ -328,6 +337,7 @@ struct mdp_superblock_1 { |MD_FEATURE_RECOVERY_BITMAP \ |MD_FEATURE_CLUSTERED \ |MD_FEATURE_JOURNAL \ + |MD_FEATURE_PPL \ ) struct r5l_payload_header { @@ -388,4 +398,31 @@ struct r5l_meta_block { #define R5LOG_VERSION 0x1 #define R5LOG_MAGIC 0x6433c509 + +struct ppl_header_entry { + __le64 data_sector; /* raid sector of the new data */ + __le32 pp_size; /* length of partial parity */ + __le32 data_size; /* length of data */ + __le32 parity_disk; /* member disk containing parity */ + __le32 checksum; /* checksum of partial parity data for this + * entry (~crc32c) */ +} __attribute__ ((__packed__)); + +#define PPL_HEADER_SIZE 4096 +#define PPL_HDR_RESERVED 512 +#define PPL_HDR_ENTRY_SPACE \ + (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(__le32) - sizeof(__le64)) +#define PPL_HDR_MAX_ENTRIES \ + (PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry)) + +struct ppl_header { + __u8 reserved[PPL_HDR_RESERVED];/* reserved space, fill with 0xff */ + __le32 signature; /* signature (family number of volume) */ + __le32 padding; /* zero pad */ + __le64 generation; /* generation number of the header */ + __le32 entries_count; /* number of entries in entry array */ + __le32 checksum; /* checksum of the header (~crc32c) */ + struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES]; +} __attribute__ ((__packed__)); + #endif diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h index 6546917d605a916bfd5a905e30eb05d68fd6ad6b..cce061382e4073d4fe8296a00f22eba42cf13818 100644 --- a/include/uapi/linux/rtnetlink.h +++ b/include/uapi/linux/rtnetlink.h @@ -122,6 +122,8 @@ enum { RTM_NEWNETCONF = 80, #define RTM_NEWNETCONF RTM_NEWNETCONF + RTM_DELNETCONF, +#define RTM_DELNETCONF RTM_DELNETCONF RTM_GETNETCONF = 82, #define RTM_GETNETCONF RTM_GETNETCONF @@ -319,6 +321,7 @@ enum rtattr_type_t { RTA_EXPIRES, RTA_PAD, RTA_UID, + RTA_TTL_PROPAGATE, __RTA_MAX }; @@ -545,6 +548,7 @@ enum { TCA_STATS2, TCA_STAB, TCA_PAD, + TCA_DUMP_INVISIBLE, __TCA_MAX }; diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h index d3ae381fcf3327489c82e2f47eb39a363ec030c7..ced9d8b974268ed270661c3e2da77165e3a24784 100644 --- a/include/uapi/linux/sctp.h +++ b/include/uapi/linux/sctp.h @@ -115,6 +115,8 @@ typedef __s32 sctp_assoc_t; #define SCTP_PR_SUPPORTED 113 #define SCTP_DEFAULT_PRINFO 114 #define SCTP_PR_ASSOC_STATUS 115 +#define SCTP_PR_STREAM_STATUS 116 +#define SCTP_RECONFIG_SUPPORTED 117 #define SCTP_ENABLE_STREAM_RESET 118 #define SCTP_RESET_STREAMS 119 #define SCTP_RESET_ASSOC 120 @@ -502,6 +504,28 @@ struct sctp_stream_reset_event { __u16 strreset_stream_list[]; }; +#define SCTP_ASSOC_RESET_DENIED 0x0004 +#define SCTP_ASSOC_RESET_FAILED 0x0008 +struct sctp_assoc_reset_event { + __u16 assocreset_type; + __u16 assocreset_flags; + __u32 assocreset_length; + sctp_assoc_t assocreset_assoc_id; + __u32 assocreset_local_tsn; + __u32 assocreset_remote_tsn; +}; + 
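Editor's note: this event, together with the sctp_stream_change_event defined just below, is opt-in through the new sctp_event_subscribe flags added further down in this hunk. A hedged sketch of the subscription (illustrative only, assuming an already-created SCTP socket fd):

#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/sctp.h>

/* Request delivery of the two new SCTP notifications on this socket. */
static int subscribe_reset_events(int fd)
{
        struct sctp_event_subscribe ev;

        memset(&ev, 0, sizeof(ev));
        ev.sctp_assoc_reset_event = 1;
        ev.sctp_stream_change_event = 1;

        return setsockopt(fd, IPPROTO_SCTP, SCTP_EVENTS, &ev, sizeof(ev));
}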
+#define SCTP_ASSOC_CHANGE_DENIED 0x0004 +#define SCTP_ASSOC_CHANGE_FAILED 0x0008 +struct sctp_stream_change_event { + __u16 strchange_type; + __u16 strchange_flags; + __u32 strchange_length; + sctp_assoc_t strchange_assoc_id; + __u16 strchange_instrms; + __u16 strchange_outstrms; +}; + /* * Described in Section 7.3 * Ancillary Data and Notification Interest Options @@ -518,6 +542,8 @@ struct sctp_event_subscribe { __u8 sctp_authentication_event; __u8 sctp_sender_dry_event; __u8 sctp_stream_reset_event; + __u8 sctp_assoc_reset_event; + __u8 sctp_stream_change_event; }; /* @@ -543,6 +569,8 @@ union sctp_notification { struct sctp_authkey_event sn_authkey_event; struct sctp_sender_dry_event sn_sender_dry_event; struct sctp_stream_reset_event sn_strreset_event; + struct sctp_assoc_reset_event sn_assocreset_event; + struct sctp_stream_change_event sn_strchange_event; }; /* Section 5.3.1 @@ -572,6 +600,10 @@ enum sctp_sn_type { #define SCTP_SENDER_DRY_EVENT SCTP_SENDER_DRY_EVENT SCTP_STREAM_RESET_EVENT, #define SCTP_STREAM_RESET_EVENT SCTP_STREAM_RESET_EVENT + SCTP_ASSOC_RESET_EVENT, +#define SCTP_ASSOC_RESET_EVENT SCTP_ASSOC_RESET_EVENT + SCTP_STREAM_CHANGE_EVENT, +#define SCTP_STREAM_CHANGE_EVENT SCTP_STREAM_CHANGE_EVENT }; /* Notification error codes used to fill up the error fields in some diff --git a/include/uapi/linux/serio.h b/include/uapi/linux/serio.h index ccd0ccd00f470b92090f1a6eff4a5a03940fa7f6..ac217c6f0151063ba759b7895472dcddb11b29c9 100644 --- a/include/uapi/linux/serio.h +++ b/include/uapi/linux/serio.h @@ -80,5 +80,6 @@ #define SERIO_WACOM_IV 0x3e #define SERIO_EGALAX 0x3f #define SERIO_PULSE8_CEC 0x40 +#define SERIO_RAINSHADOW_CEC 0x41 #endif /* _UAPI_SERIO_H */ diff --git a/include/uapi/linux/smc_diag.h b/include/uapi/linux/smc_diag.h index 0063919fea344664ee657913ffdb1a49c9158a25..87712bfaa9dd00d431cbc1953da0462a52d7e119 100644 --- a/include/uapi/linux/smc_diag.h +++ b/include/uapi/linux/smc_diag.h @@ -3,7 +3,7 @@ #include #include -#include +#include /* Request structure */ struct smc_diag_req { diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 3b2bed7ca9a4d92c5671e614f2bc598668805f75..95cffcb21dfdba7c974706131d0f43e21435e82d 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -177,7 +177,6 @@ enum LINUX_MIB_TIMEWAITED, /* TimeWaited */ LINUX_MIB_TIMEWAITRECYCLED, /* TimeWaitRecycled */ LINUX_MIB_TIMEWAITKILLED, /* TimeWaitKilled */ - LINUX_MIB_PAWSPASSIVEREJECTED, /* PAWSPassiveRejected */ LINUX_MIB_PAWSACTIVEREJECTED, /* PAWSActiveRejected */ LINUX_MIB_PAWSESTABREJECTED, /* PAWSEstabRejected */ LINUX_MIB_DELAYEDACKS, /* DelayedACKs */ @@ -260,6 +259,7 @@ enum LINUX_MIB_TCPFASTOPENPASSIVEFAIL, /* TCPFastOpenPassiveFail */ LINUX_MIB_TCPFASTOPENLISTENOVERFLOW, /* TCPFastOpenListenOverflow */ LINUX_MIB_TCPFASTOPENCOOKIEREQD, /* TCPFastOpenCookieReqd */ + LINUX_MIB_TCPFASTOPENBLACKHOLE, /* TCPFastOpenBlackholeDetect */ LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES, /* TCPSpuriousRtxHostQueues */ LINUX_MIB_BUSYPOLLRXPACKETS, /* BusyPollRxPackets */ LINUX_MIB_TCPAUTOCORKING, /* TCPAutoCorking */ diff --git a/include/uapi/linux/spi/Kbuild b/include/uapi/linux/spi/Kbuild deleted file mode 100644 index 0cc747eff165543ce28d45b23f676ca754bc9984..0000000000000000000000000000000000000000 --- a/include/uapi/linux/spi/Kbuild +++ /dev/null @@ -1,2 +0,0 @@ -# UAPI Header export list -header-y += spidev.h diff --git a/include/uapi/linux/stat.h b/include/uapi/linux/stat.h index 
d538897b8e08bb4358c56148c189fc7b8128a9da..17b10304c393355da9ed2da743107a5c59748290 100644 --- a/include/uapi/linux/stat.h +++ b/include/uapi/linux/stat.h @@ -48,17 +48,13 @@ * tv_sec holds the number of seconds before (negative) or after (positive) * 00:00:00 1st January 1970 UTC. * - * tv_nsec holds a number of nanoseconds before (0..-999,999,999 if tv_sec is - * negative) or after (0..999,999,999 if tv_sec is positive) the tv_sec time. - * - * Note that if both tv_sec and tv_nsec are non-zero, then the two values must - * either be both positive or both negative. + * tv_nsec holds a number of nanoseconds (0..999,999,999) after the tv_sec time. * * __reserved is held in case we need a yet finer resolution. */ struct statx_timestamp { __s64 tv_sec; - __s32 tv_nsec; + __u32 tv_nsec; __s32 __reserved; }; diff --git a/include/uapi/linux/sunrpc/Kbuild b/include/uapi/linux/sunrpc/Kbuild deleted file mode 100644 index 8e02e47c20fb65cbc59ec3ae70bd23bb92af7386..0000000000000000000000000000000000000000 --- a/include/uapi/linux/sunrpc/Kbuild +++ /dev/null @@ -1,2 +0,0 @@ -# UAPI Header export list -header-y += debug.h diff --git a/include/uapi/linux/switchtec_ioctl.h b/include/uapi/linux/switchtec_ioctl.h new file mode 100644 index 0000000000000000000000000000000000000000..3e824e1a649524b545128cba92855e52522acbf8 --- /dev/null +++ b/include/uapi/linux/switchtec_ioctl.h @@ -0,0 +1,132 @@ +/* + * Microsemi Switchtec PCIe Driver + * Copyright (c) 2017, Microsemi Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ + +#ifndef _UAPI_LINUX_SWITCHTEC_IOCTL_H +#define _UAPI_LINUX_SWITCHTEC_IOCTL_H + +#include + +#define SWITCHTEC_IOCTL_PART_CFG0 0 +#define SWITCHTEC_IOCTL_PART_CFG1 1 +#define SWITCHTEC_IOCTL_PART_IMG0 2 +#define SWITCHTEC_IOCTL_PART_IMG1 3 +#define SWITCHTEC_IOCTL_PART_NVLOG 4 +#define SWITCHTEC_IOCTL_PART_VENDOR0 5 +#define SWITCHTEC_IOCTL_PART_VENDOR1 6 +#define SWITCHTEC_IOCTL_PART_VENDOR2 7 +#define SWITCHTEC_IOCTL_PART_VENDOR3 8 +#define SWITCHTEC_IOCTL_PART_VENDOR4 9 +#define SWITCHTEC_IOCTL_PART_VENDOR5 10 +#define SWITCHTEC_IOCTL_PART_VENDOR6 11 +#define SWITCHTEC_IOCTL_PART_VENDOR7 12 +#define SWITCHTEC_IOCTL_NUM_PARTITIONS 13 + +struct switchtec_ioctl_flash_info { + __u64 flash_length; + __u32 num_partitions; + __u32 padding; +}; + +struct switchtec_ioctl_flash_part_info { + __u32 flash_partition; + __u32 address; + __u32 length; + __u32 active; +}; + +struct switchtec_ioctl_event_summary { + __u64 global; + __u64 part_bitmap; + __u32 local_part; + __u32 padding; + __u32 part[48]; + __u32 pff[48]; +}; + +#define SWITCHTEC_IOCTL_EVENT_STACK_ERROR 0 +#define SWITCHTEC_IOCTL_EVENT_PPU_ERROR 1 +#define SWITCHTEC_IOCTL_EVENT_ISP_ERROR 2 +#define SWITCHTEC_IOCTL_EVENT_SYS_RESET 3 +#define SWITCHTEC_IOCTL_EVENT_FW_EXC 4 +#define SWITCHTEC_IOCTL_EVENT_FW_NMI 5 +#define SWITCHTEC_IOCTL_EVENT_FW_NON_FATAL 6 +#define SWITCHTEC_IOCTL_EVENT_FW_FATAL 7 +#define SWITCHTEC_IOCTL_EVENT_TWI_MRPC_COMP 8 +#define SWITCHTEC_IOCTL_EVENT_TWI_MRPC_COMP_ASYNC 9 +#define SWITCHTEC_IOCTL_EVENT_CLI_MRPC_COMP 10 +#define SWITCHTEC_IOCTL_EVENT_CLI_MRPC_COMP_ASYNC 11 +#define SWITCHTEC_IOCTL_EVENT_GPIO_INT 12 +#define SWITCHTEC_IOCTL_EVENT_PART_RESET 13 +#define SWITCHTEC_IOCTL_EVENT_MRPC_COMP 14 +#define SWITCHTEC_IOCTL_EVENT_MRPC_COMP_ASYNC 15 +#define SWITCHTEC_IOCTL_EVENT_DYN_PART_BIND_COMP 16 +#define SWITCHTEC_IOCTL_EVENT_AER_IN_P2P 17 +#define SWITCHTEC_IOCTL_EVENT_AER_IN_VEP 18 +#define SWITCHTEC_IOCTL_EVENT_DPC 19 +#define SWITCHTEC_IOCTL_EVENT_CTS 20 +#define SWITCHTEC_IOCTL_EVENT_HOTPLUG 21 +#define SWITCHTEC_IOCTL_EVENT_IER 22 +#define SWITCHTEC_IOCTL_EVENT_THRESH 23 +#define SWITCHTEC_IOCTL_EVENT_POWER_MGMT 24 +#define SWITCHTEC_IOCTL_EVENT_TLP_THROTTLING 25 +#define SWITCHTEC_IOCTL_EVENT_FORCE_SPEED 26 +#define SWITCHTEC_IOCTL_EVENT_CREDIT_TIMEOUT 27 +#define SWITCHTEC_IOCTL_EVENT_LINK_STATE 28 +#define SWITCHTEC_IOCTL_MAX_EVENTS 29 + +#define SWITCHTEC_IOCTL_EVENT_LOCAL_PART_IDX -1 +#define SWITCHTEC_IOCTL_EVENT_IDX_ALL -2 + +#define SWITCHTEC_IOCTL_EVENT_FLAG_CLEAR (1 << 0) +#define SWITCHTEC_IOCTL_EVENT_FLAG_EN_POLL (1 << 1) +#define SWITCHTEC_IOCTL_EVENT_FLAG_EN_LOG (1 << 2) +#define SWITCHTEC_IOCTL_EVENT_FLAG_EN_CLI (1 << 3) +#define SWITCHTEC_IOCTL_EVENT_FLAG_EN_FATAL (1 << 4) +#define SWITCHTEC_IOCTL_EVENT_FLAG_DIS_POLL (1 << 5) +#define SWITCHTEC_IOCTL_EVENT_FLAG_DIS_LOG (1 << 6) +#define SWITCHTEC_IOCTL_EVENT_FLAG_DIS_CLI (1 << 7) +#define SWITCHTEC_IOCTL_EVENT_FLAG_DIS_FATAL (1 << 8) +#define SWITCHTEC_IOCTL_EVENT_FLAG_UNUSED (~0x1ff) + +struct switchtec_ioctl_event_ctl { + __u32 event_id; + __s32 index; + __u32 flags; + __u32 occurred; + __u32 count; + __u32 data[5]; +}; + +#define SWITCHTEC_IOCTL_PFF_VEP 100 +struct switchtec_ioctl_pff_port { + __u32 pff; + __u32 partition; + __u32 port; +}; + +#define SWITCHTEC_IOCTL_FLASH_INFO \ + _IOR('W', 0x40, struct switchtec_ioctl_flash_info) +#define SWITCHTEC_IOCTL_FLASH_PART_INFO \ + _IOWR('W', 0x41, struct switchtec_ioctl_flash_part_info) +#define SWITCHTEC_IOCTL_EVENT_SUMMARY \ + _IOR('W', 0x42, struct 
switchtec_ioctl_event_summary) +#define SWITCHTEC_IOCTL_EVENT_CTL \ + _IOWR('W', 0x43, struct switchtec_ioctl_event_ctl) +#define SWITCHTEC_IOCTL_PFF_TO_PORT \ + _IOWR('W', 0x44, struct switchtec_ioctl_pff_port) +#define SWITCHTEC_IOCTL_PORT_TO_PFF \ + _IOWR('W', 0x45, struct switchtec_ioctl_pff_port) + +#endif diff --git a/include/uapi/linux/sysctl.h b/include/uapi/linux/sysctl.h index d2b12152e358f14e791ef3e842cb6eac7cc8ceec..e13d48058b8d0e5cf36e458e68e257d73a9a1e8f 100644 --- a/include/uapi/linux/sysctl.h +++ b/include/uapi/linux/sysctl.h @@ -568,6 +568,7 @@ enum { NET_IPV6_PROXY_NDP=23, NET_IPV6_ACCEPT_SOURCE_ROUTE=25, NET_IPV6_ACCEPT_RA_FROM_LOCAL=26, + NET_IPV6_ACCEPT_RA_RT_INFO_MIN_PLEN=27, __NET_IPV6_MAX }; diff --git a/include/uapi/linux/tc_act/Kbuild b/include/uapi/linux/tc_act/Kbuild deleted file mode 100644 index ba62ddf0e58ab3dba056ae17b5b5379e98b7627e..0000000000000000000000000000000000000000 --- a/include/uapi/linux/tc_act/Kbuild +++ /dev/null @@ -1,16 +0,0 @@ -# UAPI Header export list -header-y += tc_csum.h -header-y += tc_defact.h -header-y += tc_gact.h -header-y += tc_ipt.h -header-y += tc_mirred.h -header-y += tc_sample.h -header-y += tc_nat.h -header-y += tc_pedit.h -header-y += tc_skbedit.h -header-y += tc_vlan.h -header-y += tc_bpf.h -header-y += tc_connmark.h -header-y += tc_ife.h -header-y += tc_tunnel_key.h -header-y += tc_skbmod.h diff --git a/include/uapi/linux/tc_ematch/Kbuild b/include/uapi/linux/tc_ematch/Kbuild deleted file mode 100644 index 53fca39255351730cfd7384d4ca4ccefbb6414c1..0000000000000000000000000000000000000000 --- a/include/uapi/linux/tc_ematch/Kbuild +++ /dev/null @@ -1,5 +0,0 @@ -# UAPI Header export list -header-y += tc_em_cmp.h -header-y += tc_em_meta.h -header-y += tc_em_nbyte.h -header-y += tc_em_text.h diff --git a/include/uapi/linux/tee.h b/include/uapi/linux/tee.h new file mode 100644 index 0000000000000000000000000000000000000000..370d8845ab2173a733ccbf7091f9ce560024bb0e --- /dev/null +++ b/include/uapi/linux/tee.h @@ -0,0 +1,346 @@ +/* + * Copyright (c) 2015-2016, Linaro Limited + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __TEE_H +#define __TEE_H + +#include <linux/ioctl.h> +#include <linux/types.h> + +/* + * This file describes the API provided by a TEE driver to user space. + * + * Each TEE driver defines a TEE specific protocol which is used for the + * data passed back and forth using TEE_IOC_CMD. + */ + +/* Helpers to make the ioctl defines */ +#define TEE_IOC_MAGIC 0xa4 +#define TEE_IOC_BASE 0 + +/* Flags relating to shared memory */ +#define TEE_IOCTL_SHM_MAPPED 0x1 /* memory mapped in normal world */ +#define TEE_IOCTL_SHM_DMA_BUF 0x2 /* dma-buf handle on shared memory */ + +#define TEE_MAX_ARG_SIZE 1024 + +#define TEE_GEN_CAP_GP (1 << 0)/* GlobalPlatform compliant TEE */ + +/* + * TEE Implementation ID + */ +#define TEE_IMPL_ID_OPTEE 1 + +/* + * OP-TEE specific capabilities + */ +#define TEE_OPTEE_CAP_TZ (1 << 0) + +/** + * struct tee_ioctl_version_data - TEE version + * @impl_id: [out] TEE implementation id + * @impl_caps: [out] Implementation specific capabilities + * @gen_caps: [out] Generic capabilities, defined by TEE_GEN_CAP_* above + * + * Identifies the TEE implementation, @impl_id is one of TEE_IMPL_ID_* above. + * @impl_caps is implementation specific, for example TEE_OPTEE_CAP_* + * is valid when @impl_id == TEE_IMPL_ID_OPTEE. + */ +struct tee_ioctl_version_data { + __u32 impl_id; + __u32 impl_caps; + __u32 gen_caps; +}; + +/** + * TEE_IOC_VERSION - query version of TEE + * + * Takes a tee_ioctl_version_data struct and returns with the TEE version + * data filled in. + */ +#define TEE_IOC_VERSION _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 0, \ + struct tee_ioctl_version_data) + +/** + * struct tee_ioctl_shm_alloc_data - Shared memory allocate argument + * @size: [in/out] Size of shared memory to allocate + * @flags: [in/out] Flags to/from allocation. + * @id: [out] Identifier of the shared memory + * + * The flags field should currently be zero as input. Updated by the call + * with actual flags as defined by TEE_IOCTL_SHM_* above. + * This structure is used as argument for TEE_IOC_SHM_ALLOC below. + */ +struct tee_ioctl_shm_alloc_data { + __u64 size; + __u32 flags; + __s32 id; +}; + +/** + * TEE_IOC_SHM_ALLOC - allocate shared memory + * + * Allocates shared memory between the user space process and secure OS. + * + * Returns a file descriptor on success or < 0 on failure + * + * The returned file descriptor is used to map the shared memory into user + * space. The shared memory is freed when the descriptor is closed and the + * memory is unmapped. + */ +#define TEE_IOC_SHM_ALLOC _IOWR(TEE_IOC_MAGIC, TEE_IOC_BASE + 1, \ + struct tee_ioctl_shm_alloc_data) + +/** + * struct tee_ioctl_buf_data - Variable sized buffer + * @buf_ptr: [in] A __user pointer to a buffer + * @buf_len: [in] Length of the buffer above + * + * Used as argument for TEE_IOC_OPEN_SESSION, TEE_IOC_INVOKE, + * TEE_IOC_SUPPL_RECV, and TEE_IOC_SUPPL_SEND below.
+ */ +struct tee_ioctl_buf_data { + __u64 buf_ptr; + __u64 buf_len; +}; + +/* + * Attributes for struct tee_ioctl_param, selects how @a, @b and @c are used + */ +#define TEE_IOCTL_PARAM_ATTR_TYPE_NONE 0 /* parameter not used */ + +/* + * These define value parameters (struct tee_ioctl_param_value) + */ +#define TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INPUT 1 +#define TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_OUTPUT 2 +#define TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_INOUT 3 /* input and output */ + +/* + * These define shared memory reference parameters (struct + * tee_ioctl_param_memref) + */ +#define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INPUT 5 +#define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_OUTPUT 6 +#define TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT 7 /* input and output */ + +/* + * Mask for the type part of the attribute, leaves room for more types + */ +#define TEE_IOCTL_PARAM_ATTR_TYPE_MASK 0xff + +/* + * Matches TEEC_LOGIN_* in the GP TEE Client API. + * Only defined for GP compliant TEEs. + */ +#define TEE_IOCTL_LOGIN_PUBLIC 0 +#define TEE_IOCTL_LOGIN_USER 1 +#define TEE_IOCTL_LOGIN_GROUP 2 +#define TEE_IOCTL_LOGIN_APPLICATION 4 +#define TEE_IOCTL_LOGIN_USER_APPLICATION 5 +#define TEE_IOCTL_LOGIN_GROUP_APPLICATION 6 + +/** + * struct tee_ioctl_param - parameter + * @attr: attributes + * @a: if a memref, offset into the shared memory object, else a value parameter + * @b: if a memref, size of the buffer, else a value parameter + * @c: if a memref, shared memory identifier, else a value parameter + * + * @attr & TEE_IOCTL_PARAM_ATTR_TYPE_MASK indicates if a memref or value is + * used. TEE_IOCTL_PARAM_ATTR_TYPE_VALUE_* indicates value and + * TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_* indicates memref. + * TEE_IOCTL_PARAM_ATTR_TYPE_NONE indicates that none of the members are used. + * + * Shared memory is allocated with TEE_IOC_SHM_ALLOC which returns an + * identifier representing the shared memory object. A memref can reference + * a part of a shared memory object by specifying an offset (@a) and size (@b) + * of the object. To supply the entire shared memory object set the offset + * (@a) to 0 and size (@b) to the previously returned size of the object.
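+ *
+ * Editor's sketch (illustrative, not part of the original patch; shm_id is
+ * an id previously returned by TEE_IOC_SHM_ALLOC): an in/out memref covering
+ * the first 256 bytes of a shared memory object looks like
+ *
+ *	struct tee_ioctl_param p = {
+ *		.attr = TEE_IOCTL_PARAM_ATTR_TYPE_MEMREF_INOUT,
+ *		.a = 0,		(offset into the object)
+ *		.b = 256,	(size of the referenced window)
+ *		.c = shm_id,	(shared memory identifier)
+ *	};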
+ */ +struct tee_ioctl_param { + __u64 attr; + __u64 a; + __u64 b; + __u64 c; +}; + +#define TEE_IOCTL_UUID_LEN 16 + +/** + * struct tee_ioctl_open_session_arg - Open session argument + * @uuid: [in] UUID of the Trusted Application + * @clnt_uuid: [in] UUID of client + * @clnt_login: [in] Login class of client, TEE_IOCTL_LOGIN_* above + * @cancel_id: [in] Cancellation id, a unique value to identify this request + * @session: [out] Session id + * @ret: [out] return value + * @ret_origin: [out] origin of the return value + * @num_params: [in] number of parameters following this struct + */ +struct tee_ioctl_open_session_arg { + __u8 uuid[TEE_IOCTL_UUID_LEN]; + __u8 clnt_uuid[TEE_IOCTL_UUID_LEN]; + __u32 clnt_login; + __u32 cancel_id; + __u32 session; + __u32 ret; + __u32 ret_origin; + __u32 num_params; + /* num_params tells the actual number of elements in params */ + struct tee_ioctl_param params[]; +}; + +/** + * TEE_IOC_OPEN_SESSION - opens a session to a Trusted Application + * + * Takes a struct tee_ioctl_buf_data which contains a struct + * tee_ioctl_open_session_arg followed by an array of struct + * tee_ioctl_param + */ +#define TEE_IOC_OPEN_SESSION _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 2, \ + struct tee_ioctl_buf_data) + +/** + * struct tee_ioctl_invoke_arg - Invokes a function in a Trusted + * Application + * @func: [in] Trusted Application function, specific to the TA + * @session: [in] Session id + * @cancel_id: [in] Cancellation id, a unique value to identify this request + * @ret: [out] return value + * @ret_origin: [out] origin of the return value + * @num_params: [in] number of parameters following this struct + */ +struct tee_ioctl_invoke_arg { + __u32 func; + __u32 session; + __u32 cancel_id; + __u32 ret; + __u32 ret_origin; + __u32 num_params; + /* num_params tells the actual number of elements in params */ + struct tee_ioctl_param params[]; +}; + +/** + * TEE_IOC_INVOKE - Invokes a function in a Trusted Application + * + * Takes a struct tee_ioctl_buf_data which contains a struct + * tee_ioctl_invoke_arg followed by an array of struct tee_ioctl_param + */ +#define TEE_IOC_INVOKE _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 3, \ + struct tee_ioctl_buf_data) + +/** + * struct tee_ioctl_cancel_arg - Cancels an open session or invoke ioctl + * @cancel_id: [in] Cancellation id, a unique value to identify this request + * @session: [in] Session id, if the session is opened, else set to 0 + */ +struct tee_ioctl_cancel_arg { + __u32 cancel_id; + __u32 session; +}; + +/** + * TEE_IOC_CANCEL - Cancels an open session or invoke + */ +#define TEE_IOC_CANCEL _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 4, \ + struct tee_ioctl_cancel_arg) + +/** + * struct tee_ioctl_close_session_arg - Closes an open session + * @session: [in] Session id + */ +struct tee_ioctl_close_session_arg { + __u32 session; +}; + +/** + * TEE_IOC_CLOSE_SESSION - Closes a session + */ +#define TEE_IOC_CLOSE_SESSION _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 5, \ + struct tee_ioctl_close_session_arg) + +/** + * struct tee_iocl_supp_recv_arg - Receive a request for a supplicant function + * @func: [in] supplicant function + * @num_params: [in/out] number of parameters following this struct + * + * @num_params is the number of params that tee-supplicant has room to + * receive when input, @num_params is the number of actual params + * tee-supplicant receives when output.
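+ *
+ * Editor's note (clarification, not in the original patch): a supplicant
+ * that reserves room for, say, four parameters passes num_params = 4 on
+ * input; on return num_params tells how many of those entries the received
+ * request actually filled in.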
+ */ +struct tee_iocl_supp_recv_arg { + __u32 func; + __u32 num_params; + /* num_params tells the actual number of elements in params */ + struct tee_ioctl_param params[]; +}; + +/** + * TEE_IOC_SUPPL_RECV - Receive a request for a supplicant function + * + * Takes a struct tee_ioctl_buf_data which contains a struct + * tee_iocl_supp_recv_arg followed by an array of struct tee_ioctl_param + */ +#define TEE_IOC_SUPPL_RECV _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 6, \ + struct tee_ioctl_buf_data) + +/** + * struct tee_iocl_supp_send_arg - Send a response to a received request + * @ret: [out] return value + * @num_params: [in] number of parameters following this struct + */ +struct tee_iocl_supp_send_arg { + __u32 ret; + __u32 num_params; + /* num_params tells the actual number of elements in params */ + struct tee_ioctl_param params[]; +}; + +/** + * TEE_IOC_SUPPL_SEND - Send a response to a received request + * + * Takes a struct tee_ioctl_buf_data which contains a struct + * tee_iocl_supp_send_arg followed by an array of struct tee_ioctl_param + */ +#define TEE_IOC_SUPPL_SEND _IOR(TEE_IOC_MAGIC, TEE_IOC_BASE + 7, \ + struct tee_ioctl_buf_data) + +/* + * Five syscalls are used when communicating with the TEE driver. + * open(): opens the device associated with the driver + * ioctl(): as described above operating on the file descriptor from open() + * close(): two cases + * - closes the device file descriptor + * - closes a file descriptor connected to allocated shared memory + * mmap(): maps shared memory into user space using information from struct + * tee_ioctl_shm_alloc_data + * munmap(): unmaps previously shared memory + */ + +#endif /*__TEE_H*/ diff --git a/include/uapi/linux/usb/Kbuild b/include/uapi/linux/usb/Kbuild deleted file mode 100644 index 4cc4d6e7e5238e89c3887490b923c30d363a1dca..0000000000000000000000000000000000000000 --- a/include/uapi/linux/usb/Kbuild +++ /dev/null @@ -1,12 +0,0 @@ -# UAPI Header export list -header-y += audio.h -header-y += cdc.h -header-y += cdc-wdm.h -header-y += ch11.h -header-y += ch9.h -header-y += functionfs.h -header-y += g_printer.h -header-y += gadgetfs.h -header-y += midi.h -header-y += tmc.h -header-y += video.h diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 2c5d7c4a69e3e7585a4d8e1eb60b82a8ffd723c5..ce1169af39d72c58663e08f4ff7e0c175f467507 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -224,7 +224,8 @@ struct usb_ctrlrequest { * through the Linux-USB APIs, they are not converted to cpu byte * order; it is the responsibility of the client code to do this. * The single exception is when device and configuration descriptors (but - * not other descriptors) are read from usbfs (i.e. /proc/bus/usb/BBB/DDD); + * not other descriptors) are read from character devices + * (i.e. /dev/bus/usb/BBB/DDD); * in this case the fields are converted to host endianness by the kernel.
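 * Editor's note (sketch, not part of the original patch; ep_desc is a
 * hypothetical raw endpoint descriptor read from such a device file): all
 * other descriptors keep wire (little-endian) order, so user space converts
 * multi-byte fields itself, e.g. with <endian.h>:
 *	__u16 maxp = le16toh(ep_desc->wMaxPacketSize);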
 */ diff --git a/include/uapi/linux/usb/functionfs.h b/include/uapi/linux/usb/functionfs.h index b2a31a55a61237e65e4939a9d2722e136cc73ca0..062606f02309f814ffd7c9a4e5ef5a0f4f31abe5 100644 --- a/include/uapi/linux/usb/functionfs.h +++ b/include/uapi/linux/usb/functionfs.h @@ -158,7 +158,7 @@ struct usb_ext_prop_desc { * |-----+-----------------------+------+-------------------------------------| * | 0 | bFirstInterfaceNumber | U8 | index of the interface or of the 1st| * | | | | interface in an IAD group | - * | 1 | Reserved | U8 | 0 | + * | 1 | Reserved | U8 | 1 | * | 2 | CompatibleID | U8[8]| compatible ID string | * | 10 | SubCompatibleID | U8[8]| subcompatible ID string | * | 18 | Reserved | U8[6]| 0 | diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 519eff362c1caa14e932836816a020ed0f84ed4d..ae461050661afc4ea86413cf2c3e384281ad35f1 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -198,6 +198,7 @@ struct vfio_device_info { #define VFIO_DEVICE_FLAGS_PCI (1 << 1) /* vfio-pci device */ #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2) /* vfio-platform device */ #define VFIO_DEVICE_FLAGS_AMBA (1 << 3) /* vfio-amba device */ +#define VFIO_DEVICE_FLAGS_CCW (1 << 4) /* vfio-ccw device */ __u32 num_regions; /* Max region index + 1 */ __u32 num_irqs; /* Max IRQ index + 1 */ }; @@ -212,6 +213,7 @@ struct vfio_device_info { #define VFIO_DEVICE_API_PCI_STRING "vfio-pci" #define VFIO_DEVICE_API_PLATFORM_STRING "vfio-platform" #define VFIO_DEVICE_API_AMBA_STRING "vfio-amba" +#define VFIO_DEVICE_API_CCW_STRING "vfio-ccw" /** * VFIO_DEVICE_GET_REGION_INFO - _IOWR(VFIO_TYPE, VFIO_BASE + 8, @@ -446,6 +448,22 @@ enum { VFIO_PCI_NUM_IRQS }; +/* + * The vfio-ccw bus driver makes use of the following fixed region and + * IRQ index mapping. Unimplemented regions return a size of zero. + * Unimplemented IRQ types return a count of zero. + */ + +enum { + VFIO_CCW_CONFIG_REGION_INDEX, + VFIO_CCW_NUM_REGIONS +}; + +enum { + VFIO_CCW_IO_IRQ_INDEX, + VFIO_CCW_NUM_IRQS +}; + /** * VFIO_DEVICE_GET_PCI_HOT_RESET_INFO - _IORW(VFIO_TYPE, VFIO_BASE + 12, * struct vfio_pci_hot_reset_info) diff --git a/include/uapi/linux/vfio_ccw.h b/include/uapi/linux/vfio_ccw.h new file mode 100644 index 0000000000000000000000000000000000000000..34a7f6f9e0658e7d3e33573df227eb2ba04a5194 --- /dev/null +++ b/include/uapi/linux/vfio_ccw.h @@ -0,0 +1,24 @@ +/* + * Interfaces for vfio-ccw + * + * Copyright IBM Corp. 2017 + * + * Author(s): Dong Jia Shi <bjsdjshi@linux.vnet.ibm.com> + */ + +#ifndef _VFIO_CCW_H_ +#define _VFIO_CCW_H_ + +#include <linux/types.h> + +struct ccw_io_region { +#define ORB_AREA_SIZE 12 + __u8 orb_area[ORB_AREA_SIZE]; +#define SCSW_AREA_SIZE 12 + __u8 scsw_area[SCSW_AREA_SIZE]; +#define IRB_AREA_SIZE 96 + __u8 irb_area[IRB_AREA_SIZE]; + __u32 ret_code; +} __packed; + +#endif diff --git a/include/uapi/linux/videodev2.h b/include/uapi/linux/videodev2.h index 45184a2ef66c219c893769dd73bc5c3485a5c814..2b8feb86d09e902435902145f0707b58b57921f3 100644 --- a/include/uapi/linux/videodev2.h +++ b/include/uapi/linux/videodev2.h @@ -143,6 +143,7 @@ enum v4l2_buf_type { V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE = 10, V4L2_BUF_TYPE_SDR_CAPTURE = 11, V4L2_BUF_TYPE_SDR_OUTPUT = 12, + V4L2_BUF_TYPE_META_CAPTURE = 13, /* Deprecated, do not use */ V4L2_BUF_TYPE_PRIVATE = 0x80, }; @@ -362,8 +363,7 @@ enum v4l2_quantization { /* * The default for R'G'B' quantization is always full range, except * for the BT2020 colorspace. For Y'CbCr the quantization is always - * limited range, except for COLORSPACE_JPEG, XV601 or XV709: those - * are full range.
+ * limited range, except for COLORSPACE_JPEG: this is full range. */ V4L2_QUANTIZATION_DEFAULT = 0, V4L2_QUANTIZATION_FULL_RANGE = 1, @@ -378,8 +378,7 @@ enum v4l2_quantization { #define V4L2_MAP_QUANTIZATION_DEFAULT(is_rgb_or_hsv, colsp, ycbcr_enc) \ (((is_rgb_or_hsv) && (colsp) == V4L2_COLORSPACE_BT2020) ? \ V4L2_QUANTIZATION_LIM_RANGE : \ - (((is_rgb_or_hsv) || (ycbcr_enc) == V4L2_YCBCR_ENC_XV601 || \ - (ycbcr_enc) == V4L2_YCBCR_ENC_XV709 || (colsp) == V4L2_COLORSPACE_JPEG) ? \ + (((is_rgb_or_hsv) || (colsp) == V4L2_COLORSPACE_JPEG) ? \ V4L2_QUANTIZATION_FULL_RANGE : V4L2_QUANTIZATION_LIM_RANGE)) enum v4l2_priority { @@ -453,6 +452,7 @@ struct v4l2_capability { #define V4L2_CAP_SDR_CAPTURE 0x00100000 /* Is a SDR capture device */ #define V4L2_CAP_EXT_PIX_FORMAT 0x00200000 /* Supports the extended pixel format */ #define V4L2_CAP_SDR_OUTPUT 0x00400000 /* Is a SDR output device */ +#define V4L2_CAP_META_CAPTURE 0x00800000 /* Is a metadata capture device */ #define V4L2_CAP_READWRITE 0x01000000 /* read/write systemcalls */ #define V4L2_CAP_ASYNCIO 0x02000000 /* async I/O */ @@ -661,6 +661,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_Y12I v4l2_fourcc('Y', '1', '2', 'I') /* Greyscale 12-bit L/R interleaved */ #define V4L2_PIX_FMT_Z16 v4l2_fourcc('Z', '1', '6', ' ') /* Depth data 16-bit */ #define V4L2_PIX_FMT_MT21C v4l2_fourcc('M', 'T', '2', '1') /* Mediatek compressed block mode */ +#define V4L2_PIX_FMT_INZI v4l2_fourcc('I', 'N', 'Z', 'I') /* Intel Planar Greyscale 10-bit and Depth 16-bit */ /* SDR formats - used only for Software Defined Radio devices */ #define V4L2_SDR_FMT_CU8 v4l2_fourcc('C', 'U', '0', '8') /* IQ u8 */ @@ -675,6 +676,10 @@ struct v4l2_pix_format { #define V4L2_TCH_FMT_TU16 v4l2_fourcc('T', 'U', '1', '6') /* 16-bit unsigned touch data */ #define V4L2_TCH_FMT_TU08 v4l2_fourcc('T', 'U', '0', '8') /* 8-bit unsigned touch data */ + +/* Meta-data formats */ +#define V4L2_META_FMT_VSP1_HGO v4l2_fourcc('V', 'S', 'P', 'H') /* R-Car VSP1 1-D Histogram */ +#define V4L2_META_FMT_VSP1_HGT v4l2_fourcc('V', 'S', 'P', 'T') /* R-Car VSP1 2-D Histogram */ + /* priv field value to indicate that subsequent fields are valid.
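 * Editor's note (clarification, not in the original patch): an application
 * stores this magic in struct v4l2_pix_format.priv to indicate that the
 * extended fields (flags, ycbcr_enc, quantization, xfer_func) carry valid
 * data.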
 */ #define V4L2_PIX_FMT_PRIV_MAGIC 0xfeedcafe @@ -1654,6 +1659,7 @@ struct v4l2_querymenu { #define V4L2_CTRL_FLAG_VOLATILE 0x0080 #define V4L2_CTRL_FLAG_HAS_PAYLOAD 0x0100 #define V4L2_CTRL_FLAG_EXECUTE_ON_WRITE 0x0200 +#define V4L2_CTRL_FLAG_MODIFY_LAYOUT 0x0400 /* Query flags, to be ORed with the control ID */ #define V4L2_CTRL_FLAG_NEXT_CTRL 0x80000000 @@ -2086,6 +2092,16 @@ struct v4l2_sdr_format { __u8 reserved[24]; } __attribute__ ((packed)); +/** + * struct v4l2_meta_format - metadata format definition + * @dataformat: little endian four character code (fourcc) + * @buffersize: maximum size in bytes required for data + */ +struct v4l2_meta_format { + __u32 dataformat; + __u32 buffersize; +} __attribute__ ((packed)); + /** * struct v4l2_format - stream data format * @type: enum v4l2_buf_type; type of the data stream @@ -2105,6 +2121,7 @@ struct v4l2_format { struct v4l2_vbi_format vbi; /* V4L2_BUF_TYPE_VBI_CAPTURE */ struct v4l2_sliced_vbi_format sliced; /* V4L2_BUF_TYPE_SLICED_VBI_CAPTURE */ struct v4l2_sdr_format sdr; /* V4L2_BUF_TYPE_SDR_CAPTURE */ + struct v4l2_meta_format meta; /* V4L2_BUF_TYPE_META_CAPTURE */ __u8 raw_data[200]; /* user-defined */ } fmt; }; diff --git a/include/uapi/linux/vsockmon.h b/include/uapi/linux/vsockmon.h new file mode 100644 index 0000000000000000000000000000000000000000..a08b522ef597894b092ecc0fbfe6f35a2580fc1c --- /dev/null +++ b/include/uapi/linux/vsockmon.h @@ -0,0 +1,60 @@ +#ifndef _UAPI_VSOCKMON_H +#define _UAPI_VSOCKMON_H + +#include <linux/virtio_vsock.h> + +/* + * vsockmon is the AF_VSOCK packet capture device. Packets captured have the + * following layout: + * + * +-----------------------------------+ + * | vsockmon header | + * | (struct af_vsockmon_hdr) | + * +-----------------------------------+ + * | transport header | + * | (af_vsockmon_hdr->len bytes long) | + * +-----------------------------------+ + * | payload | + * | (until end of packet) | + * +-----------------------------------+ + * + * The vsockmon header is a transport-independent description of the packet. + * It duplicates some of the information from the transport header so that + * no transport-specific knowledge is necessary to process packets. + * + * The transport header is useful for low-level transport-specific packet + * analysis. Transport type is given in af_vsockmon_hdr->transport and + * transport header length is given in af_vsockmon_hdr->len. + * + * If af_vsockmon_hdr->op is AF_VSOCK_OP_PAYLOAD then the payload follows the + * transport header. Other ops do not have a payload.
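+ *
+ * Editor's sketch (illustrative, not part of the original patch; data
+ * points at one captured packet, le16toh() is from <endian.h>): a decoder
+ * walks the layout above as
+ *
+ *	struct af_vsockmon_hdr *hdr = (struct af_vsockmon_hdr *)data;
+ *	unsigned char *thdr = data + sizeof(*hdr);
+ *	unsigned char *payload = thdr + le16toh(hdr->len);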
+ */ + +struct af_vsockmon_hdr { + __le64 src_cid; + __le64 dst_cid; + __le32 src_port; + __le32 dst_port; + __le16 op; /* enum af_vsockmon_op */ + __le16 transport; /* enum af_vsockmon_transport */ + __le16 len; /* Transport header length */ + __u8 reserved[2]; +}; + +enum af_vsockmon_op { + AF_VSOCK_OP_UNKNOWN = 0, + AF_VSOCK_OP_CONNECT = 1, + AF_VSOCK_OP_DISCONNECT = 2, + AF_VSOCK_OP_CONTROL = 3, + AF_VSOCK_OP_PAYLOAD = 4, +}; + +enum af_vsockmon_transport { + AF_VSOCK_TRANSPORT_UNKNOWN = 0, + AF_VSOCK_TRANSPORT_NO_INFO = 1, /* No transport information */ + + /* Transport header type: struct virtio_vsock_hdr */ + AF_VSOCK_TRANSPORT_VIRTIO = 2, +}; + +#endif diff --git a/include/uapi/linux/wimax/Kbuild b/include/uapi/linux/wimax/Kbuild deleted file mode 100644 index 1c97be49971ff377b152af80aca7e51e6c59a9a2..0000000000000000000000000000000000000000 --- a/include/uapi/linux/wimax/Kbuild +++ /dev/null @@ -1,2 +0,0 @@ -# UAPI Header export list -header-y += i2400m.h diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 1fc62b239f1b7ea9f6da74740f3e35894bbc5a32..2b384ff09fa05411bdffdaa53472591554aa2be6 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -303,6 +303,7 @@ enum xfrm_attr_type_t { XFRMA_PROTO, /* __u8 */ XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ XFRMA_PAD, + XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ __XFRMA_MAX #define XFRMA_MAX (__XFRMA_MAX - 1) @@ -494,6 +495,13 @@ struct xfrm_address_filter { __u8 dplen; }; +struct xfrm_user_offload { + int ifindex; + __u8 flags; +}; +#define XFRM_OFFLOAD_IPV6 1 +#define XFRM_OFFLOAD_INBOUND 2 + #ifndef __KERNEL__ /* backwards compatibility for userspace */ #define XFRMGRP_ACQUIRE 1 diff --git a/include/uapi/misc/Kbuild b/include/uapi/misc/Kbuild deleted file mode 100644 index e96cae7d58c9f758baf4846bfa5bc9cf3d6eee3f..0000000000000000000000000000000000000000 --- a/include/uapi/misc/Kbuild +++ /dev/null @@ -1,2 +0,0 @@ -# misc Header export list -header-y += cxl.h diff --git a/include/uapi/mtd/Kbuild b/include/uapi/mtd/Kbuild deleted file mode 100644 index 5a691e10cd0ecd35db2260c7417145dd5a588aad..0000000000000000000000000000000000000000 --- a/include/uapi/mtd/Kbuild +++ /dev/null @@ -1,6 +0,0 @@ -# UAPI Header export list -header-y += inftl-user.h -header-y += mtd-abi.h -header-y += mtd-user.h -header-y += nftl-user.h -header-y += ubi-user.h diff --git a/include/uapi/rdma/Kbuild b/include/uapi/rdma/Kbuild deleted file mode 100644 index 1e0af1ff75c31a219fd866e52d75629ee3979c78..0000000000000000000000000000000000000000 --- a/include/uapi/rdma/Kbuild +++ /dev/null @@ -1,20 +0,0 @@ -# UAPI Header export list -header-y += ib_user_cm.h -header-y += rdma_user_ioctl.h -header-y += ib_user_mad.h -header-y += ib_user_sa.h -header-y += ib_user_verbs.h -header-y += rdma_netlink.h -header-y += rdma_user_cm.h -header-y += hfi/ -header-y += rdma_user_rxe.h -header-y += cxgb3-abi.h -header-y += cxgb4-abi.h -header-y += mlx4-abi.h -header-y += mlx5-abi.h -header-y += mthca-abi.h -header-y += nes-abi.h -header-y += ocrdma-abi.h -header-y += hns-abi.h -header-y += vmw_pvrdma-abi.h -header-y += qedr-abi.h diff --git a/include/uapi/rdma/bnxt_re-abi.h b/include/uapi/rdma/bnxt_re-abi.h index e2c8a3f0ccecb6f8cceb4a516f3927f4709d8ca0..74018bd18d7216fd0b34cc4d57d06c8138f18881 100644 --- a/include/uapi/rdma/bnxt_re-abi.h +++ b/include/uapi/rdma/bnxt_re-abi.h @@ -39,6 +39,8 @@ #ifndef __BNXT_RE_UVERBS_ABI_H__ #define __BNXT_RE_UVERBS_ABI_H__ +#include <linux/types.h> + #define BNXT_RE_ABI_VERSION 1 struct bnxt_re_uctx_resp {
diff --git a/include/uapi/rdma/hfi/Kbuild b/include/uapi/rdma/hfi/Kbuild deleted file mode 100644 index b65b0b3a5f632ae27f7749dc20934e3bb61f4926..0000000000000000000000000000000000000000 --- a/include/uapi/rdma/hfi/Kbuild +++ /dev/null @@ -1,3 +0,0 @@ -# UAPI Header export list -header-y += hfi1_user.h -header-y += hfi1_ioctl.h diff --git a/include/uapi/rdma/ib_user_verbs.h b/include/uapi/rdma/ib_user_verbs.h index 997f904c76923057a376e0f4f0772ac82dcacef7..270c350bedc6c4911ff0763275fc034e90b2a018 100644 --- a/include/uapi/rdma/ib_user_verbs.h +++ b/include/uapi/rdma/ib_user_verbs.h @@ -947,6 +947,17 @@ struct ib_uverbs_flow_spec_action_tag { __u32 reserved1; }; +struct ib_uverbs_flow_spec_action_drop { + union { + struct ib_uverbs_flow_spec_hdr hdr; + struct { + __u32 type; + __u16 size; + __u16 reserved; + }; + }; +}; + struct ib_uverbs_flow_tunnel_filter { __be32 tunnel_id; }; @@ -1124,4 +1135,6 @@ struct ib_uverbs_ex_destroy_rwq_ind_table { __u32 ind_tbl_handle; }; +#define IB_DEVICE_NAME_MAX 64 + #endif /* IB_USER_VERBS_H */ diff --git a/include/uapi/rdma/vmw_pvrdma-abi.h b/include/uapi/rdma/vmw_pvrdma-abi.h index 5016abc9ee9712abc1fb4e04993940b80460a2be..c8c1d2d6df4d52ad84315727ec69849b9daa32ee 100644 --- a/include/uapi/rdma/vmw_pvrdma-abi.h +++ b/include/uapi/rdma/vmw_pvrdma-abi.h @@ -222,7 +222,7 @@ struct pvrdma_sq_wqe_hdr { __u32 opcode; /* operation type */ __u32 send_flags; /* wr flags */ union { - __u32 imm_data; + __be32 imm_data; __u32 invalidate_rkey; } ex; __u32 reserved; @@ -273,7 +273,7 @@ struct pvrdma_cqe { __u32 opcode; __u32 status; __u32 byte_len; - __u32 imm_data; + __be32 imm_data; __u32 src_qp; __u32 wc_flags; __u32 vendor_err; diff --git a/include/uapi/scsi/Kbuild b/include/uapi/scsi/Kbuild deleted file mode 100644 index d791e0ad509d3fc525f60b220e6e0dcc37c97a08..0000000000000000000000000000000000000000 --- a/include/uapi/scsi/Kbuild +++ /dev/null @@ -1,6 +0,0 @@ -# UAPI Header export list -header-y += fc/ -header-y += scsi_bsg_fc.h -header-y += scsi_netlink.h -header-y += scsi_netlink_fc.h -header-y += cxlflash_ioctl.h diff --git a/include/uapi/scsi/fc/Kbuild b/include/uapi/scsi/fc/Kbuild deleted file mode 100644 index 5ead9fac265c31e26872e8bd991a2e2c81a7064d..0000000000000000000000000000000000000000 --- a/include/uapi/scsi/fc/Kbuild +++ /dev/null @@ -1,5 +0,0 @@ -# UAPI Header export list -header-y += fc_els.h -header-y += fc_fs.h -header-y += fc_gs.h -header-y += fc_ns.h diff --git a/include/uapi/sound/Kbuild b/include/uapi/sound/Kbuild deleted file mode 100644 index 9578d8bdbf31fe5f1675704bc6fad8b090b96b31..0000000000000000000000000000000000000000 --- a/include/uapi/sound/Kbuild +++ /dev/null @@ -1,16 +0,0 @@ -# UAPI Header export list -header-y += asequencer.h -header-y += asoc.h -header-y += asound.h -header-y += asound_fm.h -header-y += compress_offload.h -header-y += compress_params.h -header-y += emu10k1.h -header-y += firewire.h -header-y += hdsp.h -header-y += hdspm.h -header-y += sb16_csp.h -header-y += sfnt_info.h -header-y += tlv.h -header-y += usb_stream.h -header-y += snd_sst_tokens.h diff --git a/include/uapi/sound/asound.h b/include/uapi/sound/asound.h index be353a78c3030d05a5a8d1ebd478171c1e3c3fa7..fd41697cb4d36a24630e42ebb23cef4b21468506 100644 --- a/include/uapi/sound/asound.h +++ b/include/uapi/sound/asound.h @@ -107,9 +107,11 @@ enum { SNDRV_HWDEP_IFACE_FW_DIGI00X, /* Digidesign Digi 002/003 family */ SNDRV_HWDEP_IFACE_FW_TASCAM, /* TASCAM FireWire series */ SNDRV_HWDEP_IFACE_LINE6, /* Line6 USB processors */ + SNDRV_HWDEP_IFACE_FW_MOTU, /* 
MOTU FireWire series */ + SNDRV_HWDEP_IFACE_FW_FIREFACE, /* RME Fireface series */ /* Don't forget to change the following: */ - SNDRV_HWDEP_IFACE_LAST = SNDRV_HWDEP_IFACE_LINE6 + SNDRV_HWDEP_IFACE_LAST = SNDRV_HWDEP_IFACE_FW_FIREFACE }; struct snd_hwdep_info { diff --git a/include/uapi/sound/firewire.h b/include/uapi/sound/firewire.h index db79a12fcc78db70d234b89d4b8bf6f573598224..622900488bdcba396f94641d4b7e9a038fc8c462 100644 --- a/include/uapi/sound/firewire.h +++ b/include/uapi/sound/firewire.h @@ -10,6 +10,7 @@ #define SNDRV_FIREWIRE_EVENT_DICE_NOTIFICATION 0xd1ce004e #define SNDRV_FIREWIRE_EVENT_EFW_RESPONSE 0x4e617475 #define SNDRV_FIREWIRE_EVENT_DIGI00X_MESSAGE 0x746e736c +#define SNDRV_FIREWIRE_EVENT_MOTU_NOTIFICATION 0x64776479 struct snd_firewire_event_common { unsigned int type; /* SNDRV_FIREWIRE_EVENT_xxx */ @@ -46,12 +47,18 @@ struct snd_firewire_event_digi00x_message { __u32 message; /* Digi00x-specific message */ }; +struct snd_firewire_event_motu_notification { + unsigned int type; + __u32 message; /* MOTU-specific bits. */ +}; + union snd_firewire_event { struct snd_firewire_event_common common; struct snd_firewire_event_lock_status lock_status; struct snd_firewire_event_dice_notification dice_notification; struct snd_firewire_event_efw_response efw_response; struct snd_firewire_event_digi00x_message digi00x_message; + struct snd_firewire_event_motu_notification motu_notification; }; @@ -65,7 +72,8 @@ union snd_firewire_event { #define SNDRV_FIREWIRE_TYPE_OXFW 4 #define SNDRV_FIREWIRE_TYPE_DIGI00X 5 #define SNDRV_FIREWIRE_TYPE_TASCAM 6 -/* RME, MOTU, ... */ +#define SNDRV_FIREWIRE_TYPE_MOTU 7 +#define SNDRV_FIREWIRE_TYPE_FIREFACE 8 struct snd_firewire_get_info { unsigned int type; /* SNDRV_FIREWIRE_TYPE_xxx */ diff --git a/include/uapi/video/Kbuild b/include/uapi/video/Kbuild deleted file mode 100644 index ac7203bb32cc0eace2b5be57412d3e215e50b8ac..0000000000000000000000000000000000000000 --- a/include/uapi/video/Kbuild +++ /dev/null @@ -1,4 +0,0 @@ -# UAPI Header export list -header-y += edid.h -header-y += sisfb.h -header-y += uvesafb.h diff --git a/include/uapi/xen/Kbuild b/include/uapi/xen/Kbuild deleted file mode 100644 index 5c459628e8c7492c7eaea4bf6cc75e6976438e82..0000000000000000000000000000000000000000 --- a/include/uapi/xen/Kbuild +++ /dev/null @@ -1,5 +0,0 @@ -# UAPI Header export list -header-y += evtchn.h -header-y += gntalloc.h -header-y += gntdev.h -header-y += privcmd.h diff --git a/include/video/Kbuild b/include/video/Kbuild deleted file mode 100644 index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000 diff --git a/include/video/udlfb.h b/include/video/udlfb.h index f9466fa54ba4bd626fe618aef5b95f3b30dfde9b..3ea90aea5617cb359423fefc6140e381d2e78e20 100644 --- a/include/video/udlfb.h +++ b/include/video/udlfb.h @@ -92,6 +92,6 @@ struct dlfb_data { /* remove these once align.h patch is taken into kernel */ #define DL_ALIGN_UP(x, a) ALIGN(x, a) -#define DL_ALIGN_DOWN(x, a) ALIGN(x-(a-1), a) +#define DL_ALIGN_DOWN(x, a) ALIGN_DOWN(x, a) #endif diff --git a/include/xen/arm/page-coherent.h b/include/xen/arm/page-coherent.h index 95ce6ac3a971fb1425d2dc841466defdc850bf9c..b1b4ecdf329a7b4fefc623f5276f10f526a8414c 100644 --- a/include/xen/arm/page-coherent.h +++ b/include/xen/arm/page-coherent.h @@ -2,8 +2,16 @@ #define _ASM_ARM_XEN_PAGE_COHERENT_H #include +#include #include +static inline const struct dma_map_ops *xen_get_dma_ops(struct device *dev) +{ + if (dev && dev->archdata.dev_dma_ops) + return dev->archdata.dev_dma_ops; + 
return get_arch_dma_ops(NULL); +} + void __xen_dma_map_page(struct device *hwdev, struct page *page, dma_addr_t dev_addr, unsigned long offset, size_t size, enum dma_data_direction dir, unsigned long attrs); @@ -19,13 +27,13 @@ void __xen_dma_sync_single_for_device(struct device *hwdev, static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags, unsigned long attrs) { - return __generic_dma_ops(hwdev)->alloc(hwdev, size, dma_handle, flags, attrs); + return xen_get_dma_ops(hwdev)->alloc(hwdev, size, dma_handle, flags, attrs); } static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) { - __generic_dma_ops(hwdev)->free(hwdev, size, cpu_addr, dma_handle, attrs); + xen_get_dma_ops(hwdev)->free(hwdev, size, cpu_addr, dma_handle, attrs); } static inline void xen_dma_map_page(struct device *hwdev, struct page *page, @@ -49,7 +57,7 @@ static inline void xen_dma_map_page(struct device *hwdev, struct page *page, * specific function. */ if (local) - __generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs); + xen_get_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs); else __xen_dma_map_page(hwdev, page, dev_addr, offset, size, dir, attrs); } @@ -67,8 +75,8 @@ static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, * specific function. */ if (pfn_valid(pfn)) { - if (__generic_dma_ops(hwdev)->unmap_page) - __generic_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs); + if (xen_get_dma_ops(hwdev)->unmap_page) + xen_get_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs); } else __xen_dma_unmap_page(hwdev, handle, size, dir, attrs); } @@ -78,8 +86,8 @@ static inline void xen_dma_sync_single_for_cpu(struct device *hwdev, { unsigned long pfn = PFN_DOWN(handle); if (pfn_valid(pfn)) { - if (__generic_dma_ops(hwdev)->sync_single_for_cpu) - __generic_dma_ops(hwdev)->sync_single_for_cpu(hwdev, handle, size, dir); + if (xen_get_dma_ops(hwdev)->sync_single_for_cpu) + xen_get_dma_ops(hwdev)->sync_single_for_cpu(hwdev, handle, size, dir); } else __xen_dma_sync_single_for_cpu(hwdev, handle, size, dir); } @@ -89,8 +97,8 @@ static inline void xen_dma_sync_single_for_device(struct device *hwdev, { unsigned long pfn = PFN_DOWN(handle); if (pfn_valid(pfn)) { - if (__generic_dma_ops(hwdev)->sync_single_for_device) - __generic_dma_ops(hwdev)->sync_single_for_device(hwdev, handle, size, dir); + if (xen_get_dma_ops(hwdev)->sync_single_for_device) + xen_get_dma_ops(hwdev)->sync_single_for_device(hwdev, handle, size, dir); } else __xen_dma_sync_single_for_device(hwdev, handle, size, dir); } diff --git a/include/xen/interface/io/9pfs.h b/include/xen/interface/io/9pfs.h new file mode 100644 index 0000000000000000000000000000000000000000..5b6c19dae5e252f244b447a32e6bcd09b2c9344f --- /dev/null +++ b/include/xen/interface/io/9pfs.h @@ -0,0 +1,36 @@ +/* + * 9pfs.h -- Xen 9PFS transport + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all 
copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (C) 2017 Stefano Stabellini + */ + +#ifndef __XEN_PUBLIC_IO_9PFS_H__ +#define __XEN_PUBLIC_IO_9PFS_H__ + +#include "xen/interface/io/ring.h" + +/* + * See docs/misc/9pfs.markdown in xen.git for the full specification: + * https://xenbits.xen.org/docs/unstable/misc/9pfs.html + */ +DEFINE_XEN_FLEX_RING_AND_INTF(xen_9pfs); + +#endif diff --git a/include/xen/interface/io/displif.h b/include/xen/interface/io/displif.h new file mode 100644 index 0000000000000000000000000000000000000000..596578d9be3e065a4c10d55dbf316bb7e672f84a --- /dev/null +++ b/include/xen/interface/io/displif.h @@ -0,0 +1,854 @@ +/****************************************************************************** + * displif.h + * + * Unified display device I/O interface for Xen guest OSes. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (C) 2016-2017 EPAM Systems Inc. + * + * Authors: Oleksandr Andrushchenko + * Oleksandr Grytsov + */ + +#ifndef __XEN_PUBLIC_IO_DISPLIF_H__ +#define __XEN_PUBLIC_IO_DISPLIF_H__ + +#include "ring.h" +#include "../grant_table.h" + +/* + ****************************************************************************** + * Protocol version + ****************************************************************************** + */ +#define XENDISPL_PROTOCOL_VERSION "1" + +/* + ****************************************************************************** + * Main features provided by the protocol + ****************************************************************************** + * This protocol aims to provide a unified protocol which fits more + * sophisticated use-cases than a framebuffer device can handle. 
At the + * moment basic functionality is supported with the intention to be extended: + * o multiple dynamically allocated/destroyed framebuffers + * o buffers of arbitrary sizes + * o buffer allocation at either back or front end + * o better configuration options including multiple display support + * + * Note: existing fbif can be used together with displif running at the + * same time, e.g. on Linux one provides a framebuffer device and the + * other DRM/KMS + * + * Note: display resolution (XenStore's "resolution" property) defines + * visible area of the virtual display. At the same time resolution of + * the display and frame buffers may differ: buffers can be smaller, equal + * or bigger than the visible area. This is to enable use-cases where the + * backend may do some post-processing of the display and frame buffers + * supplied, e.g. those buffers can be just a part of the final composition. + * + ****************************************************************************** + * Direction of improvements + ****************************************************************************** + * Future extensions to the existing protocol may include: + * o display/connector cloning + * o allocation of objects other than display buffers + * o plane/overlay support + * o scaling support + * o rotation support + * + ****************************************************************************** + * Feature and Parameter Negotiation + ****************************************************************************** + * + * Front->back notifications: when enqueuing a new request, sending a + * notification can be made conditional on xendispl_req (i.e., the generic + * hold-off mechanism provided by the ring macros). Backends must set + * xendispl_req appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()). + * + * Back->front notifications: when enqueuing a new response, sending a + * notification can be made conditional on xendispl_resp (i.e., the generic + * hold-off mechanism provided by the ring macros). Frontends must set + * xendispl_resp appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()). + * + * The two halves of a para-virtual display driver utilize nodes within + * XenStore to communicate capabilities and to negotiate operating parameters. + * This section enumerates these nodes which reside in the respective front and + * backend portions of XenStore, following the XenBus convention. + * + * All data in XenStore is stored as strings. Nodes specifying numeric + * values are encoded in decimal. Integer value ranges listed below are + * expressed as fixed sized integer types capable of storing the conversion + * of a properly formatted node string, without loss of information.
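+ *
+ * Editor's sketch (assumption, not part of the original patch; back_ring
+ * and process_requests are hypothetical names): the final-check idiom
+ * referred to above looks like this on the backend side:
+ *
+ *	int more_to_do;
+ *
+ *	RING_FINAL_CHECK_FOR_REQUESTS(&back_ring, more_to_do);
+ *	if (more_to_do)
+ *		goto process_requests;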
+ * + ****************************************************************************** + * Example configuration + ****************************************************************************** + * + * Note: depending on the use-case the backend can expose more display connectors + * than the underlying HW physically has by employing SW graphics compositors + * + * This is an example of backend and frontend configuration: + * + *--------------------------------- Backend ----------------------------------- + * + * /local/domain/0/backend/vdispl/1/0/frontend-id = "1" + * /local/domain/0/backend/vdispl/1/0/frontend = "/local/domain/1/device/vdispl/0" + * /local/domain/0/backend/vdispl/1/0/state = "4" + * /local/domain/0/backend/vdispl/1/0/versions = "1,2" + * + *--------------------------------- Frontend ---------------------------------- + * + * /local/domain/1/device/vdispl/0/backend-id = "0" + * /local/domain/1/device/vdispl/0/backend = "/local/domain/0/backend/vdispl/1/0" + * /local/domain/1/device/vdispl/0/state = "4" + * /local/domain/1/device/vdispl/0/version = "1" + * /local/domain/1/device/vdispl/0/be-alloc = "1" + * + *-------------------------- Connector 0 configuration ------------------------ + * + * /local/domain/1/device/vdispl/0/0/resolution = "1920x1080" + * /local/domain/1/device/vdispl/0/0/req-ring-ref = "2832" + * /local/domain/1/device/vdispl/0/0/req-event-channel = "15" + * /local/domain/1/device/vdispl/0/0/evt-ring-ref = "387" + * /local/domain/1/device/vdispl/0/0/evt-event-channel = "16" + * + *-------------------------- Connector 1 configuration ------------------------ + * + * /local/domain/1/device/vdispl/0/1/resolution = "800x600" + * /local/domain/1/device/vdispl/0/1/req-ring-ref = "2833" + * /local/domain/1/device/vdispl/0/1/req-event-channel = "17" + * /local/domain/1/device/vdispl/0/1/evt-ring-ref = "388" + * /local/domain/1/device/vdispl/0/1/evt-event-channel = "18" + * + ****************************************************************************** + * Backend XenBus Nodes + ****************************************************************************** + * + *----------------------------- Protocol version ------------------------------ + * + * versions + * Values: <string> + * + * List of XENDISPL_LIST_SEPARATOR separated protocol versions supported + * by the backend. For example "1,2,3". + * + ****************************************************************************** + * Frontend XenBus Nodes + ****************************************************************************** + * + *-------------------------------- Addressing --------------------------------- + * + * dom-id + * Values: <uint16_t> + * + * Domain identifier. + * + * dev-id + * Values: <uint16_t> + * + * Device identifier. + * + * conn-idx + * Values: <uint8_t> + * + * Zero based contiguous index of the connector. + * /local/domain/<dom-id>/device/vdispl/<dev-id>/<conn-idx>/... + * + *----------------------------- Protocol version ------------------------------ + * + * version + * Values: <string> + * + * Protocol version, chosen among the ones supported by the backend. + * + *------------------------- Backend buffer allocation ------------------------- + * + * be-alloc + * Values: "0", "1" + * + * If value is set to "1", then backend can be a buffer provider/allocator + * for this domain during XENDISPL_OP_DBUF_CREATE operation (see below + * for negotiation). + * If value is not "1" or omitted, frontend must allocate buffers itself.
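+ *
+ * Editor's note (illustration, not in the original patch): with dom-id 1,
+ * dev-id 0 and conn-idx 0, the per-connector nodes described below expand
+ * to paths such as /local/domain/1/device/vdispl/0/0/resolution, matching
+ * the example configuration above.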
+ * + *----------------------------- Connector settings ---------------------------- + * + * resolution + * Values: <width>x<height> + * + * Width and height of the connector in pixels separated by + * XENDISPL_RESOLUTION_SEPARATOR. This defines visible area of the + * display. + * + *------------------ Connector Request Transport Parameters ------------------- + * + * This communication path is used to deliver requests from frontend to backend + * and get the corresponding responses from backend to frontend, + * set up per connector. + * + * req-event-channel + * Values: <uint32_t> + * + * The identifier of the Xen connector's control event channel + * used to signal activity in the ring buffer. + * + * req-ring-ref + * Values: <uint32_t> + * + * The Xen grant reference granting permission for the backend to map + * a sole page of connector's control ring buffer. + * + *------------------- Connector Event Transport Parameters -------------------- + * + * This communication path is used to deliver asynchronous events from backend + * to frontend, set up per connector. + * + * evt-event-channel + * Values: <uint32_t> + * + * The identifier of the Xen connector's event channel + * used to signal activity in the ring buffer. + * + * evt-ring-ref + * Values: <uint32_t> + * + * The Xen grant reference granting permission for the backend to map + * a sole page of connector's event ring buffer. + */ + +/* + ****************************************************************************** + * STATE DIAGRAMS + ****************************************************************************** + * + * Tool stack creates front and back state nodes with initial state + * XenbusStateInitialising. + * Tool stack creates and sets up frontend display configuration + * nodes per domain. + * + *-------------------------------- Normal flow -------------------------------- + * + * Front Back + * ================================= ===================================== + * XenbusStateInitialising XenbusStateInitialising + * o Query backend device identification + * data. + * o Open and validate backend device. + * | + * | + * V + * XenbusStateInitWait + * + * o Query frontend configuration + * o Allocate and initialize + * event channels per configured + * connector. + * o Publish transport parameters + * that will be in effect during + * this connection. + * | + * | + * V + * XenbusStateInitialised + * + * o Query frontend transport parameters. + * o Connect to the event channels. + * | + * | + * V + * XenbusStateConnected + * + * o Create and initialize OS + * virtual display connectors + * as per configuration. + * | + * | + * V + * XenbusStateConnected + * + * XenbusStateUnknown + * XenbusStateClosed + * XenbusStateClosing + * o Remove virtual display device + * o Remove event channels + * | + * | + * V + * XenbusStateClosed + * + *------------------------------- Recovery flow ------------------------------- + * + * In case of frontend unrecoverable errors backend handles that as + * if frontend goes into the XenbusStateClosed state. + * + * In case of backend unrecoverable errors frontend tries removing + * the virtualized device. If this is possible at the moment of error, + * then frontend goes into the XenbusStateInitialising state and is ready for + * new connection with backend. If the virtualized device is still in use and + * cannot be removed, then frontend goes into the XenbusStateReconfiguring state + * until either the virtualized device is removed or backend initiates a new + * connection.
On the virtualized device removal frontend goes into the + * XenbusStateInitialising state. + * + * Note on XenbusStateReconfiguring state of the frontend: if backend has + * unrecoverable errors then frontend cannot send requests to the backend + * and thus cannot provide functionality of the virtualized device anymore. + * After backend is back to normal the virtualized device may still hold some + * state: configuration in use, allocated buffers, client application state etc. + * In most cases, this will require frontend to implement complex recovery + * reconnect logic. Instead, by going into XenbusStateReconfiguring state, + * frontend will make sure no new clients of the virtualized device are + * accepted, allow existing client(s) to exit gracefully by signaling error + * state etc. + * Once all the clients are gone frontend can reinitialize the virtualized + * device and get into XenbusStateInitialising state again signaling the + * backend that a new connection can be made. + * + * There are multiple conditions possible under which frontend will go from + * XenbusStateReconfiguring into XenbusStateInitialising, some of them are OS + * specific. For example: + * 1. The underlying OS framework may provide callbacks to signal that the last + * client of the virtualized device has gone and the device can be removed + * 2. Frontend can schedule a deferred work (timer/tasklet/workqueue) + * to periodically check if this is the right time to re-try removal of + * the virtualized device. + * 3. By any other means. + * + ****************************************************************************** + * REQUEST CODES + ****************************************************************************** + * Request codes [0; 15] are reserved and must not be used + */ + +#define XENDISPL_OP_DBUF_CREATE 0x10 +#define XENDISPL_OP_DBUF_DESTROY 0x11 +#define XENDISPL_OP_FB_ATTACH 0x12 +#define XENDISPL_OP_FB_DETACH 0x13 +#define XENDISPL_OP_SET_CONFIG 0x14 +#define XENDISPL_OP_PG_FLIP 0x15 + +/* + ****************************************************************************** + * EVENT CODES + ****************************************************************************** + */ +#define XENDISPL_EVT_PG_FLIP 0x00 + +/* + ****************************************************************************** + * XENSTORE FIELD AND PATH NAME STRINGS, HELPERS + ****************************************************************************** + */ +#define XENDISPL_DRIVER_NAME "vdispl" + +#define XENDISPL_LIST_SEPARATOR "," +#define XENDISPL_RESOLUTION_SEPARATOR "x" + +#define XENDISPL_FIELD_BE_VERSIONS "versions" +#define XENDISPL_FIELD_FE_VERSION "version" +#define XENDISPL_FIELD_REQ_RING_REF "req-ring-ref" +#define XENDISPL_FIELD_REQ_CHANNEL "req-event-channel" +#define XENDISPL_FIELD_EVT_RING_REF "evt-ring-ref" +#define XENDISPL_FIELD_EVT_CHANNEL "evt-event-channel" +#define XENDISPL_FIELD_RESOLUTION "resolution" +#define XENDISPL_FIELD_BE_ALLOC "be-alloc" + +/* + ****************************************************************************** + * STATUS RETURN CODES + ****************************************************************************** + * + * Status return code is zero on success and -XEN_EXX on failure. 
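+ *
+ * Editor's note (illustration, not in the original patch): a backend
+ * rejecting a malformed request would, for instance, complete it with
+ * status set to -XEN_EINVAL in the corresponding response.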
+ * + ****************************************************************************** + * Assumptions + ****************************************************************************** + * o usage of grant reference 0 as invalid grant reference: + * grant reference 0 is valid, but never exposed to a PV driver, + * because of the fact it is already in use/reserved by the PV console. + * o all references in this document to page sizes must be treated + * as pages of size XEN_PAGE_SIZE unless otherwise noted. + * + ****************************************************************************** + * Description of the protocol between frontend and backend driver + ****************************************************************************** + * + * The two halves of a Para-virtual display driver communicate with + * each other using shared pages and event channels. + * Shared page contains a ring with request/response packets. + * + * All reserved fields in the structures below must be 0. + * Display buffer's cookie of value 0 is treated as invalid. + * Framebuffer's cookie of value 0 is treated as invalid. + * + * For all request/response/event packets that use cookies: + * dbuf_cookie - uint64_t, unique to guest domain value used by the backend + * to map remote display buffer to its local one + * fb_cookie - uint64_t, unique to guest domain value used by the backend + * to map remote framebuffer to its local one + * + *---------------------------------- Requests --------------------------------- + * + * All requests/responses, which are not connector specific, must be sent over + * control ring of the connector which has the index value of 0: + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref + * + * All request packets have the same length (64 octets) + * All request packets have common header: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | operation | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * id - uint16_t, private guest value, echoed in response + * operation - uint8_t, operation code, XENDISPL_OP_??? + * + * Request dbuf creation - request creation of a display buffer.
+ * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id |_OP_DBUF_CREATE | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | dbuf_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | dbuf_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | width | 20 + * +----------------+----------------+----------------+----------------+ + * | height | 24 + * +----------------+----------------+----------------+----------------+ + * | bpp | 28 + * +----------------+----------------+----------------+----------------+ + * | buffer_sz | 32 + * +----------------+----------------+----------------+----------------+ + * | flags | 36 + * +----------------+----------------+----------------+----------------+ + * | gref_directory | 40 + * +----------------+----------------+----------------+----------------+ + * | reserved | 44 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + * + * Must be sent over control ring of the connector which has the index + * value of 0: + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref + * All unused bits in flags field must be set to 0. + * + * An attempt to create multiple display buffers with the same dbuf_cookie is + * an error. dbuf_cookie can be re-used after destroying the corresponding + * display buffer. + * + * Width and height of the display buffers can be smaller, equal or bigger + * than the connector's resolution. Depth/pixel format of the individual + * buffers can differ as well. + * + * width - uint32_t, width in pixels + * height - uint32_t, height in pixels + * bpp - uint32_t, bits per pixel + * buffer_sz - uint32_t, buffer size to be allocated, octets + * flags - uint32_t, flags of the operation + * o XENDISPL_DBUF_FLG_REQ_ALLOC - if set, then backend is requested + * to allocate the buffer with the parameters provided in this request. + * Page directory is handled as follows: + * Frontend on request: + * o allocates pages for the directory (gref_directory, + * gref_dir_next_page(s)) + * o grants permissions for the pages of the directory to the backend + * o sets gref_dir_next_page fields + * Backend on response: + * o grants permissions for the pages of the buffer allocated to + * the frontend + * o fills in page directory with grant references + * (gref[] in struct xendispl_page_directory) + * gref_directory - grant_ref_t, a reference to the first shared page + * describing shared buffer references. At least one page exists.
If shared + * buffer size (buffer_sz) exceeds what can be addressed by this single page, + * then reference to the next page must be supplied (see gref_dir_next_page + * below) + */ + +#define XENDISPL_DBUF_FLG_REQ_ALLOC (1 << 0) + +struct xendispl_dbuf_create_req { + uint64_t dbuf_cookie; + uint32_t width; + uint32_t height; + uint32_t bpp; + uint32_t buffer_sz; + uint32_t flags; + grant_ref_t gref_directory; +}; + +/* + * Shared page for XENDISPL_OP_DBUF_CREATE buffer descriptor (gref_directory in + * the request) employs a list of pages, describing all pages of the shared + * data buffer: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | gref_dir_next_page | 4 + * +----------------+----------------+----------------+----------------+ + * | gref[0] | 8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | gref[i] | i*4+8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | gref[N - 1] | N*4+8 + * +----------------+----------------+----------------+----------------+ + * + * gref_dir_next_page - grant_ref_t, reference to the next page describing + * page directory. Must be 0 if there are no more pages in the list. + * gref[i] - grant_ref_t, reference to a shared page of the buffer + * allocated at XENDISPL_OP_DBUF_CREATE + * + * Number of grant_ref_t entries in the whole page directory is not + * passed, but instead can be calculated as: + * num_grefs_total = (XENDISPL_OP_DBUF_CREATE.buffer_sz + XEN_PAGE_SIZE - 1) / + * XEN_PAGE_SIZE + */ + +struct xendispl_page_directory { + grant_ref_t gref_dir_next_page; + grant_ref_t gref[1]; /* Variable length */ +}; + +/* + * Request dbuf destruction - destroy a previously allocated display buffer: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id |_OP_DBUF_DESTROY| reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | dbuf_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | dbuf_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + * + * Must be sent over control ring of the connector which has the index + * value of 0: + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref + */ + +struct xendispl_dbuf_destroy_req { + uint64_t dbuf_cookie; +}; + +/* + * Request framebuffer attachment - request attachment of a framebuffer to a + * previously created display buffer.
+ * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | _OP_FB_ATTACH | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | dbuf_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | dbuf_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie low 32-bit | 20 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie high 32-bit | 24 + * +----------------+----------------+----------------+----------------+ + * | width | 28 + * +----------------+----------------+----------------+----------------+ + * | height | 32 + * +----------------+----------------+----------------+----------------+ + * | pixel_format | 36 + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + * + * Must be sent over control ring of the connector which has the index + * value of 0: + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref + * Width and height can be smaller, equal or bigger than the connector's + * resolution. + * + * An attempt to create multiple frame buffers with the same fb_cookie is + * an error. fb_cookie can be re-used after destroying the corresponding + * frame buffer. + * + * width - uint32_t, width in pixels + * height - uint32_t, height in pixels + * pixel_format - uint32_t, pixel format of the framebuffer, FOURCC code + */ + +struct xendispl_fb_attach_req { + uint64_t dbuf_cookie; + uint64_t fb_cookie; + uint32_t width; + uint32_t height; + uint32_t pixel_format; +}; + +/* + * Request framebuffer detach - detach a previously + * attached framebuffer from the display buffer in request: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | _OP_FB_DETACH | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + * + * Must be sent over control ring of the connector which has the index + * value of 0: + * /local/domain/<dom-id>/device/vdispl/<dev-id>/0/req-ring-ref + */ + +struct xendispl_fb_detach_req { + uint64_t fb_cookie; +}; + +/* + * Request configuration set/reset - request to set or reset + * the configuration/mode of the display: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | _OP_SET_CONFIG | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | 
reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | x | 20 + * +----------------+----------------+----------------+----------------+ + * | y | 24 + * +----------------+----------------+----------------+----------------+ + * | width | 28 + * +----------------+----------------+----------------+----------------+ + * | height | 32 + * +----------------+----------------+----------------+----------------+ + * | bpp | 36 + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + * + * Pass all zeros to reset, otherwise the command is treated as + * a configuration set. + * Framebuffer's cookie defines which framebuffer/dbuf must be + * displayed while enabling display (applying configuration). + * x, y, width and height are bounded by the connector's resolution and must + * not exceed it. + * + * x - uint32_t, starting position in pixels by X axis + * y - uint32_t, starting position in pixels by Y axis + * width - uint32_t, width in pixels + * height - uint32_t, height in pixels + * bpp - uint32_t, bits per pixel + */ + +struct xendispl_set_config_req { + uint64_t fb_cookie; + uint32_t x; + uint32_t y; + uint32_t width; + uint32_t height; + uint32_t bpp; +}; + +/* + * Request page flip - request to flip a page identified by the framebuffer + * cookie: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | _OP_PG_FLIP | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + */ + +struct xendispl_page_flip_req { + uint64_t fb_cookie; +}; + +/* + *---------------------------------- Responses -------------------------------- + * + * All response packets have the same length (64 octets) + * + * All response packets have common header: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | status | 8 + * +----------------+----------------+----------------+----------------+ + * | reserved | 12 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * 
+----------------+----------------+----------------+----------------+ + * + * id - uint16_t, private guest value, echoed from request + * status - int32_t, response status, zero on success and -XEN_EXX on failure + * + *----------------------------------- Events ---------------------------------- + * + * Events are sent via a shared page allocated by the front and propagated by + * evt-event-channel/evt-ring-ref XenStore entries. + * All event packets have the same length (64 octets) + * All event packets have common header: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | type | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * + * id - uint16_t, event id, may be used by front + * type - uint8_t, type of the event + * + * + * Page flip complete event - event from back to front on page flip completed: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | _EVT_PG_FLIP | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie low 32-bit | 12 + * +----------------+----------------+----------------+----------------+ + * | fb_cookie high 32-bit | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 64 + * +----------------+----------------+----------------+----------------+ + */ + +struct xendispl_pg_flip_evt { + uint64_t fb_cookie; +}; + +struct xendispl_req { + uint16_t id; + uint8_t operation; + uint8_t reserved[5]; + union { + struct xendispl_dbuf_create_req dbuf_create; + struct xendispl_dbuf_destroy_req dbuf_destroy; + struct xendispl_fb_attach_req fb_attach; + struct xendispl_fb_detach_req fb_detach; + struct xendispl_set_config_req set_config; + struct xendispl_page_flip_req pg_flip; + uint8_t reserved[56]; + } op; +}; + +struct xendispl_resp { + uint16_t id; + uint8_t operation; + uint8_t reserved; + int32_t status; + uint8_t reserved1[56]; +}; + +struct xendispl_evt { + uint16_t id; + uint8_t type; + uint8_t reserved[5]; + union { + struct xendispl_pg_flip_evt pg_flip; + uint8_t reserved[56]; + } op; +}; + +DEFINE_RING_TYPES(xen_displif, struct xendispl_req, struct xendispl_resp); + +/* + ****************************************************************************** + * Back to front events delivery + ****************************************************************************** + * In order to deliver asynchronous events from back to front a shared page is + * allocated by front and its granted reference propagated to back via + * XenStore entries (evt-ring-ref/evt-event-channel). + * This page has a common header used by both front and back to synchronize + * access and control event's ring buffer, with back being a producer of the + * events and front being a consumer. The rest of the page after the header + * is used for event packets. + * + * Upon reception of events front may confirm their reception: either for + * each event, for a group of events or for none. 
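+ *
+ * For example, a minimal front consumer loop over the event ring could look
+ * as below. This is an illustrative sketch only: memory barriers and locking
+ * are omitted, 'page' points at the mapped shared event page and
+ * handle_flip() stands for a hypothetical frontend callback. It relies on
+ * the helpers defined right after this comment:
+ *
+ *	uint32_t cons = page->in_cons;
+ *
+ *	while (cons != page->in_prod) {
+ *		struct xendispl_evt *e = &XENDISPL_IN_RING_REF(page, cons);
+ *
+ *		if (e->type == XENDISPL_EVT_PG_FLIP)
+ *			handle_flip(e->op.pg_flip.fb_cookie);
+ *		cons++;
+ *	}
+ *	page->in_cons = cons;
+ *
+ * Writing in_cons back once confirms reception of all consumed events.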
+ */ + +struct xendispl_event_page { + uint32_t in_cons; + uint32_t in_prod; + uint8_t reserved[56]; +}; + +#define XENDISPL_EVENT_PAGE_SIZE XEN_PAGE_SIZE +#define XENDISPL_IN_RING_OFFS (sizeof(struct xendispl_event_page)) +#define XENDISPL_IN_RING_SIZE (XENDISPL_EVENT_PAGE_SIZE - XENDISPL_IN_RING_OFFS) +#define XENDISPL_IN_RING_LEN (XENDISPL_IN_RING_SIZE / sizeof(struct xendispl_evt)) +#define XENDISPL_IN_RING(page) \ + ((struct xendispl_evt *)((char *)(page) + XENDISPL_IN_RING_OFFS)) +#define XENDISPL_IN_RING_REF(page, idx) \ + (XENDISPL_IN_RING((page))[(idx) % XENDISPL_IN_RING_LEN]) + +#endif /* __XEN_PUBLIC_IO_DISPLIF_H__ */ diff --git a/include/xen/interface/io/kbdif.h b/include/xen/interface/io/kbdif.h index 8066c7849fbee36b8d8b0544281fc33af802db96..2a9510ade70181a4c10b65bdfc834d831ef8b287 100644 --- a/include/xen/interface/io/kbdif.h +++ b/include/xen/interface/io/kbdif.h @@ -26,43 +26,432 @@ #ifndef __XEN_PUBLIC_IO_KBDIF_H__ #define __XEN_PUBLIC_IO_KBDIF_H__ -/* In events (backend -> frontend) */ +/* + ***************************************************************************** + * Feature and Parameter Negotiation + ***************************************************************************** + * + * The two halves of a para-virtual driver utilize nodes within + * XenStore to communicate capabilities and to negotiate operating parameters. + * This section enumerates these nodes which reside in the respective front and + * backend portions of XenStore, following XenBus convention. + * + * All data in XenStore is stored as strings. Nodes specifying numeric + * values are encoded in decimal. Integer value ranges listed below are + * expressed as fixed sized integer types capable of storing the conversion + * of a properly formatted node string, without loss of information. + * + ***************************************************************************** + * Backend XenBus Nodes + ***************************************************************************** + * + *---------------------------- Features supported ---------------------------- + * + * Capable backend advertises supported features by publishing + * corresponding entries in XenStore and setting 1 as the value of the entry. + * If a feature is not supported then 0 must be set or the feature entry + * omitted. + * + * feature-abs-pointer + * Values: + * + * Backends which support reporting of absolute coordinates for a pointer + * device should set this to 1. + * + * feature-multi-touch + * Values: + * + * Backends which support reporting of multi-touch events + * should set this to 1. + * + *------------------------- Pointer Device Parameters ------------------------ + * + * width + * Values: + * + * Maximum X coordinate (width) to be used by the frontend + * while reporting input events, pixels, [0; UINT32_MAX]. + * + * height + * Values: + * + * Maximum Y coordinate (height) to be used by the frontend + * while reporting input events, pixels, [0; UINT32_MAX]. + * + ***************************************************************************** + * Frontend XenBus Nodes + ***************************************************************************** + * + *------------------------------ Feature request ----------------------------- + * + * Capable frontend requests features from the backend by setting the + * corresponding entries to 1 in XenStore. Requests for features not + * advertised as supported by the backend have no effect. 
+ * + * request-abs-pointer + * Values: + * + * Request backend to report absolute pointer coordinates + * (XENKBD_TYPE_POS) instead of relative ones (XENKBD_TYPE_MOTION). + * + * request-multi-touch + * Values: + * + * Request backend to report multi-touch events. + * + *----------------------- Request Transport Parameters ----------------------- + * + * event-channel + * Values: + * + * The identifier of the Xen event channel used to signal activity + * in the ring buffer. + * + * page-gref + * Values: + * + * The Xen grant reference granting permission for the backend to map + * a sole page in a single page sized event ring buffer. + * + * page-ref + * Values: + * + * OBSOLETE, not recommended for use. + * PFN of the shared page. + * + *----------------------- Multi-touch Device Parameters ----------------------- + * + * multi-touch-num-contacts + * Values: + * + * Number of simultaneous touches reported. + * + * multi-touch-width + * Values: + * + * Width of the touch area to be used by the frontend + * while reporting input events, pixels, [0; UINT32_MAX]. + * + * multi-touch-height + * Values: + * + * Height of the touch area to be used by the frontend + * while reporting input events, pixels, [0; UINT32_MAX]. + */ /* - * Frontends should ignore unknown in events. + * EVENT CODES. */ -/* Pointer movement event */ -#define XENKBD_TYPE_MOTION 1 -/* Event type 2 currently not used */ -/* Key event (includes pointer buttons) */ -#define XENKBD_TYPE_KEY 3 +#define XENKBD_TYPE_MOTION 1 +#define XENKBD_TYPE_RESERVED 2 +#define XENKBD_TYPE_KEY 3 +#define XENKBD_TYPE_POS 4 +#define XENKBD_TYPE_MTOUCH 5 + +/* Multi-touch event sub-codes */ + +#define XENKBD_MT_EV_DOWN 0 +#define XENKBD_MT_EV_UP 1 +#define XENKBD_MT_EV_MOTION 2 +#define XENKBD_MT_EV_SYN 3 +#define XENKBD_MT_EV_SHAPE 4 +#define XENKBD_MT_EV_ORIENT 5 + +/* + * CONSTANTS, XENSTORE FIELD AND PATH NAME STRINGS, HELPERS. + */ + +#define XENKBD_DRIVER_NAME "vkbd" + +#define XENKBD_FIELD_FEAT_ABS_POINTER "feature-abs-pointer" +#define XENKBD_FIELD_FEAT_MTOUCH "feature-multi-touch" +#define XENKBD_FIELD_REQ_ABS_POINTER "request-abs-pointer" +#define XENKBD_FIELD_REQ_MTOUCH "request-multi-touch" +#define XENKBD_FIELD_RING_GREF "page-gref" +#define XENKBD_FIELD_EVT_CHANNEL "event-channel" +#define XENKBD_FIELD_WIDTH "width" +#define XENKBD_FIELD_HEIGHT "height" +#define XENKBD_FIELD_MT_WIDTH "multi-touch-width" +#define XENKBD_FIELD_MT_HEIGHT "multi-touch-height" +#define XENKBD_FIELD_MT_NUM_CONTACTS "multi-touch-num-contacts" + +/* OBSOLETE, not recommended for use */ +#define XENKBD_FIELD_RING_REF "page-ref" + /* - * Pointer position event - * Capable backend sets feature-abs-pointer in xenstore. - * Frontend requests ot instead of XENKBD_TYPE_MOTION by setting - * request-abs-update in xenstore. + ***************************************************************************** + * Description of the protocol between frontend and backend driver. + ***************************************************************************** + * + * The two halves of a Para-virtual driver communicate with + * each other using a shared page and an event channel. + * Shared page contains a ring with event structures. + * + * All reserved fields in the structures below must be 0. + * + ***************************************************************************** + * Backend to frontend events + ***************************************************************************** + * + * Frontends should ignore unknown in events. 
+ * All event packets have the same length (40 octets) + * All event packets have common header: + * + * 0 octet + * +-----------------+ + * | type | + * +-----------------+ + * type - uint8_t, event code, XENKBD_TYPE_??? + * + * + * Pointer relative movement event + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MOTION | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | rel_x | 8 + * +----------------+----------------+----------------+----------------+ + * | rel_y | 12 + * +----------------+----------------+----------------+----------------+ + * | rel_z | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * rel_x - int32_t, relative X motion + * rel_y - int32_t, relative Y motion + * rel_z - int32_t, relative Z motion (wheel) */ -#define XENKBD_TYPE_POS 4 struct xenkbd_motion { - uint8_t type; /* XENKBD_TYPE_MOTION */ - int32_t rel_x; /* relative X motion */ - int32_t rel_y; /* relative Y motion */ - int32_t rel_z; /* relative Z motion (wheel) */ + uint8_t type; + int32_t rel_x; + int32_t rel_y; + int32_t rel_z; }; +/* + * Key event (includes pointer buttons) + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_KEY | pressed | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | keycode | 8 + * +----------------+----------------+----------------+----------------+ + * | reserved | 12 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * pressed - uint8_t, 1 if pressed; 0 otherwise + * keycode - uint32_t, KEY_* from linux/input.h + */ + struct xenkbd_key { - uint8_t type; /* XENKBD_TYPE_KEY */ - uint8_t pressed; /* 1 if pressed; 0 otherwise */ - uint32_t keycode; /* KEY_* from linux/input.h */ + uint8_t type; + uint8_t pressed; + uint32_t keycode; }; +/* + * Pointer absolute position event + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_POS | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | abs_x | 8 + * +----------------+----------------+----------------+----------------+ + * | abs_y | 12 + * +----------------+----------------+----------------+----------------+ + * | rel_z | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * abs_x - int32_t, absolute X position (in FB pixels) + * abs_y - int32_t, absolute Y position (in FB pixels) + * rel_z - int32_t, relative Z motion (wheel) + */ + struct xenkbd_position { - uint8_t 
type; /* XENKBD_TYPE_POS */ - int32_t abs_x; /* absolute X position (in FB pixels) */ - int32_t abs_y; /* absolute Y position (in FB pixels) */ - int32_t rel_z; /* relative Z motion (wheel) */ + uint8_t type; + int32_t abs_x; + int32_t abs_y; + int32_t rel_z; +}; + +/* + * Multi-touch event and its sub-types + * + * All multi-touch event packets have common header: + * + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | event_type | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * + * event_type - uint8_t, multi-touch event sub-type, XENKBD_MT_EV_??? + * contact_id - uint8_t, ID of the contact + * + * Touch interactions can consist of one or more contacts. + * For each contact, a series of events is generated, starting + * with a down event, followed by zero or more motion events, + * and ending with an up event. Events relating to the same + * contact point can be identified by the ID of the sequence: contact ID. + * Contact ID may be reused after XENKBD_MT_EV_UP event and + * is in the [0; multi-touch-num-contacts - 1] range. + * + * For further information please refer to documentation on Wayland [1], + * Linux [2] and Windows [3] multi-touch support. + * + * [1] https://cgit.freedesktop.org/wayland/wayland/tree/protocol/wayland.xml + * [2] https://www.kernel.org/doc/Documentation/input/multi-touch-protocol.txt + * [3] https://msdn.microsoft.com/en-us/library/jj151564(v=vs.85).aspx + * + * + * Multi-touch down event - sent when a new touch is made: touch is assigned + * a unique contact ID, sent with this and subsequent events related + * to this touch. 
+ * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | _MT_EV_DOWN | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | abs_x | 12 + * +----------------+----------------+----------------+----------------+ + * | abs_y | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * abs_x - int32_t, absolute X position, in pixels + * abs_y - int32_t, absolute Y position, in pixels + * + * Multi-touch contact release event + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | _MT_EV_UP | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * Multi-touch motion event + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | _MT_EV_MOTION | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | abs_x | 12 + * +----------------+----------------+----------------+----------------+ + * | abs_y | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * abs_x - int32_t, absolute X position, in pixels + * abs_y - int32_t, absolute Y position, in pixels + * + * Multi-touch input synchronization event - marks the end of a set of events + * which logically belong together. + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | _MT_EV_SYN | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * Multi-touch shape event - the touch point has changed its shape. + * Shape is approximated by an ellipse through the major and minor axis + * lengths: major is the longer diameter of the ellipse and minor is the + * shorter one. Center of the ellipse is reported via + * XENKBD_MT_EV_DOWN/XENKBD_MT_EV_MOTION events. 
+ * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | _MT_EV_SHAPE | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | major | 12 + * +----------------+----------------+----------------+----------------+ + * | minor | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * major - uint32_t, length of the major axis, pixels + * minor - uint32_t, length of the minor axis, pixels + * + * Multi-touch orientation event - touch point's shape has changed + * its orientation: calculated as a clockwise angle between the major axis + * of the ellipse and positive Y axis in degrees, [-180; +180]. + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | _TYPE_MTOUCH | _MT_EV_ORIENT | contact_id | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | orientation | reserved | 12 + * +----------------+----------------+----------------+----------------+ + * | reserved | 16 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 40 + * +----------------+----------------+----------------+----------------+ + * + * orientation - int16_t, clockwise angle of the major axis + */ + +struct xenkbd_mtouch { + uint8_t type; /* XENKBD_TYPE_MTOUCH */ + uint8_t event_type; /* XENKBD_MT_EV_??? */ + uint8_t contact_id; + uint8_t reserved[5]; /* reserved for future use */ + union { + struct { + int32_t abs_x; /* absolute X position, pixels */ + int32_t abs_y; /* absolute Y position, pixels */ + } pos; + struct { + uint32_t major; /* length of the major axis, pixels */ + uint32_t minor; /* length of the minor axis, pixels */ + } shape; + int16_t orientation; /* clockwise angle of the major axis */ + } u; }; #define XENKBD_IN_EVENT_SIZE 40 @@ -72,15 +461,26 @@ union xenkbd_in_event { struct xenkbd_motion motion; struct xenkbd_key key; struct xenkbd_position pos; + struct xenkbd_mtouch mtouch; char pad[XENKBD_IN_EVENT_SIZE]; }; -/* Out events (frontend -> backend) */ - /* + ***************************************************************************** + * Frontend to backend events + ***************************************************************************** + * * Out events may be sent only when requested by backend, and receipt * of an unknown out event is an error. * No out events currently defined. 
+ + * All event packets have the same length (40 octets) + * All event packets have common header: + * 0 octet + * +-----------------+ + * | type | + * +-----------------+ + * type - uint8_t, event code */ #define XENKBD_OUT_EVENT_SIZE 40 @@ -90,7 +490,11 @@ union xenkbd_out_event { char pad[XENKBD_OUT_EVENT_SIZE]; }; -/* shared page */ +/* + ***************************************************************************** + * Shared page + ***************************************************************************** + */ #define XENKBD_IN_RING_SIZE 2048 #define XENKBD_IN_RING_LEN (XENKBD_IN_RING_SIZE / XENKBD_IN_EVENT_SIZE) @@ -113,4 +517,4 @@ struct xenkbd_page { uint32_t out_cons, out_prod; }; -#endif +#endif /* __XEN_PUBLIC_IO_KBDIF_H__ */ diff --git a/include/xen/interface/io/ring.h b/include/xen/interface/io/ring.h index 21f4fbd55e48edf03061b92d687a23cbe34a8989..c794568555391518466d8d7d11a08a4b38ef3eaf 100644 --- a/include/xen/interface/io/ring.h +++ b/include/xen/interface/io/ring.h @@ -283,4 +283,147 @@ struct __name##_back_ring { \ (_work_to_do) = RING_HAS_UNCONSUMED_RESPONSES(_r); \ } while (0) + +/* + * DEFINE_XEN_FLEX_RING_AND_INTF defines two monodirectional rings and + * functions to check if there is data on the ring, and to read and + * write to them. + * + * DEFINE_XEN_FLEX_RING is similar to DEFINE_XEN_FLEX_RING_AND_INTF, but + * does not define the indexes page. As different protocols can have + * extensions to the basic format, this macro allows them to define their + * own struct. + * + * XEN_FLEX_RING_SIZE + * Convenience macro to calculate the size of one of the two rings + * from the overall order. + * + * $NAME_mask + * Function to apply the size mask to an index, to reduce the index + * within the range [0; size - 1]. + * + * $NAME_read_packet + * Function to read data from the ring. The amount of data to read is + * specified by the "size" argument. + * + * $NAME_write_packet + * Function to write data to the ring. The amount of data to write is + * specified by the "size" argument. + * + * $NAME_get_ring_ptr + * Convenience function that returns a pointer to read/write to the + * ring at the right location. + * + * $NAME_data_intf + * Indexes page, shared between frontend and backend. It also + * contains the array of grant refs. + * + * $NAME_queued + * Function to calculate how many bytes are currently on the ring, + * ready to be read. It can also be used to calculate how much free + * space is currently on the ring (XEN_FLEX_RING_SIZE() - + * $NAME_queued()). + */ + +#ifndef XEN_PAGE_SHIFT +/* The PAGE_SIZE for ring protocols and hypercall interfaces is always + * 4K, regardless of the architecture and the page granularity chosen by + * the operating system. 
+ */ +#define XEN_PAGE_SHIFT 12 +#endif +#define XEN_FLEX_RING_SIZE(order) \ + (1UL << ((order) + XEN_PAGE_SHIFT - 1)) + +#define DEFINE_XEN_FLEX_RING(name) \ +static inline RING_IDX name##_mask(RING_IDX idx, RING_IDX ring_size) \ +{ \ + return idx & (ring_size - 1); \ +} \ + \ +static inline unsigned char *name##_get_ring_ptr(unsigned char *buf, \ + RING_IDX idx, \ + RING_IDX ring_size) \ +{ \ + return buf + name##_mask(idx, ring_size); \ +} \ + \ +static inline void name##_read_packet(void *opaque, \ + const unsigned char *buf, \ + size_t size, \ + RING_IDX masked_prod, \ + RING_IDX *masked_cons, \ + RING_IDX ring_size) \ +{ \ + if (*masked_cons < masked_prod || \ + size <= ring_size - *masked_cons) { \ + memcpy(opaque, buf + *masked_cons, size); \ + } else { \ + memcpy(opaque, buf + *masked_cons, ring_size - *masked_cons); \ + memcpy((unsigned char *)opaque + ring_size - *masked_cons, buf, \ + size - (ring_size - *masked_cons)); \ + } \ + *masked_cons = name##_mask(*masked_cons + size, ring_size); \ +} \ + \ +static inline void name##_write_packet(unsigned char *buf, \ + const void *opaque, \ + size_t size, \ + RING_IDX *masked_prod, \ + RING_IDX masked_cons, \ + RING_IDX ring_size) \ +{ \ + if (*masked_prod < masked_cons || \ + size <= ring_size - *masked_prod) { \ + memcpy(buf + *masked_prod, opaque, size); \ + } else { \ + memcpy(buf + *masked_prod, opaque, ring_size - *masked_prod); \ + memcpy(buf, (unsigned char *)opaque + (ring_size - *masked_prod), \ + size - (ring_size - *masked_prod)); \ + } \ + *masked_prod = name##_mask(*masked_prod + size, ring_size); \ +} \ + \ +static inline RING_IDX name##_queued(RING_IDX prod, \ + RING_IDX cons, \ + RING_IDX ring_size) \ +{ \ + RING_IDX size; \ + \ + if (prod == cons) \ + return 0; \ + \ + prod = name##_mask(prod, ring_size); \ + cons = name##_mask(cons, ring_size); \ + \ + if (prod == cons) \ + return ring_size; \ + \ + if (prod > cons) \ + size = prod - cons; \ + else \ + size = ring_size - (cons - prod); \ + return size; \ +} \ + \ +struct name##_data { \ + unsigned char *in; /* half of the allocation */ \ + unsigned char *out; /* half of the allocation */ \ +} + +#define DEFINE_XEN_FLEX_RING_AND_INTF(name) \ +struct name##_data_intf { \ + RING_IDX in_cons, in_prod; \ + \ + uint8_t pad1[56]; \ + \ + RING_IDX out_cons, out_prod; \ + \ + uint8_t pad2[56]; \ + \ + RING_IDX ring_order; \ + grant_ref_t ref[]; \ +}; \ +DEFINE_XEN_FLEX_RING(name) + #endif /* __XEN_PUBLIC_IO_RING_H__ */ diff --git a/include/xen/interface/io/sndif.h b/include/xen/interface/io/sndif.h new file mode 100644 index 0000000000000000000000000000000000000000..5c918276835eb0729c5c5e9ef488e94a5c8afc3e --- /dev/null +++ b/include/xen/interface/io/sndif.h @@ -0,0 +1,793 @@ +/****************************************************************************** + * sndif.h + * + * Unified sound-device I/O interface for Xen guest OSes. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + * + * Copyright (C) 2013-2015 GlobalLogic Inc. + * Copyright (C) 2016-2017 EPAM Systems Inc. + * + * Authors: Oleksandr Andrushchenko + * Oleksandr Grytsov + * Oleksandr Dmytryshyn + * Iurii Konovalenko + */ + +#ifndef __XEN_PUBLIC_IO_SNDIF_H__ +#define __XEN_PUBLIC_IO_SNDIF_H__ + +#include "ring.h" +#include "../grant_table.h" + +/* + ****************************************************************************** + * Feature and Parameter Negotiation + ****************************************************************************** + * + * Front->back notifications: when enqueuing a new request, sending a + * notification can be made conditional on req_event (i.e., the generic + * hold-off mechanism provided by the ring macros). Backends must set + * req_event appropriately (e.g., using RING_FINAL_CHECK_FOR_REQUESTS()). + * + * Back->front notifications: when enqueuing a new response, sending a + * notification can be made conditional on rsp_event (i.e., the generic + * hold-off mechanism provided by the ring macros). Frontends must set + * rsp_event appropriately (e.g., using RING_FINAL_CHECK_FOR_RESPONSES()). + * + * The two halves of a para-virtual sound card driver utilize nodes within + * XenStore to communicate capabilities and to negotiate operating parameters. + * This section enumerates these nodes which reside in the respective front and + * backend portions of XenStore, following the XenBus convention. + * + * All data in XenStore is stored as strings. Nodes specifying numeric + * values are encoded in decimal. Integer value ranges listed below are + * expressed as fixed sized integer types capable of storing the conversion + * of a properly formatted node string, without loss of information. + * + ****************************************************************************** + * Example configuration + ****************************************************************************** + * + * Note: depending on the use-case, the backend can expose more sound cards + * and PCM devices/streams than the underlying HW physically has by employing + * SW mixers, configuring virtual sound streams, channels etc. 
+ * + * This is an example of backend and frontend configuration: + * + *--------------------------------- Backend ----------------------------------- + * + * /local/domain/0/backend/vsnd/1/0/frontend-id = "1" + * /local/domain/0/backend/vsnd/1/0/frontend = "/local/domain/1/device/vsnd/0" + * /local/domain/0/backend/vsnd/1/0/state = "4" + * /local/domain/0/backend/vsnd/1/0/versions = "1,2" + * + *--------------------------------- Frontend ---------------------------------- + * + * /local/domain/1/device/vsnd/0/backend-id = "0" + * /local/domain/1/device/vsnd/0/backend = "/local/domain/0/backend/vsnd/1/0" + * /local/domain/1/device/vsnd/0/state = "4" + * /local/domain/1/device/vsnd/0/version = "1" + * + *----------------------------- Card configuration ---------------------------- + * + * /local/domain/1/device/vsnd/0/short-name = "Card short name" + * /local/domain/1/device/vsnd/0/long-name = "Card long name" + * /local/domain/1/device/vsnd/0/sample-rates = "8000,32000,44100,48000,96000" + * /local/domain/1/device/vsnd/0/sample-formats = "s8,u8,s16_le,s16_be" + * /local/domain/1/device/vsnd/0/buffer-size = "262144" + * + *------------------------------- PCM device 0 -------------------------------- + * + * /local/domain/1/device/vsnd/0/0/name = "General analog" + * /local/domain/1/device/vsnd/0/0/channels-max = "5" + * + *----------------------------- Stream 0, playback ---------------------------- + * + * /local/domain/1/device/vsnd/0/0/0/type = "p" + * /local/domain/1/device/vsnd/0/0/0/sample-formats = "s8,u8" + * /local/domain/1/device/vsnd/0/0/0/unique-id = "0" + * + * /local/domain/1/device/vsnd/0/0/0/ring-ref = "386" + * /local/domain/1/device/vsnd/0/0/0/event-channel = "15" + * + *------------------------------ Stream 1, capture ---------------------------- + * + * /local/domain/1/device/vsnd/0/0/1/type = "c" + * /local/domain/1/device/vsnd/0/0/1/channels-max = "2" + * /local/domain/1/device/vsnd/0/0/1/unique-id = "1" + * + * /local/domain/1/device/vsnd/0/0/1/ring-ref = "384" + * /local/domain/1/device/vsnd/0/0/1/event-channel = "13" + * + *------------------------------- PCM device 1 -------------------------------- + * + * /local/domain/1/device/vsnd/0/1/name = "HDMI-0" + * /local/domain/1/device/vsnd/0/1/sample-rates = "8000,32000,44100" + * + *------------------------------ Stream 0, capture ---------------------------- + * + * /local/domain/1/device/vsnd/0/1/0/type = "c" + * /local/domain/1/device/vsnd/0/1/0/unique-id = "2" + * + * /local/domain/1/device/vsnd/0/1/0/ring-ref = "387" + * /local/domain/1/device/vsnd/0/1/0/event-channel = "151" + * + *------------------------------- PCM device 2 -------------------------------- + * + * /local/domain/1/device/vsnd/0/2/name = "SPDIF" + * + *----------------------------- Stream 0, playback ---------------------------- + * + * /local/domain/1/device/vsnd/0/2/0/type = "p" + * /local/domain/1/device/vsnd/0/2/0/unique-id = "3" + * + * /local/domain/1/device/vsnd/0/2/0/ring-ref = "389" + * /local/domain/1/device/vsnd/0/2/0/event-channel = "152" + * + ****************************************************************************** + * Backend XenBus Nodes + ****************************************************************************** + * + *----------------------------- Protocol version ------------------------------ + * + * versions + * Values: + * + * List of XENSND_LIST_SEPARATOR separated protocol versions supported + * by the backend. For example "1,2,3". 
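+ *
+ * A frontend may negotiate the protocol version by scanning this list and
+ * picking the highest version it implements. A minimal sketch, assuming a
+ * well-formed list in 'versions' (read from the backend node) and a
+ * frontend implementing versions up to 2:
+ *
+ *	unsigned long ver, best = 0;
+ *	char *s = versions, *end;
+ *
+ *	while (*s) {
+ *		ver = strtoul(s, &end, 10);
+ *		if (ver > best && ver <= 2)
+ *			best = ver;
+ *		s = (*end == ',') ? end + 1 : end;
+ *	}
+ *
+ * The resulting 'best' is then published in the frontend's "version" node.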
+ * + ****************************************************************************** + * Frontend XenBus Nodes + ****************************************************************************** + * + *-------------------------------- Addressing --------------------------------- + * + * dom-id + * Values: + * + * Domain identifier. + * + * dev-id + * Values: + * + * Device identifier. + * + * pcm-dev-idx + * Values: + * + * Zero-based contiguous index of the PCM device. + * + * stream-idx + * Values: + * + * Zero-based contiguous index of the stream of the PCM device. + * + * The following pattern is used for addressing: + * /local/domain//device/vsnd////... + * + *----------------------------- Protocol version ------------------------------ + * + * version + * Values: + * + * Protocol version, chosen among the ones supported by the backend. + * + *------------------------------- PCM settings -------------------------------- + * + * Every virtualized sound frontend has a set of PCM devices and streams, each + * of which can be individually configured. Part of the PCM configuration can + * be defined at a higher level of the hierarchy and be fully or partially + * re-used by the underlying layers. These configuration values are: + * o number of channels (min/max) + * o supported sample rates + * o supported sample formats. + * E.g. one can define these values for the whole card, device or stream. + * Every underlying layer in turn can re-define some or all of them to better + * fit its needs. For example, card may define number of channels to be + * in [1; 8] range, and some particular stream may be limited to [1; 2] only. + * The rule is that the underlying layer must be a subset of the upper layer + * range (see the range check sketch at the end of this section). + * + * channels-min + * Values: + * + * The minimum number of channels that is supported, [1; channels-max]. + * Optional, if not set or omitted a value of 1 is used. + * + * channels-max + * Values: + * + * The maximum number of channels that is supported. + * Must be at least channels-min. + * + * sample-rates + * Values: + * + * List of supported sample rates separated by XENSND_LIST_SEPARATOR. + * Sample rates are expressed as a list of decimal values w/o any + * ordering requirement. + * + * sample-formats + * Values: + * + * List of supported sample formats separated by XENSND_LIST_SEPARATOR. + * Items must not exceed XENSND_SAMPLE_FORMAT_MAX_LEN in length. + * + * buffer-size + * Values: + * + * The maximum size in octets of the buffer to allocate per stream. + * + *----------------------- Virtual sound card settings ------------------------- + * short-name + * Values: + * + * Short name of the virtual sound card. Optional. + * + * long-name + * Values: + * + * Long name of the virtual sound card. Optional. + * + *----------------------------- Device settings ------------------------------- + * name + * Values: + * + * Name of the sound device within the virtual sound card. Optional. + * + *----------------------------- Stream settings ------------------------------- + * + * type + * Values: "p", "c" + * + * Stream type: "p" - playback stream, "c" - capture stream + * + * If both capture and playback are needed then two streams need to be + * defined under the same device. + * + * unique-id + * Values: + * + * After stream initialization it is assigned a unique ID (within the front + * driver), so every stream of the frontend can be identified by the + * backend by this ID. This is not equal to stream-idx as the latter is + * zero-based within the device, while this index is contiguous within the + * driver. 
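+ *
+ * The check sketched below expresses the PCM settings layering rule
+ * described above: a stream's channel range must be a subset of the card's
+ * range. card_min/card_max and stream_min/stream_max are hypothetical
+ * values parsed from the corresponding channels-min and channels-max nodes:
+ *
+ *	if (stream_min < card_min || stream_max > card_max ||
+ *	    stream_min > stream_max)
+ *		goto fail_configuration;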
+ * + *-------------------- Stream Request Transport Parameters -------------------- + * + * event-channel + * Values: + * + * The identifier of the Xen event channel used to signal activity + * in the ring buffer. + * + * ring-ref + * Values: + * + * The Xen grant reference granting permission for the backend to map + * a sole page in a single page sized ring buffer. + * + ****************************************************************************** + * STATE DIAGRAMS + ****************************************************************************** + * + * Tool stack creates front and back state nodes with initial state + * XenbusStateInitialising. + * Tool stack creates and sets up frontend sound configuration nodes per domain. + * + * Front Back + * ================================= ===================================== + * XenbusStateInitialising XenbusStateInitialising + * o Query backend device identification + * data. + * o Open and validate backend device. + * | + * | + * V + * XenbusStateInitWait + * + * o Query frontend configuration + * o Allocate and initialize + * event channels per configured + * playback/capture stream. + * o Publish transport parameters + * that will be in effect during + * this connection. + * | + * | + * V + * XenbusStateInitialised + * + * o Query frontend transport parameters. + * o Connect to the event channels. + * | + * | + * V + * XenbusStateConnected + * + * o Create and initialize OS + * virtual sound device instances + * as per configuration. + * | + * | + * V + * XenbusStateConnected + * + * XenbusStateUnknown + * XenbusStateClosed + * XenbusStateClosing + * o Remove virtual sound device + * o Remove event channels + * | + * | + * V + * XenbusStateClosed + * + *------------------------------- Recovery flow ------------------------------- + * + * In case of unrecoverable frontend errors the backend handles them as + * if the frontend had gone into the XenbusStateClosed state. + * + * In case of unrecoverable backend errors the frontend tries to remove + * the virtualized device. If this is possible at the moment of error, + * then frontend goes into the XenbusStateInitialising state and is ready for + * a new connection with the backend. If the virtualized device is still in use + * and cannot be removed, then frontend goes into the XenbusStateReconfiguring + * state until either the virtualized device is removed or the backend + * initiates a new connection. On removal of the virtualized device the + * frontend goes into the XenbusStateInitialising state. + * + * Note on XenbusStateReconfiguring state of the frontend: if backend has + * unrecoverable errors then frontend cannot send requests to the backend + * and thus cannot provide functionality of the virtualized device anymore. + * After backend is back to normal the virtualized device may still hold some + * state: configuration in use, allocated buffers, client application state etc. + * So, in most cases, this will require frontend to implement complex recovery + * reconnect logic. Instead, by going into XenbusStateReconfiguring state, + * frontend will make sure no new clients of the virtualized device are + * accepted, allow existing client(s) to exit gracefully by signaling error + * state etc. + * Once all the clients are gone frontend can reinitialize the virtualized + * device and get into XenbusStateInitialising state again, signaling the + * backend that a new connection can be made. 
+ * + * There are multiple conditions possible under which frontend will go from + * XenbusStateReconfiguring into XenbusStateInitialising; some of them are OS + * specific. For example: + * 1. The underlying OS framework may provide callbacks to signal that the last + * client of the virtualized device has gone and the device can be removed. + * 2. Frontend can schedule deferred work (timer/tasklet/workqueue) + * to periodically check if this is the right time to re-try removal of + * the virtualized device. + * 3. By any other means. + * + ****************************************************************************** + * PCM FORMATS + ****************************************************************************** + * + * XENSND_PCM_FORMAT_<format>[_<endian>] + * + * format: <S/U/F><bits> or <name> + * S - signed, U - unsigned, F - float + * bits - 8, 16, 24, 32 + * name - MU_LAW, GSM, etc. + * + * endian: <LE/BE>, may be absent + * LE - Little endian, BE - Big endian + */ +#define XENSND_PCM_FORMAT_S8 0 +#define XENSND_PCM_FORMAT_U8 1 +#define XENSND_PCM_FORMAT_S16_LE 2 +#define XENSND_PCM_FORMAT_S16_BE 3 +#define XENSND_PCM_FORMAT_U16_LE 4 +#define XENSND_PCM_FORMAT_U16_BE 5 +#define XENSND_PCM_FORMAT_S24_LE 6 +#define XENSND_PCM_FORMAT_S24_BE 7 +#define XENSND_PCM_FORMAT_U24_LE 8 +#define XENSND_PCM_FORMAT_U24_BE 9 +#define XENSND_PCM_FORMAT_S32_LE 10 +#define XENSND_PCM_FORMAT_S32_BE 11 +#define XENSND_PCM_FORMAT_U32_LE 12 +#define XENSND_PCM_FORMAT_U32_BE 13 +#define XENSND_PCM_FORMAT_F32_LE 14 /* 4-byte float, IEEE-754 32-bit, */ +#define XENSND_PCM_FORMAT_F32_BE 15 /* range -1.0 to 1.0 */ +#define XENSND_PCM_FORMAT_F64_LE 16 /* 8-byte float, IEEE-754 64-bit, */ +#define XENSND_PCM_FORMAT_F64_BE 17 /* range -1.0 to 1.0 */ +#define XENSND_PCM_FORMAT_IEC958_SUBFRAME_LE 18 +#define XENSND_PCM_FORMAT_IEC958_SUBFRAME_BE 19 +#define XENSND_PCM_FORMAT_MU_LAW 20 +#define XENSND_PCM_FORMAT_A_LAW 21 +#define XENSND_PCM_FORMAT_IMA_ADPCM 22 +#define XENSND_PCM_FORMAT_MPEG 23 +#define XENSND_PCM_FORMAT_GSM 24 + +/* + ****************************************************************************** + * REQUEST CODES + ****************************************************************************** + */ +#define XENSND_OP_OPEN 0 +#define XENSND_OP_CLOSE 1 +#define XENSND_OP_READ 2 +#define XENSND_OP_WRITE 3 +#define XENSND_OP_SET_VOLUME 4 +#define XENSND_OP_GET_VOLUME 5 +#define XENSND_OP_MUTE 6 +#define XENSND_OP_UNMUTE 7 + +/* + ****************************************************************************** + * XENSTORE FIELD AND PATH NAME STRINGS, HELPERS + ****************************************************************************** + */ +#define XENSND_DRIVER_NAME "vsnd" + +#define XENSND_LIST_SEPARATOR "," +/* Field names */ +#define XENSND_FIELD_BE_VERSIONS "versions" +#define XENSND_FIELD_FE_VERSION "version" +#define XENSND_FIELD_VCARD_SHORT_NAME "short-name" +#define XENSND_FIELD_VCARD_LONG_NAME "long-name" +#define XENSND_FIELD_RING_REF "ring-ref" +#define XENSND_FIELD_EVT_CHNL "event-channel" +#define XENSND_FIELD_DEVICE_NAME "name" +#define XENSND_FIELD_TYPE "type" +#define XENSND_FIELD_STREAM_UNIQUE_ID "unique-id" +#define XENSND_FIELD_CHANNELS_MIN "channels-min" +#define XENSND_FIELD_CHANNELS_MAX "channels-max" +#define XENSND_FIELD_SAMPLE_RATES "sample-rates" +#define XENSND_FIELD_SAMPLE_FORMATS "sample-formats" +#define XENSND_FIELD_BUFFER_SIZE "buffer-size" + +/* Stream type field values. 
*/ +#define XENSND_STREAM_TYPE_PLAYBACK "p" +#define XENSND_STREAM_TYPE_CAPTURE "c" +/* Sample rate max string length */ +#define XENSND_SAMPLE_RATE_MAX_LEN 11 +/* Sample format field values */ +#define XENSND_SAMPLE_FORMAT_MAX_LEN 24 + +#define XENSND_PCM_FORMAT_S8_STR "s8" +#define XENSND_PCM_FORMAT_U8_STR "u8" +#define XENSND_PCM_FORMAT_S16_LE_STR "s16_le" +#define XENSND_PCM_FORMAT_S16_BE_STR "s16_be" +#define XENSND_PCM_FORMAT_U16_LE_STR "u16_le" +#define XENSND_PCM_FORMAT_U16_BE_STR "u16_be" +#define XENSND_PCM_FORMAT_S24_LE_STR "s24_le" +#define XENSND_PCM_FORMAT_S24_BE_STR "s24_be" +#define XENSND_PCM_FORMAT_U24_LE_STR "u24_le" +#define XENSND_PCM_FORMAT_U24_BE_STR "u24_be" +#define XENSND_PCM_FORMAT_S32_LE_STR "s32_le" +#define XENSND_PCM_FORMAT_S32_BE_STR "s32_be" +#define XENSND_PCM_FORMAT_U32_LE_STR "u32_le" +#define XENSND_PCM_FORMAT_U32_BE_STR "u32_be" +#define XENSND_PCM_FORMAT_F32_LE_STR "float_le" +#define XENSND_PCM_FORMAT_F32_BE_STR "float_be" +#define XENSND_PCM_FORMAT_F64_LE_STR "float64_le" +#define XENSND_PCM_FORMAT_F64_BE_STR "float64_be" +#define XENSND_PCM_FORMAT_IEC958_SUBFRAME_LE_STR "iec958_subframe_le" +#define XENSND_PCM_FORMAT_IEC958_SUBFRAME_BE_STR "iec958_subframe_be" +#define XENSND_PCM_FORMAT_MU_LAW_STR "mu_law" +#define XENSND_PCM_FORMAT_A_LAW_STR "a_law" +#define XENSND_PCM_FORMAT_IMA_ADPCM_STR "ima_adpcm" +#define XENSND_PCM_FORMAT_MPEG_STR "mpeg" +#define XENSND_PCM_FORMAT_GSM_STR "gsm" + + +/* + ****************************************************************************** + * STATUS RETURN CODES + ****************************************************************************** + * + * Status return code is zero on success and -XEN_EXX on failure. + * + ****************************************************************************** + * Assumptions + ****************************************************************************** + * o usage of grant reference 0 as invalid grant reference: + * grant reference 0 is valid, but never exposed to a PV driver, + * because of the fact it is already in use/reserved by the PV console. + * o all references in this document to page sizes must be treated + * as pages of size XEN_PAGE_SIZE unless otherwise noted. + * + ****************************************************************************** + * Description of the protocol between frontend and backend driver + ****************************************************************************** + * + * The two halves of a Para-virtual sound driver communicate with + * each other using shared pages and event channels. + * Shared page contains a ring with request/response packets. + * + * Packets, used for input/output operations, e.g. read/write, set/get volume, + * etc., provide offset/length fields in order to allow asynchronous protocol + * operation with buffer space sharing: part of the buffer allocated at + * XENSND_OP_OPEN can be used for audio samples and part, for example, + * for volume control. + * + * All reserved fields in the structures below must be 0. 
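+ *
+ * As a practical note on the PCM format definitions above: a frontend
+ * parsing the sample-formats node might pair the string and numeric
+ * definitions in a lookup table. A minimal sketch, where 'xensnd_formats'
+ * is a hypothetical name and the table lists one entry per format the
+ * frontend supports:
+ *
+ *	static const struct {
+ *		const char *name;
+ *		uint8_t code;
+ *	} xensnd_formats[] = {
+ *		{ XENSND_PCM_FORMAT_S8_STR,     XENSND_PCM_FORMAT_S8     },
+ *		{ XENSND_PCM_FORMAT_U8_STR,     XENSND_PCM_FORMAT_U8     },
+ *		{ XENSND_PCM_FORMAT_S16_LE_STR, XENSND_PCM_FORMAT_S16_LE },
+ *	};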
+ * + *---------------------------------- Requests --------------------------------- + * + * All request packets have the same length (32 octets) + * All request packets have common header: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | operation | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * id - uint16_t, private guest value, echoed in response + * operation - uint8_t, operation code, XENSND_OP_??? + * + * For all packets which use offset and length: + * offset - uint32_t, read or write data offset within the shared buffer, + * passed with XENSND_OP_OPEN request, octets, + * [0; XENSND_OP_OPEN.buffer_sz - 1]. + * length - uint32_t, read or write data length, octets + * + * Request open - open a PCM stream for playback or capture: + * + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | XENSND_OP_OPEN | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | pcm_rate | 12 + * +----------------+----------------+----------------+----------------+ + * | pcm_format | pcm_channels | reserved | 16 + * +----------------+----------------+----------------+----------------+ + * | buffer_sz | 20 + * +----------------+----------------+----------------+----------------+ + * | gref_directory | 24 + * +----------------+----------------+----------------+----------------+ + * | reserved | 28 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 32 + * +----------------+----------------+----------------+----------------+ + * + * pcm_rate - uint32_t, stream data rate, Hz + * pcm_format - uint8_t, XENSND_PCM_FORMAT_XXX value + * pcm_channels - uint8_t, number of channels of this stream, + * [channels-min; channels-max] + * buffer_sz - uint32_t, buffer size to be allocated, octets + * gref_directory - grant_ref_t, a reference to the first shared page + * describing shared buffer references. At least one page exists. 
If shared + * buffer size (buffer_sz) exceeds what can be addressed by this single page, + * then reference to the next page must be supplied (see gref_dir_next_page + * below) + */ + +struct xensnd_open_req { + uint32_t pcm_rate; + uint8_t pcm_format; + uint8_t pcm_channels; + uint16_t reserved; + uint32_t buffer_sz; + grant_ref_t gref_directory; +}; + +/* + * Shared page for XENSND_OP_OPEN buffer descriptor (gref_directory in the + * request) employs a list of pages, describing all pages of the shared data + * buffer: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | gref_dir_next_page | 4 + * +----------------+----------------+----------------+----------------+ + * | gref[0] | 8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | gref[i] | i*4+8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | gref[N - 1] | N*4+8 + * +----------------+----------------+----------------+----------------+ + * + * gref_dir_next_page - grant_ref_t, reference to the next page describing + * page directory. Must be 0 if there are no more pages in the list. + * gref[i] - grant_ref_t, reference to a shared page of the buffer + * allocated at XENSND_OP_OPEN + * + * Number of grant_ref_t entries in the whole page directory is not + * passed, but instead can be calculated as: + * num_grefs_total = (XENSND_OP_OPEN.buffer_sz + XEN_PAGE_SIZE - 1) / + * XEN_PAGE_SIZE + */ + +struct xensnd_page_directory { + grant_ref_t gref_dir_next_page; + grant_ref_t gref[1]; /* Variable length */ +}; + +/* + * Request close - close an opened pcm stream: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | XENSND_OP_CLOSE| reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 32 + * +----------------+----------------+----------------+----------------+ + * + * Request read/write - used for read (for capture) or write (for playback): + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | operation | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | offset | 12 + * +----------------+----------------+----------------+----------------+ + * | length | 16 + * +----------------+----------------+----------------+----------------+ + * | reserved | 20 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 32 + * +----------------+----------------+----------------+----------------+ + * + * operation - XENSND_OP_READ for read or XENSND_OP_WRITE for write + */ + +struct xensnd_rw_req { + uint32_t offset; + uint32_t length; +}; + +/* + * Request set/get 
volume - set/get channels' volume of the stream given:
+ * 0 1 2 3 octet
+ * +----------------+----------------+----------------+----------------+
+ * | id | operation | reserved | 4
+ * +----------------+----------------+----------------+----------------+
+ * | reserved | 8
+ * +----------------+----------------+----------------+----------------+
+ * | offset | 12
+ * +----------------+----------------+----------------+----------------+
+ * | length | 16
+ * +----------------+----------------+----------------+----------------+
+ * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
+ * +----------------+----------------+----------------+----------------+
+ * | reserved | 32
+ * +----------------+----------------+----------------+----------------+
+ *
+ * operation - XENSND_OP_SET_VOLUME for volume set
+ * or XENSND_OP_GET_VOLUME for volume get
+ * Buffer passed with XENSND_OP_OPEN is used to exchange volume
+ * values:
+ *
+ * 0 1 2 3 octet
+ * +----------------+----------------+----------------+----------------+
+ * | channel[0] | 4
+ * +----------------+----------------+----------------+----------------+
+ * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
+ * +----------------+----------------+----------------+----------------+
+ * | channel[i] | i*4
+ * +----------------+----------------+----------------+----------------+
+ * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/|
+ * +----------------+----------------+----------------+----------------+
+ * | channel[N - 1] | (N-1)*4
+ * +----------------+----------------+----------------+----------------+
+ *
+ * N = XENSND_OP_OPEN.pcm_channels
+ * i - uint8_t, index of a channel
+ * channel[i] - sint32_t, volume of i-th channel
+ * Volume is expressed as a signed value in steps of 0.001 dB,
+ * with 0 meaning 0 dB.
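(Editorial aside, not part of the header above.) The 0.001 dB step encoding is simply signed milli-decibels, so for example -10.5 dB travels as -10500. A minimal sketch of the frontend side, assuming hypothetical helper names and that shared_buf stands in for the local mapping of the buffer granted at XENSND_OP_OPEN time:

#include <stdint.h>

/* Hypothetical helper: encode a volume in dB as the sint32_t value the
 * protocol carries (0.001 dB steps, 0 == 0 dB), e.g. -10.5 dB -> -10500. */
static int32_t sndif_encode_volume_db(double db)
{
	return (int32_t)(db * 1000.0);
}

/* Hypothetical frontend-side helper: write one volume per channel into
 * the shared buffer before submitting a XENSND_OP_SET_VOLUME request. */
static void sndif_fill_volumes(int32_t *shared_buf, uint8_t pcm_channels,
			       double db)
{
	uint8_t i;

	for (i = 0; i < pcm_channels; i++)
		shared_buf[i] = sndif_encode_volume_db(db);
}

XENSND_OP_GET_VOLUME uses the same per-channel layout in the opposite direction, with the backend filling the values for the frontend to read.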
+ * + * Request mute/unmute - mute/unmute stream: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | operation | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | reserved | 8 + * +----------------+----------------+----------------+----------------+ + * | offset | 12 + * +----------------+----------------+----------------+----------------+ + * | length | 16 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 32 + * +----------------+----------------+----------------+----------------+ + * + * operation - XENSND_OP_MUTE for mute or XENSND_OP_UNMUTE for unmute + * Buffer passed with XENSND_OP_OPEN is used to exchange mute/unmute + * values: + * + * 0 octet + * +----------------+----------------+----------------+----------------+ + * | channel[0] | 4 + * +----------------+----------------+----------------+----------------+ + * +/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | channel[i] | i*4 + * +----------------+----------------+----------------+----------------+ + * +/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | channel[N - 1] | (N-1)*4 + * +----------------+----------------+----------------+----------------+ + * + * N = XENSND_OP_OPEN.pcm_channels + * i - uint8_t, index of a channel + * channel[i] - uint8_t, non-zero if i-th channel needs to be muted/unmuted + * + *------------------------------------ N.B. ----------------------------------- + * + * The 'struct xensnd_rw_req' is also used for XENSND_OP_SET_VOLUME, + * XENSND_OP_GET_VOLUME, XENSND_OP_MUTE, XENSND_OP_UNMUTE. 
+ */ + +/* + *---------------------------------- Responses -------------------------------- + * + * All response packets have the same length (32 octets) + * + * Response for all requests: + * 0 1 2 3 octet + * +----------------+----------------+----------------+----------------+ + * | id | operation | reserved | 4 + * +----------------+----------------+----------------+----------------+ + * | status | 8 + * +----------------+----------------+----------------+----------------+ + * | reserved | 12 + * +----------------+----------------+----------------+----------------+ + * |/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/\/| + * +----------------+----------------+----------------+----------------+ + * | reserved | 32 + * +----------------+----------------+----------------+----------------+ + * + * id - uint16_t, copied from the request + * operation - uint8_t, XENSND_OP_* - copied from request + * status - int32_t, response status, zero on success and -XEN_EXX on failure + */ + +struct xensnd_req { + uint16_t id; + uint8_t operation; + uint8_t reserved[5]; + union { + struct xensnd_open_req open; + struct xensnd_rw_req rw; + uint8_t reserved[24]; + } op; +}; + +struct xensnd_resp { + uint16_t id; + uint8_t operation; + uint8_t reserved; + int32_t status; + uint8_t reserved1[24]; +}; + +DEFINE_RING_TYPES(xen_sndif, struct xensnd_req, struct xensnd_resp); + +#endif /* __XEN_PUBLIC_IO_SNDIF_H__ */ diff --git a/include/xen/page.h b/include/xen/page.h index 9dc46cb8a0fd79be7f4bdae7ae8f5f4dbdcdb7f4..064194f6453ebccc04ffc3cda9e1c1901b7c1728 100644 --- a/include/xen/page.h +++ b/include/xen/page.h @@ -38,7 +38,7 @@ struct xen_memory_region { unsigned long n_pfns; }; -#define XEN_EXTRA_MEM_MAX_REGIONS 128 /* == E820MAX */ +#define XEN_EXTRA_MEM_MAX_REGIONS 128 /* == E820_MAX_ENTRIES_ZEROPAGE */ extern __initdata struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS]; diff --git a/include/xen/xen-ops.h b/include/xen/xen-ops.h index b5486e64860759aed11763ac983b3b4aa12988b3..c44a2ee8c8f8079a13095b3b67ec6f794a683ff8 100644 --- a/include/xen/xen-ops.h +++ b/include/xen/xen-ops.h @@ -22,6 +22,8 @@ void xen_timer_resume(void); void xen_arch_resume(void); void xen_arch_suspend(void); +void xen_reboot(int reason); + void xen_resume_notifier_register(struct notifier_block *nb); void xen_resume_notifier_unregister(struct notifier_block *nb); @@ -34,11 +36,25 @@ u64 xen_steal_clock(int cpu); int xen_setup_shutdown_event(void); extern unsigned long *xen_contiguous_bitmap; + +#ifdef CONFIG_XEN_PV int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order, unsigned int address_bits, dma_addr_t *dma_handle); void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order); +#else +static inline int xen_create_contiguous_region(phys_addr_t pstart, + unsigned int order, + unsigned int address_bits, + dma_addr_t *dma_handle) +{ + return 0; +} + +static inline void xen_destroy_contiguous_region(phys_addr_t pstart, + unsigned int order) { } +#endif struct vm_area_struct; @@ -120,6 +136,9 @@ efi_status_t xen_efi_update_capsule(efi_capsule_header_t **capsules, efi_status_t xen_efi_query_capsule_caps(efi_capsule_header_t **capsules, unsigned long count, u64 *max_size, int *reset_type); +void xen_efi_reset_system(int reset_type, efi_status_t status, + unsigned long data_size, efi_char16_t *data); + #ifdef CONFIG_PREEMPT diff --git a/init/Kconfig b/init/Kconfig index a92f27da4a272ec873707bc4b7a68e46a4340b86..1d3475fc94967a58904627d80e8461a3721ddbb0 100644 --- a/init/Kconfig 
+++ b/init/Kconfig
@@ -521,11 +521,41 @@ config RCU_EXPERT
 config SRCU
 bool
+ default y
 help
 This option selects the sleepable version of RCU. This version
 permits arbitrary sleeping or blocking within RCU read-side critical
 sections.
+config CLASSIC_SRCU
+ bool "Use v4.11 classic SRCU implementation"
+ default n
+ depends on RCU_EXPERT && SRCU
+ help
+ This option selects the traditional well-tested classic SRCU
+ implementation from v4.11, as might be desired for enterprise
+ Linux distributions. Without this option, the shiny new
+ Tiny SRCU and Tree SRCU implementations are used instead.
+ At some point, it is hoped that Tiny SRCU and Tree SRCU
+ will accumulate enough test time and confidence to allow
+ Classic SRCU to be dropped entirely.
+
+ Say Y if you need a rock-solid SRCU.
+
+ Say N if you would like to help test Tree SRCU.
+
+config TINY_SRCU
+ bool
+ default y if SRCU && TINY_RCU && !CLASSIC_SRCU
+ help
+ This option selects the single-CPU non-preemptible version of SRCU.
+
+config TREE_SRCU
+ bool
+ default y if SRCU && !TINY_RCU && !CLASSIC_SRCU
+ help
+ This option selects the full-fledged version of SRCU.
+
 config TASKS_RCU
 bool
 default n
@@ -543,6 +573,9 @@ config RCU_STALL_COMMON
 the tiny variants to disable RCU CPU stall warnings, while
 making these warnings mandatory for the tree variants.
+config RCU_NEED_SEGCBLIST
+ def_bool ( TREE_RCU || PREEMPT_RCU || TINY_SRCU || TREE_SRCU )
+
 config CONTEXT_TRACKING
 bool
@@ -612,11 +645,17 @@ config RCU_FANOUT_LEAF
 initialization. These systems tend to run CPU-bound, and thus
 are not helped by synchronized interrupts, and thus tend to
 skew them, which reduces lock contention enough that large
- leaf-level fanouts work well.
+ leaf-level fanouts work well. That said, setting leaf-level
+ fanout to a large number will likely cause problematic
+ lock contention on the leaf-level rcu_node structures unless
+ you boot with the skew_tick kernel parameter.
 Select a specific number if testing RCU itself.
- Select the maximum permissible value for large systems.
+ Select the maximum permissible value for large systems, but
+ please understand that you may also need to set the skew_tick
+ kernel boot parameter to avoid contention on the rcu_node
+ structure's locks.
 Take the default if unsure.
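(Editorial aside.) Whichever implementation these options select, callers see the same SRCU API; the point of SRCU, per the help text above, is that read-side critical sections may sleep. A minimal sketch of that usage: the srcu_read_lock()/srcu_read_unlock()/srcu_dereference()/synchronize_srcu() calls are the real kernel primitives, while the shared_cfg object is made up for illustration and update-side serialization is assumed to come from the caller:

#include <linux/slab.h>
#include <linux/srcu.h>

DEFINE_STATIC_SRCU(my_srcu);		/* statically initialized srcu_struct */
static int __rcu *shared_cfg;		/* hypothetical SRCU-protected object */

/* Reader: unlike plain RCU, blocking between srcu_read_lock() and
 * srcu_read_unlock() is legal. */
static int read_cfg(void)
{
	int idx, val = 0;
	int *p;

	idx = srcu_read_lock(&my_srcu);
	p = srcu_dereference(shared_cfg, &my_srcu);
	if (p)
		val = *p;	/* sleeping (e.g. a GFP_KERNEL allocation) is OK here */
	srcu_read_unlock(&my_srcu, idx);
	return val;
}

/* Updater: publish a new version, wait out pre-existing readers, free old. */
static void update_cfg(int newval)
{
	int *newp, *oldp;

	newp = kmalloc(sizeof(*newp), GFP_KERNEL);
	if (!newp)
		return;
	*newp = newval;
	oldp = rcu_dereference_protected(shared_cfg, 1);	/* updates serialized by caller */
	rcu_assign_pointer(shared_cfg, newp);
	synchronize_srcu(&my_srcu);	/* readers that could see oldp are done */
	kfree(oldp);
}

The TINY_SRCU/TREE_SRCU defaults above only choose which implementation backs these calls (matching TINY_RCU versus tree RCU); no caller changes are involved.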
diff --git a/init/do_mounts.h b/init/do_mounts.h
index 067af1d9e8b620bfac146dae57d0a0f690b33475..282d65bfd6741e51c7f341c9c4fda65a492936c6 100644
--- a/init/do_mounts.h
+++ b/init/do_mounts.h
@@ -19,29 +19,15 @@ static inline int create_dev(char *name, dev_t dev)
 return sys_mknod(name, S_IFBLK|0600, new_encode_dev(dev));
 }
-#if BITS_PER_LONG == 32
 static inline u32 bstat(char *name)
 {
- struct stat64 stat;
- if (sys_stat64(name, &stat) != 0)
+ struct kstat stat;
+ if (vfs_stat(name, &stat) != 0)
 return 0;
- if (!S_ISBLK(stat.st_mode))
+ if (!S_ISBLK(stat.mode))
 return 0;
- if (stat.st_rdev != (u32)stat.st_rdev)
- return 0;
- return stat.st_rdev;
-}
-#else
-static inline u32 bstat(char *name)
-{
- struct stat stat;
- if (sys_newstat(name, &stat) != 0)
- return 0;
- if (!S_ISBLK(stat.st_mode))
- return 0;
- return stat.st_rdev;
+ return stat.rdev;
 }
-#endif
 #ifdef CONFIG_BLK_DEV_RAM
diff --git a/init/initramfs.c b/init/initramfs.c
index 981f286c1d16ab57dff10cb10099f573b62f07b9..8a532050043f5f804f06ddbf980e54965c9055de 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -312,10 +312,10 @@ static int __init maybe_link(void)
 static void __init clean_path(char *path, umode_t fmode)
 {
- struct stat st;
+ struct kstat st;
- if (!sys_newlstat(path, &st) && (st.st_mode ^ fmode) & S_IFMT) {
- if (S_ISDIR(st.st_mode))
+ if (!vfs_lstat(path, &st) && (st.mode ^ fmode) & S_IFMT) {
+ if (S_ISDIR(st.mode))
 sys_rmdir(path);
 else
 sys_unlink(path);
@@ -581,13 +581,13 @@ static void __init clean_rootfs(void)
 num = sys_getdents64(fd, dirp, BUF_SIZE);
 while (num > 0) {
 while (num > 0) {
- struct stat st;
+ struct kstat st;
 int ret;
- ret = sys_newlstat(dirp->d_name, &st);
+ ret = vfs_lstat(dirp->d_name, &st);
 WARN_ON_ONCE(ret);
 if (!ret) {
- if (S_ISDIR(st.st_mode))
+ if (S_ISDIR(st.mode))
 sys_rmdir(dirp->d_name);
 else
 sys_unlink(dirp->d_name);
@@ -608,10 +608,12 @@ static void __init clean_rootfs(void)
 static int __init populate_rootfs(void)
 {
+ /* Load the built-in initramfs */
 char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
 if (err)
 panic("%s", err); /* Failed to decompress INTERNAL initramfs */
- if (initrd_start) {
+ /* If available, load the bootloader-supplied initrd */
+ if (initrd_start && !IS_ENABLED(CONFIG_INITRAMFS_FORCE)) {
 #ifdef CONFIG_BLK_DEV_RAM
 int fd;
 printk(KERN_INFO "Trying to unpack rootfs image as initramfs...\n");
@@ -640,6 +642,7 @@ static int __init populate_rootfs(void)
 free_initrd();
 }
 done:
+ /* empty statement */;
 #else
 printk(KERN_INFO "Unpacking initramfs...\n");
 err = unpack_to_rootfs((char *)initrd_start,
@@ -648,13 +651,14 @@ static int __init populate_rootfs(void)
 printk(KERN_EMERG "Initramfs unpacking failed: %s\n", err);
 free_initrd();
 #endif
- flush_delayed_fput();
- /*
- * Try loading default modules from initramfs. This gives
- * us a chance to load before device_initcalls.
- */
- load_default_modules();
 }
+ flush_delayed_fput();
+ /*
+ * Try loading default modules from initramfs. This gives
+ * us a chance to load before device_initcalls.
+ */ + load_default_modules(); + return 0; } rootfs_initcall(populate_rootfs); diff --git a/init/main.c b/init/main.c index b0c11cbf5ddf8a55a3c832e4acbd72653d9c38f1..f866510472d7a263f03bc21b7a158e5b288185f9 100644 --- a/init/main.c +++ b/init/main.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include @@ -495,7 +495,7 @@ asmlinkage __visible void __init start_kernel(void) debug_objects_early_init(); /* - * Set up the the initial canary ASAP: + * Set up the initial canary ASAP: */ boot_init_stack_canary(); @@ -504,10 +504,10 @@ asmlinkage __visible void __init start_kernel(void) local_irq_disable(); early_boot_irqs_disabled = true; -/* - * Interrupts are still disabled. Do necessary setups, then - * enable them - */ + /* + * Interrupts are still disabled. Do necessary setups, then + * enable them. + */ boot_cpu_init(); page_address_init(); pr_notice("%s", linux_banner); @@ -545,6 +545,11 @@ asmlinkage __visible void __init start_kernel(void) trap_init(); mm_init(); + ftrace_init(); + + /* trace_printk can be enabled here */ + early_trace_init(); + /* * Set up the scheduler prior starting any interrupts (such as the * timer interrupt). Full topology setup happens at smp_init() @@ -570,7 +575,7 @@ asmlinkage __visible void __init start_kernel(void) rcu_init(); - /* trace_printk() and trace points may be used after this */ + /* Trace events are available after this */ trace_init(); context_tracking_init(); @@ -670,8 +675,6 @@ asmlinkage __visible void __init start_kernel(void) efi_free_boot_services(); } - ftrace_init(); - /* Do the rest non-__init'ed, we're now alive */ rest_init(); } @@ -959,6 +962,7 @@ static int __ref kernel_init(void *unused) kernel_init_freeable(); /* need to finish all async __init code before freeing the memory */ async_synchronize_full(); + ftrace_free_init_mem(); free_initmem(); mark_readonly(); system_state = SYSTEM_RUNNING; diff --git a/ipc/shm.c b/ipc/shm.c index 481d2a9c298ab14916543a040d4f45b9b69764e0..34c4344e8d4b2f00206208f1d2c8d48e85dd0e48 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1095,11 +1095,11 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr, unsigned long shmlba) { struct shmid_kernel *shp; - unsigned long addr; + unsigned long addr = (unsigned long)shmaddr; unsigned long size; struct file *file; int err; - unsigned long flags; + unsigned long flags = MAP_SHARED; unsigned long prot; int acc_mode; struct ipc_namespace *ns; @@ -1111,7 +1111,8 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, err = -EINVAL; if (shmid < 0) goto out; - else if ((addr = (ulong)shmaddr)) { + + if (addr) { if (addr & (shmlba - 1)) { /* * Round down to the nearest multiple of shmlba. 
@@ -1126,13 +1127,10 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
 #endif
 goto out;
 }
- flags = MAP_SHARED | MAP_FIXED;
- } else {
- if ((shmflg & SHM_REMAP))
- goto out;
- flags = MAP_SHARED;
- }
+ flags |= MAP_FIXED;
+ } else if ((shmflg & SHM_REMAP))
+ goto out;
 if (shmflg & SHM_RDONLY) {
 prot = PROT_READ;
diff --git a/ipc/util.c b/ipc/util.c
index 798cad18dd878f2ee81f03e34431dbc001e2c2a9..caec7b1bfaa335f3d6df017eb9b0ad24058edd9c 100644
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -403,12 +403,7 @@ void ipc_rmid(struct ipc_ids *ids, struct kern_ipc_perm *ipcp)
 */
 void *ipc_alloc(int size)
 {
- void *out;
- if (size > PAGE_SIZE)
- out = vmalloc(size);
- else
- out = kmalloc(size, GFP_KERNEL);
- return out;
+ return kvmalloc(size, GFP_KERNEL);
 }
 /**
@@ -474,7 +469,7 @@ void ipc_rcu_free(struct rcu_head *head)
 * Check user, group, other permissions for access
 * to ipc resources. return 0 if allowed
 *
- * @flag will most probably be 0 or S_...UGO from <linux/stat.h>
+ * @flag will most probably be 0 or ``S_...UGO`` from <linux/stat.h>
 */
 int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flag)
 {
@@ -672,10 +667,12 @@ int ipc_update_perm(struct ipc64_perm *in, struct kern_ipc_perm *out)
 *
 * This function does some common audit and permissions check for some IPC_XXX
 * cmd and is called from semctl_down, shmctl_down and msgctl_down.
- * It must be called without any lock held and
- * - retrieves the ipc with the given id in the given table.
- * - performs some audit and permission check, depending on the given cmd
- * - returns a pointer to the ipc object or otherwise, the corresponding error.
+ * It must be called without any lock held and:
+ *
+ * - retrieves the ipc with the given id in the given table.
+ * - performs some audit and permission check, depending on the given cmd
+ * - returns a pointer to the ipc object or otherwise, the corresponding
+ * error.
 *
 * Call holding both the rwsem and the rcu read lock.
*/ diff --git a/ipc/util.h b/ipc/util.h index 51f7ca58ac6742989c61c0e0fc8d73bc7eab6bf7..60ddccca464dd2a9f54ebe50401cdd009b3084ca 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -153,8 +153,6 @@ extern struct msg_msg *load_msg(const void __user *src, size_t len); extern struct msg_msg *copy_msg(struct msg_msg *src, struct msg_msg *dst); extern int store_msg(void __user *dest, struct msg_msg *msg, size_t len); -extern void recompute_msgmni(struct ipc_namespace *); - static inline int ipc_buildid(int id, int seq) { return SEQ_MULTIPLIER * seq + id; diff --git a/kernel/Makefile b/kernel/Makefile index b302b4731d16547a88e4ddc6db21e130ae821113..72aa080f91f04bd9256f1df714186033294b19b9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -59,6 +59,7 @@ obj-$(CONFIG_MODULES) += module.o obj-$(CONFIG_MODULE_SIG) += module_signing.o obj-$(CONFIG_KALLSYMS) += kallsyms.o obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o +obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_KEXEC_CORE) += kexec_core.o obj-$(CONFIG_KEXEC) += kexec.o obj-$(CONFIG_KEXEC_FILE) += kexec_file.o diff --git a/kernel/audit.c b/kernel/audit.c index a871bf80fde1adc79040936475b7045971e72d76..4b7d49868ce1e39d11603f6160d04e7e5d4457e2 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -58,6 +58,8 @@ #include #include #include +#include +#include #include @@ -110,18 +112,19 @@ struct audit_net { * @pid: auditd PID * @portid: netlink portid * @net: the associated network namespace - * @lock: spinlock to protect write access + * @rcu: RCU head * * Description: * This struct is RCU protected; you must either hold the RCU lock for reading - * or the included spinlock for writing. + * or the associated spinlock for writing. */ static struct auditd_connection { - int pid; + struct pid *pid; u32 portid; struct net *net; - spinlock_t lock; -} auditd_conn; + struct rcu_head rcu; +} *auditd_conn = NULL; +static DEFINE_SPINLOCK(auditd_conn_lock); /* If audit_rate_limit is non-zero, limit the rate of sending audit records * to that number per second. This prevents DoS attacks, but results in @@ -151,12 +154,7 @@ static atomic_t audit_lost = ATOMIC_INIT(0); /* Hash for inode-based rules */ struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; -/* The audit_freelist is a list of pre-allocated audit buffers (if more - * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of - * being placed on the freelist). */ -static DEFINE_SPINLOCK(audit_freelist_lock); -static int audit_freelist_count; -static LIST_HEAD(audit_freelist); +static struct kmem_cache *audit_buffer_cache; /* queue msgs to send via kauditd_task */ static struct sk_buff_head audit_queue; @@ -191,17 +189,12 @@ DEFINE_MUTEX(audit_cmd_mutex); * should be at least that large. */ #define AUDIT_BUFSIZ 1024 -/* AUDIT_MAXFREE is the number of empty audit_buffers we keep on the - * audit_freelist. Doing so eliminates many kmalloc/kfree calls. */ -#define AUDIT_MAXFREE (2*NR_CPUS) - /* The audit_buffer is used when formatting an audit record. The caller * locks briefly to get the record off the freelist or to allocate the * buffer, and locks briefly to send the buffer to the netlink layer or * to place it on a transmit queue. Multiple audit_buffers can be in * use simultaneously. */ struct audit_buffer { - struct list_head list; struct sk_buff *skb; /* formatted skb ready to send */ struct audit_context *ctx; /* NULL or associated context */ gfp_t gfp_mask; @@ -220,17 +213,41 @@ struct audit_reply { * Description: * Return 1 if the task is a registered audit daemon, 0 otherwise. 
*/ -int auditd_test_task(const struct task_struct *task) +int auditd_test_task(struct task_struct *task) { int rc; + struct auditd_connection *ac; rcu_read_lock(); - rc = (auditd_conn.pid && task->tgid == auditd_conn.pid ? 1 : 0); + ac = rcu_dereference(auditd_conn); + rc = (ac && ac->pid == task_tgid(task) ? 1 : 0); rcu_read_unlock(); return rc; } +/** + * auditd_pid_vnr - Return the auditd PID relative to the namespace + * + * Description: + * Returns the PID in relation to the namespace, 0 on failure. + */ +static pid_t auditd_pid_vnr(void) +{ + pid_t pid; + const struct auditd_connection *ac; + + rcu_read_lock(); + ac = rcu_dereference(auditd_conn); + if (!ac || !ac->pid) + pid = 0; + else + pid = pid_vnr(ac->pid); + rcu_read_unlock(); + + return pid; +} + /** * audit_get_sk - Return the audit socket for the given network namespace * @net: the destination network namespace @@ -250,14 +267,6 @@ static struct sock *audit_get_sk(const struct net *net) return aunet->sk; } -static void audit_set_portid(struct audit_buffer *ab, __u32 portid) -{ - if (ab) { - struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); - nlh->nlmsg_pid = portid; - } -} - void audit_panic(const char *message) { switch (audit_failure) { @@ -426,6 +435,24 @@ static int audit_set_failure(u32 state) return audit_do_config_change("audit_failure", &audit_failure, state); } +/** + * auditd_conn_free - RCU helper to release an auditd connection struct + * @rcu: RCU head + * + * Description: + * Drop any references inside the auditd connection tracking struct and free + * the memory. + */ + static void auditd_conn_free(struct rcu_head *rcu) + { + struct auditd_connection *ac; + + ac = container_of(rcu, struct auditd_connection, rcu); + put_pid(ac->pid); + put_net(ac->net); + kfree(ac); + } + /** * auditd_set - Set/Reset the auditd connection state * @pid: auditd PID @@ -434,22 +461,33 @@ static int audit_set_failure(u32 state) * * Description: * This function will obtain and drop network namespace references as - * necessary. + * necessary. Returns zero on success, negative values on failure. 
*/ -static void auditd_set(int pid, u32 portid, struct net *net) +static int auditd_set(struct pid *pid, u32 portid, struct net *net) { unsigned long flags; + struct auditd_connection *ac_old, *ac_new; - spin_lock_irqsave(&auditd_conn.lock, flags); - auditd_conn.pid = pid; - auditd_conn.portid = portid; - if (auditd_conn.net) - put_net(auditd_conn.net); - if (net) - auditd_conn.net = get_net(net); - else - auditd_conn.net = NULL; - spin_unlock_irqrestore(&auditd_conn.lock, flags); + if (!pid || !net) + return -EINVAL; + + ac_new = kzalloc(sizeof(*ac_new), GFP_KERNEL); + if (!ac_new) + return -ENOMEM; + ac_new->pid = get_pid(pid); + ac_new->portid = portid; + ac_new->net = get_net(net); + + spin_lock_irqsave(&auditd_conn_lock, flags); + ac_old = rcu_dereference_protected(auditd_conn, + lockdep_is_held(&auditd_conn_lock)); + rcu_assign_pointer(auditd_conn, ac_new); + spin_unlock_irqrestore(&auditd_conn_lock, flags); + + if (ac_old) + call_rcu(&ac_old->rcu, auditd_conn_free); + + return 0; } /** @@ -544,13 +582,19 @@ static void kauditd_retry_skb(struct sk_buff *skb) */ static void auditd_reset(void) { + unsigned long flags; struct sk_buff *skb; + struct auditd_connection *ac_old; /* if it isn't already broken, break the connection */ - rcu_read_lock(); - if (auditd_conn.pid) - auditd_set(0, 0, NULL); - rcu_read_unlock(); + spin_lock_irqsave(&auditd_conn_lock, flags); + ac_old = rcu_dereference_protected(auditd_conn, + lockdep_is_held(&auditd_conn_lock)); + rcu_assign_pointer(auditd_conn, NULL); + spin_unlock_irqrestore(&auditd_conn_lock, flags); + + if (ac_old) + call_rcu(&ac_old->rcu, auditd_conn_free); /* flush all of the main and retry queues to the hold queue */ while ((skb = skb_dequeue(&audit_retry_queue))) @@ -576,6 +620,7 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) u32 portid; struct net *net; struct sock *sk; + struct auditd_connection *ac; /* NOTE: we can't call netlink_unicast while in the RCU section so * take a reference to the network namespace and grab local @@ -585,15 +630,15 @@ static int auditd_send_unicast_skb(struct sk_buff *skb) * section netlink_unicast() should safely return an error */ rcu_read_lock(); - if (!auditd_conn.pid) { + ac = rcu_dereference(auditd_conn); + if (!ac) { rcu_read_unlock(); rc = -ECONNREFUSED; goto err; } - net = auditd_conn.net; - get_net(net); + net = get_net(ac->net); sk = audit_get_sk(net); - portid = auditd_conn.portid; + portid = ac->portid; rcu_read_unlock(); rc = netlink_unicast(sk, skb, portid, 0); @@ -728,6 +773,7 @@ static int kauditd_thread(void *dummy) u32 portid = 0; struct net *net = NULL; struct sock *sk = NULL; + struct auditd_connection *ac; #define UNICAST_RETRIES 5 @@ -735,14 +781,14 @@ static int kauditd_thread(void *dummy) while (!kthread_should_stop()) { /* NOTE: see the lock comments in auditd_send_unicast_skb() */ rcu_read_lock(); - if (!auditd_conn.pid) { + ac = rcu_dereference(auditd_conn); + if (!ac) { rcu_read_unlock(); goto main_queue; } - net = auditd_conn.net; - get_net(net); + net = get_net(ac->net); sk = audit_get_sk(net); - portid = auditd_conn.portid; + portid = ac->portid; rcu_read_unlock(); /* attempt to flush the hold queue */ @@ -816,7 +862,7 @@ int audit_send_list(void *_dest) return 0; } -struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done, +struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size) { struct sk_buff *skb; @@ -829,7 +875,7 @@ struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, int done, if 
(!skb) return NULL; - nlh = nlmsg_put(skb, portid, seq, t, size, flags); + nlh = nlmsg_put(skb, 0, seq, t, size, flags); if (!nlh) goto out_kfree_skb; data = nlmsg_data(nlh); @@ -873,7 +919,6 @@ static int audit_send_reply_thread(void *arg) static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int done, int multi, const void *payload, int size) { - u32 portid = NETLINK_CB(request_skb).portid; struct net *net = sock_net(NETLINK_CB(request_skb).sk); struct sk_buff *skb; struct task_struct *tsk; @@ -883,12 +928,12 @@ static void audit_send_reply(struct sk_buff *request_skb, int seq, int type, int if (!reply) return; - skb = audit_make_reply(portid, seq, type, done, multi, payload, size); + skb = audit_make_reply(seq, type, done, multi, payload, size); if (!skb) goto out; reply->net = get_net(net); - reply->portid = portid; + reply->portid = NETLINK_CB(request_skb).portid; reply->skb = skb; tsk = kthread_run(audit_send_reply_thread, reply, "audit_send_reply"); @@ -1068,11 +1113,13 @@ static int audit_set_feature(struct sk_buff *skb) return 0; } -static int audit_replace(pid_t pid) +static int audit_replace(struct pid *pid) { + pid_t pvnr; struct sk_buff *skb; - skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, &pid, sizeof(pid)); + pvnr = pid_vnr(pid); + skb = audit_make_reply(0, AUDIT_REPLACE, 0, 0, &pvnr, sizeof(pvnr)); if (!skb) return -ENOMEM; return auditd_send_unicast_skb(skb); @@ -1102,9 +1149,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) memset(&s, 0, sizeof(s)); s.enabled = audit_enabled; s.failure = audit_failure; - rcu_read_lock(); - s.pid = auditd_conn.pid; - rcu_read_unlock(); + /* NOTE: use pid_vnr() so the PID is relative to the current + * namespace */ + s.pid = auditd_pid_vnr(); s.rate_limit = audit_rate_limit; s.backlog_limit = audit_backlog_limit; s.lost = atomic_read(&audit_lost); @@ -1130,51 +1177,61 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } if (s.mask & AUDIT_STATUS_PID) { - /* NOTE: we are using task_tgid_vnr() below because - * the s.pid value is relative to the namespace - * of the caller; at present this doesn't matter - * much since you can really only run auditd - * from the initial pid namespace, but something - * to keep in mind if this changes */ - int new_pid = s.pid; + /* NOTE: we are using the vnr PID functions below + * because the s.pid value is relative to the + * namespace of the caller; at present this + * doesn't matter much since you can really only + * run auditd from the initial pid namespace, but + * something to keep in mind if this changes */ + pid_t new_pid = s.pid; pid_t auditd_pid; - pid_t requesting_pid = task_tgid_vnr(current); + struct pid *req_pid = task_tgid(current); + + /* sanity check - PID values must match */ + if (new_pid != pid_vnr(req_pid)) + return -EINVAL; /* test the auditd connection */ - audit_replace(requesting_pid); + audit_replace(req_pid); - rcu_read_lock(); - auditd_pid = auditd_conn.pid; + auditd_pid = auditd_pid_vnr(); /* only the current auditd can unregister itself */ - if ((!new_pid) && (requesting_pid != auditd_pid)) { - rcu_read_unlock(); + if ((!new_pid) && (new_pid != auditd_pid)) { audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EACCES; } /* replacing a healthy auditd is not allowed */ if (auditd_pid && new_pid) { - rcu_read_unlock(); audit_log_config_change("audit_pid", new_pid, auditd_pid, 0); return -EEXIST; } - rcu_read_unlock(); - - if (audit_enabled != AUDIT_OFF) - 
audit_log_config_change("audit_pid", new_pid, - auditd_pid, 1); if (new_pid) { /* register a new auditd connection */ - auditd_set(new_pid, - NETLINK_CB(skb).portid, - sock_net(NETLINK_CB(skb).sk)); + err = auditd_set(req_pid, + NETLINK_CB(skb).portid, + sock_net(NETLINK_CB(skb).sk)); + if (audit_enabled != AUDIT_OFF) + audit_log_config_change("audit_pid", + new_pid, + auditd_pid, + err ? 0 : 1); + if (err) + return err; + /* try to process any backlog */ wake_up_interruptible(&kauditd_wait); - } else + } else { + if (audit_enabled != AUDIT_OFF) + audit_log_config_change("audit_pid", + new_pid, + auditd_pid, 1); + /* unregister the auditd connection */ auditd_reset(); + } } if (s.mask & AUDIT_STATUS_RATE_LIMIT) { err = audit_set_rate_limit(s.rate_limit); @@ -1242,7 +1299,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) size--; audit_log_n_untrustedstring(ab, data, size); } - audit_set_portid(ab, NETLINK_CB(skb).portid); audit_log_end(ab); } break; @@ -1256,8 +1312,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) audit_log_end(ab); return -EPERM; } - err = audit_rule_change(msg_type, NETLINK_CB(skb).portid, - seq, data, nlmsg_len(nlh)); + err = audit_rule_change(msg_type, seq, data, nlmsg_len(nlh)); break; case AUDIT_LIST_RULES: err = audit_list_rules_send(skb, seq); @@ -1378,11 +1433,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return err < 0 ? err : 0; } -/* - * Get message from skb. Each message is processed by audit_receive_msg. - * Malformed skbs with wrong length are discarded silently. +/** + * audit_receive - receive messages from a netlink control socket + * @skb: the message buffer + * + * Parse the provided skb and deal with any messages that may be present, + * malformed skbs are discarded. */ -static void audit_receive_skb(struct sk_buff *skb) +static void audit_receive(struct sk_buff *skb) { struct nlmsghdr *nlh; /* @@ -1395,21 +1453,15 @@ static void audit_receive_skb(struct sk_buff *skb) nlh = nlmsg_hdr(skb); len = skb->len; + mutex_lock(&audit_cmd_mutex); while (nlmsg_ok(nlh, len)) { err = audit_receive_msg(skb, nlh); /* if err or if this message says it wants a response */ if (err || (nlh->nlmsg_flags & NLM_F_ACK)) - netlink_ack(skb, nlh, err); + netlink_ack(skb, nlh, err, NULL); nlh = nlmsg_next(nlh, &len); } -} - -/* Receive messages from netlink socket. 
*/ -static void audit_receive(struct sk_buff *skb) -{ - mutex_lock(&audit_cmd_mutex); - audit_receive_skb(skb); mutex_unlock(&audit_cmd_mutex); } @@ -1447,10 +1499,11 @@ static void __net_exit audit_net_exit(struct net *net) { struct audit_net *aunet = net_generic(net, audit_net_id); - rcu_read_lock(); - if (net == auditd_conn.net) - auditd_reset(); - rcu_read_unlock(); + /* NOTE: you would think that we would want to check the auditd + * connection and potentially reset it here if it lives in this + * namespace, but since the auditd connection tracking struct holds a + * reference to this namespace (see auditd_set()) we are only ever + * going to get here after that connection has been released */ netlink_kernel_release(aunet->sk); } @@ -1470,8 +1523,9 @@ static int __init audit_init(void) if (audit_initialized == AUDIT_DISABLED) return 0; - memset(&auditd_conn, 0, sizeof(auditd_conn)); - spin_lock_init(&auditd_conn.lock); + audit_buffer_cache = kmem_cache_create("audit_buffer", + sizeof(struct audit_buffer), + 0, SLAB_PANIC, NULL); skb_queue_head_init(&audit_queue); skb_queue_head_init(&audit_retry_queue); @@ -1538,60 +1592,33 @@ __setup("audit_backlog_limit=", audit_backlog_limit_set); static void audit_buffer_free(struct audit_buffer *ab) { - unsigned long flags; - if (!ab) return; kfree_skb(ab->skb); - spin_lock_irqsave(&audit_freelist_lock, flags); - if (audit_freelist_count > AUDIT_MAXFREE) - kfree(ab); - else { - audit_freelist_count++; - list_add(&ab->list, &audit_freelist); - } - spin_unlock_irqrestore(&audit_freelist_lock, flags); + kmem_cache_free(audit_buffer_cache, ab); } -static struct audit_buffer * audit_buffer_alloc(struct audit_context *ctx, - gfp_t gfp_mask, int type) +static struct audit_buffer *audit_buffer_alloc(struct audit_context *ctx, + gfp_t gfp_mask, int type) { - unsigned long flags; - struct audit_buffer *ab = NULL; - struct nlmsghdr *nlh; - - spin_lock_irqsave(&audit_freelist_lock, flags); - if (!list_empty(&audit_freelist)) { - ab = list_entry(audit_freelist.next, - struct audit_buffer, list); - list_del(&ab->list); - --audit_freelist_count; - } - spin_unlock_irqrestore(&audit_freelist_lock, flags); - - if (!ab) { - ab = kmalloc(sizeof(*ab), gfp_mask); - if (!ab) - goto err; - } + struct audit_buffer *ab; - ab->ctx = ctx; - ab->gfp_mask = gfp_mask; + ab = kmem_cache_alloc(audit_buffer_cache, gfp_mask); + if (!ab) + return NULL; ab->skb = nlmsg_new(AUDIT_BUFSIZ, gfp_mask); if (!ab->skb) goto err; + if (!nlmsg_put(ab->skb, 0, 0, type, 0, 0)) + goto err; - nlh = nlmsg_put(ab->skb, 0, 0, type, 0, 0); - if (!nlh) - goto out_kfree_skb; + ab->ctx = ctx; + ab->gfp_mask = gfp_mask; return ab; -out_kfree_skb: - kfree_skb(ab->skb); - ab->skb = NULL; err: audit_buffer_free(ab); return NULL; @@ -1622,10 +1649,10 @@ unsigned int audit_serial(void) } static inline void audit_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) + struct timespec64 *t, unsigned int *serial) { if (!ctx || !auditsc_get_stamp(ctx, t, serial)) { - *t = CURRENT_TIME; + ktime_get_real_ts64(t); *serial = audit_serial(); } } @@ -1649,7 +1676,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, int type) { struct audit_buffer *ab; - struct timespec t; + struct timespec64 t; unsigned int uninitialized_var(serial); if (audit_initialized != AUDIT_INITIALIZED) @@ -1702,8 +1729,8 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, } audit_get_stamp(ab->ctx, &t, &serial); - audit_log_format(ab, "audit(%lu.%03lu:%u): 
", - t.tv_sec, t.tv_nsec/1000000, serial); + audit_log_format(ab, "audit(%llu.%03lu:%u): ", + (unsigned long long)t.tv_sec, t.tv_nsec/1000000, serial); return ab; } diff --git a/kernel/audit.h b/kernel/audit.h index 0d87f8ab8778579dde21754e4ed647286f34e3ef..ddfce2ea4891221a49e9fc3e7f744920c1e67f8b 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -112,7 +112,7 @@ struct audit_context { enum audit_state state, current_state; unsigned int serial; /* serial number for record */ int major; /* syscall number */ - struct timespec ctime; /* time of syscall entry */ + struct timespec64 ctime; /* time of syscall entry */ unsigned long argv[4]; /* syscall arguments */ long return_code;/* syscall return code */ u64 prio; @@ -218,7 +218,7 @@ extern void audit_log_name(struct audit_context *context, struct audit_names *n, const struct path *path, int record_num, int *call_panic); -extern int auditd_test_task(const struct task_struct *task); +extern int auditd_test_task(struct task_struct *task); #define AUDIT_INODE_BUCKETS 32 extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS]; @@ -237,8 +237,7 @@ extern int audit_uid_comparator(kuid_t left, u32 op, kuid_t right); extern int audit_gid_comparator(kgid_t left, u32 op, kgid_t right); extern int parent_len(const char *path); extern int audit_compare_dname_path(const char *dname, const char *path, int plen); -extern struct sk_buff *audit_make_reply(__u32 portid, int seq, int type, - int done, int multi, +extern struct sk_buff *audit_make_reply(int seq, int type, int done, int multi, const void *payload, int size); extern void audit_panic(const char *message); diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 7ea57e516029d6b82a9e72adfea70d45f538cb2b..52f368b6561e984e5b93b8e71bc56cb80f3683aa 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -103,15 +103,15 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa goto out; } - fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark); + fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_group); audit_mark->mark.mask = AUDIT_FS_EVENTS; audit_mark->path = pathname; audit_update_mark(audit_mark, dentry->d_inode); audit_mark->rule = krule; - ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true); + ret = fsnotify_add_mark(&audit_mark->mark, inode, NULL, true); if (ret < 0) { - audit_fsnotify_mark_free(audit_mark); + fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); } out: @@ -168,7 +168,8 @@ static int audit_mark_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie) + const unsigned char *dname, u32 cookie, + struct fsnotify_iter_info *iter_info) { struct audit_fsnotify_mark *audit_mark; const struct inode *inode = NULL; @@ -187,7 +188,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group, default: BUG(); return 0; - }; + } if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) { if (audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL)) @@ -201,6 +202,7 @@ static int audit_mark_handle_event(struct fsnotify_group *group, static const struct fsnotify_ops audit_mark_fsnotify_ops = { .handle_event = audit_mark_handle_event, + .free_mark = audit_fsnotify_free_mark, }; static int __init audit_fsnotify_init(void) diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 
7b44195da81bb25a080b165402b74cf8c7f59bcc..011d46e5f73fb448634c603fc474f479fe3e790b 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -3,13 +3,14 @@ #include #include #include +#include #include struct audit_tree; struct audit_chunk; struct audit_tree { - atomic_t count; + refcount_t count; int goner; struct audit_chunk *root; struct list_head chunks; @@ -77,7 +78,7 @@ static struct audit_tree *alloc_tree(const char *s) tree = kmalloc(sizeof(struct audit_tree) + strlen(s) + 1, GFP_KERNEL); if (tree) { - atomic_set(&tree->count, 1); + refcount_set(&tree->count, 1); tree->goner = 0; INIT_LIST_HEAD(&tree->chunks); INIT_LIST_HEAD(&tree->rules); @@ -91,12 +92,12 @@ static struct audit_tree *alloc_tree(const char *s) static inline void get_tree(struct audit_tree *tree) { - atomic_inc(&tree->count); + refcount_inc(&tree->count); } static inline void put_tree(struct audit_tree *tree) { - if (atomic_dec_and_test(&tree->count)) + if (refcount_dec_and_test(&tree->count)) kfree_rcu(tree, head); } @@ -154,7 +155,7 @@ static struct audit_chunk *alloc_chunk(int count) INIT_LIST_HEAD(&chunk->owners[i].list); chunk->owners[i].index = i; } - fsnotify_init_mark(&chunk->mark, audit_tree_destroy_watch); + fsnotify_init_mark(&chunk->mark, audit_tree_group); chunk->mark.mask = FS_IN_IGNORED; return chunk; } @@ -163,33 +164,54 @@ enum {HASH_SIZE = 128}; static struct list_head chunk_hash_heads[HASH_SIZE]; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(hash_lock); -static inline struct list_head *chunk_hash(const struct inode *inode) +/* Function to return search key in our hash from inode. */ +static unsigned long inode_to_key(const struct inode *inode) { - unsigned long n = (unsigned long)inode / L1_CACHE_BYTES; + return (unsigned long)inode; +} + +/* + * Function to return search key in our hash from chunk. Key 0 is special and + * should never be present in the hash. + */ +static unsigned long chunk_to_key(struct audit_chunk *chunk) +{ + /* + * We have a reference to the mark so it should be attached to a + * connector. + */ + if (WARN_ON_ONCE(!chunk->mark.connector)) + return 0; + return (unsigned long)chunk->mark.connector->inode; +} + +static inline struct list_head *chunk_hash(unsigned long key) +{ + unsigned long n = key / L1_CACHE_BYTES; return chunk_hash_heads + n % HASH_SIZE; } /* hash_lock & entry->lock is held by caller */ static void insert_hash(struct audit_chunk *chunk) { - struct fsnotify_mark *entry = &chunk->mark; + unsigned long key = chunk_to_key(chunk); struct list_head *list; - if (!entry->inode) + if (!(chunk->mark.flags & FSNOTIFY_MARK_FLAG_ATTACHED)) return; - list = chunk_hash(entry->inode); + list = chunk_hash(key); list_add_rcu(&chunk->hash, list); } /* called under rcu_read_lock */ struct audit_chunk *audit_tree_lookup(const struct inode *inode) { - struct list_head *list = chunk_hash(inode); + unsigned long key = inode_to_key(inode); + struct list_head *list = chunk_hash(key); struct audit_chunk *p; list_for_each_entry_rcu(p, list, hash) { - /* mark.inode may have gone NULL, but who cares? */ - if (p->mark.inode == inode) { + if (chunk_to_key(p) == key) { atomic_long_inc(&p->refs); return p; } @@ -233,11 +255,15 @@ static void untag_chunk(struct node *p) mutex_lock(&entry->group->mark_mutex); spin_lock(&entry->lock); - if (chunk->dead || !entry->inode) { + /* + * mark_mutex protects mark from getting detached and thus also from + * mark->connector->inode getting NULL. 
+ */ + if (chunk->dead || !(entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { spin_unlock(&entry->lock); mutex_unlock(&entry->group->mark_mutex); if (new) - free_chunk(new); + fsnotify_put_mark(&new->mark); goto out; } @@ -261,7 +287,7 @@ static void untag_chunk(struct node *p) if (!new) goto Fallback; - if (fsnotify_add_mark_locked(&new->mark, entry->group, entry->inode, + if (fsnotify_add_mark_locked(&new->mark, entry->connector->inode, NULL, 1)) { fsnotify_put_mark(&new->mark); goto Fallback; @@ -327,7 +353,7 @@ static int create_chunk(struct inode *inode, struct audit_tree *tree) return -ENOMEM; entry = &chunk->mark; - if (fsnotify_add_mark(entry, audit_tree_group, inode, NULL, 0)) { + if (fsnotify_add_mark(entry, inode, NULL, 0)) { fsnotify_put_mark(entry); return -ENOSPC; } @@ -366,7 +392,8 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) struct node *p; int n; - old_entry = fsnotify_find_inode_mark(audit_tree_group, inode); + old_entry = fsnotify_find_mark(&inode->i_fsnotify_marks, + audit_tree_group); if (!old_entry) return create_chunk(inode, tree); @@ -393,17 +420,21 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree) mutex_lock(&old_entry->group->mark_mutex); spin_lock(&old_entry->lock); - if (!old_entry->inode) { + /* + * mark_mutex protects mark from getting detached and thus also from + * mark->connector->inode getting NULL. + */ + if (!(old_entry->flags & FSNOTIFY_MARK_FLAG_ATTACHED)) { /* old_entry is being shot, lets just lie */ spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); fsnotify_put_mark(old_entry); - free_chunk(chunk); + fsnotify_put_mark(&chunk->mark); return -ENOENT; } - if (fsnotify_add_mark_locked(chunk_entry, old_entry->group, - old_entry->inode, NULL, 1)) { + if (fsnotify_add_mark_locked(chunk_entry, + old_entry->connector->inode, NULL, 1)) { spin_unlock(&old_entry->lock); mutex_unlock(&old_entry->group->mark_mutex); fsnotify_put_mark(chunk_entry); @@ -588,7 +619,8 @@ int audit_remove_tree_rule(struct audit_krule *rule) static int compare_root(struct vfsmount *mnt, void *arg) { - return d_backing_inode(mnt->mnt_root) == arg; + return inode_to_key(d_backing_inode(mnt->mnt_root)) == + (unsigned long)arg; } void audit_trim_trees(void) @@ -623,9 +655,10 @@ void audit_trim_trees(void) list_for_each_entry(node, &tree->chunks, list) { struct audit_chunk *chunk = find_chunk(node); /* this could be NULL if the watch is dying else where... 
*/ - struct inode *inode = chunk->mark.inode; node->index |= 1U<<31; - if (iterate_mounts(compare_root, inode, root_mnt)) + if (iterate_mounts(compare_root, + (void *)chunk_to_key(chunk), + root_mnt)) node->index &= ~(1U<<31); } spin_unlock(&hash_lock); @@ -958,7 +991,8 @@ static int audit_tree_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *file_name, u32 cookie) + const unsigned char *file_name, u32 cookie, + struct fsnotify_iter_info *iter_info) { return 0; } @@ -979,6 +1013,7 @@ static void audit_tree_freeing_mark(struct fsnotify_mark *entry, struct fsnotify static const struct fsnotify_ops audit_tree_ops = { .handle_event = audit_tree_handle_event, .freeing_mark = audit_tree_freeing_mark, + .free_mark = audit_tree_destroy_watch, }; static int __init audit_tree_init(void) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index f79e4658433d45e9d32a299ebf3f427b508ada2c..62d686d965813aeb4b8ef34877e886e08bd71912 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -46,7 +47,7 @@ */ struct audit_watch { - atomic_t count; /* reference count */ + refcount_t count; /* reference count */ dev_t dev; /* associated superblock device */ char *path; /* insertion path */ unsigned long ino; /* associated inode number */ @@ -102,7 +103,7 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode) struct audit_parent *parent = NULL; struct fsnotify_mark *entry; - entry = fsnotify_find_inode_mark(audit_watch_group, inode); + entry = fsnotify_find_mark(&inode->i_fsnotify_marks, audit_watch_group); if (entry) parent = container_of(entry, struct audit_parent, mark); @@ -111,12 +112,12 @@ static inline struct audit_parent *audit_find_parent(struct inode *inode) void audit_get_watch(struct audit_watch *watch) { - atomic_inc(&watch->count); + refcount_inc(&watch->count); } void audit_put_watch(struct audit_watch *watch) { - if (atomic_dec_and_test(&watch->count)) { + if (refcount_dec_and_test(&watch->count)) { WARN_ON(watch->parent); WARN_ON(!list_empty(&watch->rules)); kfree(watch->path); @@ -157,9 +158,9 @@ static struct audit_parent *audit_init_parent(struct path *path) INIT_LIST_HEAD(&parent->watches); - fsnotify_init_mark(&parent->mark, audit_watch_free_mark); + fsnotify_init_mark(&parent->mark, audit_watch_group); parent->mark.mask = AUDIT_FS_WATCH; - ret = fsnotify_add_mark(&parent->mark, audit_watch_group, inode, NULL, 0); + ret = fsnotify_add_mark(&parent->mark, inode, NULL, 0); if (ret < 0) { audit_free_parent(parent); return ERR_PTR(ret); @@ -178,7 +179,7 @@ static struct audit_watch *audit_init_watch(char *path) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&watch->rules); - atomic_set(&watch->count, 1); + refcount_set(&watch->count, 1); watch->path = path; watch->dev = AUDIT_DEV_UNSET; watch->ino = AUDIT_INO_UNSET; @@ -472,7 +473,8 @@ static int audit_watch_handle_event(struct fsnotify_group *group, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, u32 mask, const void *data, int data_type, - const unsigned char *dname, u32 cookie) + const unsigned char *dname, u32 cookie, + struct fsnotify_iter_info *iter_info) { const struct inode *inode; struct audit_parent *parent; @@ -492,7 +494,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, BUG(); inode = NULL; break; - }; + } if (mask & (FS_CREATE|FS_MOVED_TO) && inode) 
audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0); @@ -506,6 +508,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group, static const struct fsnotify_ops audit_watch_fsnotify_ops = { .handle_event = audit_watch_handle_event, + .free_mark = audit_watch_free_mark, }; static int __init audit_watch_init(void) diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 880519d6cf2ad00fbb15a23e545a629f580ae6f8..0b0aa5854dac1ed41959f9383af95d4097ad4646 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -338,7 +338,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) entry->rule.listnr != AUDIT_FILTER_USER) return -EINVAL; break; - }; + } switch(f->type) { default: @@ -412,7 +412,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) if (entry->rule.listnr != AUDIT_FILTER_EXIT) return -EINVAL; break; - }; + } return 0; } @@ -1033,7 +1033,7 @@ int audit_del_rule(struct audit_entry *entry) } /* List rules using struct audit_rule_data. */ -static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q) +static void audit_list_rules(int seq, struct sk_buff_head *q) { struct sk_buff *skb; struct audit_krule *r; @@ -1048,15 +1048,15 @@ static void audit_list_rules(__u32 portid, int seq, struct sk_buff_head *q) data = audit_krule_to_data(r); if (unlikely(!data)) break; - skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, - 0, 1, data, + skb = audit_make_reply(seq, AUDIT_LIST_RULES, 0, 1, + data, sizeof(*data) + data->buflen); if (skb) skb_queue_tail(q, skb); kfree(data); } } - skb = audit_make_reply(portid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); + skb = audit_make_reply(seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); if (skb) skb_queue_tail(q, skb); } @@ -1085,13 +1085,11 @@ static void audit_log_rule_change(char *action, struct audit_krule *rule, int re /** * audit_rule_change - apply all rules to the specified message type * @type: audit message type - * @portid: target port id for netlink audit messages * @seq: netlink audit message sequence (serial) number * @data: payload data * @datasz: size of payload data */ -int audit_rule_change(int type, __u32 portid, int seq, void *data, - size_t datasz) +int audit_rule_change(int type, int seq, void *data, size_t datasz) { int err = 0; struct audit_entry *entry; @@ -1150,7 +1148,7 @@ int audit_list_rules_send(struct sk_buff *request_skb, int seq) skb_queue_head_init(&dest->q); mutex_lock(&audit_filter_mutex); - audit_list_rules(portid, seq, &dest->q); + audit_list_rules(seq, &dest->q); mutex_unlock(&audit_filter_mutex); tsk = kthread_run(audit_send_list, dest, "audit_send_list"); diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 1c2333155893fac54138a8b3f7311c2ca520992c..bb724baa7ac9443b9b0f56b8d046b5ab3de17a65 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include "audit.h" @@ -1532,7 +1533,7 @@ void __audit_syscall_entry(int major, unsigned long a1, unsigned long a2, return; context->serial = 0; - context->ctime = CURRENT_TIME; + ktime_get_real_ts64(&context->ctime); context->in_syscall = 1; context->current_state = state; context->ppid = 0; @@ -1596,7 +1597,7 @@ static inline void handle_one(const struct inode *inode) struct audit_tree_refs *p; struct audit_chunk *chunk; int count; - if (likely(hlist_empty(&inode->i_fsnotify_marks))) + if (likely(!inode->i_fsnotify_marks)) return; context = current->audit_context; p = context->trees; @@ -1639,7 +1640,7 @@ static 
void handle_path(const struct dentry *dentry) seq = read_seqbegin(&rename_lock); for(;;) { struct inode *inode = d_backing_inode(d); - if (inode && unlikely(!hlist_empty(&inode->i_fsnotify_marks))) { + if (inode && unlikely(inode->i_fsnotify_marks)) { struct audit_chunk *chunk; chunk = audit_tree_lookup(inode); if (chunk) { @@ -1941,13 +1942,13 @@ EXPORT_SYMBOL_GPL(__audit_inode_child); /** * auditsc_get_stamp - get local copies of audit_context values * @ctx: audit_context for the task - * @t: timespec to store time recorded in the audit_context + * @t: timespec64 to store time recorded in the audit_context * @serial: serial value that is recorded in the audit_context * * Also sets the context as auditable. */ int auditsc_get_stamp(struct audit_context *ctx, - struct timespec *t, unsigned int *serial) + struct timespec64 *t, unsigned int *serial) { if (!ctx->in_syscall) return 0; diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index e1ce4f4fd7fd47fda2c18776573c0f65479c6728..e1e5e658f2dbf887be70fbfc9492d4365aef5064 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,7 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o ifeq ($(CONFIG_PERF_EVENTS),y) obj-$(CONFIG_BPF_SYSCALL) += stackmap.o endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 6b6f41f0b21164a3cc2c26bb71be2b89098bbdbd..5e00b2333c26e3256a6f4dc51a2a58011a2c9db6 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016,2017 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -16,6 +17,8 @@ #include #include +#include "map_in_map.h" + static void bpf_array_free_percpu(struct bpf_array *array) { int i; @@ -113,6 +116,30 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) return array->value + array->elem_size * index; } +/* emit BPF instructions equivalent to C code of array_map_lookup_elem() */ +static u32 array_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + u32 elem_size = round_up(map->value_size, 8); + const int ret = BPF_REG_0; + const int map_ptr = BPF_REG_1; + const int index = BPF_REG_2; + + *insn++ = BPF_ALU64_IMM(BPF_ADD, map_ptr, offsetof(struct bpf_array, value)); + *insn++ = BPF_LDX_MEM(BPF_W, ret, index, 0); + *insn++ = BPF_JMP_IMM(BPF_JGE, ret, map->max_entries, 3); + + if (is_power_of_2(elem_size)) { + *insn++ = BPF_ALU64_IMM(BPF_LSH, ret, ilog2(elem_size)); + } else { + *insn++ = BPF_ALU64_IMM(BPF_MUL, ret, elem_size); + } + *insn++ = BPF_ALU64_REG(BPF_ADD, ret, map_ptr); + *insn++ = BPF_JMP_IMM(BPF_JA, 0, 0, 1); + *insn++ = BPF_MOV64_IMM(ret, 0); + return insn - insn_buf; +} + /* Called from eBPF program */ static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) { @@ -155,7 +182,7 @@ int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { struct bpf_array *array = container_of(map, struct bpf_array, map); - u32 index = *(u32 *)key; + u32 index = key ? 
*(u32 *)key : U32_MAX; u32 *next = (u32 *)next_key; if (index >= array->map.max_entries) { @@ -260,21 +287,17 @@ static void array_map_free(struct bpf_map *map) bpf_map_area_free(array); } -static const struct bpf_map_ops array_ops = { +const struct bpf_map_ops array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, .map_lookup_elem = array_map_lookup_elem, .map_update_elem = array_map_update_elem, .map_delete_elem = array_map_delete_elem, + .map_gen_lookup = array_map_gen_lookup, }; -static struct bpf_map_type_list array_type __ro_after_init = { - .ops = &array_ops, - .type = BPF_MAP_TYPE_ARRAY, -}; - -static const struct bpf_map_ops percpu_array_ops = { +const struct bpf_map_ops percpu_array_map_ops = { .map_alloc = array_map_alloc, .map_free = array_map_free, .map_get_next_key = array_map_get_next_key, @@ -283,19 +306,6 @@ static const struct bpf_map_ops percpu_array_ops = { .map_delete_elem = array_map_delete_elem, }; -static struct bpf_map_type_list percpu_array_type __ro_after_init = { - .ops = &percpu_array_ops, - .type = BPF_MAP_TYPE_PERCPU_ARRAY, -}; - -static int __init register_array_map(void) -{ - bpf_register_map_type(&array_type); - bpf_register_map_type(&percpu_array_type); - return 0; -} -late_initcall(register_array_map); - static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr) { /* only file descriptors can be stored in this type of map */ @@ -399,7 +409,7 @@ void bpf_fd_array_map_clear(struct bpf_map *map) fd_array_map_delete_elem(map, &i); } -static const struct bpf_map_ops prog_array_ops = { +const struct bpf_map_ops prog_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, @@ -409,18 +419,6 @@ static const struct bpf_map_ops prog_array_ops = { .map_fd_put_ptr = prog_fd_array_put_ptr, }; -static struct bpf_map_type_list prog_array_type __ro_after_init = { - .ops = &prog_array_ops, - .type = BPF_MAP_TYPE_PROG_ARRAY, -}; - -static int __init register_prog_array_map(void) -{ - bpf_register_map_type(&prog_array_type); - return 0; -} -late_initcall(register_prog_array_map); - static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file, struct file *map_file) { @@ -511,7 +509,7 @@ static void perf_event_fd_array_release(struct bpf_map *map, rcu_read_unlock(); } -static const struct bpf_map_ops perf_event_array_ops = { +const struct bpf_map_ops perf_event_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = fd_array_map_free, .map_get_next_key = array_map_get_next_key, @@ -522,18 +520,6 @@ static const struct bpf_map_ops perf_event_array_ops = { .map_release = perf_event_fd_array_release, }; -static struct bpf_map_type_list perf_event_array_type __ro_after_init = { - .ops = &perf_event_array_ops, - .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY, -}; - -static int __init register_perf_event_array_map(void) -{ - bpf_register_map_type(&perf_event_array_type); - return 0; -} -late_initcall(register_perf_event_array_map); - #ifdef CONFIG_CGROUPS static void *cgroup_fd_array_get_ptr(struct bpf_map *map, struct file *map_file /* not used */, @@ -554,7 +540,7 @@ static void cgroup_fd_array_free(struct bpf_map *map) fd_array_map_free(map); } -static const struct bpf_map_ops cgroup_array_ops = { +const struct bpf_map_ops cgroup_array_map_ops = { .map_alloc = fd_array_map_alloc, .map_free = cgroup_fd_array_free, .map_get_next_key = array_map_get_next_key, @@ -563,16 +549,53 @@ static const struct bpf_map_ops cgroup_array_ops = { 
.map_fd_get_ptr = cgroup_fd_array_get_ptr, .map_fd_put_ptr = cgroup_fd_array_put_ptr, }; +#endif -static struct bpf_map_type_list cgroup_array_type __ro_after_init = { - .ops = &cgroup_array_ops, - .type = BPF_MAP_TYPE_CGROUP_ARRAY, -}; +static struct bpf_map *array_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; -static int __init register_cgroup_array_map(void) + map = fd_array_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; +} + +static void array_of_map_free(struct bpf_map *map) { - bpf_register_map_type(&cgroup_array_type); - return 0; + /* map->inner_map_meta is only accessed by syscall which + * is protected by fdget/fdput. + */ + bpf_map_meta_free(map->inner_map_meta); + bpf_fd_array_map_clear(map); + fd_array_map_free(map); } -late_initcall(register_cgroup_array_map); -#endif + +static void *array_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = array_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +const struct bpf_map_ops array_of_maps_map_ops = { + .map_alloc = array_of_map_alloc, + .map_free = array_of_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = array_of_map_lookup_elem, + .map_delete_elem = fd_array_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = bpf_map_fd_put_ptr, +}; diff --git a/kernel/bpf/bpf_lru_list.c b/kernel/bpf/bpf_lru_list.c index f62d1d56f41d0c5f3e3ef2c12b7dea3eb6578a10..e6ef4401a13804b6a6604bf22671e0c9dc4d39b2 100644 --- a/kernel/bpf/bpf_lru_list.c +++ b/kernel/bpf/bpf_lru_list.c @@ -13,7 +13,7 @@ #define LOCAL_FREE_TARGET (128) #define LOCAL_NR_SCANS LOCAL_FREE_TARGET -#define PERCPU_FREE_TARGET (16) +#define PERCPU_FREE_TARGET (4) #define PERCPU_NR_SCANS PERCPU_FREE_TARGET /* Helpers to get the local list index */ diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index da0f53690295610ff5ae447ed98fb6e70c99c4f1..ea6033cba94721fd8ca080354c825771d93620fb 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -154,7 +154,7 @@ int __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent, /** * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering - * @sk: The socken sending or receiving traffic + * @sk: The socket sending or receiving traffic * @skb: The skb that is being sent or received * @type: The type of program to be exectuted * @@ -189,10 +189,13 @@ int __cgroup_bpf_run_filter_skb(struct sock *sk, prog = rcu_dereference(cgrp->bpf.effective[type]); if (prog) { unsigned int offset = skb->data - skb_network_header(skb); + struct sock *save_sk = skb->sk; + skb->sk = sk; __skb_push(skb, offset); ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 
0 : -EPERM; __skb_pull(skb, offset); + skb->sk = save_sk; } rcu_read_unlock(); diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index b4f1cb0c5ac7104c3f12f9ed5c1e3fe159c57824..dedf367f59bba529ded5603d61782bccb80782cb 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -76,8 +76,7 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog_aux *aux; struct bpf_prog *fp; @@ -107,8 +106,7 @@ EXPORT_SYMBOL_GPL(bpf_prog_alloc); struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; u32 pages, delta; int ret; @@ -394,27 +392,23 @@ static bool bpf_prog_kallsyms_verify_off(const struct bpf_prog *fp) void bpf_prog_kallsyms_add(struct bpf_prog *fp) { - unsigned long flags; - if (!bpf_prog_kallsyms_candidate(fp) || !capable(CAP_SYS_ADMIN)) return; - spin_lock_irqsave(&bpf_lock, flags); + spin_lock_bh(&bpf_lock); bpf_prog_ksym_node_add(fp->aux); - spin_unlock_irqrestore(&bpf_lock, flags); + spin_unlock_bh(&bpf_lock); } void bpf_prog_kallsyms_del(struct bpf_prog *fp) { - unsigned long flags; - if (!bpf_prog_kallsyms_candidate(fp)) return; - spin_lock_irqsave(&bpf_lock, flags); + spin_lock_bh(&bpf_lock); bpf_prog_ksym_node_del(fp->aux); - spin_unlock_irqrestore(&bpf_lock, flags); + spin_unlock_bh(&bpf_lock); } static struct bpf_prog *bpf_prog_kallsyms_find(unsigned long addr) @@ -659,8 +653,7 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, gfp_t gfp_extra_flags) { - gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO | - gfp_extra_flags; + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO | gfp_extra_flags; struct bpf_prog *fp; fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 361a69dfe5434d6afb554477b899e91db90fb29f..004334ea13ba3f56f10b33e9a1a50a97906000e4 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -16,6 +16,7 @@ #include <linux/rculist_nulls.h> #include "percpu_freelist.h" #include "bpf_lru_list.h" +#include "map_in_map.h" struct bucket { struct hlist_nulls_head head; @@ -86,6 +87,11 @@ static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size return *(void __percpu **)(l->key + key_size); } +static void *fd_htab_map_get_ptr(const struct bpf_map *map, struct htab_elem *l) +{ + return *(void **)(l->key + roundup(map->key_size, 8)); +} + static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) { return (struct htab_elem *) (htab->elems + i * htab->elem_size); @@ -426,7 +432,11 @@ static struct htab_elem *lookup_nulls_elem_raw(struct hlist_nulls_head *head, return NULL; } -/* Called from syscall or from eBPF program */ +/* Called from syscall or from eBPF program directly, so + * arguments have to match bpf_map_lookup_elem() exactly. + * The return value is adjusted by BPF instructions + * in htab_map_gen_lookup().
+ */ static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); @@ -458,6 +468,30 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) return NULL; } +/* inline bpf_map_lookup_elem() call. + * Instead of: + * bpf_prog + * bpf_map_lookup_elem + * map->ops->map_lookup_elem + * htab_map_lookup_elem + * __htab_map_lookup_elem + * do: + * bpf_prog + * __htab_map_lookup_elem + */ +static u32 htab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf) +{ + struct bpf_insn *insn = insn_buf; + const int ret = BPF_REG_0; + + *insn++ = BPF_EMIT_CALL((u64 (*)(u64, u64, u64, u64, u64))__htab_map_lookup_elem); + *insn++ = BPF_JMP_IMM(BPF_JEQ, ret, 0, 1); + *insn++ = BPF_ALU64_IMM(BPF_ADD, ret, + offsetof(struct htab_elem, key) + + round_up(map->key_size, 8)); + return insn - insn_buf; +} + static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key) { struct htab_elem *l = __htab_map_lookup_elem(map, key); @@ -506,12 +540,15 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) struct hlist_nulls_head *head; struct htab_elem *l, *next_l; u32 hash, key_size; - int i; + int i = 0; WARN_ON_ONCE(!rcu_read_lock_held()); key_size = map->key_size; + if (!key) + goto find_first_elem; + hash = htab_map_hash(key, key_size); head = select_bucket(htab, hash); @@ -519,10 +556,8 @@ static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key) /* lookup the key */ l = lookup_nulls_elem_raw(head, hash, key, key_size, htab->n_buckets); - if (!l) { - i = 0; + if (!l) goto find_first_elem; - } /* key was found, get next key in the same bucket */ next_l = hlist_nulls_entry_safe(rcu_dereference_raw(hlist_nulls_next_rcu(&l->hash_node)), @@ -582,6 +617,14 @@ static void htab_elem_free_rcu(struct rcu_head *head) static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) { + struct bpf_map *map = &htab->map; + + if (map->ops->map_fd_put_ptr) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + if (htab_is_prealloc(htab)) { pcpu_freelist_push(&htab->freelist, &l->fnode); } else { @@ -1027,6 +1070,7 @@ static void delete_all_elements(struct bpf_htab *htab) } } } + /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -1053,21 +1097,17 @@ static void htab_map_free(struct bpf_map *map) kfree(htab); } -static const struct bpf_map_ops htab_ops = { +const struct bpf_map_ops htab_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, .map_lookup_elem = htab_map_lookup_elem, .map_update_elem = htab_map_update_elem, .map_delete_elem = htab_map_delete_elem, + .map_gen_lookup = htab_map_gen_lookup, }; -static struct bpf_map_type_list htab_type __ro_after_init = { - .ops = &htab_ops, - .type = BPF_MAP_TYPE_HASH, -}; - -static const struct bpf_map_ops htab_lru_ops = { +const struct bpf_map_ops htab_lru_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1076,11 +1116,6 @@ static const struct bpf_map_ops htab_lru_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_type __ro_after_init = { - .ops = &htab_lru_ops, - .type = BPF_MAP_TYPE_LRU_HASH, -}; - /* Called from eBPF program */ static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) { @@ -1154,7 +1189,7 @@ int 
bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, return ret; } -static const struct bpf_map_ops htab_percpu_ops = { +const struct bpf_map_ops htab_percpu_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1163,12 +1198,7 @@ static const struct bpf_map_ops htab_percpu_ops = { .map_delete_elem = htab_map_delete_elem, }; -static struct bpf_map_type_list htab_percpu_type __ro_after_init = { - .ops = &htab_percpu_ops, - .type = BPF_MAP_TYPE_PERCPU_HASH, -}; - -static const struct bpf_map_ops htab_lru_percpu_ops = { +const struct bpf_map_ops htab_lru_percpu_map_ops = { .map_alloc = htab_map_alloc, .map_free = htab_map_free, .map_get_next_key = htab_map_get_next_key, @@ -1177,17 +1207,102 @@ static const struct bpf_map_ops htab_lru_percpu_ops = { .map_delete_elem = htab_lru_map_delete_elem, }; -static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = { - .ops = &htab_lru_percpu_ops, - .type = BPF_MAP_TYPE_LRU_PERCPU_HASH, -}; +static struct bpf_map *fd_htab_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map; + + if (attr->value_size != sizeof(u32)) + return ERR_PTR(-EINVAL); + + /* pointer is stored internally */ + attr->value_size = sizeof(void *); + map = htab_map_alloc(attr); + attr->value_size = sizeof(u32); -static int __init register_htab_map(void) + return map; +} + +static void fd_htab_map_free(struct bpf_map *map) { - bpf_register_map_type(&htab_type); - bpf_register_map_type(&htab_percpu_type); - bpf_register_map_type(&htab_lru_type); - bpf_register_map_type(&htab_lru_percpu_type); - return 0; + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct hlist_nulls_node *n; + struct hlist_nulls_head *head; + struct htab_elem *l; + int i; + + for (i = 0; i < htab->n_buckets; i++) { + head = select_bucket(htab, i); + + hlist_nulls_for_each_entry_safe(l, n, head, hash_node) { + void *ptr = fd_htab_map_get_ptr(map, l); + + map->ops->map_fd_put_ptr(ptr); + } + } + + htab_map_free(map); +} + +/* only called from syscall */ +int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file, + void *key, void *value, u64 map_flags) +{ + void *ptr; + int ret; + u32 ufd = *(u32 *)value; + + ptr = map->ops->map_fd_get_ptr(map, map_file, ufd); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + ret = htab_map_update_elem(map, key, &ptr, map_flags); + if (ret) + map->ops->map_fd_put_ptr(ptr); + + return ret; +} + +static struct bpf_map *htab_of_map_alloc(union bpf_attr *attr) +{ + struct bpf_map *map, *inner_map_meta; + + inner_map_meta = bpf_map_meta_alloc(attr->inner_map_fd); + if (IS_ERR(inner_map_meta)) + return inner_map_meta; + + map = fd_htab_map_alloc(attr); + if (IS_ERR(map)) { + bpf_map_meta_free(inner_map_meta); + return map; + } + + map->inner_map_meta = inner_map_meta; + + return map; } -late_initcall(register_htab_map); + +static void *htab_of_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_map **inner_map = htab_map_lookup_elem(map, key); + + if (!inner_map) + return NULL; + + return READ_ONCE(*inner_map); +} + +static void htab_of_map_free(struct bpf_map *map) +{ + bpf_map_meta_free(map->inner_map_meta); + fd_htab_map_free(map); +} + +const struct bpf_map_ops htab_of_maps_map_ops = { + .map_alloc = htab_of_map_alloc, + .map_free = htab_of_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_of_map_lookup_elem, + .map_delete_elem = htab_map_delete_elem, + .map_fd_get_ptr = bpf_map_fd_get_ptr, + .map_fd_put_ptr = 
bpf_map_fd_put_ptr, +}; diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index fddcae801724be149b73794ab42f9f177ecf4f56..9bbd33497d3d0846de8ebd7b74828e5566045e68 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -429,7 +429,7 @@ static int bpf_parse_options(char *data, struct bpf_mount_opts *opts) static int bpf_fill_super(struct super_block *sb, void *data, int silent) { - static struct tree_descr bpf_rfiles[] = { { "" } }; + static const struct tree_descr bpf_rfiles[] = { { "" } }; struct bpf_mount_opts opts; struct inode *inode; int ret; diff --git a/kernel/bpf/lpm_trie.c b/kernel/bpf/lpm_trie.c index b37bd9ab7f574242722c1d9b0503b896019488b2..39cfafd895b80d2f0fb5054626af001949a8ad98 100644 --- a/kernel/bpf/lpm_trie.c +++ b/kernel/bpf/lpm_trie.c @@ -505,7 +505,7 @@ static int trie_get_next_key(struct bpf_map *map, void *key, void *next_key) return -ENOTSUPP; } -static const struct bpf_map_ops trie_ops = { +const struct bpf_map_ops trie_map_ops = { .map_alloc = trie_alloc, .map_free = trie_free, .map_get_next_key = trie_get_next_key, @@ -513,15 +513,3 @@ static const struct bpf_map_ops trie_ops = { .map_update_elem = trie_update_elem, .map_delete_elem = trie_delete_elem, }; - -static struct bpf_map_type_list trie_type __ro_after_init = { - .ops = &trie_ops, - .type = BPF_MAP_TYPE_LPM_TRIE, -}; - -static int __init register_trie_map(void) -{ - bpf_register_map_type(&trie_type); - return 0; -} -late_initcall(register_trie_map); diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c new file mode 100644 index 0000000000000000000000000000000000000000..59bcdf821ae47477f2a058af3e6918f0ce222325 --- /dev/null +++ b/kernel/bpf/map_in_map.c @@ -0,0 +1,97 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/slab.h> +#include <linux/bpf.h> + +#include "map_in_map.h" + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) +{ + struct bpf_map *inner_map, *inner_map_meta; + struct fd f; + + f = fdget(inner_map_ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + /* prog_array->owner_prog_type and owner_jited + * are runtime bindings. Doing a static check alone + * in the verifier is not enough.
+ */ + if (inner_map->map_type == BPF_MAP_TYPE_PROG_ARRAY) { + fdput(f); + return ERR_PTR(-ENOTSUPP); + } + + /* Does not support >1 level map-in-map */ + if (inner_map->inner_map_meta) { + fdput(f); + return ERR_PTR(-EINVAL); + } + + inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + if (!inner_map_meta) { + fdput(f); + return ERR_PTR(-ENOMEM); + } + + inner_map_meta->map_type = inner_map->map_type; + inner_map_meta->key_size = inner_map->key_size; + inner_map_meta->value_size = inner_map->value_size; + inner_map_meta->map_flags = inner_map->map_flags; + inner_map_meta->ops = inner_map->ops; + inner_map_meta->max_entries = inner_map->max_entries; + + fdput(f); + return inner_map_meta; +} + +void bpf_map_meta_free(struct bpf_map *map_meta) +{ + kfree(map_meta); +} + +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1) +{ + /* No need to compare ops because it is covered by map_type */ + return meta0->map_type == meta1->map_type && + meta0->key_size == meta1->key_size && + meta0->value_size == meta1->value_size && + meta0->map_flags == meta1->map_flags && + meta0->max_entries == meta1->max_entries; +} + +void *bpf_map_fd_get_ptr(struct bpf_map *map, + struct file *map_file /* not used */, + int ufd) +{ + struct bpf_map *inner_map; + struct fd f; + + f = fdget(ufd); + inner_map = __bpf_map_get(f); + if (IS_ERR(inner_map)) + return inner_map; + + if (bpf_map_meta_equal(map->inner_map_meta, inner_map)) + inner_map = bpf_map_inc(inner_map, false); + else + inner_map = ERR_PTR(-EINVAL); + + fdput(f); + return inner_map; +} + +void bpf_map_fd_put_ptr(void *ptr) +{ + /* ptr->ops->map_free() has to go through one + * rcu grace period by itself. + */ + bpf_map_put(ptr); +} diff --git a/kernel/bpf/map_in_map.h b/kernel/bpf/map_in_map.h new file mode 100644 index 0000000000000000000000000000000000000000..177fadb689dca9075ff9c43055d392821908ba6d --- /dev/null +++ b/kernel/bpf/map_in_map.h @@ -0,0 +1,23 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
+ */ +#ifndef __MAP_IN_MAP_H__ +#define __MAP_IN_MAP_H__ + +#include <linux/types.h> + +struct file; +struct bpf_map; + +struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd); +void bpf_map_meta_free(struct bpf_map *map_meta); +bool bpf_map_meta_equal(const struct bpf_map *meta0, + const struct bpf_map *meta1); +void *bpf_map_fd_get_ptr(struct bpf_map *map, struct file *map_file, + int ufd); +void bpf_map_fd_put_ptr(void *ptr); + +#endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c index 22aa45cd0324e320b9cd8b0f89bf9befdaae74b6..4dfd6f2ec2f9725681f490971e60acc8033c64b6 100644 --- a/kernel/bpf/stackmap.c +++ b/kernel/bpf/stackmap.c @@ -264,7 +264,7 @@ static void stack_map_free(struct bpf_map *map) put_callchain_buffers(); } -static const struct bpf_map_ops stack_map_ops = { +const struct bpf_map_ops stack_map_ops = { .map_alloc = stack_map_alloc, .map_free = stack_map_free, .map_get_next_key = stack_map_get_next_key, @@ -272,15 +272,3 @@ static const struct bpf_map_ops stack_map_ops = { .map_update_elem = stack_map_update_elem, .map_delete_elem = stack_map_delete_elem, }; - -static struct bpf_map_type_list stack_map_type __ro_after_init = { - .ops = &stack_map_ops, - .type = BPF_MAP_TYPE_STACK_TRACE, -}; - -static int __init register_stack_map(void) -{ - bpf_register_map_type(&stack_map_type); - return 0; -} -late_initcall(register_stack_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 7af0dcc5d7555679cea6c08395ab54710e7066e6..fd2411fd69148cff375120316fba0a0e033541f6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -27,30 +27,29 @@ DEFINE_PER_CPU(int, bpf_prog_active); int sysctl_unprivileged_bpf_disabled __read_mostly; -static LIST_HEAD(bpf_map_types); +static const struct bpf_map_ops * const bpf_map_types[] = { +#define BPF_PROG_TYPE(_id, _ops) +#define BPF_MAP_TYPE(_id, _ops) \ + [_id] = &_ops, +#include <linux/bpf_types.h> +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; static struct bpf_map *find_and_alloc_map(union bpf_attr *attr) { - struct bpf_map_type_list *tl; struct bpf_map *map; - list_for_each_entry(tl, &bpf_map_types, list_node) { - if (tl->type == attr->map_type) { - map = tl->ops->map_alloc(attr); - if (IS_ERR(map)) - return map; - map->ops = tl->ops; - map->map_type = attr->map_type; - return map; - } - } - return ERR_PTR(-EINVAL); -} + if (attr->map_type >= ARRAY_SIZE(bpf_map_types) || + !bpf_map_types[attr->map_type]) + return ERR_PTR(-EINVAL); -/* boot time registration of different map implementations */ -void bpf_register_map_type(struct bpf_map_type_list *tl) -{ - list_add(&tl->list_node, &bpf_map_types); + map = bpf_map_types[attr->map_type]->map_alloc(attr); + if (IS_ERR(map)) + return map; + map->ops = bpf_map_types[attr->map_type]; + map->map_type = attr->map_type; + return map; } void *bpf_map_area_alloc(size_t size) @@ -68,8 +67,7 @@ void *bpf_map_area_alloc(size_t size) return area; } - return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags, - PAGE_KERNEL); + return __vmalloc(size, GFP_KERNEL | flags, PAGE_KERNEL); } void bpf_map_area_free(void *area) @@ -215,7 +213,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD map_flags +#define BPF_MAP_CREATE_LAST_FIELD inner_map_fd /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -352,6 +350,9 @@ static int map_lookup_elem(union bpf_attr *attr) err = bpf_percpu_array_copy(map, key, value); } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { err = 
bpf_stackmap_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + err = -ENOTSUPP; } else { rcu_read_lock(); ptr = map->ops->map_lookup_elem(map, key); @@ -438,11 +439,17 @@ static int map_update_elem(union bpf_attr *attr) err = bpf_percpu_array_update(map, key, value, attr->flags); } else if (map->map_type == BPF_MAP_TYPE_PERF_EVENT_ARRAY || map->map_type == BPF_MAP_TYPE_PROG_ARRAY || - map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY) { + map->map_type == BPF_MAP_TYPE_CGROUP_ARRAY || + map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) { rcu_read_lock(); err = bpf_fd_array_map_update_elem(map, f.file, key, value, attr->flags); rcu_read_unlock(); + } else if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { + rcu_read_lock(); + err = bpf_fd_htab_map_update_elem(map, f.file, key, value, + attr->flags); + rcu_read_unlock(); } else { rcu_read_lock(); err = map->ops->map_update_elem(map, key, value, attr->flags); @@ -528,14 +535,18 @@ static int map_get_next_key(union bpf_attr *attr) if (IS_ERR(map)) return PTR_ERR(map); - err = -ENOMEM; - key = kmalloc(map->key_size, GFP_USER); - if (!key) - goto err_put; + if (ukey) { + err = -ENOMEM; + key = kmalloc(map->key_size, GFP_USER); + if (!key) + goto err_put; - err = -EFAULT; - if (copy_from_user(key, ukey, map->key_size) != 0) - goto free_key; + err = -EFAULT; + if (copy_from_user(key, ukey, map->key_size) != 0) + goto free_key; + } else { + key = NULL; + } err = -ENOMEM; next_key = kmalloc(map->key_size, GFP_USER); @@ -564,79 +575,23 @@ static int map_get_next_key(union bpf_attr *attr) return err; } -static LIST_HEAD(bpf_prog_types); +static const struct bpf_verifier_ops * const bpf_prog_types[] = { +#define BPF_PROG_TYPE(_id, _ops) \ + [_id] = &_ops, +#define BPF_MAP_TYPE(_id, _ops) +#include <linux/bpf_types.h> +#undef BPF_PROG_TYPE +#undef BPF_MAP_TYPE +}; static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog) { - struct bpf_prog_type_list *tl; - - list_for_each_entry(tl, &bpf_prog_types, list_node) { - if (tl->type == type) { - prog->aux->ops = tl->ops; - prog->type = type; - return 0; - } - } - - return -EINVAL; -} - -void bpf_register_prog_type(struct bpf_prog_type_list *tl) -{ - list_add(&tl->list_node, &bpf_prog_types); -} - -/* fixup insn->imm field of bpf_call instructions: - * if (insn->imm == BPF_FUNC_map_lookup_elem) - * insn->imm = bpf_map_lookup_elem - __bpf_call_base; - * else if (insn->imm == BPF_FUNC_map_update_elem) - * insn->imm = bpf_map_update_elem - __bpf_call_base; - * else ... 
- * - * this function is called after eBPF program passed verification - */ -static void fixup_bpf_calls(struct bpf_prog *prog) -{ - const struct bpf_func_proto *fn; - int i; + if (type >= ARRAY_SIZE(bpf_prog_types) || !bpf_prog_types[type]) + return -EINVAL; - for (i = 0; i < prog->len; i++) { - struct bpf_insn *insn = &prog->insnsi[i]; - - if (insn->code == (BPF_JMP | BPF_CALL)) { - /* we reach here when program has bpf_call instructions - * and it passed bpf_check(), means that - * ops->get_func_proto must have been supplied, check it - */ - BUG_ON(!prog->aux->ops->get_func_proto); - - if (insn->imm == BPF_FUNC_get_route_realm) - prog->dst_needed = 1; - if (insn->imm == BPF_FUNC_get_prandom_u32) - bpf_user_rnd_init_once(); - if (insn->imm == BPF_FUNC_xdp_adjust_head) - prog->xdp_adjust_head = 1; - if (insn->imm == BPF_FUNC_tail_call) { - /* mark bpf_tail_call as different opcode - * to avoid conditional branch in - * interpeter for every normal call - * and to prevent accidental JITing by - * JIT compiler that doesn't support - * bpf_tail_call yet - */ - insn->imm = 0; - insn->code |= BPF_X; - continue; - } - - fn = prog->aux->ops->get_func_proto(insn->imm); - /* all functions that have prototype and verifier allowed - * programs to call them, must be real in-kernel functions - */ - BUG_ON(!fn->func); - insn->imm = fn->func - __bpf_call_base; - } - } + prog->aux->ops = bpf_prog_types[type]; + prog->type = type; + return 0; } /* drop refcnt on maps used by eBPF program and free auxilary data */ @@ -892,9 +847,6 @@ static int bpf_prog_load(union bpf_attr *attr) if (err < 0) goto free_used_maps; - /* fixup BPF_CALL->imm field */ - fixup_bpf_calls(prog); - /* eBPF program is ready to be JITed */ prog = bpf_prog_select_runtime(prog, &err); if (err < 0) @@ -1020,6 +972,28 @@ static int bpf_prog_detach(const union bpf_attr *attr) } #endif /* CONFIG_CGROUP_BPF */ +#define BPF_PROG_TEST_RUN_LAST_FIELD test.duration + +static int bpf_prog_test_run(const union bpf_attr *attr, + union bpf_attr __user *uattr) +{ + struct bpf_prog *prog; + int ret = -ENOTSUPP; + + if (CHECK_ATTR(BPF_PROG_TEST_RUN)) + return -EINVAL; + + prog = bpf_prog_get(attr->test.prog_fd); + if (IS_ERR(prog)) + return PTR_ERR(prog); + + if (prog->aux->ops->test_run) + ret = prog->aux->ops->test_run(prog, attr, uattr); + + bpf_prog_put(prog); + return ret; +} + SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size) { union bpf_attr attr = {}; @@ -1086,7 +1060,6 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz case BPF_OBJ_GET: err = bpf_obj_get(&attr); break; - #ifdef CONFIG_CGROUP_BPF case BPF_PROG_ATTACH: err = bpf_prog_attach(&attr); @@ -1095,7 +1068,9 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz err = bpf_prog_detach(&attr); break; #endif - + case BPF_PROG_TEST_RUN: + err = bpf_prog_test_run(&attr, uattr); + break; default: err = -EINVAL; break; diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a834068a400e279f963097489ed165c6ad1301b2..c5b56c92f8e255d1b13634ad07c460969484f2f4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -143,6 +143,8 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 65536 #define BPF_COMPLEXITY_LIMIT_STACK 1024 +#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) + struct bpf_call_arg_meta { struct bpf_map *map_ptr; bool raw_mode; @@ -296,7 +298,8 @@ static const char *const bpf_jmp_string[16] = { [BPF_EXIT >> 4] = "exit", }; -static void 
print_bpf_insn(struct bpf_insn *insn) +static void print_bpf_insn(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) { u8 class = BPF_CLASS(insn->code); @@ -360,9 +363,19 @@ static void print_bpf_insn(struct bpf_insn *insn) insn->code, bpf_ldst_string[BPF_SIZE(insn->code) >> 3], insn->src_reg, insn->imm); - } else if (BPF_MODE(insn->code) == BPF_IMM) { - verbose("(%02x) r%d = 0x%x\n", - insn->code, insn->dst_reg, insn->imm); + } else if (BPF_MODE(insn->code) == BPF_IMM && + BPF_SIZE(insn->code) == BPF_DW) { + /* At this point, we already made sure that the second + * part of the ldimm64 insn is accessible. + */ + u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm; + bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD; + + if (map_ptr && !env->allow_ptr_leaks) + imm = 0; + + verbose("(%02x) r%d = 0x%llx\n", insn->code, + insn->dst_reg, (unsigned long long)imm); } else { verbose("BUG_ld_%02x\n", insn->code); return; @@ -1215,6 +1228,10 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) func_id != BPF_FUNC_current_task_under_cgroup) goto error; break; + case BPF_MAP_TYPE_ARRAY_OF_MAPS: + case BPF_MAP_TYPE_HASH_OF_MAPS: + if (func_id != BPF_FUNC_map_lookup_elem) + goto error; default: break; } @@ -1291,7 +1308,7 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) } } -static int check_call(struct bpf_verifier_env *env, int func_id) +static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; @@ -1375,6 +1392,8 @@ static int check_call(struct bpf_verifier_env *env, int func_id) } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { + struct bpf_insn_aux_data *insn_aux; + regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; regs[BPF_REG_0].max_value = regs[BPF_REG_0].min_value = 0; /* remember map_ptr, so that check_map_access() @@ -1387,6 +1406,11 @@ static int check_call(struct bpf_verifier_env *env, int func_id) } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; + insn_aux = &env->insn_aux_data[insn_idx]; + if (!insn_aux->map_ptr) + insn_aux->map_ptr = meta.map_ptr; + else if (insn_aux->map_ptr != meta.map_ptr) + insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { verbose("unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -1909,6 +1933,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) dst_reg->type = PTR_TO_STACK; dst_reg->imm = insn->imm; return 0; + } else if (opcode == BPF_ADD && + BPF_CLASS(insn->code) == BPF_ALU64 && + dst_reg->type == PTR_TO_STACK && + ((BPF_SRC(insn->code) == BPF_X && + regs[insn->src_reg].type == CONST_IMM) || + BPF_SRC(insn->code) == BPF_K)) { + if (BPF_SRC(insn->code) == BPF_X) + dst_reg->imm += regs[insn->src_reg].imm; + else + dst_reg->imm += insn->imm; + return 0; } else if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && (dst_reg->type == PTR_TO_PACKET || @@ -2112,14 +2147,19 @@ static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id, struct bpf_reg_state *reg = ®s[regno]; if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) { - reg->type = type; + if (type == UNKNOWN_VALUE) { + __mark_reg_unknown_value(regs, regno); + } else if (reg->map_ptr->inner_map_meta) { + reg->type = CONST_PTR_TO_MAP; + reg->map_ptr = reg->map_ptr->inner_map_meta; + } else { + reg->type = type; + } /* We don't need id 
from this point onwards anymore, thus we * should better reset it, so that state pruning has chances * to take effect. */ reg->id = 0; - if (type == UNKNOWN_VALUE) - __mark_reg_unknown_value(regs, regno); } } @@ -2824,7 +2864,7 @@ static int do_check(struct bpf_verifier_env *env) if (log_level) { verbose("%d: ", insn_idx); - print_bpf_insn(insn); + print_bpf_insn(env, insn); } err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); @@ -2960,7 +3000,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - err = check_call(env, insn->imm); + err = check_call(env, insn->imm, insn_idx); if (err) return err; @@ -3044,16 +3084,33 @@ static int do_check(struct bpf_verifier_env *env) return 0; } +static int check_map_prealloc(struct bpf_map *map) +{ + return (map->map_type != BPF_MAP_TYPE_HASH && + map->map_type != BPF_MAP_TYPE_PERCPU_HASH && + map->map_type != BPF_MAP_TYPE_HASH_OF_MAPS) || + !(map->map_flags & BPF_F_NO_PREALLOC); +} + static int check_map_prog_compatibility(struct bpf_map *map, struct bpf_prog *prog) { - if (prog->type == BPF_PROG_TYPE_PERF_EVENT && - (map->map_type == BPF_MAP_TYPE_HASH || - map->map_type == BPF_MAP_TYPE_PERCPU_HASH) && - (map->map_flags & BPF_F_NO_PREALLOC)) { - verbose("perf_event programs can only use preallocated hash map\n"); - return -EINVAL; + /* Make sure that BPF_PROG_TYPE_PERF_EVENT programs only use + * preallocated hash maps, since doing memory allocation + * in overflow_handler can crash depending on where nmi got + * triggered. + */ + if (prog->type == BPF_PROG_TYPE_PERF_EVENT) { + if (!check_map_prealloc(map)) { + verbose("perf_event programs can only use preallocated hash map\n"); + return -EINVAL; + } + if (map->inner_map_meta && + !check_map_prealloc(map->inner_map_meta)) { + verbose("perf_event programs can only use preallocated inner hash map\n"); + return -EINVAL; + } } return 0; } @@ -3182,6 +3239,41 @@ static void convert_pseudo_ld_imm64(struct bpf_verifier_env *env) insn->src_reg = 0; } +/* single env->prog->insnsi[off] instruction was replaced with the range + * insnsi[off, off + cnt). 
Adjust corresponding insn_aux_data by copying + * [0, off) and [off, end) to new locations, so the patched range stays zero + */ +static int adjust_insn_aux_data(struct bpf_verifier_env *env, u32 prog_len, + u32 off, u32 cnt) +{ + struct bpf_insn_aux_data *new_data, *old_data = env->insn_aux_data; + + if (cnt == 1) + return 0; + new_data = vzalloc(sizeof(struct bpf_insn_aux_data) * prog_len); + if (!new_data) + return -ENOMEM; + memcpy(new_data, old_data, sizeof(struct bpf_insn_aux_data) * off); + memcpy(new_data + off + cnt - 1, old_data + off, + sizeof(struct bpf_insn_aux_data) * (prog_len - off - cnt + 1)); + env->insn_aux_data = new_data; + vfree(old_data); + return 0; +} + +static struct bpf_prog *bpf_patch_insn_data(struct bpf_verifier_env *env, u32 off, + const struct bpf_insn *patch, u32 len) +{ + struct bpf_prog *new_prog; + + new_prog = bpf_patch_insn_single(env->prog, off, patch, len); + if (!new_prog) + return NULL; + if (adjust_insn_aux_data(env, new_prog->len, off, len)) + return NULL; + return new_prog; +} + /* convert load instructions that access fields of 'struct __sk_buff' * into sequence of instructions that access fields of 'struct sk_buff' */ @@ -3201,10 +3293,10 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) verbose("bpf verifier is misconfigured\n"); return -EINVAL; } else if (cnt) { - new_prog = bpf_patch_insn_single(env->prog, 0, - insn_buf, cnt); + new_prog = bpf_patch_insn_data(env, 0, insn_buf, cnt); if (!new_prog) return -ENOMEM; + env->prog = new_prog; delta += cnt - 1; } @@ -3229,7 +3321,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) else continue; - if (env->insn_aux_data[i].ptr_type != PTR_TO_CTX) + if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; cnt = ops->convert_ctx_access(type, insn, insn_buf, env->prog); @@ -3238,8 +3330,7 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return -EINVAL; } - new_prog = bpf_patch_insn_single(env->prog, i + delta, insn_buf, - cnt); + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); if (!new_prog) return -ENOMEM; @@ -3253,6 +3344,89 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) return 0; } +/* fixup insn->imm field of bpf_call instructions + * and inline eligible helpers as explicit sequence of BPF instructions + * + * this function is called after eBPF program passed verification + */ +static int fixup_bpf_calls(struct bpf_verifier_env *env) +{ + struct bpf_prog *prog = env->prog; + struct bpf_insn *insn = prog->insnsi; + const struct bpf_func_proto *fn; + const int insn_cnt = prog->len; + struct bpf_insn insn_buf[16]; + struct bpf_prog *new_prog; + struct bpf_map *map_ptr; + int i, cnt, delta = 0; + + for (i = 0; i < insn_cnt; i++, insn++) { + if (insn->code != (BPF_JMP | BPF_CALL)) + continue; + + if (insn->imm == BPF_FUNC_get_route_realm) + prog->dst_needed = 1; + if (insn->imm == BPF_FUNC_get_prandom_u32) + bpf_user_rnd_init_once(); + if (insn->imm == BPF_FUNC_tail_call) { + /* If we tail call into other programs, we + * cannot make any assumptions since they can + * be replaced dynamically during runtime in + * the program array. 
+ */ + prog->cb_access = 1; + + /* mark bpf_tail_call as different opcode to avoid + * conditional branch in the interpreter for every normal + * call and to prevent accidental JITing by JIT compiler + * that doesn't support bpf_tail_call yet + */ + insn->imm = 0; + insn->code |= BPF_X; + continue; + } + + if (ebpf_jit_enabled() && insn->imm == BPF_FUNC_map_lookup_elem) { + map_ptr = env->insn_aux_data[i + delta].map_ptr; + if (map_ptr == BPF_MAP_PTR_POISON || + !map_ptr->ops->map_gen_lookup) + goto patch_call_imm; + + cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); + if (cnt == 0 || cnt >= ARRAY_SIZE(insn_buf)) { + verbose("bpf verifier is misconfigured\n"); + return -EINVAL; + } + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, + cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + + /* keep walking new program and skip insns we just inserted */ + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + +patch_call_imm: + fn = prog->aux->ops->get_func_proto(insn->imm); + /* all functions that have prototype and verifier allowed + * programs to call them, must be real in-kernel functions + */ + if (!fn->func) { + verbose("kernel subsystem misconfigured func %s#%d\n", + func_id_name(insn->imm), insn->imm); + return -EFAULT; + } + insn->imm = fn->func - __bpf_call_base; + } + + return 0; +} + static void free_states(struct bpf_verifier_env *env) { struct bpf_verifier_state_list *sl, *sln; @@ -3348,6 +3522,9 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); + if (ret == 0) + ret = fixup_bpf_calls(env); + if (log_level && log_len >= log_size - 1) { BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index 9203bfb0560399af9611bd40516758ab01076cba..00f4d6bf048fab1d25068842859075a3e3b952dd 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -5,6 +5,7 @@ #include <linux/cgroup.h> #include <linux/kernfs.h> #include <linux/workqueue.h> +#include <linux/refcount.h> /* * A cgroup can be associated with multiple css_sets as different tasks may @@ -134,7 +135,7 @@ static inline void put_css_set(struct css_set *cset) * can see it.
Similar to atomic_dec_and_lock(), but for an * rwlock */ - if (atomic_add_unless(&cset->refcount, -1, 1)) + if (refcount_dec_not_one(&cset->refcount)) return; spin_lock_irqsave(&css_set_lock, flags); @@ -147,7 +148,7 @@ static inline void put_css_set(struct css_set *cset) */ static inline void get_css_set(struct css_set *cset) { - atomic_inc(&cset->refcount); + refcount_inc(&cset->refcount); } bool cgroup_ssid_enabled(int ssid); @@ -163,7 +164,7 @@ int cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, void cgroup_free_root(struct cgroup_root *root); void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts); -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask); +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags); int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_root *root, unsigned long magic, diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 1dc22f6b49f5e06c4af22222dfb1b32c885ce16a..85d75152402dd8c601c65c988d11efa1f75945ad 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -346,7 +346,7 @@ static int cgroup_task_count(const struct cgroup *cgrp) spin_lock_irq(&css_set_lock); list_for_each_entry(link, &cgrp->cset_links, cset_link) - count += atomic_read(&link->cset->refcount); + count += refcount_read(&link->cset->refcount); spin_unlock_irq(&css_set_lock); return count; } @@ -1072,6 +1072,7 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, struct cgroup_subsys *ss; struct dentry *dentry; int i, ret; + bool new_root = false; cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); @@ -1181,10 +1182,11 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, ret = -ENOMEM; goto out_unlock; } + new_root = true; init_cgroup_root(root, &opts); - ret = cgroup_setup_root(root, opts.subsys_mask); + ret = cgroup_setup_root(root, opts.subsys_mask, PERCPU_REF_INIT_DEAD); if (ret) cgroup_free_root(root); @@ -1200,6 +1202,18 @@ struct dentry *cgroup1_mount(struct file_system_type *fs_type, int flags, dentry = cgroup_do_mount(&cgroup_fs_type, flags, root, CGROUP_SUPER_MAGIC, ns); + /* + * There's a race window after we release cgroup_mutex and before + * allocating a superblock. Make sure a concurrent process won't + * be able to re-use the root during this window by delaying the + * initialization of root refcnt. + */ + if (new_root) { + mutex_lock(&cgroup_mutex); + percpu_ref_reinit(&root->cgrp.self.refcnt); + mutex_unlock(&cgroup_mutex); + } + /* * If @pinned_sb, we're reusing an existing root and holding an * extra ref on its sb. Mount is complete. Put the extra ref. 
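The refcount_t conversion above follows the dec-and-lock shape that the put_css_set() comment describes: refcount_dec_not_one() handles the common case without taking the lock, and only the final reference is dropped under css_set_lock. Below is a minimal sketch of that pattern in isolation; struct thing, thing_lock and the kfree()-based release are illustrative stand-ins, not part of this patch:

	#include <linux/refcount.h>
	#include <linux/spinlock.h>
	#include <linux/slab.h>

	struct thing {
		refcount_t refcount;	/* hypothetical refcounted object */
	};

	static DEFINE_SPINLOCK(thing_lock);

	static void put_thing(struct thing *t)
	{
		unsigned long flags;

		/* Fast path: drop one reference unless it is the last one.
		 * refcount_dec_not_one() returns false only when the count
		 * is 1, and, unlike the open-coded
		 * atomic_add_unless(&cnt, -1, 1) it replaces, refcount_t
		 * saturates instead of wrapping on over/underflow.
		 */
		if (refcount_dec_not_one(&t->refcount))
			return;

		/* Slow path: the final decrement happens under the lock,
		 * so lock-holding readers never observe a freed object.
		 */
		spin_lock_irqsave(&thing_lock, flags);
		if (refcount_dec_and_test(&t->refcount))
			kfree(t);
		spin_unlock_irqrestore(&thing_lock, flags);
	}

This mirrors put_css_set() and its put_css_set_locked() slow path; the read and acquire sides (cgroup_task_count(), get_css_set()) likewise move to refcount_read() and refcount_inc().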
@@ -1286,7 +1300,7 @@ static u64 current_css_set_refcount_read(struct cgroup_subsys_state *css, u64 count; rcu_read_lock(); - count = atomic_read(&task_css_set(current)->refcount); + count = refcount_read(&task_css_set(current)->refcount); rcu_read_unlock(); return count; } diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 687f5e0194efccbadce199c0570cd08b8c96181a..c3c9a0e1b3c9a474bd80b8cb10ea1049284474b0 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -189,7 +189,7 @@ static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ struct cgroup_namespace init_cgroup_ns = { - .count = { .counter = 2, }, + .count = REFCOUNT_INIT(2), .user_ns = &init_user_ns, .ns.ops = &cgroupns_operations, .ns.inum = PROC_CGROUP_INIT_INO, @@ -436,7 +436,12 @@ struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp, return css; } -static void cgroup_get(struct cgroup *cgrp) +static void __maybe_unused cgroup_get(struct cgroup *cgrp) +{ + css_get(&cgrp->self); +} + +static void cgroup_get_live(struct cgroup *cgrp) { WARN_ON_ONCE(cgroup_is_dead(cgrp)); css_get(&cgrp->self); @@ -554,7 +559,7 @@ EXPORT_SYMBOL_GPL(of_css); * haven't been created. */ struct css_set init_css_set = { - .refcount = ATOMIC_INIT(1), + .refcount = REFCOUNT_INIT(1), .tasks = LIST_HEAD_INIT(init_css_set.tasks), .mg_tasks = LIST_HEAD_INIT(init_css_set.mg_tasks), .task_iters = LIST_HEAD_INIT(init_css_set.task_iters), @@ -724,7 +729,7 @@ void put_css_set_locked(struct css_set *cset) lockdep_assert_held(&css_set_lock); - if (!atomic_dec_and_test(&cset->refcount)) + if (!refcount_dec_and_test(&cset->refcount)) return; /* This css_set is dead. unlink it and release cgroup and css refs */ @@ -932,7 +937,7 @@ static void link_css_set(struct list_head *tmp_links, struct css_set *cset, list_add_tail(&link->cgrp_link, &cset->cgrp_links); if (cgroup_parent(cgrp)) - cgroup_get(cgrp); + cgroup_get_live(cgrp); } /** @@ -977,7 +982,7 @@ static struct css_set *find_css_set(struct css_set *old_cset, return NULL; } - atomic_set(&cset->refcount, 1); + refcount_set(&cset->refcount, 1); INIT_LIST_HEAD(&cset->tasks); INIT_LIST_HEAD(&cset->mg_tasks); INIT_LIST_HEAD(&cset->task_iters); @@ -1640,7 +1645,7 @@ void init_cgroup_root(struct cgroup_root *root, struct cgroup_sb_opts *opts) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) +int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask, int ref_flags) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1656,8 +1661,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) root_cgrp->id = ret; root_cgrp->ancestor_ids[0] = ret; - ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, - GFP_KERNEL); + ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, + ref_flags, GFP_KERNEL); if (ret) goto out; @@ -1802,7 +1807,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, return ERR_PTR(-EINVAL); } cgrp_dfl_visible = true; - cgroup_get(&cgrp_dfl_root.cgrp); + cgroup_get_live(&cgrp_dfl_root.cgrp); dentry = cgroup_do_mount(&cgroup2_fs_type, flags, &cgrp_dfl_root, CGROUP2_SUPER_MAGIC, ns); @@ -2576,7 +2581,7 @@ void cgroup_lock_and_drain_offline(struct cgroup *cgrp) if (!css || !percpu_ref_is_dying(&css->refcnt)) continue; - cgroup_get(dsct); + cgroup_get_live(dsct); prepare_to_wait(&dsct->offline_waitq, &wait, TASK_UNINTERRUPTIBLE); @@ -3947,7 +3952,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, { 
lockdep_assert_held(&cgroup_mutex); - cgroup_get(cgrp); + cgroup_get_live(cgrp); memset(css, 0, sizeof(*css)); css->cgroup = cgrp; @@ -4123,7 +4128,7 @@ static struct cgroup *cgroup_create(struct cgroup *parent) /* allocation complete, commit to creation */ list_add_tail_rcu(&cgrp->self.sibling, &cgroup_parent(cgrp)->self.children); atomic_inc(&root->nr_cgrps); - cgroup_get(parent); + cgroup_get_live(parent); /* * @cgrp is now fully operational. If something fails after this @@ -4513,7 +4518,7 @@ int __init cgroup_init(void) hash_add(css_set_table, &init_css_set.hlist, css_set_hash(init_css_set.subsys)); - BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); + BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0, 0)); mutex_unlock(&cgroup_mutex); @@ -4947,7 +4952,7 @@ struct cgroup *cgroup_get_from_path(const char *path) if (kn) { if (kernfs_type(kn) == KERNFS_DIR) { cgrp = kn->priv; - cgroup_get(cgrp); + cgroup_get_live(cgrp); } else { cgrp = ERR_PTR(-ENOTDIR); } @@ -5027,6 +5032,11 @@ void cgroup_sk_alloc(struct sock_cgroup_data *skcd) /* Socket clone path */ if (skcd->val) { + /* + * We might be cloning a socket which is left in an empty + * cgroup and the cgroup might have already been rmdir'd. + * Don't use cgroup_get_live(). + */ cgroup_get(sock_cgroup_ptr(skcd)); return; } diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0f41292be0fb7dea2800acae3550c0dea3903f45..f6501f4f6040b5a9c21e84aeb57e20906ef1c614 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -2121,10 +2121,8 @@ int __init cpuset_init(void) { int err = 0; - if (!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)) - BUG(); - if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)) - BUG(); + BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL)); + BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL)); cpumask_setall(top_cpuset.cpus_allowed); nodes_setall(top_cpuset.mems_allowed); @@ -2139,8 +2137,7 @@ int __init cpuset_init(void) if (err < 0) return err; - if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) - BUG(); + BUG_ON(!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)); return 0; } @@ -2354,7 +2351,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) rebuild_sched_domains(); } -void cpuset_update_active_cpus(bool cpu_online) +void cpuset_update_active_cpus(void) { /* * We're inside cpu hotplug critical region which usually nests diff --git a/kernel/cgroup/namespace.c b/kernel/cgroup/namespace.c index 96d38dab6fb2f7fdad9e64236572562c9c43ff36..66129eb4371d124f97c2a3233549be3673bd93ca 100644 --- a/kernel/cgroup/namespace.c +++ b/kernel/cgroup/namespace.c @@ -31,7 +31,7 @@ static struct cgroup_namespace *alloc_cgroup_ns(void) kfree(new_ns); return ERR_PTR(ret); } - atomic_set(&new_ns->count, 1); + refcount_set(&new_ns->count, 1); new_ns->ns.ops = &cgroupns_operations; return new_ns; } diff --git a/kernel/compat.c b/kernel/compat.c index 19aec5d981081d26914c14f5db2e76646c399888..933bcb31ae10d4c1dcaf2c9b5f48861080d9b3df 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -108,8 +108,8 @@ COMPAT_SYSCALL_DEFINE2(gettimeofday, struct compat_timeval __user *, tv, COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, struct timezone __user *, tz) { + struct timespec64 new_ts; struct timeval user_tv; - struct timespec new_ts; struct timezone new_tz; if (tv) { @@ -123,7 +123,7 @@ COMPAT_SYSCALL_DEFINE2(settimeofday, struct compat_timeval __user *, tv, return -EFAULT; } - return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? 
&new_tz : NULL); + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } static int __compat_get_timeval(struct timeval *tv, const struct compat_timeval __user *ctv) @@ -240,18 +240,20 @@ COMPAT_SYSCALL_DEFINE2(nanosleep, struct compat_timespec __user *, rqtp, struct compat_timespec __user *, rmtp) { struct timespec tu, rmt; + struct timespec64 tu64; mm_segment_t oldfs; long ret; if (compat_get_timespec(&tu, rqtp)) return -EFAULT; - if (!timespec_valid(&tu)) + tu64 = timespec_to_timespec64(tu); + if (!timespec64_valid(&tu64)) return -EINVAL; oldfs = get_fs(); set_fs(KERNEL_DS); - ret = hrtimer_nanosleep(&tu, + ret = hrtimer_nanosleep(&tu64, rmtp ? (struct timespec __user *)&rmt : NULL, HRTIMER_MODE_REL, CLOCK_MONOTONIC); set_fs(oldfs); diff --git a/kernel/cpu.c b/kernel/cpu.c index 37b223e4fc05b74fc50aa51df0c307d65da026c3..9ae6fbe5b5cf5d70d5869e8efb00c61f47cf66d4 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1125,6 +1125,8 @@ core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ +int __boot_cpu_id; + #endif /* CONFIG_SMP */ /* Boot processor state steps */ @@ -1815,6 +1817,10 @@ void __init boot_cpu_init(void) set_cpu_active(cpu, true); set_cpu_present(cpu, true); set_cpu_possible(cpu, true); + +#ifdef CONFIG_SMP + __boot_cpu_id = cpu; +#endif } /* diff --git a/kernel/crash_core.c b/kernel/crash_core.c new file mode 100644 index 0000000000000000000000000000000000000000..fcbd568f1e95069e6f42a46cad5f5c739f273def --- /dev/null +++ b/kernel/crash_core.c @@ -0,0 +1,439 @@ +/* + * crash_core.c - kernel crash support code. + * Copyright (C) 2002-2004 Eric Biederman + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <linux/crash_core.h> +#include <linux/utsname.h> +#include <linux/vmalloc.h> + +#include <asm/page.h> +#include <asm/sections.h> + +/* vmcoreinfo stuff */ +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +size_t vmcoreinfo_size; +size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warn("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, then we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warn("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warn("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= system_ram) { + pr_warn("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? 
*/ + if (system_ram >= start && system_ram < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warn("Memory value expected after '@'\n"); + return -EINVAL; + } + } + } + + return 0; +} + +/* + * This function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_simple(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + if (*cur == '@') + *crash_base = memparse(cur+1, &cur); + else if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + +/* + * This function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. + */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char: %c\n", *cur); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + +static int __init __parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *name, + const char *suffix) +{ + char *first_colon, *first_space; + char *ck_cmdline; + + BUG_ON(!crash_size || !crash_base); + *crash_size = 0; + *crash_base = 0; + + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); + + if (!ck_cmdline) + return -EINVAL; + + ck_cmdline += strlen(name); + + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + suffix); + /* + * if the commandline contains a ':', then that's the extended + * syntax -- if not, it must be the classic syntax + */ + first_colon = strchr(ck_cmdline, ':'); + first_space = strchr(ck_cmdline, ' '); + if (first_colon && 
(!first_space || first_colon < first_space)) + return parse_crashkernel_mem(ck_cmdline, system_ram, + crash_size, crash_base); + + return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); +} + +/* + * This function is the entry point for command line parsing and should be + * called from the arch-specific code. + */ +int __init parse_crashkernel(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", NULL); +} + +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); +} + +int __init parse_crashkernel_low(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_LOW]); +} + +Elf_Word *append_elf_note(Elf_Word *buf, char *name, unsigned int type, + void *data, size_t data_len) +{ + struct elf_note *note = (struct elf_note *)buf; + + note->n_namesz = strlen(name) + 1; + note->n_descsz = data_len; + note->n_type = type; + buf += DIV_ROUND_UP(sizeof(*note), sizeof(Elf_Word)); + memcpy(buf, name, note->n_namesz); + buf += DIV_ROUND_UP(note->n_namesz, sizeof(Elf_Word)); + memcpy(buf, data, data_len); + buf += DIV_ROUND_UP(data_len, sizeof(Elf_Word)); + + return buf; +} + +void final_note(Elf_Word *buf) +{ + memset(buf, 0, sizeof(struct elf_note)); +} + +static void update_vmcoreinfo_note(void) +{ + u32 *buf = vmcoreinfo_note; + + if (!vmcoreinfo_size) + return; + buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, + vmcoreinfo_size); + final_note(buf); +} + +void crash_save_vmcoreinfo(void) +{ + vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); + update_vmcoreinfo_note(); +} + +void vmcoreinfo_append_str(const char *fmt, ...) 
+{ + va_list args; + char buf[0x50]; + size_t r; + + va_start(args, fmt); + r = vscnprintf(buf, sizeof(buf), fmt, args); + va_end(args); + + r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); + + memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); + + vmcoreinfo_size += r; +} + +/* + * provide an empty default implementation here -- architecture + * code may override this + */ +void __weak arch_crash_save_vmcoreinfo(void) +{} + +phys_addr_t __weak paddr_vmcoreinfo_note(void) +{ + return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); +} + +static int __init crash_save_vmcoreinfo_init(void) +{ + VMCOREINFO_OSRELEASE(init_uts_ns.name.release); + VMCOREINFO_PAGESIZE(PAGE_SIZE); + + VMCOREINFO_SYMBOL(init_uts_ns); + VMCOREINFO_SYMBOL(node_online_map); +#ifdef CONFIG_MMU + VMCOREINFO_SYMBOL(swapper_pg_dir); +#endif + VMCOREINFO_SYMBOL(_stext); + VMCOREINFO_SYMBOL(vmap_area_list); + +#ifndef CONFIG_NEED_MULTIPLE_NODES + VMCOREINFO_SYMBOL(mem_map); + VMCOREINFO_SYMBOL(contig_page_data); +#endif +#ifdef CONFIG_SPARSEMEM + VMCOREINFO_SYMBOL(mem_section); + VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); + VMCOREINFO_STRUCT_SIZE(mem_section); + VMCOREINFO_OFFSET(mem_section, section_mem_map); +#endif + VMCOREINFO_STRUCT_SIZE(page); + VMCOREINFO_STRUCT_SIZE(pglist_data); + VMCOREINFO_STRUCT_SIZE(zone); + VMCOREINFO_STRUCT_SIZE(free_area); + VMCOREINFO_STRUCT_SIZE(list_head); + VMCOREINFO_SIZE(nodemask_t); + VMCOREINFO_OFFSET(page, flags); + VMCOREINFO_OFFSET(page, _refcount); + VMCOREINFO_OFFSET(page, mapping); + VMCOREINFO_OFFSET(page, lru); + VMCOREINFO_OFFSET(page, _mapcount); + VMCOREINFO_OFFSET(page, private); + VMCOREINFO_OFFSET(page, compound_dtor); + VMCOREINFO_OFFSET(page, compound_order); + VMCOREINFO_OFFSET(page, compound_head); + VMCOREINFO_OFFSET(pglist_data, node_zones); + VMCOREINFO_OFFSET(pglist_data, nr_zones); +#ifdef CONFIG_FLAT_NODE_MEM_MAP + VMCOREINFO_OFFSET(pglist_data, node_mem_map); +#endif + VMCOREINFO_OFFSET(pglist_data, node_start_pfn); + VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); + VMCOREINFO_OFFSET(pglist_data, node_id); + VMCOREINFO_OFFSET(zone, free_area); + VMCOREINFO_OFFSET(zone, vm_stat); + VMCOREINFO_OFFSET(zone, spanned_pages); + VMCOREINFO_OFFSET(free_area, free_list); + VMCOREINFO_OFFSET(list_head, next); + VMCOREINFO_OFFSET(list_head, prev); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); + VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); + log_buf_vmcoreinfo_setup(); + VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); + VMCOREINFO_NUMBER(NR_FREE_PAGES); + VMCOREINFO_NUMBER(PG_lru); + VMCOREINFO_NUMBER(PG_private); + VMCOREINFO_NUMBER(PG_swapcache); + VMCOREINFO_NUMBER(PG_slab); +#ifdef CONFIG_MEMORY_FAILURE + VMCOREINFO_NUMBER(PG_hwpoison); +#endif + VMCOREINFO_NUMBER(PG_head_mask); + VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); +#ifdef CONFIG_HUGETLB_PAGE + VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); +#endif + + arch_crash_save_vmcoreinfo(); + update_vmcoreinfo_note(); + + return 0; +} + +subsys_initcall(crash_save_vmcoreinfo_init); diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index c04917cad1bfdc50fe4f00b08eaa58c9e7e7692e..1b2be63c85282fdb6f910e145853f97c23d8efb4 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -229,12 +229,18 @@ get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, } if (regs) { + mm_segment_t fs; + if (crosstask) goto exit_put; if (add_mark) perf_callchain_store_context(&ctx, PERF_CONTEXT_USER); + + fs = get_fs(); + 
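+	/*
+	 * Aside, for illustration: the bracket below is the usual
+	 * addr_limit save/override/restore pattern,
+	 *
+	 *	mm_segment_t old = get_fs();	// save current limit
+	 *	set_fs(USER_DS);		// constrain access_ok() to user space
+	 *	...unwind the user stack...
+	 *	set_fs(old);			// restore on every path
+	 *
+	 * so perf_callchain_user() cannot be tricked into dereferencing
+	 * kernel addresses if the caller happened to run with KERNEL_DS.
+	 */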
set_fs(USER_DS); perf_callchain_user(&ctx, regs); + set_fs(fs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index ff01cba86f430fd29916ab73c755698bf81feff0..6e75a5c9412dee17daabc1cb131bb517ad431207 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -48,6 +48,8 @@ #include #include #include +#include +#include #include "internal.h" @@ -379,6 +381,7 @@ static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events); static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; +static atomic_t nr_namespaces_events __read_mostly; static atomic_t nr_task_events __read_mostly; static atomic_t nr_freq_events __read_mostly; static atomic_t nr_switch_events __read_mostly; @@ -3991,6 +3994,8 @@ static void unaccount_event(struct perf_event *event) atomic_dec(&nr_mmap_events); if (event->attr.comm) atomic_dec(&nr_comm_events); + if (event->attr.namespaces) + atomic_dec(&nr_namespaces_events); if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) @@ -6491,6 +6496,7 @@ static void perf_event_task(struct task_struct *task, void perf_event_fork(struct task_struct *task) { perf_event_task(task, NULL, 1); + perf_event_namespaces(task); } /* @@ -6592,6 +6598,132 @@ void perf_event_comm(struct task_struct *task, bool exec) perf_event_comm_event(&comm_event); } +/* + * namespaces tracking + */ + +struct perf_namespaces_event { + struct task_struct *task; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 nr_namespaces; + struct perf_ns_link_info link_info[NR_NAMESPACES]; + } event_id; +}; + +static int perf_event_namespaces_match(struct perf_event *event) +{ + return event->attr.namespaces; +} + +static void perf_event_namespaces_output(struct perf_event *event, + void *data) +{ + struct perf_namespaces_event *namespaces_event = data; + struct perf_output_handle handle; + struct perf_sample_data sample; + int ret; + + if (!perf_event_namespaces_match(event)) + return; + + perf_event_header__init_id(&namespaces_event->event_id.header, + &sample, event); + ret = perf_output_begin(&handle, event, + namespaces_event->event_id.header.size); + if (ret) + return; + + namespaces_event->event_id.pid = perf_event_pid(event, + namespaces_event->task); + namespaces_event->event_id.tid = perf_event_tid(event, + namespaces_event->task); + + perf_output_put(&handle, namespaces_event->event_id); + + perf_event__output_id_sample(event, &handle, &sample); + + perf_output_end(&handle); +} + +static void perf_fill_ns_link_info(struct perf_ns_link_info *ns_link_info, + struct task_struct *task, + const struct proc_ns_operations *ns_ops) +{ + struct path ns_path; + struct inode *ns_inode; + void *error; + + error = ns_get_path(&ns_path, task, ns_ops); + if (!error) { + ns_inode = ns_path.dentry->d_inode; + ns_link_info->dev = new_encode_dev(ns_inode->i_sb->s_dev); + ns_link_info->ino = ns_inode->i_ino; + } +} + +void perf_event_namespaces(struct task_struct *task) +{ + struct perf_namespaces_event namespaces_event; + struct perf_ns_link_info *ns_link_info; + + if (!atomic_read(&nr_namespaces_events)) + return; + + namespaces_event = (struct perf_namespaces_event){ + .task = task, + .event_id = { + .header = { + .type = PERF_RECORD_NAMESPACES, + .misc = 0, + .size = sizeof(namespaces_event.event_id), + }, + /* .pid */ + /* .tid */ + .nr_namespaces = NR_NAMESPACES, + /* .link_info[NR_NAMESPACES] */ + }, + }; + + ns_link_info = namespaces_event.event_id.link_info; + + perf_fill_ns_link_info(&ns_link_info[MNT_NS_INDEX], + task, 
&mntns_operations); + +#ifdef CONFIG_USER_NS + perf_fill_ns_link_info(&ns_link_info[USER_NS_INDEX], + task, &userns_operations); +#endif +#ifdef CONFIG_NET_NS + perf_fill_ns_link_info(&ns_link_info[NET_NS_INDEX], + task, &netns_operations); +#endif +#ifdef CONFIG_UTS_NS + perf_fill_ns_link_info(&ns_link_info[UTS_NS_INDEX], + task, &utsns_operations); +#endif +#ifdef CONFIG_IPC_NS + perf_fill_ns_link_info(&ns_link_info[IPC_NS_INDEX], + task, &ipcns_operations); +#endif +#ifdef CONFIG_PID_NS + perf_fill_ns_link_info(&ns_link_info[PID_NS_INDEX], + task, &pidns_operations); +#endif +#ifdef CONFIG_CGROUPS + perf_fill_ns_link_info(&ns_link_info[CGROUP_NS_INDEX], + task, &cgroupns_operations); +#endif + + perf_iterate_sb(perf_event_namespaces_output, + &namespaces_event, + NULL); +} + /* * mmap tracking */ @@ -9146,6 +9278,8 @@ static void account_event(struct perf_event *event) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); + if (event->attr.namespaces) + atomic_inc(&nr_namespaces_events); if (event->attr.task) atomic_inc(&nr_task_events); if (event->attr.freq) @@ -9691,6 +9825,11 @@ SYSCALL_DEFINE5(perf_event_open, return -EACCES; } + if (attr.namespaces) { + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + } + if (attr.freq) { if (attr.sample_freq > sysctl_perf_event_sample_rate) return -EINVAL; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 257fa460b846032744e2da500d489d0995678571..2831480c63a28b8e9b8cee1c0b30968860b3fcd0 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -297,6 +297,19 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) rb->paused = 1; } +void perf_aux_output_flag(struct perf_output_handle *handle, u64 flags) +{ + /* + * OVERWRITE is determined by perf_aux_output_end() and can't + * be passed in directly. + */ + if (WARN_ON_ONCE(flags & PERF_AUX_FLAG_OVERWRITE)) + return; + + handle->aux_flags |= flags; +} +EXPORT_SYMBOL_GPL(perf_aux_output_flag); + /* * This is called before hardware starts writing to the AUX area to * obtain an output handle and make sure there's room in the buffer. @@ -360,6 +373,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, handle->event = event; handle->head = aux_head; handle->size = 0; + handle->aux_flags = 0; /* * In overwrite mode, AUX data stores do not depend on aux_tail, @@ -408,34 +422,32 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, * of the AUX buffer management code is that after pmu::stop(), the AUX * transaction must be stopped and therefore drop the AUX reference count. 
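 *
 * Illustrative usage of the new flag API (a sketch with a hypothetical
 * driver, not taken from this patch):
 *
 *	buf = perf_aux_output_begin(&handle, event);
 *	if (buf) {
 *		...point hardware at buf and let it write trace data...
 *		if (hw_ran_out_of_space)	// hypothetical condition
 *			perf_aux_output_flag(&handle, PERF_AUX_FLAG_TRUNCATED);
 *		perf_aux_output_end(&handle, bytes_written);
 *	}
 *
 * OVERWRITE is still derived by perf_aux_output_end() itself; drivers
 * pass only the remaining flags through perf_aux_output_flag().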
*/ -void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, - bool truncated) +void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) { + bool wakeup = !!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED); struct ring_buffer *rb = handle->rb; - bool wakeup = truncated; unsigned long aux_head; - u64 flags = 0; - - if (truncated) - flags |= PERF_AUX_FLAG_TRUNCATED; /* in overwrite mode, driver provides aux_head via handle */ if (rb->aux_overwrite) { - flags |= PERF_AUX_FLAG_OVERWRITE; + handle->aux_flags |= PERF_AUX_FLAG_OVERWRITE; aux_head = handle->head; local_set(&rb->aux_head, aux_head); } else { + handle->aux_flags &= ~PERF_AUX_FLAG_OVERWRITE; + aux_head = local_read(&rb->aux_head); local_add(size, &rb->aux_head); } - if (size || flags) { + if (size || handle->aux_flags) { /* * Only send RECORD_AUX if we have something useful to communicate */ - perf_event_aux_event(handle->event, aux_head, size, flags); + perf_event_aux_event(handle->event, aux_head, size, + handle->aux_flags); } aux_head = rb->user_page->aux_head = local_read(&rb->aux_head); @@ -446,7 +458,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size, } if (wakeup) { - if (truncated) + if (handle->aux_flags & PERF_AUX_FLAG_TRUNCATED) handle->event->pending_disable = 1; perf_output_wakeup(handle); } diff --git a/kernel/fork.c b/kernel/fork.c index 6c463c80e93de8c3be3180f3cbd8694b955a1ac3..06d759ab4c62ff090ab51715eaf0bc4fcabc250e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include @@ -178,6 +179,24 @@ void __weak arch_release_thread_stack(unsigned long *stack) */ #define NR_CACHED_STACKS 2 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); + +static int free_vm_stack_cache(unsigned int cpu) +{ + struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu); + int i; + + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *vm_stack = cached_vm_stacks[i]; + + if (!vm_stack) + continue; + + vfree(vm_stack->addr); + cached_vm_stacks[i] = NULL; + } + + return 0; +} #endif static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) @@ -202,7 +221,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, VMALLOC_START, VMALLOC_END, - THREADINFO_GFP | __GFP_HIGHMEM, + THREADINFO_GFP, PAGE_KERNEL, 0, node, __builtin_return_address(0)); @@ -466,6 +485,11 @@ void __init fork_init(void) for (i = 0; i < UCOUNT_COUNTS; i++) { init_user_ns.ucount_max[i] = max_threads/2; } + +#ifdef CONFIG_VMAP_STACK + cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", + NULL, free_vm_stack_cache); +#endif } int __weak arch_dup_task_struct(struct task_struct *dst, @@ -536,7 +560,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) set_task_stack_end_magic(tsk); #ifdef CONFIG_CC_STACKPROTECTOR - tsk->stack_canary = get_random_int(); + tsk->stack_canary = get_random_long(); #endif /* @@ -1313,7 +1337,7 @@ void __cleanup_sighand(struct sighand_struct *sighand) if (atomic_dec_and_test(&sighand->count)) { signalfd_cleanup(sighand); /* - * sighand_cachep is SLAB_DESTROY_BY_RCU so we can free it + * sighand_cachep is SLAB_TYPESAFE_BY_RCU so we can free it * without an RCU grace period, see __lock_task_sighand(). 
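	 *
	 * (Aside: SLAB_TYPESAFE_BY_RCU only guarantees the memory stays a
	 * sighand_struct across a grace period; the slot may be recycled
	 * for a *new* sighand immediately. Lockless users must therefore
	 * revalidate identity after locking, roughly:
	 *
	 *	rcu_read_lock();
	 *	sighand = rcu_dereference(task->sighand);
	 *	spin_lock_irqsave(&sighand->siglock, flags);
	 *	if (sighand != task->sighand)
	 *		unlock and retry;
	 *	rcu_read_unlock();
	 *
	 * which is what __lock_task_sighand() does.)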
*/ kmem_cache_free(sighand_cachep, sighand); @@ -1438,6 +1462,7 @@ static void rt_mutex_init_task(struct task_struct *p) #ifdef CONFIG_RT_MUTEXES p->pi_waiters = RB_ROOT; p->pi_waiters_leftmost = NULL; + p->pi_top_task = NULL; p->pi_blocked_on = NULL; #endif } @@ -1679,9 +1704,12 @@ static __latent_entropy struct task_struct *copy_process( goto bad_fork_cleanup_perf; /* copy all the process information */ shm_init_task(p); - retval = copy_semundo(clone_flags, p); + retval = security_task_alloc(p, clone_flags); if (retval) goto bad_fork_cleanup_audit; + retval = copy_semundo(clone_flags, p); + if (retval) + goto bad_fork_cleanup_security; retval = copy_files(clone_flags, p); if (retval) goto bad_fork_cleanup_semundo; @@ -1797,6 +1825,8 @@ static __latent_entropy struct task_struct *copy_process( p->parent_exec_id = current->self_exec_id; } + klp_copy_process(p); + spin_lock(¤t->sighand->siglock); /* @@ -1903,6 +1933,8 @@ static __latent_entropy struct task_struct *copy_process( exit_files(p); /* blocking */ bad_fork_cleanup_semundo: exit_sem(p); +bad_fork_cleanup_security: + security_task_free(p); bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_perf: @@ -2144,7 +2176,7 @@ void __init proc_caches_init(void) { sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, @@ -2352,6 +2384,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) } } + perf_event_namespaces(current); + bad_unshare_cleanup_cred: if (new_cred) put_cred(new_cred); diff --git a/kernel/futex.c b/kernel/futex.c index 45858ec739411f5741667e560552757697441e6b..357348a6cf6b4d71dbc24c9ad89ad2d0e63b20d2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -802,7 +802,7 @@ static int refill_pi_state_cache(void) return 0; } -static struct futex_pi_state * alloc_pi_state(void) +static struct futex_pi_state *alloc_pi_state(void) { struct futex_pi_state *pi_state = current->pi_state_cache; @@ -812,6 +812,11 @@ static struct futex_pi_state * alloc_pi_state(void) return pi_state; } +static void get_pi_state(struct futex_pi_state *pi_state) +{ + WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount)); +} + /* * Drops a reference to the pi_state object and frees or caches it * when the last reference is gone. @@ -856,7 +861,7 @@ static void put_pi_state(struct futex_pi_state *pi_state) * Look up the task based on what TID userspace gave us. * We dont trust it. */ -static struct task_struct * futex_find_get_task(pid_t pid) +static struct task_struct *futex_find_get_task(pid_t pid) { struct task_struct *p; @@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr) pi_state->owner = NULL; raw_spin_unlock_irq(&curr->pi_lock); - rt_mutex_unlock(&pi_state->pi_mutex); - + get_pi_state(pi_state); spin_unlock(&hb->lock); + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + raw_spin_lock_irq(&curr->pi_lock); } raw_spin_unlock_irq(&curr->pi_lock); @@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr) * * [10] There is no transient state which leaves owner and user space * TID out of sync. 
+ * + * + * Serialization and lifetime rules: + * + * hb->lock: + * + * hb -> futex_q, relation + * futex_q -> pi_state, relation + * + * (cannot be raw because hb can contain arbitrary amount + * of futex_q's) + * + * pi_mutex->wait_lock: + * + * {uval, pi_state} + * + * (and pi_mutex 'obviously') + * + * p->pi_lock: + * + * p->pi_state_list -> pi_state->list, relation + * + * pi_state->refcount: + * + * pi_state lifetime + * + * + * Lock order: + * + * hb->lock + * pi_mutex->wait_lock + * p->pi_lock + * */ /* @@ -980,10 +1020,13 @@ void exit_pi_state_list(struct task_struct *curr) * the pi_state against the user space value. If correct, attach to * it. */ -static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, +static int attach_to_pi_state(u32 __user *uaddr, u32 uval, + struct futex_pi_state *pi_state, struct futex_pi_state **ps) { pid_t pid = uval & FUTEX_TID_MASK; + u32 uval2; + int ret; /* * Userspace might have messed up non-PI and PI futexes [3] @@ -991,8 +1034,38 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, if (unlikely(!pi_state)) return -EINVAL; + /* + * We get here with hb->lock held, and having found a + * futex_top_waiter(). This means that futex_lock_pi() of said futex_q + * has dropped the hb->lock in between queue_me() and unqueue_me_pi(), + * which in turn means that futex_lock_pi() still has a reference on + * our pi_state. + * + * The waiter holding a reference on @pi_state also protects against + * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi() + * and futex_wait_requeue_pi() as it cannot go to 0 and consequently + * free pi_state before we can take a reference ourselves. + */ WARN_ON(!atomic_read(&pi_state->refcount)); + /* + * Now that we have a pi_state, we can acquire wait_lock + * and do the state validation. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + /* + * Since {uval, pi_state} is serialized by wait_lock, and our current + * uval was read without holding it, it can have changed. Verify it + * still is what we expect it to be, otherwise retry the entire + * operation. + */ + if (get_futex_value_locked(&uval2, uaddr)) + goto out_efault; + + if (uval != uval2) + goto out_eagain; + /* * Handle the owner died case: */ @@ -1008,11 +1081,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, * is not 0. Inconsistent state. [5] */ if (pid) - return -EINVAL; + goto out_einval; /* * Take a ref on the state and return success. [4] */ - goto out_state; + goto out_attach; } /* @@ -1024,14 +1097,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, * Take a ref on the state and return success. [6] */ if (!pid) - goto out_state; + goto out_attach; } else { /* * If the owner died bit is not set, then the pi_state * must have an owner. [7] */ if (!pi_state->owner) - return -EINVAL; + goto out_einval; } /* @@ -1040,11 +1113,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state, * user space TID. 
[9/10] */ if (pid != task_pid_vnr(pi_state->owner)) - return -EINVAL; -out_state: - atomic_inc(&pi_state->refcount); + goto out_einval; + +out_attach: + get_pi_state(pi_state); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); *ps = pi_state; return 0; + +out_einval: + ret = -EINVAL; + goto out_error; + +out_eagain: + ret = -EAGAIN; + goto out_error; + +out_efault: + ret = -EFAULT; + goto out_error; + +out_error: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; } /* @@ -1095,6 +1186,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, /* * No existing pi state. First waiter. [2] + * + * This creates pi_state, we have hb->lock held, this means nothing can + * observe this state, wait_lock is irrelevant. */ pi_state = alloc_pi_state(); @@ -1119,17 +1213,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, return 0; } -static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, +static int lookup_pi_state(u32 __user *uaddr, u32 uval, + struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps) { - struct futex_q *match = futex_top_waiter(hb, key); + struct futex_q *top_waiter = futex_top_waiter(hb, key); /* * If there is a waiter on that futex, validate it and * attach to the pi_state when the validation succeeds. */ - if (match) - return attach_to_pi_state(uval, match->pi_state, ps); + if (top_waiter) + return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); /* * We are the first waiter - try to look up the owner based on @@ -1148,7 +1243,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))) return -EFAULT; - /*If user space value changed, let the caller retry */ + /* If user space value changed, let the caller retry */ return curval != uval ? -EAGAIN : 0; } @@ -1176,7 +1271,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, struct task_struct *task, int set_waiters) { u32 uval, newval, vpid = task_pid_vnr(task); - struct futex_q *match; + struct futex_q *top_waiter; int ret; /* @@ -1202,9 +1297,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, * Lookup existing state first. If it exists, try to attach to * its pi_state. */ - match = futex_top_waiter(hb, key); - if (match) - return attach_to_pi_state(uval, match->pi_state, ps); + top_waiter = futex_top_waiter(hb, key); + if (top_waiter) + return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps); /* * No waiter and user TID is 0. We are here because the @@ -1285,50 +1380,44 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) wake_q_add(wake_q, p); __unqueue_futex(q); /* - * The waiting task can free the futex_q as soon as - * q->lock_ptr = NULL is written, without taking any locks. A - * memory barrier is required here to prevent the following - * store to lock_ptr from getting ahead of the plist_del. + * The waiting task can free the futex_q as soon as q->lock_ptr = NULL + * is written, without taking any locks. This is possible in the event + * of a spurious wakeup, for example. A memory barrier is required here + * to prevent the following store to lock_ptr from getting ahead of the + * plist_del in __unqueue_futex(). 
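+	 *
+	 * Sketch of the ordering argument: smp_store_release() orders all
+	 * preceding loads *and* stores before the NULL store, i.e. it is
+	 * at least as strong as the old
+	 *
+	 *	smp_wmb();
+	 *	q->lock_ptr = NULL;
+	 *
+	 * pair, so a waiter that observes lock_ptr == NULL is guaranteed
+	 * to also observe the completed plist_del() and may free the
+	 * futex_q.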
*/ - smp_wmb(); - q->lock_ptr = NULL; + smp_store_release(&q->lock_ptr, NULL); } -static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, - struct futex_hash_bucket *hb) +/* + * Caller must hold a reference on @pi_state. + */ +static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state) { - struct task_struct *new_owner; - struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; + struct task_struct *new_owner; + bool postunlock = false; DEFINE_WAKE_Q(wake_q); - bool deboost; int ret = 0; - if (!pi_state) - return -EINVAL; - - /* - * If current does not own the pi_state then the futex is - * inconsistent and user space fiddled with the futex value. - */ - if (pi_state->owner != current) - return -EINVAL; - - raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); + if (WARN_ON_ONCE(!new_owner)) { + /* + * As per the comment in futex_unlock_pi() this should not happen. + * + * When this happens, give up our locks and try again, giving + * the futex_lock_pi() instance time to complete, either by + * waiting on the rtmutex or removing itself from the futex + * queue. + */ + ret = -EAGAIN; + goto out_unlock; + } /* - * It is possible that the next waiter (the one that brought - * this owner to the kernel) timed out and is no longer - * waiting on the lock. - */ - if (!new_owner) - new_owner = this->task; - - /* - * We pass it to the next owner. The WAITERS bit is always - * kept enabled while there is PI state around. We cleanup the - * owner died bit, because we are the owner. + * We pass it to the next owner. The WAITERS bit is always kept + * enabled while there is PI state around. We cleanup the owner + * died bit, because we are the owner. */ newval = FUTEX_WAITERS | task_pid_vnr(new_owner); @@ -1337,6 +1426,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) { ret = -EFAULT; + } else if (curval != uval) { /* * If a unconditional UNLOCK_PI operation (user space did not @@ -1349,10 +1439,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, else ret = -EINVAL; } - if (ret) { - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - return ret; - } + + if (ret) + goto out_unlock; + + /* + * This is a point of no return; once we modify the uval there is no + * going back and subsequent operations must not fail. + */ raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); @@ -1365,22 +1459,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, pi_state->owner = new_owner; raw_spin_unlock(&new_owner->pi_lock); - raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); - deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); +out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); - /* - * First unlock HB so the waiter does not spin on it once he got woken - * up. Second wake up the waiter before the priority is adjusted. If we - * deboost first (and lose our higher priority), then the task might get - * scheduled away before the wake up can take place. 
- */ - spin_unlock(&hb->lock); - wake_up_q(&wake_q); - if (deboost) - rt_mutex_adjust_prio(current); + if (postunlock) + rt_mutex_postunlock(&wake_q); - return 0; + return ret; } /* @@ -1826,7 +1913,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, * If that call succeeds then we have pi_state and an * initial refcount on it. */ - ret = lookup_pi_state(ret, hb2, &key2, &pi_state); + ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state); } switch (ret) { @@ -1909,7 +1996,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, * refcount on the pi_state and store the pointer in * the futex_q object of the waiter. */ - atomic_inc(&pi_state->refcount); + get_pi_state(pi_state); this->pi_state = pi_state; ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, @@ -2009,20 +2096,7 @@ queue_unlock(struct futex_hash_bucket *hb) hb_waiters_dec(hb); } -/** - * queue_me() - Enqueue the futex_q on the futex_hash_bucket - * @q: The futex_q to enqueue - * @hb: The destination hash bucket - * - * The hb->lock must be held by the caller, and is released here. A call to - * queue_me() is typically paired with exactly one call to unqueue_me(). The - * exceptions involve the PI related operations, which may use unqueue_me_pi() - * or nothing if the unqueue is done as part of the wake process and the unqueue - * state is implicit in the state of woken task (see futex_wait_requeue_pi() for - * an example). - */ -static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) - __releases(&hb->lock) +static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; @@ -2039,6 +2113,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) plist_node_init(&q->list, prio); plist_add(&q->list, &hb->chain); q->task = current; +} + +/** + * queue_me() - Enqueue the futex_q on the futex_hash_bucket + * @q: The futex_q to enqueue + * @hb: The destination hash bucket + * + * The hb->lock must be held by the caller, and is released here. A call to + * queue_me() is typically paired with exactly one call to unqueue_me(). The + * exceptions involve the PI related operations, which may use unqueue_me_pi() + * or nothing if the unqueue is done as part of the wake process and the unqueue + * state is implicit in the state of woken task (see futex_wait_requeue_pi() for + * an example). + */ +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) + __releases(&hb->lock) +{ + __queue_me(q, hb); spin_unlock(&hb->lock); } @@ -2125,10 +2217,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, { u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; - struct task_struct *oldowner = pi_state->owner; u32 uval, uninitialized_var(curval), newval; + struct task_struct *oldowner; int ret; + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + + oldowner = pi_state->owner; /* Owner died? */ if (!pi_state->owner) newtid |= FUTEX_OWNER_DIED; @@ -2136,7 +2231,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, /* * We are here either because we stole the rtmutex from the * previous highest priority waiter or we are the highest priority - * waiter but failed to get the rtmutex the first time. + * waiter but have failed to get the rtmutex the first time. + * * We have to replace the newowner TID in the user space variable. * This must be atomic as we have to preserve the owner died bit here. 
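	 *
	 * In plain cmpxchg() terms, the retry loop below is:
	 *
	 *	for (;;) {
	 *		newval = (uval & FUTEX_OWNER_DIED) | newtid;
	 *		curval = cmpxchg(uaddr, uval, newval);
	 *		if (curval == uval)
	 *			break;		// swap succeeded
	 *		uval = curval;		// raced; retry with fresh value
	 *	}
	 *
	 * i.e. the OWNER_DIED bit is re-read on every iteration, so a
	 * concurrent robust-list death is never overwritten.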
* @@ -2144,17 +2240,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * because we can fault here. Imagine swapped out pages or a fork * that marked all the anonymous memory readonly for cow. * - * Modifying pi_state _before_ the user space value would - * leave the pi_state in an inconsistent state when we fault - * here, because we need to drop the hash bucket lock to - * handle the fault. This might be observed in the PID check - * in lookup_pi_state. + * Modifying pi_state _before_ the user space value would leave the + * pi_state in an inconsistent state when we fault here, because we + * need to drop the locks to handle the fault. This might be observed + * in the PID check in lookup_pi_state. */ retry: if (get_futex_value_locked(&uval, uaddr)) goto handle_fault; - while (1) { + for (;;) { newval = (uval & FUTEX_OWNER_DIED) | newtid; if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) @@ -2169,47 +2264,60 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, * itself. */ if (pi_state->owner != NULL) { - raw_spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock(&pi_state->owner->pi_lock); } pi_state->owner = newowner; - raw_spin_lock_irq(&newowner->pi_lock); + raw_spin_lock(&newowner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &newowner->pi_state_list); - raw_spin_unlock_irq(&newowner->pi_lock); + raw_spin_unlock(&newowner->pi_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return 0; /* - * To handle the page fault we need to drop the hash bucket - * lock here. That gives the other task (either the highest priority - * waiter itself or the task which stole the rtmutex) the - * chance to try the fixup of the pi_state. So once we are - * back from handling the fault we need to check the pi_state - * after reacquiring the hash bucket lock and before trying to - * do another fixup. When the fixup has been done already we - * simply return. + * To handle the page fault we need to drop the locks here. That gives + * the other task (either the highest priority waiter itself or the + * task which stole the rtmutex) the chance to try the fixup of the + * pi_state. So once we are back from handling the fault we need to + * check the pi_state after reacquiring the locks and before trying to + * do another fixup. When the fixup has been done already we simply + * return. + * + * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely + * drop hb->lock since the caller owns the hb -> futex_q relation. + * Dropping the pi_mutex->wait_lock requires the state revalidate. 
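+	 *
+	 * In outline, the fault path below is:
+	 *
+	 *	drop pi_mutex->wait_lock and q->lock_ptr
+	 *	fault_in_user_writeable(uaddr);		// may sleep
+	 *	retake q->lock_ptr and wait_lock
+	 *	if (pi_state->owner != oldowner)	// someone else fixed it
+	 *		return 0;
+	 *	goto retry;	// revalidate, then redo the cmpxchg loop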
*/ handle_fault: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(q->lock_ptr); ret = fault_in_user_writeable(uaddr); spin_lock(q->lock_ptr); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); /* * Check if someone else fixed it for us: */ - if (pi_state->owner != oldowner) - return 0; + if (pi_state->owner != oldowner) { + ret = 0; + goto out_unlock; + } if (ret) - return ret; + goto out_unlock; goto retry; + +out_unlock: + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + return ret; } static long futex_wait_restart(struct restart_block *restart); @@ -2231,57 +2339,32 @@ static long futex_wait_restart(struct restart_block *restart); */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) { - struct task_struct *owner; int ret = 0; if (locked) { /* * Got the lock. We might not be the anticipated owner if we * did a lock-steal - fix up the PI-state in that case: + * + * We can safely read pi_state->owner without holding wait_lock + * because we now own the rt_mutex, only the owner will attempt + * to change it. */ if (q->pi_state->owner != current) ret = fixup_pi_state_owner(uaddr, q, current); goto out; } - /* - * Catch the rare case, where the lock was released when we were on the - * way back before we locked the hash bucket. - */ - if (q->pi_state->owner == current) { - /* - * Try to get the rt_mutex now. This might fail as some other - * task acquired the rt_mutex after we removed ourself from the - * rt_mutex waiters list. - */ - if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { - locked = 1; - goto out; - } - - /* - * pi_state is incorrect, some other task did a lock steal and - * we returned due to timeout or signal without taking the - * rt_mutex. Too late. - */ - raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); - owner = rt_mutex_owner(&q->pi_state->pi_mutex); - if (!owner) - owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); - ret = fixup_pi_state_owner(uaddr, q, owner); - goto out; - } - /* * Paranoia check. If we did not take the lock, then we should not be * the owner of the rt_mutex. */ - if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) + if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) { printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " "pi-state %p\n", ret, q->pi_state->pi_mutex.owner, q->pi_state->owner); + } out: return ret ? ret : locked; @@ -2505,6 +2588,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; + struct futex_pi_state *pi_state = NULL; + struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; struct futex_q q = futex_q_init; int res, ret; @@ -2557,24 +2642,67 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, } } + WARN_ON(!q.pi_state); + /* * Only actually queue now that the atomic ops are done: */ - queue_me(&q, hb); + __queue_me(&q, hb); - WARN_ON(!q.pi_state); - /* - * Block on the PI mutex: - */ - if (!trylock) { - ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to); - } else { - ret = rt_mutex_trylock(&q.pi_state->pi_mutex); + if (trylock) { + ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex); /* Fixup the trylock return value: */ ret = ret ? 
0 : -EWOULDBLOCK;
+		goto no_block;
+	}
+
+	rt_mutex_init_waiter(&rt_waiter);
+
+	/*
+	 * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+	 * hold it while doing rt_mutex_start_proxy(), because then it will
+	 * include hb->lock in the blocking chain, even though we'll not in
+	 * fact hold it while blocking. This will lead it to report -EDEADLK
+	 * and BUG when futex_unlock_pi() interleaves with this.
+	 *
+	 * Therefore acquire wait_lock while holding hb->lock, but drop the
+	 * latter before calling rt_mutex_start_proxy_lock(). This still fully
+	 * serializes against futex_unlock_pi() as that does the exact same
+	 * lock handoff sequence.
+	 */
+	raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+	spin_unlock(q.lock_ptr);
+	ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+	raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+
+	if (ret) {
+		if (ret == 1)
+			ret = 0;
+
+		spin_lock(q.lock_ptr);
+		goto no_block;
+	}
+
+
+	if (unlikely(to))
+		hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+	ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+	spin_lock(q.lock_ptr);
+	/*
+	 * If we failed to acquire the lock (signal/timeout), we must
+	 * first acquire the hb->lock before removing the lock from the
+	 * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+	 * wait lists consistent.
+	 *
+	 * In particular, it is important that futex_unlock_pi() cannot
+	 * observe this inconsistency.
+	 */
+	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+		ret = 0;
+
+no_block:
	/*
	 * Fixup the pi_state owner and possibly acquire the lock if we
	 * haven't already.
@@ -2591,12 +2719,19 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
	 * If fixup_owner() faulted and was unable to handle the fault, unlock
	 * it and return the fault to userspace.
	 */
-	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
-		rt_mutex_unlock(&q.pi_state->pi_mutex);
+	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
+		pi_state = q.pi_state;
+		get_pi_state(pi_state);
+	}

	/* Unqueue and drop the lock */
	unqueue_me_pi(&q);

+	if (pi_state) {
+		rt_mutex_futex_unlock(&pi_state->pi_mutex);
+		put_pi_state(pi_state);
+	}
+
	goto out_put_key;

out_unlock_put_key:
@@ -2605,8 +2740,10 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
out_put_key:
	put_futex_key(&q.key);
out:
-	if (to)
+	if (to) {
+		hrtimer_cancel(&to->timer);
		destroy_hrtimer_on_stack(&to->timer);
+	}
	return ret != -EINTR ? ret : -ERESTARTNOINTR;

uaddr_faulted:
@@ -2633,7 +2770,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
	u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
	union futex_key key = FUTEX_KEY_INIT;
	struct futex_hash_bucket *hb;
-	struct futex_q *match;
+	struct futex_q *top_waiter;
	int ret;

retry:
@@ -2657,12 +2794,37 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
	 * all and we at least want to know if user space fiddled
	 * with the futex value instead of blindly unlocking.
	 */
-	match = futex_top_waiter(hb, &key);
-	if (match) {
-		ret = wake_futex_pi(uaddr, uval, match, hb);
+	top_waiter = futex_top_waiter(hb, &key);
+	if (top_waiter) {
+		struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+		ret = -EINVAL;
+		if (!pi_state)
+			goto out_unlock;
+
		/*
-		 * In case of success wake_futex_pi dropped the hash
-		 * bucket lock.
+		 * If current does not own the pi_state then the futex is
+		 * inconsistent and user space fiddled with the futex value.
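+		 *
+		 * (For context, a rough sketch of the user-space side of
+		 * the PI protocol:
+		 *
+		 *	old = cmpxchg(uaddr, tid, 0);
+		 *	if (old != tid)		// FUTEX_WAITERS was set
+		 *		futex(uaddr, FUTEX_UNLOCK_PI, ...);
+		 *
+		 * so by the time we get here the futex word typically still
+		 * carries our TID plus the WAITERS bit, and the kernel must
+		 * hand the lock to the top waiter, or clear the word if no
+		 * waiter is queued.)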
+ */ + if (pi_state->owner != current) + goto out_unlock; + + get_pi_state(pi_state); + /* + * By taking wait_lock while still holding hb->lock, we ensure + * there is no point where we hold neither; and therefore + * wake_futex_pi() must observe a state consistent with what we + * observed. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + + ret = wake_futex_pi(uaddr, uval, pi_state); + + put_pi_state(pi_state); + + /* + * Success, we're done! No tricky corner cases. */ if (!ret) goto out_putkey; @@ -2677,7 +2839,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) * setting the FUTEX_WAITERS bit. Try again. */ if (ret == -EAGAIN) { - spin_unlock(&hb->lock); put_futex_key(&key); goto retry; } @@ -2685,7 +2846,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) * wake_futex_pi has detected invalid state. Tell user * space. */ - goto out_unlock; + goto out_putkey; } /* @@ -2695,8 +2856,10 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) * preserve the WAITERS bit not the OWNER_DIED one. We are the * owner. */ - if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) + if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) { + spin_unlock(&hb->lock); goto pi_faulted; + } /* * If uval has changed, let user space handle it. @@ -2710,7 +2873,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags) return ret; pi_faulted: - spin_unlock(&hb->lock); put_futex_key(&key); ret = fault_in_user_writeable(uaddr); @@ -2814,6 +2976,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, u32 __user *uaddr2) { struct hrtimer_sleeper timeout, *to = NULL; + struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; struct futex_hash_bucket *hb; union futex_key key2 = FUTEX_KEY_INIT; @@ -2840,10 +3003,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * The waiter is allocated on our stack, manipulated by the requeue * code while we sleep on uaddr. */ - debug_rt_mutex_init_waiter(&rt_waiter); - RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); - RB_CLEAR_NODE(&rt_waiter.tree_entry); - rt_waiter.task = NULL; + rt_mutex_init_waiter(&rt_waiter); ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); if (unlikely(ret != 0)) @@ -2898,8 +3058,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) - rt_mutex_unlock(&q.pi_state->pi_mutex); + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } /* * Drop the reference to the pi state which * the requeue_pi() code acquired for us. @@ -2917,10 +3079,13 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, */ WARN_ON(!q.pi_state); pi_mutex = &q.pi_state->pi_mutex; - ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); - debug_rt_mutex_free_waiter(&rt_waiter); + ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); spin_lock(q.lock_ptr); + if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) + ret = 0; + + debug_rt_mutex_free_waiter(&rt_waiter); /* * Fixup the pi_state owner and possibly acquire the lock if we * haven't already. 
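	 *
	 * (Summary sketch of the proxy-lock life cycle used here, per the
	 * rtmutex interfaces this series introduces:
	 *
	 *	rt_mutex_start_proxy_lock()	enqueue waiter for the task
	 *	rt_mutex_wait_proxy_lock()	block until acquired, signal
	 *					or timeout
	 *	rt_mutex_cleanup_proxy_lock()	after failure: dequeue the
	 *					waiter; returns false if we
	 *					won the lock after all, which
	 *					is why ret is reset to 0.)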
@@ -2938,13 +3103,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * the fault, unlock the rt_mutex and return the fault to * userspace. */ - if (ret && rt_mutex_owner(pi_mutex) == current) - rt_mutex_unlock(pi_mutex); + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + pi_state = q.pi_state; + get_pi_state(pi_state); + } /* Unqueue and drop the lock. */ unqueue_me_pi(&q); } + if (pi_state) { + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + } + if (ret == -EINTR) { /* * We've already been requeued, but cannot restart by calling diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 2f9df37940a0f5a489efc4eaa6afc1ccaa35ad68..c51a49c9be7064d30638d7ae184a4fcbc2136ec7 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -98,6 +98,12 @@ void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_icall_topn); +void __gcov_exit(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_exit); + /** * gcov_enable_events - enable event reporting through gcov_event() * diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 6a5c239c7669c5ad8e01565be5a6fd38ae7d4452..46a18e72bce614c804fbe958f0a6b4d00d11027a 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include #include "gcov.h" -#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) +#if (__GNUC__ >= 7) +#define GCOV_COUNTERS 9 +#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 #define GCOV_COUNTERS 9 diff --git a/kernel/groups.c b/kernel/groups.c index 8dd7a61b7115e8808408cd2cb911d224a9528297..d09727692a2af1801fb82982d4a08daee5c86d62 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -18,7 +18,7 @@ struct group_info *groups_alloc(int gidsetsize) len = sizeof(struct group_info) + sizeof(kgid_t) * gidsetsize; gi = kmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY); if (!gi) - gi = __vmalloc(len, GFP_KERNEL_ACCOUNT|__GFP_HIGHMEM, PAGE_KERNEL); + gi = __vmalloc(len, GFP_KERNEL_ACCOUNT, PAGE_KERNEL); if (!gi) return NULL; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index f0f8e2a8496feafbb0f9589a6d8159779b9effe0..751593ed7c0b0b9cc9bf74735fb9bd5d7e100be2 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -43,6 +43,7 @@ unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; +static bool hung_task_show_lock; static struct task_struct *watchdog_task; @@ -120,12 +121,14 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) pr_err("\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" " disables this message.\n"); sched_show_task(t); - debug_show_all_locks(); + hung_task_show_lock = true; } touch_nmi_watchdog(); if (sysctl_hung_task_panic) { + if (hung_task_show_lock) + debug_show_all_locks(); trigger_all_cpu_backtrace(); panic("hung_task: blocked tasks"); } @@ -172,6 +175,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) if (test_taint(TAINT_DIE) || did_panic) return; + hung_task_show_lock = false; rcu_read_lock(); for_each_process_thread(g, t) { if (!max_count--) @@ -187,6 +191,8 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) } unlock: rcu_read_unlock(); + if (hung_task_show_lock) + debug_show_all_locks(); } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/irq/affinity.c 
b/kernel/irq/affinity.c index d052947fe78507ac80bbe58a09cc612bca7bab73..e2d356dd75812df8e42dfac3feb132edd5612e33 100644 --- a/kernel/irq/affinity.c +++ b/kernel/irq/affinity.c @@ -98,7 +98,7 @@ irq_create_affinity_masks(int nvecs, const struct irq_affinity *affd) int ncpus, v, vecs_to_assign, vecs_per_node; /* Spread the vectors per node */ - vecs_per_node = (affv - curvec) / nodes; + vecs_per_node = (affv - (curvec - affd->pre_vectors)) / nodes; /* Get the cpus on this node which are in the mask */ cpumask_and(nmsk, cpu_online_mask, cpumask_of_node(n)); diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index be3c34e4f2ac1bbf679e92de5d82398a22aba7c9..686be4b73018886d5b676d99cc5f5645cf408259 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -348,7 +348,10 @@ void handle_nested_irq(unsigned int irq) irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); - action_ret = action->thread_fn(action->irq, action->dev_id); + action_ret = IRQ_NONE; + for_each_action_of_desc(desc, action) + action_ret |= action->thread_fn(action->irq, action->dev_id); + if (!noirqdebug) note_interrupt(desc, action_ret); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index a4afe5cc5af1828a49f1005825eadc880a727088..070be980c37a57d91f86099d2be1b81db80176bc 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -852,7 +852,7 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) * This code is triggered unconditionally. Check the affinity * mask pointer. For CPU_MASK_OFFSTACK=n this is optimized out. */ - if (desc->irq_common_data.affinity) + if (cpumask_available(desc->irq_common_data.affinity)) cpumask_copy(mask, desc->irq_common_data.affinity); else valid = false; @@ -1212,8 +1212,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) * set the trigger type must match. Also all must * agree on ONESHOT. */ + unsigned int oldtype = irqd_get_trigger_type(&desc->irq_data); + if (!((old->flags & new->flags) & IRQF_SHARED) || - ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) || + (oldtype != (new->flags & IRQF_TRIGGER_MASK)) || ((old->flags ^ new->flags) & IRQF_ONESHOT)) goto mismatch; @@ -1557,7 +1559,7 @@ void remove_irq(unsigned int irq, struct irqaction *act) struct irq_desc *desc = irq_to_desc(irq); if (desc && !WARN_ON(irq_settings_is_per_cpu_devid(desc))) - __free_irq(irq, act->dev_id); + __free_irq(irq, act->dev_id); } EXPORT_SYMBOL_GPL(remove_irq); @@ -1574,20 +1576,27 @@ EXPORT_SYMBOL_GPL(remove_irq); * have completed. * * This function must not be called from interrupt context. + * + * Returns the devname argument passed to request_irq. */ -void free_irq(unsigned int irq, void *dev_id) +const void *free_irq(unsigned int irq, void *dev_id) { struct irq_desc *desc = irq_to_desc(irq); + struct irqaction *action; + const char *devname; if (!desc || WARN_ON(irq_settings_is_per_cpu_devid(desc))) - return; + return NULL; #ifdef CONFIG_SMP if (WARN_ON(desc->affinity_notify)) desc->affinity_notify = NULL; #endif - kfree(__free_irq(irq, dev_id)); + action = __free_irq(irq, dev_id); + devname = action->name; + kfree(action); + return devname; } EXPORT_SYMBOL(free_irq); diff --git a/kernel/kcov.c b/kernel/kcov.c index 85e5546cd791cc31261cd6deb8aea933bb41c008..cd771993f96f4a7ff23592f1abaea22c71444fda 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -60,15 +60,8 @@ void notrace __sanitizer_cov_trace_pc(void) /* * We are interested in code coverage as a function of a syscall inputs, * so we ignore code executed in interrupts. 
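 *
 * (in_task() is, in effect,
 *
 *	!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))
 *
 * checking SOFTIRQ_OFFSET rather than SOFTIRQ_MASK, so coverage keeps
 * working inside local_bh_disable() sections while softirq handlers
 * themselves stay filtered out.)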
- * The checks for whether we are in an interrupt are open-coded, because - * 1. We can't use in_interrupt() here, since it also returns true - * when we are inside local_bh_disable() section. - * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()), - * since that leads to slower generated code (three separate tests, - * one for each of the flags). */ - if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET - | NMI_MASK))) + if (!t || !in_task()) return; mode = READ_ONCE(t->kcov_mode); if (mode == KCOV_MODE_TRACE) { diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bfe62d5b38722251681f7257788cd14533fe4362..ae1a3ba24df56958bb48f30ddde85ae6ae833fdf 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -51,12 +51,6 @@ DEFINE_MUTEX(kexec_mutex); /* Per cpu memory for storing cpu states in case of system crash. */ note_buf_t __percpu *crash_notes; -/* vmcoreinfo stuff */ -static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; -u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; -size_t vmcoreinfo_size; -size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); - /* Flag to indicate we are going to kexec a new kernel */ bool kexec_in_progress = false; @@ -996,34 +990,6 @@ int crash_shrink_memory(unsigned long new_size) return ret; } -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, - size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) + 3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - void crash_save_cpu(struct pt_regs *regs, int cpu) { struct elf_prstatus prstatus; @@ -1084,403 +1050,6 @@ static int __init crash_notes_memory_init(void) subsys_initcall(crash_notes_memory_init); -/* - * parsing the "crashkernel" commandline - * - * this code is intended to be called from architecture specific code - */ - - -/* - * This function parses command lines in the format - * - * crashkernel=ramsize-range:size[,...][@offset] - * - * The function returns 0 on success and -EINVAL on failure. 
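 *
 * (Worked example, for illustration: with
 *
 *	crashkernel=512M-2G:64M,2G-:128M
 *
 * a machine with 1G of RAM lies in [512M, 2G) and reserves 64M, a
 * machine with 4G lies in the open-ended [2G, ...) range and reserves
 * 128M, and a machine with 256M matches no range and reserves nothing;
 * an optional trailing @offset applies to whichever size was selected.)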
- */ -static int __init parse_crashkernel_mem(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline, *tmp; - - /* for each entry of the comma-separated list */ - do { - unsigned long long start, end = ULLONG_MAX, size; - - /* get the start of the range */ - start = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (*cur != '-') { - pr_warn("crashkernel: '-' expected\n"); - return -EINVAL; - } - cur++; - - /* if no ':' is here, than we read the end */ - if (*cur != ':') { - end = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("crashkernel: Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (end <= start) { - pr_warn("crashkernel: end <= start\n"); - return -EINVAL; - } - } - - if (*cur != ':') { - pr_warn("crashkernel: ':' expected\n"); - return -EINVAL; - } - cur++; - - size = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected\n"); - return -EINVAL; - } - cur = tmp; - if (size >= system_ram) { - pr_warn("crashkernel: invalid size\n"); - return -EINVAL; - } - - /* match ? */ - if (system_ram >= start && system_ram < end) { - *crash_size = size; - break; - } - } while (*cur++ == ','); - - if (*crash_size > 0) { - while (*cur && *cur != ' ' && *cur != '@') - cur++; - if (*cur == '@') { - cur++; - *crash_base = memparse(cur, &tmp); - if (cur == tmp) { - pr_warn("Memory value expected after '@'\n"); - return -EINVAL; - } - } - } - - return 0; -} - -/* - * That function parses "simple" (old) crashkernel command lines like - * - * crashkernel=size[@offset] - * - * It returns 0 on success and -EINVAL on failure. - */ -static int __init parse_crashkernel_simple(char *cmdline, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - if (*cur == '@') - *crash_base = memparse(cur+1, &cur); - else if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -#define SUFFIX_HIGH 0 -#define SUFFIX_LOW 1 -#define SUFFIX_NULL 2 -static __initdata char *suffix_tbl[] = { - [SUFFIX_HIGH] = ",high", - [SUFFIX_LOW] = ",low", - [SUFFIX_NULL] = NULL, -}; - -/* - * That function parses "suffix" crashkernel command lines like - * - * crashkernel=size,[high|low] - * - * It returns 0 on success and -EINVAL on failure. 
- */ -static int __init parse_crashkernel_suffix(char *cmdline, - unsigned long long *crash_size, - const char *suffix) -{ - char *cur = cmdline; - - *crash_size = memparse(cmdline, &cur); - if (cmdline == cur) { - pr_warn("crashkernel: memory value expected\n"); - return -EINVAL; - } - - /* check with suffix */ - if (strncmp(cur, suffix, strlen(suffix))) { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - cur += strlen(suffix); - if (*cur != ' ' && *cur != '\0') { - pr_warn("crashkernel: unrecognized char: %c\n", *cur); - return -EINVAL; - } - - return 0; -} - -static __init char *get_last_crashkernel(char *cmdline, - const char *name, - const char *suffix) -{ - char *p = cmdline, *ck_cmdline = NULL; - - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - char *end_p = strchr(p, ' '); - char *q; - - if (!end_p) - end_p = p + strlen(p); - - if (!suffix) { - int i; - - /* skip the one with any known suffix */ - for (i = 0; suffix_tbl[i]; i++) { - q = end_p - strlen(suffix_tbl[i]); - if (!strncmp(q, suffix_tbl[i], - strlen(suffix_tbl[i]))) - goto next; - } - ck_cmdline = p; - } else { - q = end_p - strlen(suffix); - if (!strncmp(q, suffix, strlen(suffix))) - ck_cmdline = p; - } -next: - p = strstr(p+1, name); - } - - if (!ck_cmdline) - return NULL; - - return ck_cmdline; -} - -static int __init __parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base, - const char *name, - const char *suffix) -{ - char *first_colon, *first_space; - char *ck_cmdline; - - BUG_ON(!crash_size || !crash_base); - *crash_size = 0; - *crash_base = 0; - - ck_cmdline = get_last_crashkernel(cmdline, name, suffix); - - if (!ck_cmdline) - return -EINVAL; - - ck_cmdline += strlen(name); - - if (suffix) - return parse_crashkernel_suffix(ck_cmdline, crash_size, - suffix); - /* - * if the commandline contains a ':', then that's the extended - * syntax -- if not, it must be the classic syntax - */ - first_colon = strchr(ck_cmdline, ':'); - first_space = strchr(ck_cmdline, ' '); - if (first_colon && (!first_space || first_colon < first_space)) - return parse_crashkernel_mem(ck_cmdline, system_ram, - crash_size, crash_base); - - return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base); -} - -/* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. 
- */ -int __init parse_crashkernel(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", NULL); -} - -int __init parse_crashkernel_high(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_HIGH]); -} - -int __init parse_crashkernel_low(char *cmdline, - unsigned long long system_ram, - unsigned long long *crash_size, - unsigned long long *crash_base) -{ - return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel=", suffix_tbl[SUFFIX_LOW]); -} - -static void update_vmcoreinfo_note(void) -{ - u32 *buf = vmcoreinfo_note; - - if (!vmcoreinfo_size) - return; - buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, - vmcoreinfo_size); - final_note(buf); -} - -void crash_save_vmcoreinfo(void) -{ - vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); - update_vmcoreinfo_note(); -} - -void vmcoreinfo_append_str(const char *fmt, ...) -{ - va_list args; - char buf[0x50]; - size_t r; - - va_start(args, fmt); - r = vscnprintf(buf, sizeof(buf), fmt, args); - va_end(args); - - r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); - - memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); - - vmcoreinfo_size += r; -} - -/* - * provide an empty default implementation here -- architecture - * code may override this - */ -void __weak arch_crash_save_vmcoreinfo(void) -{} - -phys_addr_t __weak paddr_vmcoreinfo_note(void) -{ - return __pa_symbol((unsigned long)(char *)&vmcoreinfo_note); -} - -static int __init crash_save_vmcoreinfo_init(void) -{ - VMCOREINFO_OSRELEASE(init_uts_ns.name.release); - VMCOREINFO_PAGESIZE(PAGE_SIZE); - - VMCOREINFO_SYMBOL(init_uts_ns); - VMCOREINFO_SYMBOL(node_online_map); -#ifdef CONFIG_MMU - VMCOREINFO_SYMBOL(swapper_pg_dir); -#endif - VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmap_area_list); - -#ifndef CONFIG_NEED_MULTIPLE_NODES - VMCOREINFO_SYMBOL(mem_map); - VMCOREINFO_SYMBOL(contig_page_data); -#endif -#ifdef CONFIG_SPARSEMEM - VMCOREINFO_SYMBOL(mem_section); - VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); - VMCOREINFO_STRUCT_SIZE(mem_section); - VMCOREINFO_OFFSET(mem_section, section_mem_map); -#endif - VMCOREINFO_STRUCT_SIZE(page); - VMCOREINFO_STRUCT_SIZE(pglist_data); - VMCOREINFO_STRUCT_SIZE(zone); - VMCOREINFO_STRUCT_SIZE(free_area); - VMCOREINFO_STRUCT_SIZE(list_head); - VMCOREINFO_SIZE(nodemask_t); - VMCOREINFO_OFFSET(page, flags); - VMCOREINFO_OFFSET(page, _refcount); - VMCOREINFO_OFFSET(page, mapping); - VMCOREINFO_OFFSET(page, lru); - VMCOREINFO_OFFSET(page, _mapcount); - VMCOREINFO_OFFSET(page, private); - VMCOREINFO_OFFSET(page, compound_dtor); - VMCOREINFO_OFFSET(page, compound_order); - VMCOREINFO_OFFSET(page, compound_head); - VMCOREINFO_OFFSET(pglist_data, node_zones); - VMCOREINFO_OFFSET(pglist_data, nr_zones); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - VMCOREINFO_OFFSET(pglist_data, node_mem_map); -#endif - VMCOREINFO_OFFSET(pglist_data, node_start_pfn); - VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); - VMCOREINFO_OFFSET(pglist_data, node_id); - VMCOREINFO_OFFSET(zone, free_area); - VMCOREINFO_OFFSET(zone, vm_stat); - VMCOREINFO_OFFSET(zone, spanned_pages); - VMCOREINFO_OFFSET(free_area, free_list); - VMCOREINFO_OFFSET(list_head, next); - VMCOREINFO_OFFSET(list_head, prev); - 
VMCOREINFO_OFFSET(vmap_area, va_start); - VMCOREINFO_OFFSET(vmap_area, list); - VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); - log_buf_kexec_setup(); - VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); - VMCOREINFO_NUMBER(NR_FREE_PAGES); - VMCOREINFO_NUMBER(PG_lru); - VMCOREINFO_NUMBER(PG_private); - VMCOREINFO_NUMBER(PG_swapcache); - VMCOREINFO_NUMBER(PG_slab); -#ifdef CONFIG_MEMORY_FAILURE - VMCOREINFO_NUMBER(PG_hwpoison); -#endif - VMCOREINFO_NUMBER(PG_head_mask); - VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); -#ifdef CONFIG_HUGETLB_PAGE - VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); -#endif - - arch_crash_save_vmcoreinfo(); - update_vmcoreinfo_note(); - - return 0; -} - -subsys_initcall(crash_save_vmcoreinfo_init); - /* * Move into place and start executing a preloaded standalone * executable. If nothing was preloaded return an error. diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 699c5bc51a9219d664578e4d7978dee94d291c82..7367e0ec6f8188973a0ee63539cd9a8485e47585 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -58,15 +58,6 @@ #define KPROBE_TABLE_SIZE (1 << KPROBE_HASH_BITS) -/* - * Some oddball architectures like 64bit powerpc have function descriptors - * so this must be overridable. - */ -#ifndef kprobe_lookup_name -#define kprobe_lookup_name(name, addr) \ - addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) -#endif - static int kprobes_initialized; static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; @@ -81,6 +72,12 @@ static struct { raw_spinlock_t lock ____cacheline_aligned_in_smp; } kretprobe_table_locks[KPROBE_TABLE_SIZE]; +kprobe_opcode_t * __weak kprobe_lookup_name(const char *name, + unsigned int __unused) +{ + return ((kprobe_opcode_t *)(kallsyms_lookup_name(name))); +} + static raw_spinlock_t *kretprobe_table_lock_ptr(unsigned long hash) { return &(kretprobe_table_locks[hash].lock); @@ -746,13 +743,20 @@ static void kill_optimized_kprobe(struct kprobe *p) arch_remove_optimized_kprobe(op); } +static inline +void __prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) +{ + if (!kprobe_ftrace(p)) + arch_prepare_optimized_kprobe(op, p); +} + /* Try to prepare optimized instructions */ static void prepare_optimized_kprobe(struct kprobe *p) { struct optimized_kprobe *op; op = container_of(p, struct optimized_kprobe, kp); - arch_prepare_optimized_kprobe(op, p); + __prepare_optimized_kprobe(op, p); } /* Allocate new optimized_kprobe and try to prepare optimized instructions */ @@ -766,7 +770,7 @@ static struct kprobe *alloc_aggr_kprobe(struct kprobe *p) INIT_LIST_HEAD(&op->list); op->kp.addr = p->addr; - arch_prepare_optimized_kprobe(op, p); + __prepare_optimized_kprobe(op, p); return &op->kp; } @@ -1391,21 +1395,19 @@ bool within_kprobe_blacklist(unsigned long addr) * This returns encoded errors if it fails to look up symbol or invalid * combination of parameters. 
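For context, the rule this lookup enforces is visible at the registration API: a kprobe may be placed either by .symbol_name plus .offset, or by a raw .addr, never both. A minimal module-context sketch (the probed symbol is just an example):

#include <linux/module.h>
#include <linux/kprobes.h>

static int handler_pre(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("hit %s+0x%x\n", p->symbol_name, p->offset);
	return 0;
}

static struct kprobe kp = {
	.symbol_name	= "do_sys_open",	/* either a name... */
	.offset		= 0,			/* ...plus an offset */
	/* .addr = some_addr,  -- or a raw address, but never both */
	.pre_handler	= handler_pre,
};

static int __init kp_init(void)
{
	/* returns -EINVAL if both/neither of symbol_name and addr are set */
	return register_kprobe(&kp);
}

static void __exit kp_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kp_init);
module_exit(kp_exit);
MODULE_LICENSE("GPL");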
*/ -static kprobe_opcode_t *kprobe_addr(struct kprobe *p) +static kprobe_opcode_t *_kprobe_addr(kprobe_opcode_t *addr, + const char *symbol_name, unsigned int offset) { - kprobe_opcode_t *addr = p->addr; - - if ((p->symbol_name && p->addr) || - (!p->symbol_name && !p->addr)) + if ((symbol_name && addr) || (!symbol_name && !addr)) goto invalid; - if (p->symbol_name) { - kprobe_lookup_name(p->symbol_name, addr); + if (symbol_name) { + addr = kprobe_lookup_name(symbol_name, offset); if (!addr) return ERR_PTR(-ENOENT); } - addr = (kprobe_opcode_t *)(((char *)addr) + p->offset); + addr = (kprobe_opcode_t *)(((char *)addr) + offset); if (addr) return addr; @@ -1413,6 +1415,11 @@ static kprobe_opcode_t *kprobe_addr(struct kprobe *p) return ERR_PTR(-EINVAL); } +static kprobe_opcode_t *kprobe_addr(struct kprobe *p) +{ + return _kprobe_addr(p->addr, p->symbol_name, p->offset); +} + /* Check passed kprobe is valid and return kprobe in kprobe_table. */ static struct kprobe *__get_valid_kprobe(struct kprobe *p) { @@ -1740,11 +1747,12 @@ void unregister_kprobes(struct kprobe **kps, int num) } EXPORT_SYMBOL_GPL(unregister_kprobes); -int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) +int __weak kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) { return NOTIFY_DONE; } +NOKPROBE_SYMBOL(kprobe_exceptions_notify); static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, @@ -1875,6 +1883,25 @@ static int pre_handler_kretprobe(struct kprobe *p, struct pt_regs *regs) } NOKPROBE_SYMBOL(pre_handler_kretprobe); +bool __weak arch_function_offset_within_entry(unsigned long offset) +{ + return !offset; +} + +bool function_offset_within_entry(kprobe_opcode_t *addr, const char *sym, unsigned long offset) +{ + kprobe_opcode_t *kp_addr = _kprobe_addr(addr, sym, offset); + + if (IS_ERR(kp_addr)) + return false; + + if (!kallsyms_lookup_size_offset((unsigned long)kp_addr, NULL, &offset) || + !arch_function_offset_within_entry(offset)) + return false; + + return true; +} + int register_kretprobe(struct kretprobe *rp) { int ret = 0; @@ -1882,6 +1909,9 @@ int register_kretprobe(struct kretprobe *rp) int i; void *addr; + if (!function_offset_within_entry(rp->kp.addr, rp->kp.symbol_name, rp->kp.offset)) + return -EINVAL; + if (kretprobe_blacklist_size) { addr = kprobe_addr(&rp->kp); if (IS_ERR(addr)) @@ -2192,8 +2222,8 @@ static int __init init_kprobes(void) if (kretprobe_blacklist_size) { /* lookup the function address from its name */ for (i = 0; kretprobe_blacklist[i].name != NULL; i++) { - kprobe_lookup_name(kretprobe_blacklist[i].name, - kretprobe_blacklist[i].addr); + kretprobe_blacklist[i].addr = + kprobe_lookup_name(kretprobe_blacklist[i].name, 0); if (!kretprobe_blacklist[i].addr) printk("kretprobe: lookup failed: %s\n", kretprobe_blacklist[i].name); diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 0999679d6f26706350cca88f42b970a1306f7037..23cd70651238383ac0b112acf4827ebd1b1ad399 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -125,6 +125,10 @@ static ssize_t kexec_crash_size_store(struct kobject *kobj, } KERNEL_ATTR_RW(kexec_crash_size); +#endif /* CONFIG_KEXEC_CORE */ + +#ifdef CONFIG_CRASH_CORE + static ssize_t vmcoreinfo_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { @@ -134,7 +138,7 @@ static ssize_t vmcoreinfo_show(struct kobject *kobj, } KERNEL_ATTR_RO(vmcoreinfo); -#endif /* CONFIG_KEXEC_CORE */ +#endif /* CONFIG_CRASH_CORE */ /* whether file 
capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, @@ -219,6 +223,8 @@ static struct attribute * kernel_attrs[] = { &kexec_loaded_attr.attr, &kexec_crash_loaded_attr.attr, &kexec_crash_size_attr.attr, +#endif +#ifdef CONFIG_CRASH_CORE &vmcoreinfo_attr.attr, #endif #ifndef CONFIG_TINY_RCU diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile index e8780c0901d9c56da9e0daa6abff776a3db7111a..2b8bdb1925dad406031a89b6dfea02ee066d534c 100644 --- a/kernel/livepatch/Makefile +++ b/kernel/livepatch/Makefile @@ -1,3 +1,3 @@ obj-$(CONFIG_LIVEPATCH) += livepatch.o -livepatch-objs := core.o +livepatch-objs := core.o patch.o transition.o diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index af4643873e7179c0e750d3d69a665844d06f4725..b9628e43c78f60711f8a8c21693aef5c7041f2bb 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -24,61 +24,31 @@ #include #include #include -#include #include #include #include #include #include +#include #include - -/** - * struct klp_ops - structure for tracking registered ftrace ops structs - * - * A single ftrace_ops is shared between all enabled replacement functions - * (klp_func structs) which have the same old_addr. This allows the switch - * between function versions to happen instantaneously by updating the klp_ops - * struct's func_stack list. The winner is the klp_func at the top of the - * func_stack (front of the list). - * - * @node: node for the global klp_ops list - * @func_stack: list head for the stack of klp_func's (active func is on top) - * @fops: registered ftrace ops struct - */ -struct klp_ops { - struct list_head node; - struct list_head func_stack; - struct ftrace_ops fops; -}; +#include "core.h" +#include "patch.h" +#include "transition.h" /* - * The klp_mutex protects the global lists and state transitions of any - * structure reachable from them. References to any structure must be obtained - * under mutex protection (except in klp_ftrace_handler(), which uses RCU to - * ensure it gets consistent data). + * klp_mutex is a coarse lock which serializes access to klp data. 
All + * accesses to klp-related variables and structures must have mutex protection, + * except within the following functions which carefully avoid the need for it: + * + * - klp_ftrace_handler() + * - klp_update_patch_state() */ -static DEFINE_MUTEX(klp_mutex); +DEFINE_MUTEX(klp_mutex); static LIST_HEAD(klp_patches); -static LIST_HEAD(klp_ops); static struct kobject *klp_root_kobj; -static struct klp_ops *klp_find_ops(unsigned long old_addr) -{ - struct klp_ops *ops; - struct klp_func *func; - - list_for_each_entry(ops, &klp_ops, node) { - func = list_first_entry(&ops->func_stack, struct klp_func, - stack_node); - if (func->old_addr == old_addr) - return ops; - } - - return NULL; -} - static bool klp_is_module(struct klp_object *obj) { return obj->name; @@ -117,7 +87,6 @@ static void klp_find_object_module(struct klp_object *obj) mutex_unlock(&module_mutex); } -/* klp_mutex must be held by caller */ static bool klp_is_patch_registered(struct klp_patch *patch) { struct klp_patch *mypatch; @@ -182,7 +151,10 @@ static int klp_find_object_symbol(const char *objname, const char *name, }; mutex_lock(&module_mutex); - kallsyms_on_each_symbol(klp_find_callback, &args); + if (objname) + module_kallsyms_on_each_symbol(klp_find_callback, &args); + else + kallsyms_on_each_symbol(klp_find_callback, &args); mutex_unlock(&module_mutex); /* @@ -233,7 +205,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) for (i = 0; i < relasec->sh_size / sizeof(Elf_Rela); i++) { sym = pmod->core_kallsyms.symtab + ELF_R_SYM(relas[i].r_info); if (sym->st_shndx != SHN_LIVEPATCH) { - pr_err("symbol %s is not marked as a livepatch symbol", + pr_err("symbol %s is not marked as a livepatch symbol\n", strtab + sym->st_name); return -EINVAL; } @@ -243,7 +215,7 @@ static int klp_resolve_symbols(Elf_Shdr *relasec, struct module *pmod) ".klp.sym.%55[^.].%127[^,],%lu", objname, symname, &sympos); if (cnt != 3) { - pr_err("symbol %s has an incorrectly formatted name", + pr_err("symbol %s has an incorrectly formatted name\n", strtab + sym->st_name); return -EINVAL; } @@ -288,7 +260,7 @@ static int klp_write_object_relocations(struct module *pmod, */ cnt = sscanf(secname, ".klp.rela.%55[^.]", sec_objname); if (cnt != 1) { - pr_err("section %s has an incorrectly formatted name", + pr_err("section %s has an incorrectly formatted name\n", secname); ret = -EINVAL; break; @@ -311,191 +283,30 @@ static int klp_write_object_relocations(struct module *pmod, return ret; } -static void notrace klp_ftrace_handler(unsigned long ip, - unsigned long parent_ip, - struct ftrace_ops *fops, - struct pt_regs *regs) -{ - struct klp_ops *ops; - struct klp_func *func; - - ops = container_of(fops, struct klp_ops, fops); - - rcu_read_lock(); - func = list_first_or_null_rcu(&ops->func_stack, struct klp_func, - stack_node); - if (WARN_ON_ONCE(!func)) - goto unlock; - - klp_arch_set_pc(regs, (unsigned long)func->new_func); -unlock: - rcu_read_unlock(); -} - -/* - * Convert a function address into the appropriate ftrace location. - * - * Usually this is just the address of the function, but on some architectures - * it's more complicated so allow them to provide a custom behaviour. 
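On most architectures the identity mapping in the default klp_get_ftrace_location() suffices; powerpc64le, for instance, overrides it to consult ftrace_location_range() because the mcount call site sits several instructions past the symbol address. As a purely hypothetical illustration, an architecture whose ftrace site were a fixed 8 bytes past the symbol might provide:

/* hypothetical arch header; the 8-byte offset is invented for illustration */
#define klp_get_ftrace_location klp_get_ftrace_location
static inline unsigned long klp_get_ftrace_location(unsigned long faddr)
{
	/* ftrace_location() validates that this really is an ftrace site */
	return ftrace_location(faddr + 8);
}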
- */ -#ifndef klp_get_ftrace_location -static unsigned long klp_get_ftrace_location(unsigned long faddr) -{ - return faddr; -} -#endif - -static void klp_disable_func(struct klp_func *func) -{ - struct klp_ops *ops; - - if (WARN_ON(func->state != KLP_ENABLED)) - return; - if (WARN_ON(!func->old_addr)) - return; - - ops = klp_find_ops(func->old_addr); - if (WARN_ON(!ops)) - return; - - if (list_is_singular(&ops->func_stack)) { - unsigned long ftrace_loc; - - ftrace_loc = klp_get_ftrace_location(func->old_addr); - if (WARN_ON(!ftrace_loc)) - return; - - WARN_ON(unregister_ftrace_function(&ops->fops)); - WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0)); - - list_del_rcu(&func->stack_node); - list_del(&ops->node); - kfree(ops); - } else { - list_del_rcu(&func->stack_node); - } - - func->state = KLP_DISABLED; -} - -static int klp_enable_func(struct klp_func *func) -{ - struct klp_ops *ops; - int ret; - - if (WARN_ON(!func->old_addr)) - return -EINVAL; - - if (WARN_ON(func->state != KLP_DISABLED)) - return -EINVAL; - - ops = klp_find_ops(func->old_addr); - if (!ops) { - unsigned long ftrace_loc; - - ftrace_loc = klp_get_ftrace_location(func->old_addr); - if (!ftrace_loc) { - pr_err("failed to find location for function '%s'\n", - func->old_name); - return -EINVAL; - } - - ops = kzalloc(sizeof(*ops), GFP_KERNEL); - if (!ops) - return -ENOMEM; - - ops->fops.func = klp_ftrace_handler; - ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | - FTRACE_OPS_FL_DYNAMIC | - FTRACE_OPS_FL_IPMODIFY; - - list_add(&ops->node, &klp_ops); - - INIT_LIST_HEAD(&ops->func_stack); - list_add_rcu(&func->stack_node, &ops->func_stack); - - ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0); - if (ret) { - pr_err("failed to set ftrace filter for function '%s' (%d)\n", - func->old_name, ret); - goto err; - } - - ret = register_ftrace_function(&ops->fops); - if (ret) { - pr_err("failed to register ftrace handler for function '%s' (%d)\n", - func->old_name, ret); - ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0); - goto err; - } - - - } else { - list_add_rcu(&func->stack_node, &ops->func_stack); - } - - func->state = KLP_ENABLED; - - return 0; - -err: - list_del_rcu(&func->stack_node); - list_del(&ops->node); - kfree(ops); - return ret; -} - -static void klp_disable_object(struct klp_object *obj) -{ - struct klp_func *func; - - klp_for_each_func(obj, func) - if (func->state == KLP_ENABLED) - klp_disable_func(func); - - obj->state = KLP_DISABLED; -} - -static int klp_enable_object(struct klp_object *obj) -{ - struct klp_func *func; - int ret; - - if (WARN_ON(obj->state != KLP_DISABLED)) - return -EINVAL; - - if (WARN_ON(!klp_is_object_loaded(obj))) - return -EINVAL; - - klp_for_each_func(obj, func) { - ret = klp_enable_func(func); - if (ret) { - klp_disable_object(obj); - return ret; - } - } - obj->state = KLP_ENABLED; - - return 0; -} - static int __klp_disable_patch(struct klp_patch *patch) { - struct klp_object *obj; + if (klp_transition_patch) + return -EBUSY; /* enforce stacking: only the last enabled patch can be disabled */ if (!list_is_last(&patch->list, &klp_patches) && - list_next_entry(patch, list)->state == KLP_ENABLED) + list_next_entry(patch, list)->enabled) return -EBUSY; - pr_notice("disabling patch '%s'\n", patch->mod->name); + klp_init_transition(patch, KLP_UNPATCHED); - klp_for_each_object(patch, obj) { - if (obj->state == KLP_ENABLED) - klp_disable_object(obj); - } + /* + * Enforce the order of the func->transition writes in + * klp_init_transition() and the TIF_PATCH_PENDING writes in + * 
klp_start_transition(). In the rare case where klp_ftrace_handler() + * is called shortly after klp_update_patch_state() switches the task, + * this ensures the handler sees that func->transition is set. + */ + smp_wmb(); - patch->state = KLP_DISABLED; + klp_start_transition(); + klp_try_complete_transition(); + patch->enabled = false; return 0; } @@ -519,7 +330,7 @@ int klp_disable_patch(struct klp_patch *patch) goto err; } - if (patch->state == KLP_DISABLED) { + if (!patch->enabled) { ret = -EINVAL; goto err; } @@ -537,32 +348,61 @@ static int __klp_enable_patch(struct klp_patch *patch) struct klp_object *obj; int ret; - if (WARN_ON(patch->state != KLP_DISABLED)) + if (klp_transition_patch) + return -EBUSY; + + if (WARN_ON(patch->enabled)) return -EINVAL; /* enforce stacking: only the first disabled patch can be enabled */ if (patch->list.prev != &klp_patches && - list_prev_entry(patch, list)->state == KLP_DISABLED) + !list_prev_entry(patch, list)->enabled) return -EBUSY; + /* + * A reference is taken on the patch module to prevent it from being + * unloaded. + * + * Note: For immediate (no consistency model) patches we don't allow + * patch modules to unload since there is no safe/sane method to + * determine if a thread is still running in the patched code contained + * in the patch module once the ftrace registration is successful. + */ + if (!try_module_get(patch->mod)) + return -ENODEV; + pr_notice("enabling patch '%s'\n", patch->mod->name); + klp_init_transition(patch, KLP_PATCHED); + + /* + * Enforce the order of the func->transition writes in + * klp_init_transition() and the ops->func_stack writes in + * klp_patch_object(), so that klp_ftrace_handler() will see the + * func->transition updates before the handler is registered and the + * new funcs become visible to the handler. + */ + smp_wmb(); + klp_for_each_object(patch, obj) { if (!klp_is_object_loaded(obj)) continue; - ret = klp_enable_object(obj); - if (ret) - goto unregister; + ret = klp_patch_object(obj); + if (ret) { + pr_warn("failed to enable patch '%s'\n", + patch->mod->name); + + klp_cancel_transition(); + return ret; + } } - patch->state = KLP_ENABLED; + klp_start_transition(); + klp_try_complete_transition(); + patch->enabled = true; return 0; - -unregister: - WARN_ON(__klp_disable_patch(patch)); - return ret; } /** @@ -599,6 +439,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch * /sys/kernel/livepatch/ * /sys/kernel/livepatch//enabled + * /sys/kernel/livepatch//transition * /sys/kernel/livepatch// * /sys/kernel/livepatch/// */ @@ -608,26 +449,34 @@ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, { struct klp_patch *patch; int ret; - unsigned long val; + bool enabled; - ret = kstrtoul(buf, 10, &val); + ret = kstrtobool(buf, &enabled); if (ret) - return -EINVAL; - - if (val != KLP_DISABLED && val != KLP_ENABLED) - return -EINVAL; + return ret; patch = container_of(kobj, struct klp_patch, kobj); mutex_lock(&klp_mutex); - if (val == patch->state) { + if (!klp_is_patch_registered(patch)) { + /* + * Module with the patch could either disappear meanwhile or is + * not properly initialized yet. 
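The smp_wmb() in __klp_enable_patch() pairs with the smp_rmb() in klp_ftrace_handler(): the transition flag must be published before the new function becomes reachable through the func_stack. A userspace analogy using C11 release/acquire ordering, loosely mirroring the kernel barriers (this is not the kernel API, just the same publication pattern):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int transition;	/* analogue of func->transition */
static atomic_int published;	/* analogue of func_stack visibility */

static void enable_path(void)
{
	atomic_store_explicit(&transition, 1, memory_order_relaxed);
	/* release ~ smp_wmb(): the transition store cannot pass this one */
	atomic_store_explicit(&published, 1, memory_order_release);
}

static void handler_path(void)
{
	/* acquire ~ smp_rmb(): pairs with the release store above */
	if (!atomic_load_explicit(&published, memory_order_acquire))
		return;		/* new func not visible yet */
	if (atomic_load_explicit(&transition, memory_order_relaxed))
		printf("handler is guaranteed to see the transition flag\n");
}

int main(void)
{
	enable_path();
	handler_path();
	return 0;
}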
+		 */
+		ret = -EINVAL;
+		goto err;
+	}
+
+	if (patch->enabled == enabled) {
 		/* already in requested state */
 		ret = -EINVAL;
 		goto err;
 	}
 
-	if (val == KLP_ENABLED) {
+	if (patch == klp_transition_patch) {
+		klp_reverse_transition();
+	} else if (enabled) {
 		ret = __klp_enable_patch(patch);
 		if (ret)
 			goto err;
@@ -652,21 +501,33 @@ static ssize_t enabled_show(struct kobject *kobj,
 	struct klp_patch *patch;
 
 	patch = container_of(kobj, struct klp_patch, kobj);
-	return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->state);
+	return snprintf(buf, PAGE_SIZE-1, "%d\n", patch->enabled);
+}
+
+static ssize_t transition_show(struct kobject *kobj,
+			       struct kobj_attribute *attr, char *buf)
+{
+	struct klp_patch *patch;
+
+	patch = container_of(kobj, struct klp_patch, kobj);
+	return snprintf(buf, PAGE_SIZE-1, "%d\n",
+			patch == klp_transition_patch);
 }
 
 static struct kobj_attribute enabled_kobj_attr = __ATTR_RW(enabled);
+static struct kobj_attribute transition_kobj_attr = __ATTR_RO(transition);
 static struct attribute *klp_patch_attrs[] = {
 	&enabled_kobj_attr.attr,
+	&transition_kobj_attr.attr,
 	NULL
 };
 
 static void klp_kobj_release_patch(struct kobject *kobj)
 {
-	/*
-	 * Once we have a consistency model we'll need to module_put() the
-	 * patch module here. See klp_register_patch() for more details.
-	 */
+	struct klp_patch *patch;
+
+	patch = container_of(kobj, struct klp_patch, kobj);
+	complete(&patch->finish);
 }
 
 static struct kobj_type klp_ktype_patch = {
@@ -737,7 +598,6 @@ static void klp_free_patch(struct klp_patch *patch)
 	klp_free_objects_limited(patch, NULL);
 	if (!list_empty(&patch->list))
 		list_del(&patch->list);
-	kobject_put(&patch->kobj);
 }
 
 static int klp_init_func(struct klp_object *obj, struct klp_func *func)
@@ -746,7 +606,8 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func)
 		return -EINVAL;
 
 	INIT_LIST_HEAD(&func->stack_node);
-	func->state = KLP_DISABLED;
+	func->patched = false;
+	func->transition = false;
 
 	/* The format for the sysfs directory is <patch>/<func>,<sympos> where sympos
 	 * is the nth occurrence of this symbol in kallsyms for the patched
@@ -787,6 +648,22 @@ static int klp_init_object_loaded(struct klp_patch *patch,
 						  &func->old_addr);
 		if (ret)
 			return ret;
+
+		ret = kallsyms_lookup_size_offset(func->old_addr,
+						  &func->old_size, NULL);
+		if (!ret) {
+			pr_err("kallsyms size lookup failed for '%s'\n",
+			       func->old_name);
+			return -ENOENT;
+		}
+
+		ret = kallsyms_lookup_size_offset((unsigned long)func->new_func,
+						  &func->new_size, NULL);
+		if (!ret) {
+			pr_err("kallsyms size lookup failed for '%s' replacement\n",
+			       func->old_name);
+			return -ENOENT;
+		}
 	}
 
 	return 0;
@@ -801,7 +678,7 @@ static int klp_init_object(struct klp_patch *patch, struct klp_object *obj)
 	if (!obj->funcs)
 		return -EINVAL;
 
-	obj->state = KLP_DISABLED;
+	obj->patched = false;
 	obj->mod = NULL;
 
 	klp_find_object_module(obj);
@@ -842,12 +719,15 @@ static int klp_init_patch(struct klp_patch *patch)
 
 	mutex_lock(&klp_mutex);
 
-	patch->state = KLP_DISABLED;
+	patch->enabled = false;
+	init_completion(&patch->finish);
 
 	ret = kobject_init_and_add(&patch->kobj, &klp_ktype_patch,
 				   klp_root_kobj, "%s", patch->mod->name);
-	if (ret)
-		goto unlock;
+	if (ret) {
+		mutex_unlock(&klp_mutex);
+		return ret;
+	}
 
 	klp_for_each_object(patch, obj) {
 		ret = klp_init_object(patch, obj);
@@ -863,9 +743,12 @@ static int klp_init_patch(struct klp_patch *patch)
 
 free:
 	klp_free_objects_limited(patch, obj);
-	kobject_put(&patch->kobj);
-unlock:
+
 	mutex_unlock(&klp_mutex);
+
+	kobject_put(&patch->kobj);
+	wait_for_completion(&patch->finish);
+
 	return ret;
 }
 
@@ -879,23
+762,29 @@ static int klp_init_patch(struct klp_patch *patch) */ int klp_unregister_patch(struct klp_patch *patch) { - int ret = 0; + int ret; mutex_lock(&klp_mutex); if (!klp_is_patch_registered(patch)) { ret = -EINVAL; - goto out; + goto err; } - if (patch->state == KLP_ENABLED) { + if (patch->enabled) { ret = -EBUSY; - goto out; + goto err; } klp_free_patch(patch); -out: + mutex_unlock(&klp_mutex); + + kobject_put(&patch->kobj); + wait_for_completion(&patch->finish); + + return 0; +err: mutex_unlock(&klp_mutex); return ret; } @@ -908,17 +797,18 @@ EXPORT_SYMBOL_GPL(klp_unregister_patch); * Initializes the data structure associated with the patch and * creates the sysfs interface. * + * There is no need to take the reference on the patch module here. It is done + * later when the patch is enabled. + * * Return: 0 on success, otherwise error */ int klp_register_patch(struct klp_patch *patch) { - int ret; - if (!patch || !patch->mod) return -EINVAL; if (!is_livepatch_module(patch->mod)) { - pr_err("module %s is not marked as a livepatch module", + pr_err("module %s is not marked as a livepatch module\n", patch->mod->name); return -EINVAL; } @@ -927,20 +817,16 @@ int klp_register_patch(struct klp_patch *patch) return -ENODEV; /* - * A reference is taken on the patch module to prevent it from being - * unloaded. Right now, we don't allow patch modules to unload since - * there is currently no method to determine if a thread is still - * running in the patched code contained in the patch module once - * the ftrace registration is successful. + * Architectures without reliable stack traces have to set + * patch->immediate because there's currently no way to patch kthreads + * with the consistency model. */ - if (!try_module_get(patch->mod)) - return -ENODEV; - - ret = klp_init_patch(patch); - if (ret) - module_put(patch->mod); + if (!klp_have_reliable_stack() && !patch->immediate) { + pr_err("This architecture doesn't have support for the livepatch consistency model.\n"); + return -ENOSYS; + } - return ret; + return klp_init_patch(patch); } EXPORT_SYMBOL_GPL(klp_register_patch); @@ -975,13 +861,17 @@ int klp_module_coming(struct module *mod) goto err; } - if (patch->state == KLP_DISABLED) + /* + * Only patch the module if the patch is enabled or is + * in transition. + */ + if (!patch->enabled && patch != klp_transition_patch) break; pr_notice("applying patch '%s' to loading module '%s'\n", patch->mod->name, obj->mod->name); - ret = klp_enable_object(obj); + ret = klp_patch_object(obj); if (ret) { pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", patch->mod->name, obj->mod->name, ret); @@ -1032,10 +922,14 @@ void klp_module_going(struct module *mod) if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; - if (patch->state != KLP_DISABLED) { + /* + * Only unpatch the module if the patch is enabled or + * is in transition. 
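For reference, the klp_have_reliable_stack() gate used above is, at this point in the series, just a pair of config checks (roughly as defined in include/linux/livepatch.h):

static inline bool klp_have_reliable_stack(void)
{
	return IS_ENABLED(CONFIG_STACKTRACE) &&
	       IS_ENABLED(CONFIG_HAVE_RELIABLE_STACKTRACE);
}

Architectures that cannot produce reliable stack traces must set patch->immediate, giving up the consistency model (and the ability to unload the patch module) in exchange for being able to patch at all.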
+			 */
+			if (patch->enabled || patch == klp_transition_patch) {
 				pr_notice("reverting patch '%s' on unloading module '%s'\n",
 					  patch->mod->name, obj->mod->name);
-				klp_disable_object(obj);
+				klp_unpatch_object(obj);
 			}
 
 			klp_free_object_loaded(obj);
diff --git a/kernel/livepatch/core.h b/kernel/livepatch/core.h
new file mode 100644
index 0000000000000000000000000000000000000000..c74f24c478379fdd6a22f312eb970f8c2ba42a74
--- /dev/null
+++ b/kernel/livepatch/core.h
@@ -0,0 +1,6 @@
+#ifndef _LIVEPATCH_CORE_H
+#define _LIVEPATCH_CORE_H
+
+extern struct mutex klp_mutex;
+
+#endif /* _LIVEPATCH_CORE_H */
diff --git a/kernel/livepatch/patch.c b/kernel/livepatch/patch.c
new file mode 100644
index 0000000000000000000000000000000000000000..f8269036bf0b84f89faa2db3b5dc5fe4c259a019
--- /dev/null
+++ b/kernel/livepatch/patch.c
@@ -0,0 +1,272 @@
+/*
+ * patch.c - livepatch patching functions
+ *
+ * Copyright (C) 2014 Seth Jennings <sjenning@redhat.com>
+ * Copyright (C) 2014 SUSE
+ * Copyright (C) 2015 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/livepatch.h>
+#include <linux/list.h>
+#include <linux/ftrace.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <linux/bug.h>
+#include <linux/printk.h>
+#include "patch.h"
+#include "transition.h"
+
+static LIST_HEAD(klp_ops);
+
+struct klp_ops *klp_find_ops(unsigned long old_addr)
+{
+	struct klp_ops *ops;
+	struct klp_func *func;
+
+	list_for_each_entry(ops, &klp_ops, node) {
+		func = list_first_entry(&ops->func_stack, struct klp_func,
+					stack_node);
+		if (func->old_addr == old_addr)
+			return ops;
+	}
+
+	return NULL;
+}
+
+static void notrace klp_ftrace_handler(unsigned long ip,
+				       unsigned long parent_ip,
+				       struct ftrace_ops *fops,
+				       struct pt_regs *regs)
+{
+	struct klp_ops *ops;
+	struct klp_func *func;
+	int patch_state;
+
+	ops = container_of(fops, struct klp_ops, fops);
+
+	rcu_read_lock();
+
+	func = list_first_or_null_rcu(&ops->func_stack, struct klp_func,
+				      stack_node);
+
+	/*
+	 * func should never be NULL because preemption should be disabled here
+	 * and unregister_ftrace_function() does the equivalent of a
+	 * synchronize_sched() before the func_stack removal.
+	 */
+	if (WARN_ON_ONCE(!func))
+		goto unlock;
+
+	/*
+	 * In the enable path, enforce the order of the ops->func_stack and
+	 * func->transition reads. The corresponding write barrier is in
+	 * __klp_enable_patch().
+	 *
+	 * (Note that this barrier technically isn't needed in the disable
+	 * path. In the rare case where klp_update_patch_state() runs before
+	 * this handler, its TIF_PATCH_PENDING read and this func->transition
+	 * read need to be ordered. But klp_update_patch_state() already
+	 * enforces that.)
+	 */
+	smp_rmb();
+
+	if (unlikely(func->transition)) {
+
+		/*
+		 * Enforce the order of the func->transition and
+		 * current->patch_state reads. Otherwise we could read an
+		 * out-of-date task state and pick the wrong function. The
+		 * corresponding write barrier is in klp_init_transition().
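The fallback that follows can be mocked in a few lines of userspace C: the newest klp_func sits at the head of the func_stack, and a still-unpatched task is routed to the next-older entry, or to the original function when no older patch exists (the names and states below are invented for illustration):

#include <stdio.h>

enum { KLP_UNPATCHED, KLP_PATCHED };

struct func {
	const char *name;
	int transition;
	struct func *next;	/* next-oldest entry on the func_stack */
};

static const char *select_func(struct func *top, int task_state)
{
	struct func *f = top;

	if (f->transition && task_state == KLP_UNPATCHED) {
		f = f->next;
		if (!f)
			return "original";	/* run the unpatched code */
	}
	return f->name;
}

int main(void)
{
	struct func v1 = { "fix_v1", 0, NULL };
	struct func v2 = { "fix_v2", 1, &v1 };	/* v2 is mid-transition */

	printf("unpatched task runs: %s\n", select_func(&v2, KLP_UNPATCHED));
	printf("patched task runs:   %s\n", select_func(&v2, KLP_PATCHED));
	return 0;
}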
+ */ + smp_rmb(); + + patch_state = current->patch_state; + + WARN_ON_ONCE(patch_state == KLP_UNDEFINED); + + if (patch_state == KLP_UNPATCHED) { + /* + * Use the previously patched version of the function. + * If no previous patches exist, continue with the + * original function. + */ + func = list_entry_rcu(func->stack_node.next, + struct klp_func, stack_node); + + if (&func->stack_node == &ops->func_stack) + goto unlock; + } + } + + klp_arch_set_pc(regs, (unsigned long)func->new_func); +unlock: + rcu_read_unlock(); +} + +/* + * Convert a function address into the appropriate ftrace location. + * + * Usually this is just the address of the function, but on some architectures + * it's more complicated so allow them to provide a custom behaviour. + */ +#ifndef klp_get_ftrace_location +static unsigned long klp_get_ftrace_location(unsigned long faddr) +{ + return faddr; +} +#endif + +static void klp_unpatch_func(struct klp_func *func) +{ + struct klp_ops *ops; + + if (WARN_ON(!func->patched)) + return; + if (WARN_ON(!func->old_addr)) + return; + + ops = klp_find_ops(func->old_addr); + if (WARN_ON(!ops)) + return; + + if (list_is_singular(&ops->func_stack)) { + unsigned long ftrace_loc; + + ftrace_loc = klp_get_ftrace_location(func->old_addr); + if (WARN_ON(!ftrace_loc)) + return; + + WARN_ON(unregister_ftrace_function(&ops->fops)); + WARN_ON(ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0)); + + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); + } else { + list_del_rcu(&func->stack_node); + } + + func->patched = false; +} + +static int klp_patch_func(struct klp_func *func) +{ + struct klp_ops *ops; + int ret; + + if (WARN_ON(!func->old_addr)) + return -EINVAL; + + if (WARN_ON(func->patched)) + return -EINVAL; + + ops = klp_find_ops(func->old_addr); + if (!ops) { + unsigned long ftrace_loc; + + ftrace_loc = klp_get_ftrace_location(func->old_addr); + if (!ftrace_loc) { + pr_err("failed to find location for function '%s'\n", + func->old_name); + return -EINVAL; + } + + ops = kzalloc(sizeof(*ops), GFP_KERNEL); + if (!ops) + return -ENOMEM; + + ops->fops.func = klp_ftrace_handler; + ops->fops.flags = FTRACE_OPS_FL_SAVE_REGS | + FTRACE_OPS_FL_DYNAMIC | + FTRACE_OPS_FL_IPMODIFY; + + list_add(&ops->node, &klp_ops); + + INIT_LIST_HEAD(&ops->func_stack); + list_add_rcu(&func->stack_node, &ops->func_stack); + + ret = ftrace_set_filter_ip(&ops->fops, ftrace_loc, 0, 0); + if (ret) { + pr_err("failed to set ftrace filter for function '%s' (%d)\n", + func->old_name, ret); + goto err; + } + + ret = register_ftrace_function(&ops->fops); + if (ret) { + pr_err("failed to register ftrace handler for function '%s' (%d)\n", + func->old_name, ret); + ftrace_set_filter_ip(&ops->fops, ftrace_loc, 1, 0); + goto err; + } + + + } else { + list_add_rcu(&func->stack_node, &ops->func_stack); + } + + func->patched = true; + + return 0; + +err: + list_del_rcu(&func->stack_node); + list_del(&ops->node); + kfree(ops); + return ret; +} + +void klp_unpatch_object(struct klp_object *obj) +{ + struct klp_func *func; + + klp_for_each_func(obj, func) + if (func->patched) + klp_unpatch_func(func); + + obj->patched = false; +} + +int klp_patch_object(struct klp_object *obj) +{ + struct klp_func *func; + int ret; + + if (WARN_ON(obj->patched)) + return -EINVAL; + + klp_for_each_func(obj, func) { + ret = klp_patch_func(func); + if (ret) { + klp_unpatch_object(obj); + return ret; + } + } + obj->patched = true; + + return 0; +} + +void klp_unpatch_objects(struct klp_patch *patch) +{ + struct klp_object *obj; + + 
klp_for_each_object(patch, obj)
+		if (obj->patched)
+			klp_unpatch_object(obj);
+}
diff --git a/kernel/livepatch/patch.h b/kernel/livepatch/patch.h
new file mode 100644
index 0000000000000000000000000000000000000000..0db227170c36a10de1c2774d06dd03aeadeeb07b
--- /dev/null
+++ b/kernel/livepatch/patch.h
@@ -0,0 +1,33 @@
+#ifndef _LIVEPATCH_PATCH_H
+#define _LIVEPATCH_PATCH_H
+
+#include <linux/livepatch.h>
+#include <linux/list.h>
+#include <linux/ftrace.h>
+
+/**
+ * struct klp_ops - structure for tracking registered ftrace ops structs
+ *
+ * A single ftrace_ops is shared between all enabled replacement functions
+ * (klp_func structs) which have the same old_addr. This allows the switch
+ * between function versions to happen instantaneously by updating the klp_ops
+ * struct's func_stack list. The winner is the klp_func at the top of the
+ * func_stack (front of the list).
+ *
+ * @node: node for the global klp_ops list
+ * @func_stack: list head for the stack of klp_func's (active func is on top)
+ * @fops: registered ftrace ops struct
+ */
+struct klp_ops {
+	struct list_head node;
+	struct list_head func_stack;
+	struct ftrace_ops fops;
+};
+
+struct klp_ops *klp_find_ops(unsigned long old_addr);
+
+int klp_patch_object(struct klp_object *obj);
+void klp_unpatch_object(struct klp_object *obj);
+void klp_unpatch_objects(struct klp_patch *patch);
+
+#endif /* _LIVEPATCH_PATCH_H */
diff --git a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c
new file mode 100644
index 0000000000000000000000000000000000000000..adc0cc64aa4b6ae199e5f4a5b2af0ad59c382175
--- /dev/null
+++ b/kernel/livepatch/transition.c
@@ -0,0 +1,553 @@
+/*
+ * transition.c - Kernel Live Patching transition functions
+ *
+ * Copyright (C) 2015-2016 Josh Poimboeuf <jpoimboe@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/stacktrace.h>
+#include "core.h"
+#include "patch.h"
+#include "transition.h"
+#include "../sched/sched.h"
+
+#define MAX_STACK_ENTRIES 100
+#define STACK_ERR_BUF_SIZE 128
+
+struct klp_patch *klp_transition_patch;
+
+static int klp_target_state = KLP_UNDEFINED;
+
+/*
+ * This work can be performed periodically to finish patching or unpatching any
+ * "straggler" tasks which failed to transition in the first attempt.
+ */
+static void klp_transition_work_fn(struct work_struct *work)
+{
+	mutex_lock(&klp_mutex);
+
+	if (klp_transition_patch)
+		klp_try_complete_transition();
+
+	mutex_unlock(&klp_mutex);
+}
+static DECLARE_DELAYED_WORK(klp_transition_work, klp_transition_work_fn);
+
+/*
+ * The transition to the target patch state is complete. Clean up the data
+ * structures.
+ */
+static void klp_complete_transition(void)
+{
+	struct klp_object *obj;
+	struct klp_func *func;
+	struct task_struct *g, *task;
+	unsigned int cpu;
+	bool immediate_func = false;
+
+	if (klp_target_state == KLP_UNPATCHED) {
+		/*
+		 * All tasks have transitioned to KLP_UNPATCHED so we can now
+		 * remove the new functions from the func_stack.
+ */ + klp_unpatch_objects(klp_transition_patch); + + /* + * Make sure klp_ftrace_handler() can no longer see functions + * from this patch on the ops->func_stack. Otherwise, after + * func->transition gets cleared, the handler may choose a + * removed function. + */ + synchronize_rcu(); + } + + if (klp_transition_patch->immediate) + goto done; + + klp_for_each_object(klp_transition_patch, obj) { + klp_for_each_func(obj, func) { + func->transition = false; + if (func->immediate) + immediate_func = true; + } + } + + if (klp_target_state == KLP_UNPATCHED && !immediate_func) + module_put(klp_transition_patch->mod); + + /* Prevent klp_ftrace_handler() from seeing KLP_UNDEFINED state */ + if (klp_target_state == KLP_PATCHED) + synchronize_rcu(); + + read_lock(&tasklist_lock); + for_each_process_thread(g, task) { + WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); + task->patch_state = KLP_UNDEFINED; + } + read_unlock(&tasklist_lock); + + for_each_possible_cpu(cpu) { + task = idle_task(cpu); + WARN_ON_ONCE(test_tsk_thread_flag(task, TIF_PATCH_PENDING)); + task->patch_state = KLP_UNDEFINED; + } + +done: + klp_target_state = KLP_UNDEFINED; + klp_transition_patch = NULL; +} + +/* + * This is called in the error path, to cancel a transition before it has + * started, i.e. klp_init_transition() has been called but + * klp_start_transition() hasn't. If the transition *has* been started, + * klp_reverse_transition() should be used instead. + */ +void klp_cancel_transition(void) +{ + if (WARN_ON_ONCE(klp_target_state != KLP_PATCHED)) + return; + + klp_target_state = KLP_UNPATCHED; + klp_complete_transition(); +} + +/* + * Switch the patched state of the task to the set of functions in the target + * patch state. + * + * NOTE: If task is not 'current', the caller must ensure the task is inactive. + * Otherwise klp_ftrace_handler() might read the wrong 'patch_state' value. + */ +void klp_update_patch_state(struct task_struct *task) +{ + rcu_read_lock(); + + /* + * This test_and_clear_tsk_thread_flag() call also serves as a read + * barrier (smp_rmb) for two cases: + * + * 1) Enforce the order of the TIF_PATCH_PENDING read and the + * klp_target_state read. The corresponding write barrier is in + * klp_init_transition(). + * + * 2) Enforce the order of the TIF_PATCH_PENDING read and a future read + * of func->transition, if klp_ftrace_handler() is called later on + * the same CPU. See __klp_disable_patch(). + */ + if (test_and_clear_tsk_thread_flag(task, TIF_PATCH_PENDING)) + task->patch_state = READ_ONCE(klp_target_state); + + rcu_read_unlock(); +} + +/* + * Determine whether the given stack trace includes any references to a + * to-be-patched or to-be-unpatched function. + */ +static int klp_check_stack_func(struct klp_func *func, + struct stack_trace *trace) +{ + unsigned long func_addr, func_size, address; + struct klp_ops *ops; + int i; + + if (func->immediate) + return 0; + + for (i = 0; i < trace->nr_entries; i++) { + address = trace->entries[i]; + + if (klp_target_state == KLP_UNPATCHED) { + /* + * Check for the to-be-unpatched function + * (the func itself). + */ + func_addr = (unsigned long)func->new_func; + func_size = func->new_size; + } else { + /* + * Check for the to-be-patched function + * (the previous func). 
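The test applied to each stack entry below is a plain half-open interval check; a standalone sketch with made-up addresses:

#include <stdio.h>

static int addr_in_func(unsigned long address,
			unsigned long func_addr, unsigned long func_size)
{
	/* same predicate klp_check_stack_func() applies per entry */
	return address >= func_addr && address < func_addr + func_size;
}

int main(void)
{
	unsigned long func_addr = 0xffff0000, func_size = 0x80;
	unsigned long stack[] = { 0xffff1234, 0xffff0040, 0xffff2222 };
	int i;

	for (i = 0; i < 3; i++)
		if (addr_in_func(stack[i], func_addr, func_size))
			printf("entry %d blocks the transition (-EAGAIN)\n", i);
	return 0;
}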
+ */ + ops = klp_find_ops(func->old_addr); + + if (list_is_singular(&ops->func_stack)) { + /* original function */ + func_addr = func->old_addr; + func_size = func->old_size; + } else { + /* previously patched function */ + struct klp_func *prev; + + prev = list_next_entry(func, stack_node); + func_addr = (unsigned long)prev->new_func; + func_size = prev->new_size; + } + } + + if (address >= func_addr && address < func_addr + func_size) + return -EAGAIN; + } + + return 0; +} + +/* + * Determine whether it's safe to transition the task to the target patch state + * by looking for any to-be-patched or to-be-unpatched functions on its stack. + */ +static int klp_check_stack(struct task_struct *task, char *err_buf) +{ + static unsigned long entries[MAX_STACK_ENTRIES]; + struct stack_trace trace; + struct klp_object *obj; + struct klp_func *func; + int ret; + + trace.skip = 0; + trace.nr_entries = 0; + trace.max_entries = MAX_STACK_ENTRIES; + trace.entries = entries; + ret = save_stack_trace_tsk_reliable(task, &trace); + WARN_ON_ONCE(ret == -ENOSYS); + if (ret) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d has an unreliable stack\n", + __func__, task->comm, task->pid); + return ret; + } + + klp_for_each_object(klp_transition_patch, obj) { + if (!obj->patched) + continue; + klp_for_each_func(obj, func) { + ret = klp_check_stack_func(func, &trace); + if (ret) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is sleeping on function %s\n", + __func__, task->comm, task->pid, + func->old_name); + return ret; + } + } + } + + return 0; +} + +/* + * Try to safely switch a task to the target patch state. If it's currently + * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or + * if the stack is unreliable, return false. + */ +static bool klp_try_switch_task(struct task_struct *task) +{ + struct rq *rq; + struct rq_flags flags; + int ret; + bool success = false; + char err_buf[STACK_ERR_BUF_SIZE]; + + err_buf[0] = '\0'; + + /* check if this task has already switched over */ + if (task->patch_state == klp_target_state) + return true; + + /* + * For arches which don't have reliable stack traces, we have to rely + * on other methods (e.g., switching tasks at kernel exit). + */ + if (!klp_have_reliable_stack()) + return false; + + /* + * Now try to check the stack for any to-be-patched or to-be-unpatched + * functions. If all goes well, switch the task to the target patch + * state. + */ + rq = task_rq_lock(task, &flags); + + if (task_running(rq, task) && task != current) { + snprintf(err_buf, STACK_ERR_BUF_SIZE, + "%s: %s:%d is running\n", __func__, task->comm, + task->pid); + goto done; + } + + ret = klp_check_stack(task, err_buf); + if (ret) + goto done; + + success = true; + + clear_tsk_thread_flag(task, TIF_PATCH_PENDING); + task->patch_state = klp_target_state; + +done: + task_rq_unlock(rq, task, &flags); + + /* + * Due to console deadlock issues, pr_debug() can't be used while + * holding the task rq lock. Instead we have to use a temporary buffer + * and print the debug message after releasing the lock. + */ + if (err_buf[0] != '\0') + pr_debug("%s", err_buf); + + return success; + +} + +/* + * Try to switch all remaining tasks to the target patch state by walking the + * stacks of sleeping tasks and looking for any to-be-patched or + * to-be-unpatched functions. If such functions are found, the task can't be + * switched yet. + * + * If any tasks are still stuck in the initial patch state, schedule a retry. 
+ */ +void klp_try_complete_transition(void) +{ + unsigned int cpu; + struct task_struct *g, *task; + bool complete = true; + + WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); + + /* + * If the patch can be applied or reverted immediately, skip the + * per-task transitions. + */ + if (klp_transition_patch->immediate) + goto success; + + /* + * Try to switch the tasks to the target patch state by walking their + * stacks and looking for any to-be-patched or to-be-unpatched + * functions. If such functions are found on a stack, or if the stack + * is deemed unreliable, the task can't be switched yet. + * + * Usually this will transition most (or all) of the tasks on a system + * unless the patch includes changes to a very common function. + */ + read_lock(&tasklist_lock); + for_each_process_thread(g, task) + if (!klp_try_switch_task(task)) + complete = false; + read_unlock(&tasklist_lock); + + /* + * Ditto for the idle "swapper" tasks. + */ + get_online_cpus(); + for_each_possible_cpu(cpu) { + task = idle_task(cpu); + if (cpu_online(cpu)) { + if (!klp_try_switch_task(task)) + complete = false; + } else if (task->patch_state != klp_target_state) { + /* offline idle tasks can be switched immediately */ + clear_tsk_thread_flag(task, TIF_PATCH_PENDING); + task->patch_state = klp_target_state; + } + } + put_online_cpus(); + + if (!complete) { + /* + * Some tasks weren't able to be switched over. Try again + * later and/or wait for other methods like kernel exit + * switching. + */ + schedule_delayed_work(&klp_transition_work, + round_jiffies_relative(HZ)); + return; + } + +success: + pr_notice("'%s': %s complete\n", klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + + /* we're done, now cleanup the data structures */ + klp_complete_transition(); +} + +/* + * Start the transition to the specified target patch state so tasks can begin + * switching to it. + */ +void klp_start_transition(void) +{ + struct task_struct *g, *task; + unsigned int cpu; + + WARN_ON_ONCE(klp_target_state == KLP_UNDEFINED); + + pr_notice("'%s': %s...\n", klp_transition_patch->mod->name, + klp_target_state == KLP_PATCHED ? "patching" : "unpatching"); + + /* + * If the patch can be applied or reverted immediately, skip the + * per-task transitions. + */ + if (klp_transition_patch->immediate) + return; + + /* + * Mark all normal tasks as needing a patch state update. They'll + * switch either in klp_try_complete_transition() or as they exit the + * kernel. + */ + read_lock(&tasklist_lock); + for_each_process_thread(g, task) + if (task->patch_state != klp_target_state) + set_tsk_thread_flag(task, TIF_PATCH_PENDING); + read_unlock(&tasklist_lock); + + /* + * Mark all idle tasks as needing a patch state update. They'll switch + * either in klp_try_complete_transition() or at the idle loop switch + * point. + */ + for_each_possible_cpu(cpu) { + task = idle_task(cpu); + if (task->patch_state != klp_target_state) + set_tsk_thread_flag(task, TIF_PATCH_PENDING); + } +} + +/* + * Initialize the global target patch state and all tasks to the initial patch + * state, and initialize all function transition states to true in preparation + * for patching or unpatching. 
+ */ +void klp_init_transition(struct klp_patch *patch, int state) +{ + struct task_struct *g, *task; + unsigned int cpu; + struct klp_object *obj; + struct klp_func *func; + int initial_state = !state; + + WARN_ON_ONCE(klp_target_state != KLP_UNDEFINED); + + klp_transition_patch = patch; + + /* + * Set the global target patch state which tasks will switch to. This + * has no effect until the TIF_PATCH_PENDING flags get set later. + */ + klp_target_state = state; + + /* + * If the patch can be applied or reverted immediately, skip the + * per-task transitions. + */ + if (patch->immediate) + return; + + /* + * Initialize all tasks to the initial patch state to prepare them for + * switching to the target state. + */ + read_lock(&tasklist_lock); + for_each_process_thread(g, task) { + WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); + task->patch_state = initial_state; + } + read_unlock(&tasklist_lock); + + /* + * Ditto for the idle "swapper" tasks. + */ + for_each_possible_cpu(cpu) { + task = idle_task(cpu); + WARN_ON_ONCE(task->patch_state != KLP_UNDEFINED); + task->patch_state = initial_state; + } + + /* + * Enforce the order of the task->patch_state initializations and the + * func->transition updates to ensure that klp_ftrace_handler() doesn't + * see a func in transition with a task->patch_state of KLP_UNDEFINED. + * + * Also enforce the order of the klp_target_state write and future + * TIF_PATCH_PENDING writes to ensure klp_update_patch_state() doesn't + * set a task->patch_state to KLP_UNDEFINED. + */ + smp_wmb(); + + /* + * Set the func transition states so klp_ftrace_handler() will know to + * switch to the transition logic. + * + * When patching, the funcs aren't yet in the func_stack and will be + * made visible to the ftrace handler shortly by the calls to + * klp_patch_object(). + * + * When unpatching, the funcs are already in the func_stack and so are + * already visible to the ftrace handler. + */ + klp_for_each_object(patch, obj) + klp_for_each_func(obj, func) + func->transition = true; +} + +/* + * This function can be called in the middle of an existing transition to + * reverse the direction of the target patch state. This can be done to + * effectively cancel an existing enable or disable operation if there are any + * tasks which are stuck in the initial patch state. + */ +void klp_reverse_transition(void) +{ + unsigned int cpu; + struct task_struct *g, *task; + + klp_transition_patch->enabled = !klp_transition_patch->enabled; + + klp_target_state = !klp_target_state; + + /* + * Clear all TIF_PATCH_PENDING flags to prevent races caused by + * klp_update_patch_state() running in parallel with + * klp_start_transition(). 
+	 */
+	read_lock(&tasklist_lock);
+	for_each_process_thread(g, task)
+		clear_tsk_thread_flag(task, TIF_PATCH_PENDING);
+	read_unlock(&tasklist_lock);
+
+	for_each_possible_cpu(cpu)
+		clear_tsk_thread_flag(idle_task(cpu), TIF_PATCH_PENDING);
+
+	/* Let any remaining calls to klp_update_patch_state() complete */
+	synchronize_rcu();
+
+	klp_start_transition();
+}
+
+/* Called from copy_process() during fork */
+void klp_copy_process(struct task_struct *child)
+{
+	child->patch_state = current->patch_state;
+
+	/* TIF_PATCH_PENDING gets copied in setup_thread_stack() */
+}
diff --git a/kernel/livepatch/transition.h b/kernel/livepatch/transition.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce09b326546cd3bf71f6e0a300bfd865eb1677ca
--- /dev/null
+++ b/kernel/livepatch/transition.h
@@ -0,0 +1,14 @@
+#ifndef _LIVEPATCH_TRANSITION_H
+#define _LIVEPATCH_TRANSITION_H
+
+#include <linux/livepatch.h>
+
+extern struct klp_patch *klp_transition_patch;
+
+void klp_init_transition(struct klp_patch *patch, int state);
+void klp_cancel_transition(void);
+void klp_start_transition(void);
+void klp_try_complete_transition(void);
+void klp_reverse_transition(void);
+
+#endif /* _LIVEPATCH_TRANSITION_H */
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 34e97d970761671a804a9ba2dede77809f42c155..c0e31bfee25c2ab0cfefef14b9e24e2ce8a5c523 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -30,6 +30,7 @@
 #include <linux/mutex.h>
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
+#include <linux/sched/mm.h>
 #include <linux/sched/task.h>
 #include <linux/delay.h>
 #include <linux/module.h>
@@ -660,6 +661,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 	struct lockdep_subclass_key *key;
 	struct hlist_head *hash_head;
 	struct lock_class *class;
+	bool is_static = false;
 
 	if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
 		debug_locks_off();
@@ -673,10 +675,23 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 
 	/*
 	 * Static locks do not have their class-keys yet - for them the key
-	 * is the lock object itself:
+	 * is the lock object itself. If the lock is in the per cpu area,
+	 * the canonical address of the lock (per cpu offset removed) is
+	 * used.
 	 */
-	if (unlikely(!lock->key))
-		lock->key = (void *)lock;
+	if (unlikely(!lock->key)) {
+		unsigned long can_addr, addr = (unsigned long)lock;
+
+		if (__is_kernel_percpu_address(addr, &can_addr))
+			lock->key = (void *)can_addr;
+		else if (__is_module_percpu_address(addr, &can_addr))
+			lock->key = (void *)can_addr;
+		else if (static_obj(lock))
+			lock->key = (void *)lock;
+		else
+			return ERR_PTR(-EINVAL);
+		is_static = true;
+	}
 
 	/*
 	 * NOTE: the class-key must be unique. For dynamic locks, a static
@@ -708,7 +723,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
 		}
 	}
 
-	return NULL;
+	return is_static || static_obj(lock->key) ? NULL : ERR_PTR(-EINVAL);
 }
 
 /*
@@ -726,19 +741,18 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 	DEBUG_LOCKS_WARN_ON(!irqs_disabled());
 
 	class = look_up_lock_class(lock, subclass);
-	if (likely(class))
+	if (likely(!IS_ERR_OR_NULL(class)))
 		goto out_set_class_cache;
 
 	/*
 	 * Debug-check: all keys must be persistent!
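The per-CPU canonicalisation introduced in look_up_lock_class() above can be illustrated standalone: each CPU's copy of a static per-CPU lock differs from the others only by the per-CPU base offset, so subtracting that offset yields a single shared class key (the offsets and addresses below are invented):

#include <stdio.h>

#define NR_CPUS 4

static unsigned long per_cpu_offset[NR_CPUS] = { 0x0, 0x1000, 0x2000, 0x3000 };

/* analogue of what __is_kernel_percpu_address() computes into can_addr */
static unsigned long canonical(unsigned long addr, int cpu)
{
	return addr - per_cpu_offset[cpu];
}

int main(void)
{
	unsigned long base = 0x400100;	/* pretend &some_percpu_lock */
	int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d copy %#lx -> class key %#lx\n",
		       cpu, base + per_cpu_offset[cpu],
		       canonical(base + per_cpu_offset[cpu], cpu));
	return 0;
}

All four copies map to the same key, 0x400100, so lockdep treats them as one class instead of rejecting them as non-static objects.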
- */ - if (!static_obj(lock->key)) { + */ + if (IS_ERR(class)) { debug_locks_off(); printk("INFO: trying to register non-static key.\n"); printk("the code is fine but needs lockdep annotation.\n"); printk("turning off the locking correctness validator.\n"); dump_stack(); - return NULL; } @@ -1144,10 +1158,10 @@ print_circular_bug_header(struct lock_list *entry, unsigned int depth, return 0; printk("\n"); - printk("======================================================\n"); - printk("[ INFO: possible circular locking dependency detected ]\n"); + pr_warn("======================================================\n"); + pr_warn("WARNING: possible circular locking dependency detected\n"); print_kernel_ident(); - printk("-------------------------------------------------------\n"); + pr_warn("------------------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(check_src); @@ -1482,11 +1496,11 @@ print_bad_irq_dependency(struct task_struct *curr, return 0; printk("\n"); - printk("======================================================\n"); - printk("[ INFO: %s-safe -> %s-unsafe lock order detected ]\n", + pr_warn("=====================================================\n"); + pr_warn("WARNING: %s-safe -> %s-unsafe lock order detected\n", irqclass, irqclass); print_kernel_ident(); - printk("------------------------------------------------------\n"); + pr_warn("-----------------------------------------------------\n"); printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n", curr->comm, task_pid_nr(curr), curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT, @@ -1711,10 +1725,10 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev, return 0; printk("\n"); - printk("=============================================\n"); - printk("[ INFO: possible recursive locking detected ]\n"); + pr_warn("============================================\n"); + pr_warn("WARNING: possible recursive locking detected\n"); print_kernel_ident(); - printk("---------------------------------------------\n"); + pr_warn("--------------------------------------------\n"); printk("%s/%d is trying to acquire lock:\n", curr->comm, task_pid_nr(curr)); print_lock(next); @@ -2061,10 +2075,10 @@ static void print_collision(struct task_struct *curr, struct lock_chain *chain) { printk("\n"); - printk("======================\n"); - printk("[chain_key collision ]\n"); + pr_warn("============================\n"); + pr_warn("WARNING: chain_key collision\n"); print_kernel_ident(); - printk("----------------------\n"); + pr_warn("----------------------------\n"); printk("%s/%d: ", current->comm, task_pid_nr(current)); printk("Hash chain already cached but the contents don't match!\n"); @@ -2360,10 +2374,10 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this, return 0; printk("\n"); - printk("=================================\n"); - printk("[ INFO: inconsistent lock state ]\n"); + pr_warn("================================\n"); + pr_warn("WARNING: inconsistent lock state\n"); print_kernel_ident(); - printk("---------------------------------\n"); + pr_warn("--------------------------------\n"); printk("inconsistent {%s} -> {%s} usage.\n", usage_str[prev_bit], usage_str[new_bit]); @@ -2425,10 +2439,10 @@ print_irq_inversion_bug(struct task_struct *curr, return 0; printk("\n"); - printk("=========================================================\n"); - printk("[ INFO: possible irq lock inversion dependency detected ]\n"); + 
pr_warn("========================================================\n"); + pr_warn("WARNING: possible irq lock inversion dependency detected\n"); print_kernel_ident(); - printk("---------------------------------------------------------\n"); + pr_warn("--------------------------------------------------------\n"); printk("%s/%d just changed the state of lock:\n", curr->comm, task_pid_nr(curr)); print_lock(this); @@ -2863,6 +2877,8 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (unlikely(!debug_locks)) return; + gfp_mask = current_gfp_context(gfp_mask); + /* no reclaim without waiting on it */ if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; @@ -2872,7 +2888,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* We're only interested __GFP_FS allocations for now */ - if (!(gfp_mask & __GFP_FS)) + if (!(gfp_mask & __GFP_FS) || (curr->flags & PF_MEMALLOC_NOFS)) return; /* @@ -2881,6 +2897,10 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) if (DEBUG_LOCKS_WARN_ON(irqs_disabled_flags(flags))) return; + /* Disable lockdep if explicitly requested */ + if (gfp_mask & __GFP_NOLOCKDEP) + return; + mark_held_locks(curr, RECLAIM_FS); } @@ -3170,10 +3190,10 @@ print_lock_nested_lock_not_held(struct task_struct *curr, return 0; printk("\n"); - printk("==================================\n"); - printk("[ BUG: Nested lock was not taken ]\n"); + pr_warn("==================================\n"); + pr_warn("WARNING: Nested lock was not taken\n"); print_kernel_ident(); - printk("----------------------------------\n"); + pr_warn("----------------------------------\n"); printk("%s/%d is trying to lock:\n", curr->comm, task_pid_nr(curr)); print_lock(hlock); @@ -3383,10 +3403,10 @@ print_unlock_imbalance_bug(struct task_struct *curr, struct lockdep_map *lock, return 0; printk("\n"); - printk("=====================================\n"); - printk("[ BUG: bad unlock balance detected! ]\n"); + pr_warn("=====================================\n"); + pr_warn("WARNING: bad unlock balance detected!\n"); print_kernel_ident(); - printk("-------------------------------------\n"); + pr_warn("-------------------------------------\n"); printk("%s/%d is trying to release lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -3419,7 +3439,7 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) * Clearly if the lock hasn't been acquired _ever_, we're not * holding it either, so report failure. 
*/ - if (!class) + if (IS_ERR_OR_NULL(class)) return 0; /* @@ -3437,13 +3457,67 @@ static int match_held_lock(struct held_lock *hlock, struct lockdep_map *lock) return 0; } +/* @depth must not be zero */ +static struct held_lock *find_held_lock(struct task_struct *curr, + struct lockdep_map *lock, + unsigned int depth, int *idx) +{ + struct held_lock *ret, *hlock, *prev_hlock; + int i; + + i = depth - 1; + hlock = curr->held_locks + i; + ret = hlock; + if (match_held_lock(hlock, lock)) + goto out; + + ret = NULL; + for (i--, prev_hlock = hlock--; + i >= 0; + i--, prev_hlock = hlock--) { + /* + * We must not cross into another context: + */ + if (prev_hlock->irq_context != hlock->irq_context) { + ret = NULL; + break; + } + if (match_held_lock(hlock, lock)) { + ret = hlock; + break; + } + } + +out: + *idx = i; + return ret; +} + +static int reacquire_held_locks(struct task_struct *curr, unsigned int depth, + int idx) +{ + struct held_lock *hlock; + + for (hlock = curr->held_locks + idx; idx < depth; idx++, hlock++) { + if (!__lock_acquire(hlock->instance, + hlock_class(hlock)->subclass, + hlock->trylock, + hlock->read, hlock->check, + hlock->hardirqs_off, + hlock->nest_lock, hlock->acquire_ip, + hlock->references, hlock->pin_count)) + return 1; + } + return 0; +} + static int __lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; struct lock_class *class; unsigned int depth; int i; @@ -3456,21 +3530,10 @@ __lock_set_class(struct lockdep_map *lock, const char *name, if (DEBUG_LOCKS_WARN_ON(!depth)) return 0; - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_imbalance_bug(curr, lock, ip); + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) + return print_unlock_imbalance_bug(curr, lock, ip); -found_it: lockdep_init_map(lock, name, key, 0); class = register_lock_class(lock, subclass, 0); hlock->class_idx = class - lock_classes + 1; @@ -3478,15 +3541,46 @@ __lock_set_class(struct lockdep_map *lock, const char *name, curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - for (; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) - return 0; - } + if (reacquire_held_locks(curr, depth, i)) + return 0; + + /* + * I took it apart and put it back together again, except now I have + * these 'spare' parts.. where shall I put them. + */ + if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth)) + return 0; + return 1; +} + +static int __lock_downgrade(struct lockdep_map *lock, unsigned long ip) +{ + struct task_struct *curr = current; + struct held_lock *hlock; + unsigned int depth; + int i; + + depth = curr->lockdep_depth; + /* + * This function is about (re)setting the class of a held lock, + * yet we're not actually holding any locks. Naughty user! 
+ */ + if (DEBUG_LOCKS_WARN_ON(!depth)) + return 0; + + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) + return print_unlock_imbalance_bug(curr, lock, ip); + + curr->lockdep_depth = i; + curr->curr_chain_key = hlock->prev_chain_key; + + WARN(hlock->read, "downgrading a read lock"); + hlock->read = 1; + hlock->acquire_ip = ip; + + if (reacquire_held_locks(curr, depth, i)) + return 0; /* * I took it apart and put it back together again, except now I have @@ -3508,7 +3602,7 @@ static int __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; unsigned int depth; int i; @@ -3527,21 +3621,10 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) * Check whether the lock exists in the current stack * of held locks: */ - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; - } - return print_unlock_imbalance_bug(curr, lock, ip); + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) + return print_unlock_imbalance_bug(curr, lock, ip); -found_it: if (hlock->instance == lock) lock_release_holdtime(hlock); @@ -3568,15 +3651,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip) curr->lockdep_depth = i; curr->curr_chain_key = hlock->prev_chain_key; - for (i++; i < depth; i++) { - hlock = curr->held_locks + i; - if (!__lock_acquire(hlock->instance, - hlock_class(hlock)->subclass, hlock->trylock, - hlock->read, hlock->check, hlock->hardirqs_off, - hlock->nest_lock, hlock->acquire_ip, - hlock->references, hlock->pin_count)) - return 0; - } + if (reacquire_held_locks(curr, depth, i + 1)) + return 0; /* * We had N bottles of beer on the wall, we drank one, but now @@ -3741,6 +3817,23 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); +void lock_downgrade(struct lockdep_map *lock, unsigned long ip) +{ + unsigned long flags; + + if (unlikely(current->lockdep_recursion)) + return; + + raw_local_irq_save(flags); + current->lockdep_recursion = 1; + check_flags(flags); + if (__lock_downgrade(lock, ip)) + check_chain_key(current); + current->lockdep_recursion = 0; + raw_local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(lock_downgrade); + /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -3861,7 +3954,7 @@ EXPORT_SYMBOL_GPL(lock_unpin_lock); void lockdep_set_current_reclaim_state(gfp_t gfp_mask) { - current->lockdep_reclaim_gfp = gfp_mask; + current->lockdep_reclaim_gfp = current_gfp_context(gfp_mask); } EXPORT_SYMBOL_GPL(lockdep_set_current_reclaim_state); @@ -3882,10 +3975,10 @@ print_lock_contention_bug(struct task_struct *curr, struct lockdep_map *lock, return 0; printk("\n"); - printk("=================================\n"); - printk("[ BUG: bad contention detected! 
]\n"); + pr_warn("=================================\n"); + pr_warn("WARNING: bad contention detected!\n"); print_kernel_ident(); - printk("---------------------------------\n"); + pr_warn("---------------------------------\n"); printk("%s/%d is trying to contend lock (", curr->comm, task_pid_nr(curr)); print_lockdep_cache(lock); @@ -3905,7 +3998,7 @@ static void __lock_contended(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; struct lock_class_stats *stats; unsigned int depth; int i, contention_point, contending_point; @@ -3918,22 +4011,12 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) { + print_lock_contention_bug(curr, lock, ip); + return; } - print_lock_contention_bug(curr, lock, ip); - return; -found_it: if (hlock->instance != lock) return; @@ -3957,7 +4040,7 @@ static void __lock_acquired(struct lockdep_map *lock, unsigned long ip) { struct task_struct *curr = current; - struct held_lock *hlock, *prev_hlock; + struct held_lock *hlock; struct lock_class_stats *stats; unsigned int depth; u64 now, waittime = 0; @@ -3971,22 +4054,12 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip) if (DEBUG_LOCKS_WARN_ON(!depth)) return; - prev_hlock = NULL; - for (i = depth-1; i >= 0; i--) { - hlock = curr->held_locks + i; - /* - * We must not cross into another context: - */ - if (prev_hlock && prev_hlock->irq_context != hlock->irq_context) - break; - if (match_held_lock(hlock, lock)) - goto found_it; - prev_hlock = hlock; + hlock = find_held_lock(curr, lock, depth, &i); + if (!hlock) { + print_lock_contention_bug(curr, lock, _RET_IP_); + return; } - print_lock_contention_bug(curr, lock, _RET_IP_); - return; -found_it: if (hlock->instance != lock) return; @@ -4174,7 +4247,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) * If the class exists we look it up and zap it: */ class = look_up_lock_class(lock, j); - if (class) + if (!IS_ERR_OR_NULL(class)) zap_class(class); } /* @@ -4246,10 +4319,10 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, return; printk("\n"); - printk("=========================\n"); - printk("[ BUG: held lock freed! ]\n"); + pr_warn("=========================\n"); + pr_warn("WARNING: held lock freed!\n"); print_kernel_ident(); - printk("-------------------------\n"); + pr_warn("-------------------------\n"); printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", curr->comm, task_pid_nr(curr), mem_from, mem_to-1); print_lock(hlock); @@ -4304,11 +4377,11 @@ static void print_held_locks_bug(void) return; printk("\n"); - printk("=====================================\n"); - printk("[ BUG: %s/%d still has locks held! 
]\n", + pr_warn("====================================\n"); + pr_warn("WARNING: %s/%d still has locks held!\n", current->comm, task_pid_nr(current)); print_kernel_ident(); - printk("-------------------------------------\n"); + pr_warn("------------------------------------\n"); lockdep_print_held_locks(current); printk("\nstack backtrace:\n"); dump_stack(); @@ -4373,7 +4446,7 @@ void debug_show_all_locks(void) } while_each_thread(g, p); printk("\n"); - printk("=============================================\n\n"); + pr_warn("=============================================\n\n"); if (unlock) read_unlock(&tasklist_lock); @@ -4403,10 +4476,10 @@ asmlinkage __visible void lockdep_sys_exit(void) if (!debug_locks_off()) return; printk("\n"); - printk("================================================\n"); - printk("[ BUG: lock held when returning to user space! ]\n"); + pr_warn("================================================\n"); + pr_warn("WARNING: lock held when returning to user space!\n"); print_kernel_ident(); - printk("------------------------------------------------\n"); + pr_warn("------------------------------------------------\n"); printk("%s/%d is leaving the kernel with locks still held!\n", curr->comm, curr->pid); lockdep_print_held_locks(curr); @@ -4423,13 +4496,13 @@ void lockdep_rcu_suspicious(const char *file, const int line, const char *s) #endif /* #ifdef CONFIG_PROVE_RCU_REPEATEDLY */ /* Note: the following can be executed concurrently, so be careful. */ printk("\n"); - pr_err("===============================\n"); - pr_err("[ ERR: suspicious RCU usage. ]\n"); + pr_warn("=============================\n"); + pr_warn("WARNING: suspicious RCU usage\n"); print_kernel_ident(); - pr_err("-------------------------------\n"); - pr_err("%s:%d %s!\n", file, line, s); - pr_err("\nother info that might help us debug this:\n\n"); - pr_err("\n%srcu_scheduler_active = %d, debug_locks = %d\n", + pr_warn("-----------------------------\n"); + printk("%s:%d %s!\n", file, line, s); + printk("\nother info that might help us debug this:\n\n"); + printk("\n%srcu_scheduler_active = %d, debug_locks = %d\n", !rcu_lockdep_current_cpu_online() ? "RCU used illegally from offline CPU!\n" : !rcu_is_watching() diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h index c2b88490d857583026a35090b62f7891446b7ba2..c08fbd2f5ba9fa2a806f326a3a85f5d021d74027 100644 --- a/kernel/locking/lockdep_internals.h +++ b/kernel/locking/lockdep_internals.h @@ -46,13 +46,13 @@ enum { (LOCKF_USED_IN_HARDIRQ_READ | LOCKF_USED_IN_SOFTIRQ_READ) /* - * CONFIG_PROVE_LOCKING_SMALL is defined for sparc. Sparc requires .text, + * CONFIG_LOCKDEP_SMALL is defined for sparc. Sparc requires .text, * .data and .bss to fit in required 32MB limit for the kernel. With - * PROVE_LOCKING we could go over this limit and cause system boot-up problems. + * CONFIG_LOCKDEP we could go over this limit and cause system boot-up problems. * So, reduce the static allocations for lockdeps related structures so that * everything fits in current required size limit. */ -#ifdef CONFIG_PROVE_LOCKING_SMALL +#ifdef CONFIG_LOCKDEP_SMALL /* * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies * we track. 
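The lockdep hunks above replace the same open-coded backwards scan of the per-task held-lock stack in __lock_set_class(), __lock_release(), __lock_contended() and __lock_acquired() with a single find_held_lock() helper (also used by the new __lock_downgrade()). As a rough illustration of the scan the helper factors out, here is a minimal user-space sketch; struct lockdep_map, struct held_lock and match_held_lock() below are simplified stand-ins for the kernel's definitions, not the real ones:

#include <stdio.h>

/* Simplified stand-ins for the kernel's types; illustrative only. */
struct lockdep_map { const char *name; };

struct held_lock {
	const struct lockdep_map *instance;
	int irq_context;	/* 0 = process context, 1 = interrupt */
};

static int match_held_lock(const struct held_lock *hlock,
			   const struct lockdep_map *lock)
{
	return hlock->instance == lock;
}

/*
 * Same shape as the kernel helper: check the top of the stack first,
 * then walk downwards, refusing to cross an irq-context boundary.
 * On a match, *idx is the depth at which the lock was found; the
 * kernel callers feed that index to reacquire_held_locks().
 */
static struct held_lock *find_held_lock(struct held_lock *held_locks,
					unsigned int depth,
					const struct lockdep_map *lock,
					int *idx)
{
	struct held_lock *hlock, *prev_hlock;
	int i = depth - 1;

	hlock = &held_locks[i];
	if (match_held_lock(hlock, lock))
		goto out;

	for (i--, prev_hlock = hlock--; i >= 0; i--, prev_hlock = hlock--) {
		/* We must not cross into another context: */
		if (prev_hlock->irq_context != hlock->irq_context)
			return NULL;
		if (match_held_lock(hlock, lock))
			goto out;
	}
	return NULL;
out:
	*idx = i;
	return hlock;
}

int main(void)
{
	struct lockdep_map a = { "a" }, b = { "b" }, c = { "c" };
	/* "a" taken in process context, then an interrupt took "b" and "c". */
	struct held_lock stack[] = { { &a, 0 }, { &b, 1 }, { &c, 1 } };
	int idx;

	if (find_held_lock(stack, 3, &b, &idx))
		printf("found %s at depth %d\n", b.name, idx);
	if (!find_held_lock(stack, 3, &a, &idx))
		printf("%s unreachable: scan stopped at the irq boundary\n", a.name);
	return 0;
}

Keeping the top-of-stack check as a fast path matches the common case of releasing or re-classing the most recently acquired lock; everything above the found entry is then re-acquired, which is what reacquire_held_locks() does in the patch.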
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 97ee9df32e0f0395dc79dc0236557bd1103047c5..58e366ad36f4e9bf87bf487966a91130feb4f7d7 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -102,10 +102,11 @@ void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) return; } - printk("\n============================================\n"); - printk( "[ BUG: circular locking deadlock detected! ]\n"); - printk("%s\n", print_tainted()); - printk( "--------------------------------------------\n"); + pr_warn("\n"); + pr_warn("============================================\n"); + pr_warn("WARNING: circular locking deadlock detected!\n"); + pr_warn("%s\n", print_tainted()); + pr_warn("--------------------------------------------\n"); printk("%s/%d is deadlocking current task %s/%d\n\n", task->comm, task_pid_nr(task), current->comm, task_pid_nr(current)); @@ -174,12 +175,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) lock->name = name; } -void -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) -{ -} - -void rt_mutex_deadlock_account_unlock(struct task_struct *task) -{ -} - diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index d0519c3432b678b8c5de116b939cde12fa6db152..b585af9a1b508ea28007a05f4804606c336f14bd 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -9,9 +9,6 @@ * This file contains macros used solely by rtmutex.c. Debug version. */ -extern void -rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); -extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6edc32ecd9c54446e0bfbe692c407afa07433e6c..b9550941690915bc23323cd772e060cda7d3a704 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -224,6 +224,12 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, } #endif +/* + * Only use with rt_mutex_waiter_{less,equal}() + */ +#define task_to_waiter(p) \ + &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } + static inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, struct rt_mutex_waiter *right) @@ -238,12 +244,30 @@ rt_mutex_waiter_less(struct rt_mutex_waiter *left, * then right waiter has a dl_prio() too. */ if (dl_prio(left->prio)) - return dl_time_before(left->task->dl.deadline, - right->task->dl.deadline); + return dl_time_before(left->deadline, right->deadline); return 0; } +static inline int +rt_mutex_waiter_equal(struct rt_mutex_waiter *left, + struct rt_mutex_waiter *right) +{ + if (left->prio != right->prio) + return 0; + + /* + * If both waiters have dl_prio(), we check the deadlines of the + * associated tasks. + * If left waiter has a dl_prio(), and we didn't return 0 above, + * then right waiter has a dl_prio() too. 
+ */ + if (dl_prio(left->prio)) + return left->deadline == right->deadline; + + return 1; +} + static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { @@ -322,72 +346,16 @@ rt_mutex_dequeue_pi(struct task_struct *task, struct rt_mutex_waiter *waiter) RB_CLEAR_NODE(&waiter->pi_tree_entry); } -/* - * Calculate task priority from the waiter tree priority - * - * Return task->normal_prio when the waiter tree is empty or when - * the waiter is not allowed to do priority boosting - */ -int rt_mutex_getprio(struct task_struct *task) -{ - if (likely(!task_has_pi_waiters(task))) - return task->normal_prio; - - return min(task_top_pi_waiter(task)->prio, - task->normal_prio); -} - -struct task_struct *rt_mutex_get_top_task(struct task_struct *task) +static void rt_mutex_adjust_prio(struct task_struct *p) { - if (likely(!task_has_pi_waiters(task))) - return NULL; - - return task_top_pi_waiter(task)->task; -} + struct task_struct *pi_task = NULL; -/* - * Called by sched_setscheduler() to get the priority which will be - * effective after the change. - */ -int rt_mutex_get_effective_prio(struct task_struct *task, int newprio) -{ - if (!task_has_pi_waiters(task)) - return newprio; + lockdep_assert_held(&p->pi_lock); - if (task_top_pi_waiter(task)->task->prio <= newprio) - return task_top_pi_waiter(task)->task->prio; - return newprio; -} + if (task_has_pi_waiters(p)) + pi_task = task_top_pi_waiter(p)->task; -/* - * Adjust the priority of a task, after its pi_waiters got modified. - * - * This can be both boosting and unboosting. task->pi_lock must be held. - */ -static void __rt_mutex_adjust_prio(struct task_struct *task) -{ - int prio = rt_mutex_getprio(task); - - if (task->prio != prio || dl_prio(prio)) - rt_mutex_setprio(task, prio); -} - -/* - * Adjust task priority (undo boosting). Called from the exit path of - * rt_mutex_slowunlock() and rt_mutex_slowlock(). - * - * (Note: We do this outside of the protection of lock->wait_lock to - * allow the lock to be taken while or before we readjust the priority - * of task. We do not use the spin_xx_mutex() variants here as we are - * outside of the debug path.) - */ -void rt_mutex_adjust_prio(struct task_struct *task) -{ - unsigned long flags; - - raw_spin_lock_irqsave(&task->pi_lock, flags); - __rt_mutex_adjust_prio(task); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + rt_mutex_setprio(p, pi_task); } /* @@ -610,7 +578,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * enabled we continue, but stop the requeueing in the chain * walk. */ - if (waiter->prio == task->prio) { + if (rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { if (!detect_deadlock) goto out_unlock_pi; else @@ -706,7 +674,26 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* [7] Requeue the waiter in the lock waiter tree. */ rt_mutex_dequeue(lock, waiter); + + /* + * Update the waiter prio fields now that we're dequeued. + * + * These values can have changed through either: + * + * sys_sched_set_scheduler() / sys_sched_setattr() + * + * or + * + * DL CBS enforcement advancing the effective deadline. + * + * Even though pi_waiters also uses these fields, and that tree is only + * updated in [11], we can do this here, since we hold [L], which + * serializes all pi_waiters access and rb_erase() does not care about + * the values of the node being removed. 
+ */ waiter->prio = task->prio; + waiter->deadline = task->dl.deadline; + rt_mutex_enqueue(lock, waiter); /* [8] Release the task */ @@ -747,7 +734,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ rt_mutex_dequeue_pi(task, prerequeue_top_waiter); rt_mutex_enqueue_pi(task, waiter); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); } else if (prerequeue_top_waiter == waiter) { /* @@ -763,7 +750,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, rt_mutex_dequeue_pi(task, waiter); waiter = rt_mutex_top_waiter(lock); rt_mutex_enqueue_pi(task, waiter); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); } else { /* * Nothing changed. No need to do any priority @@ -833,6 +820,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) { + lockdep_assert_held(&lock->wait_lock); + /* * Before testing whether we can acquire @lock, we set the * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all @@ -892,7 +881,8 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * the top waiter priority (kernel view), * @task lost. */ - if (task->prio >= rt_mutex_top_waiter(lock)->prio) + if (!rt_mutex_waiter_less(task_to_waiter(task), + rt_mutex_top_waiter(lock))) return 0; /* @@ -938,8 +928,6 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ rt_mutex_set_owner(lock, task); - rt_mutex_deadlock_account_lock(lock, task); - return 1; } @@ -960,6 +948,8 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex *next_lock; int chain_walk = 0, res; + lockdep_assert_held(&lock->wait_lock); + /* * Early deadlock detection. We really don't want the task to * enqueue on itself just to untangle the mess later. It's not @@ -973,10 +963,11 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, return -EDEADLK; raw_spin_lock(&task->pi_lock); - __rt_mutex_adjust_prio(task); + rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; + waiter->deadline = task->dl.deadline; /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) @@ -995,7 +986,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); - __rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(owner); if (owner->pi_blocked_on) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { @@ -1047,12 +1038,14 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, waiter = rt_mutex_top_waiter(lock); /* - * Remove it from current->pi_waiters. We do not adjust a - * possible priority boost right now. We execute wakeup in the - * boosted mode and go back to normal after releasing - * lock->wait_lock. + * Remove it from current->pi_waiters and deboost. + * + * We must in fact deboost here in order to ensure we call + * rt_mutex_setprio() to update p->pi_top_task before the + * task unblocks. 
*/ rt_mutex_dequeue_pi(current, waiter); + rt_mutex_adjust_prio(current); /* * As we are waking up the top waiter, and the waiter stays @@ -1064,9 +1057,19 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock(¤t->pi_lock); - + /* + * We deboosted before waking the top waiter task such that we don't + * run two tasks with the 'same' priority (and ensure the + * p->pi_top_task pointer points to a blocked task). This however can + * lead to priority inversion if we would get preempted after the + * deboost but before waking our donor task, hence the preempt_disable() + * before unlock. + * + * Pairs with preempt_enable() in rt_mutex_postunlock(); + */ + preempt_disable(); wake_q_add(wake_q, waiter->task); + raw_spin_unlock(¤t->pi_lock); } /* @@ -1082,6 +1085,8 @@ static void remove_waiter(struct rt_mutex *lock, struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex *next_lock; + lockdep_assert_held(&lock->wait_lock); + raw_spin_lock(¤t->pi_lock); rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; @@ -1101,7 +1106,7 @@ static void remove_waiter(struct rt_mutex *lock, if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(owner, rt_mutex_top_waiter(lock)); - __rt_mutex_adjust_prio(owner); + rt_mutex_adjust_prio(owner); /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); @@ -1140,8 +1145,7 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || (waiter->prio == task->prio && - !dl_prio(task->prio))) { + if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } @@ -1155,6 +1159,14 @@ void rt_mutex_adjust_pi(struct task_struct *task) next_lock, NULL, task); } +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +{ + debug_rt_mutex_init_waiter(waiter); + RB_CLEAR_NODE(&waiter->pi_tree_entry); + RB_CLEAR_NODE(&waiter->tree_entry); + waiter->task = NULL; +} + /** * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take @@ -1237,9 +1249,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, unsigned long flags; int ret = 0; - debug_rt_mutex_init_waiter(&waiter); - RB_CLEAR_NODE(&waiter.pi_tree_entry); - RB_CLEAR_NODE(&waiter.tree_entry); + rt_mutex_init_waiter(&waiter); /* * Technically we could use raw_spin_[un]lock_irq() here, but this can @@ -1330,7 +1340,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) /* * Slow path to release a rt-mutex. - * Return whether the current task needs to undo a potential priority boosting. + * + * Return whether the current task needs to call rt_mutex_postunlock(). */ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, struct wake_q_head *wake_q) @@ -1342,8 +1353,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, debug_rt_mutex_unlock(lock); - rt_mutex_deadlock_account_unlock(current); - /* * We must be careful here if the fast path is enabled. If we * have no waiters queued we cannot set owner to NULL here @@ -1390,11 +1399,9 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, * Queue the next waiter for wakeup once we release the wait_lock. 
*/ mark_wakeup_next_waiter(wake_q, lock);
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- /* check PI boosting */
- return true;
+ return true; /* call rt_mutex_postunlock() */
 }
 /*
@@ -1409,11 +1416,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
 struct hrtimer_sleeper *timeout,
 enum rtmutex_chainwalk chwalk))
 {
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
 return 0;
- } else
- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+
+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
 }
 static inline int
@@ -1425,24 +1431,33 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
 enum rtmutex_chainwalk chwalk))
 {
 if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
 return 0;
- } else
- return slowfn(lock, state, timeout, chwalk);
+
+ return slowfn(lock, state, timeout, chwalk);
 }
 static inline int
 rt_mutex_fasttrylock(struct rt_mutex *lock,
 int (*slowfn)(struct rt_mutex *lock))
 {
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
 return 1;
- }
+
 return slowfn(lock);
 }
+/*
+ * Performs the wakeup of the top-waiter and re-enables preemption.
+ */
+void rt_mutex_postunlock(struct wake_q_head *wake_q)
+{
+ wake_up_q(wake_q);
+
+ /* Pairs with preempt_disable() in rt_mutex_slowunlock() */
+ preempt_enable();
+}
+
 static inline void
 rt_mutex_fastunlock(struct rt_mutex *lock,
 bool (*slowfn)(struct rt_mutex *lock,
@@ -1450,18 +1465,11 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
 {
 DEFINE_WAKE_Q(wake_q);
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
-
- } else {
- bool deboost = slowfn(lock, &wake_q);
-
- wake_up_q(&wake_q);
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
- /* Undo pi boosting if necessary: */
- if (deboost)
- rt_mutex_adjust_prio(current);
- }
+ if (slowfn(lock, &wake_q))
+ rt_mutex_postunlock(&wake_q);
 }
 /**
@@ -1495,16 +1503,11 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
 /*
- * Futex variant with full deadlock detection.
+ * Futex variant, must not use fastpath.
 */
-int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *timeout)
+int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
 {
- might_sleep();
-
- return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- RT_MUTEX_FULL_CHAINWALK,
- rt_mutex_slowlock);
+ return rt_mutex_slowtrylock(lock);
 }
 /**
@@ -1563,20 +1566,43 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
 /**
- * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
- * @lock: the rt_mutex to be unlocked
- *
- * Returns: true/false indicating whether priority adjustment is
- * required or not.
+ * Futex variant: since futex variants do not use the fast-path, this can be
+ * simple and will not need to retry.
*/ -bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wqh) +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wake_q) { - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { - rt_mutex_deadlock_account_unlock(current); - return false; + lockdep_assert_held(&lock->wait_lock); + + debug_rt_mutex_unlock(lock); + + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + return false; /* done */ } - return rt_mutex_slowunlock(lock, wqh); + + /* + * We've already deboosted, mark_wakeup_next_waiter() will + * retain preempt_disabled when we drop the wait_lock, to + * avoid inversion prior to the wakeup. preempt_disable() + * therein pairs with rt_mutex_postunlock(). + */ + mark_wakeup_next_waiter(wake_q, lock); + + return true; /* call postunlock() */ +} + +void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) +{ + DEFINE_WAKE_Q(wake_q); + bool postunlock; + + raw_spin_lock_irq(&lock->wait_lock); + postunlock = __rt_mutex_futex_unlock(lock, &wake_q); + raw_spin_unlock_irq(&lock->wait_lock); + + if (postunlock) + rt_mutex_postunlock(&wake_q); } /** @@ -1637,7 +1663,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, __rt_mutex_init(lock, NULL); debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); - rt_mutex_deadlock_account_lock(lock, proxy_owner); } /** @@ -1657,34 +1682,16 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, { debug_rt_mutex_proxy_unlock(lock); rt_mutex_set_owner(lock, NULL); - rt_mutex_deadlock_account_unlock(proxy_owner); } -/** - * rt_mutex_start_proxy_lock() - Start lock acquisition for another task - * @lock: the rt_mutex to take - * @waiter: the pre-initialized rt_mutex_waiter - * @task: the task to prepare - * - * Returns: - * 0 - task blocked on lock - * 1 - acquired the lock for task, caller should wake it up - * <0 - error - * - * Special API call for FUTEX_REQUEUE_PI support. - */ -int rt_mutex_start_proxy_lock(struct rt_mutex *lock, +int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task) { int ret; - raw_spin_lock_irq(&lock->wait_lock); - - if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock_irq(&lock->wait_lock); + if (try_to_take_rt_mutex(lock, task, NULL)) return 1; - } /* We enforce deadlock detection for futexes */ ret = task_blocks_on_rt_mutex(lock, waiter, task, @@ -1703,13 +1710,37 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (unlikely(ret)) remove_waiter(lock, waiter); - raw_spin_unlock_irq(&lock->wait_lock); - debug_rt_mutex_print_deadlock(waiter); return ret; } +/** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for FUTEX_REQUEUE_PI support. 
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
 /**
 * rt_mutex_next_owner - return the next owner of the lock
 *
@@ -1731,21 +1762,23 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
 }
 /**
- * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
 * @lock: the rt_mutex we were woken on
 * @to: the timeout, null if none. hrtimer should already have
 * been started.
 * @waiter: the pre-initialized rt_mutex_waiter
 *
- * Complete the lock acquisition started our behalf by another thread.
+ * Wait for the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
 *
 * Returns:
 * 0 - success
 * <0 - error, one of -EINTR, -ETIMEDOUT
 *
- * Special API call for PI-futex requeue support
+ * Special API call for PI-futex support
 */
-int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
 struct hrtimer_sleeper *to,
 struct rt_mutex_waiter *waiter)
 {
@@ -1758,8 +1791,45 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
 /* sleep on the mutex */
 ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
- if (unlikely(ret))
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock, we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - did the cleanup, we are done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregard its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Unless we're the owner, we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
 remove_waiter(lock, waiter);
+ fixup_rt_mutex_waiters(lock);
+ cleanup = true;
+ }
 /*
 * try_to_take_rt_mutex() sets the waiter bit unconditionally.
We might @@ -1769,5 +1839,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, raw_spin_unlock_irq(&lock->wait_lock); - return ret; + return cleanup; } diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index c4060584c40769a3174aebcf7cc7ff245d7363c3..6607802efa8bd3e5fb93b72b88f613b24c158dc1 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -11,8 +11,6 @@ */ #define rt_mutex_deadlock_check(l) (0) -#define rt_mutex_deadlock_account_lock(m, t) do { } while (0) -#define rt_mutex_deadlock_account_unlock(l) do { } while (0) #define debug_rt_mutex_init_waiter(w) do { } while (0) #define debug_rt_mutex_free_waiter(w) do { } while (0) #define debug_rt_mutex_lock(l) do { } while (0) diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index 856dfff5c33ab5a24fd0464032daf1bf8b81e351..72ad45a9a79439c53f73afee4ab87330957a0ff2 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -34,6 +34,7 @@ struct rt_mutex_waiter { struct rt_mutex *deadlock_lock; #endif int prio; + u64 deadline; }; /* @@ -103,16 +104,26 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); +extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); -extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *to, - struct rt_mutex_waiter *waiter); -extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); -extern bool rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wqh); -extern void rt_mutex_adjust_prio(struct task_struct *task); +extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter); +extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter); + +extern int rt_mutex_futex_trylock(struct rt_mutex *l); + +extern void rt_mutex_futex_unlock(struct rt_mutex *lock); +extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wqh); + +extern void rt_mutex_postunlock(struct wake_q_head *wake_q); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 90a74ccd85a4b979295828bddee80b6dbddb62a9..4d48b1c4870dda0b33c7f93e1aed01fe8b6ceb61 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -124,10 +124,8 @@ EXPORT_SYMBOL(up_write); */ void downgrade_write(struct rw_semaphore *sem) { - /* - * lockdep: a downgraded write will live on as a write - * dependency. 
- */ + lock_downgrade(&sem->dep_map, _RET_IP_); + rwsem_set_reader_owned(sem); __downgrade_write(sem); } diff --git a/kernel/locking/test-ww_mutex.c b/kernel/locking/test-ww_mutex.c index 6b7abb334ca6027dd2b189a211671fa98a33be82..39f56c870051b505611746ff0033cc6a7f6e1a10 100644 --- a/kernel/locking/test-ww_mutex.c +++ b/kernel/locking/test-ww_mutex.c @@ -353,8 +353,8 @@ static int test_cycle(unsigned int ncpus) struct stress { struct work_struct work; struct ww_mutex *locks; + unsigned long timeout; int nlocks; - int nloops; }; static int *get_random_order(int count) @@ -398,12 +398,11 @@ static void stress_inorder_work(struct work_struct *work) if (!order) return; - ww_acquire_init(&ctx, &ww_class); - do { int contended = -1; int n, err; + ww_acquire_init(&ctx, &ww_class); retry: err = 0; for (n = 0; n < nlocks; n++) { @@ -433,9 +432,9 @@ static void stress_inorder_work(struct work_struct *work) __func__, err); break; } - } while (--stress->nloops); - ww_acquire_fini(&ctx); + ww_acquire_fini(&ctx); + } while (!time_after(jiffies, stress->timeout)); kfree(order); kfree(stress); @@ -470,9 +469,9 @@ static void stress_reorder_work(struct work_struct *work) kfree(order); order = NULL; - ww_acquire_init(&ctx, &ww_class); - do { + ww_acquire_init(&ctx, &ww_class); + list_for_each_entry(ll, &locks, link) { err = ww_mutex_lock(ll->lock, &ctx); if (!err) @@ -495,9 +494,9 @@ static void stress_reorder_work(struct work_struct *work) dummy_load(stress); list_for_each_entry(ll, &locks, link) ww_mutex_unlock(ll->lock); - } while (--stress->nloops); - ww_acquire_fini(&ctx); + ww_acquire_fini(&ctx); + } while (!time_after(jiffies, stress->timeout)); out: list_for_each_entry_safe(ll, ln, &locks, link) @@ -523,7 +522,7 @@ static void stress_one_work(struct work_struct *work) __func__, err); break; } - } while (--stress->nloops); + } while (!time_after(jiffies, stress->timeout)); kfree(stress); } @@ -533,7 +532,7 @@ static void stress_one_work(struct work_struct *work) #define STRESS_ONE BIT(2) #define STRESS_ALL (STRESS_INORDER | STRESS_REORDER | STRESS_ONE) -static int stress(int nlocks, int nthreads, int nloops, unsigned int flags) +static int stress(int nlocks, int nthreads, unsigned int flags) { struct ww_mutex *locks; int n; @@ -575,7 +574,7 @@ static int stress(int nlocks, int nthreads, int nloops, unsigned int flags) INIT_WORK(&stress->work, fn); stress->locks = locks; stress->nlocks = nlocks; - stress->nloops = nloops; + stress->timeout = jiffies + 2*HZ; queue_work(wq, &stress->work); nthreads--; @@ -619,15 +618,15 @@ static int __init test_ww_mutex_init(void) if (ret) return ret; - ret = stress(16, 2*ncpus, 1<<10, STRESS_INORDER); + ret = stress(16, 2*ncpus, STRESS_INORDER); if (ret) return ret; - ret = stress(16, 2*ncpus, 1<<10, STRESS_REORDER); + ret = stress(16, 2*ncpus, STRESS_REORDER); if (ret) return ret; - ret = stress(4095, hweight32(STRESS_ALL)*ncpus, 1<<12, STRESS_ALL); + ret = stress(4095, hweight32(STRESS_ALL)*ncpus, STRESS_ALL); if (ret) return ret; diff --git a/kernel/memremap.c b/kernel/memremap.c index 07e85e5229da849d33391f97234c1e1fff2c5ce1..23a6483c3666e35fec7f927d2f6cd4a2e38dbe81 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -182,18 +182,6 @@ struct page_map { struct vmem_altmap altmap; }; -void get_zone_device_page(struct page *page) -{ - percpu_ref_get(page->pgmap->ref); -} -EXPORT_SYMBOL(get_zone_device_page); - -void put_zone_device_page(struct page *page) -{ - put_dev_pagemap(page->pgmap); -} -EXPORT_SYMBOL(put_zone_device_page); - static void 
pgmap_radix_release(struct resource *res) { resource_size_t key, align_start, align_size, align_end; @@ -237,6 +225,10 @@ static void devm_memremap_pages_release(struct device *dev, void *data) struct resource *res = &page_map->res; resource_size_t align_start, align_size; struct dev_pagemap *pgmap = &page_map->pgmap; + unsigned long pfn; + + for_each_device_pfn(pfn, page_map) + put_page(pfn_to_page(pfn)); if (percpu_ref_tryget_live(pgmap->ref)) { dev_WARN(dev, "%s: page mapping is still live!\n", __func__); @@ -277,7 +269,10 @@ struct dev_pagemap *find_dev_pagemap(resource_size_t phys) * * Notes: * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time - * (or devm release event). + * (or devm release event). The expected order of events is that @ref has + * been through percpu_ref_kill() before devm_memremap_pages_release(). The + * wait for the completion of all references being dropped and + * percpu_ref_exit() must occur after devm_memremap_pages_release(). * * 2/ @res is expected to be a host memory range that could feasibly be * treated as a "System RAM" range, i.e. not a device mmio range, but @@ -379,6 +374,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, */ list_del(&page->lru); page->pgmap = pgmap; + percpu_ref_get(ref); } devres_add(dev, page_map); return __va(res->start); diff --git a/kernel/module.c b/kernel/module.c index 7eba6dea4f417dbf363731cf2a3cc374ce1d703b..4a3665f8f8372aab68b658c24c30678806823eee 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -49,6 +49,9 @@ #include #include #include +#ifdef CONFIG_STRICT_MODULE_RWX +#include +#endif #include #include #include @@ -665,16 +668,7 @@ static void percpu_modcopy(struct module *mod, memcpy(per_cpu_ptr(mod->percpu, cpu), from, size); } -/** - * is_module_percpu_address - test whether address is from module static percpu - * @addr: address to test - * - * Test whether @addr belongs to module static percpu area. - * - * RETURNS: - * %true if @addr is from module static percpu area - */ -bool is_module_percpu_address(unsigned long addr) +bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr) { struct module *mod; unsigned int cpu; @@ -688,9 +682,15 @@ bool is_module_percpu_address(unsigned long addr) continue; for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(mod->percpu, cpu); - - if ((void *)addr >= start && - (void *)addr < start + mod->percpu_size) { + void *va = (void *)addr; + + if (va >= start && va < start + mod->percpu_size) { + if (can_addr) { + *can_addr = (unsigned long) (va - start); + *can_addr += (unsigned long) + per_cpu_ptr(mod->percpu, + get_boot_cpu_id()); + } preempt_enable(); return true; } @@ -701,6 +701,20 @@ bool is_module_percpu_address(unsigned long addr) return false; } +/** + * is_module_percpu_address - test whether address is from module static percpu + * @addr: address to test + * + * Test whether @addr belongs to module static percpu area. + * + * RETURNS: + * %true if @addr is from module static percpu area + */ +bool is_module_percpu_address(unsigned long addr) +{ + return __is_module_percpu_address(addr, NULL); +} + #else /* ... 
!CONFIG_SMP */
static inline void __percpu *mod_percpu(struct module *mod)
@@ -732,6 +746,11 @@ bool is_module_percpu_address(unsigned long addr)
 return false;
 }
+bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
+{
+ return false;
+}
+
 #endif /* CONFIG_SMP */
 #define MODINFO_ATTR(field) \
@@ -947,6 +966,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 return -EFAULT;
 name[MODULE_NAME_LEN-1] = '\0';
+ audit_log_kern_module(name);
+
 if (mutex_lock_interruptible(&module_mutex) != 0)
 return -EINTR;
@@ -2846,7 +2867,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len,
 /* Suck in entire file: we'll want most of it. */
 info->hdr = __vmalloc(info->len,
- GFP_KERNEL | __GFP_HIGHMEM | __GFP_NOWARN, PAGE_KERNEL);
+ GFP_KERNEL | __GFP_NOWARN, PAGE_KERNEL);
 if (!info->hdr)
 return -ENOMEM;
@@ -4017,7 +4038,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
 /* Don't lock: we're in enough trouble already. */
 preempt_disable();
- if ((colon = strchr(name, ':')) != NULL) {
+ if ((colon = strnchr(name, MODULE_NAME_LEN, ':')) != NULL) {
 if ((mod = find_module_all(name, colon - name, false)) != NULL)
 ret = mod_find_symname(mod, colon+1);
 } else {
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 782102e59eed5b4b5379126b865fb9c795892cdd..f6c5d330059ac7996a1aeb7a4c4fdfad128d847d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -26,6 +26,7 @@ #include #include #include +#include <linux/perf_event.h>
 static struct kmem_cache *nsproxy_cachep;
@@ -262,6 +263,8 @@ SYSCALL_DEFINE2(setns, int, fd, int, nstype)
 goto out;
 }
 switch_task_namespaces(tsk, new_nsproxy);
+
+ perf_event_namespaces(tsk);
 out:
 fput(file);
 return err;
diff --git a/kernel/padata.c b/kernel/padata.c
index 3202aa17492c808af5331044de710a2f34e277a3..ac8f1e524836b37a0d305e412b19d6aeeb50c133 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -154,8 +154,6 @@ EXPORT_SYMBOL(padata_do_parallel);
 * A pointer to the control struct of the next object that needs
 * serialization, if present in one of the percpu reorder queues.
 *
- * NULL, if all percpu reorder queues are empty.
- *
 * -EINPROGRESS, if the next object that needs serialization will
 * be parallel processed by another cpu and is not yet present in
 * the cpu's reorder queue.
@@ -182,8 +180,6 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
 cpu = padata_index_to_cpu(pd, next_index);
 next_queue = per_cpu_ptr(pd->pqueue, cpu);
- padata = NULL;
-
 reorder = &next_queue->reorder;
 spin_lock(&reorder->lock);
@@ -235,12 +231,11 @@ static void padata_reorder(struct parallel_data *pd)
 padata = padata_get_next(pd);
 /*
- * All reorder queues are empty, or the next object that needs
- * serialization is parallel processed by another cpu and is
- * still on it's way to the cpu's reorder queue, nothing to
- * do for now.
+ * If the next object that needs serialization is parallel
+ * processed by another cpu and is still on its way to the
+ * cpu's reorder queue, nothing to do for now.
*/ - if (!padata || PTR_ERR(padata) == -EINPROGRESS) + if (PTR_ERR(padata) == -EINPROGRESS) break; /* @@ -354,7 +349,7 @@ static int padata_setup_cpumasks(struct parallel_data *pd, cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask); if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) { - free_cpumask_var(pd->cpumask.cbcpu); + free_cpumask_var(pd->cpumask.pcpu); return -ENOMEM; } diff --git a/kernel/params.c b/kernel/params.c index a6d6149c0fe60df1ca38d9a66acef281b78ee79d..60b2d8101355fedcfd8dfee99f8a2fa467ea99d9 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -160,58 +160,6 @@ static int parse_one(char *param, return -ENOENT; } -/* You can use " around spaces, but can't escape ". */ -/* Hyphens and underscores equivalent in parameter names. */ -static char *next_arg(char *args, char **param, char **val) -{ - unsigned int i, equals = 0; - int in_quote = 0, quoted = 0; - char *next; - - if (*args == '"') { - args++; - in_quote = 1; - quoted = 1; - } - - for (i = 0; args[i]; i++) { - if (isspace(args[i]) && !in_quote) - break; - if (equals == 0) { - if (args[i] == '=') - equals = i; - } - if (args[i] == '"') - in_quote = !in_quote; - } - - *param = args; - if (!equals) - *val = NULL; - else { - args[equals] = '\0'; - *val = args + equals + 1; - - /* Don't include quotes in value. */ - if (**val == '"') { - (*val)++; - if (args[i-1] == '"') - args[i-1] = '\0'; - } - } - if (quoted && args[i-1] == '"') - args[i-1] = '\0'; - - if (args[i]) { - args[i] = '\0'; - next = args + i + 1; - } else - next = args + i; - - /* Chew up trailing spaces. */ - return skip_spaces(next); -} - /* Args looks like "foo=bar,bar2 baz=fuz wiz". */ char *parse_args(const char *doing, char *args, diff --git a/kernel/pid.c b/kernel/pid.c index 0143ac0ddceb9c79a2c686e6b10b7265b7b5d925..fd1cde1e45760df67a6cc54fa97959061660ab3f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -321,8 +321,10 @@ struct pid *alloc_pid(struct pid_namespace *ns) } if (unlikely(is_child_reaper(pid))) { - if (pid_ns_prepare_proc(ns)) + if (pid_ns_prepare_proc(ns)) { + disable_pid_allocation(ns); goto out_free; + } } get_pid_ns(ns); diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index de461aa0bf9acc933254d34ae2ced925f45eefbc..d1f3e9f558b84058e4f8ec88fdfddc647d2f704e 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -374,6 +374,29 @@ static struct ns_common *pidns_get(struct task_struct *task) return ns ? &ns->ns : NULL; } +static struct ns_common *pidns_for_children_get(struct task_struct *task) +{ + struct pid_namespace *ns = NULL; + + task_lock(task); + if (task->nsproxy) { + ns = task->nsproxy->pid_ns_for_children; + get_pid_ns(ns); + } + task_unlock(task); + + if (ns) { + read_lock(&tasklist_lock); + if (!ns->child_reaper) { + put_pid_ns(ns); + ns = NULL; + } + read_unlock(&tasklist_lock); + } + + return ns ? 
&ns->ns : NULL; +} + static void pidns_put(struct ns_common *ns) { put_pid_ns(to_pid_ns(ns)); @@ -443,6 +466,17 @@ const struct proc_ns_operations pidns_operations = { .get_parent = pidns_get_parent, }; +const struct proc_ns_operations pidns_for_children_operations = { + .name = "pid_for_children", + .real_ns_name = "pid", + .type = CLONE_NEWPID, + .get = pidns_for_children_get, + .put = pidns_put, + .install = pidns_install, + .owner = pidns_owner, + .get_parent = pidns_get_parent, +}; + static __init int pid_namespaces_init(void) { pid_ns_cachep = KMEM_CACHE(pid_namespace, SLAB_PANIC); diff --git a/kernel/power/process.c b/kernel/power/process.c index c7209f060eeb7c8672cf8f07ba9c83ac6c9460ac..78672d324a6ef95394ad72a0b0ba29c7d1155d5d 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -132,7 +132,7 @@ int freeze_processes(void) if (!pm_freezing) atomic_inc(&system_freezing_cnt); - pm_wakeup_clear(); + pm_wakeup_clear(true); pr_info("Freezing user space processes ... "); pm_freezing = true; error = try_to_freeze_tasks(true); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index d79a38de425a0d642834adb5e8ac68ba237db93d..3b1e0f3ad07fa69d9524bcb72d1c6d0bab1a9ad8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -36,6 +36,9 @@ #include #include #include +#ifdef CONFIG_STRICT_KERNEL_RWX +#include +#endif #include "power.h" diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 15e6baef5c73f90b6817c0b1c4e871ea40e30318..c0248c74d6d4cef6dbf09f485f36686862c29094 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -72,6 +72,8 @@ static void freeze_begin(void) static void freeze_enter(void) { + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, true); + spin_lock_irq(&suspend_freeze_lock); if (pm_wakeup_pending()) goto out; @@ -98,6 +100,27 @@ static void freeze_enter(void) out: suspend_freeze_state = FREEZE_STATE_NONE; spin_unlock_irq(&suspend_freeze_lock); + + trace_suspend_resume(TPS("machine_suspend"), PM_SUSPEND_FREEZE, false); +} + +static void s2idle_loop(void) +{ + do { + freeze_enter(); + + if (freeze_ops && freeze_ops->wake) + freeze_ops->wake(); + + dpm_resume_noirq(PMSG_RESUME); + if (freeze_ops && freeze_ops->sync) + freeze_ops->sync(); + + if (pm_wakeup_pending()) + break; + + pm_wakeup_clear(false); + } while (!dpm_suspend_noirq(PMSG_SUSPEND)); } void freeze_wake(void) @@ -371,10 +394,8 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) * all the devices are suspended. 
*/ if (state == PM_SUSPEND_FREEZE) { - trace_suspend_resume(TPS("machine_suspend"), state, true); - freeze_enter(); - trace_suspend_resume(TPS("machine_suspend"), state, false); - goto Platform_wake; + s2idle_loop(); + goto Platform_early_resume; } error = disable_nonboot_cpus(); diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c index d5760c42f042f4accfb65de23784bcaa877d9b00..61d41ca418441a15945b376964f6f3ed07ac84b8 100644 --- a/kernel/printk/braille.c +++ b/kernel/printk/braille.c @@ -2,12 +2,13 @@ #include #include +#include #include #include "console_cmdline.h" #include "braille.h" -char *_braille_console_setup(char **str, char **brl_options) +int _braille_console_setup(char **str, char **brl_options) { if (!strncmp(*str, "brl,", 4)) { *brl_options = ""; @@ -15,14 +16,14 @@ char *_braille_console_setup(char **str, char **brl_options) } else if (!strncmp(*str, "brl=", 4)) { *brl_options = *str + 4; *str = strchr(*brl_options, ','); - if (!*str) + if (!*str) { pr_err("need port name after brl=\n"); - else - *((*str)++) = 0; - } else - return NULL; + return -EINVAL; + } + *((*str)++) = 0; + } - return *str; + return 0; } int diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h index 769d771145c881cd78f43c035ea0866adb96b9b9..749a6756843a08e2a5f2a9ae4fa984658b43a65b 100644 --- a/kernel/printk/braille.h +++ b/kernel/printk/braille.h @@ -9,7 +9,14 @@ braille_set_options(struct console_cmdline *c, char *brl_options) c->brl_options = brl_options; } -char * +/* + * Setup console according to braille options. + * Return -EINVAL on syntax error, 0 on success (or no braille option was + * actually given). + * Modifies str to point to the serial options + * Sets brl_options to the parsed braille options. + */ +int _braille_console_setup(char **str, char **brl_options); int @@ -25,10 +32,10 @@ braille_set_options(struct console_cmdline *c, char *brl_options) { } -static inline char * +static inline int _braille_console_setup(char **str, char **brl_options) { - return NULL; + return 0; } static inline int diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2984fb0f0257420de4a7aa2595a52c34f0d33a0e..a1aecf44ab07c70ab9f33d455646559313926344 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include @@ -269,8 +269,8 @@ static struct console *exclusive_console; #define MAX_CMDLINECONSOLES 8 static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; +static int console_cmdline_cnt; -static int selected_console = -1; static int preferred_console = -1; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); @@ -1002,7 +1002,7 @@ const struct file_operations kmsg_fops = { .release = devkmsg_release, }; -#ifdef CONFIG_KEXEC_CORE +#ifdef CONFIG_CRASH_CORE /* * This appends the listed symbols to /proc/vmcore * @@ -1011,7 +1011,7 @@ const struct file_operations kmsg_fops = { * symbols are specifically used so that utilities can access and extract the * dmesg log from a vmcore file after a crash. */ -void log_buf_kexec_setup(void) +void log_buf_vmcoreinfo_setup(void) { VMCOREINFO_SYMBOL(log_buf); VMCOREINFO_SYMBOL(log_buf_len); @@ -1906,24 +1906,38 @@ static int __add_preferred_console(char *name, int idx, char *options, * See if this tty is not yet registered, and * if we have a slot free. 
*/ - for (i = 0, c = console_cmdline; - i < MAX_CMDLINECONSOLES && c->name[0]; - i++, c++) { + for (i = 0, c = console_cmdline; i < console_cmdline_cnt; i++, c++) { if (strcmp(c->name, name) == 0 && c->index == idx) { - if (!brl_options) - selected_console = i; + if (brl_options) + return 0; + + /* + * Maintain an invariant that will help to find if + * the matching console is preferred, see + * register_console(): + * + * The last non-braille console is always + * the preferred one. + */ + if (i != console_cmdline_cnt - 1) + swap(console_cmdline[i], + console_cmdline[console_cmdline_cnt - 1]); + + preferred_console = console_cmdline_cnt - 1; + return 0; } } if (i == MAX_CMDLINECONSOLES) return -E2BIG; if (!brl_options) - selected_console = i; + preferred_console = i; strlcpy(c->name, name, sizeof(c->name)); c->options = options; braille_set_options(c, brl_options); c->index = idx; + console_cmdline_cnt++; return 0; } /* @@ -2031,15 +2045,16 @@ void resume_console(void) * @cpu: unused * * If printk() is called from a CPU that is not online yet, the messages - * will be spooled but will not show up on the console. This function is - * called when a new CPU comes online (or fails to come up), and ensures - * that any such output gets printed. + * will be printed on the console only if there are CON_ANYTIME consoles. + * This function is called when a new CPU comes online (or fails to come + * up) or goes offline. */ static int console_cpu_notify(unsigned int cpu) { if (!cpuhp_tasks_frozen) { - console_lock(); - console_unlock(); + /* If trylock fails, someone else is doing the printing */ + if (console_trylock()) + console_unlock(); } return 0; } @@ -2161,7 +2176,7 @@ void console_unlock(void) } /* - * Console drivers are called under logbuf_lock, so + * Console drivers are called with interrupts disabled, so * @console_may_schedule should be cleared before; however, we may * end up dumping a lot of lines, for example, if called from * console registration path, and should invoke cond_resched() @@ -2169,11 +2184,15 @@ void console_unlock(void) * scheduling stall on a slow console leading to RCU stall and * softlockup warnings which exacerbate the issue with more * messages practically incapacitating the system. + * + * console_trylock() is not able to detect the preemptive + * context reliably. Therefore the value must be stored before + * and cleared after the "again" goto label. */ do_cond_resched = console_may_schedule; +again: console_may_schedule = 0; -again: /* * We released the console_sem lock, so we need to recheck if * cpu is online and (if not) is there at least one CON_ANYTIME @@ -2409,6 +2428,7 @@ void register_console(struct console *newcon) unsigned long flags; struct console *bcon = NULL; struct console_cmdline *c; + static bool has_preferred; if (console_drivers) for_each_console(bcon) @@ -2435,15 +2455,15 @@ void register_console(struct console *newcon) if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; - if (preferred_console < 0 || bcon || !console_drivers) - preferred_console = selected_console; + if (!has_preferred || bcon || !console_drivers) + has_preferred = preferred_console >= 0; /* * See if we want to use this console driver. If we * didn't select a console we take the first one * that registers here.
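The swap-to-last invariant maintained in __add_preferred_console() above can be modeled in isolation. A toy, userspace-only sketch (illustrative names, braille handling omitted, not the kernel code): re-adding a known console name moves its entry to the end of the array, so the preferred console is always the last entry.

#include <stdio.h>
#include <string.h>

#define MAX_TOY 8

static char toy_cmdline[MAX_TOY][8];
static int toy_cnt, toy_preferred = -1;

/* Mimic __add_preferred_console(): re-adding a name moves it last. */
static void toy_add(const char *name)
{
	int i;

	for (i = 0; i < toy_cnt; i++) {
		if (!strcmp(toy_cmdline[i], name)) {
			if (i != toy_cnt - 1) {
				char tmp[8];

				strcpy(tmp, toy_cmdline[i]);
				strcpy(toy_cmdline[i], toy_cmdline[toy_cnt - 1]);
				strcpy(toy_cmdline[toy_cnt - 1], tmp);
			}
			toy_preferred = toy_cnt - 1;
			return;
		}
	}
	if (toy_cnt == MAX_TOY)
		return;
	strcpy(toy_cmdline[toy_cnt], name);
	toy_preferred = toy_cnt++;
}

int main(void)
{
	toy_add("ttyAMA0");	/* e.g. added from SPCR */
	toy_add("tty0");
	toy_add("ttyAMA0");	/* re-added from the command line */
	/* Prints "preferred: ttyAMA0 (slot 1 of 2)". */
	printf("preferred: %s (slot %d of %d)\n",
	       toy_cmdline[toy_preferred], toy_preferred, toy_cnt);
	return 0;
}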
*/ - if (preferred_console < 0) { + if (!has_preferred) { if (newcon->index < 0) newcon->index = 0; if (newcon->setup == NULL || @@ -2451,18 +2471,29 @@ void register_console(struct console *newcon) newcon->flags |= CON_ENABLED; if (newcon->device) { newcon->flags |= CON_CONSDEV; - preferred_console = 0; + has_preferred = true; } } } /* - * See if this console matches one we selected on - * the command line. + * See if this console matches one we selected on the command line. + * + * There may be several entries in the console_cmdline array matching + * with the same console, one with newcon->match(), another by + * name/index: + * + * pl011,mmio,0x87e024000000,115200 -- added from SPCR + * ttyAMA0 -- added from command line + * + * Traverse the console_cmdline array in reverse order to be + * sure that if this console is preferred then it will be the first + * matching entry. We use the invariant that is maintained in + * __add_preferred_console(). */ - for (i = 0, c = console_cmdline; - i < MAX_CMDLINECONSOLES && c->name[0]; - i++, c++) { + for (i = console_cmdline_cnt - 1; i >= 0; i--) { + c = console_cmdline + i; + if (!newcon->match || newcon->match(newcon, c->name, c->index, c->options) != 0) { /* default matching */ @@ -2484,9 +2515,9 @@ void register_console(struct console *newcon) } newcon->flags |= CON_ENABLED; - if (i == selected_console) { + if (i == preferred_console) { newcon->flags |= CON_CONSDEV; - preferred_console = selected_console; + has_preferred = true; } break; } @@ -2610,6 +2641,30 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); +/* + * Initialize the console device. This is called *early*, so + * we can't necessarily depend on lots of kernel help here. + * Just do some early initializations, and do the complex setup + * later. + */ +void __init console_init(void) +{ + initcall_t *call; + + /* Setup the default TTY line discipline. */ + n_tty_init(); + + /* + * set up the console device so that later boot sequences can + * inform about problems etc.. + */ + call = __con_initcall_start; + while (call < __con_initcall_end) { + (*call)(); + call++; + } +} + /* * Some boot consoles access data that is in the init section and which will * be discarded after the initcalls have been run. To make sure that no code diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 18dfc485225c3954ccf29a8743bc636f10db7d41..23803c7d51804debb62b91934cdba42f305c70d3 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -3,10 +3,13 @@ KCOV_INSTRUMENT := n obj-y += update.o sync.o -obj-$(CONFIG_SRCU) += srcu.o +obj-$(CONFIG_CLASSIC_SRCU) += srcu.o +obj-$(CONFIG_TREE_SRCU) += srcutree.o +obj-$(CONFIG_TINY_SRCU) += srcutiny.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RCU_PERF_TEST) += rcuperf.o obj-$(CONFIG_TREE_RCU) += tree.o obj-$(CONFIG_PREEMPT_RCU) += tree.o obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o obj-$(CONFIG_TINY_RCU) += tiny.o +obj-$(CONFIG_RCU_NEED_SEGCBLIST) += rcu_segcblist.o diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 0d6ff3e471be6c1597e0e78fb90d07eb0ce9c546..73e16ec4054b83d163f58d6786df4e5fb6a00ae4 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -56,6 +56,83 @@ #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ DYNTICK_TASK_FLAG) + +/* + * Grace-period counter management. 
+ */ + +#define RCU_SEQ_CTR_SHIFT 2 +#define RCU_SEQ_STATE_MASK ((1 << RCU_SEQ_CTR_SHIFT) - 1) + +/* + * Return the counter portion of a sequence number previously returned + * by rcu_seq_snap() or rcu_seq_current(). + */ +static inline unsigned long rcu_seq_ctr(unsigned long s) +{ + return s >> RCU_SEQ_CTR_SHIFT; +} + +/* + * Return the state portion of a sequence number previously returned + * by rcu_seq_snap() or rcu_seq_current(). + */ +static inline int rcu_seq_state(unsigned long s) +{ + return s & RCU_SEQ_STATE_MASK; +} + +/* + * Set the state portion of the pointed-to sequence number. + * The caller is responsible for preventing conflicting updates. + */ +static inline void rcu_seq_set_state(unsigned long *sp, int newstate) +{ + WARN_ON_ONCE(newstate & ~RCU_SEQ_STATE_MASK); + WRITE_ONCE(*sp, (*sp & ~RCU_SEQ_STATE_MASK) + newstate); +} + +/* Adjust sequence number for start of update-side operation. */ +static inline void rcu_seq_start(unsigned long *sp) +{ + WRITE_ONCE(*sp, *sp + 1); + smp_mb(); /* Ensure update-side operation after counter increment. */ + WARN_ON_ONCE(rcu_seq_state(*sp) != 1); +} + +/* Adjust sequence number for end of update-side operation. */ +static inline void rcu_seq_end(unsigned long *sp) +{ + smp_mb(); /* Ensure update-side operation before counter increment. */ + WARN_ON_ONCE(!rcu_seq_state(*sp)); + WRITE_ONCE(*sp, (*sp | RCU_SEQ_STATE_MASK) + 1); +} + +/* Take a snapshot of the update side's sequence number. */ +static inline unsigned long rcu_seq_snap(unsigned long *sp) +{ + unsigned long s; + + s = (READ_ONCE(*sp) + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK; + smp_mb(); /* Above access must not bleed into critical section. */ + return s; +} + +/* Return the current value of the update side's sequence number, no ordering. */ +static inline unsigned long rcu_seq_current(unsigned long *sp) +{ + return READ_ONCE(*sp); +} + +/* + * Given a snapshot from rcu_seq_snap(), determine whether or not a + * full update-side operation has occurred. + */ +static inline bool rcu_seq_done(unsigned long *sp, unsigned long s) +{ + return ULONG_CMP_GE(READ_ONCE(*sp), s); +} + /* * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally * by call_rcu() and rcu callback execution, and are therefore not part of the @@ -109,12 +186,12 @@ static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); if (__is_kfree_rcu_offset(offset)) { - RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); + RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset);) kfree((void *)head - offset); rcu_lock_release(&rcu_callback_map); return true; } else { - RCU_TRACE(trace_rcu_invoke_callback(rn, head)); + RCU_TRACE(trace_rcu_invoke_callback(rn, head);) head->func(head); rcu_lock_release(&rcu_callback_map); return false; @@ -144,4 +221,76 @@ void rcu_test_sync_prims(void); */ extern void resched_cpu(int cpu); + +#if defined(SRCU) || !defined(TINY_RCU) + +#include + +extern int rcu_num_lvls; +extern int num_rcu_lvl[]; +extern int rcu_num_nodes; +static bool rcu_fanout_exact; +static int rcu_fanout_leaf; + +/* + * Compute the per-level fanout, either using the exact fanout specified + * or balancing the tree, depending on the rcu_fanout_exact boot parameter.
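A worked sketch of the sequence-number layout implemented by the rcu_seq_*() helpers above, written as plain userspace arithmetic rather than kernel code: the low RCU_SEQ_CTR_SHIFT bits carry the grace-period state, and the remaining bits carry the counter.

#include <stdio.h>

#define RCU_SEQ_CTR_SHIFT  2
#define RCU_SEQ_STATE_MASK ((1UL << RCU_SEQ_CTR_SHIFT) - 1)

int main(void)
{
	unsigned long seq = 8;	/* Counter 2, state 0: no grace period in flight. */

	/* rcu_seq_snap() while idle: done once one full grace period elapses. */
	unsigned long snap_idle = (seq + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;

	seq = seq + 1;	/* rcu_seq_start(): counter 2, state 1. */

	/* rcu_seq_snap() mid-GP: the already-running grace period cannot count. */
	unsigned long snap_busy = (seq + 2 * RCU_SEQ_STATE_MASK + 1) & ~RCU_SEQ_STATE_MASK;

	seq = (seq | RCU_SEQ_STATE_MASK) + 1;	/* rcu_seq_end(): counter 3, state 0. */

	/* Prints "snap idle=3 busy=4, now at 3": the busy snapshot is not yet done. */
	printf("snap idle=%lu busy=%lu, now at %lu\n",
	       snap_idle >> RCU_SEQ_CTR_SHIFT,
	       snap_busy >> RCU_SEQ_CTR_SHIFT,
	       seq >> RCU_SEQ_CTR_SHIFT);
	return 0;
}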
+ */ +static inline void rcu_init_levelspread(int *levelspread, const int *levelcnt) +{ + int i; + + if (rcu_fanout_exact) { + levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; + for (i = rcu_num_lvls - 2; i >= 0; i--) + levelspread[i] = RCU_FANOUT; + } else { + int ccur; + int cprv; + + cprv = nr_cpu_ids; + for (i = rcu_num_lvls - 1; i >= 0; i--) { + ccur = levelcnt[i]; + levelspread[i] = (cprv + ccur - 1) / ccur; + cprv = ccur; + } + } +} + +/* + * Do a full breadth-first scan of the rcu_node structures for the + * specified rcu_state structure. + */ +#define rcu_for_each_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) + +/* + * Do a breadth-first scan of the non-leaf rcu_node structures for the + * specified rcu_state structure. Note that if there is a singleton + * rcu_node tree with but one rcu_node structure, this loop is a no-op. + */ +#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ + for ((rnp) = &(rsp)->node[0]; \ + (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) + +/* + * Scan the leaves of the rcu_node hierarchy for the specified rcu_state + * structure. Note that if there is a singleton rcu_node tree with but + * one rcu_node structure, this loop -will- visit the rcu_node structure. + * It is still a leaf node, even if it is also the root node. + */ +#define rcu_for_each_leaf_node(rsp, rnp) \ + for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ + (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) + +/* + * Iterate over all possible CPUs in a leaf RCU node. + */ +#define for_each_leaf_node_possible_cpu(rnp, cpu) \ + for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ + cpu <= rnp->grphi; \ + cpu = cpumask_next((cpu), cpu_possible_mask)) + +#endif /* #if defined(SRCU) || !defined(TINY_RCU) */ + #endif /* __LINUX_RCU_H */ diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c new file mode 100644 index 0000000000000000000000000000000000000000..2b62a38b080fa5e04af36635f2d6dcedffe8c21e --- /dev/null +++ b/kernel/rcu/rcu_segcblist.c @@ -0,0 +1,505 @@ +/* + * RCU segmented callback lists, function definitions + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney + */ + +#include +#include +#include + +#include "rcu_segcblist.h" + +/* Initialize simple callback list. */ +void rcu_cblist_init(struct rcu_cblist *rclp) +{ + rclp->head = NULL; + rclp->tail = &rclp->head; + rclp->len = 0; + rclp->len_lazy = 0; +} + +/* + * Debug function to actually count the number of callbacks. + * If the number exceeds the limit specified, return -1. 
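As a standalone illustration of the balancing branch of rcu_init_levelspread() above (the rcu_fanout_exact == false case), consider a hypothetical two-level tree with one root node, six leaf nodes, and 96 possible CPUs; the spread at each level is just a ceiling division:

#include <stdio.h>

int main(void)
{
	int levelcnt[] = { 1, 6 };	/* One root node, six leaf nodes. */
	int levelspread[2];
	int nr_cpu_ids = 96;
	int cprv = nr_cpu_ids;
	int i;

	/* Same ceiling-division balancing as rcu_init_levelspread(). */
	for (i = 1; i >= 0; i--) {
		int ccur = levelcnt[i];

		levelspread[i] = (cprv + ccur - 1) / ccur;
		cprv = ccur;
	}
	/* Prints "root fanout 6, leaf fanout 16". */
	printf("root fanout %d, leaf fanout %d\n", levelspread[0], levelspread[1]);
	return 0;
}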
+ */ +long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim) +{ + int cnt = 0; + struct rcu_head **rhpp = &rclp->head; + + for (;;) { + if (!*rhpp) + return cnt; + if (++cnt > lim) + return -1; + rhpp = &(*rhpp)->next; + } +} + +/* + * Dequeue the oldest rcu_head structure from the specified callback + * list. This function assumes that the callback is non-lazy, but + * the caller can later invoke rcu_cblist_dequeued_lazy() if it + * finds otherwise (and if it cares about laziness). This allows + * different users to have different ways of determining laziness. + */ +struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp) +{ + struct rcu_head *rhp; + + rhp = rclp->head; + if (!rhp) + return NULL; + rclp->len--; + rclp->head = rhp->next; + if (!rclp->head) + rclp->tail = &rclp->head; + return rhp; +} + +/* + * Initialize an rcu_segcblist structure. + */ +void rcu_segcblist_init(struct rcu_segcblist *rsclp) +{ + int i; + + BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq)); + BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq)); + rsclp->head = NULL; + for (i = 0; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = &rsclp->head; + rsclp->len = 0; + rsclp->len_lazy = 0; +} + +/* + * Disable the specified rcu_segcblist structure, so that callbacks can + * no longer be posted to it. This structure must be empty. + */ +void rcu_segcblist_disable(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(!rcu_segcblist_empty(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_cbs(rsclp)); + WARN_ON_ONCE(rcu_segcblist_n_lazy_cbs(rsclp)); + rsclp->tails[RCU_NEXT_TAIL] = NULL; +} + +/* + * Is the specified segment of the specified rcu_segcblist structure + * empty of callbacks? + */ +bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg) +{ + if (seg == RCU_DONE_TAIL) + return &rsclp->head == rsclp->tails[RCU_DONE_TAIL]; + return rsclp->tails[seg - 1] == rsclp->tails[seg]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are ready to be invoked? + */ +bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + &rsclp->head != rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * are still pending, that is, not yet ready to be invoked? + */ +bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL); +} + +/* + * Dequeue and return the first ready-to-invoke callback. If there + * are no ready-to-invoke callbacks, return NULL. Disables interrupts + * to avoid interference. Does not protect from interference from other + * CPUs or tasks. + */ +struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + int i; + struct rcu_head *rhp; + + local_irq_save(flags); + if (!rcu_segcblist_ready_cbs(rsclp)) { + local_irq_restore(flags); + return NULL; + } + rhp = rsclp->head; + BUG_ON(!rhp); + rsclp->head = rhp->next; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) { + if (rsclp->tails[i] != &rhp->next) + break; + rsclp->tails[i] = &rsclp->head; + } + smp_mb(); /* Dequeue before decrement for rcu_barrier(). */ + WRITE_ONCE(rsclp->len, rsclp->len - 1); + local_irq_restore(flags); + return rhp; +} + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. 
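A fragment sketching the head/tail-of-tail representation behind rcu_cblist_init() and rcu_cblist_dequeue() above; the hand-rolled append exists only to show the invariant, since the kernel fills these lists through the extraction helpers defined later in this file:

	struct rcu_cblist cbl;
	struct rcu_head rh = { .next = NULL };

	rcu_cblist_init(&cbl);		/* head == NULL, tail == &cbl.head. */

	*cbl.tail = &rh;		/* Append: store through the tail pointer... */
	cbl.tail = &rh.next;		/* ...then advance tail to &rh.next. */
	cbl.len = 1;

	/* rcu_cblist_dequeue() pops rh and restores tail = &cbl.head. */
	WARN_ON_ONCE(rcu_cblist_dequeue(&cbl) != &rh);
	WARN_ON_ONCE(cbl.head || cbl.tail != &cbl.head || cbl.len);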
+ */ +void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp) +{ + unsigned long flags; + + local_irq_save(flags); + rsclp->len_lazy--; + local_irq_restore(flags); +} + +/* + * Return a pointer to the first callback in the specified rcu_segcblist + * structure. This is useful for diagnostics. + */ +struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return rsclp->head; + return NULL; +} + +/* + * Return a pointer to the first pending callback in the specified + * rcu_segcblist structure. This is useful just after posting a given + * callback -- if that callback is the first pending callback, then + * you cannot rely on someone else having already started up the required + * grace period. + */ +struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp) +{ + if (rcu_segcblist_is_enabled(rsclp)) + return *rsclp->tails[RCU_DONE_TAIL]; + return NULL; +} + +/* + * Does the specified rcu_segcblist structure contain callbacks that + * have not yet been processed beyond having been posted, that is, + * does it contain callbacks in its last segment? + */ +bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp) +{ + return rcu_segcblist_is_enabled(rsclp) && + !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL); +} + +/* + * Enqueue the specified callback onto the specified rcu_segcblist + * structure, updating accounting as needed. Note that the ->len + * field may be accessed locklessly, hence the WRITE_ONCE(). + * The ->len field is used by rcu_barrier() and friends to determine + * if it must post a callback on this structure, and it is OK + * for rcu_barrier() to sometimes post callbacks needlessly, but + * absolutely not OK for it to ever miss posting a callback. + */ +void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */ + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is enqueued. */ + rhp->next = NULL; + *rsclp->tails[RCU_NEXT_TAIL] = rhp; + rsclp->tails[RCU_NEXT_TAIL] = &rhp->next; +} + +/* + * Entrain the specified callback onto the specified rcu_segcblist at + * the end of the last non-empty segment. If the entire rcu_segcblist + * is empty, make no change, but return false. + * + * This is intended for use by rcu_barrier()-like primitives, -not- + * for normal grace-period use. IMPORTANT: The callback you enqueue + * will wait for all prior callbacks, NOT necessarily for a grace + * period. You have been warned. + */ +bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy) +{ + int i; + + if (rcu_segcblist_n_cbs(rsclp) == 0) + return false; + WRITE_ONCE(rsclp->len, rsclp->len + 1); + if (lazy) + rsclp->len_lazy++; + smp_mb(); /* Ensure counts are updated before callback is entrained. */ + rhp->next = NULL; + for (i = RCU_NEXT_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1]) + break; + *rsclp->tails[i] = rhp; + for (; i <= RCU_NEXT_TAIL; i++) + rsclp->tails[i] = &rhp->next; + return true; +} + +/* + * Extract only the counts from the specified rcu_segcblist structure, + * and place them in the specified rcu_cblist structure. This function + * supports both callback orphaning and invocation, hence the separation + * of counts and callbacks. (Callbacks ready for invocation must be + * orphaned and adopted separately from pending callbacks, but counts + * apply to all callbacks. 
Locking must be used to make sure that + * both orphaned-callbacks lists are consistent.) + */ +void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rclp->len_lazy += rsclp->len_lazy; + rclp->len += rsclp->len; + rsclp->len_lazy = 0; + WRITE_ONCE(rsclp->len, 0); /* ->len sampled locklessly. */ +} + +/* + * Extract only those callbacks ready to be invoked from the specified + * rcu_segcblist structure and place them in the specified rcu_cblist + * structure. + */ +void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_ready_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = rsclp->head; + rsclp->head = *rsclp->tails[RCU_DONE_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + rclp->tail = rsclp->tails[RCU_DONE_TAIL]; + for (i = RCU_CBLIST_NSEGS - 1; i >= RCU_DONE_TAIL; i--) + if (rsclp->tails[i] == rsclp->tails[RCU_DONE_TAIL]) + rsclp->tails[i] = &rsclp->head; +} + +/* + * Extract only those callbacks still pending (not yet ready to be + * invoked) from the specified rcu_segcblist structure and place them in + * the specified rcu_cblist structure. Note that this loses information + * about any callbacks that might have been partway done waiting for + * their grace period. Too bad! They will have to start over. + */ +void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rcu_segcblist_pend_cbs(rsclp)) + return; /* Nothing to do. */ + *rclp->tail = *rsclp->tails[RCU_DONE_TAIL]; + rclp->tail = rsclp->tails[RCU_NEXT_TAIL]; + *rsclp->tails[RCU_DONE_TAIL] = NULL; + for (i = RCU_DONE_TAIL + 1; i < RCU_CBLIST_NSEGS; i++) + rsclp->tails[i] = rsclp->tails[RCU_DONE_TAIL]; +} + +/* + * Insert counts from the specified rcu_cblist structure in the + * specified rcu_segcblist structure. + */ +void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + rsclp->len_lazy += rclp->len_lazy; + /* ->len sampled locklessly. */ + WRITE_ONCE(rsclp->len, rsclp->len + rclp->len); + rclp->len_lazy = 0; + rclp->len = 0; +} + +/* + * Move callbacks from the specified rcu_cblist to the beginning of the + * done-callbacks segment of the specified rcu_segcblist. + */ +void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + int i; + + if (!rclp->head) + return; /* No callbacks to move. */ + *rclp->tail = rsclp->head; + rsclp->head = rclp->head; + for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) + if (&rsclp->head == rsclp->tails[i]) + rsclp->tails[i] = rclp->tail; + else + break; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Move callbacks from the specified rcu_cblist to the end of the + * new-callbacks segment of the specified rcu_segcblist. + */ +void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp) +{ + if (!rclp->head) + return; /* Nothing to do. */ + *rsclp->tails[RCU_NEXT_TAIL] = rclp->head; + rsclp->tails[RCU_NEXT_TAIL] = rclp->tail; + rclp->head = NULL; + rclp->tail = &rclp->head; +} + +/* + * Advance the callbacks in the specified rcu_segcblist structure based + * on the current value passed in for the grace-period counter. 
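The extraction and insertion helpers above pair up into the usual orphan-and-adopt sequence for moving callbacks off a departing CPU. A condensed sketch (hypothetical function names; locking and the surrounding rcu_state plumbing omitted; @done and @pend must already be rcu_cblist_init()ed):

/* Strip @src of its counts and callbacks; the counts ride on @done. */
static void sketch_orphan_cbs(struct rcu_segcblist *src,
			      struct rcu_cblist *done,
			      struct rcu_cblist *pend)
{
	rcu_segcblist_extract_count(src, done);
	rcu_segcblist_extract_done_cbs(src, done);
	rcu_segcblist_extract_pend_cbs(src, pend);
}

/* Splice the orphaned counts and callbacks into the surviving @dst. */
static void sketch_adopt_cbs(struct rcu_segcblist *dst,
			     struct rcu_cblist *done,
			     struct rcu_cblist *pend)
{
	rcu_segcblist_insert_count(dst, done);
	rcu_segcblist_insert_done_cbs(dst, done);
	rcu_segcblist_insert_pend_cbs(dst, pend);
}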
+ */ +void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq) +{ + int i, j; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return; + + /* + * Find all callbacks whose ->gp_seq numbers indicate that they + * are ready to invoke, and put them into the RCU_DONE_TAIL segment. + */ + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { + if (ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + break; + rsclp->tails[RCU_DONE_TAIL] = rsclp->tails[i]; + } + + /* If no callbacks moved, nothing more need be done. */ + if (i == RCU_WAIT_TAIL) + return; + + /* Clean up tail pointers that might have been misordered above. */ + for (j = RCU_WAIT_TAIL; j < i; j++) + rsclp->tails[j] = rsclp->tails[RCU_DONE_TAIL]; + + /* + * Callbacks moved, so clean up the misordered ->tails[] pointers + * that now point into the middle of the list of ready-to-invoke + * callbacks. The overall effect is to copy down the later pointers + * into the gap that was created by the now-ready segments. + */ + for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { + if (rsclp->tails[j] == rsclp->tails[RCU_NEXT_TAIL]) + break; /* No more callbacks. */ + rsclp->tails[j] = rsclp->tails[i]; + rsclp->gp_seq[j] = rsclp->gp_seq[i]; + } +} + +/* + * "Accelerate" callbacks based on more-accurate grace-period information. + * The reason for this is that RCU does not synchronize the beginnings and + * ends of grace periods, and that callbacks are posted locally. This in + * turn means that the callbacks must be labelled conservatively early + * on, as getting exact information would degrade both performance and + * scalability. When more accurate grace-period information becomes + * available, previously posted callbacks can be "accelerated", marking + * them to complete at the end of the earlier grace period. + * + * This function operates on an rcu_segcblist structure, and also the + * grace-period sequence number seq at which new callbacks would become + * ready to invoke. Returns true if there are callbacks that won't be + * ready to invoke until seq, false otherwise. + */ +bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq) +{ + int i; + + WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp)); + if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL)) + return false; + + /* + * Find the segment preceding the oldest segment of callbacks + * whose ->gp_seq[] completion is at or after that passed in via + * "seq", skipping any empty segments. This oldest segment, along + * with any later segments, can be merged in with any newly arrived + * callbacks in the RCU_NEXT_TAIL segment, and assigned "seq" + * as their ->gp_seq[] grace-period completion sequence number. + */ + for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--) + if (rsclp->tails[i] != rsclp->tails[i - 1] && + ULONG_CMP_LT(rsclp->gp_seq[i], seq)) + break; + + /* + * If all the segments contain callbacks that correspond to + * earlier grace-period sequence numbers than "seq", leave. + * Assuming that the rcu_segcblist structure has enough + * segments in its arrays, this can only happen if some of + * the non-done segments contain callbacks that really are + * ready to invoke. This situation will get straightened + * out by the next call to rcu_segcblist_advance(). + * + * Also advance to the oldest segment of callbacks whose + * ->gp_seq[] completion is at or after that passed in via "seq", + * skipping any empty segments. 
+ */ + if (++i >= RCU_NEXT_TAIL) + return false; + + /* + * Merge all later callbacks, including newly arrived callbacks, + * into the segment located by the for-loop above. Assign "seq" + * as the ->gp_seq[] value in order to correctly handle the case + * where there were no pending callbacks in the rcu_segcblist + * structure other than in the RCU_NEXT_TAIL segment. + */ + for (; i < RCU_NEXT_TAIL; i++) { + rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; + rsclp->gp_seq[i] = seq; + } + return true; +} + +/* + * Scan the specified rcu_segcblist structure for callbacks that need + * a grace period later than the one specified by "seq". We don't look + * at the RCU_DONE_TAIL or RCU_NEXT_TAIL segments because they don't + * have a grace-period sequence number. + */ +bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, + unsigned long seq) +{ + int i; + + for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) + if (rsclp->tails[i - 1] != rsclp->tails[i] && + ULONG_CMP_LT(seq, rsclp->gp_seq[i])) + return true; + return false; +} diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h new file mode 100644 index 0000000000000000000000000000000000000000..6e36e36478cd3f57338260904eb9335d3e0b85a5 --- /dev/null +++ b/kernel/rcu/rcu_segcblist.h @@ -0,0 +1,164 @@ +/* + * RCU segmented callback lists, internal-to-rcu header file + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright IBM Corporation, 2017 + * + * Authors: Paul E. McKenney + */ + +#include + +/* + * Account for the fact that a previously dequeued callback turned out + * to be marked as lazy. + */ +static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp) +{ + rclp->len_lazy--; +} + +/* + * Interim function to return rcu_cblist head pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp) +{ + return rclp->head; +} + +/* + * Interim function to return rcu_cblist tail pointer. Longer term, the + * rcu_cblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp) +{ + WARN_ON_ONCE(!rclp->head); + return rclp->tail; +} + +void rcu_cblist_init(struct rcu_cblist *rclp); +long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim); +struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); + +/* + * Is the specified rcu_segcblist structure empty? + * + * But careful! The fact that the ->head field is NULL does not + * necessarily imply that there are no callbacks associated with + * this structure. When callbacks are being invoked, they are + * removed as a group. If callback invocation must be preempted, + * the remaining callbacks will be added back to the list. Either + * way, the counts are updated later.
+ * + * So it is often the case that rcu_segcblist_n_cbs() should be used + * instead. + */ +static inline bool rcu_segcblist_empty(struct rcu_segcblist *rsclp) +{ + return !rsclp->head; +} + +/* Return number of callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_cbs(struct rcu_segcblist *rsclp) +{ + return READ_ONCE(rsclp->len); +} + +/* Return number of lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_lazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len_lazy; +} + +/* Return number of non-lazy callbacks in segmented callback list. */ +static inline long rcu_segcblist_n_nonlazy_cbs(struct rcu_segcblist *rsclp) +{ + return rsclp->len - rsclp->len_lazy; +} + +/* + * Is the specified rcu_segcblist enabled, for example, not corresponding + * to an offline or callback-offloaded CPU? + */ +static inline bool rcu_segcblist_is_enabled(struct rcu_segcblist *rsclp) +{ + return !!rsclp->tails[RCU_NEXT_TAIL]; +} + +/* + * Are all segments following the specified segment of the specified + * rcu_segcblist structure empty of callbacks? (The specified + * segment might well contain callbacks.) + */ +static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg) +{ + return !*rsclp->tails[seg]; +} + +/* + * Interim function to return rcu_segcblist head pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head *rcu_segcblist_head(struct rcu_segcblist *rsclp) +{ + return rsclp->head; +} + +/* + * Interim function to return rcu_segcblist tail pointer. Longer term, the + * rcu_segcblist will be used more pervasively, removing the need for this + * function. + */ +static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp) +{ + WARN_ON_ONCE(rcu_segcblist_empty(rsclp)); + return rsclp->tails[RCU_NEXT_TAIL]; +} + +void rcu_segcblist_init(struct rcu_segcblist *rsclp); +void rcu_segcblist_disable(struct rcu_segcblist *rsclp); +bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg); +bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); +bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); +struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp); +void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp); +struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); +struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); +bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp); +void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy); +bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, + struct rcu_head *rhp, bool lazy); +void rcu_segcblist_extract_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_extract_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_extract_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_insert_count(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_insert_done_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_insert_pend_cbs(struct rcu_segcblist *rsclp, + struct rcu_cblist *rclp); +void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq); +bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); +bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, + unsigned long seq); diff --git 
a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index cccc417a813502f8bc0f6d0af930b25d3d1b4f52..ae6e574d4cf5c3fbdd8ab9a56009981cec9935a1 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -559,19 +559,34 @@ static void srcu_torture_barrier(void) static void srcu_torture_stats(void) { - int cpu; - int idx = srcu_ctlp->completed & 0x1; + int __maybe_unused cpu; + int idx; - pr_alert("%s%s per-CPU(idx=%d):", +#if defined(CONFIG_TREE_SRCU) || defined(CONFIG_CLASSIC_SRCU) +#ifdef CONFIG_TREE_SRCU + idx = srcu_ctlp->srcu_idx & 0x1; +#else /* #ifdef CONFIG_TREE_SRCU */ + idx = srcu_ctlp->completed & 0x1; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ + pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", torture_type, TORTURE_FLAG, idx); for_each_possible_cpu(cpu) { unsigned long l0, l1; unsigned long u0, u1; long c0, c1; - struct srcu_array *counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); +#ifdef CONFIG_TREE_SRCU + struct srcu_data *counts; + counts = per_cpu_ptr(srcu_ctlp->sda, cpu); + u0 = counts->srcu_unlock_count[!idx]; + u1 = counts->srcu_unlock_count[idx]; +#else /* #ifdef CONFIG_TREE_SRCU */ + struct srcu_array *counts; + + counts = per_cpu_ptr(srcu_ctlp->per_cpu_ref, cpu); u0 = counts->unlock_count[!idx]; u1 = counts->unlock_count[idx]; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ /* * Make sure that a lock is always counted if the corresponding @@ -579,14 +594,26 @@ static void srcu_torture_stats(void) */ smp_rmb(); +#ifdef CONFIG_TREE_SRCU + l0 = counts->srcu_lock_count[!idx]; + l1 = counts->srcu_lock_count[idx]; +#else /* #ifdef CONFIG_TREE_SRCU */ l0 = counts->lock_count[!idx]; l1 = counts->lock_count[idx]; +#endif /* #else #ifdef CONFIG_TREE_SRCU */ c0 = l0 - u0; c1 = l1 - u1; pr_cont(" %d(%ld,%ld)", cpu, c0, c1); } pr_cont("\n"); +#elif defined(CONFIG_TINY_SRCU) + idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1; + pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%d,%d)\n", + torture_type, TORTURE_FLAG, idx, + READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]), + READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx])); +#endif } static void srcu_torture_synchronize_expedited(void) @@ -1333,12 +1360,14 @@ rcu_torture_stats_print(void) cur_ops->stats(); if (rtcv_snap == rcu_torture_current_version && rcu_torture_current != NULL) { - int __maybe_unused flags; - unsigned long __maybe_unused gpnum; - unsigned long __maybe_unused completed; + int __maybe_unused flags = 0; + unsigned long __maybe_unused gpnum = 0; + unsigned long __maybe_unused completed = 0; rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); + srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, + &flags, &gpnum, &completed); wtp = READ_ONCE(writer_task); pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", rcu_torture_writer_state_getname(), diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index ef3bcfb15b39ec4815275077d3e79825db740a27..584d8a983883584dfa87ff0e36cf01465506c199 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -22,7 +22,7 @@ * Lai Jiangshan * * For detailed explanation of Read-Copy Update mechanism see - - * Documentation/RCU/ *.txt + * Documentation/RCU/ *.txt * */ @@ -243,8 +243,14 @@ static bool srcu_readers_active(struct srcu_struct *sp) * cleanup_srcu_struct - deconstruct a sleep-RCU structure * @sp: structure to clean up. * - * Must invoke this after you are finished using a given srcu_struct that - * was initialized via init_srcu_struct(), else you leak memory. 
+ * Must invoke this only after you are finished using a given srcu_struct + * that was initialized via init_srcu_struct(). This code does some + * probabilistic checking, spotting late uses of srcu_read_lock(), + * synchronize_srcu(), synchronize_srcu_expedited(), and call_srcu(). + * If any such late uses are detected, the per-CPU memory associated with + * the srcu_struct is simply leaked and WARN_ON() is invoked. If the + * caller frees the srcu_struct itself, a use-after-free crash will likely + * ensue, but at least there will be a warning printed. */ void cleanup_srcu_struct(struct srcu_struct *sp) { diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c new file mode 100644 index 0000000000000000000000000000000000000000..36e1f82faed15cb8e3bbfb3b7705bcdf29b5da71 --- /dev/null +++ b/kernel/rcu/srcutiny.c @@ -0,0 +1,216 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion, + * tiny version for non-preemptible single-CPU use. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2017 + * + * Author: Paul McKenney + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include "rcu_segcblist.h" +#include "rcu.h" + +static int init_srcu_struct_fields(struct srcu_struct *sp) +{ + sp->srcu_lock_nesting[0] = 0; + sp->srcu_lock_nesting[1] = 0; + init_swait_queue_head(&sp->srcu_wq); + sp->srcu_gp_seq = 0; + rcu_segcblist_init(&sp->srcu_cblist); + sp->srcu_gp_running = false; + sp->srcu_gp_waiting = false; + sp->srcu_idx = 0; + INIT_WORK(&sp->srcu_work, srcu_drive_gp); + return 0; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. + */ +int init_srcu_struct(struct srcu_struct *sp) +{ + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory.
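A minimal lifecycle sketch for this Tiny SRCU flavor, using the public srcu_read_lock()/srcu_read_unlock() wrappers around the functions defined in this file; the domain name is illustrative, and teardown assumes all call_srcu() callbacks have already drained:

static struct srcu_struct demo_srcu;	/* Illustrative domain name. */

static void demo(void)
{
	int idx;

	init_srcu_struct(&demo_srcu);

	idx = srcu_read_lock(&demo_srcu);	/* Reader side. */
	/* ... dereference SRCU-protected data here ... */
	srcu_read_unlock(&demo_srcu, idx);

	synchronize_srcu(&demo_srcu);	/* Updater: wait for pre-existing readers. */

	cleanup_srcu_struct(&demo_srcu);	/* Only once all users are done. */
}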
+ */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + WARN_ON(sp->srcu_lock_nesting[0] || sp->srcu_lock_nesting[1]); + flush_work(&sp->srcu_work); + WARN_ON(rcu_seq_state(sp->srcu_gp_seq)); + WARN_ON(sp->srcu_gp_running); + WARN_ON(sp->srcu_gp_waiting); + WARN_ON(!rcu_segcblist_empty(&sp->srcu_cblist)); +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + idx = READ_ONCE(sp->srcu_idx); + WRITE_ONCE(sp->srcu_lock_nesting[idx], sp->srcu_lock_nesting[idx] + 1); + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock); + +/* + * Removes the count for the old reader from the appropriate element of + * the srcu_struct. Must be called from process context. + */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + int newval = sp->srcu_lock_nesting[idx] - 1; + + WRITE_ONCE(sp->srcu_lock_nesting[idx], newval); + if (!newval && READ_ONCE(sp->srcu_gp_waiting)) + swake_up(&sp->srcu_wq); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * Workqueue handler to drive one grace period and invoke any callbacks + * that become ready as a result. Single-CPU and !PREEMPT operation + * means that we get away with murder on synchronization. ;-) + */ +void srcu_drive_gp(struct work_struct *wp) +{ + int idx; + struct rcu_cblist ready_cbs; + struct srcu_struct *sp; + struct rcu_head *rhp; + + sp = container_of(wp, struct srcu_struct, srcu_work); + if (sp->srcu_gp_running || rcu_segcblist_empty(&sp->srcu_cblist)) + return; /* Already running or nothing to do. */ + + /* Tag recently arrived callbacks and wait for readers. */ + WRITE_ONCE(sp->srcu_gp_running, true); + rcu_segcblist_accelerate(&sp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + rcu_seq_start(&sp->srcu_gp_seq); + idx = sp->srcu_idx; + WRITE_ONCE(sp->srcu_idx, !sp->srcu_idx); + WRITE_ONCE(sp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ + swait_event(sp->srcu_wq, !READ_ONCE(sp->srcu_lock_nesting[idx])); + WRITE_ONCE(sp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + rcu_seq_end(&sp->srcu_gp_seq); + + /* Update callback list based on GP, and invoke ready callbacks. */ + rcu_segcblist_advance(&sp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + if (rcu_segcblist_ready_cbs(&sp->srcu_cblist)) { + rcu_cblist_init(&ready_cbs); + local_irq_disable(); + rcu_segcblist_extract_done_cbs(&sp->srcu_cblist, &ready_cbs); + local_irq_enable(); + rhp = rcu_cblist_dequeue(&ready_cbs); + for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + } + local_irq_disable(); + rcu_segcblist_insert_count(&sp->srcu_cblist, &ready_cbs); + local_irq_enable(); + } + WRITE_ONCE(sp->srcu_gp_running, false); + + /* + * If more callbacks, reschedule ourselves. This can race with + * a call_srcu() at interrupt level, but the ->srcu_gp_running + * checks will straighten that out. + */ + if (!rcu_segcblist_empty(&sp->srcu_cblist)) + schedule_work(&sp->srcu_work); +} +EXPORT_SYMBOL_GPL(srcu_drive_gp); + +/* + * Enqueue an SRCU callback on the specified srcu_struct structure, + * initiating grace-period processing if it is not already running. 
+ */ +void call_srcu(struct srcu_struct *sp, struct rcu_head *head, + rcu_callback_t func) +{ + unsigned long flags; + + head->func = func; + local_irq_save(flags); + rcu_segcblist_enqueue(&sp->srcu_cblist, head, false); + local_irq_restore(flags); + if (!READ_ONCE(sp->srcu_gp_running)) + schedule_work(&sp->srcu_work); +} +EXPORT_SYMBOL_GPL(call_srcu); + +/* + * synchronize_srcu - wait for prior SRCU read-side critical-section completion + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + struct rcu_synchronize rs; + + init_rcu_head_on_stack(&rs.head); + init_completion(&rs.completion); + call_srcu(sp, &rs.head, wakeme_after_rcu); + wait_for_completion(&rs.completion); + destroy_rcu_head_on_stack(&rs.head); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c new file mode 100644 index 0000000000000000000000000000000000000000..3ae8474557df3975398722ecd6c8793e271af4ad --- /dev/null +++ b/kernel/rcu/srcutree.c @@ -0,0 +1,1155 @@ +/* + * Sleepable Read-Copy Update mechanism for mutual exclusion. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, you can access it online at + * http://www.gnu.org/licenses/gpl-2.0.html. + * + * Copyright (C) IBM Corporation, 2006 + * Copyright (C) Fujitsu, 2012 + * + * Author: Paul McKenney + * Lai Jiangshan + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU/ *.txt + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rcu.h" +#include "rcu_segcblist.h" + +ulong exp_holdoff = 25 * 1000; /* Holdoff (ns) for auto-expediting. */ +module_param(exp_holdoff, ulong, 0444); + +static void srcu_invoke_callbacks(struct work_struct *work); +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); + +/* + * Initialize SRCU combining tree. Note that statically allocated + * srcu_struct structures might already have srcu_read_lock() and + * srcu_read_unlock() running against them. So if the is_static parameter + * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[]. + */ +static void init_srcu_struct_nodes(struct srcu_struct *sp, bool is_static) +{ + int cpu; + int i; + int level = 0; + int levelspread[RCU_NUM_LVLS]; + struct srcu_data *sdp; + struct srcu_node *snp; + struct srcu_node *snp_first; + + /* Work out the overall tree geometry. */ + sp->level[0] = &sp->node[0]; + for (i = 1; i < rcu_num_lvls; i++) + sp->level[i] = sp->level[i - 1] + num_rcu_lvl[i - 1]; + rcu_init_levelspread(levelspread, num_rcu_lvl); + + /* Each pass through this loop initializes one srcu_node structure. 
*/ + rcu_for_each_node_breadth_first(sp, snp) { + spin_lock_init(&snp->lock); + WARN_ON_ONCE(ARRAY_SIZE(snp->srcu_have_cbs) != + ARRAY_SIZE(snp->srcu_data_have_cbs)); + for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) { + snp->srcu_have_cbs[i] = 0; + snp->srcu_data_have_cbs[i] = 0; + } + snp->srcu_gp_seq_needed_exp = 0; + snp->grplo = -1; + snp->grphi = -1; + if (snp == &sp->node[0]) { + /* Root node, special case. */ + snp->srcu_parent = NULL; + continue; + } + + /* Non-root node. */ + if (snp == sp->level[level + 1]) + level++; + snp->srcu_parent = sp->level[level - 1] + + (snp - sp->level[level]) / + levelspread[level - 1]; + } + + /* + * Initialize the per-CPU srcu_data array, which feeds into the + * leaves of the srcu_node tree. + */ + WARN_ON_ONCE(ARRAY_SIZE(sdp->srcu_lock_count) != + ARRAY_SIZE(sdp->srcu_unlock_count)); + level = rcu_num_lvls - 1; + snp_first = sp->level[level]; + for_each_possible_cpu(cpu) { + sdp = per_cpu_ptr(sp->sda, cpu); + spin_lock_init(&sdp->lock); + rcu_segcblist_init(&sdp->srcu_cblist); + sdp->srcu_cblist_invoking = false; + sdp->srcu_gp_seq_needed = sp->srcu_gp_seq; + sdp->srcu_gp_seq_needed_exp = sp->srcu_gp_seq; + sdp->mynode = &snp_first[cpu / levelspread[level]]; + for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) { + if (snp->grplo < 0) + snp->grplo = cpu; + snp->grphi = cpu; + } + sdp->cpu = cpu; + INIT_DELAYED_WORK(&sdp->work, srcu_invoke_callbacks); + sdp->sp = sp; + sdp->grpmask = 1 << (cpu - sdp->mynode->grplo); + if (is_static) + continue; + + /* Dynamically allocated, better be no srcu_read_locks()! */ + for (i = 0; i < ARRAY_SIZE(sdp->srcu_lock_count); i++) { + sdp->srcu_lock_count[i] = 0; + sdp->srcu_unlock_count[i] = 0; + } + } +} + +/* + * Initialize non-compile-time initialized fields, including the + * associated srcu_node and srcu_data structures. The is_static + * parameter is passed through to init_srcu_struct_nodes(), and + * also tells us that ->sda has already been wired up to srcu_data. + */ +static int init_srcu_struct_fields(struct srcu_struct *sp, bool is_static) +{ + mutex_init(&sp->srcu_cb_mutex); + mutex_init(&sp->srcu_gp_mutex); + sp->srcu_idx = 0; + sp->srcu_gp_seq = 0; + sp->srcu_barrier_seq = 0; + mutex_init(&sp->srcu_barrier_mutex); + atomic_set(&sp->srcu_barrier_cpu_cnt, 0); + INIT_DELAYED_WORK(&sp->work, process_srcu); + if (!is_static) + sp->sda = alloc_percpu(struct srcu_data); + init_srcu_struct_nodes(sp, is_static); + sp->srcu_gp_seq_needed_exp = 0; + sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + smp_store_release(&sp->srcu_gp_seq_needed, 0); /* Init done. */ + return sp->sda ? 0 : -ENOMEM; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); + spin_lock_init(&sp->gp_lock); + return init_srcu_struct_fields(sp, false); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/** + * init_srcu_struct - initialize a sleep-RCU structure + * @sp: structure to initialize. + * + * Must invoke this on a given srcu_struct before passing that srcu_struct + * to any other function. Each srcu_struct represents a separate domain + * of SRCU protection. 
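Since each srcu_struct is its own domain, a grace period on one domain never waits on readers of another. A sketch with two hypothetical domains:

static struct srcu_struct map_srcu;	/* Illustrative domain names. */
static struct srcu_struct cfg_srcu;

static void demo_domains(void)
{
	int idx;

	init_srcu_struct(&map_srcu);
	init_srcu_struct(&cfg_srcu);

	idx = srcu_read_lock(&cfg_srcu);
	/*
	 * Waits only for pre-existing map_srcu readers, so the cfg_srcu
	 * read-side critical section we are inside does not block it.
	 */
	synchronize_srcu(&map_srcu);
	srcu_read_unlock(&cfg_srcu, idx);
}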
+ */ +int init_srcu_struct(struct srcu_struct *sp) +{ + spin_lock_init(&sp->gp_lock); + return init_srcu_struct_fields(sp, false); +} +EXPORT_SYMBOL_GPL(init_srcu_struct); + +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + +/* + * First-use initialization of statically allocated srcu_struct + * structure. Wiring up the combining tree is more than can be + * done with compile-time initialization, so this check is added + * to each update-side SRCU primitive. Use ->gp_lock, which -is- + * compile-time initialized, to resolve races involving multiple + * CPUs trying to garner first-use privileges. + */ +static void check_init_srcu_struct(struct srcu_struct *sp) +{ + unsigned long flags; + + WARN_ON_ONCE(rcu_scheduler_active == RCU_SCHEDULER_INIT); + /* The smp_load_acquire() pairs with the smp_store_release(). */ + if (!rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq_needed))) /*^^^*/ + return; /* Already initialized. */ + spin_lock_irqsave(&sp->gp_lock, flags); + if (!rcu_seq_state(sp->srcu_gp_seq_needed)) { + spin_unlock_irqrestore(&sp->gp_lock, flags); + return; + } + init_srcu_struct_fields(sp, true); + spin_unlock_irqrestore(&sp->gp_lock, flags); +} + +/* + * Returns approximate total of the readers' ->srcu_lock_count[] values + * for the rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_lock_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + + sum += READ_ONCE(cpuc->srcu_lock_count[idx]); + } + return sum; +} + +/* + * Returns approximate total of the readers' ->srcu_unlock_count[] values + * for the rank of per-CPU counters specified by idx. + */ +static unsigned long srcu_readers_unlock_idx(struct srcu_struct *sp, int idx) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + + sum += READ_ONCE(cpuc->srcu_unlock_count[idx]); + } + return sum; +} + +/* + * Return true if the number of pre-existing readers is determined to + * be zero. + */ +static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) +{ + unsigned long unlocks; + + unlocks = srcu_readers_unlock_idx(sp, idx); + + /* + * Make sure that a lock is always counted if the corresponding + * unlock is counted. Needs to be a smp_mb() as the read side may + * contain a read from a variable that is written to before the + * synchronize_srcu() in the write side. In this case smp_mb()s + * A and B act like the store buffering pattern. + * + * This smp_mb() also pairs with smp_mb() C to prevent accesses + * after the synchronize_srcu() from being executed before the + * grace period ends. + */ + smp_mb(); /* A */ + + /* + * If the locks are the same as the unlocks, then there must have + * been no readers on this index at some time in between. This does + * not mean that there are no more readers, as one could have read + * the current index but not have incremented the lock counter yet. + * + * Possible bug: There is no guarantee that there haven't been + * ULONG_MAX increments of ->srcu_lock_count[] since the unlocks were + * counted, meaning that this could return true even if there are + * still active readers. 
Since there are no memory barriers around + * srcu_flip(), the CPU is not required to increment ->srcu_idx + * before running srcu_readers_unlock_idx(), which means that there + * could be an arbitrarily large number of critical sections that + * execute after srcu_readers_unlock_idx() but use the old value + * of ->srcu_idx. + */ + return srcu_readers_lock_idx(sp, idx) == unlocks; +} + +/** + * srcu_readers_active - returns true if there are readers, and false + * otherwise + * @sp: which srcu_struct to count active readers (holding srcu_read_lock). + * + * Note that this is not an atomic primitive, and can therefore suffer + * severe errors when invoked on an active srcu_struct. That said, it + * can be useful as an error check at cleanup time. + */ +static bool srcu_readers_active(struct srcu_struct *sp) +{ + int cpu; + unsigned long sum = 0; + + for_each_possible_cpu(cpu) { + struct srcu_data *cpuc = per_cpu_ptr(sp->sda, cpu); + + sum += READ_ONCE(cpuc->srcu_lock_count[0]); + sum += READ_ONCE(cpuc->srcu_lock_count[1]); + sum -= READ_ONCE(cpuc->srcu_unlock_count[0]); + sum -= READ_ONCE(cpuc->srcu_unlock_count[1]); + } + return sum; +} + +#define SRCU_INTERVAL 1 + +/* + * Return grace-period delay, zero if there are expedited grace + * periods pending, SRCU_INTERVAL otherwise. + */ +static unsigned long srcu_get_delay(struct srcu_struct *sp) +{ + if (ULONG_CMP_LT(READ_ONCE(sp->srcu_gp_seq), + READ_ONCE(sp->srcu_gp_seq_needed_exp))) + return 0; + return SRCU_INTERVAL; +} + +/** + * cleanup_srcu_struct - deconstruct a sleep-RCU structure + * @sp: structure to clean up. + * + * Must invoke this after you are finished using a given srcu_struct that + * was initialized via init_srcu_struct(), else you leak memory. + */ +void cleanup_srcu_struct(struct srcu_struct *sp) +{ + int cpu; + + if (WARN_ON(!srcu_get_delay(sp))) + return; /* Leakage unless caller handles error. */ + if (WARN_ON(srcu_readers_active(sp))) + return; /* Leakage unless caller handles error. */ + flush_delayed_work(&sp->work); + for_each_possible_cpu(cpu) + flush_delayed_work(&per_cpu_ptr(sp->sda, cpu)->work); + if (WARN_ON(rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) != SRCU_STATE_IDLE) || + WARN_ON(srcu_readers_active(sp))) { + pr_info("cleanup_srcu_struct: Active srcu_struct %p state: %d\n", sp, rcu_seq_state(READ_ONCE(sp->srcu_gp_seq))); + return; /* Caller forgot to stop doing call_srcu()? */ + } + free_percpu(sp->sda); + sp->sda = NULL; +} +EXPORT_SYMBOL_GPL(cleanup_srcu_struct); + +/* + * Counts the new reader in the appropriate per-CPU element of the + * srcu_struct. Must be called from process context. + * Returns an index that must be passed to the matching srcu_read_unlock(). + */ +int __srcu_read_lock(struct srcu_struct *sp) +{ + int idx; + + idx = READ_ONCE(sp->srcu_idx) & 0x1; + __this_cpu_inc(sp->sda->srcu_lock_count[idx]); + smp_mb(); /* B */ /* Avoid leaking the critical section. */ + return idx; +} +EXPORT_SYMBOL_GPL(__srcu_read_lock); + +/* + * Removes the count for the old reader from the appropriate per-CPU + * element of the srcu_struct. Note that this may well be a different + * CPU than that which was incremented by the corresponding srcu_read_lock(). + * Must be called from process context. + */ +void __srcu_read_unlock(struct srcu_struct *sp, int idx) +{ + smp_mb(); /* C */ /* Avoid leaking the critical section.
*/ + this_cpu_inc(sp->sda->srcu_unlock_count[idx]); +} +EXPORT_SYMBOL_GPL(__srcu_read_unlock); + +/* + * We use an adaptive strategy for synchronize_srcu() and especially for + * synchronize_srcu_expedited(). We spin for a fixed time period + * (defined below) to allow SRCU readers to exit their read-side critical + * sections. If there are still some readers after a few microseconds, + * we repeatedly block for 1-millisecond time periods. + */ +#define SRCU_RETRY_CHECK_DELAY 5 + +/* + * Start an SRCU grace period. + */ +static void srcu_gp_start(struct srcu_struct *sp) +{ + struct srcu_data *sdp = this_cpu_ptr(sp->sda); + int state; + + RCU_LOCKDEP_WARN(!lockdep_is_held(&sp->gp_lock), + "Invoked srcu_gp_start() without ->gp_lock!"); + WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)); + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */ + rcu_seq_start(&sp->srcu_gp_seq); + state = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)); + WARN_ON_ONCE(state != SRCU_STATE_SCAN1); +} + +/* + * Track online CPUs to guide callback workqueue placement. + */ +DEFINE_PER_CPU(bool, srcu_online); + +void srcu_online_cpu(unsigned int cpu) +{ + WRITE_ONCE(per_cpu(srcu_online, cpu), true); +} + +void srcu_offline_cpu(unsigned int cpu) +{ + WRITE_ONCE(per_cpu(srcu_online, cpu), false); +} + +/* + * Place the workqueue handler on the specified CPU if online, otherwise + * just run it whereever. This is useful for placing workqueue handlers + * that are to invoke the specified CPU's callbacks. + */ +static bool srcu_queue_delayed_work_on(int cpu, struct workqueue_struct *wq, + struct delayed_work *dwork, + unsigned long delay) +{ + bool ret; + + preempt_disable(); + if (READ_ONCE(per_cpu(srcu_online, cpu))) + ret = queue_delayed_work_on(cpu, wq, dwork, delay); + else + ret = queue_delayed_work(wq, dwork, delay); + preempt_enable(); + return ret; +} + +/* + * Schedule callback invocation for the specified srcu_data structure, + * if possible, on the corresponding CPU. + */ +static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay) +{ + srcu_queue_delayed_work_on(sdp->cpu, system_power_efficient_wq, + &sdp->work, delay); +} + +/* + * Schedule callback invocation for all srcu_data structures associated + * with the specified srcu_node structure that have callbacks for the + * just-completed grace period, the one corresponding to idx. If possible, + * schedule this invocation on the corresponding CPUs. + */ +static void srcu_schedule_cbs_snp(struct srcu_struct *sp, struct srcu_node *snp, + unsigned long mask, unsigned long delay) +{ + int cpu; + + for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) { + if (!(mask & (1 << (cpu - snp->grplo)))) + continue; + srcu_schedule_cbs_sdp(per_cpu_ptr(sp->sda, cpu), delay); + } +} + +/* + * Note the end of an SRCU grace period. Initiates callback invocation + * and starts a new grace period if needed. + * + * The ->srcu_cb_mutex acquisition does not protect any data, but + * instead prevents more than one grace period from starting while we + * are initiating callback invocation. This allows the ->srcu_have_cbs[] + * array to have a finite number of elements. 
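+ *
+ * As an informal editor's sketch (not authoritative documentation of
+ * additional semantics), the two-element per-node arrays are indexed
+ * by grace-period sequence number just as the code below does:
+ *
+ *	idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
+ *	idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs);
+ *
+ * so the slot for the just-ended grace period can be drained while the
+ * other slot accumulates requests for the next grace period.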
+ */ +static void srcu_gp_end(struct srcu_struct *sp) +{ + unsigned long cbdelay; + bool cbs; + unsigned long gpseq; + int idx; + int idxnext; + unsigned long mask; + struct srcu_node *snp; + + /* Prevent more than one additional grace period. */ + mutex_lock(&sp->srcu_cb_mutex); + + /* End the current grace period. */ + spin_lock_irq(&sp->gp_lock); + idx = rcu_seq_state(sp->srcu_gp_seq); + WARN_ON_ONCE(idx != SRCU_STATE_SCAN2); + cbdelay = srcu_get_delay(sp); + sp->srcu_last_gp_end = ktime_get_mono_fast_ns(); + rcu_seq_end(&sp->srcu_gp_seq); + gpseq = rcu_seq_current(&sp->srcu_gp_seq); + if (ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, gpseq)) + sp->srcu_gp_seq_needed_exp = gpseq; + spin_unlock_irq(&sp->gp_lock); + mutex_unlock(&sp->srcu_gp_mutex); + /* A new grace period can start at this point. But only one. */ + + /* Initiate callback invocation as needed. */ + idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs); + idxnext = (idx + 1) % ARRAY_SIZE(snp->srcu_have_cbs); + rcu_for_each_node_breadth_first(sp, snp) { + spin_lock_irq(&snp->lock); + cbs = false; + if (snp >= sp->level[rcu_num_lvls - 1]) + cbs = snp->srcu_have_cbs[idx] == gpseq; + snp->srcu_have_cbs[idx] = gpseq; + rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1); + if (ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, gpseq)) + snp->srcu_gp_seq_needed_exp = gpseq; + mask = snp->srcu_data_have_cbs[idx]; + snp->srcu_data_have_cbs[idx] = 0; + spin_unlock_irq(&snp->lock); + if (cbs) { + smp_mb(); /* GP end before CB invocation. */ + srcu_schedule_cbs_snp(sp, snp, mask, cbdelay); + } + } + + /* Callback initiation done, allow grace periods after next. */ + mutex_unlock(&sp->srcu_cb_mutex); + + /* Start a new grace period if needed. */ + spin_lock_irq(&sp->gp_lock); + gpseq = rcu_seq_current(&sp->srcu_gp_seq); + if (!rcu_seq_state(gpseq) && + ULONG_CMP_LT(gpseq, sp->srcu_gp_seq_needed)) { + srcu_gp_start(sp); + spin_unlock_irq(&sp->gp_lock); + /* Throttle expedited grace periods: Should be rare! */ + srcu_reschedule(sp, rcu_seq_ctr(gpseq) & 0x3ff + ? 0 : SRCU_INTERVAL); + } else { + spin_unlock_irq(&sp->gp_lock); + } +} + +/* + * Funnel-locking scheme to scalably mediate many concurrent expedited + * grace-period requests. This function is invoked for the first known + * expedited request for a grace period that has already been requested, + * but without expediting. To start a completely new grace period, + * whether expedited or not, use srcu_funnel_gp_start() instead. + */ +static void srcu_funnel_exp_start(struct srcu_struct *sp, struct srcu_node *snp, + unsigned long s) +{ + unsigned long flags; + + for (; snp != NULL; snp = snp->srcu_parent) { + if (rcu_seq_done(&sp->srcu_gp_seq, s) || + ULONG_CMP_GE(READ_ONCE(snp->srcu_gp_seq_needed_exp), s)) + return; + spin_lock_irqsave(&snp->lock, flags); + if (ULONG_CMP_GE(snp->srcu_gp_seq_needed_exp, s)) { + spin_unlock_irqrestore(&snp->lock, flags); + return; + } + WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s); + spin_unlock_irqrestore(&snp->lock, flags); + } + spin_lock_irqsave(&sp->gp_lock, flags); + if (!ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s)) + sp->srcu_gp_seq_needed_exp = s; + spin_unlock_irqrestore(&sp->gp_lock, flags); +} + +/* + * Funnel-locking scheme to scalably mediate many concurrent grace-period + * requests. The winner has to do the work of actually starting grace + * period s. Losers must either ensure that their desired grace-period + * number is recorded on at least their leaf srcu_node structure, or they + * must take steps to invoke their own callbacks. 
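+ *
+ * In outline (a sketch of the loop below, not a separate algorithm):
+ *
+ *	for (; snp != NULL; snp = snp->srcu_parent) {
+ *		if (snp->srcu_gp_seq_needed_exp already covers s)
+ *			return;	/* Some other CPU beat us to it. */
+ *		record s in snp->srcu_gp_seq_needed_exp and climb;
+ *	}
+ *
+ * so that contention is resolved at the lowest possible level of the
+ * srcu_node tree.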
+ */
+static void srcu_funnel_gp_start(struct srcu_struct *sp, struct srcu_data *sdp,
+				 unsigned long s, bool do_norm)
+{
+	unsigned long flags;
+	int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
+	struct srcu_node *snp = sdp->mynode;
+	unsigned long snp_seq;
+
+	/* Each pass through the loop does one level of the srcu_node tree. */
+	for (; snp != NULL; snp = snp->srcu_parent) {
+		if (rcu_seq_done(&sp->srcu_gp_seq, s) && snp != sdp->mynode)
+			return; /* GP already done and CBs recorded. */
+		spin_lock_irqsave(&snp->lock, flags);
+		if (ULONG_CMP_GE(snp->srcu_have_cbs[idx], s)) {
+			snp_seq = snp->srcu_have_cbs[idx];
+			if (snp == sdp->mynode && snp_seq == s)
+				snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+			spin_unlock_irqrestore(&snp->lock, flags);
+			if (snp == sdp->mynode && snp_seq != s) {
+				smp_mb(); /* CBs after GP! */
+				srcu_schedule_cbs_sdp(sdp, do_norm
+							   ? SRCU_INTERVAL
+							   : 0);
+				return;
+			}
+			if (!do_norm)
+				srcu_funnel_exp_start(sp, snp, s);
+			return;
+		}
+		snp->srcu_have_cbs[idx] = s;
+		if (snp == sdp->mynode)
+			snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
+		if (!do_norm && ULONG_CMP_LT(snp->srcu_gp_seq_needed_exp, s))
+			snp->srcu_gp_seq_needed_exp = s;
+		spin_unlock_irqrestore(&snp->lock, flags);
+	}
+
+	/* Top of tree, must ensure the grace period will be started. */
+	spin_lock_irqsave(&sp->gp_lock, flags);
+	if (ULONG_CMP_LT(sp->srcu_gp_seq_needed, s)) {
+		/*
+		 * Record need for grace period s. Pair with load
+		 * acquire setting up for initialization.
+		 */
+		smp_store_release(&sp->srcu_gp_seq_needed, s); /*^^^*/
+	}
+	if (!do_norm && ULONG_CMP_LT(sp->srcu_gp_seq_needed_exp, s))
+		sp->srcu_gp_seq_needed_exp = s;
+
+	/* If grace period not already done and none in progress, start it. */
+	if (!rcu_seq_done(&sp->srcu_gp_seq, s) &&
+	    rcu_seq_state(sp->srcu_gp_seq) == SRCU_STATE_IDLE) {
+		WARN_ON_ONCE(ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed));
+		srcu_gp_start(sp);
+		queue_delayed_work(system_power_efficient_wq, &sp->work,
+				   srcu_get_delay(sp));
+	}
+	spin_unlock_irqrestore(&sp->gp_lock, flags);
+}
+
+/*
+ * Wait until all readers counted by array index idx complete, but
+ * loop an additional time if there is an expedited grace period pending.
+ * The caller must ensure that ->srcu_idx is not changed while checking.
+ */
+static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount)
+{
+	for (;;) {
+		if (srcu_readers_active_idx_check(sp, idx))
+			return true;
+		if (--trycount + !srcu_get_delay(sp) <= 0)
+			return false;
+		udelay(SRCU_RETRY_CHECK_DELAY);
+	}
+}
+
+/*
+ * Increment the ->srcu_idx counter so that future SRCU readers will
+ * use the other rank of the ->srcu_(un)lock_count[] arrays. This allows
+ * us to wait for pre-existing readers in a starvation-free manner.
+ */
+static void srcu_flip(struct srcu_struct *sp)
+{
+	WRITE_ONCE(sp->srcu_idx, sp->srcu_idx + 1);
+
+	/*
+	 * Ensure that if the updater misses an __srcu_read_unlock()
+	 * increment, that task's next __srcu_read_lock() will see the
+	 * above counter update. Note that both this memory barrier
+	 * and the one in srcu_readers_active_idx_check() provide the
+	 * guarantee for __srcu_read_lock().
+	 */
+	smp_mb(); /* D */  /* Pairs with C. */
+}
+
+/*
+ * If SRCU is likely idle, return true, otherwise return false.
+ *
+ * Note that it is OK for several concurrent from-idle requests for a new
+ * grace period to specify expediting, because they will all end
+ * up requesting the same grace period anyhow. So no loss.
+ *
+ * Note also that if any CPU (including the current one) is still invoking
+ * callbacks, this function will nevertheless say "idle". This is not
+ * ideal, but the overhead of checking all CPUs' callback lists is even
+ * less ideal, especially on large systems. Furthermore, the wakeup
+ * can happen before the callback is fully removed, so we have no choice
+ * but to accept this type of error.
+ *
+ * This function is also subject to counter-wrap errors, but let's face
+ * it, if this function was preempted for enough time for the counters
+ * to wrap, it really doesn't matter whether or not we expedite the grace
+ * period. The extra overhead of a needlessly expedited grace period is
+ * negligible when amortized over that time period, and the extra latency
+ * of a needlessly non-expedited grace period is similarly negligible.
+ */
+static bool srcu_might_be_idle(struct srcu_struct *sp)
+{
+	unsigned long curseq;
+	unsigned long flags;
+	struct srcu_data *sdp;
+	unsigned long t;
+
+	/* If the local srcu_data structure has callbacks, not idle. */
+	local_irq_save(flags);
+	sdp = this_cpu_ptr(sp->sda);
+	if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
+		local_irq_restore(flags);
+		return false; /* Callbacks already present, so not idle. */
+	}
+	local_irq_restore(flags);
+
+	/*
+	 * No local callbacks, so probabilistically probe global state.
+	 * Exact information would require acquiring locks, which would
+	 * kill scalability, hence the probabilistic nature of the probe.
+	 */
+
+	/* First, see if enough time has passed since the last GP. */
+	t = ktime_get_mono_fast_ns();
+	if (exp_holdoff == 0 ||
+	    time_in_range_open(t, sp->srcu_last_gp_end,
+			       sp->srcu_last_gp_end + exp_holdoff))
+		return false; /* Too soon after last GP. */
+
+	/* Next, check for probable idleness. */
+	curseq = rcu_seq_current(&sp->srcu_gp_seq);
+	smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
+	if (ULONG_CMP_LT(curseq, READ_ONCE(sp->srcu_gp_seq_needed)))
+		return false; /* Grace period in progress, so not idle. */
+	smp_mb(); /* Order ->srcu_gp_seq with prior access. */
+	if (curseq != rcu_seq_current(&sp->srcu_gp_seq))
+		return false; /* GP # changed, so not idle. */
+	return true; /* With reasonable probability, idle! */
+}
+
+/*
+ * Enqueue an SRCU callback on the srcu_data structure associated with
+ * the current CPU and the specified srcu_struct structure, initiating
+ * grace-period processing if it is not already running.
+ *
+ * Note that all CPUs must agree that the grace period extended beyond
+ * all pre-existing SRCU read-side critical sections. On systems with
+ * more than one CPU, this means that when "func()" is invoked, each CPU
+ * is guaranteed to have executed a full memory barrier since the end of
+ * its last corresponding SRCU read-side critical section whose beginning
+ * preceded the call to call_srcu(). It also means that each CPU executing
+ * an SRCU read-side critical section that continues beyond the start of
+ * "func()" must have executed a memory barrier after the call_srcu()
+ * but before the beginning of that SRCU read-side critical section.
+ * Note that these guarantees include CPUs that are offline, idle, or
+ * executing in user mode, as well as CPUs that are executing in the kernel.
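+ *
+ * As a usage sketch only (my_srcu, struct foo, and foo_reclaim are
+ * hypothetical names, not part of this file):
+ *
+ *	static void foo_reclaim(struct rcu_head *rhp)
+ *	{
+ *		kfree(container_of(rhp, struct foo, rh));
+ *	}
+ *
+ *	...
+ *	old_fp = rcu_dereference_protected(gp, lockdep_is_held(&foo_lock));
+ *	rcu_assign_pointer(gp, new_fp);
+ *	call_srcu(&my_srcu, &old_fp->rh, foo_reclaim);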
+ *
+ * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the
+ * resulting SRCU callback function "func()", then both CPU A and CPU
+ * B are guaranteed to execute a full memory barrier during the time
+ * interval between the call to call_srcu() and the invocation of "func()".
+ * This guarantee applies even if CPU A and CPU B are the same CPU (but
+ * again only if the system has more than one CPU).
+ *
+ * Of course, these guarantees apply only for invocations of call_srcu(),
+ * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
+ * srcu_struct structure.
+ */
+void __call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
+		 rcu_callback_t func, bool do_norm)
+{
+	unsigned long flags;
+	bool needexp = false;
+	bool needgp = false;
+	unsigned long s;
+	struct srcu_data *sdp;
+
+	check_init_srcu_struct(sp);
+	rhp->func = func;
+	local_irq_save(flags);
+	sdp = this_cpu_ptr(sp->sda);
+	spin_lock(&sdp->lock);
+	rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp, false);
+	rcu_segcblist_advance(&sdp->srcu_cblist,
+			      rcu_seq_current(&sp->srcu_gp_seq));
+	s = rcu_seq_snap(&sp->srcu_gp_seq);
+	(void)rcu_segcblist_accelerate(&sdp->srcu_cblist, s);
+	if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
+		sdp->srcu_gp_seq_needed = s;
+		needgp = true;
+	}
+	if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
+		sdp->srcu_gp_seq_needed_exp = s;
+		needexp = true;
+	}
+	spin_unlock_irqrestore(&sdp->lock, flags);
+	if (needgp)
+		srcu_funnel_gp_start(sp, sdp, s, do_norm);
+	else if (needexp)
+		srcu_funnel_exp_start(sp, sdp->mynode, s);
+}
+
+void call_srcu(struct srcu_struct *sp, struct rcu_head *rhp,
+	       rcu_callback_t func)
+{
+	__call_srcu(sp, rhp, func, true);
+}
+EXPORT_SYMBOL_GPL(call_srcu);
+
+/*
+ * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
+ */
+static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
+{
+	struct rcu_synchronize rcu;
+
+	RCU_LOCKDEP_WARN(lock_is_held(&sp->dep_map) ||
+			 lock_is_held(&rcu_bh_lock_map) ||
+			 lock_is_held(&rcu_lock_map) ||
+			 lock_is_held(&rcu_sched_lock_map),
+			 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");
+
+	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
+		return;
+	might_sleep();
+	check_init_srcu_struct(sp);
+	init_completion(&rcu.completion);
+	init_rcu_head_on_stack(&rcu.head);
+	__call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
+	wait_for_completion(&rcu.completion);
+	destroy_rcu_head_on_stack(&rcu.head);
+}
+
+/**
+ * synchronize_srcu_expedited - Brute-force SRCU grace period
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Wait for an SRCU grace period to elapse, but be more aggressive about
+ * spinning rather than blocking when waiting.
+ *
+ * Note that synchronize_srcu_expedited() has the same deadlock and
+ * memory-ordering properties as does synchronize_srcu().
+ */
+void synchronize_srcu_expedited(struct srcu_struct *sp)
+{
+	__synchronize_srcu(sp, rcu_gp_is_normal());
+}
+EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
+
+/**
+ * synchronize_srcu - wait for prior SRCU read-side critical-section completion
+ * @sp: srcu_struct with which to synchronize.
+ *
+ * Wait for the counts of both index ranks to drain to zero. To avoid
+ * possible starvation of synchronize_srcu(), it first waits for the count
+ * of index ((->srcu_idx & 1) ^ 1) to drain to zero, then flips ->srcu_idx
+ * and waits for the count of the other index to drain.
+ *
+ * Can block; must be called from process context.
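+ *
+ * A minimal updater-side sketch (my_lock, my_srcu, and gp are
+ * hypothetical names, for illustration only):
+ *
+ *	spin_lock(&my_lock);
+ *	old_p = rcu_dereference_protected(gp, lockdep_is_held(&my_lock));
+ *	rcu_assign_pointer(gp, new_p);
+ *	spin_unlock(&my_lock);
+ *	synchronize_srcu(&my_srcu);	/* Wait for pre-existing readers. */
+ *	kfree(old_p);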
+ * + * Note that it is illegal to call synchronize_srcu() from the corresponding + * SRCU read-side critical section; doing so will result in deadlock. + * However, it is perfectly legal to call synchronize_srcu() on one + * srcu_struct from some other srcu_struct's read-side critical section, + * as long as the resulting graph of srcu_structs is acyclic. + * + * There are memory-ordering constraints implied by synchronize_srcu(). + * On systems with more than one CPU, when synchronize_srcu() returns, + * each CPU is guaranteed to have executed a full memory barrier since + * the end of its last corresponding SRCU-sched read-side critical section + * whose beginning preceded the call to synchronize_srcu(). In addition, + * each CPU having an SRCU read-side critical section that extends beyond + * the return from synchronize_srcu() is guaranteed to have executed a + * full memory barrier after the beginning of synchronize_srcu() and before + * the beginning of that SRCU read-side critical section. Note that these + * guarantees include CPUs that are offline, idle, or executing in user mode, + * as well as CPUs that are executing in the kernel. + * + * Furthermore, if CPU A invoked synchronize_srcu(), which returned + * to its caller on CPU B, then both CPU A and CPU B are guaranteed + * to have executed a full memory barrier during the execution of + * synchronize_srcu(). This guarantee applies even if CPU A and CPU B + * are the same CPU, but again only if the system has more than one CPU. + * + * Of course, these memory-ordering guarantees apply only when + * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are + * passed the same srcu_struct structure. + * + * If SRCU is likely idle, expedite the first request. This semantic + * was provided by Classic SRCU, and is relied upon by its users, so TREE + * SRCU must also provide it. Note that detecting idleness is heuristic + * and subject to both false positives and negatives. + */ +void synchronize_srcu(struct srcu_struct *sp) +{ + if (srcu_might_be_idle(sp) || rcu_gp_is_expedited()) + synchronize_srcu_expedited(sp); + else + __synchronize_srcu(sp, true); +} +EXPORT_SYMBOL_GPL(synchronize_srcu); + +/* + * Callback function for srcu_barrier() use. + */ +static void srcu_barrier_cb(struct rcu_head *rhp) +{ + struct srcu_data *sdp; + struct srcu_struct *sp; + + sdp = container_of(rhp, struct srcu_data, srcu_barrier_head); + sp = sdp->sp; + if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt)) + complete(&sp->srcu_barrier_completion); +} + +/** + * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. + * @sp: srcu_struct on which to wait for in-flight callbacks. + */ +void srcu_barrier(struct srcu_struct *sp) +{ + int cpu; + struct srcu_data *sdp; + unsigned long s = rcu_seq_snap(&sp->srcu_barrier_seq); + + check_init_srcu_struct(sp); + mutex_lock(&sp->srcu_barrier_mutex); + if (rcu_seq_done(&sp->srcu_barrier_seq, s)) { + smp_mb(); /* Force ordering following return. */ + mutex_unlock(&sp->srcu_barrier_mutex); + return; /* Someone else did our work for us. */ + } + rcu_seq_start(&sp->srcu_barrier_seq); + init_completion(&sp->srcu_barrier_completion); + + /* Initial count prevents reaching zero until all CBs are posted. */ + atomic_set(&sp->srcu_barrier_cpu_cnt, 1); + + /* + * Each pass through this loop enqueues a callback, but only + * on CPUs already having callbacks enqueued. 
Note that if
+ * a CPU already has callbacks enqueued, it must have already
+ * registered the need for a future grace period, so all we
+ * need do is enqueue a callback that will use the same
+ * grace period as the last callback already in the queue.
+ */
+	for_each_possible_cpu(cpu) {
+		sdp = per_cpu_ptr(sp->sda, cpu);
+		spin_lock_irq(&sdp->lock);
+		atomic_inc(&sp->srcu_barrier_cpu_cnt);
+		sdp->srcu_barrier_head.func = srcu_barrier_cb;
+		if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
+					   &sdp->srcu_barrier_head, 0))
+			atomic_dec(&sp->srcu_barrier_cpu_cnt);
+		spin_unlock_irq(&sdp->lock);
+	}
+
+	/* Remove the initial count, at which point reaching zero can happen. */
+	if (atomic_dec_and_test(&sp->srcu_barrier_cpu_cnt))
+		complete(&sp->srcu_barrier_completion);
+	wait_for_completion(&sp->srcu_barrier_completion);
+
+	rcu_seq_end(&sp->srcu_barrier_seq);
+	mutex_unlock(&sp->srcu_barrier_mutex);
+}
+EXPORT_SYMBOL_GPL(srcu_barrier);
+
+/**
+ * srcu_batches_completed - return batches completed.
+ * @sp: srcu_struct on which to report batch completion.
+ *
+ * Report the number of batches, correlated with, but not necessarily
+ * precisely the same as, the number of grace periods that have elapsed.
+ */
+unsigned long srcu_batches_completed(struct srcu_struct *sp)
+{
+	return sp->srcu_idx;
+}
+EXPORT_SYMBOL_GPL(srcu_batches_completed);
+
+/*
+ * Core SRCU state machine. Push state bits of ->srcu_gp_seq
+ * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
+ * completed in that state.
+ */
+static void srcu_advance_state(struct srcu_struct *sp)
+{
+	int idx;
+
+	mutex_lock(&sp->srcu_gp_mutex);
+
+	/*
+	 * Because readers might be delayed for an extended period after
+	 * fetching ->srcu_idx for their index, at any point in time there
+	 * might well be readers using both idx=0 and idx=1. We therefore
+	 * need to wait for readers to clear from both index values before
+	 * invoking a callback.
+	 *
+	 * The load-acquire ensures that we see the accesses performed
+	 * by the prior grace period.
+	 */
+	idx = rcu_seq_state(smp_load_acquire(&sp->srcu_gp_seq)); /* ^^^ */
+	if (idx == SRCU_STATE_IDLE) {
+		spin_lock_irq(&sp->gp_lock);
+		if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) {
+			WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq));
+			spin_unlock_irq(&sp->gp_lock);
+			mutex_unlock(&sp->srcu_gp_mutex);
+			return;
+		}
+		idx = rcu_seq_state(READ_ONCE(sp->srcu_gp_seq));
+		if (idx == SRCU_STATE_IDLE)
+			srcu_gp_start(sp);
+		spin_unlock_irq(&sp->gp_lock);
+		if (idx != SRCU_STATE_IDLE) {
+			mutex_unlock(&sp->srcu_gp_mutex);
+			return; /* Someone else started the grace period. */
+		}
+	}
+
+	if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
+		idx = 1 ^ (sp->srcu_idx & 1);
+		if (!try_check_zero(sp, idx, 1)) {
+			mutex_unlock(&sp->srcu_gp_mutex);
+			return; /* readers present, retry later. */
+		}
+		srcu_flip(sp);
+		rcu_seq_set_state(&sp->srcu_gp_seq, SRCU_STATE_SCAN2);
+	}
+
+	if (rcu_seq_state(READ_ONCE(sp->srcu_gp_seq)) == SRCU_STATE_SCAN2) {
+
+		/*
+		 * SRCU read-side critical sections are normally short,
+		 * so check at least twice in quick succession after a flip.
+		 */
+		idx = 1 ^ (sp->srcu_idx & 1);
+		if (!try_check_zero(sp, idx, 2)) {
+			mutex_unlock(&sp->srcu_gp_mutex);
+			return; /* readers present, retry later. */
+		}
+		srcu_gp_end(sp); /* Releases ->srcu_gp_mutex. */
+	}
+}
+
+/*
+ * Invoke a limited number of SRCU callbacks that have passed through
+ * their grace period. If there are more to do, SRCU will reschedule
+ * the workqueue.
Note that needed memory barriers have been executed + * in this task's context by srcu_readers_active_idx_check(). + */ +static void srcu_invoke_callbacks(struct work_struct *work) +{ + bool more; + struct rcu_cblist ready_cbs; + struct rcu_head *rhp; + struct srcu_data *sdp; + struct srcu_struct *sp; + + sdp = container_of(work, struct srcu_data, work.work); + sp = sdp->sp; + rcu_cblist_init(&ready_cbs); + spin_lock_irq(&sdp->lock); + smp_mb(); /* Old grace periods before callback invocation! */ + rcu_segcblist_advance(&sdp->srcu_cblist, + rcu_seq_current(&sp->srcu_gp_seq)); + if (sdp->srcu_cblist_invoking || + !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) { + spin_unlock_irq(&sdp->lock); + return; /* Someone else on the job or nothing to do. */ + } + + /* We are on the job! Extract and invoke ready callbacks. */ + sdp->srcu_cblist_invoking = true; + rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs); + spin_unlock_irq(&sdp->lock); + rhp = rcu_cblist_dequeue(&ready_cbs); + for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) { + local_bh_disable(); + rhp->func(rhp); + local_bh_enable(); + } + + /* + * Update counts, accelerate new callbacks, and if needed, + * schedule another round of callback invocation. + */ + spin_lock_irq(&sdp->lock); + rcu_segcblist_insert_count(&sdp->srcu_cblist, &ready_cbs); + (void)rcu_segcblist_accelerate(&sdp->srcu_cblist, + rcu_seq_snap(&sp->srcu_gp_seq)); + sdp->srcu_cblist_invoking = false; + more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist); + spin_unlock_irq(&sdp->lock); + if (more) + srcu_schedule_cbs_sdp(sdp, 0); +} + +/* + * Finished one round of SRCU grace period. Start another if there are + * more SRCU callbacks queued, otherwise put SRCU into not-running state. + */ +static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay) +{ + bool pushgp = true; + + spin_lock_irq(&sp->gp_lock); + if (ULONG_CMP_GE(sp->srcu_gp_seq, sp->srcu_gp_seq_needed)) { + if (!WARN_ON_ONCE(rcu_seq_state(sp->srcu_gp_seq))) { + /* All requests fulfilled, time to go idle. */ + pushgp = false; + } + } else if (!rcu_seq_state(sp->srcu_gp_seq)) { + /* Outstanding request and no GP. Start one. */ + srcu_gp_start(sp); + } + spin_unlock_irq(&sp->gp_lock); + + if (pushgp) + queue_delayed_work(system_power_efficient_wq, &sp->work, delay); +} + +/* + * This is the work-queue function that handles SRCU grace periods. 
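+ *
+ * In outline (informal, mirroring srcu_advance_state()):
+ *
+ *	SRCU_STATE_IDLE:  start a grace period if one is needed;
+ *	SRCU_STATE_SCAN1: wait out pre-flip readers, then srcu_flip();
+ *	SRCU_STATE_SCAN2: wait out post-flip readers, then srcu_gp_end();
+ *
+ * after which srcu_reschedule() requeues this handler if more callbacks
+ * or grace periods are pending.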
+ */ +void process_srcu(struct work_struct *work) +{ + struct srcu_struct *sp; + + sp = container_of(work, struct srcu_struct, work.work); + + srcu_advance_state(sp); + srcu_reschedule(sp, srcu_get_delay(sp)); +} +EXPORT_SYMBOL_GPL(process_srcu); + +void srcutorture_get_gp_data(enum rcutorture_type test_type, + struct srcu_struct *sp, int *flags, + unsigned long *gpnum, unsigned long *completed) +{ + if (test_type != SRCU_FLAVOR) + return; + *flags = 0; + *completed = rcu_seq_ctr(sp->srcu_gp_seq); + *gpnum = rcu_seq_ctr(sp->srcu_gp_seq_needed); +} +EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 6ad330dbbae2ec3fa4f74fdc1cc1a2ff1154bc9d..e5385731e39109d9a27a47d123ac44908402fcce 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -79,7 +79,7 @@ EXPORT_SYMBOL(__rcu_is_watching); */ static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) { - RCU_TRACE(reset_cpu_stall_ticks(rcp)); + RCU_TRACE(reset_cpu_stall_ticks(rcp);) if (rcp->donetail != rcp->curtail) { rcp->donetail = rcp->curtail; return 1; @@ -125,7 +125,7 @@ void rcu_bh_qs(void) */ void rcu_check_callbacks(int user) { - RCU_TRACE(check_cpu_stalls()); + RCU_TRACE(check_cpu_stalls();) if (user) rcu_sched_qs(); else if (!in_softirq()) @@ -143,7 +143,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) const char *rn = NULL; struct rcu_head *next, *list; unsigned long flags; - RCU_TRACE(int cb_count = 0); + RCU_TRACE(int cb_count = 0;) /* Move the ready-to-invoke callbacks to a local list. */ local_irq_save(flags); @@ -152,7 +152,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); return; } - RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); + RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1);) list = rcp->rcucblist; rcp->rcucblist = *rcp->donetail; *rcp->donetail = NULL; @@ -162,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) local_irq_restore(flags); /* Invoke the callbacks on the local list. 
*/ - RCU_TRACE(rn = rcp->name); + RCU_TRACE(rn = rcp->name;) while (list) { next = list->next; prefetch(next); @@ -171,9 +171,9 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) __rcu_reclaim(rn, list); local_bh_enable(); list = next; - RCU_TRACE(cb_count++); + RCU_TRACE(cb_count++;) } - RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); + RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count);) RCU_TRACE(trace_rcu_batch_end(rcp->name, cb_count, 0, need_resched(), is_idle_task(current), @@ -221,7 +221,7 @@ static void __call_rcu(struct rcu_head *head, local_irq_save(flags); *rcp->curtail = head; rcp->curtail = &head->next; - RCU_TRACE(rcp->qlen++); + RCU_TRACE(rcp->qlen++;) local_irq_restore(flags); if (unlikely(is_idle_task(current))) { @@ -254,8 +254,8 @@ EXPORT_SYMBOL_GPL(call_rcu_bh); void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk)); - RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk)); + RCU_TRACE(reset_cpu_stall_ticks(&rcu_sched_ctrlblk);) + RCU_TRACE(reset_cpu_stall_ticks(&rcu_bh_ctrlblk);) rcu_early_boot_tests(); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index c64b827ecbca19656395e873ca06da0c92a6298e..371034e77f87c8c6a7942495284ec3c1331dad18 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -52,7 +52,7 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = { RCU_TRACE(.name = "rcu_bh") }; -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) #include int rcu_scheduler_active __read_mostly; @@ -65,15 +65,16 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active); * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage. * The reason for this is that Tiny RCU does not need kthreads, so does * not have to care about the fact that the scheduler is half-initialized - * at a certain phase of the boot process. + * at a certain phase of the boot process. Unless SRCU is in the mix. */ void __init rcu_scheduler_starting(void) { WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU) + ? 
RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
 }
 
-#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
+#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
 
 #ifdef CONFIG_RCU_TRACE
 
@@ -162,8 +163,8 @@ static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp)
 
 static void check_cpu_stalls(void)
 {
-	RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk));
-	RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk));
+	RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk);)
+	RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk);)
 }
 
 #endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 50fee7689e7125787cb3a02ffaa6e9837b1b05d2..e354e475e645ff4f0fab186913a274e931da0db3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -57,6 +57,7 @@
 #include
 #include
 #include
+#include
 
 #include "tree.h"
 #include "rcu.h"
@@ -97,8 +98,8 @@ struct rcu_state sname##_state = { \
 	.gpnum = 0UL - 300UL, \
 	.completed = 0UL - 300UL, \
 	.orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
-	.orphan_nxttail = &sname##_state.orphan_nxtlist, \
-	.orphan_donetail = &sname##_state.orphan_donelist, \
+	.orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
+	.orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
 	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
 	.name = RCU_STATE_NAME(sname), \
 	.abbr = sabbr, \
@@ -123,7 +124,7 @@ static int rcu_fanout_leaf = RCU_FANOUT_LEAF;
 module_param(rcu_fanout_leaf, int, 0444);
 int rcu_num_lvls __read_mostly = RCU_NUM_LVLS;
 /* Number of rcu_nodes at specified level. */
-static int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
+int num_rcu_lvl[] = NUM_RCU_LVL_INIT;
 int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */
 /* panic() on RCU Stall sysctl. */
 int sysctl_panic_on_rcu_stall __read_mostly;
@@ -199,7 +200,7 @@ static const int gp_cleanup_delay;
 
 /*
  * Number of grace periods between delays, normalized by the duration of
- * the delay.  The longer the the delay, the more the grace periods between
+ * the delay.  The longer the delay, the more the grace periods between
  * each delay.  The reason for this normalization is that it means that,
  * for non-zero delays, the overall slowdown of grace periods is constant
  * regardless of the duration of the delay.  This arrangement balances
@@ -272,17 +273,39 @@ void rcu_bh_qs(void)
 	}
 }
 
-static DEFINE_PER_CPU(int, rcu_sched_qs_mask);
+/*
+ * Steal a bit from the bottom of ->dynticks for idle entry/exit
+ * control. Initially this is for TLB flushing.
+ */
+#define RCU_DYNTICK_CTRL_MASK 0x1
+#define RCU_DYNTICK_CTRL_CTR  (RCU_DYNTICK_CTRL_MASK + 1)
+#ifndef rcu_eqs_special_exit
+#define rcu_eqs_special_exit() do { } while (0)
+#endif
 
 static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
 	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
-	.dynticks = ATOMIC_INIT(1),
+	.dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
 #ifdef CONFIG_NO_HZ_FULL_SYSIDLE
 	.dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
 	.dynticks_idle = ATOMIC_INIT(1),
 #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 };
 
+/*
+ * There are a few places, currently just in the tracing infrastructure,
+ * that use rcu_irq_enter() to make sure RCU is watching. But there is
+ * a small window where that will not even work. In those cases
+ * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter()
+ * can be called.
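+ *
+ * A hypothetical caller (sketch only) would therefore check first:
+ *
+ *	if (!rcu_irq_enter_disabled()) {
+ *		rcu_irq_enter();
+ *		...
+ *		rcu_irq_exit();
+ *	}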
+ */ +static DEFINE_PER_CPU(bool, disable_rcu_irq_enter); + +bool rcu_irq_enter_disabled(void) +{ + return this_cpu_read(disable_rcu_irq_enter); +} + /* * Record entry into an extended quiescent state. This is only to be * called when not already in an extended quiescent state. @@ -290,15 +313,20 @@ static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { static void rcu_dynticks_eqs_enter(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special; + int seq; /* - * CPUs seeing atomic_inc_return() must see prior RCU read-side + * CPUs seeing atomic_add_return() must see prior RCU read-side * critical sections, and we also must force ordering with the * next idle sojourn. */ - special = atomic_inc_return(&rdtp->dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + /* Better be in an extended quiescent state! */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_CTR)); + /* Better not have special action (TLB flush) pending! */ + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + (seq & RCU_DYNTICK_CTRL_MASK)); } /* @@ -308,15 +336,22 @@ static void rcu_dynticks_eqs_enter(void) static void rcu_dynticks_eqs_exit(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special; + int seq; /* - * CPUs seeing atomic_inc_return() must see prior idle sojourns, + * CPUs seeing atomic_add_return() must see prior idle sojourns, * and we also must force ordering with the next RCU read-side * critical section. */ - special = atomic_inc_return(&rdtp->dynticks); - WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); + seq = atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); + WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && + !(seq & RCU_DYNTICK_CTRL_CTR)); + if (seq & RCU_DYNTICK_CTRL_MASK) { + atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdtp->dynticks); + smp_mb__after_atomic(); /* _exit after clearing mask. */ + /* Prefer duplicate flushes to losing a flush. */ + rcu_eqs_special_exit(); + } } /* @@ -333,9 +368,9 @@ static void rcu_dynticks_eqs_online(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - if (atomic_read(&rdtp->dynticks) & 0x1) + if (atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR) return; - atomic_add(0x1, &rdtp->dynticks); + atomic_add(RCU_DYNTICK_CTRL_CTR, &rdtp->dynticks); } /* @@ -347,7 +382,7 @@ bool rcu_dynticks_curr_cpu_in_eqs(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - return !(atomic_read(&rdtp->dynticks) & 0x1); + return !(atomic_read(&rdtp->dynticks) & RCU_DYNTICK_CTRL_CTR); } /* @@ -358,7 +393,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) { int snap = atomic_add_return(0, &rdtp->dynticks); - return snap; + return snap & ~RCU_DYNTICK_CTRL_MASK; } /* @@ -367,7 +402,7 @@ int rcu_dynticks_snap(struct rcu_dynticks *rdtp) */ static bool rcu_dynticks_in_eqs(int snap) { - return !(snap & 0x1); + return !(snap & RCU_DYNTICK_CTRL_CTR); } /* @@ -387,14 +422,34 @@ static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) static void rcu_dynticks_momentary_idle(void) { struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - int special = atomic_add_return(2, &rdtp->dynticks); + int special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR, + &rdtp->dynticks); /* It is illegal to call this from idle state. 
*/ - WARN_ON_ONCE(!(special & 0x1)); + WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR)); } -DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); -EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); +/* + * Set the special (bottom) bit of the specified CPU so that it + * will take special action (such as flushing its TLB) on the + * next exit from an extended quiescent state. Returns true if + * the bit was successfully set, or false if the CPU was not in + * an extended quiescent state. + */ +bool rcu_eqs_special_set(int cpu) +{ + int old; + int new; + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + do { + old = atomic_read(&rdtp->dynticks); + if (old & RCU_DYNTICK_CTRL_CTR) + return false; + new = old | RCU_DYNTICK_CTRL_MASK; + } while (atomic_cmpxchg(&rdtp->dynticks, old, new) != old); + return true; +} /* * Let the RCU core know that this CPU has gone through the scheduler, @@ -403,44 +458,14 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * memory barriers to let the RCU core know about it, regardless of what * this CPU might (or might not) do in the near future. * - * We inform the RCU core by emulating a zero-duration dyntick-idle - * period, which we in turn do by incrementing the ->dynticks counter - * by two. + * We inform the RCU core by emulating a zero-duration dyntick-idle period. * * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - struct rcu_data *rdp; - int resched_mask; - struct rcu_state *rsp; - - /* - * Yes, we can lose flag-setting operations. This is OK, because - * the flag will be set again after some delay. - */ - resched_mask = raw_cpu_read(rcu_sched_qs_mask); - raw_cpu_write(rcu_sched_qs_mask, 0); - - /* Find the flavor that needs a quiescent state. */ - for_each_rcu_flavor(rsp) { - rdp = raw_cpu_ptr(rsp->rda); - if (!(resched_mask & rsp->flavor_mask)) - continue; - smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ - if (READ_ONCE(rdp->mynode->completed) != - READ_ONCE(rdp->cond_resched_completed)) - continue; - - /* - * Pretend to be momentarily idle for the quiescent state. - * This allows the grace-period kthread to record the - * quiescent state, with no need for this CPU to do anything - * further. - */ - rcu_dynticks_momentary_idle(); - break; - } + raw_cpu_write(rcu_dynticks.rcu_need_heavy_qs, false); + rcu_dynticks_momentary_idle(); } /* @@ -448,14 +473,22 @@ static void rcu_momentary_dyntick_idle(void) * and requires special handling for preemptible RCU. * The caller must have disabled interrupts. */ -void rcu_note_context_switch(void) +void rcu_note_context_switch(bool preempt) { barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); rcu_sched_qs(); rcu_preempt_note_context_switch(); - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + /* Load rcu_urgent_qs before other flags. */ + if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) + goto out; + this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) rcu_momentary_dyntick_idle(); + this_cpu_inc(rcu_dynticks.rcu_qs_ctr); + if (!preempt) + rcu_note_voluntary_context_switch_lite(current); +out: trace_rcu_utilization(TPS("End context switch")); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -478,29 +511,26 @@ void rcu_all_qs(void) { unsigned long flags; + if (!raw_cpu_read(rcu_dynticks.rcu_urgent_qs)) + return; + preempt_disable(); + /* Load rcu_urgent_qs before other flags. 
*/ + if (!smp_load_acquire(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs))) { + preempt_enable(); + return; + } + this_cpu_write(rcu_dynticks.rcu_urgent_qs, false); barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + if (unlikely(raw_cpu_read(rcu_dynticks.rcu_need_heavy_qs))) { local_irq_save(flags); rcu_momentary_dyntick_idle(); local_irq_restore(flags); } - if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { - /* - * Yes, we just checked a per-CPU variable with preemption - * enabled, so we might be migrated to some other CPU at - * this point. That is OK because in that case, the - * migration will supply the needed quiescent state. - * We might end up needlessly disabling preemption and - * invoking rcu_sched_qs() on the destination CPU, but - * the probability and cost are both quite low, so this - * should not be a problem in practice. - */ - preempt_disable(); + if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) rcu_sched_qs(); - preempt_enable(); - } - this_cpu_inc(rcu_qs_ctr); + this_cpu_inc(rcu_dynticks.rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ + preempt_enable(); } EXPORT_SYMBOL_GPL(rcu_all_qs); @@ -689,15 +719,11 @@ void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, default: break; } - if (rsp != NULL) { - *flags = READ_ONCE(rsp->gp_flags); - *gpnum = READ_ONCE(rsp->gpnum); - *completed = READ_ONCE(rsp->completed); + if (rsp == NULL) return; - } - *flags = 0; - *gpnum = 0; - *completed = 0; + *flags = READ_ONCE(rsp->gp_flags); + *gpnum = READ_ONCE(rsp->gpnum); + *completed = READ_ONCE(rsp->completed); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); @@ -712,16 +738,6 @@ void rcutorture_record_progress(unsigned long vernum) } EXPORT_SYMBOL_GPL(rcutorture_record_progress); -/* - * Does the CPU have callbacks ready to be invoked? - */ -static int -cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) -{ - return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && - rdp->nxttail[RCU_NEXT_TAIL] != NULL; -} - /* * Return the root node of the specified rcu_state structure. */ @@ -752,44 +768,39 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { - int i; - if (rcu_gp_in_progress(rsp)) return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) return true; /* Yes, a no-CBs CPU needs one. */ - if (!rdp->nxttail[RCU_NEXT_TAIL]) + if (!rcu_segcblist_is_enabled(&rdp->cblist)) return false; /* No, this is a no-CBs (or offline) CPU. */ - if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) + if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL)) return true; /* Yes, CPU has newly registered callbacks. */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) - if (rdp->nxttail[i - 1] != rdp->nxttail[i] && - ULONG_CMP_LT(READ_ONCE(rsp->completed), - rdp->nxtcompleted[i])) - return true; /* Yes, CBs for future grace period. */ + if (rcu_segcblist_future_gp_needed(&rdp->cblist, + READ_ONCE(rsp->completed))) + return true; /* Yes, CBs for future grace period. */ return false; /* No grace period needed. */ } /* - * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state + * rcu_eqs_enter_common - current CPU is entering an extended quiescent state * - * If the new value of the ->dynticks_nesting counter now is zero, - * we really have entered idle, and must do the appropriate accounting. 
- * The caller must have disabled interrupts. + * Enter idle, doing appropriate accounting. The caller must have + * disabled interrupts. */ -static void rcu_eqs_enter_common(long long oldval, bool user) +static void rcu_eqs_enter_common(bool user) { struct rcu_state *rsp; struct rcu_data *rdp; - RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) + struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); - trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); + trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)) { struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); - trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); + trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0); rcu_ftrace_dump(DUMP_ORIG); WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", current->pid, current->comm, @@ -800,7 +811,10 @@ static void rcu_eqs_enter_common(long long oldval, bool user) do_nocb_deferred_wakeup(rdp); } rcu_prepare_for_idle(); - rcu_dynticks_eqs_enter(); + __this_cpu_inc(disable_rcu_irq_enter); + rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */ + rcu_dynticks_eqs_enter(); /* After this, tracing works again. */ + __this_cpu_dec(disable_rcu_irq_enter); rcu_dynticks_task_enter(); /* @@ -821,19 +835,15 @@ static void rcu_eqs_enter_common(long long oldval, bool user) */ static void rcu_eqs_enter(bool user) { - long long oldval; struct rcu_dynticks *rdtp; rdtp = this_cpu_ptr(&rcu_dynticks); - oldval = rdtp->dynticks_nesting; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - (oldval & DYNTICK_TASK_NEST_MASK) == 0); - if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) { - rdtp->dynticks_nesting = 0; - rcu_eqs_enter_common(oldval, user); - } else { + (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); + if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) + rcu_eqs_enter_common(user); + else rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; - } } /** @@ -892,19 +902,18 @@ void rcu_user_enter(void) */ void rcu_irq_exit(void) { - long long oldval; struct rcu_dynticks *rdtp; RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); - oldval = rdtp->dynticks_nesting; - rdtp->dynticks_nesting--; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && - rdtp->dynticks_nesting < 0); - if (rdtp->dynticks_nesting) - trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); - else - rcu_eqs_enter_common(oldval, true); + rdtp->dynticks_nesting < 1); + if (rdtp->dynticks_nesting <= 1) { + rcu_eqs_enter_common(true); + } else { + trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); + rdtp->dynticks_nesting--; + } rcu_sysidle_enter(1); } @@ -1150,6 +1159,24 @@ bool notrace rcu_is_watching(void) } EXPORT_SYMBOL_GPL(rcu_is_watching); +/* + * If a holdout task is actually running, request an urgent quiescent + * state from its CPU. This is unsynchronized, so migrations can cause + * the request to go to the wrong CPU. Which is OK, all that will happen + * is that the CPU's next context switch will be a bit slower and next + * time around this task will generate another request. + */ +void rcu_request_urgent_qs_task(struct task_struct *t) +{ + int cpu; + + barrier(); + cpu = task_cpu(t); + if (!task_curr(t)) + return; /* This task is not running on that CPU. 
*/ + smp_store_release(per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, cpu), true); +} + #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) /* @@ -1235,7 +1262,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, bool *isidle, unsigned long *maxj) { unsigned long jtsq; - int *rcrmp; + bool *rnhqp; + bool *ruqp; unsigned long rjtsc; struct rcu_node *rnp; @@ -1271,11 +1299,15 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * might not be the case for nohz_full CPUs looping in the kernel. */ rnp = rdp->mynode; + ruqp = per_cpu_ptr(&rcu_dynticks.rcu_urgent_qs, rdp->cpu); if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && - READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && + READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_dynticks.rcu_qs_ctr, rdp->cpu) && READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); return 1; + } else { + /* Load rcu_qs_ctr before store to rcu_urgent_qs. */ + smp_store_release(ruqp, true); } /* Check for the CPU being offline. */ @@ -1292,7 +1324,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * in-kernel CPU-bound tasks cannot advance grace periods. * So if the grace period is old enough, make the CPU pay attention. * Note that the unsynchronized assignments to the per-CPU - * rcu_sched_qs_mask variable are safe. Yes, setting of + * rcu_need_heavy_qs variable are safe. Yes, setting of * bits can be lost, but they will be set again on the next * force-quiescent-state pass. So lost bit sets do not result * in incorrect behavior, merely in a grace period lasting @@ -1306,16 +1338,13 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, * is set too high, we override with half of the RCU CPU stall * warning delay. */ - rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); - if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || - time_after(jiffies, rdp->rsp->jiffies_resched)) { - if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { - WRITE_ONCE(rdp->cond_resched_completed, - READ_ONCE(rdp->mynode->completed)); - smp_mb(); /* ->cond_resched_completed before *rcrmp. */ - WRITE_ONCE(*rcrmp, - READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); - } + rnhqp = &per_cpu(rcu_dynticks.rcu_need_heavy_qs, rdp->cpu); + if (!READ_ONCE(*rnhqp) && + (time_after(jiffies, rdp->rsp->gp_start + jtsq) || + time_after(jiffies, rdp->rsp->jiffies_resched))) { + WRITE_ONCE(*rnhqp, true); + /* Store rcu_need_heavy_qs before rcu_urgent_qs. */ + smp_store_release(ruqp, true); rdp->rsp->jiffies_resched += 5; /* Re-enable beating. 
*/ } @@ -1475,7 +1504,8 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, + cpu)->cblist); pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start), (long)rsp->gpnum, (long)rsp->completed, totqlen); @@ -1529,7 +1559,8 @@ static void print_cpu_stall(struct rcu_state *rsp) print_cpu_stall_info(rsp, smp_processor_id()); print_cpu_stall_info_end(); for_each_possible_cpu(cpu) - totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; + totqlen += rcu_segcblist_n_cbs(&per_cpu_ptr(rsp->rda, + cpu)->cblist); pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n", jiffies - rsp->gp_start, (long)rsp->gpnum, (long)rsp->completed, totqlen); @@ -1631,30 +1662,6 @@ void rcu_cpu_stall_reset(void) WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2); } -/* - * Initialize the specified rcu_data structure's default callback list - * to empty. The default callback list is the one that is not used by - * no-callbacks CPUs. - */ -static void init_default_callback_list(struct rcu_data *rdp) -{ - int i; - - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; -} - -/* - * Initialize the specified rcu_data structure's callback list to empty. - */ -static void init_callback_list(struct rcu_data *rdp) -{ - if (init_nocb_callback_list(rdp)) - return; - init_default_callback_list(rdp); -} - /* * Determine the value that ->completed will have at the end of the * next subsequent grace period. This is used to tag callbacks so that @@ -1709,7 +1716,6 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, unsigned long *c_out) { unsigned long c; - int i; bool ret = false; struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); @@ -1755,13 +1761,11 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * Get a new grace-period number. If there really is no grace * period in progress, it will be smaller than the one we obtained - * earlier. Adjust callbacks as needed. Note that even no-CBs - * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. + * earlier. Adjust callbacks as needed. */ c = rcu_cbs_completed(rdp->rsp, rnp_root); - for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) - if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) - rdp->nxtcompleted[i] = c; + if (!rcu_is_nocb_cpu(rdp->cpu)) + (void)rcu_segcblist_accelerate(&rdp->cblist, c); /* * If the needed for the required grace period is already @@ -1793,9 +1797,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, /* * Clean up any old requests for the just-ended grace period. Also return - * whether any additional grace periods have been requested. Also invoke - * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads - * waiting for this grace period to complete. + * whether any additional grace periods have been requested. */ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) { @@ -1841,57 +1843,27 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - unsigned long c; - int i; - bool ret; - - /* If the CPU has no callbacks, nothing to do. 
*/ - if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) - return false; - - /* - * Starting from the sublist containing the callbacks most - * recently assigned a ->completed number and working down, find the - * first sublist that is not assignable to an upcoming grace period. - * Such a sublist has something in it (first two tests) and has - * a ->completed number assigned that will complete sooner than - * the ->completed number for newly arrived callbacks (last test). - * - * The key point is that any later sublist can be assigned the - * same ->completed number as the newly arrived callbacks, which - * means that the callbacks in any of these later sublist can be - * grouped into a single sublist, whether or not they have already - * been assigned a ->completed number. - */ - c = rcu_cbs_completed(rsp, rnp); - for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) - if (rdp->nxttail[i] != rdp->nxttail[i - 1] && - !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) - break; + bool ret = false; - /* - * If there are no sublist for unassigned callbacks, leave. - * At the same time, advance "i" one sublist, so that "i" will - * index into the sublist where all the remaining callbacks should - * be grouped into. - */ - if (++i >= RCU_NEXT_TAIL) + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; /* - * Assign all subsequent callbacks' ->completed number to the next - * full grace period and group them all in the sublist initially - * indexed by "i". + * Callbacks are often registered with incomplete grace-period + * information. Something about the fact that getting exact + * information requires acquiring a global lock... RCU therefore + * makes a conservative estimate of the grace period number at which + * a given callback will become ready to invoke. The following + * code checks this estimate and improves it when possible, thus + * accelerating callback invocation to an earlier grace-period + * number. */ - for (; i <= RCU_NEXT_TAIL; i++) { - rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxtcompleted[i] = c; - } - /* Record any needed additional grace periods. */ - ret = rcu_start_future_gp(rnp, rdp, NULL); + if (rcu_segcblist_accelerate(&rdp->cblist, rcu_cbs_completed(rsp, rnp))) + ret = rcu_start_future_gp(rnp, rdp, NULL); /* Trace depending on how much we were able to accelerate. */ - if (!*rdp->nxttail[RCU_WAIT_TAIL]) + if (rcu_segcblist_restempty(&rdp->cblist, RCU_WAIT_TAIL)) trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); else trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); @@ -1911,32 +1883,15 @@ static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) { - int i, j; - - /* If the CPU has no callbacks, nothing to do. */ - if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) + /* If no pending (not yet ready to invoke) callbacks, nothing to do. */ + if (!rcu_segcblist_pend_cbs(&rdp->cblist)) return false; /* * Find all callbacks whose ->completed numbers indicate that they * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. */ - for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { - if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) - break; - rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; - } - /* Clean up any sublist tail pointers that were misordered above. 
*/ - for (j = RCU_WAIT_TAIL; j < i; j++) - rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; - - /* Copy down callbacks to fill in empty sublists. */ - for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { - if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) - break; - rdp->nxttail[j] = rdp->nxttail[i]; - rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; - } + rcu_segcblist_advance(&rdp->cblist, rnp->completed); /* Classify any remaining callbacks. */ return rcu_accelerate_cbs(rsp, rnp, rdp); @@ -1981,7 +1936,7 @@ static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); need_gp = !!(rnp->qsmask & rdp->grpmask); rdp->cpu_no_qs.b.norm = need_gp; - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); rdp->core_needs_qs = need_gp; zero_cpu_stall_ticks(rdp); WRITE_ONCE(rdp->gpwrap, false); @@ -2579,7 +2534,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) * within the current grace period. */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ - rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); + rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_dynticks.rcu_qs_ctr); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } @@ -2653,13 +2608,8 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * because _rcu_barrier() excludes CPU-hotplug operations, so it * cannot be running now. Thus no memory barrier is required. */ - if (rdp->nxtlist != NULL) { - rsp->qlen_lazy += rdp->qlen_lazy; - rsp->qlen += rdp->qlen; - rdp->n_cbs_orphaned += rdp->qlen; - rdp->qlen_lazy = 0; - WRITE_ONCE(rdp->qlen, 0); - } + rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist); + rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done); /* * Next, move those callbacks still needing a grace period to @@ -2667,31 +2617,18 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, * Some of the callbacks might have gone partway through a grace * period, but that is too bad. They get to start over because we * cannot assume that grace periods are synchronized across CPUs. - * We don't bother updating the ->nxttail[] array yet, instead - * we just reset the whole thing later on. */ - if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { - *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; - rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = NULL; - } + rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend); /* * Then move the ready-to-invoke callbacks to the orphanage, * where some other CPU will pick them up. These will not be * required to pass though another grace period: They are done. */ - if (rdp->nxtlist != NULL) { - *rsp->orphan_donetail = rdp->nxtlist; - rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; - } + rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done); - /* - * Finally, initialize the rcu_data structure's list to empty and - * disallow further callbacks on this CPU. - */ - init_callback_list(rdp); - rdp->nxttail[RCU_NEXT_TAIL] = NULL; + /* Finally, disallow further callbacks on this CPU. */ + rcu_segcblist_disable(&rdp->cblist); } /* @@ -2700,7 +2637,6 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, */ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) { - int i; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); /* No-CBs CPUs are handled specially. 
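The orphanage hunks above move whole callback lists with extract/insert helpers rather than hand-spliced raw pointers. The underlying operation is a constant-time splice; a simplified sketch in the same model, collapsing everything into one destination segment (hypothetical helper name):

/* The essence of the extract/insert helpers: a whole list moves in
 * O(1) by stitching head and tail pointers together. Simplified to a
 * single destination segment. */
static void cblist_splice_tail(struct segcblist *to, struct segcblist *from)
{
	if (!from->head)
		return;
	*to->tails[SEG_NEXT] = from->head;
	to->tails[SEG_NEXT] = from->tails[SEG_NEXT];
	to->len += from->len;
	segcblist_init(from);		/* donor ends up empty */
}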
*/ @@ -2709,13 +2645,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) return; /* Do the accounting first. */ - rdp->qlen_lazy += rsp->qlen_lazy; - rdp->qlen += rsp->qlen; - rdp->n_cbs_adopted += rsp->qlen; - if (rsp->qlen_lazy != rsp->qlen) + rdp->n_cbs_adopted += rsp->orphan_done.len; + if (rsp->orphan_done.len_lazy != rsp->orphan_done.len) rcu_idle_count_callbacks_posted(); - rsp->qlen_lazy = 0; - rsp->qlen = 0; + rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done); /* * We do not need a memory barrier here because the only way we @@ -2723,24 +2656,13 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) * we are the task doing the rcu_barrier(). */ - /* First adopt the ready-to-invoke callbacks. */ - if (rsp->orphan_donelist != NULL) { - *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; - for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) - if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[i] = rsp->orphan_donetail; - rsp->orphan_donelist = NULL; - rsp->orphan_donetail = &rsp->orphan_donelist; - } - - /* And then adopt the callbacks that still need a grace period. */ - if (rsp->orphan_nxtlist != NULL) { - *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; - rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; - rsp->orphan_nxtlist = NULL; - rsp->orphan_nxttail = &rsp->orphan_nxtlist; - } + /* First adopt the ready-to-invoke callbacks, then the done ones. */ + rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done); + WARN_ON_ONCE(rsp->orphan_done.head); + rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend); + WARN_ON_ONCE(rsp->orphan_pend.head); + WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != + !rcu_segcblist_n_cbs(&rdp->cblist)); } /* @@ -2748,14 +2670,14 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) */ static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) { - RCU_TRACE(unsigned long mask); - RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); - RCU_TRACE(struct rcu_node *rnp = rdp->mynode); + RCU_TRACE(unsigned long mask;) + RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda);) + RCU_TRACE(struct rcu_node *rnp = rdp->mynode;) if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) return; - RCU_TRACE(mask = rdp->grpmask); + RCU_TRACE(mask = rdp->grpmask;) trace_rcu_grace_period(rsp->name, rnp->gpnum + 1 - !!(rnp->qsmask & mask), TPS("cpuofl")); @@ -2828,9 +2750,11 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) rcu_adopt_orphan_cbs(rsp, flags); raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); - WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, - "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", - cpu, rdp->qlen, rdp->nxtlist); + WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || + !rcu_segcblist_empty(&rdp->cblist), + "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", + cpu, rcu_segcblist_n_cbs(&rdp->cblist), + rcu_segcblist_first_cb(&rdp->cblist)); } /* @@ -2840,14 +2764,17 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; - struct rcu_head *next, *list, **tail; - long bl, count, count_lazy; - int i; + struct rcu_head *rhp; + struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl); + long bl, count; /* If no callbacks are ready, just return. 
*/ - if (!cpu_has_callbacks_ready_to_invoke(rdp)) { - trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); - trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), + if (!rcu_segcblist_ready_cbs(&rdp->cblist)) { + trace_rcu_batch_start(rsp->name, + rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), 0); + trace_rcu_batch_end(rsp->name, 0, + !rcu_segcblist_empty(&rdp->cblist), need_resched(), is_idle_task(current), rcu_is_callbacks_kthread()); return; @@ -2855,73 +2782,61 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) /* * Extract the list of ready callbacks, disabling to prevent - * races with call_rcu() from interrupt handlers. + * races with call_rcu() from interrupt handlers. Leave the + * callback counts, as rcu_barrier() needs to be conservative. */ local_irq_save(flags); WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); bl = rdp->blimit; - trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); - list = rdp->nxtlist; - rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; - *rdp->nxttail[RCU_DONE_TAIL] = NULL; - tail = rdp->nxttail[RCU_DONE_TAIL]; - for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) - if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) - rdp->nxttail[i] = &rdp->nxtlist; + trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist), bl); + rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl); local_irq_restore(flags); /* Invoke callbacks. */ - count = count_lazy = 0; - while (list) { - next = list->next; - prefetch(next); - debug_rcu_head_unqueue(list); - if (__rcu_reclaim(rsp->name, list)) - count_lazy++; - list = next; - /* Stop only if limit reached and CPU has something to do. */ - if (++count >= bl && + rhp = rcu_cblist_dequeue(&rcl); + for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) { + debug_rcu_head_unqueue(rhp); + if (__rcu_reclaim(rsp->name, rhp)) + rcu_cblist_dequeued_lazy(&rcl); + /* + * Stop only if limit reached and CPU has something to do. + * Note: The rcl structure counts down from zero. + */ + if (-rcl.len >= bl && (need_resched() || (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) break; } local_irq_save(flags); - trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), - is_idle_task(current), - rcu_is_callbacks_kthread()); - - /* Update count, and requeue any remaining callbacks. */ - if (list != NULL) { - *tail = rdp->nxtlist; - rdp->nxtlist = list; - for (i = 0; i < RCU_NEXT_SIZE; i++) - if (&rdp->nxtlist == rdp->nxttail[i]) - rdp->nxttail[i] = tail; - else - break; - } + count = -rcl.len; + trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(), + is_idle_task(current), rcu_is_callbacks_kthread()); + + /* Update counts and requeue any remaining callbacks. */ + rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl); smp_mb(); /* List handling before counting for rcu_barrier(). */ - rdp->qlen_lazy -= count_lazy; - WRITE_ONCE(rdp->qlen, rdp->qlen - count); rdp->n_cbs_invoked += count; + rcu_segcblist_insert_count(&rdp->cblist, &rcl); /* Reinstate batch limit if we have worked down the excess. */ - if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) + count = rcu_segcblist_n_cbs(&rdp->cblist); + if (rdp->blimit == LONG_MAX && count <= qlowmark) rdp->blimit = blimit; /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. 
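The converted rcu_do_batch() above notes that "the rcl structure counts down from zero": the extracted done-list keeps its ->len at zero minus the number of callbacks dequeued, so -rcl.len is the invoked count while the segmented list's own count stays conservatively high for rcu_barrier(). A compact model of that loop, reusing struct cb from the first sketch:

/* Model of the counting trick: the working list's ->len starts at
 * zero and is decremented per dequeue, so -len is the number of
 * callbacks invoked so far. Names are illustrative. */
struct cblist {
	struct cb *head;
	struct cb **tail;
	long len;		/* counts down from zero */
};

static struct cb *cblist_dequeue(struct cblist *cl)
{
	struct cb *cbp = cl->head;

	if (!cbp)
		return NULL;
	cl->head = cbp->next;
	if (!cl->head)
		cl->tail = &cl->head;
	cl->len--;
	return cbp;
}

static long invoke_batch(struct cblist *cl, long blimit)
{
	struct cb *cbp;

	while ((cbp = cblist_dequeue(cl)) != NULL) {
		cbp->func(cbp);
		if (-cl->len >= blimit)	/* batch limit reached? */
			break;
	}
	return -cl->len;		/* number invoked */
}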
*/ - if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + if (count == 0 && rdp->qlen_last_fqs_check != 0) { rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; - } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) - rdp->qlen_last_fqs_check = rdp->qlen; - WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); + } else if (count < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = count; + WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0)); local_irq_restore(flags); /* Re-invoke RCU core processing if there are callbacks remaining. */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_core(); } @@ -3087,7 +3002,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) bool needwake; struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); - WARN_ON_ONCE(rdp->beenonline == 0); + WARN_ON_ONCE(!rdp->beenonline); /* Update RCU state based on any recent quiescent states. */ rcu_check_quiescent_state(rsp, rdp); @@ -3105,7 +3020,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) } /* If there are callbacks ready, invoke them. */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (rcu_segcblist_ready_cbs(&rdp->cblist)) invoke_rcu_callbacks(rsp, rdp); /* Do any needed deferred wakeups of rcuo kthreads. */ @@ -3177,7 +3092,8 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, * invoking force_quiescent_state() if the newly enqueued callback * is the only one waiting for a grace period to complete. */ - if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { + if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) > + rdp->qlen_last_fqs_check + qhimark)) { /* Are we ignoring a completed grace period? */ note_gp_changes(rsp, rdp); @@ -3195,10 +3111,10 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, /* Give the grace period a kick. */ rdp->blimit = LONG_MAX; if (rsp->n_force_qs == rdp->n_force_qs_snap && - *rdp->nxttail[RCU_DONE_TAIL] != head) + rcu_segcblist_first_pend_cb(&rdp->cblist) != head) force_quiescent_state(rsp); rdp->n_force_qs_snap = rsp->n_force_qs; - rdp->qlen_last_fqs_check = rdp->qlen; + rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist); } } } @@ -3238,7 +3154,7 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, rdp = this_cpu_ptr(rsp->rda); /* Add the callback to our list. */ - if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { + if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) { int offline; if (cpu != -1) @@ -3257,23 +3173,21 @@ __call_rcu(struct rcu_head *head, rcu_callback_t func, */ BUG_ON(cpu != -1); WARN_ON_ONCE(!rcu_is_watching()); - if (!likely(rdp->nxtlist)) - init_default_callback_list(rdp); + if (rcu_segcblist_empty(&rdp->cblist)) + rcu_segcblist_init(&rdp->cblist); } - WRITE_ONCE(rdp->qlen, rdp->qlen + 1); - if (lazy) - rdp->qlen_lazy++; - else + rcu_segcblist_enqueue(&rdp->cblist, head, lazy); + if (!lazy) rcu_idle_count_callbacks_posted(); - smp_mb(); /* Count before adding callback for rcu_barrier(). 
*/ - *rdp->nxttail[RCU_NEXT_TAIL] = head; - rdp->nxttail[RCU_NEXT_TAIL] = &head->next; if (__is_kfree_rcu_offset((unsigned long)func)) trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, - rdp->qlen_lazy, rdp->qlen); + rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist)); else - trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); + trace_rcu_callback(rsp->name, head, + rcu_segcblist_n_lazy_cbs(&rdp->cblist), + rcu_segcblist_n_cbs(&rdp->cblist)); /* Go handle any RCU core processing required. */ __call_rcu_core(rsp, rdp, head, flags); @@ -3519,41 +3433,6 @@ void cond_synchronize_sched(unsigned long oldstate) } EXPORT_SYMBOL_GPL(cond_synchronize_sched); -/* Adjust sequence number for start of update-side operation. */ -static void rcu_seq_start(unsigned long *sp) -{ - WRITE_ONCE(*sp, *sp + 1); - smp_mb(); /* Ensure update-side operation after counter increment. */ - WARN_ON_ONCE(!(*sp & 0x1)); -} - -/* Adjust sequence number for end of update-side operation. */ -static void rcu_seq_end(unsigned long *sp) -{ - smp_mb(); /* Ensure update-side operation before counter increment. */ - WRITE_ONCE(*sp, *sp + 1); - WARN_ON_ONCE(*sp & 0x1); -} - -/* Take a snapshot of the update side's sequence number. */ -static unsigned long rcu_seq_snap(unsigned long *sp) -{ - unsigned long s; - - s = (READ_ONCE(*sp) + 3) & ~0x1; - smp_mb(); /* Above access must not bleed into critical section. */ - return s; -} - -/* - * Given a snapshot from rcu_seq_snap(), determine whether or not a - * full update-side operation has occurred. - */ -static bool rcu_seq_done(unsigned long *sp, unsigned long s) -{ - return ULONG_CMP_GE(READ_ONCE(*sp), s); -} - /* * Check to see if there is any immediate RCU-related work to be done * by the current CPU, for the specified type of RCU, returning 1 if so. @@ -3577,7 +3456,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Is the RCU core waiting for a quiescent state from this CPU? */ if (rcu_scheduler_fully_active && rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && - rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { + rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_dynticks.rcu_qs_ctr)) { rdp->n_rp_core_needs_qs++; } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { rdp->n_rp_report_qs++; @@ -3585,7 +3464,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) } /* Does this CPU have callbacks ready to invoke? */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) { + if (rcu_segcblist_ready_cbs(&rdp->cblist)) { rdp->n_rp_cb_ready++; return 1; } @@ -3649,10 +3528,10 @@ static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); - if (!rdp->nxtlist) + if (rcu_segcblist_empty(&rdp->cblist)) continue; hc = true; - if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { + if (rcu_segcblist_n_nonlazy_cbs(&rdp->cblist) || !all_lazy) { al = false; break; } @@ -3761,7 +3640,7 @@ static void _rcu_barrier(struct rcu_state *rsp) __call_rcu(&rdp->barrier_head, rcu_barrier_callback, rsp, cpu, 0); } - } else if (READ_ONCE(rdp->qlen)) { + } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { _rcu_barrier_trace(rsp, "OnlineQ", cpu, rsp->barrier_sequence); smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); @@ -3870,8 +3749,9 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - if (!rdp->nxtlist) - init_callback_list(rdp); /* Re-enable callbacks on this CPU. 
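The rcu_seq_start()/rcu_seq_end()/rcu_seq_snap()/rcu_seq_done() helpers deleted just above are not dropped; this series relocates them to a shared header so SRCU can use them too. Their protocol is worth restating: an odd value means an update-side operation is in flight. A userspace model, with C11 sequentially consistent atomics standing in for the kernel's WRITE_ONCE()/smp_mb() pairs:

#include <stdatomic.h>
#include <stdbool.h>

static _Atomic unsigned long seq;

static void seq_start(void) { atomic_fetch_add(&seq, 1); }  /* -> odd  */
static void seq_end(void)   { atomic_fetch_add(&seq, 1); }  /* -> even */

/* Smallest even value that implies a full operation after "now". */
static unsigned long seq_snap(void)
{
	return (atomic_load(&seq) + 3) & ~0x1UL;
}

static bool seq_done(unsigned long s)
{
	return atomic_load(&seq) >= s;	/* kernel: ULONG_CMP_GE() */
}

Worked through: from an idle (even) value 4, seq_snap() yields (4 + 3) & ~1 = 6, one full start/end away; from an in-flight (odd) value 5 it yields 8, forcing the caller to wait out the current operation plus one complete further one.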
*/ + if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */ + !init_nocb_callback_list(rdp)) + rcu_segcblist_init(&rdp->cblist); /* Re-enable callbacks. */ rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; rcu_sysidle_init_percpu_data(rdp->dynticks); rcu_dynticks_eqs_online(); @@ -3890,12 +3770,16 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ rdp->completed = rnp->completed; rdp->cpu_no_qs.b.norm = true; - rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); + rdp->rcu_qs_ctr_snap = per_cpu(rcu_dynticks.rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +/* + * Invoked early in the CPU-online process, when pretty much all + * services are available. The incoming CPU is not present. + */ int rcutree_prepare_cpu(unsigned int cpu) { struct rcu_state *rsp; @@ -3909,6 +3793,9 @@ int rcutree_prepare_cpu(unsigned int cpu) return 0; } +/* + * Update RCU priority boot kthread affinity for CPU-hotplug changes. + */ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) { struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); @@ -3916,20 +3803,34 @@ static void rcutree_affinity_setting(unsigned int cpu, int outgoing) rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); } +/* + * Near the end of the CPU-online process. Pretty much all services + * enabled, and the CPU is now very much alive. + */ int rcutree_online_cpu(unsigned int cpu) { sync_sched_exp_online_cleanup(cpu); rcutree_affinity_setting(cpu, -1); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_online_cpu(cpu); return 0; } +/* + * Near the beginning of the process. The CPU is still very much alive + * with pretty much all services enabled. + */ int rcutree_offline_cpu(unsigned int cpu) { rcutree_affinity_setting(cpu, cpu); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_offline_cpu(cpu); return 0; } - +/* + * Near the end of the offline process. We do only tracing here. + */ int rcutree_dying_cpu(unsigned int cpu) { struct rcu_state *rsp; @@ -3939,6 +3840,9 @@ int rcutree_dying_cpu(unsigned int cpu) return 0; } +/* + * The outgoing CPU is gone and we are running elsewhere. + */ int rcutree_dead_cpu(unsigned int cpu) { struct rcu_state *rsp; @@ -3956,6 +3860,10 @@ int rcutree_dead_cpu(unsigned int cpu) * incoming CPUs are not allowed to use RCU read-side critical sections * until this function is called. Failing to observe this restriction * will result in lockdep splats. + * + * Note that this function is special in that it is invoked directly + * from the incoming CPU rather than from the cpuhp_step mechanism. + * This is because this function must be invoked at a precise location. */ void rcu_cpu_starting(unsigned int cpu) { @@ -3978,9 +3886,6 @@ void rcu_cpu_starting(unsigned int cpu) #ifdef CONFIG_HOTPLUG_CPU /* - * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit - * bit masks. * The CPU is exiting the idle loop into the arch_cpu_idle_dead() * function. We now remove it from the rcu_node tree's ->qsmaskinit * bit masks. @@ -3999,6 +3904,14 @@ static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +/* + * The outgoing function has no further need of RCU, so remove it from + * the list of CPUs that RCU must track. 
+ * + * Note that this function is special in that it is invoked directly + * from the outgoing CPU rather than from the cpuhp_step mechanism. + * This is because this function must be invoked at a precise location. + */ void rcu_report_dead(unsigned int cpu) { struct rcu_state *rsp; @@ -4013,6 +3926,10 @@ void rcu_report_dead(unsigned int cpu) } #endif +/* + * On non-huge systems, use expedited RCU grace periods to make suspend + * and hibernation run faster. + */ static int rcu_pm_notify(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -4083,7 +4000,7 @@ early_initcall(rcu_spawn_gp_kthread); * task is booting the system, and such primitives are no-ops). After this * function is called, any synchronous grace-period primitives are run as * expedited, with the requesting task driving the grace period forward. - * A later core_initcall() rcu_exp_runtime_mode() will switch to full + * A later core_initcall() rcu_set_runtime_mode() will switch to full * runtime RCU functionality. */ void rcu_scheduler_starting(void) @@ -4095,31 +4012,6 @@ void rcu_scheduler_starting(void) rcu_test_sync_prims(); } -/* - * Compute the per-level fanout, either using the exact fanout specified - * or balancing the tree, depending on the rcu_fanout_exact boot parameter. - */ -static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) -{ - int i; - - if (rcu_fanout_exact) { - levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; - for (i = rcu_num_lvls - 2; i >= 0; i--) - levelspread[i] = RCU_FANOUT; - } else { - int ccur; - int cprv; - - cprv = nr_cpu_ids; - for (i = rcu_num_lvls - 1; i >= 0; i--) { - ccur = levelcnt[i]; - levelspread[i] = (cprv + ccur - 1) / ccur; - cprv = ccur; - } - } -} - /* * Helper function for rcu_init() that initializes one rcu_state structure. */ @@ -4129,9 +4021,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) static const char * const fqs[] = RCU_FQS_NAME_INIT; static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; - static u8 fl_mask = 0x1; - int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ int cpustride = 1; int i; @@ -4146,20 +4036,16 @@ static void __init rcu_init_one(struct rcu_state *rsp) /* Initialize the level-tracking arrays. */ - for (i = 0; i < rcu_num_lvls; i++) - levelcnt[i] = num_rcu_lvl[i]; for (i = 1; i < rcu_num_lvls; i++) - rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; - rcu_init_levelspread(levelspread, levelcnt); - rsp->flavor_mask = fl_mask; - fl_mask <<= 1; + rsp->level[i] = rsp->level[i - 1] + num_rcu_lvl[i - 1]; + rcu_init_levelspread(levelspread, num_rcu_lvl); /* Initialize the elements themselves, starting from the leaves. 
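rcu_init_levelspread(), deleted below as part of the same consolidation, likewise moves to shared code rather than disappearing. Its balanced branch spreads the CPUs as evenly as possible across each level of the rcu_node tree; a standalone rendition with a worked example (96 CPUs over one root and six leaves, values chosen purely for illustration):

#include <stdio.h>

/* Standalone rendition of the balanced branch of rcu_init_levelspread():
 * divide the CPUs as evenly as possible at each level, leaves first. */
static void init_levelspread(int *levelspread, const int *levelcnt,
			     int num_lvls, int nr_cpu_ids)
{
	int cprv = nr_cpu_ids;

	for (int i = num_lvls - 1; i >= 0; i--) {
		int ccur = levelcnt[i];

		levelspread[i] = (cprv + ccur - 1) / ccur; /* ceiling div */
		cprv = ccur;
	}
}

int main(void)
{
	int levelcnt[2] = { 1, 6 };	/* one root, six leaf rcu_nodes */
	int levelspread[2];

	init_levelspread(levelspread, levelcnt, 2, 96);
	printf("root fanout %d, leaf fanout %d\n",
	       levelspread[0], levelspread[1]);	/* prints 6 and 16 */
	return 0;
}

The ceiling division runs from the leaves up: (96 + 6 - 1) / 6 = 16 CPUs per leaf, then (6 + 1 - 1) / 1 = 6 leaves under the root.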
*/ for (i = rcu_num_lvls - 1; i >= 0; i--) { cpustride *= levelspread[i]; rnp = rsp->level[i]; - for (j = 0; j < levelcnt[i]; j++, rnp++) { + for (j = 0; j < num_rcu_lvl[i]; j++, rnp++) { raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), &rcu_node_class[i], buf[i]); @@ -4332,6 +4218,8 @@ void __init rcu_init(void) for_each_online_cpu(cpu) { rcutree_prepare_cpu(cpu); rcu_cpu_starting(cpu); + if (IS_ENABLED(CONFIG_TREE_SRCU)) + srcu_online_cpu(cpu); } } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index ec62a05bfdb3c81b9a82593de2aeefffb96bf300..ba38262c3554495c59b0aacba52ba245ba108d53 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -30,80 +30,9 @@ #include #include #include +#include -/* - * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and - * CONFIG_RCU_FANOUT_LEAF. - * In theory, it should be possible to add more levels straightforwardly. - * In practice, this did work well going from three levels to four. - * Of course, your mileage may vary. - */ - -#ifdef CONFIG_RCU_FANOUT -#define RCU_FANOUT CONFIG_RCU_FANOUT -#else /* #ifdef CONFIG_RCU_FANOUT */ -# ifdef CONFIG_64BIT -# define RCU_FANOUT 64 -# else -# define RCU_FANOUT 32 -# endif -#endif /* #else #ifdef CONFIG_RCU_FANOUT */ - -#ifdef CONFIG_RCU_FANOUT_LEAF -#define RCU_FANOUT_LEAF CONFIG_RCU_FANOUT_LEAF -#else /* #ifdef CONFIG_RCU_FANOUT_LEAF */ -# ifdef CONFIG_64BIT -# define RCU_FANOUT_LEAF 64 -# else -# define RCU_FANOUT_LEAF 32 -# endif -#endif /* #else #ifdef CONFIG_RCU_FANOUT_LEAF */ - -#define RCU_FANOUT_1 (RCU_FANOUT_LEAF) -#define RCU_FANOUT_2 (RCU_FANOUT_1 * RCU_FANOUT) -#define RCU_FANOUT_3 (RCU_FANOUT_2 * RCU_FANOUT) -#define RCU_FANOUT_4 (RCU_FANOUT_3 * RCU_FANOUT) - -#if NR_CPUS <= RCU_FANOUT_1 -# define RCU_NUM_LVLS 1 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_NODES NUM_RCU_LVL_0 -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0 } -# define RCU_NODE_NAME_INIT { "rcu_node_0" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0" } -#elif NR_CPUS <= RCU_FANOUT_2 -# define RCU_NUM_LVLS 2 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1" } -#elif NR_CPUS <= RCU_FANOUT_3 -# define RCU_NUM_LVLS 3 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2" } -#elif NR_CPUS <= RCU_FANOUT_4 -# define RCU_NUM_LVLS 4 -# define NUM_RCU_LVL_0 1 -# define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) -# define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) -# define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) -# define NUM_RCU_NODES (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) -# define NUM_RCU_LVL_INIT { NUM_RCU_LVL_0, NUM_RCU_LVL_1, NUM_RCU_LVL_2, NUM_RCU_LVL_3 } -# define RCU_NODE_NAME_INIT { "rcu_node_0", "rcu_node_1", "rcu_node_2", "rcu_node_3" } -# define RCU_FQS_NAME_INIT { "rcu_node_fqs_0", "rcu_node_fqs_1", "rcu_node_fqs_2", "rcu_node_fqs_3" } -#else -# error 
"CONFIG_RCU_FANOUT insufficient for NR_CPUS" -#endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ - -extern int rcu_num_lvls; -extern int rcu_num_nodes; +#include "rcu_segcblist.h" /* * Dynticks per-CPU state. @@ -113,6 +42,9 @@ struct rcu_dynticks { /* Process level is worth LLONG_MAX/2. */ int dynticks_nmi_nesting; /* Track NMI nesting level. */ atomic_t dynticks; /* Even value for idle, else odd. */ + bool rcu_need_heavy_qs; /* GP old, need heavy quiescent state. */ + unsigned long rcu_qs_ctr; /* Light universal quiescent state ctr. */ + bool rcu_urgent_qs; /* GP old need light quiescent state. */ #ifdef CONFIG_NO_HZ_FULL_SYSIDLE long long dynticks_idle_nesting; /* irq/process nesting level from idle. */ @@ -261,41 +193,6 @@ struct rcu_node { */ #define leaf_node_cpu_bit(rnp, cpu) (1UL << ((cpu) - (rnp)->grplo)) -/* - * Do a full breadth-first scan of the rcu_node structures for the - * specified rcu_state structure. - */ -#define rcu_for_each_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) - -/* - * Do a breadth-first scan of the non-leaf rcu_node structures for the - * specified rcu_state structure. Note that if there is a singleton - * rcu_node tree with but one rcu_node structure, this loop is a no-op. - */ -#define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ - for ((rnp) = &(rsp)->node[0]; \ - (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) - -/* - * Scan the leaves of the rcu_node hierarchy for the specified rcu_state - * structure. Note that if there is a singleton rcu_node tree with but - * one rcu_node structure, this loop -will- visit the rcu_node structure. - * It is still a leaf node, even if it is also the root node. - */ -#define rcu_for_each_leaf_node(rsp, rnp) \ - for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ - (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) - -/* - * Iterate over all possible CPUs in a leaf RCU node. - */ -#define for_each_leaf_node_possible_cpu(rnp, cpu) \ - for ((cpu) = cpumask_next(rnp->grplo - 1, cpu_possible_mask); \ - cpu <= rnp->grphi; \ - cpu = cpumask_next((cpu), cpu_possible_mask)) - /* * Union to allow "aggregate OR" operation on the need for a quiescent * state by the normal and expedited grace periods. @@ -336,34 +233,9 @@ struct rcu_data { /* period it is aware of. */ /* 2) batch handling */ - /* - * If nxtlist is not NULL, it is partitioned as follows. - * Any of the partitions might be empty, in which case the - * pointer to that partition will be equal to the pointer for - * the following partition. When the list is empty, all of - * the nxttail elements point to the ->nxtlist pointer itself, - * which in that case is NULL. - * - * [nxtlist, *nxttail[RCU_DONE_TAIL]): - * Entries that batch # <= ->completed - * The grace period for these entries has completed, and - * the other grace-period-completed entries may be moved - * here temporarily in rcu_process_callbacks(). - * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): - * Entries that batch # <= ->completed - 1: waiting for current GP - * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): - * Entries known to have arrived before current GP ended - * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): - * Entries that might have arrived after current GP ended - * Note that the value of *nxttail[RCU_NEXT_TAIL] will - * always be NULL, as this is the end of the list. 
- */ - struct rcu_head *nxtlist; - struct rcu_head **nxttail[RCU_NEXT_SIZE]; - unsigned long nxtcompleted[RCU_NEXT_SIZE]; - /* grace periods for sublists. */ - long qlen_lazy; /* # of lazy queued callbacks */ - long qlen; /* # of queued callbacks, incl lazy */ + struct rcu_segcblist cblist; /* Segmented callback list, with */ + /* different callbacks waiting for */ + /* different grace periods. */ long qlen_last_fqs_check; /* qlen at last check for QS forcing */ unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ @@ -482,7 +354,6 @@ struct rcu_state { struct rcu_node *level[RCU_NUM_LVLS + 1]; /* Hierarchy levels (+1 to */ /* shut bogus gcc warning) */ - u8 flavor_mask; /* bit in flavor mask. */ struct rcu_data __percpu *rda; /* pointer of percu rcu_data. */ call_rcu_func_t call; /* call_rcu() flavor. */ int ncpus; /* # CPUs seen so far. */ @@ -502,14 +373,11 @@ struct rcu_state { raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; /* Protect following fields. */ - struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ + struct rcu_cblist orphan_pend; /* Orphaned callbacks that */ /* need a grace period. */ - struct rcu_head **orphan_nxttail; /* Tail of above. */ - struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ + struct rcu_cblist orphan_done; /* Orphaned callbacks that */ /* are ready to invoke. */ - struct rcu_head **orphan_donetail; /* Tail of above. */ - long qlen_lazy; /* Number of lazy callbacks. */ - long qlen; /* Total number of callbacks. */ + /* (Contains counts.) */ /* End of fields guarded by orphan_lock. */ struct mutex barrier_mutex; /* Guards barrier fields. */ @@ -596,6 +464,7 @@ extern struct rcu_state rcu_preempt_state; #endif /* #ifdef CONFIG_PREEMPT_RCU */ int rcu_dynticks_snap(struct rcu_dynticks *rdtp); +bool rcu_eqs_special_set(int cpu); #ifdef CONFIG_RCU_BOOST DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); @@ -673,6 +542,14 @@ static bool rcu_nohz_full_cpu(struct rcu_state *rsp); static void rcu_dynticks_task_enter(void); static void rcu_dynticks_task_exit(void); +#ifdef CONFIG_SRCU +void srcu_online_cpu(unsigned int cpu); +void srcu_offline_cpu(unsigned int cpu); +#else /* #ifdef CONFIG_SRCU */ +void srcu_online_cpu(unsigned int cpu) { } +void srcu_offline_cpu(unsigned int cpu) { } +#endif /* #else #ifdef CONFIG_SRCU */ + #endif /* #ifndef RCU_TREE_NONCORE */ #ifdef CONFIG_RCU_TRACE diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index a7b639ccd46e0ade81946639ef1d50bdc2b68d21..e513b4ab119769488419eba9bff7601f1d556182 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -292,7 +292,7 @@ static bool exp_funnel_lock(struct rcu_state *rsp, unsigned long s) trace_rcu_exp_funnel_lock(rsp->name, rnp->level, rnp->grplo, rnp->grphi, TPS("wait")); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], + wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], sync_exp_work_done(rsp, &rdp->exp_workdone2, s)); return true; @@ -331,6 +331,8 @@ static void sync_sched_exp_handler(void *data) return; } __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, true); + /* Store .exp before .rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); resched_cpu(smp_processor_id()); } @@ -531,7 +533,8 @@ static void rcu_exp_wait_wake(struct rcu_state *rsp, unsigned long s) rnp->exp_seq_rq = s; spin_unlock(&rnp->exp_lock); } - wake_up_all(&rnp->exp_wq[(rsp->expedited_sequence >> 1) & 0x3]); + smp_mb(); /* All above changes before wakeup. 
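The smp_store_release() calls added in the tree_exp.h hunk implement exactly the ordering their comments describe: the payload flag (.exp) is written first, then the per-CPU rcu_urgent_qs flag is released, so any reader that acquires rcu_urgent_qs and observes true also observes the payload. A C11 message-passing model of that pairing, with userspace stand-ins for the kernel primitives:

#include <stdatomic.h>
#include <stdbool.h>

static bool cpu_no_qs_exp;		/* payload */
static _Atomic bool rcu_urgent_qs;	/* flag   */

static void report_exp_qs_needed(void)
{
	cpu_no_qs_exp = true;				/* payload first */
	atomic_store_explicit(&rcu_urgent_qs, true,
			      memory_order_release);	/* then the flag */
}

static void scheduler_side(void)
{
	if (atomic_load_explicit(&rcu_urgent_qs, memory_order_acquire)) {
		/* Guaranteed to observe cpu_no_qs_exp == true here,
		 * as with smp_load_acquire() on the kernel side. */
	}
}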
*/ + wake_up_all(&rnp->exp_wq[rcu_seq_ctr(rsp->expedited_sequence) & 0x3]); } trace_rcu_exp_grace_period(rsp->name, s, TPS("endwake")); mutex_unlock(&rsp->exp_wake_mutex); @@ -609,9 +612,9 @@ static void _synchronize_rcu_expedited(struct rcu_state *rsp, /* Wait for expedited grace period to complete. */ rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); rnp = rcu_get_root(rsp); - wait_event(rnp->exp_wq[(s >> 1) & 0x3], - sync_exp_work_done(rsp, - &rdp->exp_workdone0, s)); + wait_event(rnp->exp_wq[rcu_seq_ctr(s) & 0x3], + sync_exp_work_done(rsp, &rdp->exp_workdone0, s)); + smp_mb(); /* Workqueue actions happen before return. */ /* Let the next expedited grace period start. */ mutex_unlock(&rsp->exp_mutex); @@ -735,15 +738,3 @@ void synchronize_rcu_expedited(void) EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ - -/* - * Switch to run-time mode once Tree RCU has fully initialized. - */ -static int __init rcu_exp_runtime_mode(void) -{ - rcu_test_sync_prims(); - rcu_scheduler_active = RCU_SCHEDULER_RUNNING; - rcu_test_sync_prims(); - return 0; -} -core_initcall(rcu_exp_runtime_mode); diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 0a62a8f1caacfab8a2a39c729f70e445f8048cbf..c9a48657512ae35a833d6cc1c56ff9687a899b17 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -1350,10 +1350,10 @@ static bool __maybe_unused rcu_try_advance_all_cbs(void) */ if ((rdp->completed != rnp->completed || unlikely(READ_ONCE(rdp->gpwrap))) && - rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) + rcu_segcblist_pend_cbs(&rdp->cblist)) note_gp_changes(rsp, rdp); - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (rcu_segcblist_ready_cbs(&rdp->cblist)) cbs_ready = true; } return cbs_ready; @@ -1461,7 +1461,7 @@ static void rcu_prepare_for_idle(void) rdtp->last_accelerate = jiffies; for_each_rcu_flavor(rsp) { rdp = this_cpu_ptr(rsp->rda); - if (!*rdp->nxttail[RCU_DONE_TAIL]) + if (rcu_segcblist_pend_cbs(&rdp->cblist)) continue; rnp = rdp->mynode; raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ @@ -1529,7 +1529,7 @@ static void rcu_oom_notify_cpu(void *unused) for_each_rcu_flavor(rsp) { rdp = raw_cpu_ptr(rsp->rda); - if (rdp->qlen_lazy != 0) { + if (rcu_segcblist_n_lazy_cbs(&rdp->cblist)) { atomic_inc(&oom_callback_count); rsp->call(&rdp->oom_head, rcu_oom_callback); } @@ -1709,7 +1709,7 @@ __setup("rcu_nocbs=", rcu_nocb_setup); static int __init parse_rcu_nocb_poll(char *arg) { - rcu_nocb_poll = 1; + rcu_nocb_poll = true; return 0; } early_param("rcu_nocb_poll", parse_rcu_nocb_poll); @@ -1860,7 +1860,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); } else { - rdp->nocb_defer_wakeup = RCU_NOGP_WAKE; + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE); + /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmptyIsDeferred")); } @@ -1872,7 +1874,9 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); } else { - rdp->nocb_defer_wakeup = RCU_NOGP_WAKE_FORCE; + WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOGP_WAKE_FORCE); + /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. 
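The exp_wq indexing changes above swap the open-coded (s >> 1) & 0x3 for rcu_seq_ctr(s) & 0x3. With the single sequence-state bit in use at this point the two are identical; the helper simply hides the shift width so the wait-queue hashing keeps working if the state field later widens. A sketch of the mapping, assuming the one-bit layout:

#define RCU_SEQ_CTR_SHIFT 1	/* low bit: update in progress */

static inline unsigned long rcu_seq_ctr(unsigned long s)
{
	return s >> RCU_SEQ_CTR_SHIFT;	/* the count proper */
}

/* Waiters for expedited grace period "s" sleep on one of four queues,
 * which suffices for the handful that can be in flight at once: */
static inline int exp_wq_index(unsigned long s)
{
	return (int)(rcu_seq_ctr(s) & 0x3);
}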
*/ + smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true); trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvfIsDeferred")); } @@ -1930,30 +1934,26 @@ static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, struct rcu_data *rdp, unsigned long flags) { - long ql = rsp->qlen; - long qll = rsp->qlen_lazy; + long ql = rsp->orphan_done.len; + long qll = rsp->orphan_done.len_lazy; /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ if (!rcu_is_nocb_cpu(smp_processor_id())) return false; - rsp->qlen = 0; - rsp->qlen_lazy = 0; /* First, enqueue the donelist, if any. This preserves CB ordering. */ - if (rsp->orphan_donelist != NULL) { - __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, - rsp->orphan_donetail, ql, qll, flags); - ql = qll = 0; - rsp->orphan_donelist = NULL; - rsp->orphan_donetail = &rsp->orphan_donelist; + if (rsp->orphan_done.head) { + __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), + rcu_cblist_tail(&rsp->orphan_done), + ql, qll, flags); } - if (rsp->orphan_nxtlist != NULL) { - __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, - rsp->orphan_nxttail, ql, qll, flags); - ql = qll = 0; - rsp->orphan_nxtlist = NULL; - rsp->orphan_nxttail = &rsp->orphan_nxtlist; + if (rsp->orphan_pend.head) { + __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend), + rcu_cblist_tail(&rsp->orphan_pend), + ql, qll, flags); } + rcu_cblist_init(&rsp->orphan_done); + rcu_cblist_init(&rsp->orphan_pend); return true; } @@ -2395,16 +2395,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp) return false; /* If there are early-boot callbacks, move them to nocb lists. */ - if (rdp->nxtlist) { - rdp->nocb_head = rdp->nxtlist; - rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL]; - atomic_long_set(&rdp->nocb_q_count, rdp->qlen); - atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy); - rdp->nxtlist = NULL; - rdp->qlen = 0; - rdp->qlen_lazy = 0; + if (!rcu_segcblist_empty(&rdp->cblist)) { + rdp->nocb_head = rcu_segcblist_head(&rdp->cblist); + rdp->nocb_tail = rcu_segcblist_tail(&rdp->cblist); + atomic_long_set(&rdp->nocb_q_count, + rcu_segcblist_n_cbs(&rdp->cblist)); + atomic_long_set(&rdp->nocb_q_count_lazy, + rcu_segcblist_n_lazy_cbs(&rdp->cblist)); + rcu_segcblist_init(&rdp->cblist); } - rdp->nxttail[RCU_NEXT_TAIL] = NULL; + rcu_segcblist_disable(&rdp->cblist); return true; } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index 8751a748499a3d3a93419fe5273bbf4876aa2149..6cea17a1ea301f23180126504acf9d84d27e16f6 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -41,11 +41,11 @@ #include #include #include +#include #define RCU_TREE_NONCORE #include "tree.h" - -DECLARE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); +#include "rcu.h" static int r_open(struct inode *inode, struct file *file, const struct seq_operations *op) @@ -121,7 +121,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) cpu_is_offline(rdp->cpu) ? '!' 
: ' ', ulong2long(rdp->completed), ulong2long(rdp->gpnum), rdp->cpu_no_qs.b.norm, - rdp->rcu_qs_ctr_snap == per_cpu(rcu_qs_ctr, rdp->cpu), + rdp->rcu_qs_ctr_snap == per_cpu(rdp->dynticks->rcu_qs_ctr, rdp->cpu), rdp->core_needs_qs); seq_printf(m, " dt=%d/%llx/%d df=%lu", rcu_dynticks_snap(rdp->dynticks), @@ -130,17 +130,15 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) rdp->dynticks_fqs); seq_printf(m, " of=%lu", rdp->offline_fqs); rcu_nocb_q_lengths(rdp, &ql, &qll); - qll += rdp->qlen_lazy; - ql += rdp->qlen; + qll += rcu_segcblist_n_lazy_cbs(&rdp->cblist); + ql += rcu_segcblist_n_cbs(&rdp->cblist); seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", qll, ql, - ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != - rdp->nxttail[RCU_NEXT_TAIL]], - ".R"[rdp->nxttail[RCU_WAIT_TAIL] != - rdp->nxttail[RCU_NEXT_READY_TAIL]], - ".W"[rdp->nxttail[RCU_DONE_TAIL] != - rdp->nxttail[RCU_WAIT_TAIL]], - ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); + ".N"[!rcu_segcblist_segempty(&rdp->cblist, RCU_NEXT_TAIL)], + ".R"[!rcu_segcblist_segempty(&rdp->cblist, + RCU_NEXT_READY_TAIL)], + ".W"[!rcu_segcblist_segempty(&rdp->cblist, RCU_WAIT_TAIL)], + ".D"[!rcu_segcblist_segempty(&rdp->cblist, RCU_DONE_TAIL)]); #ifdef CONFIG_RCU_BOOST seq_printf(m, " kt=%d/%c ktl=%x", per_cpu(rcu_cpu_has_work, rdp->cpu), @@ -278,7 +276,9 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", rsp->n_force_qs, rsp->n_force_qs_ngp, rsp->n_force_qs - rsp->n_force_qs_ngp, - READ_ONCE(rsp->n_force_qs_lh), rsp->qlen_lazy, rsp->qlen); + READ_ONCE(rsp->n_force_qs_lh), + rsp->orphan_done.len_lazy, + rsp->orphan_done.len); for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { if (rnp->level != level) { seq_puts(m, "\n"); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 55c8530316c7ce6077df9814cc4c7997f7282486..273e869ca21d546b5457987bd676f70c08bba5f3 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -124,7 +124,7 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); * non-expedited counterparts? Intended for use within RCU. Note * that if the user specifies both rcu_expedited and rcu_normal, then * rcu_normal wins. (Except during the time period during boot from - * when the first task is spawned until the rcu_exp_runtime_mode() + * when the first task is spawned until the rcu_set_runtime_mode() * core_initcall() is invoked, at which point everything is expedited.) */ bool rcu_gp_is_normal(void) @@ -190,6 +190,39 @@ void rcu_end_inkernel_boot(void) #endif /* #ifndef CONFIG_TINY_RCU */ +/* + * Test each non-SRCU synchronous grace-period wait API. This is + * useful just after a change in mode for these primitives, and + * during early boot. + */ +void rcu_test_sync_prims(void) +{ + if (!IS_ENABLED(CONFIG_PROVE_RCU)) + return; + synchronize_rcu(); + synchronize_rcu_bh(); + synchronize_sched(); + synchronize_rcu_expedited(); + synchronize_rcu_bh_expedited(); + synchronize_sched_expedited(); +} + +#if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) + +/* + * Switch to run-time mode once RCU has fully initialized. 
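print_one_rcu_data() above leans on a terse C idiom: indexing a two-character string literal with a 0/1 condition, so ".R"[cond] evaluates to '.' when the condition is false and 'R' when it is true. A self-contained demonstration:

#include <stdio.h>

int main(void)
{
	int ready = 1, waiting = 0;

	/* The idiom behind the qs=%c%c%c%c flags above. */
	printf("qs=%c%c\n", ".R"[ready], ".W"[waiting]);  /* "qs=R." */
	return 0;
}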
+ */ +static int __init rcu_set_runtime_mode(void) +{ + rcu_test_sync_prims(); + rcu_scheduler_active = RCU_SCHEDULER_RUNNING; + rcu_test_sync_prims(); + return 0; +} +core_initcall(rcu_set_runtime_mode); + +#endif /* #if !defined(CONFIG_TINY_RCU) || defined(CONFIG_SRCU) */ + #ifdef CONFIG_PREEMPT_RCU /* @@ -632,6 +665,7 @@ static void check_holdout_task(struct task_struct *t, put_task_struct(t); return; } + rcu_request_urgent_qs_task(t); if (!needreport) return; if (*firstreport) { @@ -817,23 +851,6 @@ static void rcu_spawn_tasks_kthread(void) #endif /* #ifdef CONFIG_TASKS_RCU */ -/* - * Test each non-SRCU synchronous grace-period wait API. This is - * useful just after a change in mode for these primitives, and - * during early boot. - */ -void rcu_test_sync_prims(void) -{ - if (!IS_ENABLED(CONFIG_PROVE_RCU)) - return; - synchronize_rcu(); - synchronize_rcu_bh(); - synchronize_sched(); - synchronize_rcu_expedited(); - synchronize_rcu_bh_expedited(); - synchronize_sched_expedited(); -} - #ifdef CONFIG_PROVE_RCU /* diff --git a/kernel/relay.c b/kernel/relay.c index 0e413d9eec8af484517300e1c142325356865771..39a9dfc69486b57a85d115dcefee75d35907a74d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1212,7 +1212,6 @@ static ssize_t subbuf_splice_actor(struct file *in, .nr_pages = 0, .nr_pages_max = PIPE_DEF_BUFFERS, .partial = partial, - .flags = flags, .ops = &relay_pipe_buf_ops, .spd_release = relay_page_release, }; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3b31fc05a0f1e45be5985b860a5fde95ee969832..759f4bd52cd6b3724b858d41f57ff305a3747c8b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -85,21 +85,6 @@ int sysctl_sched_rt_runtime = 950000; /* CPUs with isolated domains */ cpumask_var_t cpu_isolated_map; -/* - * this_rq_lock - lock this runqueue and disable interrupts. - */ -static struct rq *this_rq_lock(void) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_disable(); - rq = this_rq(); - raw_spin_lock(&rq->lock); - - return rq; -} - /* * __task_rq_lock - lock the rq @p resides on. 
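From here on, the scheduler hunks convert raw_spin_lock(&rq->lock) plus rq_pin_lock() pairs into rq_lock()-family wrappers. The wrappers, which this series defines in kernel/sched/sched.h, pair the raw lock operation with lockdep pinning along these lines (a sketch matching the conversions visible below, not a full listing):

static inline void rq_lock(struct rq *rq, struct rq_flags *rf)
{
	raw_spin_lock(&rq->lock);
	rq_pin_lock(rq, rf);		/* pin: no stealth unlocks */
}

static inline void rq_unlock(struct rq *rq, struct rq_flags *rf)
{
	rq_unpin_lock(rq, rf);
	raw_spin_unlock(&rq->lock);
}

static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
{
	raw_spin_lock_irqsave(&rq->lock, rf->flags);
	rq_pin_lock(rq, rf);
}

Pinning catches code that unlocks an rq it was handed while pinned; folding it into the lock wrapper makes it impossible to take one without the other.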
*/ @@ -233,8 +218,11 @@ void update_rq_clock(struct rq *rq) return; #ifdef CONFIG_SCHED_DEBUG + if (sched_feat(WARN_DOUBLE_CLOCK)) + SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED); rq->clock_update_flags |= RQCF_UPDATED; #endif + delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; if (delta < 0) return; @@ -261,13 +249,14 @@ static void hrtick_clear(struct rq *rq) static enum hrtimer_restart hrtick(struct hrtimer *timer) { struct rq *rq = container_of(timer, struct rq, hrtick_timer); + struct rq_flags rf; WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); return HRTIMER_NORESTART; } @@ -287,11 +276,12 @@ static void __hrtick_restart(struct rq *rq) static void __hrtick_start(void *arg) { struct rq *rq = arg; + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); __hrtick_restart(rq); rq->hrtick_csd_pending = 0; - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -762,17 +752,23 @@ static void set_load_weight(struct task_struct *p) static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) { - update_rq_clock(rq); + if (!(flags & ENQUEUE_NOCLOCK)) + update_rq_clock(rq); + if (!(flags & ENQUEUE_RESTORE)) sched_info_queued(rq, p); + p->sched_class->enqueue_task(rq, p, flags); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) { - update_rq_clock(rq); + if (!(flags & DEQUEUE_NOCLOCK)) + update_rq_clock(rq); + if (!(flags & DEQUEUE_SAVE)) sched_info_dequeued(rq, p); + p->sched_class->dequeue_task(rq, p, flags); } @@ -946,18 +942,19 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * * Returns (locked) new rq. Old rq's lock is released. */ -static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu) +static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int new_cpu) { lockdep_assert_held(&rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; - dequeue_task(rq, p, 0); + dequeue_task(rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, new_cpu); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); rq = cpu_rq(new_cpu); - raw_spin_lock(&rq->lock); + rq_lock(rq, rf); BUG_ON(task_cpu(p) != new_cpu); enqueue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; @@ -980,7 +977,8 @@ struct migration_arg { * So we race with normal scheduler movements, but that's OK, as long * as the task is no longer on this CPU. 
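The new ENQUEUE_NOCLOCK/DEQUEUE_NOCLOCK flags above let a caller that has already updated the rq clock suppress the helpers' own update, which both saves a sched_clock_cpu() call and keeps the new WARN_DOUBLE_CLOCK debug check quiet. A hypothetical caller showing the intended protocol:

/* Hypothetical caller: update the clock exactly once at the top of
 * the locked section, then pass the NOCLOCK variants so that
 * enqueue_task()/dequeue_task() skip their own update. */
static void requeue_locked(struct rq *rq, struct task_struct *p)
{
	struct rq_flags rf;

	rq_lock(rq, &rf);
	update_rq_clock(rq);		/* exactly once per section */
	dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
	/* ... change p's scheduling attributes here ... */
	enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
	rq_unlock(rq, &rf);
}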
*/ -static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu) +static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, + struct task_struct *p, int dest_cpu) { if (unlikely(!cpu_active(dest_cpu))) return rq; @@ -989,7 +987,8 @@ static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_ if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) return rq; - rq = move_queued_task(rq, p, dest_cpu); + update_rq_clock(rq); + rq = move_queued_task(rq, rf, p, dest_cpu); return rq; } @@ -1004,6 +1003,7 @@ static int migration_cpu_stop(void *data) struct migration_arg *arg = data; struct task_struct *p = arg->task; struct rq *rq = this_rq(); + struct rq_flags rf; /* * The original target CPU might have gone down and we might @@ -1018,7 +1018,7 @@ static int migration_cpu_stop(void *data) sched_ttwu_pending(); raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because @@ -1026,11 +1026,11 @@ static int migration_cpu_stop(void *data) */ if (task_rq(p) == rq) { if (task_on_rq_queued(p)) - rq = __migrate_task(rq, p, arg->dest_cpu); + rq = __migrate_task(rq, &rf, p, arg->dest_cpu); else p->wake_cpu = arg->dest_cpu; } - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); raw_spin_unlock(&p->pi_lock); local_irq_enable(); @@ -1063,7 +1063,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * holding rq->lock. */ lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); } if (running) put_prev_task(rq, p); @@ -1071,7 +1071,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) set_curr_task(rq, p); } @@ -1150,9 +1150,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * OK, since we're going to drop the lock immediately * afterwards anyway. 
*/ - rq_unpin_lock(rq, &rf); - rq = move_queued_task(rq, p, dest_cpu); - rq_repin_lock(rq, &rf); + rq = move_queued_task(rq, &rf, p, dest_cpu); } out: task_rq_unlock(rq, p, &rf); @@ -1217,16 +1215,24 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) { if (task_on_rq_queued(p)) { struct rq *src_rq, *dst_rq; + struct rq_flags srf, drf; src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + rq_pin_lock(src_rq, &srf); + rq_pin_lock(dst_rq, &drf); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); + + rq_unpin_lock(dst_rq, &drf); + rq_unpin_lock(src_rq, &srf); + } else { /* * Task isn't running anymore; make it appear like we migrated @@ -1680,7 +1686,7 @@ static void ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, struct rq_flags *rf) { - int en_flags = ENQUEUE_WAKEUP; + int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK; lockdep_assert_held(&rq->lock); @@ -1726,14 +1732,13 @@ void sched_ttwu_pending(void) struct rq *rq = this_rq(); struct llist_node *llist = llist_del_all(&rq->wake_list); struct task_struct *p; - unsigned long flags; struct rq_flags rf; if (!llist) return; - raw_spin_lock_irqsave(&rq->lock, flags); - rq_pin_lock(rq, &rf); + rq_lock_irqsave(rq, &rf); + update_rq_clock(rq); while (llist) { int wake_flags = 0; @@ -1747,8 +1752,7 @@ void sched_ttwu_pending(void) ttwu_do_activate(rq, p, wake_flags, &rf); } - rq_unpin_lock(rq, &rf); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } void scheduler_ipi(void) @@ -1806,7 +1810,7 @@ static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) void wake_up_if_idle(int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; rcu_read_lock(); @@ -1816,11 +1820,11 @@ void wake_up_if_idle(int cpu) if (set_nr_if_polling(rq->idle)) { trace_sched_wake_idle_without_ipi(cpu); } else { - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); if (is_idle_task(rq->curr)) smp_send_reschedule(cpu); /* Else CPU is not idle, do nothing here: */ - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } out: @@ -1846,11 +1850,10 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) } #endif - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, &rf); + rq_lock(rq, &rf); + update_rq_clock(rq); ttwu_do_activate(rq, p, wake_flags, &rf); - rq_unpin_lock(rq, &rf); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -2097,11 +2100,9 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) * disabled avoiding further scheduler activity on it and we've * not yet picked a replacement task. 
*/ - rq_unpin_lock(rq, rf); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); raw_spin_lock(&p->pi_lock); - raw_spin_lock(&rq->lock); - rq_repin_lock(rq, rf); + rq_relock(rq, rf); } if (!(p->state & TASK_NORMAL)) @@ -2114,7 +2115,7 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf) delayacct_blkio_end(); atomic_dec(&rq->nr_iowait); } - ttwu_activate(rq, p, ENQUEUE_WAKEUP); + ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK); } ttwu_do_wakeup(rq, p, 0, rf); @@ -2555,7 +2556,7 @@ void wake_up_new_task(struct task_struct *p) update_rq_clock(rq); post_init_entity_util_avg(&p->se); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -3093,15 +3094,18 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; + struct rq_flags rf; sched_clock_tick(); - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); + update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); cpu_load_update_active(rq); calc_global_load_tick(rq); - raw_spin_unlock(&rq->lock); + + rq_unlock(rq, &rf); perf_event_task_tick(); @@ -3378,7 +3382,7 @@ static void __sched notrace __schedule(bool preempt) hrtick_clear(rq); local_irq_disable(); - rcu_note_context_switch(); + rcu_note_context_switch(preempt); /* * Make sure that signal_pending_state()->signal_pending() below @@ -3386,18 +3390,18 @@ static void __sched notrace __schedule(bool preempt) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock(&rq->lock); - rq_pin_lock(rq, &rf); + rq_lock(rq, &rf); /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; + update_rq_clock(rq); switch_count = &prev->nivcsw; if (!preempt && prev->state) { if (unlikely(signal_pending_state(prev->state, prev))) { prev->state = TASK_RUNNING; } else { - deactivate_task(rq, prev, DEQUEUE_SLEEP); + deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); prev->on_rq = 0; if (prev->in_iowait) { @@ -3421,9 +3425,6 @@ static void __sched notrace __schedule(bool preempt) switch_count = &prev->nvcsw; } - if (task_on_rq_queued(prev)) - update_rq_clock(rq); - next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); clear_preempt_need_resched(); @@ -3439,8 +3440,7 @@ static void __sched notrace __schedule(bool preempt) rq = context_switch(rq, prev, next, &rf); } else { rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - rq_unpin_lock(rq, &rf); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } balance_callback(rq); @@ -3671,10 +3671,25 @@ EXPORT_SYMBOL(default_wake_function); #ifdef CONFIG_RT_MUTEXES +static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) +{ + if (pi_task) + prio = min(prio, pi_task->prio); + + return prio; +} + +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + struct task_struct *pi_task = rt_mutex_get_top_task(p); + + return __rt_effective_prio(pi_task, prio); +} + /* * rt_mutex_setprio - set the current priority of a task - * @p: task - * @prio: prio value (kernel-internal form) + * @p: task to boost + * @pi_task: donor task * * This function changes the 'effective' priority of a task. It does * not touch ->normal_prio like __setscheduler(). @@ -3682,17 +3697,42 @@ EXPORT_SYMBOL(default_wake_function); * Used by the rt_mutex code to implement priority inheritance * logic. Call site only calls if the priority of the task changed. 
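try_to_wake_up_local() above performs a classic lock-ordering dance: p->pi_lock nests outside rq->lock, so code holding only the rq lock must drop it, take pi_lock, and then relock. Sketched as a hypothetical helper; anything read under the old lock must be revalidated after it returns:

static void take_pi_lock_then_relock(struct rq *rq, struct task_struct *p,
				     struct rq_flags *rf)
{
	rq_unlock(rq, rf);		/* can't take pi_lock inside rq->lock */
	raw_spin_lock(&p->pi_lock);	/* outer lock first */
	rq_relock(rq, rf);		/* inner lock again */
	/* rq and p state may have changed while unlocked: recheck. */
}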
*/ -void rt_mutex_setprio(struct task_struct *p, int prio) +void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) { - int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; + int prio, oldprio, queued, running, queue_flag = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; const struct sched_class *prev_class; struct rq_flags rf; struct rq *rq; - BUG_ON(prio > MAX_PRIO); + /* XXX used to be waiter->prio, not waiter->task->prio */ + prio = __rt_effective_prio(pi_task, p->normal_prio); + + /* + * If nothing changed; bail early. + */ + if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio)) + return; rq = __task_rq_lock(p, &rf); update_rq_clock(rq); + /* + * Set under pi_lock && rq->lock, such that the value can be used under + * either lock. + * + * Note that there is loads of tricky to make this pointer cache work + * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to + * ensure a task is de-boosted (pi_task is set to NULL) before the + * task is allowed to run again (and can exit). This ensures the pointer + * points to a blocked task -- which guaratees the task is present. + */ + p->pi_top_task = pi_task; + + /* + * For FIFO/RR we only need to set prio, if that matches we're done. + */ + if (prio == p->prio && !dl_prio(prio)) + goto out_unlock; /* * Idle task boosting is a nono in general. There is one @@ -3712,7 +3752,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) goto out_unlock; } - trace_sched_pi_setprio(p, prio); + trace_sched_pi_setprio(p, pi_task); oldprio = p->prio; if (oldprio == prio) @@ -3736,7 +3776,6 @@ void rt_mutex_setprio(struct task_struct *p, int prio) * running task */ if (dl_prio(prio)) { - struct task_struct *pi_task = rt_mutex_get_top_task(p); if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; @@ -3774,6 +3813,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio) balance_callback(rq); preempt_enable(); } +#else +static inline int rt_effective_prio(struct task_struct *p, int prio) +{ + return prio; +} #endif void set_user_nice(struct task_struct *p, long nice) @@ -3805,7 +3849,7 @@ void set_user_nice(struct task_struct *p, long nice) queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK); if (running) put_prev_task(rq, p); @@ -3816,7 +3860,7 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (queued) { - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4020,10 +4064,9 @@ static void __setscheduler(struct rq *rq, struct task_struct *p, * Keep a potential priority boosting if called from * sched_setscheduler(). 
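The refactored rt_mutex_setprio() above now receives the top priority-inheritance donor task and computes the effective priority as the minimum of the task's normal priority and the donor's (numerically lower means higher priority). A standalone restatement with a worked example:

#include <stdio.h>

/* Standalone restatement of __rt_effective_prio(): run at the higher
 * (numerically lower) of our own priority and our top donor's. */
static int rt_effective_prio(int normal_prio, const int *pi_top_prio)
{
	if (pi_top_prio && *pi_top_prio < normal_prio)
		return *pi_top_prio;
	return normal_prio;
}

int main(void)
{
	int waiter_prio = 10;	/* RT waiter donating its priority */

	printf("%d\n", rt_effective_prio(120, &waiter_prio));	/* 10  */
	printf("%d\n", rt_effective_prio(120, NULL));		/* 120 */
	return 0;
}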
*/ + p->prio = normal_prio(p); if (keep_boost) - p->prio = rt_mutex_get_effective_prio(p, normal_prio(p)); - else - p->prio = normal_prio(p); + p->prio = rt_effective_prio(p, p->prio); if (dl_prio(p->prio)) p->sched_class = &dl_sched_class; @@ -4126,7 +4169,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq_flags rf; int reset_on_fork; - int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq *rq; /* May grab non-irq protected spin_locks: */ @@ -4310,7 +4353,7 @@ static int __sched_setscheduler(struct task_struct *p, * the runqueue. This will be done when the task deboost * itself. */ - new_effective_prio = rt_mutex_get_effective_prio(p, newprio); + new_effective_prio = rt_effective_prio(p, newprio); if (new_effective_prio == oldprio) queue_flags &= ~DEQUEUE_MOVE; } @@ -4923,7 +4966,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, */ SYSCALL_DEFINE0(sched_yield) { - struct rq *rq = this_rq_lock(); + struct rq_flags rf; + struct rq *rq; + + local_irq_disable(); + rq = this_rq(); + rq_lock(rq, &rf); schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); @@ -4932,9 +4980,8 @@ SYSCALL_DEFINE0(sched_yield) * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ - __release(rq->lock); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); - do_raw_spin_unlock(&rq->lock); + preempt_disable(); + rq_unlock(rq, &rf); sched_preempt_enable_no_resched(); schedule(); @@ -5514,7 +5561,7 @@ void sched_setnuma(struct task_struct *p, int nid) p->numa_preferred_nid = nid; if (queued) - enqueue_task(rq, p, ENQUEUE_RESTORE); + enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) set_curr_task(rq, p); task_rq_unlock(rq, p, &rf); @@ -5579,11 +5626,11 @@ static struct task_struct fake_task = { * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(struct rq *dead_rq) +static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; - struct rq_flags rf; + struct rq_flags orf = *rf; int dest_cpu; /* @@ -5602,9 +5649,7 @@ static void migrate_tasks(struct rq *dead_rq) * class method both need to have an up-to-date * value of rq->clock[_task] */ - rq_pin_lock(rq, &rf); update_rq_clock(rq); - rq_unpin_lock(rq, &rf); for (;;) { /* @@ -5617,8 +5662,7 @@ static void migrate_tasks(struct rq *dead_rq) /* * pick_next_task() assumes pinned rq->lock: */ - rq_repin_lock(rq, &rf); - next = pick_next_task(rq, &fake_task, &rf); + next = pick_next_task(rq, &fake_task, rf); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); @@ -5631,10 +5675,9 @@ static void migrate_tasks(struct rq *dead_rq) * because !cpu_active at this point, which means load-balance * will not interfere. Also, stop-machine. */ - rq_unpin_lock(rq, &rf); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); raw_spin_lock(&next->pi_lock); - raw_spin_lock(&rq->lock); + rq_relock(rq, rf); /* * Since we're inside stop-machine, _nothing_ should have @@ -5648,12 +5691,12 @@ static void migrate_tasks(struct rq *dead_rq) /* Find suitable destination for @next, with force if needed. 
*/ dest_cpu = select_fallback_rq(dead_rq->cpu, next); - - rq = __migrate_task(rq, next, dest_cpu); + rq = __migrate_task(rq, rf, next, dest_cpu); if (rq != dead_rq) { - raw_spin_unlock(&rq->lock); + rq_unlock(rq, rf); rq = dead_rq; - raw_spin_lock(&rq->lock); + *rf = orf; + rq_relock(rq, rf); } raw_spin_unlock(&next->pi_lock); } @@ -5732,7 +5775,7 @@ static void cpuset_cpu_active(void) * cpuset configurations. */ } - cpuset_update_active_cpus(true); + cpuset_update_active_cpus(); } static int cpuset_cpu_inactive(unsigned int cpu) @@ -5755,7 +5798,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) if (overflow) return -EBUSY; - cpuset_update_active_cpus(false); + cpuset_update_active_cpus(); } else { num_cpus_frozen++; partition_sched_domains(1, NULL, NULL); @@ -5766,7 +5809,7 @@ static int cpuset_cpu_inactive(unsigned int cpu) int sched_cpu_activate(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; set_cpu_active(cpu, true); @@ -5784,12 +5827,12 @@ int sched_cpu_activate(unsigned int cpu) * 2) At runtime, if cpuset_cpu_active() fails to rebuild the * domains. */ - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_online(rq); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); update_max_interval(); @@ -5847,18 +5890,20 @@ int sched_cpu_starting(unsigned int cpu) int sched_cpu_dying(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); - unsigned long flags; + struct rq_flags rf; /* Handle pending wakeups and then migrate everything off */ sched_ttwu_pending(); - raw_spin_lock_irqsave(&rq->lock, flags); + + rq_lock_irqsave(rq, &rf); if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); } - migrate_tasks(rq); + migrate_tasks(rq, &rf); BUG_ON(rq->nr_running != 1); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); + calc_load_migrate(rq); update_max_interval(); nohz_balance_exit_idle(cpu); @@ -6412,7 +6457,8 @@ static void sched_change_group(struct task_struct *tsk, int type) */ void sched_move_task(struct task_struct *tsk) { - int queued, running; + int queued, running, queue_flags = + DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct rq_flags rf; struct rq *rq; @@ -6423,14 +6469,14 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); + dequeue_task(rq, tsk, queue_flags); if (running) put_prev_task(rq, tsk); sched_change_group(tsk, TASK_MOVE_GROUP); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); + enqueue_task(rq, tsk, queue_flags); if (running) set_curr_task(rq, tsk); @@ -7008,14 +7054,15 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota) for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; + struct rq_flags rf; - raw_spin_lock_irq(&rq->lock); + rq_lock_irq(rq, &rf); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); } if (runtime_was_enabled && !runtime_enabled) cfs_bandwidth_usage_dec(); diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 54c577578da6899160cf4a611e87a386a2fd7db2..76877a62b5fa374f0daa93f1180ce853ba32fc0c 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -61,6 +61,11 @@ 
struct sugov_cpu { unsigned long util; unsigned long max; unsigned int flags; + + /* The field below is for single-CPU policies only. */ +#ifdef CONFIG_NO_HZ_COMMON + unsigned long saved_idle_calls; +#endif }; static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu); @@ -93,22 +98,23 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time, { struct cpufreq_policy *policy = sg_policy->policy; + if (sg_policy->next_freq == next_freq) + return; + + if (sg_policy->next_freq > next_freq) + next_freq = (sg_policy->next_freq + next_freq) >> 1; + + sg_policy->next_freq = next_freq; sg_policy->last_freq_update_time = time; if (policy->fast_switch_enabled) { - if (sg_policy->next_freq == next_freq) { - trace_cpu_frequency(policy->cur, smp_processor_id()); - return; - } - sg_policy->next_freq = next_freq; next_freq = cpufreq_driver_fast_switch(policy, next_freq); if (next_freq == CPUFREQ_ENTRY_INVALID) return; policy->cur = next_freq; trace_cpu_frequency(next_freq, smp_processor_id()); - } else if (sg_policy->next_freq != next_freq) { - sg_policy->next_freq = next_freq; + } else { sg_policy->work_in_progress = true; irq_work_queue(&sg_policy->irq_work); } @@ -192,6 +198,19 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, sg_cpu->iowait_boost >>= 1; } +#ifdef CONFIG_NO_HZ_COMMON +static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) +{ + unsigned long idle_calls = tick_nohz_get_idle_calls(); + bool ret = idle_calls == sg_cpu->saved_idle_calls; + + sg_cpu->saved_idle_calls = idle_calls; + return ret; +} +#else +static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } +#endif /* CONFIG_NO_HZ_COMMON */ + static void sugov_update_single(struct update_util_data *hook, u64 time, unsigned int flags) { @@ -200,6 +219,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, struct cpufreq_policy *policy = sg_policy->policy; unsigned long util, max; unsigned int next_f; + bool busy; sugov_set_iowait_boost(sg_cpu, time, flags); sg_cpu->last_update = time; @@ -207,40 +227,37 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, if (!sugov_should_update_freq(sg_policy, time)) return; + busy = sugov_cpu_is_busy(sg_cpu); + if (flags & SCHED_CPUFREQ_RT_DL) { next_f = policy->cpuinfo.max_freq; } else { sugov_get_util(&util, &max); sugov_iowait_boost(sg_cpu, &util, &max); next_f = get_next_freq(sg_policy, util, max); + /* + * Do not reduce the frequency if the CPU has not been idle + * recently, as the reduction is likely to be premature then. 
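Two behavioral changes hide in the schedutil hunks above: a frequency reduction is skipped while the CPU has been busy (no recent idle calls), and in sugov_update_commit() a reduction is damped by averaging with the previously requested frequency. A toy trace of the damping rule alone, as a sketch (plain C, made-up frequencies; toy_commit() is not the governor):

#include <stdio.h>

/* Ramp-down rule from sugov_update_commit() above: when the new request is
 * below the cached one, only go half-way down. (The busy-CPU check is a
 * separate, earlier bail-out.) */
static unsigned int toy_commit(unsigned int cached_khz, unsigned int next_khz)
{
        if (cached_khz > next_khz)
                next_khz = (cached_khz + next_khz) >> 1;
        return next_khz;
}

int main(void)
{
        unsigned int freq = 2000000;            /* last requested frequency, kHz */

        for (int step = 0; step < 4; step++) {
                freq = toy_commit(freq, 500000); /* utilization collapsed */
                printf("step %d: %u kHz\n", step, freq);
        }
        /* prints 1250000, 875000, 687500, 593750: a geometric descent
         * instead of an immediate drop to 500000 */
        return 0;
}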
+ */ + if (busy && next_f < sg_policy->next_freq) + next_f = sg_policy->next_freq; } sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, - unsigned long util, unsigned long max, - unsigned int flags) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - unsigned int max_f = policy->cpuinfo.max_freq; u64 last_freq_update_time = sg_policy->last_freq_update_time; + unsigned long util = 0, max = 1; unsigned int j; - if (flags & SCHED_CPUFREQ_RT_DL) - return max_f; - - sugov_iowait_boost(sg_cpu, &util, &max); - for_each_cpu(j, policy->cpus) { - struct sugov_cpu *j_sg_cpu; + struct sugov_cpu *j_sg_cpu = &per_cpu(sugov_cpu, j); unsigned long j_util, j_max; s64 delta_ns; - if (j == smp_processor_id()) - continue; - - j_sg_cpu = &per_cpu(sugov_cpu, j); /* * If the CPU utilization was last updated before the previous * frequency update and the time elapsed between the last update @@ -254,7 +271,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL) - return max_f; + return policy->cpuinfo.max_freq; j_util = j_sg_cpu->util; j_max = j_sg_cpu->max; @@ -289,7 +306,11 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, sg_cpu->last_update = time; if (sugov_should_update_freq(sg_policy, time)) { - next_f = sugov_next_freq_shared(sg_cpu, util, max, flags); + if (flags & SCHED_CPUFREQ_RT_DL) + next_f = sg_policy->policy->cpuinfo.max_freq; + else + next_f = sugov_next_freq_shared(sg_cpu); + sugov_update_commit(sg_policy, time, next_f); } @@ -473,7 +494,6 @@ static int sugov_init(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy; struct sugov_tunables *tunables; - unsigned int lat; int ret = 0; /* State should be equivalent to EXIT */ @@ -512,10 +532,16 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - tunables->rate_limit_us = LATENCY_MULTIPLIER; - lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (lat) - tunables->rate_limit_us *= lat; + if (policy->transition_delay_us) { + tunables->rate_limit_us = policy->transition_delay_us; + } else { + unsigned int lat; + + tunables->rate_limit_us = LATENCY_MULTIPLIER; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) + tunables->rate_limit_us *= lat; + } policy->governor_data = sg_policy; sg_policy->tunables = tunables; diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index f3778e2b46c8dc00c90d9165f6ae9efbf1f16dc7..aea3135c5d90f434ee72980c30f0db1129ef752b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void) sched_clock_irqtime = 0; } +static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, + enum cpu_usage_stat idx) +{ + u64 *cpustat = kcpustat_this_cpu->cpustat; + + u64_stats_update_begin(&irqtime->sync); + cpustat[idx] += delta; + irqtime->total += delta; + irqtime->tick_delta += delta; + u64_stats_update_end(&irqtime->sync); +} + /* * Called before incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. 
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void) void irqtime_account_irq(struct task_struct *curr) { struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); - u64 *cpustat = kcpustat_this_cpu->cpustat; s64 delta; int cpu; @@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr) delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; - u64_stats_update_begin(&irqtime->sync); /* * We do not account for softirq time from ksoftirqd here. * We want to continue accounting softirq time to ksoftirqd thread * in that case, so as not to confuse scheduler with a special task * that do not consume any time, but still wants to run. */ - if (hardirq_count()) { - cpustat[CPUTIME_IRQ] += delta; - irqtime->tick_delta += delta; - } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) { - cpustat[CPUTIME_SOFTIRQ] += delta; - irqtime->tick_delta += delta; - } - - u64_stats_update_end(&irqtime->sync); + if (hardirq_count()) + irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); + else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dea138964b9107b3e22542a8b80f5cf1d43c1dee..d711093218415d77ead6405004dd9e41323ad924 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -717,18 +717,12 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se) } #ifdef CONFIG_SMP + +#include "sched-pelt.h" + static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu); static unsigned long task_h_load(struct task_struct *p); -/* - * We choose a half-life close to 1 scheduling period. - * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are - * dependent on this value. - */ -#define LOAD_AVG_PERIOD 32 -#define LOAD_AVG_MAX 47742 /* maximum possible load avg */ -#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */ - /* Give new sched_entity start runnable values to heavy its load in infant time */ void init_entity_runnable_average(struct sched_entity *se) { @@ -2733,47 +2727,15 @@ static inline void update_cfs_shares(struct sched_entity *se) #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_SMP -/* Precomputed fixed inverse multiplies for multiplication by y^n */ -static const u32 runnable_avg_yN_inv[] = { - 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6, - 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85, - 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581, - 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9, - 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80, - 0x85aac367, 0x82cd8698, -}; - -/* - * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent - * over-estimates when re-combining. - */ -static const u32 runnable_avg_yN_sum[] = { - 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103, - 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082, - 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371, -}; - -/* - * Precomputed \Sum y^k { 1<=k<=n, where n%32=0). Values are rolled down to - * lower integers. 
See Documentation/scheduler/sched-avg.txt how these
- * were generated:
- */
-static const u32 __accumulated_sum_N32[] = {
-            0, 23371, 35056, 40899, 43820, 45281,
-        46011, 46376, 46559, 46650, 46696, 46719,
-};
-
 /*
  * Approximate:
  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
  */
-static __always_inline u64 decay_load(u64 val, u64 n)
+static u64 decay_load(u64 val, u64 n)
 {
         unsigned int local_n;
 
-        if (!n)
-                return val;
-        else if (unlikely(n > LOAD_AVG_PERIOD * 63))
+        if (unlikely(n > LOAD_AVG_PERIOD * 63))
                 return 0;
 
         /* after bounds checking we can collapse to 32-bit */
@@ -2795,30 +2757,97 @@ static __always_inline u64 decay_load(u64 val, u64 n)
         return val;
 }
 
+static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
+{
+        u32 c1, c2, c3 = d3; /* y^0 == 1 */
+
+        /*
+         * c1 = d1 y^p
+         */
+        c1 = decay_load((u64)d1, periods);
+
+        /*
+         *            p-1
+         * c2 = 1024 \Sum y^n
+         *            n=1
+         *
+         *              inf        inf
+         *       = 1024 ( \Sum y^n - \Sum y^n - y^0 )
+         *              n=0        n=p
+         */
+        c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
+
+        return c1 + c2 + c3;
+}
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
 /*
- * For updates fully spanning n periods, the contribution to runnable
- * average will be: \Sum 1024*y^n
+ * Accumulate the three separate parts of the sum; d1 the remainder
+ * of the last (incomplete) period, d2 the span of full periods and d3
+ * the remainder of the (incomplete) current period.
+ *
+ *           d1          d2           d3
+ *           ^           ^            ^
+ *           |           |            |
+ *         |<->|<----------------->|<--->|
+ * ... |---x---|------| ... |------|-----x (now)
+ *
+ *                           p-1
+ * u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
+ *                           n=1
  *
- * We can compute this reasonably efficiently by combining:
- *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
  */
-static u32 __compute_runnable_contrib(u64 n)
+static __always_inline u32
+accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
+               unsigned long weight, int running, struct cfs_rq *cfs_rq)
 {
-        u32 contrib = 0;
+        unsigned long scale_freq, scale_cpu;
+        u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */
+        u64 periods;
 
-        if (likely(n <= LOAD_AVG_PERIOD))
-                return runnable_avg_yN_sum[n];
-        else if (unlikely(n >= LOAD_AVG_MAX_N))
-                return LOAD_AVG_MAX;
+        scale_freq = arch_scale_freq_capacity(NULL, cpu);
+        scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 
-        /* Since n < LOAD_AVG_MAX_N, n/LOAD_AVG_PERIOD < 11 */
-        contrib = __accumulated_sum_N32[n/LOAD_AVG_PERIOD];
-        n %= LOAD_AVG_PERIOD;
-        contrib = decay_load(contrib, n);
-        return contrib + runnable_avg_yN_sum[n];
-}
+        delta += sa->period_contrib;
+        periods = delta / 1024; /* A period is 1024us (~1ms) */
 
-#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+        /*
+         * Step 1: decay old *_sum if we crossed period boundaries.
+         */
+        if (periods) {
+                sa->load_sum = decay_load(sa->load_sum, periods);
+                if (cfs_rq) {
+                        cfs_rq->runnable_load_sum =
+                                decay_load(cfs_rq->runnable_load_sum, periods);
+                }
+                sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+
+                /*
+                 * Step 2
+                 */
+                delta %= 1024;
+                contrib = __accumulate_pelt_segments(periods,
+                                1024 - sa->period_contrib, delta);
+        }
+        sa->period_contrib = delta;
+
+        contrib = cap_scale(contrib, scale_freq);
+        if (weight) {
+                sa->load_sum += weight * contrib;
+                if (cfs_rq)
+                        cfs_rq->runnable_load_sum += weight * contrib;
+        }
+        if (running)
+                sa->util_sum += contrib * scale_cpu;
+
+        return periods;
+}
 
 /*
  * We can represent the historical contribution to runnable average as the
@@ -2849,13 +2878,10 @@ static u32 __compute_runnable_contrib(u64 n)
  *   = u_0 + u_1*y + u_2*y^2 + ...
[re-labeling u_i --> u_{i+1}] */ static __always_inline int -__update_load_avg(u64 now, int cpu, struct sched_avg *sa, +___update_load_avg(u64 now, int cpu, struct sched_avg *sa, unsigned long weight, int running, struct cfs_rq *cfs_rq) { - u64 delta, scaled_delta, periods; - u32 contrib; - unsigned int delta_w, scaled_delta_w, decayed = 0; - unsigned long scale_freq, scale_cpu; + u64 delta; delta = now - sa->last_update_time; /* @@ -2874,83 +2900,52 @@ __update_load_avg(u64 now, int cpu, struct sched_avg *sa, delta >>= 10; if (!delta) return 0; - sa->last_update_time = now; - - scale_freq = arch_scale_freq_capacity(NULL, cpu); - scale_cpu = arch_scale_cpu_capacity(NULL, cpu); - - /* delta_w is the amount already accumulated against our next period */ - delta_w = sa->period_contrib; - if (delta + delta_w >= 1024) { - decayed = 1; - /* how much left for next period will start over, we don't know yet */ - sa->period_contrib = 0; + sa->last_update_time += delta << 10; - /* - * Now that we know we're crossing a period boundary, figure - * out how much from delta we need to complete the current - * period and accrue it. - */ - delta_w = 1024 - delta_w; - scaled_delta_w = cap_scale(delta_w, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta_w; - if (cfs_rq) { - cfs_rq->runnable_load_sum += - weight * scaled_delta_w; - } - } - if (running) - sa->util_sum += scaled_delta_w * scale_cpu; - - delta -= delta_w; - - /* Figure out how many additional periods this update spans */ - periods = delta / 1024; - delta %= 1024; + /* + * Now we know we crossed measurement unit boundaries. The *_avg + * accrues by two steps: + * + * Step 1: accumulate *_sum since last_update_time. If we haven't + * crossed period boundaries, finish. + */ + if (!accumulate_sum(delta, cpu, sa, weight, running, cfs_rq)) + return 0; - sa->load_sum = decay_load(sa->load_sum, periods + 1); - if (cfs_rq) { - cfs_rq->runnable_load_sum = - decay_load(cfs_rq->runnable_load_sum, periods + 1); - } - sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1); - - /* Efficiently calculate \sum (1..n_period) 1024*y^i */ - contrib = __compute_runnable_contrib(periods); - contrib = cap_scale(contrib, scale_freq); - if (weight) { - sa->load_sum += weight * contrib; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * contrib; - } - if (running) - sa->util_sum += contrib * scale_cpu; + /* + * Step 2: update *_avg. 
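The accumulate_sum()/__accumulate_pelt_segments() rework above replaces the old runnable_avg_yN_sum lookup with a closed form built from the geometric-series limit LOAD_AVG_MAX. A standalone sanity check of that arithmetic, as a sketch: plain C, using only the y^n table this patch moves into sched-pelt.h; expected values are approximate because the constants are floored.

#include <stdint.h>
#include <stdio.h>

/* y^n multipliers scaled by 2^32, n = 0..31, with y^32 == 1/2.
 * Values copied from the sched-pelt.h hunk in this patch. */
static const uint32_t runnable_avg_yN_inv[] = {
        0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
        0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
        0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
        0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
        0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
        0x85aac367, 0x82cd8698,
};
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX    47742   /* 1024 * \Sum_{n>=0} y^n, floored */

static uint64_t decay_load(uint64_t val, uint64_t n)
{
        if (n > LOAD_AVG_PERIOD * 63)
                return 0;
        val >>= n / LOAD_AVG_PERIOD;            /* whole half-lives */
        return (val * runnable_avg_yN_inv[n % LOAD_AVG_PERIOD]) >> 32;
}

/* c2 from the comment above: 1024 * \Sum_{n=1}^{p-1} y^n via the closed form */
static uint64_t c2(uint64_t p)
{
        return LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, p) - 1024;
}

int main(void)
{
        /* One full period halves a contribution (modulo rounding down). */
        printf("decay_load(1024, 32) = %llu (expect ~512)\n",
               (unsigned long long)decay_load(1024, 32));
        /* Analytically 1024*y ~= 1002 and 1024*(y+y^2) ~= 1982; the floored
         * constants land a couple of counts below that. */
        printf("c2(2) = %llu, c2(3) = %llu\n",
               (unsigned long long)c2(2), (unsigned long long)c2(3));
        return 0;
}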
+ */ + sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); + if (cfs_rq) { + cfs_rq->runnable_load_avg = + div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); } + sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - /* Remainder of delta accrued against u_0` */ - scaled_delta = cap_scale(delta, scale_freq); - if (weight) { - sa->load_sum += weight * scaled_delta; - if (cfs_rq) - cfs_rq->runnable_load_sum += weight * scaled_delta; - } - if (running) - sa->util_sum += scaled_delta * scale_cpu; + return 1; +} - sa->period_contrib += delta; +static int +__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL); +} - if (decayed) { - sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX); - if (cfs_rq) { - cfs_rq->runnable_load_avg = - div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX); - } - sa->util_avg = sa->util_sum / LOAD_AVG_MAX; - } +static int +__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + return ___update_load_avg(now, cpu, &se->avg, + se->on_rq * scale_load_down(se->load.weight), + cfs_rq->curr == se, NULL); +} - return decayed; +static int +__update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq) +{ + return ___update_load_avg(now, cpu, &cfs_rq->avg, + scale_load_down(cfs_rq->load.weight), + cfs_rq->curr != NULL, cfs_rq); } /* @@ -3014,6 +3009,9 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) void set_task_rq_fair(struct sched_entity *se, struct cfs_rq *prev, struct cfs_rq *next) { + u64 p_last_update_time; + u64 n_last_update_time; + if (!sched_feat(ATTACH_AGE_LOAD)) return; @@ -3024,11 +3022,11 @@ void set_task_rq_fair(struct sched_entity *se, * time. This will result in the wakee task is less decayed, but giving * the wakee more load sounds not bad. 
*/ - if (se->avg.last_update_time && prev) { - u64 p_last_update_time; - u64 n_last_update_time; + if (!(se->avg.last_update_time && prev)) + return; #ifndef CONFIG_64BIT + { u64 p_last_update_time_copy; u64 n_last_update_time_copy; @@ -3043,14 +3041,13 @@ void set_task_rq_fair(struct sched_entity *se, } while (p_last_update_time != p_last_update_time_copy || n_last_update_time != n_last_update_time_copy); + } #else - p_last_update_time = prev->avg.last_update_time; - n_last_update_time = next->avg.last_update_time; + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; #endif - __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), - &se->avg, 0, 0, NULL); - se->avg.last_update_time = n_last_update_time; - } + __update_load_avg_blocked_se(p_last_update_time, cpu_of(rq_of(prev)), se); + se->avg.last_update_time = n_last_update_time; } /* Take into account change of utilization of a child task group */ @@ -3173,6 +3170,36 @@ static inline int propagate_entity_load_avg(struct sched_entity *se) return 1; } +/* + * Check if we need to update the load and the utilization of a blocked + * group_entity: + */ +static inline bool skip_blocked_update(struct sched_entity *se) +{ + struct cfs_rq *gcfs_rq = group_cfs_rq(se); + + /* + * If sched_entity still have not zero load or utilization, we have to + * decay it: + */ + if (se->avg.load_avg || se->avg.util_avg) + return false; + + /* + * If there is a pending propagation, we have to update the load and + * the utilization of the sched_entity: + */ + if (gcfs_rq->propagate_avg) + return false; + + /* + * Otherwise, the load and the utilization of the sched_entity is + * already zero and there is no pending propagation, so it will be a + * waste of time to try to decay it: + */ + return true; +} + #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} @@ -3265,8 +3292,7 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq) set_tg_cfs_propagate(cfs_rq); } - decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa, - scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq); + decayed = __update_load_avg_cfs_rq(now, cpu_of(rq_of(cfs_rq)), cfs_rq); #ifndef CONFIG_64BIT smp_wmb(); @@ -3298,11 +3324,8 @@ static inline void update_load_avg(struct sched_entity *se, int flags) * Track task load average for carrying it to new CPU after migrated, and * track group sched_entity load average for task_h_load calc in migration */ - if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) { - __update_load_avg(now, cpu, &se->avg, - se->on_rq * scale_load_down(se->load.weight), - cfs_rq->curr == se, NULL); - } + if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) + __update_load_avg_se(now, cpu, cfs_rq, se); decayed = update_cfs_rq_load_avg(now, cfs_rq, true); decayed |= propagate_entity_load_avg(se); @@ -3407,7 +3430,7 @@ void sync_entity_load_avg(struct sched_entity *se) u64 last_update_time; last_update_time = cfs_rq_last_update_time(cfs_rq); - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + __update_load_avg_blocked_se(last_update_time, cpu_of(rq_of(cfs_rq)), se); } /* @@ -4271,8 +4294,9 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, throttled_list) { struct rq *rq = rq_of(cfs_rq); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4289,7 
+4313,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); if (!remaining) break; @@ -5097,15 +5121,16 @@ void cpu_load_update_nohz_stop(void) unsigned long curr_jiffies = READ_ONCE(jiffies); struct rq *this_rq = this_rq(); unsigned long load; + struct rq_flags rf; if (curr_jiffies == this_rq->last_load_update_tick) return; load = weighted_cpuload(cpu_of(this_rq)); - raw_spin_lock(&this_rq->lock); + rq_lock(this_rq, &rf); update_rq_clock(this_rq); cpu_load_update_nohz(this_rq, curr_jiffies, load); - raw_spin_unlock(&this_rq->lock); + rq_unlock(this_rq, &rf); } #else /* !CONFIG_NO_HZ_COMMON */ static inline void cpu_load_update_nohz(struct rq *this_rq, @@ -6769,7 +6794,7 @@ static void detach_task(struct task_struct *p, struct lb_env *env) lockdep_assert_held(&env->src_rq->lock); p->on_rq = TASK_ON_RQ_MIGRATING; - deactivate_task(env->src_rq, p, 0); + deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); set_task_cpu(p, env->dst_cpu); } @@ -6902,7 +6927,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - activate_task(rq, p, 0); + activate_task(rq, p, ENQUEUE_NOCLOCK); p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -6913,9 +6938,12 @@ static void attach_task(struct rq *rq, struct task_struct *p) */ static void attach_one_task(struct rq *rq, struct task_struct *p) { - raw_spin_lock(&rq->lock); + struct rq_flags rf; + + rq_lock(rq, &rf); + update_rq_clock(rq); attach_task(rq, p); - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -6926,8 +6954,10 @@ static void attach_tasks(struct lb_env *env) { struct list_head *tasks = &env->tasks; struct task_struct *p; + struct rq_flags rf; - raw_spin_lock(&env->dst_rq->lock); + rq_lock(env->dst_rq, &rf); + update_rq_clock(env->dst_rq); while (!list_empty(tasks)) { p = list_first_entry(tasks, struct task_struct, se.group_node); @@ -6936,7 +6966,7 @@ static void attach_tasks(struct lb_env *env) attach_task(env->dst_rq, p); } - raw_spin_unlock(&env->dst_rq->lock); + rq_unlock(env->dst_rq, &rf); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -6944,9 +6974,9 @@ static void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); /* @@ -6954,6 +6984,8 @@ static void update_blocked_averages(int cpu) * list_add_leaf_cfs_rq() for details. 
*/ for_each_leaf_cfs_rq(rq, cfs_rq) { + struct sched_entity *se; + /* throttled entities do not contribute to load */ if (throttled_hierarchy(cfs_rq)) continue; @@ -6961,11 +6993,12 @@ static void update_blocked_averages(int cpu) if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true)) update_tg_load_avg(cfs_rq, 0); - /* Propagate pending load changes to the parent */ - if (cfs_rq->tg->se[cpu]) - update_load_avg(cfs_rq->tg->se[cpu], 0); + /* Propagate pending load changes to the parent, if any: */ + se = cfs_rq->tg->se[cpu]; + if (se && !skip_blocked_update(se)) + update_load_avg(se, 0); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } /* @@ -7019,12 +7052,12 @@ static inline void update_blocked_averages(int cpu) { struct rq *rq = cpu_rq(cpu); struct cfs_rq *cfs_rq = &rq->cfs; - unsigned long flags; + struct rq_flags rf; - raw_spin_lock_irqsave(&rq->lock, flags); + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true); - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } static unsigned long task_h_load(struct task_struct *p) @@ -7525,6 +7558,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd { struct sched_domain *child = env->sd->child; struct sched_group *sg = env->sd->groups; + struct sg_lb_stats *local = &sds->local_stat; struct sg_lb_stats tmp_sgs; int load_idx, prefer_sibling = 0; bool overload = false; @@ -7541,7 +7575,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg)); if (local_group) { sds->local = sg; - sgs = &sds->local_stat; + sgs = local; if (env->idle != CPU_NEWLY_IDLE || time_after_eq(jiffies, sg->sgc->next_update)) @@ -7565,8 +7599,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd * the tasks on the system). */ if (prefer_sibling && sds->local && - group_has_capacity(env, &sds->local_stat) && - (sgs->sum_nr_running > 1)) { + group_has_capacity(env, local) && + (sgs->sum_nr_running > local->sum_nr_running + 1)) { sgs->group_no_capacity = 1; sgs->group_type = group_classify(sg, sgs); } @@ -7597,7 +7631,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd /** * check_asym_packing - Check to see if the group is packed into the - * sched doman. + * sched domain. * * This is primarily intended to used at the sibling level. Some * cores like POWER7 prefer to use lower numbered SMT threads. In the @@ -8042,7 +8076,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd_parent = sd->parent; struct sched_group *group; struct rq *busiest; - unsigned long flags; + struct rq_flags rf; struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask); struct lb_env env = { @@ -8105,7 +8139,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: - raw_spin_lock_irqsave(&busiest->lock, flags); + rq_lock_irqsave(busiest, &rf); update_rq_clock(busiest); /* @@ -8122,14 +8156,14 @@ static int load_balance(int this_cpu, struct rq *this_rq, * See task_rq_lock() family for the details. 
*/ - raw_spin_unlock(&busiest->lock); + rq_unlock(busiest, &rf); if (cur_ld_moved) { attach_tasks(&env); ld_moved += cur_ld_moved; } - local_irq_restore(flags); + local_irq_restore(rf.flags); if (env.flags & LBF_NEED_BREAK) { env.flags &= ~LBF_NEED_BREAK; @@ -8207,6 +8241,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, sd->nr_balance_failed++; if (need_active_balance(&env)) { + unsigned long flags; + raw_spin_lock_irqsave(&busiest->lock, flags); /* don't kick the active_load_balance_cpu_stop, @@ -8444,8 +8480,9 @@ static int active_load_balance_cpu_stop(void *data) struct rq *target_rq = cpu_rq(target_cpu); struct sched_domain *sd; struct task_struct *p = NULL; + struct rq_flags rf; - raw_spin_lock_irq(&busiest_rq->lock); + rq_lock_irq(busiest_rq, &rf); /* make sure the requested cpu hasn't gone down in the meantime */ if (unlikely(busiest_cpu != smp_processor_id() || @@ -8496,7 +8533,7 @@ static int active_load_balance_cpu_stop(void *data) rcu_read_unlock(); out_unlock: busiest_rq->active_balance = 0; - raw_spin_unlock(&busiest_rq->lock); + rq_unlock(busiest_rq, &rf); if (p) attach_one_task(target_rq, p); @@ -8794,10 +8831,13 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) * do the balance. */ if (time_after_eq(jiffies, rq->next_balance)) { - raw_spin_lock_irq(&rq->lock); + struct rq_flags rf; + + rq_lock_irq(rq, &rf); update_rq_clock(rq); cpu_load_update_idle(rq); - raw_spin_unlock_irq(&rq->lock); + rq_unlock_irq(rq, &rf); + rebalance_domains(rq, CPU_IDLE); } @@ -8988,8 +9028,9 @@ static void task_fork_fair(struct task_struct *p) struct cfs_rq *cfs_rq; struct sched_entity *se = &p->se, *curr; struct rq *rq = this_rq(); + struct rq_flags rf; - raw_spin_lock(&rq->lock); + rq_lock(rq, &rf); update_rq_clock(rq); cfs_rq = task_cfs_rq(current); @@ -9010,7 +9051,7 @@ static void task_fork_fair(struct task_struct *p) } se->vruntime -= cfs_rq->min_vruntime; - raw_spin_unlock(&rq->lock); + rq_unlock(rq, &rf); } /* @@ -9372,7 +9413,6 @@ static DEFINE_MUTEX(shares_mutex); int sched_group_set_shares(struct task_group *tg, unsigned long shares) { int i; - unsigned long flags; /* * We can't change the weight of the root cgroup. @@ -9389,19 +9429,17 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) tg->shares = shares; for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); - struct sched_entity *se; + struct sched_entity *se = tg->se[i]; + struct rq_flags rf; - se = tg->se[i]; /* Propagate contribution to hierarchy */ - raw_spin_lock_irqsave(&rq->lock, flags); - - /* Possible calls to update_curr() need rq clock */ + rq_lock_irqsave(rq, &rf); update_rq_clock(rq); for_each_sched_entity(se) { update_load_avg(se, UPDATE_TG); update_cfs_shares(se); } - raw_spin_unlock_irqrestore(&rq->lock, flags); + rq_unlock_irqrestore(rq, &rf); } done: diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1b3c8189b28656d2644a714ff60ceab7d015d97b..11192e0cb122c72066a2f46417246464d909ea7c 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -56,6 +56,13 @@ SCHED_FEAT(TTWU_QUEUE, true) */ SCHED_FEAT(SIS_AVG_CPU, false) +/* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. Default disabled because the + * annotations are not complete. 
+ */
+SCHED_FEAT(WARN_DOUBLE_CLOCK, false)
+
 #ifdef HAVE_RT_PUSH_IPI
 /*
  * In order to avoid a thundering herd attack of CPUs that are
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index ac6d5176463dca591b40263d9524d728c363ec09..2a25a9ec2c6e5806ad8528717c5b372d750dcf9e 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include <linux/livepatch.h>
 #include
@@ -265,6 +266,9 @@ static void do_idle(void)
                 sched_ttwu_pending();
                 schedule_preempt_disabled();
+
+                if (unlikely(klp_patch_pending(current)))
+                        klp_update_patch_state(current);
         }
 
 bool cpu_in_idle(unsigned long pc)
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 9f3e40226dec875c7b318b4a9e6a2c01a89604ac..979b7341008afc94f839f38b3fd84a12b7a3fb16 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1927,6 +1927,87 @@ static int find_next_push_cpu(struct rq *rq)
 #define RT_PUSH_IPI_EXECUTING        1
 #define RT_PUSH_IPI_RESTART        2
 
+/*
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there are any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * On large CPU boxes, there's the case that several CPUs could schedule
+ * a lower priority task at the same time, in which case each will look for
+ * any overloaded CPUs that it could pull a task from. To do this, the runqueue
+ * lock must be taken from that overloaded CPU. Having 10s of CPUs all fighting
+ * for a single overloaded CPU's runqueue lock can produce a large latency.
+ * (This has actually been observed on large boxes running cyclictest).
+ * Instead of taking the runqueue lock of the overloaded CPU, each of the
+ * CPUs that scheduled a lower priority task simply sends an IPI to the
+ * overloaded CPU. An IPI is much cheaper than taking a runqueue lock with
+ * lots of contention. The overloaded CPU will look to push its non-running
+ * RT task off, and if it does, it can then ignore the other IPIs coming
+ * in, and just pass those IPIs off to any other overloaded CPU.
+ *
+ * When a CPU schedules a lower priority task, it only sends an IPI to
+ * the "next" CPU that has overloaded RT tasks. This prevents IPI storms,
+ * as having 10 CPUs scheduling lower priority tasks and 10 CPUs with
+ * RT overloaded tasks would cause 100 IPIs to go out at once.
+ *
+ * The overloaded RT CPU, when receiving an IPI, will try to push off its
+ * overloaded RT tasks and then send an IPI to the next CPU that has
+ * overloaded RT tasks. This stops when all CPUs with overloaded RT tasks
+ * have completed. Just because a CPU may have pushed off its own overloaded
+ * RT task does not mean it should stop sending the IPI around to other
+ * overloaded CPUs. There may be another RT task waiting to run on one of
+ * those CPUs that is of higher priority than the one that was just
+ * pushed.
+ *
+ * An optimization that could possibly be made is to make a CPU array similar
+ * to the cpupri array mask of all running RT tasks, but for the overloaded
+ * case, then the IPI could be sent to only the CPU with the highest priority
+ * RT task waiting, and that CPU could send off further IPIs to the CPU with
+ * the next highest waiting task. Since the overloaded case is much less likely
+ * to happen, the complexity of this implementation may not be worth it.
+ * Instead, just send an IPI around to all overloaded CPUs.
+ *
+ * The rq->rt.push_flags holds the status of the IPI that is going around.
+ * A run queue can only send out a single IPI at a time. The possible flags
+ * for rq->rt.push_flags are:
+ *
+ *    (None or zero):           No IPI is going around for the current rq
+ *    RT_PUSH_IPI_EXECUTING:    An IPI for the rq is being passed around
+ *    RT_PUSH_IPI_RESTART:      The priority of the running task for the rq
+ *                              has changed, and the IPI should restart
+ *                              circulating the overloaded CPUs again.
+ *
+ * rq->rt.push_cpu contains the CPU that is being sent the IPI. It is updated
+ * before sending to the next CPU.
+ *
+ * Instead of having all CPUs that schedule a lower priority task send
+ * an IPI to the same "first" CPU in the RT overload mask, they send it
+ * to the next overloaded CPU after their own CPU. This helps distribute
+ * the work when there's more than one overloaded CPU and multiple CPUs
+ * scheduling in lower priority tasks.
+ *
+ * When an rq schedules a lower priority task than what was currently
+ * running, the next CPU with overloaded RT tasks is examined first.
+ * That is, if CPUs 1 and 5 are overloaded, and CPU 3 schedules a lower
+ * priority task, it will send an IPI first to CPU 5, then CPU 5 will
+ * send to CPU 1 if it is still overloaded. CPU 1 will clear the
+ * rq->rt.push_flags if RT_PUSH_IPI_RESTART is not set.
+ *
+ * The first CPU to notice that IPI_RESTART is set will clear that flag and
+ * then send an IPI to the next overloaded CPU after the rq->cpu and not the
+ * next CPU after push_cpu. That is, if CPUs 1, 4 and 5 are overloaded when
+ * CPU 3 schedules a lower priority task, and the IPI_RESTART gets set while
+ * the handling is being done on CPU 5, it will clear the flag and send it
+ * back to CPU 4 instead of CPU 1.
+ *
+ * Note, the above logic can be disabled by turning off the sched_feature
+ * RT_PUSH_IPI. Then the rq lock of the overloaded CPU will simply be
+ * taken by the CPU requesting a pull and the waiting RT task will be pulled
+ * by that CPU. This may be fine for machines with few CPUs.
+ */
 static void tell_cpu_to_push(struct rq *rq)
 {
         int cpu;
diff --git a/kernel/sched/sched-pelt.h b/kernel/sched/sched-pelt.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd200d16529efe9ca8cbc56afa5896fde001e654
--- /dev/null
+++ b/kernel/sched/sched-pelt.h
@@ -0,0 +1,13 @@
+/* Generated by Documentation/scheduler/sched-pelt; do not modify. */
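The push-IPI chain documented above is easy to visualize: starting from the CPU that just scheduled a lower-priority task, the IPI hops to the next overloaded CPU in circular order and stops once the walk comes back around. A toy trace of only the routing, as a sketch; plain C with invented helpers, no locking, and no RT_PUSH_IPI_RESTART handling (the real code must cope with both):

#include <stdio.h>

#define NR_CPUS 8

/* Next overloaded CPU strictly after 'cpu', wrapping; -1 once the walk
 * would reach the CPU that started the chain. */
static int next_push_cpu(const int overloaded[], int cpu, int src)
{
        for (int i = 1; i <= NR_CPUS; i++) {
                int c = (cpu + i) % NR_CPUS;

                if (c == src)
                        return -1;      /* full circle: chain terminates */
                if (overloaded[c])
                        return c;
        }
        return -1;
}

int main(void)
{
        int overloaded[NR_CPUS] = { [1] = 1, [4] = 1, [5] = 1 };
        int src = 3;    /* CPU 3 just scheduled a lower-priority task */

        for (int cpu = next_push_cpu(overloaded, src, src); cpu >= 0;
             cpu = next_push_cpu(overloaded, cpu, src))
                printf("IPI -> CPU %d\n", cpu);  /* 4, then 5, then 1 */
        return 0;
}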
+
+static const u32 runnable_avg_yN_inv[] = {
+        0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
+        0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
+        0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
+        0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
+        0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
+        0x85aac367, 0x82cd8698,
+};
+
+#define LOAD_AVG_PERIOD 32
+#define LOAD_AVG_MAX 47742
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 5cbf92214ad89287d111ab8300e5b55923d83ffe..7808ab05059991a2dad05bf073f1fee539f0fbbd 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1331,15 +1331,17 @@ extern const u32 sched_prio_to_wmult[40];
 #define DEQUEUE_SLEEP           0x01
 #define DEQUEUE_SAVE            0x02 /* matches ENQUEUE_RESTORE */
 #define DEQUEUE_MOVE            0x04 /* matches ENQUEUE_MOVE */
+#define DEQUEUE_NOCLOCK         0x08 /* matches ENQUEUE_NOCLOCK */
 
 #define ENQUEUE_WAKEUP          0x01
 #define ENQUEUE_RESTORE         0x02
 #define ENQUEUE_MOVE            0x04
+#define ENQUEUE_NOCLOCK         0x08
 
-#define ENQUEUE_HEAD            0x08
-#define ENQUEUE_REPLENISH       0x10
+#define ENQUEUE_HEAD            0x10
+#define ENQUEUE_REPLENISH       0x20
 #ifdef CONFIG_SMP
-#define ENQUEUE_MIGRATED        0x20
+#define ENQUEUE_MIGRATED        0x40
 #else
 #define ENQUEUE_MIGRATED        0x00
 #endif
@@ -1624,6 +1626,7 @@ static inline void sched_avg_update(struct rq *rq) { }
 
 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
         __acquires(rq->lock);
+
 struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
         __acquires(p->pi_lock)
         __acquires(rq->lock);
@@ -1645,6 +1648,62 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
         raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 }
 
+static inline void
+rq_lock_irqsave(struct rq *rq, struct rq_flags *rf)
+        __acquires(rq->lock)
+{
+        raw_spin_lock_irqsave(&rq->lock, rf->flags);
+        rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock_irq(struct rq *rq, struct rq_flags *rf)
+        __acquires(rq->lock)
+{
+        raw_spin_lock_irq(&rq->lock);
+        rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_lock(struct rq *rq, struct rq_flags *rf)
+        __acquires(rq->lock)
+{
+        raw_spin_lock(&rq->lock);
+        rq_pin_lock(rq, rf);
+}
+
+static inline void
+rq_relock(struct rq *rq, struct rq_flags *rf)
+        __acquires(rq->lock)
+{
+        raw_spin_lock(&rq->lock);
+        rq_repin_lock(rq, rf);
+}
+
+static inline void
+rq_unlock_irqrestore(struct rq *rq, struct rq_flags *rf)
+        __releases(rq->lock)
+{
+        rq_unpin_lock(rq, rf);
+        raw_spin_unlock_irqrestore(&rq->lock, rf->flags);
+}
+
+static inline void
+rq_unlock_irq(struct rq *rq, struct rq_flags *rf)
+        __releases(rq->lock)
+{
+        rq_unpin_lock(rq, rf);
+        raw_spin_unlock_irq(&rq->lock);
+}
+
+static inline void
+rq_unlock(struct rq *rq, struct rq_flags *rf)
+        __releases(rq->lock)
+{
+        rq_unpin_lock(rq, rf);
+        raw_spin_unlock(&rq->lock);
+}
+
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT
@@ -1869,6 +1928,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 struct irqtime {
+        u64                     total;
         u64                     tick_delta;
         u64                     irq_start_time;
         struct u64_stats_sync   sync;
@@ -1876,16 +1936,20 @@ struct irqtime {
 
 DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
 
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime would have its own runtime
+ * subtracted from it and would never move forward.
+ */
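One invariant worth calling out in the flag renumbering above: DEQUEUE_SAVE/ENQUEUE_RESTORE, DEQUEUE_MOVE/ENQUEUE_MOVE and the new DEQUEUE_NOCLOCK/ENQUEUE_NOCLOCK pairs must keep identical bit values, which is why ENQUEUE_HEAD and friends were shifted up. A userspace restatement of that invariant with C11 static asserts, as a sketch (values copied from the hunk):

#include <assert.h>

#define DEQUEUE_SLEEP    0x01
#define DEQUEUE_SAVE     0x02  /* matches ENQUEUE_RESTORE */
#define DEQUEUE_MOVE     0x04  /* matches ENQUEUE_MOVE */
#define DEQUEUE_NOCLOCK  0x08  /* matches ENQUEUE_NOCLOCK */

#define ENQUEUE_WAKEUP   0x01
#define ENQUEUE_RESTORE  0x02
#define ENQUEUE_MOVE     0x04
#define ENQUEUE_NOCLOCK  0x08
#define ENQUEUE_HEAD     0x10  /* bumped to make room for NOCLOCK */
#define ENQUEUE_REPLENISH 0x20

static_assert(DEQUEUE_SAVE == ENQUEUE_RESTORE, "save/restore pair");
static_assert(DEQUEUE_MOVE == ENQUEUE_MOVE, "move pair");
static_assert(DEQUEUE_NOCLOCK == ENQUEUE_NOCLOCK, "noclock pair");
static_assert((ENQUEUE_HEAD & ENQUEUE_NOCLOCK) == 0, "no bit overlap");

int main(void) { return 0; }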
 static inline u64 irq_time_read(int cpu)
 {
         struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
-        u64 *cpustat = kcpustat_cpu(cpu).cpustat;
         unsigned int seq;
         u64 total;
 
         do {
                 seq = __u64_stats_fetch_begin(&irqtime->sync);
-                total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+                total = irqtime->total;
         } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
 
         return total;
diff --git a/kernel/signal.c b/kernel/signal.c
index 7e59ebc2c25e669ef60f54d6e3ae6839c34d2f2a..ca92bcfeb322f3f836031ec8b3ab21867f39adf5 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1237,7 +1237,7 @@ struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
         }
         /*
          * This sighand can be already freed and even reused, but
-         * we rely on SLAB_DESTROY_BY_RCU and sighand_ctor() which
+         * we rely on SLAB_TYPESAFE_BY_RCU and sighand_ctor() which
          * initializes ->siglock: this slab can't go away, it has
          * the same object type, ->siglock can't be reinitialized.
          *
@@ -1318,7 +1318,7 @@ int kill_pid_info(int sig, struct siginfo *info, struct pid *pid)
         }
 }
 
-int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+static int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 {
         int error;
         rcu_read_lock();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 744fa611cae06b26d89a04e915f0d7fadf37035b..4e09821f9d9e8b5815037bd469110e2618abb7f6 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -309,7 +309,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void)
         account_irq_exit_time(current);
         __local_bh_enable(SOFTIRQ_OFFSET);
         WARN_ON_ONCE(in_interrupt());
-        tsk_restore_flags(current, old_flags, PF_MEMALLOC);
+        current_restore_flags(old_flags, PF_MEMALLOC);
 }
 
 asmlinkage __visible void do_softirq(void)
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index 9c15a9124e83b50661414fa425393cdb51866313..f8edee9c792de527cfb968664cc1ae91ea062d1f 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -54,8 +54,8 @@ int snprint_stack_trace(char *buf, size_t size,
 EXPORT_SYMBOL_GPL(snprint_stack_trace);
 
 /*
- * Architectures that do not implement save_stack_trace_tsk or
- * save_stack_trace_regs get this weak alias and a once-per-bootup warning
+ * Architectures that do not implement save_stack_trace_*()
+ * get these weak aliases and once-per-bootup warnings
  * (whenever this facility is utilized - for example by procfs):
  */
 __weak void
@@ -69,3 +69,11 @@ save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace)
 {
         WARN_ONCE(1, KERN_INFO "save_stack_trace_regs() not implemented yet.\n");
 }
+
+__weak int
+save_stack_trace_tsk_reliable(struct task_struct *tsk,
+                              struct stack_trace *trace)
+{
+        WARN_ONCE(1, KERN_INFO "save_stack_trace_tsk_reliable() not implemented yet.\n");
+        return -ENOSYS;
+}
diff --git a/kernel/sys.c b/kernel/sys.c
index 7ff6d1b10cecac8e66583f3bc233fd6fcb445b5c..8a94b4eabcaa291e6315858deba48a0269363fb9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1396,8 +1396,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
                         !capable(CAP_SYS_RESOURCE))
                 retval = -EPERM;
         if (!retval)
-                retval = security_task_setrlimit(tsk->group_leader,
-                                resource, new_rlim);
+                retval = security_task_setrlimit(tsk, resource, new_rlim);
         if (resource == RLIMIT_CPU && new_rlim->rlim_cur == 0) {
                 /*
                  * The caller is asking for an immediate RLIMIT_CPU
@@ -1432,25 +1431,26 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
 }
 
 /* rcu lock must be held */
-static int check_prlimit_permission(struct task_struct *task)
+static int check_prlimit_permission(struct task_struct
*task, + unsigned int flags) { const struct cred *cred = current_cred(), *tcred; + bool id_match; if (current == task) return 0; tcred = __task_cred(task); - if (uid_eq(cred->uid, tcred->euid) && - uid_eq(cred->uid, tcred->suid) && - uid_eq(cred->uid, tcred->uid) && - gid_eq(cred->gid, tcred->egid) && - gid_eq(cred->gid, tcred->sgid) && - gid_eq(cred->gid, tcred->gid)) - return 0; - if (ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) - return 0; + id_match = (uid_eq(cred->uid, tcred->euid) && + uid_eq(cred->uid, tcred->suid) && + uid_eq(cred->uid, tcred->uid) && + gid_eq(cred->gid, tcred->egid) && + gid_eq(cred->gid, tcred->sgid) && + gid_eq(cred->gid, tcred->gid)); + if (!id_match && !ns_capable(tcred->user_ns, CAP_SYS_RESOURCE)) + return -EPERM; - return -EPERM; + return security_task_prlimit(cred, tcred, flags); } SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, @@ -1460,12 +1460,17 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, struct rlimit64 old64, new64; struct rlimit old, new; struct task_struct *tsk; + unsigned int checkflags = 0; int ret; + if (old_rlim) + checkflags |= LSM_PRLIMIT_READ; + if (new_rlim) { if (copy_from_user(&new64, new_rlim, sizeof(new64))) return -EFAULT; rlim64_to_rlim(&new64, &new); + checkflags |= LSM_PRLIMIT_WRITE; } rcu_read_lock(); @@ -1474,7 +1479,7 @@ SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource, rcu_read_unlock(); return -ESRCH; } - ret = check_prlimit_permission(tsk); + ret = check_prlimit_permission(tsk, checkflags); if (ret) { rcu_read_unlock(); return ret; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8c8714fcb53c35a390becf14f2fa8e1bfc20142c..4dfba1a76cc360f649cbe6f35b57cf9413f2f5e1 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1176,6 +1176,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = timer_migration_handler, + .extra1 = &zero, + .extra2 = &one, }, #endif #ifdef CONFIG_BPF_SYSCALL @@ -2574,7 +2576,7 @@ static int do_proc_dointvec_jiffies_conv(bool *negp, unsigned long *lvalp, int write, void *data) { if (write) { - if (*lvalp > LONG_MAX / HZ) + if (*lvalp > INT_MAX / HZ) return 1; *valp = *negp ? 
-(*lvalp*HZ) : (*lvalp*HZ); } else { diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 8a5e44236f78d3ba069e0c81ef3304f4be5ba7fd..4559e914452b4b8a47d1cf2efc0320be253e5025 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -30,6 +30,7 @@ #include #include #include +#include /* * Maximum length of a cpumask that can be specified in @@ -210,6 +211,8 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) struct task_struct *tsk, *first; unsigned long flags; int rc = -ESRCH; + u64 delta, utime, stime; + u64 start_time; /* * Add additional stats from live tasks except zombie thread group @@ -227,6 +230,7 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) memset(stats, 0, sizeof(*stats)); tsk = first; + start_time = ktime_get_ns(); do { if (tsk->exit_state) continue; @@ -238,6 +242,16 @@ static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats) */ delayacct_add_tsk(stats, tsk); + /* calculate task elapsed time in nsec */ + delta = start_time - tsk->start_time; + /* Convert to micro seconds */ + do_div(delta, NSEC_PER_USEC); + stats->ac_etime += delta; + + task_cputime(tsk, &utime, &stime); + stats->ac_utime += div_u64(utime, NSEC_PER_USEC); + stats->ac_stime += div_u64(stime, NSEC_PER_USEC); + stats->nvcsw += tsk->nvcsw; stats->nivcsw += tsk->nivcsw; } while_each_thread(first, tsk); diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index ce3a31e8eb3687e8633848d645ba4c4dc741a8f3..5cb5b0008d9710c95c1af7ab8312b5e308b2c5d0 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -541,7 +541,7 @@ static enum alarmtimer_restart alarm_handle_timer(struct alarm *alarm, * * Returns the granularity of underlying alarm base clock */ -static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) +static int alarm_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { if (!alarmtimer_get_rtcdev()) return -EINVAL; @@ -558,14 +558,14 @@ static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp) * * Provides the underlying alarm base time. */ -static int alarm_clock_get(clockid_t which_clock, struct timespec *tp) +static int alarm_clock_get(clockid_t which_clock, struct timespec64 *tp) { struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)]; if (!alarmtimer_get_rtcdev()) return -EINVAL; - *tp = ktime_to_timespec(base->gettime()); + *tp = ktime_to_timespec64(base->gettime()); return 0; } @@ -598,19 +598,19 @@ static int alarm_timer_create(struct k_itimer *new_timer) * Copies out the current itimerspec data */ static void alarm_timer_get(struct k_itimer *timr, - struct itimerspec *cur_setting) + struct itimerspec64 *cur_setting) { ktime_t relative_expiry_time = alarm_expires_remaining(&(timr->it.alarm.alarmtimer)); if (ktime_to_ns(relative_expiry_time) > 0) { - cur_setting->it_value = ktime_to_timespec(relative_expiry_time); + cur_setting->it_value = ktime_to_timespec64(relative_expiry_time); } else { cur_setting->it_value.tv_sec = 0; cur_setting->it_value.tv_nsec = 0; } - cur_setting->it_interval = ktime_to_timespec(timr->it.alarm.interval); + cur_setting->it_interval = ktime_to_timespec64(timr->it.alarm.interval); } /** @@ -640,8 +640,8 @@ static int alarm_timer_del(struct k_itimer *timr) * Sets the timer to new_setting, and starts the timer. 
*/ static int alarm_timer_set(struct k_itimer *timr, int flags, - struct itimerspec *new_setting, - struct itimerspec *old_setting) + struct itimerspec64 *new_setting, + struct itimerspec64 *old_setting) { ktime_t exp; @@ -659,8 +659,8 @@ static int alarm_timer_set(struct k_itimer *timr, int flags, return TIMER_RETRY; /* start the timer */ - timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval); - exp = timespec_to_ktime(new_setting->it_value); + timr->it.alarm.interval = timespec64_to_ktime(new_setting->it_interval); + exp = timespec64_to_ktime(new_setting->it_value); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now; @@ -790,13 +790,14 @@ static long __sched alarm_timer_nsleep_restart(struct restart_block *restart) * Handles clock_nanosleep calls against _ALARM clockids */ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsreq, struct timespec __user *rmtp) + struct timespec64 *tsreq, + struct timespec __user *rmtp) { enum alarmtimer_type type = clock2alarm(which_clock); + struct restart_block *restart; struct alarm alarm; ktime_t exp; int ret = 0; - struct restart_block *restart; if (!alarmtimer_get_rtcdev()) return -ENOTSUPP; @@ -809,7 +810,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags, alarm_init(&alarm, type, alarmtimer_nsleep_wakeup); - exp = timespec_to_ktime(*tsreq); + exp = timespec64_to_ktime(*tsreq); /* Convert (if necessary) to absolute time */ if (flags != TIMER_ABSTIME) { ktime_t now = alarm_bases[type].gettime(); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 97ac0951f164a386ca7199195607a1af90ed14ea..4237e0744e26bd276de92ca083a44676c9ede240 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -468,7 +468,7 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); -void clockevents_config(struct clock_event_device *dev, u32 freq) +static void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index ec08f527d7ee9101399fd150ad788b8a40f2f859..ac053bb5296e8ef84be3ce4c57084ac9d05c2812 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -987,7 +987,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); * Returns: * 0 when the timer was not active * 1 when the timer was active - * -1 when the timer is currently excuting the callback function and + * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) @@ -1368,10 +1368,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) ktime_to_ns(delta)); } -/* - * local version of hrtimer_peek_ahead_timers() called with interrupts - * disabled. 
- */ +/* called with interrupts disabled */ static inline void __hrtimer_peek_ahead_timers(void) { struct tick_device *td; @@ -1506,7 +1503,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart) return ret; } -long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, +long hrtimer_nanosleep(struct timespec64 *rqtp, struct timespec __user *rmtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; @@ -1519,7 +1516,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, slack = 0; hrtimer_init_on_stack(&t.timer, clockid, mode); - hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack); + hrtimer_set_expires_range_ns(&t.timer, timespec64_to_ktime(*rqtp), slack); if (do_nanosleep(&t, mode)) goto out; @@ -1550,15 +1547,17 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, SYSCALL_DEFINE2(nanosleep, struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct timespec64 tu64; struct timespec tu; if (copy_from_user(&tu, rqtp, sizeof(tu))) return -EFAULT; - if (!timespec_valid(&tu)) + tu64 = timespec_to_timespec64(tu); + if (!timespec64_valid(&tu64)) return -EINVAL; - return hrtimer_nanosleep(&tu, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); + return hrtimer_nanosleep(&tu64, rmtp, HRTIMER_MODE_REL, CLOCK_MONOTONIC); } /* diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index 9cff0ab82b635d33d2936697d909724d07767e69..31d588d37a173ae6fc65ab7d956736982e381991 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -297,7 +297,7 @@ static int pc_clock_adjtime(clockid_t id, struct timex *tx) return err; } -static int pc_clock_gettime(clockid_t id, struct timespec *ts) +static int pc_clock_gettime(clockid_t id, struct timespec64 *ts) { struct posix_clock_desc cd; int err; @@ -316,7 +316,7 @@ static int pc_clock_gettime(clockid_t id, struct timespec *ts) return err; } -static int pc_clock_getres(clockid_t id, struct timespec *ts) +static int pc_clock_getres(clockid_t id, struct timespec64 *ts) { struct posix_clock_desc cd; int err; @@ -335,7 +335,7 @@ static int pc_clock_getres(clockid_t id, struct timespec *ts) return err; } -static int pc_clock_settime(clockid_t id, const struct timespec *ts) +static int pc_clock_settime(clockid_t id, const struct timespec64 *ts) { struct posix_clock_desc cd; int err; @@ -399,7 +399,7 @@ static int pc_timer_delete(struct k_itimer *kit) return err; } -static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) +static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec64 *ts) { clockid_t id = kit->it_clock; struct posix_clock_desc cd; @@ -414,7 +414,7 @@ static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts) } static int pc_timer_settime(struct k_itimer *kit, int flags, - struct itimerspec *ts, struct itimerspec *old) + struct itimerspec64 *ts, struct itimerspec64 *old) { clockid_t id = kit->it_clock; struct posix_clock_desc cd; diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index 4513ad16a253f6d2b0ccf6a3aa178b1e058c5c52..1370f067fb51511f26d578cb3954032952a62027 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -116,7 +116,7 @@ static inline u64 virt_ticks(struct task_struct *p) } static int -posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) +posix_cpu_clock_getres(const clockid_t which_clock, struct timespec64 *tp) { int error = check_clock(which_clock); 
if (!error) { @@ -135,7 +135,7 @@ posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) } static int -posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp) +posix_cpu_clock_set(const clockid_t which_clock, const struct timespec64 *tp) { /* * You can never reset a CPU clock, but we check for other errors @@ -261,7 +261,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock, static int posix_cpu_clock_get_task(struct task_struct *tsk, const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { int err = -EINVAL; u64 rtn; @@ -275,13 +275,13 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, } if (!err) - *tp = ns_to_timespec(rtn); + *tp = ns_to_timespec64(rtn); return err; } -static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) +static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec64 *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); int err = -EINVAL; @@ -562,7 +562,7 @@ static int cpu_timer_sample_group(const clockid_t which_clock, * and try again. (This happens when the timer is in the middle of firing.) */ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, - struct itimerspec *new, struct itimerspec *old) + struct itimerspec64 *new, struct itimerspec64 *old) { unsigned long flags; struct sighand_struct *sighand; @@ -572,7 +572,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, WARN_ON_ONCE(p == NULL); - new_expires = timespec_to_ns(&new->it_value); + new_expires = timespec64_to_ns(&new->it_value); /* * Protect against sighand release/switch in exit/exec and p->cpu_timers @@ -633,7 +633,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, bump_cpu_timer(timer, val); if (val < timer->it.cpu.expires) { old_expires = timer->it.cpu.expires - val; - old->it_value = ns_to_timespec(old_expires); + old->it_value = ns_to_timespec64(old_expires); } else { old->it_value.tv_nsec = 1; old->it_value.tv_sec = 0; @@ -671,7 +671,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, * Install the new reload setting, and * set up the signal and overrun bookkeeping. */ - timer->it.cpu.incr = timespec_to_ns(&new->it_interval); + timer->it.cpu.incr = timespec64_to_ns(&new->it_interval); /* * This acts as a modification timestamp for the timer, @@ -695,12 +695,12 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, ret = 0; out: if (old) - old->it_interval = ns_to_timespec(old_incr); + old->it_interval = ns_to_timespec64(old_incr); return ret; } -static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) +static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec64 *itp) { u64 now; struct task_struct *p = timer->it.cpu.task; @@ -710,7 +710,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) /* * Easy part: convert the reload time. */ - itp->it_interval = ns_to_timespec(timer->it.cpu.incr); + itp->it_interval = ns_to_timespec64(timer->it.cpu.incr); if (timer->it.cpu.expires == 0) { /* Timer not armed at all. */ itp->it_value.tv_sec = itp->it_value.tv_nsec = 0; @@ -739,7 +739,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) * Call the timer disarmed, nothing else to do. 
*/ timer->it.cpu.expires = 0; - itp->it_value = ns_to_timespec(timer->it.cpu.expires); + itp->it_value = ns_to_timespec64(timer->it.cpu.expires); return; } else { cpu_timer_sample_group(timer->it_clock, p, &now); @@ -748,7 +748,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) } if (now < timer->it.cpu.expires) { - itp->it_value = ns_to_timespec(timer->it.cpu.expires - now); + itp->it_value = ns_to_timespec64(timer->it.cpu.expires - now); } else { /* * The timer should have expired already, but the firing @@ -825,6 +825,8 @@ static void check_thread_timers(struct task_struct *tsk, * At the hard limit, we just die. * No need to calculate anything else now. */ + pr_info("CPU Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -836,8 +838,7 @@ static void check_thread_timers(struct task_struct *tsk, soft += USEC_PER_SEC; sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; } - printk(KERN_INFO - "RT Watchdog Timeout: %s[%d]\n", + pr_info("RT Watchdog Timeout (soft): %s[%d]\n", tsk->comm, task_pid_nr(tsk)); __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } @@ -935,6 +936,8 @@ static void check_process_timers(struct task_struct *tsk, * At the hard limit, we just die. * No need to calculate anything else now. */ + pr_info("RT Watchdog Timeout (hard): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } @@ -942,6 +945,8 @@ static void check_process_timers(struct task_struct *tsk, /* * At the soft limit, send a SIGXCPU every second. */ + pr_info("CPU Watchdog Timeout (soft): %s[%d]\n", + tsk->comm, task_pid_nr(tsk)); __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); if (soft < hard) { soft++; @@ -1214,7 +1219,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct itimerspec *it) + struct timespec64 *rqtp, struct itimerspec64 *it) { struct k_itimer timer; int error; @@ -1229,7 +1234,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, error = posix_cpu_timer_create(&timer); timer.it_process = current; if (!error) { - static struct itimerspec zero_it; + static struct itimerspec64 zero_it; memset(it, 0, sizeof *it); it->it_value = *rqtp; @@ -1264,7 +1269,7 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, /* * We were interrupted by a signal. 
*/ - *rqtp = ns_to_timespec(timer.it.cpu.expires); + *rqtp = ns_to_timespec64(timer.it.cpu.expires); error = posix_cpu_timer_set(&timer, 0, &zero_it, it); if (!error) { /* @@ -1301,10 +1306,11 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block); static int posix_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, struct timespec __user *rmtp) + struct timespec64 *rqtp, struct timespec __user *rmtp) { struct restart_block *restart_block = ¤t->restart_block; - struct itimerspec it; + struct itimerspec64 it; + struct timespec ts; int error; /* @@ -1312,7 +1318,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, */ if (CPUCLOCK_PERTHREAD(which_clock) && (CPUCLOCK_PID(which_clock) == 0 || - CPUCLOCK_PID(which_clock) == current->pid)) + CPUCLOCK_PID(which_clock) == task_pid_vnr(current))) return -EINVAL; error = do_cpu_nanosleep(which_clock, flags, rqtp, &it); @@ -1324,13 +1330,14 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, /* * Report back to the user the time still remaining. */ - if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + ts = timespec64_to_timespec(it.it_value); + if (rmtp && copy_to_user(rmtp, &ts, sizeof(*rmtp))) return -EFAULT; restart_block->fn = posix_cpu_nsleep_restart; restart_block->nanosleep.clockid = which_clock; restart_block->nanosleep.rmtp = rmtp; - restart_block->nanosleep.expires = timespec_to_ns(rqtp); + restart_block->nanosleep.expires = timespec64_to_ns(rqtp); } return error; } @@ -1338,11 +1345,12 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags, static long posix_cpu_nsleep_restart(struct restart_block *restart_block) { clockid_t which_clock = restart_block->nanosleep.clockid; - struct timespec t; - struct itimerspec it; + struct itimerspec64 it; + struct timespec64 t; + struct timespec tmp; int error; - t = ns_to_timespec(restart_block->nanosleep.expires); + t = ns_to_timespec64(restart_block->nanosleep.expires); error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it); @@ -1351,10 +1359,11 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) /* * Report back to the user the time still remaining. 
*/ - if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp)) + tmp = timespec64_to_timespec(it.it_value); + if (rmtp && copy_to_user(rmtp, &tmp, sizeof(*rmtp))) return -EFAULT; - restart_block->nanosleep.expires = timespec_to_ns(&t); + restart_block->nanosleep.expires = timespec64_to_ns(&t); } return error; @@ -1364,12 +1373,12 @@ static long posix_cpu_nsleep_restart(struct restart_block *restart_block) #define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED) static int process_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_getres(PROCESS_CLOCK, tp); } static int process_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_get(PROCESS_CLOCK, tp); } @@ -1379,7 +1388,7 @@ static int process_cpu_timer_create(struct k_itimer *timer) return posix_cpu_timer_create(timer); } static int process_cpu_nsleep(const clockid_t which_clock, int flags, - struct timespec *rqtp, + struct timespec64 *rqtp, struct timespec __user *rmtp) { return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); @@ -1389,12 +1398,12 @@ static long process_cpu_nsleep_restart(struct restart_block *restart_block) return -EINVAL; } static int thread_cpu_clock_getres(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_getres(THREAD_CLOCK, tp); } static int thread_cpu_clock_get(const clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { return posix_cpu_clock_get(THREAD_CLOCK, tp); } diff --git a/kernel/time/posix-stubs.c b/kernel/time/posix-stubs.c index cd6716e115e8d81a89afdc95286590ca9f7e500e..c0cd53eb018a249f99a3d120ffde9fbcc2abbdef 100644 --- a/kernel/time/posix-stubs.c +++ b/kernel/time/posix-stubs.c @@ -49,26 +49,32 @@ SYS_NI(alarm); SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { + struct timespec64 new_tp64; struct timespec new_tp; if (which_clock != CLOCK_REALTIME) return -EINVAL; if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; - return do_sys_settimeofday(&new_tp, NULL); + + new_tp64 = timespec_to_timespec64(new_tp); + return do_sys_settimeofday64(&new_tp64, NULL); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { + struct timespec64 kernel_tp64; struct timespec kernel_tp; switch (which_clock) { - case CLOCK_REALTIME: ktime_get_real_ts(&kernel_tp); break; - case CLOCK_MONOTONIC: ktime_get_ts(&kernel_tp); break; - case CLOCK_BOOTTIME: get_monotonic_boottime(&kernel_tp); break; + case CLOCK_REALTIME: ktime_get_real_ts64(&kernel_tp64); break; + case CLOCK_MONOTONIC: ktime_get_ts64(&kernel_tp64); break; + case CLOCK_BOOTTIME: get_monotonic_boottime64(&kernel_tp64); break; default: return -EINVAL; } + + kernel_tp = timespec64_to_timespec(kernel_tp64); if (copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) return -EFAULT; return 0; @@ -97,6 +103,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, const struct timespec __user *, rqtp, struct timespec __user *, rmtp) { + struct timespec64 t64; struct timespec t; switch (which_clock) { @@ -105,9 +112,10 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, case CLOCK_BOOTTIME: if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; - if (!timespec_valid(&t)) + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) return -EINVAL; - return hrtimer_nanosleep(&t, rmtp, flags & TIMER_ABSTIME 
? + return hrtimer_nanosleep(&t64, rmtp, flags & TIMER_ABSTIME ? HRTIMER_MODE_ABS : HRTIMER_MODE_REL, which_clock); default: diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 50a6a47020dea9e055b7267926ee9b0e39399d3c..4d7b2ce09c27cb6b77b1b2d18e379c4efed48d71 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -130,12 +130,12 @@ static struct k_clock posix_clocks[MAX_CLOCKS]; /* * These ones are defined below. */ -static int common_nsleep(const clockid_t, int flags, struct timespec *t, +static int common_nsleep(const clockid_t, int flags, struct timespec64 *t, struct timespec __user *rmtp); static int common_timer_create(struct k_itimer *new_timer); -static void common_timer_get(struct k_itimer *, struct itimerspec *); +static void common_timer_get(struct k_itimer *, struct itimerspec64 *); static int common_timer_set(struct k_itimer *, int, - struct itimerspec *, struct itimerspec *); + struct itimerspec64 *, struct itimerspec64 *); static int common_timer_del(struct k_itimer *timer); static enum hrtimer_restart posix_timer_fn(struct hrtimer *data); @@ -204,17 +204,17 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags) } /* Get clock_realtime */ -static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp) +static int posix_clock_realtime_get(clockid_t which_clock, struct timespec64 *tp) { - ktime_get_real_ts(tp); + ktime_get_real_ts64(tp); return 0; } /* Set clock_realtime */ static int posix_clock_realtime_set(const clockid_t which_clock, - const struct timespec *tp) + const struct timespec64 *tp) { - return do_sys_settimeofday(tp, NULL); + return do_sys_settimeofday64(tp, NULL); } static int posix_clock_realtime_adj(const clockid_t which_clock, @@ -226,54 +226,54 @@ static int posix_clock_realtime_adj(const clockid_t which_clock, /* * Get monotonic time for posix timers */ -static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp) +static int posix_ktime_get_ts(clockid_t which_clock, struct timespec64 *tp) { - ktime_get_ts(tp); + ktime_get_ts64(tp); return 0; } /* * Get monotonic-raw time for posix timers */ -static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp) +static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec64 *tp) { - getrawmonotonic(tp); + getrawmonotonic64(tp); return 0; } -static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec *tp) +static int posix_get_realtime_coarse(clockid_t which_clock, struct timespec64 *tp) { - *tp = current_kernel_time(); + *tp = current_kernel_time64(); return 0; } static int posix_get_monotonic_coarse(clockid_t which_clock, - struct timespec *tp) + struct timespec64 *tp) { - *tp = get_monotonic_coarse(); + *tp = get_monotonic_coarse64(); return 0; } -static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp) +static int posix_get_coarse_res(const clockid_t which_clock, struct timespec64 *tp) { - *tp = ktime_to_timespec(KTIME_LOW_RES); + *tp = ktime_to_timespec64(KTIME_LOW_RES); return 0; } -static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp) +static int posix_get_boottime(const clockid_t which_clock, struct timespec64 *tp) { - get_monotonic_boottime(tp); + get_monotonic_boottime64(tp); return 0; } -static int posix_get_tai(clockid_t which_clock, struct timespec *tp) +static int posix_get_tai(clockid_t which_clock, struct timespec64 *tp) { - timekeeping_clocktai(tp); + timekeeping_clocktai64(tp); return 0; } -static int 
posix_get_hrtimer_res(clockid_t which_clock, struct timespec *tp) +static int posix_get_hrtimer_res(clockid_t which_clock, struct timespec64 *tp) { tp->tv_sec = 0; tp->tv_nsec = hrtimer_resolution; @@ -734,18 +734,18 @@ static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags) * report. */ static void -common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) +common_timer_get(struct k_itimer *timr, struct itimerspec64 *cur_setting) { ktime_t now, remaining, iv; struct hrtimer *timer = &timr->it.real.timer; - memset(cur_setting, 0, sizeof(struct itimerspec)); + memset(cur_setting, 0, sizeof(*cur_setting)); iv = timr->it.real.interval; /* interval timer ? */ if (iv) - cur_setting->it_interval = ktime_to_timespec(iv); + cur_setting->it_interval = ktime_to_timespec64(iv); else if (!hrtimer_active(timer) && (timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) return; @@ -771,13 +771,14 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) if ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE) cur_setting->it_value.tv_nsec = 1; } else - cur_setting->it_value = ktime_to_timespec(remaining); + cur_setting->it_value = ktime_to_timespec64(remaining); } /* Get the time remaining on a POSIX.1b interval timer. */ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, struct itimerspec __user *, setting) { + struct itimerspec64 cur_setting64; struct itimerspec cur_setting; struct k_itimer *timr; struct k_clock *kc; @@ -792,10 +793,11 @@ SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id, if (WARN_ON_ONCE(!kc || !kc->timer_get)) ret = -EINVAL; else - kc->timer_get(timr, &cur_setting); + kc->timer_get(timr, &cur_setting64); unlock_timer(timr, flags); + cur_setting = itimerspec64_to_itimerspec(&cur_setting64); if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting))) return -EFAULT; @@ -831,7 +833,7 @@ SYSCALL_DEFINE1(timer_getoverrun, timer_t, timer_id) /* timr->it_lock is taken. */ static int common_timer_set(struct k_itimer *timr, int flags, - struct itimerspec *new_setting, struct itimerspec *old_setting) + struct itimerspec64 *new_setting, struct itimerspec64 *old_setting) { struct hrtimer *timer = &timr->it.real.timer; enum hrtimer_mode mode; @@ -860,10 +862,10 @@ common_timer_set(struct k_itimer *timr, int flags, hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); timr->it.real.timer.function = posix_timer_fn; - hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value)); + hrtimer_set_expires(timer, timespec64_to_ktime(new_setting->it_value)); /* Convert interval */ - timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); + timr->it.real.interval = timespec64_to_ktime(new_setting->it_interval); /* SIGEV_NONE timers are not queued ! See common_timer_get */ if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { @@ -883,21 +885,23 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, const struct itimerspec __user *, new_setting, struct itimerspec __user *, old_setting) { - struct k_itimer *timr; + struct itimerspec64 new_spec64, old_spec64; + struct itimerspec64 *rtn = old_setting ? &old_spec64 : NULL; struct itimerspec new_spec, old_spec; - int error = 0; + struct k_itimer *timr; unsigned long flag; - struct itimerspec *rtn = old_setting ? 
&old_spec : NULL; struct k_clock *kc; + int error = 0; if (!new_setting) return -EINVAL; if (copy_from_user(&new_spec, new_setting, sizeof (new_spec))) return -EFAULT; + new_spec64 = itimerspec_to_itimerspec64(&new_spec); - if (!timespec_valid(&new_spec.it_interval) || - !timespec_valid(&new_spec.it_value)) + if (!timespec64_valid(&new_spec64.it_interval) || + !timespec64_valid(&new_spec64.it_value)) return -EINVAL; retry: timr = lock_timer(timer_id, &flag); @@ -908,7 +912,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, if (WARN_ON_ONCE(!kc || !kc->timer_set)) error = -EINVAL; else - error = kc->timer_set(timr, flags, &new_spec, rtn); + error = kc->timer_set(timr, flags, &new_spec64, rtn); unlock_timer(timr, flag); if (error == TIMER_RETRY) { @@ -916,6 +920,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags, goto retry; } + old_spec = itimerspec64_to_itimerspec(&old_spec64); if (old_setting && !error && copy_to_user(old_setting, &old_spec, sizeof (old_spec))) error = -EFAULT; @@ -1014,6 +1019,7 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, const struct timespec __user *, tp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 new_tp64; struct timespec new_tp; if (!kc || !kc->clock_set) @@ -1021,21 +1027,24 @@ SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock, if (copy_from_user(&new_tp, tp, sizeof (*tp))) return -EFAULT; + new_tp64 = timespec_to_timespec64(new_tp); - return kc->clock_set(which_clock, &new_tp); + return kc->clock_set(which_clock, &new_tp64); } SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock, struct timespec __user *,tp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 kernel_tp64; struct timespec kernel_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_get(which_clock, &kernel_tp); + error = kc->clock_get(which_clock, &kernel_tp64); + kernel_tp = timespec64_to_timespec(kernel_tp64); if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp))) error = -EFAULT; @@ -1070,13 +1079,15 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, struct timespec __user *, tp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 rtn_tp64; struct timespec rtn_tp; int error; if (!kc) return -EINVAL; - error = kc->clock_getres(which_clock, &rtn_tp); + error = kc->clock_getres(which_clock, &rtn_tp64); + rtn_tp = timespec64_to_timespec(rtn_tp64); if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) error = -EFAULT; @@ -1088,7 +1099,7 @@ SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock, * nanosleep for monotonic and realtime clocks */ static int common_nsleep(const clockid_t which_clock, int flags, - struct timespec *tsave, struct timespec __user *rmtp) + struct timespec64 *tsave, struct timespec __user *rmtp) { return hrtimer_nanosleep(tsave, rmtp, flags & TIMER_ABSTIME ? 
HRTIMER_MODE_ABS : HRTIMER_MODE_REL, @@ -1100,6 +1111,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, struct timespec __user *, rmtp) { struct k_clock *kc = clockid_to_kclock(which_clock); + struct timespec64 t64; struct timespec t; if (!kc) @@ -1110,10 +1122,11 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags, if (copy_from_user(&t, rqtp, sizeof (struct timespec))) return -EFAULT; - if (!timespec_valid(&t)) + t64 = timespec_to_timespec64(t); + if (!timespec64_valid(&t64)) return -EINVAL; - return kc->nsleep(which_clock, flags, &t, rmtp); + return kc->nsleep(which_clock, flags, &t64, rmtp); } /* diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c index ea6b610c4c57c3f4d600cfc5a42597a34bedc72b..2d8f05aad442a5fcafc37b98a177f84bde2047a5 100644 --- a/kernel/time/sched_clock.c +++ b/kernel/time/sched_clock.c @@ -206,6 +206,11 @@ sched_clock_register(u64 (*read)(void), int bits, unsigned long rate) update_clock_read_data(&rd); + if (sched_clock_timer.function != NULL) { + /* update timeout for clock wrap */ + hrtimer_start(&sched_clock_timer, cd.wrap_kt, HRTIMER_MODE_REL); + } + r = rate; if (r >= 4000000) { r /= 1000000; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7fe53be860778b07210fb7a333bf49695bded575..64c97fc130c4db2f1241185d9075af7a0f5175f5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -993,6 +993,18 @@ ktime_t tick_nohz_get_sleep_length(void) return ts->sleep_length; } +/** + * tick_nohz_get_idle_calls - return the current idle calls counter value + * + * Called from the schedutil frequency scaling governor in scheduler context. + */ +unsigned long tick_nohz_get_idle_calls(void) +{ + struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched); + + return ts->idle_calls; +} + static void tick_nohz_account_idle_ticks(struct tick_sched *ts) { #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE diff --git a/kernel/time/time.c b/kernel/time/time.c index 25bdd250457192df0a2dfc13e6c9078bb92be0d6..49c73c6ed648900d873190f9ebaefe9cb44f4f98 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -193,8 +193,8 @@ int do_sys_settimeofday64(const struct timespec64 *tv, const struct timezone *tz SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, struct timezone __user *, tz) { + struct timespec64 new_ts; struct timeval user_tv; - struct timespec new_ts; struct timezone new_tz; if (tv) { @@ -212,7 +212,7 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, return -EFAULT; } - return do_sys_settimeofday(tv ? &new_ts : NULL, tz ? &new_tz : NULL); + return do_sys_settimeofday64(tv ? &new_ts : NULL, tz ? &new_tz : NULL); } SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) @@ -230,20 +230,6 @@ SYSCALL_DEFINE1(adjtimex, struct timex __user *, txc_p) return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret; } -/** - * current_fs_time - Return FS time - * @sb: Superblock. - * - * Return the current time truncated to the time granularity supported by - * the fs. - */ -struct timespec current_fs_time(struct super_block *sb) -{ - struct timespec now = current_kernel_time(); - return timespec_trunc(now, sb->s_time_gran); -} -EXPORT_SYMBOL(current_fs_time); - /* * Convert jiffies to milliseconds and back. 
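 *
 * Worked example: with CONFIG_HZ=1000 a jiffy is 1 ms, so
 * jiffies_to_msecs(250) == 250 and msecs_to_jiffies(250) == 250.
 * With CONFIG_HZ=250 a jiffy is 4 ms, so jiffies_to_msecs(250) == 1000
 * while msecs_to_jiffies(250) == 63, since msecs_to_jiffies rounds up
 * to the next whole jiffy.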
* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 5b63a2102c2907bc42a870fe2545bafdb10cdf28..9652bc57fd09811fa4e3ffbabc81b9139e75f125 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -996,8 +996,7 @@ static int adjust_historical_crosststamp(struct system_time_snapshot *history, return 0; /* Interpolate shortest distance from beginning or end of history */ - interp_forward = partial_history_cycles > total_history_cycles/2 ? - true : false; + interp_forward = partial_history_cycles > total_history_cycles / 2; partial_history_cycles = interp_forward ? total_history_cycles - partial_history_cycles : partial_history_cycles; diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 1dc0256bfb6e1f00bfafb1c6e085feda9f04cfa3..152a706ef8b87ca8f86a195d2fedfbc728c592d7 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -241,7 +241,7 @@ int timer_migration_handler(struct ctl_table *table, int write, int ret; mutex_lock(&mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) timers_update_migration(false); mutex_unlock(&mutex); @@ -1120,7 +1120,7 @@ void add_timer_on(struct timer_list *timer, int cpu) EXPORT_SYMBOL_GPL(add_timer_on); /** - * del_timer - deactive a timer. + * del_timer - deactivate a timer. * @timer: the timer to be deactivated * * del_timer() deactivates a timer - this works on both active and inactive diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ff8d5c13d04bd0911c62584b6f2764b7a27cb89a..0e7f5428a1484ed85215a72e115759eace1392fb 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -16,6 +16,7 @@ #include #include #include +#include #include @@ -86,6 +87,9 @@ print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, next_one: i = 0; + + touch_nmi_watchdog(); + raw_spin_lock_irqsave(&base->cpu_base->lock, flags); curr = timerqueue_getnext(&base->active); @@ -197,6 +201,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) { struct clock_event_device *dev = td->evtdev; + touch_nmi_watchdog(); + SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); if (cpu < 0) SEQ_printf(m, "Broadcast device\n"); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index d4a06e714645df56f75db97ba6bb052534a4bb41..7e06f04e98fe2c5a0194ce05c88a270fcf0bfc4d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -134,7 +134,8 @@ config FUNCTION_TRACER select KALLSYMS select GENERIC_TRACER select CONTEXT_SWITCH_TRACER - select GLOB + select GLOB + select TASKS_RCU if PREEMPT help Enable the kernel to trace every kernel function. 
This is done by using a compiler feature to insert a small, 5-byte No-Operation @@ -455,7 +456,7 @@ config UPROBE_EVENTS select UPROBES select PROBE_EVENTS select TRACING - default n + default y help This allows the user to add tracing events on top of userspace dynamic events (similar to tracepoints) on the fly via the trace diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b2058a7f94bd8797f5629158aeed8bf045c20e4f..bd8ae8d5ae9ca865f3738c6b4df579a3a1ca12cc 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -690,8 +690,8 @@ void blk_trace_shutdown(struct request_queue *q) /** * blk_add_trace_rq - Add a trace for a request oriented action - * @q: queue the io is for * @rq: the source request + * @error: return status to log * @nr_bytes: number of completed bytes * @what: the action * @@ -699,10 +699,10 @@ void blk_trace_shutdown(struct request_queue *q) * Records an action against a request. Will log the bio offset + size. * **/ -static void blk_add_trace_rq(struct request_queue *q, struct request *rq, +static void blk_add_trace_rq(struct request *rq, int error, unsigned int nr_bytes, u32 what) { - struct blk_trace *bt = q->blk_trace; + struct blk_trace *bt = rq->q->blk_trace; if (likely(!bt)) return; @@ -713,40 +713,32 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, what |= BLK_TC_ACT(BLK_TC_FS); __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), - rq->cmd_flags, what, rq->errors, 0, NULL); -} - -static void blk_add_trace_rq_abort(void *ignore, - struct request_queue *q, struct request *rq) -{ - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ABORT); + rq->cmd_flags, what, error, 0, NULL); } static void blk_add_trace_rq_insert(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_INSERT); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_INSERT); } static void blk_add_trace_rq_issue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_ISSUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_ISSUE); } static void blk_add_trace_rq_requeue(void *ignore, struct request_queue *q, struct request *rq) { - blk_add_trace_rq(q, rq, blk_rq_bytes(rq), BLK_TA_REQUEUE); + blk_add_trace_rq(rq, 0, blk_rq_bytes(rq), BLK_TA_REQUEUE); } -static void blk_add_trace_rq_complete(void *ignore, - struct request_queue *q, - struct request *rq, - unsigned int nr_bytes) +static void blk_add_trace_rq_complete(void *ignore, struct request *rq, + int error, unsigned int nr_bytes) { - blk_add_trace_rq(q, rq, nr_bytes, BLK_TA_COMPLETE); + blk_add_trace_rq(rq, error, nr_bytes, BLK_TA_COMPLETE); } /** @@ -941,7 +933,7 @@ static void blk_add_trace_rq_remap(void *ignore, r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), - rq_data_dir(rq), 0, BLK_TA_REMAP, !!rq->errors, + rq_data_dir(rq), 0, BLK_TA_REMAP, 0, sizeof(r), &r); } @@ -966,7 +958,7 @@ void blk_add_driver_data(struct request_queue *q, return; __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, - BLK_TA_DRV_DATA, rq->errors, len, data); + BLK_TA_DRV_DATA, 0, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); @@ -974,8 +966,6 @@ static void blk_register_tracepoints(void) { int ret; - ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); - WARN_ON(ret); ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); WARN_ON(ret); ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); 
@@ -1028,7 +1018,6 @@ static void blk_unregister_tracepoints(void) unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); - unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); tracepoint_synchronize_unregister(); } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index cee9802cf3e00f0f5ef1625df14fa9d6892b3581..460a031c77e592411b83a4071d976706db975687 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -96,7 +96,7 @@ BPF_CALL_3(bpf_probe_write_user, void *, unsafe_ptr, const void *, src, if (unlikely(in_interrupt() || current->flags & (PF_KTHREAD | PF_EXITING))) return -EPERM; - if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + if (unlikely(uaccess_kernel())) return -EPERM; if (!access_ok(VERIFY_WRITE, unsafe_ptr, size)) return -EPERM; @@ -501,16 +501,11 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -static const struct bpf_verifier_ops kprobe_prog_ops = { +const struct bpf_verifier_ops kprobe_prog_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; -static struct bpf_prog_type_list kprobe_tl __ro_after_init = { - .ops = &kprobe_prog_ops, - .type = BPF_PROG_TYPE_KPROBE, -}; - BPF_CALL_5(bpf_perf_event_output_tp, void *, tp_buff, struct bpf_map *, map, u64, flags, void *, data, u64, size) { @@ -584,16 +579,11 @@ static bool tp_prog_is_valid_access(int off, int size, enum bpf_access_type type return true; } -static const struct bpf_verifier_ops tracepoint_prog_ops = { +const struct bpf_verifier_ops tracepoint_prog_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = tp_prog_is_valid_access, }; -static struct bpf_prog_type_list tracepoint_tl __ro_after_init = { - .ops = &tracepoint_prog_ops, - .type = BPF_PROG_TYPE_TRACEPOINT, -}; - static bool pe_prog_is_valid_access(int off, int size, enum bpf_access_type type, enum bpf_reg_type *reg_type) { @@ -642,22 +632,8 @@ static u32 pe_prog_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -static const struct bpf_verifier_ops perf_event_prog_ops = { +const struct bpf_verifier_ops perf_event_prog_ops = { .get_func_proto = tp_prog_func_proto, .is_valid_access = pe_prog_is_valid_access, .convert_ctx_access = pe_prog_convert_ctx_access, }; - -static struct bpf_prog_type_list perf_event_tl __ro_after_init = { - .ops = &perf_event_prog_ops, - .type = BPF_PROG_TYPE_PERF_EVENT, -}; - -static int __init register_kprobe_prog_ops(void) -{ - bpf_register_prog_type(&kprobe_tl); - bpf_register_prog_type(&tracepoint_tl); - bpf_register_prog_type(&perf_event_tl); - return 0; -} -late_initcall(register_kprobe_prog_ops); diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 27bb2e61276e1c6b3a581ce0f3a1f750d5975227..39dca4e86a94f0867c29aaa29b47d881eefd02e8 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -36,6 +36,7 @@ #include +#include #include #include "trace_output.h" @@ -1095,22 +1096,20 @@ static bool update_all_ops; # error Dynamic ftrace depends on MCOUNT_RECORD #endif -static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly; - -struct ftrace_func_probe { - struct hlist_node node; - struct ftrace_probe_ops *ops; - unsigned long flags; - unsigned long ip; - void *data; - struct list_head free_list; -}; - struct ftrace_func_entry { struct hlist_node hlist; unsigned long ip; }; +struct 
ftrace_func_probe {
+	struct ftrace_probe_ops	*probe_ops;
+	struct ftrace_ops	ops;
+	struct trace_array	*tr;
+	struct list_head	list;
+	void			*data;
+	int			ref;
+};
+
 /*
  * We make these constant because no one should touch them,
  * but they are used as the default "empty hash", to avoid allocating
@@ -1271,7 +1270,7 @@ static void
 remove_hash_entry(struct ftrace_hash *hash,
 		  struct ftrace_func_entry *entry)
 {
-	hlist_del(&entry->hlist);
+	hlist_del_rcu(&entry->hlist);
 	hash->count--;
 }
 
@@ -2807,18 +2806,28 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
 	 * callers are done before leaving this function.
 	 * The same goes for freeing the per_cpu data of the per_cpu
 	 * ops.
-	 *
-	 * Again, normal synchronize_sched() is not good enough.
-	 * We need to do a hard force of sched synchronization.
-	 * This is because we use preempt_disable() to do RCU, but
-	 * the function tracers can be called where RCU is not watching
-	 * (like before user_exit()). We can not rely on the RCU
-	 * infrastructure to do the synchronization, thus we must do it
-	 * ourselves.
 	 */
 	if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
+		/*
+		 * We need to do a hard force of sched synchronization.
+		 * This is because we use preempt_disable() to do RCU, but
+		 * the function tracers can be called where RCU is not watching
+		 * (like before user_exit()). We can not rely on the RCU
+		 * infrastructure to do the synchronization, thus we must do it
+		 * ourselves.
+		 */
 		schedule_on_each_cpu(ftrace_sync);
+		/*
+		 * When the kernel is preemptible, tasks can be preempted
+		 * while on an ftrace trampoline. Just scheduling a task on
+		 * a CPU is not good enough to flush them. Calling
+		 * synchronize_rcu_tasks() will wait for those tasks to
+		 * execute and either schedule voluntarily or enter user space.
+ */ + if (IS_ENABLED(CONFIG_PREEMPT)) + synchronize_rcu_tasks(); + arch_ftrace_trampoline_free(ops); if (ops->flags & FTRACE_OPS_FL_PER_CPU) @@ -3055,34 +3064,63 @@ struct ftrace_iterator { struct ftrace_page *pg; struct dyn_ftrace *func; struct ftrace_func_probe *probe; + struct ftrace_func_entry *probe_entry; struct trace_parser parser; struct ftrace_hash *hash; struct ftrace_ops *ops; - int hidx; + int pidx; int idx; unsigned flags; }; static void * -t_hash_next(struct seq_file *m, loff_t *pos) +t_probe_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; + struct trace_array *tr = iter->ops->private; + struct list_head *func_probes; + struct ftrace_hash *hash; + struct list_head *next; struct hlist_node *hnd = NULL; struct hlist_head *hhd; + int size; (*pos)++; iter->pos = *pos; - if (iter->probe) - hnd = &iter->probe->node; - retry: - if (iter->hidx >= FTRACE_FUNC_HASHSIZE) + if (!tr) return NULL; - hhd = &ftrace_func_hash[iter->hidx]; + func_probes = &tr->func_probes; + if (list_empty(func_probes)) + return NULL; + + if (!iter->probe) { + next = func_probes->next; + iter->probe = list_entry(next, struct ftrace_func_probe, list); + } + + if (iter->probe_entry) + hnd = &iter->probe_entry->hlist; + + hash = iter->probe->ops.func_hash->filter_hash; + size = 1 << hash->size_bits; + + retry: + if (iter->pidx >= size) { + if (iter->probe->list.next == func_probes) + return NULL; + next = iter->probe->list.next; + iter->probe = list_entry(next, struct ftrace_func_probe, list); + hash = iter->probe->ops.func_hash->filter_hash; + size = 1 << hash->size_bits; + iter->pidx = 0; + } + + hhd = &hash->buckets[iter->pidx]; if (hlist_empty(hhd)) { - iter->hidx++; + iter->pidx++; hnd = NULL; goto retry; } @@ -3092,7 +3130,7 @@ t_hash_next(struct seq_file *m, loff_t *pos) else { hnd = hnd->next; if (!hnd) { - iter->hidx++; + iter->pidx++; goto retry; } } @@ -3100,26 +3138,28 @@ t_hash_next(struct seq_file *m, loff_t *pos) if (WARN_ON_ONCE(!hnd)) return NULL; - iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + iter->probe_entry = hlist_entry(hnd, struct ftrace_func_entry, hlist); return iter; } -static void *t_hash_start(struct seq_file *m, loff_t *pos) +static void *t_probe_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; void *p = NULL; loff_t l; - if (!(iter->flags & FTRACE_ITER_DO_HASH)) + if (!(iter->flags & FTRACE_ITER_DO_PROBES)) return NULL; if (iter->func_pos > *pos) return NULL; - iter->hidx = 0; + iter->probe = NULL; + iter->probe_entry = NULL; + iter->pidx = 0; for (l = 0; l <= (*pos - iter->func_pos); ) { - p = t_hash_next(m, &l); + p = t_probe_next(m, &l); if (!p) break; } @@ -3127,50 +3167,42 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) return NULL; /* Only set this if we have an item */ - iter->flags |= FTRACE_ITER_HASH; + iter->flags |= FTRACE_ITER_PROBE; return iter; } static int -t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) +t_probe_show(struct seq_file *m, struct ftrace_iterator *iter) { - struct ftrace_func_probe *rec; + struct ftrace_func_entry *probe_entry; + struct ftrace_probe_ops *probe_ops; + struct ftrace_func_probe *probe; - rec = iter->probe; - if (WARN_ON_ONCE(!rec)) + probe = iter->probe; + probe_entry = iter->probe_entry; + + if (WARN_ON_ONCE(!probe || !probe_entry)) return -EIO; - if (rec->ops->print) - return rec->ops->print(m, rec->ip, rec->ops, rec->data); + probe_ops = probe->probe_ops; - seq_printf(m, "%ps:%ps", (void *)rec->ip, (void 
*)rec->ops->func);
 
-	if (rec->data)
-		seq_printf(m, ":%p", rec->data);
-	seq_putc(m, '\n');
+	seq_printf(m, "%ps:%ps\n", (void *)probe_entry->ip,
+		   (void *)probe_ops->func);
 
 	return 0;
 }
 
 static void *
-t_next(struct seq_file *m, void *v, loff_t *pos)
+t_func_next(struct seq_file *m, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
-	struct ftrace_ops *ops = iter->ops;
 	struct dyn_ftrace *rec = NULL;
 
-	if (unlikely(ftrace_disabled))
-		return NULL;
-
-	if (iter->flags & FTRACE_ITER_HASH)
-		return t_hash_next(m, pos);
-
 	(*pos)++;
-	iter->pos = iter->func_pos = *pos;
-
-	if (iter->flags & FTRACE_ITER_PRINTALL)
-		return t_hash_start(m, pos);
 
  retry:
 	if (iter->idx >= iter->pg->index) {
@@ -3181,11 +3213,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 		}
 	} else {
 		rec = &iter->pg->records[iter->idx++];
-		if (((iter->flags & FTRACE_ITER_FILTER) &&
-		     !(ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip))) ||
-
-		    ((iter->flags & FTRACE_ITER_NOTRACE) &&
-		     !ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip)) ||
+		if (((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) &&
+		     !ftrace_lookup_ip(iter->hash, rec->ip)) ||
 
 		    ((iter->flags & FTRACE_ITER_ENABLED) &&
 		     !(rec->flags & FTRACE_FL_ENABLED))) {
@@ -3196,24 +3225,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 	}
 
 	if (!rec)
-		return t_hash_start(m, pos);
+		return NULL;
 
+	iter->pos = iter->func_pos = *pos;
 	iter->func = rec;
 
 	return iter;
 }
 
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	loff_t l = *pos; /* t_probe_start() must use original pos */
+	void *ret;
+
+	if (unlikely(ftrace_disabled))
+		return NULL;
+
+	if (iter->flags & FTRACE_ITER_PROBE)
+		return t_probe_next(m, pos);
+
+	if (iter->flags & FTRACE_ITER_PRINTALL) {
+		/* next must increment pos, and t_probe_start does not */
+		(*pos)++;
+		return t_probe_start(m, &l);
+	}
+
+	ret = t_func_next(m, pos);
+
+	if (!ret)
+		return t_probe_start(m, &l);
+
+	return ret;
+}
+
 static void reset_iter_read(struct ftrace_iterator *iter)
 {
 	iter->pos = 0;
 	iter->func_pos = 0;
-	iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_HASH);
+	iter->flags &= ~(FTRACE_ITER_PRINTALL | FTRACE_ITER_PROBE);
}
 
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
-	struct ftrace_ops *ops = iter->ops;
 	void *p = NULL;
 	loff_t l;
 
@@ -3233,20 +3289,19 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	 * off, we can short cut and just print out that all
 	 * functions are enabled.
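	 *
	 * (For example, reading set_ftrace_filter with no filter set takes
	 * this path and shows the single "#### all functions enabled ####"
	 * line emitted by t_show().)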
*/ - if ((iter->flags & FTRACE_ITER_FILTER && - ftrace_hash_empty(ops->func_hash->filter_hash)) || - (iter->flags & FTRACE_ITER_NOTRACE && - ftrace_hash_empty(ops->func_hash->notrace_hash))) { + if ((iter->flags & (FTRACE_ITER_FILTER | FTRACE_ITER_NOTRACE)) && + ftrace_hash_empty(iter->hash)) { + iter->func_pos = 1; /* Account for the message */ if (*pos > 0) - return t_hash_start(m, pos); + return t_probe_start(m, pos); iter->flags |= FTRACE_ITER_PRINTALL; /* reset in case of seek/pread */ - iter->flags &= ~FTRACE_ITER_HASH; + iter->flags &= ~FTRACE_ITER_PROBE; return iter; } - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_start(m, pos); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_start(m, pos); /* * Unfortunately, we need to restart at ftrace_pages_start @@ -3256,13 +3311,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) iter->pg = ftrace_pages_start; iter->idx = 0; for (l = 0; l <= *pos; ) { - p = t_next(m, p, &l); + p = t_func_next(m, &l); if (!p) break; } if (!p) - return t_hash_start(m, pos); + return t_probe_start(m, pos); return iter; } @@ -3293,8 +3348,8 @@ static int t_show(struct seq_file *m, void *v) struct ftrace_iterator *iter = m->private; struct dyn_ftrace *rec; - if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, iter); + if (iter->flags & FTRACE_ITER_PROBE) + return t_probe_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { if (iter->flags & FTRACE_ITER_NOTRACE) @@ -3355,12 +3410,13 @@ ftrace_avail_open(struct inode *inode, struct file *file) return -ENODEV; iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); - if (iter) { - iter->pg = ftrace_pages_start; - iter->ops = &global_ops; - } + if (!iter) + return -ENOMEM; + + iter->pg = ftrace_pages_start; + iter->ops = &global_ops; - return iter ? 0 : -ENOMEM; + return 0; } static int @@ -3369,13 +3425,14 @@ ftrace_enabled_open(struct inode *inode, struct file *file) struct ftrace_iterator *iter; iter = __seq_open_private(file, &show_ftrace_seq_ops, sizeof(*iter)); - if (iter) { - iter->pg = ftrace_pages_start; - iter->flags = FTRACE_ITER_ENABLED; - iter->ops = &global_ops; - } + if (!iter) + return -ENOMEM; - return iter ? 0 : -ENOMEM; + iter->pg = ftrace_pages_start; + iter->flags = FTRACE_ITER_ENABLED; + iter->ops = &global_ops; + + return 0; } /** @@ -3440,7 +3497,8 @@ ftrace_regex_open(struct ftrace_ops *ops, int flag, ret = -ENOMEM; goto out_unlock; } - } + } else + iter->hash = hash; if (file->f_mode & FMODE_READ) { iter->pg = ftrace_pages_start; @@ -3470,7 +3528,7 @@ ftrace_filter_open(struct inode *inode, struct file *file) struct ftrace_ops *ops = inode->i_private; return ftrace_regex_open(ops, - FTRACE_ITER_FILTER | FTRACE_ITER_DO_HASH, + FTRACE_ITER_FILTER | FTRACE_ITER_DO_PROBES, inode, file); } @@ -3573,22 +3631,20 @@ ftrace_match_record(struct dyn_ftrace *rec, struct ftrace_glob *func_g, /* blank module name to match all modules */ if (!mod_g->len) { /* blank module globbing: modname xor exclude_mod */ - if ((!exclude_mod) != (!modname)) + if (!exclude_mod != !modname) goto func_match; return 0; } - /* not matching the module */ - if (!modname || !mod_matches) { - if (exclude_mod) - goto func_match; - else - return 0; - } - - if (mod_matches && exclude_mod) + /* + * exclude_mod is set to trace everything but the given + * module. If it is set and the module matches, then + * return 0. If it is not set, and the module doesn't match + * also return 0. Otherwise, check the function to see if + * that matches. 
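+	 *
+	 * As a truth table for the test below:
+	 *
+	 *	mod_matches	exclude_mod	result
+	 *	false		false		return 0
+	 *	true		true		return 0
+	 *	true		false		check the function
+	 *	false		true		check the function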
+ */ + if (!mod_matches == !exclude_mod) return 0; - func_match: /* blank search means to match all funcs in the mod */ if (!func_g->len) @@ -3654,6 +3710,56 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) return match_records(hash, buff, len, NULL); } +static void ftrace_ops_update_code(struct ftrace_ops *ops, + struct ftrace_ops_hash *old_hash) +{ + struct ftrace_ops *op; + + if (!ftrace_enabled) + return; + + if (ops->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); + return; + } + + /* + * If this is the shared global_ops filter, then we need to + * check if there is another ops that shares it, is enabled. + * If so, we still need to run the modify code. + */ + if (ops->func_hash != &global_ops.local_hash) + return; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->func_hash == &global_ops.local_hash && + op->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); + /* Only need to do this once */ + return; + } + } while_for_each_ftrace_op(op); +} + +static int ftrace_hash_move_and_update_ops(struct ftrace_ops *ops, + struct ftrace_hash **orig_hash, + struct ftrace_hash *hash, + int enable) +{ + struct ftrace_ops_hash old_hash_ops; + struct ftrace_hash *old_hash; + int ret; + + old_hash = *orig_hash; + old_hash_ops.filter_hash = ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; + ret = ftrace_hash_move(ops, enable, orig_hash, hash); + if (!ret) { + ftrace_ops_update_code(ops, &old_hash_ops); + free_ftrace_hash_rcu(old_hash); + } + return ret; +} /* * We register the module command as a template to show others how @@ -3661,7 +3767,7 @@ ftrace_match_records(struct ftrace_hash *hash, char *buff, int len) */ static int -ftrace_mod_callback(struct ftrace_hash *hash, +ftrace_mod_callback(struct trace_array *tr, struct ftrace_hash *hash, char *func, char *cmd, char *module, int enable) { int ret; @@ -3695,16 +3801,11 @@ core_initcall(ftrace_mod_cmd_init); static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { - struct ftrace_func_probe *entry; - struct hlist_head *hhd; - unsigned long key; - - key = hash_long(ip, FTRACE_HASH_BITS); + struct ftrace_probe_ops *probe_ops; + struct ftrace_func_probe *probe; - hhd = &ftrace_func_hash[key]; - - if (hlist_empty(hhd)) - return; + probe = container_of(op, struct ftrace_func_probe, ops); + probe_ops = probe->probe_ops; /* * Disable preemption for these calls to prevent a RCU grace @@ -3712,209 +3813,336 @@ static void function_trace_probe_call(unsigned long ip, unsigned long parent_ip, * on the hash. rcu_read_lock is too dangerous here. 
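	 *
	 * Note that probe_ops->func now receives the trace_array and the
	 * per-probe data directly, so a handler has this shape (hypothetical
	 * example, not part of this patch):
	 *
	 *	static void my_probe(unsigned long ip, unsigned long parent_ip,
	 *			     struct trace_array *tr,
	 *			     struct ftrace_probe_ops *ops, void *data);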
*/
 	preempt_disable_notrace();
-	hlist_for_each_entry_rcu_notrace(entry, hhd, node) {
-		if (entry->ip == ip)
-			entry->ops->func(ip, parent_ip, &entry->data);
-	}
+	probe_ops->func(ip, parent_ip, probe->tr, probe_ops, probe->data);
 	preempt_enable_notrace();
 }
 
-static struct ftrace_ops trace_probe_ops __read_mostly =
-{
-	.func		= function_trace_probe_call,
-	.flags		= FTRACE_OPS_FL_INITIALIZED,
-	INIT_OPS_HASH(trace_probe_ops)
+struct ftrace_func_map {
+	struct ftrace_func_entry	entry;
+	void				*data;
 };
 
-static int ftrace_probe_registered;
+struct ftrace_func_mapper {
+	struct ftrace_hash		hash;
+};
 
-static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)
+/**
+ * allocate_ftrace_func_mapper - allocate a new ftrace_func_mapper
+ *
+ * Returns a ftrace_func_mapper descriptor that can be used to map ips to data.
+ */
+struct ftrace_func_mapper *allocate_ftrace_func_mapper(void)
 {
-	int ret;
-	int i;
+	struct ftrace_hash *hash;
 
-	if (ftrace_probe_registered) {
-		/* still need to update the function call sites */
-		if (ftrace_enabled)
-			ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
-					       old_hash);
-		return;
-	}
+	/*
+	 * The mapper is simply a ftrace_hash, but since the entries
+	 * in the hash are not ftrace_func_entry type, we define it
+	 * as a separate structure.
+	 */
+	hash = alloc_ftrace_hash(FTRACE_HASH_DEFAULT_BITS);
+	return (struct ftrace_func_mapper *)hash;
+}
 
-	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
-		struct hlist_head *hhd = &ftrace_func_hash[i];
-		if (hhd->first)
-			break;
-	}
-	/* Nothing registered? */
-	if (i == FTRACE_FUNC_HASHSIZE)
-		return;
+/**
+ * ftrace_func_mapper_find_ip - Find some data mapped to an ip
+ * @mapper: The mapper that has the ip maps
+ * @ip: the instruction pointer to find the data for
+ *
+ * Returns the data mapped to @ip if found, otherwise NULL. The return
+ * is actually the address of the mapper data pointer. The address is
+ * returned for use cases where the data is no bigger than a long, and
+ * the user can use the data pointer as its data instead of having to
+ * allocate more memory for the reference.
+ */
+void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper,
+				  unsigned long ip)
+{
+	struct ftrace_func_entry *entry;
+	struct ftrace_func_map *map;
 
-	ret = ftrace_startup(&trace_probe_ops, 0);
+	entry = ftrace_lookup_ip(&mapper->hash, ip);
+	if (!entry)
+		return NULL;
 
-	ftrace_probe_registered = 1;
+	map = (struct ftrace_func_map *)entry;
+	return &map->data;
 }
 
-static bool __disable_ftrace_function_probe(void)
+/**
+ * ftrace_func_mapper_add_ip - Map some data to an ip
+ * @mapper: The mapper that has the ip maps
+ * @ip: The instruction pointer address to map @data to
+ * @data: The data to map to @ip
+ *
+ * Returns 0 on success, otherwise an error.
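+ *
+ * Typical use of the mapper as a whole (hypothetical caller; my_data and
+ * my_free() are assumed, the latter an ftrace_mapper_func callback):
+ *
+ *	struct ftrace_func_mapper *mapper = allocate_ftrace_func_mapper();
+ *	void **data;
+ *
+ *	ftrace_func_mapper_add_ip(mapper, ip, my_data);
+ *	data = ftrace_func_mapper_find_ip(mapper, ip);
+ *	...
+ *	free_ftrace_func_mapper(mapper, my_free);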
+ */
+int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper,
+			      unsigned long ip, void *data)
 {
-	int i;
+	struct ftrace_func_entry *entry;
+	struct ftrace_func_map *map;
 
-	if (!ftrace_probe_registered)
-		return false;
+	entry = ftrace_lookup_ip(&mapper->hash, ip);
+	if (entry)
+		return -EBUSY;
 
-	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
-		struct hlist_head *hhd = &ftrace_func_hash[i];
-		if (hhd->first)
-			return false;
-	}
+	map = kmalloc(sizeof(*map), GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
 
-	/* no more funcs left */
-	ftrace_shutdown(&trace_probe_ops, 0);
+	map->entry.ip = ip;
+	map->data = data;
 
-	ftrace_probe_registered = 0;
-	return true;
-}
+	__add_hash_entry(&mapper->hash, &map->entry);
+	return 0;
+}
 
-static void ftrace_free_entry(struct ftrace_func_probe *entry)
+/**
+ * ftrace_func_mapper_remove_ip - Remove an ip from the mapping
+ * @mapper: The mapper that has the ip maps
+ * @ip: The instruction pointer address to remove the data from
+ *
+ * Returns the data if it is found, otherwise NULL.
+ * Note: if the data pointer is used as the data itself (see
+ * ftrace_func_mapper_find_ip()), then the return value may be meaningless
+ * if the data pointer was set to zero.
+ */
+void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper,
+				   unsigned long ip)
 {
-	if (entry->ops->free)
-		entry->ops->free(entry->ops, entry->ip, &entry->data);
+	struct ftrace_func_entry *entry;
+	struct ftrace_func_map *map;
+	void *data;
+
+	entry = ftrace_lookup_ip(&mapper->hash, ip);
+	if (!entry)
+		return NULL;
+
+	map = (struct ftrace_func_map *)entry;
+	data = map->data;
+
+	remove_hash_entry(&mapper->hash, entry);
 	kfree(entry);
+
+	return data;
+}
+
+/**
+ * free_ftrace_func_mapper - free a mapping of ips and data
+ * @mapper: The mapper that has the ip maps
+ * @free_func: A function to be called on each data item.
+ *
+ * This is used to free the function mapper. The @free_func is optional
+ * and can be used if the data needs to be freed as well.
+ */
+void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper,
+			     ftrace_mapper_func free_func)
+{
+	struct ftrace_func_entry *entry;
+	struct ftrace_func_map *map;
+	struct hlist_head *hhd;
+	int size = 1 << mapper->hash.size_bits;
+	int i;
+
+	if (free_func && mapper->hash.count) {
+		for (i = 0; i < size; i++) {
+			hhd = &mapper->hash.buckets[i];
+			hlist_for_each_entry(entry, hhd, hlist) {
+				map = (struct ftrace_func_map *)entry;
+				free_func(map);
+			}
+		}
+	}
+	free_ftrace_hash(&mapper->hash);
+}
+
+static void release_probe(struct ftrace_func_probe *probe)
+{
+	struct ftrace_probe_ops *probe_ops;
+
+	mutex_lock(&ftrace_lock);
+
+	WARN_ON(probe->ref <= 0);
+
+	/* Subtract the ref that was used to protect this instance */
+	probe->ref--;
+
+	if (!probe->ref) {
+		probe_ops = probe->probe_ops;
+		/*
+		 * Sending zero as ip tells probe_ops to free
+		 * the probe->data itself
+		 */
+		if (probe_ops->free)
+			probe_ops->free(probe_ops, probe->tr, 0, probe->data);
+		list_del(&probe->list);
+		kfree(probe);
+	}
+	mutex_unlock(&ftrace_lock);
+}
+
+static void acquire_probe_locked(struct ftrace_func_probe *probe)
+{
+	/*
+	 * Add one ref to keep it from being freed when releasing the
+	 * ftrace_lock mutex.
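+	 *
+	 * The expected pairing is acquire_probe_locked() under ftrace_lock,
+	 * the real work with the lock dropped, then release_probe(), which
+	 * retakes ftrace_lock and frees the probe on the final reference.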
+ */ + probe->ref++; } int -register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data) +register_ftrace_function_probe(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *probe_ops, + void *data) { - struct ftrace_ops_hash old_hash_ops; - struct ftrace_func_probe *entry; - struct ftrace_glob func_g; - struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; - struct ftrace_hash *old_hash = *orig_hash; + struct ftrace_func_entry *entry; + struct ftrace_func_probe *probe; + struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; struct ftrace_hash *hash; - struct ftrace_page *pg; - struct dyn_ftrace *rec; - int not; - unsigned long key; int count = 0; + int size; int ret; + int i; - func_g.type = filter_parse_regex(glob, strlen(glob), - &func_g.search, ¬); - func_g.len = strlen(func_g.search); - - /* we do not support '!' for function probes */ - if (WARN_ON(not)) + if (WARN_ON(!tr)) return -EINVAL; - mutex_lock(&trace_probe_ops.func_hash->regex_lock); + /* We do not support '!' for function probes */ + if (WARN_ON(glob[0] == '!')) + return -EINVAL; - old_hash_ops.filter_hash = old_hash; - /* Probes only have filters */ - old_hash_ops.notrace_hash = NULL; - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); - if (!hash) { - count = -ENOMEM; - goto out; + mutex_lock(&ftrace_lock); + /* Check if the probe_ops is already registered */ + list_for_each_entry(probe, &tr->func_probes, list) { + if (probe->probe_ops == probe_ops) + break; } - - if (unlikely(ftrace_disabled)) { - count = -ENODEV; - goto out; + if (&probe->list == &tr->func_probes) { + probe = kzalloc(sizeof(*probe), GFP_KERNEL); + if (!probe) { + mutex_unlock(&ftrace_lock); + return -ENOMEM; + } + probe->probe_ops = probe_ops; + probe->ops.func = function_trace_probe_call; + probe->tr = tr; + ftrace_ops_init(&probe->ops); + list_add(&probe->list, &tr->func_probes); } - mutex_lock(&ftrace_lock); + acquire_probe_locked(probe); - do_for_each_ftrace_rec(pg, rec) { + mutex_unlock(&ftrace_lock); - if (rec->flags & FTRACE_FL_DISABLED) - continue; + mutex_lock(&probe->ops.func_hash->regex_lock); - if (!ftrace_match_record(rec, &func_g, NULL, 0)) - continue; + orig_hash = &probe->ops.func_hash->filter_hash; + old_hash = *orig_hash; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) { - /* If we did not process any, then return error */ - if (!count) - count = -ENOMEM; - goto out_unlock; - } + ret = ftrace_match_records(hash, glob, strlen(glob)); - count++; + /* Nothing found? */ + if (!ret) + ret = -EINVAL; - entry->data = data; + if (ret < 0) + goto out; - /* - * The caller might want to do something special - * for each function we find. We call the callback - * to give the caller an opportunity to do so. - */ - if (ops->init) { - if (ops->init(ops, rec->ip, &entry->data) < 0) { - /* caller does not like this func */ - kfree(entry); + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_lookup_ip(old_hash, entry->ip)) continue; + /* + * The caller might want to do something special + * for each function we find. We call the callback + * to give the caller an opportunity to do so. 
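+			 * If the init callback fails, we unwind below:
+			 * when any entries were already initialized, the
+			 * free callback is invoked once with ip == 0 so
+			 * that probe->data itself can be released.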
+ */ + if (probe_ops->init) { + ret = probe_ops->init(probe_ops, tr, + entry->ip, data, + &probe->data); + if (ret < 0) { + if (probe_ops->free && count) + probe_ops->free(probe_ops, tr, + 0, probe->data); + probe->data = NULL; + goto out; + } } + count++; } + } - ret = enter_record(hash, rec, 0); - if (ret < 0) { - kfree(entry); - count = ret; - goto out_unlock; - } - - entry->ops = ops; - entry->ip = rec->ip; + mutex_lock(&ftrace_lock); - key = hash_long(entry->ip, FTRACE_HASH_BITS); - hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); + if (!count) { + /* Nothing was added? */ + ret = -EINVAL; + goto out_unlock; + } - } while_for_each_ftrace_rec(); + ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash, + hash, 1); + if (ret < 0) + goto err_unlock; - ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + /* One ref for each new function traced */ + probe->ref += count; - __enable_ftrace_function_probe(&old_hash_ops); - - if (!ret) - free_ftrace_hash_rcu(old_hash); - else - count = ret; + if (!(probe->ops.flags & FTRACE_OPS_FL_ENABLED)) + ret = ftrace_startup(&probe->ops, 0); out_unlock: mutex_unlock(&ftrace_lock); + + if (!ret) + ret = count; out: - mutex_unlock(&trace_probe_ops.func_hash->regex_lock); + mutex_unlock(&probe->ops.func_hash->regex_lock); free_ftrace_hash(hash); - return count; -} + release_probe(probe); -enum { - PROBE_TEST_FUNC = 1, - PROBE_TEST_DATA = 2 -}; + return ret; -static void -__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data, int flags) + err_unlock: + if (!probe_ops->free || !count) + goto out_unlock; + + /* Failed to do the move, need to call the free functions */ + for (i = 0; i < size; i++) { + hlist_for_each_entry(entry, &hash->buckets[i], hlist) { + if (ftrace_lookup_ip(old_hash, entry->ip)) + continue; + probe_ops->free(probe_ops, tr, entry->ip, probe->data); + } + } + goto out_unlock; +} + +int +unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *probe_ops) { struct ftrace_ops_hash old_hash_ops; - struct ftrace_func_entry *rec_entry; - struct ftrace_func_probe *entry; - struct ftrace_func_probe *p; + struct ftrace_func_entry *entry; + struct ftrace_func_probe *probe; struct ftrace_glob func_g; - struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; - struct ftrace_hash *old_hash = *orig_hash; - struct list_head free_list; - struct ftrace_hash *hash; + struct ftrace_hash **orig_hash; + struct ftrace_hash *old_hash; + struct ftrace_hash *hash = NULL; struct hlist_node *tmp; + struct hlist_head hhd; char str[KSYM_SYMBOL_LEN]; - int i, ret; - bool disabled; + int count = 0; + int i, ret = -ENODEV; + int size; if (glob && (strcmp(glob, "*") == 0 || !strlen(glob))) func_g.search = NULL; @@ -3928,95 +4156,104 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, /* we do not support '!' 
for function probes */ if (WARN_ON(not)) - return; + return -EINVAL; + } + + mutex_lock(&ftrace_lock); + /* Check if the probe_ops is already registered */ + list_for_each_entry(probe, &tr->func_probes, list) { + if (probe->probe_ops == probe_ops) + break; } + if (&probe->list == &tr->func_probes) + goto err_unlock_ftrace; - mutex_lock(&trace_probe_ops.func_hash->regex_lock); + ret = -EINVAL; + if (!(probe->ops.flags & FTRACE_OPS_FL_INITIALIZED)) + goto err_unlock_ftrace; + + acquire_probe_locked(probe); + + mutex_unlock(&ftrace_lock); + + mutex_lock(&probe->ops.func_hash->regex_lock); + + orig_hash = &probe->ops.func_hash->filter_hash; + old_hash = *orig_hash; + + if (ftrace_hash_empty(old_hash)) + goto out_unlock; old_hash_ops.filter_hash = old_hash; /* Probes only have filters */ old_hash_ops.notrace_hash = NULL; - hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + ret = -ENOMEM; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); if (!hash) - /* Hmm, should report this somehow */ goto out_unlock; - INIT_LIST_HEAD(&free_list); - - for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { - struct hlist_head *hhd = &ftrace_func_hash[i]; - - hlist_for_each_entry_safe(entry, tmp, hhd, node) { - - /* break up if statements for readability */ - if ((flags & PROBE_TEST_FUNC) && entry->ops != ops) - continue; + INIT_HLIST_HEAD(&hhd); - if ((flags & PROBE_TEST_DATA) && entry->data != data) - continue; + size = 1 << hash->size_bits; + for (i = 0; i < size; i++) { + hlist_for_each_entry_safe(entry, tmp, &hash->buckets[i], hlist) { - /* do this last, since it is the most expensive */ if (func_g.search) { kallsyms_lookup(entry->ip, NULL, NULL, NULL, str); if (!ftrace_match(str, &func_g)) continue; } - - rec_entry = ftrace_lookup_ip(hash, entry->ip); - /* It is possible more than one entry had this ip */ - if (rec_entry) - free_hash_entry(hash, rec_entry); - - hlist_del_rcu(&entry->node); - list_add(&entry->free_list, &free_list); + count++; + remove_hash_entry(hash, entry); + hlist_add_head(&entry->hlist, &hhd); } } + + /* Nothing found? */ + if (!count) { + ret = -EINVAL; + goto out_unlock; + } + mutex_lock(&ftrace_lock); - disabled = __disable_ftrace_function_probe(); - /* - * Remove after the disable is called. Otherwise, if the last - * probe is removed, a null hash means *all enabled*. 
- */ - ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + + WARN_ON(probe->ref < count); + + probe->ref -= count; + + if (ftrace_hash_empty(hash)) + ftrace_shutdown(&probe->ops, 0); + + ret = ftrace_hash_move_and_update_ops(&probe->ops, orig_hash, + hash, 1); /* still need to update the function call sites */ - if (ftrace_enabled && !disabled) - ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS, + if (ftrace_enabled && !ftrace_hash_empty(hash)) + ftrace_run_modify_code(&probe->ops, FTRACE_UPDATE_CALLS, &old_hash_ops); synchronize_sched(); - if (!ret) - free_ftrace_hash_rcu(old_hash); - list_for_each_entry_safe(entry, p, &free_list, free_list) { - list_del(&entry->free_list); - ftrace_free_entry(entry); + hlist_for_each_entry_safe(entry, tmp, &hhd, hlist) { + hlist_del(&entry->hlist); + if (probe_ops->free) + probe_ops->free(probe_ops, tr, entry->ip, probe->data); + kfree(entry); } mutex_unlock(&ftrace_lock); out_unlock: - mutex_unlock(&trace_probe_ops.func_hash->regex_lock); + mutex_unlock(&probe->ops.func_hash->regex_lock); free_ftrace_hash(hash); -} -void -unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, - void *data) -{ - __unregister_ftrace_function_probe(glob, ops, data, - PROBE_TEST_FUNC | PROBE_TEST_DATA); -} + release_probe(probe); -void -unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops) -{ - __unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC); -} + return ret; -void unregister_ftrace_function_probe_all(char *glob) -{ - __unregister_ftrace_function_probe(glob, NULL, NULL, 0); + err_unlock_ftrace: + mutex_unlock(&ftrace_lock); + return ret; } static LIST_HEAD(ftrace_commands); @@ -4068,9 +4305,11 @@ __init int unregister_ftrace_command(struct ftrace_func_command *cmd) return ret; } -static int ftrace_process_regex(struct ftrace_hash *hash, +static int ftrace_process_regex(struct ftrace_iterator *iter, char *buff, int len, int enable) { + struct ftrace_hash *hash = iter->hash; + struct trace_array *tr = iter->ops->private; char *func, *command, *next = buff; struct ftrace_func_command *p; int ret = -EINVAL; @@ -4090,10 +4329,13 @@ static int ftrace_process_regex(struct ftrace_hash *hash, command = strsep(&next, ":"); + if (WARN_ON_ONCE(!tr)) + return -EINVAL; + mutex_lock(&ftrace_cmd_mutex); list_for_each_entry(p, &ftrace_commands, list) { if (strcmp(p->name, command) == 0) { - ret = p->func(hash, func, command, next, enable); + ret = p->func(tr, hash, func, command, next, enable); goto out_unlock; } } @@ -4130,7 +4372,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf, if (read >= 0 && trace_parser_loaded(parser) && !trace_parser_cont(parser)) { - ret = ftrace_process_regex(iter->hash, parser->buffer, + ret = ftrace_process_regex(iter, parser->buffer, parser->idx, enable); trace_parser_clear(parser); if (ret < 0) @@ -4175,44 +4417,11 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) return add_hash_entry(hash, ip); } -static void ftrace_ops_update_code(struct ftrace_ops *ops, - struct ftrace_ops_hash *old_hash) -{ - struct ftrace_ops *op; - - if (!ftrace_enabled) - return; - - if (ops->flags & FTRACE_OPS_FL_ENABLED) { - ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); - return; - } - - /* - * If this is the shared global_ops filter, then we need to - * check if there is another ops that shares it, is enabled. - * If so, we still need to run the modify code. 
- */ - if (ops->func_hash != &global_ops.local_hash) - return; - - do_for_each_ftrace_op(op, ftrace_ops_list) { - if (op->func_hash == &global_ops.local_hash && - op->flags & FTRACE_OPS_FL_ENABLED) { - ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); - /* Only need to do this once */ - return; - } - } while_for_each_ftrace_op(op); -} - static int ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; - struct ftrace_ops_hash old_hash_ops; - struct ftrace_hash *old_hash; struct ftrace_hash *hash; int ret; @@ -4247,14 +4456,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, } mutex_lock(&ftrace_lock); - old_hash = *orig_hash; - old_hash_ops.filter_hash = ops->func_hash->filter_hash; - old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; - ret = ftrace_hash_move(ops, enable, orig_hash, hash); - if (!ret) { - ftrace_ops_update_code(ops, &old_hash_ops); - free_ftrace_hash_rcu(old_hash); - } + ret = ftrace_hash_move_and_update_ops(ops, orig_hash, hash, enable); mutex_unlock(&ftrace_lock); out_regex_unlock: @@ -4493,10 +4695,8 @@ static void __init set_ftrace_early_filters(void) int ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; - struct ftrace_ops_hash old_hash_ops; struct ftrace_iterator *iter; struct ftrace_hash **orig_hash; - struct ftrace_hash *old_hash; struct trace_parser *parser; int filter_hash; int ret; @@ -4526,16 +4726,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file) orig_hash = &iter->ops->func_hash->notrace_hash; mutex_lock(&ftrace_lock); - old_hash = *orig_hash; - old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash; - old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash; - ret = ftrace_hash_move(iter->ops, filter_hash, - orig_hash, iter->hash); - if (!ret) { - ftrace_ops_update_code(iter->ops, &old_hash_ops); - free_ftrace_hash_rcu(old_hash); - } + ret = ftrace_hash_move_and_update_ops(iter->ops, orig_hash, + iter->hash, filter_hash); mutex_unlock(&ftrace_lock); + } else { + /* For read only, the hash is the ops hash */ + iter->hash = NULL; } mutex_unlock(&iter->ops->func_hash->regex_lock); @@ -5274,6 +5470,50 @@ void ftrace_module_init(struct module *mod) } #endif /* CONFIG_MODULES */ +void __init ftrace_free_init_mem(void) +{ + unsigned long start = (unsigned long)(&__init_begin); + unsigned long end = (unsigned long)(&__init_end); + struct ftrace_page **last_pg = &ftrace_pages_start; + struct ftrace_page *pg; + struct dyn_ftrace *rec; + struct dyn_ftrace key; + int order; + + key.ip = start; + key.flags = end; /* overload flags, as it is unsigned long */ + + mutex_lock(&ftrace_lock); + + for (pg = ftrace_pages_start; pg; last_pg = &pg->next, pg = *last_pg) { + if (end < pg->records[0].ip || + start >= (pg->records[pg->index - 1].ip + MCOUNT_INSN_SIZE)) + continue; + again: + rec = bsearch(&key, pg->records, pg->index, + sizeof(struct dyn_ftrace), + ftrace_cmp_recs); + if (!rec) + continue; + pg->index--; + if (!pg->index) { + *last_pg = pg->next; + order = get_count_order(pg->size / ENTRIES_PER_PAGE); + free_pages((unsigned long)pg->records, order); + kfree(pg); + pg = container_of(last_pg, struct ftrace_page, next); + if (!(*last_pg)) + ftrace_pages = pg; + continue; + } + memmove(rec, rec + 1, + (pg->index - (rec - pg->records)) * sizeof(*rec)); + /* More than one function may be in this block */ + goto again; + } + 
mutex_unlock(&ftrace_lock); +} + void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; @@ -5316,25 +5556,13 @@ void __weak arch_ftrace_update_trampoline(struct ftrace_ops *ops) static void ftrace_update_trampoline(struct ftrace_ops *ops) { - -/* - * Currently there's no safe way to free a trampoline when the kernel - * is configured with PREEMPT. That is because a task could be preempted - * when it jumped to the trampoline, it may be preempted for a long time - * depending on the system load, and currently there's no way to know - * when it will be off the trampoline. If the trampoline is freed - * too early, when the task runs again, it will be executing on freed - * memory and crash. - */ -#ifdef CONFIG_PREEMPT - /* Currently, only non dynamic ops can have a trampoline */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC) - return; -#endif - arch_ftrace_update_trampoline(ops); } +void ftrace_init_trace_array(struct trace_array *tr) +{ + INIT_LIST_HEAD(&tr->func_probes); +} #else static struct ftrace_ops global_ops = { @@ -5389,6 +5617,7 @@ __init void ftrace_init_global_array_ops(struct trace_array *tr) { tr->ops = &global_ops; tr->ops->private = tr; + ftrace_init_trace_array(tr); } void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func) @@ -5543,6 +5772,43 @@ ftrace_filter_pid_sched_switch_probe(void *data, bool preempt, trace_ignore_this_task(pid_list, next)); } +static void +ftrace_pid_follow_sched_process_fork(void *data, + struct task_struct *self, + struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->function_pids); + trace_filter_add_remove_task(pid_list, self, task); +} + +static void +ftrace_pid_follow_sched_process_exit(void *data, struct task_struct *task) +{ + struct trace_pid_list *pid_list; + struct trace_array *tr = data; + + pid_list = rcu_dereference_sched(tr->function_pids); + trace_filter_add_remove_task(pid_list, NULL, task); +} + +void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) +{ + if (enable) { + register_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, + tr); + register_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + tr); + } else { + unregister_trace_sched_process_fork(ftrace_pid_follow_sched_process_fork, + tr); + unregister_trace_sched_process_exit(ftrace_pid_follow_sched_process_exit, + tr); + } +} + static void clear_ftrace_pids(struct trace_array *tr) { struct trace_pid_list *pid_list; @@ -5566,6 +5832,15 @@ static void clear_ftrace_pids(struct trace_array *tr) trace_free_pid_list(pid_list); } +void ftrace_clear_pids(struct trace_array *tr) +{ + mutex_lock(&ftrace_lock); + + clear_ftrace_pids(tr); + + mutex_unlock(&ftrace_lock); +} + static void ftrace_pid_reset(struct trace_array *tr) { mutex_lock(&ftrace_lock); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 54e7a90db848df3d1bca17e1ca966e57d2ce57d2..4ae268e687fe1683d9253235d539d840d8bf4f14 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -438,6 +438,7 @@ struct ring_buffer_per_cpu { raw_spinlock_t reader_lock; /* serialize readers */ arch_spinlock_t lock; struct lock_class_key lock_key; + struct buffer_data_page *free_page; unsigned long nr_pages; unsigned int current_context; struct list_head *pages; @@ -3405,11 +3406,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_reset); int ring_buffer_iter_empty(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer; + struct buffer_page *reader; 
+ struct buffer_page *head_page; + struct buffer_page *commit_page; + unsigned commit; cpu_buffer = iter->cpu_buffer; - return iter->head_page == cpu_buffer->commit_page && - iter->head == rb_commit_index(cpu_buffer); + /* Remember, trace recording is off when iterator is in use */ + reader = cpu_buffer->reader_page; + head_page = cpu_buffer->head_page; + commit_page = cpu_buffer->commit_page; + commit = rb_page_commit(commit_page); + + return ((iter->head_page == commit_page && iter->head == commit) || + (iter->head_page == reader && commit_page == head_page && + head_page->read == commit && + iter->head == rb_page_commit(cpu_buffer->reader_page))); } EXPORT_SYMBOL_GPL(ring_buffer_iter_empty); @@ -4377,9 +4390,25 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); */ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) { - struct buffer_data_page *bpage; + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct buffer_data_page *bpage = NULL; + unsigned long flags; struct page *page; + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (cpu_buffer->free_page) { + bpage = cpu_buffer->free_page; + cpu_buffer->free_page = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + if (bpage) + goto out; + page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY, 0); if (!page) @@ -4387,6 +4416,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer, int cpu) bpage = page_address(page); + out: rb_init_page(bpage); return bpage; @@ -4396,13 +4426,29 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page * @buffer: the buffer the page was allocate for + * @cpu: the cpu buffer the page came from * @data: the page to free * * Free a page allocated from ring_buffer_alloc_read_page. 
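+ *
+ * A sketch of the expected alloc/read/free cycle, mirroring what the
+ * ring buffer benchmark does; the surrounding reader code and its error
+ * handling are illustrative only:
+ *
+ *	void *page = ring_buffer_alloc_read_page(buffer, cpu);
+ *
+ *	if (!page)
+ *		return -ENOMEM;
+ *	if (ring_buffer_read_page(buffer, &page, PAGE_SIZE, cpu, 1) >= 0)
+ *		process_page(page);	/* hypothetical consumer */
+ *	ring_buffer_free_read_page(buffer, cpu, page);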
*/ -void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) +void ring_buffer_free_read_page(struct ring_buffer *buffer, int cpu, void *data) { - free_page((unsigned long)data); + struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu]; + struct buffer_data_page *bpage = data; + unsigned long flags; + + local_irq_save(flags); + arch_spin_lock(&cpu_buffer->lock); + + if (!cpu_buffer->free_page) { + cpu_buffer->free_page = bpage; + bpage = NULL; + } + + arch_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + free_page((unsigned long)bpage); } EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index c190a4d5013c5ecd637c78cac7c1273dcf0b7fe1..9fbcaf56788626335bfd5412dffaf820a9b308bc 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -171,7 +171,7 @@ static enum event_status read_page(int cpu) } } } - ring_buffer_free_read_page(buffer, bpage); + ring_buffer_free_read_page(buffer, cpu, bpage); if (ret < 0) return EVENT_DROPPED; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f35109514a015c38de8b2e1da99399fd5f399692..c4536c4490217a2e6b59e423c70c0f1836a34a9d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -257,7 +257,7 @@ unsigned long long ns2usecs(u64 nsec) /* trace_flags that are default zero for instances */ #define ZEROED_TRACE_FLAGS \ - TRACE_ITER_EVENT_FORK + (TRACE_ITER_EVENT_FORK | TRACE_ITER_FUNC_FORK) /* * The global_trace is the descriptor that holds the top-level tracing @@ -757,7 +757,7 @@ __trace_buffer_lock_reserve(struct ring_buffer *buffer, return event; } -static void tracer_tracing_on(struct trace_array *tr) +void tracer_tracing_on(struct trace_array *tr) { if (tr->trace_buffer.buffer) ring_buffer_record_on(tr->trace_buffer.buffer); @@ -894,23 +894,8 @@ int __trace_bputs(unsigned long ip, const char *str) EXPORT_SYMBOL_GPL(__trace_bputs); #ifdef CONFIG_TRACER_SNAPSHOT -/** - * trace_snapshot - take a snapshot of the current buffer. - * - * This causes a swap between the snapshot buffer and the current live - * tracing buffer. You can use this to take snapshots of the live - * trace when some condition is triggered, but continue to trace. - * - * Note, make sure to allocate the snapshot with either - * a tracing_snapshot_alloc(), or by doing it manually - * with: echo 1 > /sys/kernel/debug/tracing/snapshot - * - * If the snapshot buffer is not allocated, it will stop tracing. - * Basically making a permanent snapshot. - */ -void tracing_snapshot(void) +static void tracing_snapshot_instance(struct trace_array *tr) { - struct trace_array *tr = &global_trace; struct tracer *tracer = tr->current_trace; unsigned long flags; @@ -938,6 +923,27 @@ void tracing_snapshot(void) update_max_tr(tr, current, smp_processor_id()); local_irq_restore(flags); } + +/** + * trace_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. 
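+ *
+ * A short sketch of the intended use; the trigger condition here is an
+ * illustrative assumption:
+ *
+ *	tracing_snapshot_alloc();
+ *	...
+ *	if (hit_condition_to_debug())	/* hypothetical */
+ *		tracing_snapshot();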
+ */ +void tracing_snapshot(void) +{ + struct trace_array *tr = &global_trace; + + tracing_snapshot_instance(tr); +} EXPORT_SYMBOL_GPL(tracing_snapshot); static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, @@ -1039,7 +1045,7 @@ void tracing_snapshot_alloc(void) EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); #endif /* CONFIG_TRACER_SNAPSHOT */ -static void tracer_tracing_off(struct trace_array *tr) +void tracer_tracing_off(struct trace_array *tr) { if (tr->trace_buffer.buffer) ring_buffer_record_off(tr->trace_buffer.buffer); @@ -1424,6 +1430,28 @@ static int wait_on_pipe(struct trace_iterator *iter, bool full) } #ifdef CONFIG_FTRACE_STARTUP_TEST +static bool selftests_can_run; + +struct trace_selftests { + struct list_head list; + struct tracer *type; +}; + +static LIST_HEAD(postponed_selftests); + +static int save_selftest(struct tracer *type) +{ + struct trace_selftests *selftest; + + selftest = kmalloc(sizeof(*selftest), GFP_KERNEL); + if (!selftest) + return -ENOMEM; + + selftest->type = type; + list_add(&selftest->list, &postponed_selftests); + return 0; +} + static int run_tracer_selftest(struct tracer *type) { struct trace_array *tr = &global_trace; @@ -1433,6 +1461,14 @@ static int run_tracer_selftest(struct tracer *type) if (!type->selftest || tracing_selftest_disabled) return 0; + /* + * If a tracer registers early in boot up (before scheduling is + * initialized and such), then do not run its selftests yet. + * Instead, run it a little later in the boot process. + */ + if (!selftests_can_run) + return save_selftest(type); + /* * Run a selftest on this tracer. * Here we reset the trace buffer, and set the current @@ -1482,6 +1518,47 @@ static int run_tracer_selftest(struct tracer *type) printk(KERN_CONT "PASSED\n"); return 0; } + +static __init int init_trace_selftests(void) +{ + struct trace_selftests *p, *n; + struct tracer *t, **last; + int ret; + + selftests_can_run = true; + + mutex_lock(&trace_types_lock); + + if (list_empty(&postponed_selftests)) + goto out; + + pr_info("Running postponed tracer tests:\n"); + + list_for_each_entry_safe(p, n, &postponed_selftests, list) { + ret = run_tracer_selftest(p->type); + /* If the test fails, then warn and remove from available_tracers */ + if (ret < 0) { + WARN(1, "tracer: %s failed selftest, disabling\n", + p->type->name); + last = &trace_types; + for (t = trace_types; t; t = t->next) { + if (t == p->type) { + *last = t->next; + break; + } + last = &t->next; + } + } + list_del(&p->list); + kfree(p); + } + + out: + mutex_unlock(&trace_types_lock); + + return 0; +} +early_initcall(init_trace_selftests); #else static inline int run_tracer_selftest(struct tracer *type) { @@ -1899,7 +1976,7 @@ static void __trace_find_cmdline(int pid, char comm[]) map = savedcmd->map_pid_to_cmdline[pid]; if (map != NO_CMDLINE_MAP) - strcpy(comm, get_saved_cmdlines(map)); + strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN); else strcpy(comm, "<...>"); } @@ -1927,6 +2004,18 @@ void tracing_record_cmdline(struct task_struct *tsk) __this_cpu_write(trace_cmdline_save, false); } +/* + * Several functions return TRACE_TYPE_PARTIAL_LINE if the trace_seq + * overflowed, and TRACE_TYPE_HANDLED otherwise. This helper function + * simplifies those functions and keeps them in sync. + */ +enum print_line_t trace_handle_return(struct trace_seq *s) +{ + return trace_seq_has_overflowed(s) ? 
+		TRACE_TYPE_PARTIAL_LINE : TRACE_TYPE_HANDLED;
+}
+EXPORT_SYMBOL_GPL(trace_handle_return);
+
 void
 tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 			     int pc)
@@ -3222,13 +3311,14 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
 	if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
 		return;
 
-	if (iter->started && cpumask_test_cpu(iter->cpu, iter->started))
+	if (cpumask_available(iter->started) &&
+	    cpumask_test_cpu(iter->cpu, iter->started))
 		return;
 
 	if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
 		return;
 
-	if (iter->started)
+	if (cpumask_available(iter->started))
 		cpumask_set_cpu(iter->cpu, iter->started);
 
 	/* Don't print started cpu buffer for the first entry of the trace */
@@ -4122,6 +4212,9 @@ int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
 	if (mask == TRACE_ITER_EVENT_FORK)
 		trace_event_follow_fork(tr, enabled);
 
+	if (mask == TRACE_ITER_FUNC_FORK)
+		ftrace_pid_follow_fork(tr, enabled);
+
 	if (mask == TRACE_ITER_OVERWRITE) {
 		ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
 #ifdef CONFIG_TRACER_MAX_TRACE
@@ -4355,6 +4448,7 @@ static const char readme_msg[] =
 	"\t           -:[<group>/]<event>\n"
 #ifdef CONFIG_KPROBE_EVENTS
 	"\t    place: [<module>:]<symbol>[+<offset>]|<memaddr>\n"
+  "place (kretprobe): [<module>:]<symbol>[+<offset>]|<memaddr>\n"
 #endif
 #ifdef CONFIG_UPROBE_EVENTS
 	"\t    place: <path>:<offset>\n"
 #endif
@@ -5529,7 +5623,6 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		.partial	= partial_def,
 		.nr_pages	= 0, /* This gets updated below. */
 		.nr_pages_max	= PIPE_DEF_BUFFERS,
-		.flags		= flags,
 		.ops		= &tracing_pipe_buf_ops,
 		.spd_release	= tracing_spd_release_pipe,
 	};
@@ -5962,6 +6055,7 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
 struct ftrace_buffer_info {
 	struct trace_iterator	iter;
 	void			*spare;
+	unsigned int		spare_cpu;
 	unsigned int		read;
 };
 
@@ -6291,9 +6385,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 		return -EBUSY;
 #endif
 
-	if (!info->spare)
+	if (!info->spare) {
 		info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
 							  iter->cpu_file);
+		info->spare_cpu = iter->cpu_file;
+	}
 	if (!info->spare)
 		return -ENOMEM;
 
@@ -6353,7 +6449,8 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
 	__trace_array_put(iter->tr);
 
 	if (info->spare)
-		ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
+		ring_buffer_free_read_page(iter->trace_buffer->buffer,
+					   info->spare_cpu, info->spare);
 	kfree(info);
 
 	mutex_unlock(&trace_types_lock);
@@ -6364,6 +6461,7 @@ static int tracing_buffers_release(struct inode *inode, struct file *file)
 struct buffer_ref {
 	struct ring_buffer	*buffer;
 	void			*page;
+	int			cpu;
 	int			ref;
 };
 
@@ -6375,7 +6473,7 @@ static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
 	if (--ref->ref)
 		return;
 
-	ring_buffer_free_read_page(ref->buffer, ref->page);
+	ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
 	kfree(ref);
 	buf->private = 0;
 }
@@ -6409,7 +6507,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
 	if (--ref->ref)
 		return;
 
-	ring_buffer_free_read_page(ref->buffer, ref->page);
+	ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
 	kfree(ref);
 	spd->partial[i].private = 0;
 }
@@ -6427,7 +6525,6 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 		.pages		= pages_def,
 		.partial	= partial_def,
 		.nr_pages_max	= PIPE_DEF_BUFFERS,
-		.flags		= flags,
 		.ops		= &buffer_pipe_buf_ops,
 		.spd_release	= buffer_spd_release,
 	};
@@ -6474,11 +6571,13 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 			kfree(ref);
 			break;
 		}
+		ref->cpu =
iter->cpu_file; r = ring_buffer_read_page(ref->buffer, &ref->page, len, iter->cpu_file, 1); if (r < 0) { - ring_buffer_free_read_page(ref->buffer, ref->page); + ring_buffer_free_read_page(ref->buffer, ref->cpu, + ref->page); kfree(ref); break; } @@ -6649,43 +6748,89 @@ static const struct file_operations tracing_dyn_info_fops = { #if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) static void -ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - tracing_snapshot(); + tracing_snapshot_instance(tr); } static void -ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - unsigned long *count = (long *)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; - if (!*count) - return; + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) { + + if (*count <= 0) + return; - if (*count != -1) (*count)--; + } - tracing_snapshot(); + tracing_snapshot_instance(tr); } static int ftrace_snapshot_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - long count = (long)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; seq_printf(m, "%ps:", (void *)ip); seq_puts(m, "snapshot"); - if (count == -1) - seq_puts(m, ":unlimited\n"); + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) + seq_printf(m, ":count=%ld\n", *count); else - seq_printf(m, ":count=%ld\n", count); + seq_puts(m, ":unlimited\n"); return 0; } +static int +ftrace_snapshot_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) +{ + struct ftrace_func_mapper *mapper = *data; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); +} + +static void +ftrace_snapshot_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) +{ + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, NULL); + return; + } + + ftrace_func_mapper_remove_ip(mapper, ip); +} + static struct ftrace_probe_ops snapshot_probe_ops = { .func = ftrace_snapshot, .print = ftrace_snapshot_print, @@ -6694,10 +6839,12 @@ static struct ftrace_probe_ops snapshot_probe_ops = { static struct ftrace_probe_ops snapshot_count_probe_ops = { .func = ftrace_count_snapshot, .print = ftrace_snapshot_print, + .init = ftrace_snapshot_init, + .free = ftrace_snapshot_free, }; static int -ftrace_trace_snapshot_callback(struct ftrace_hash *hash, +ftrace_trace_snapshot_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -6711,10 +6858,8 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, ops = param ? 
&snapshot_count_probe_ops : &snapshot_probe_ops; - if (glob[0] == '!') { - unregister_ftrace_function_probe_func(glob+1, ops); - return 0; - } + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, tr, ops); if (!param) goto out_reg; @@ -6733,11 +6878,13 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash, return ret; out_reg: - ret = register_ftrace_function_probe(glob, ops, count); + ret = alloc_snapshot(tr); + if (ret < 0) + goto out; - if (ret >= 0) - alloc_snapshot(&global_trace); + ret = register_ftrace_function_probe(glob, tr, ops, count); + out: return ret < 0 ? ret : 0; } @@ -7346,6 +7493,8 @@ static int instance_mkdir(const char *name) goto out_free_tr; } + ftrace_init_trace_array(tr); + init_tracer_tracefs(tr, tr->dir); init_trace_flags_index(tr); __update_tracer_options(tr); @@ -7402,6 +7551,7 @@ static int instance_rmdir(const char *name) tracing_set_nop(tr); event_trace_del_tracer(tr); + ftrace_clear_pids(tr); ftrace_destroy_function_files(tr); tracefs_remove_recursive(tr->dir); free_trace_buffers(tr); @@ -7965,6 +8115,9 @@ __init static int tracer_alloc_buffers(void) register_tracer(&nop_trace); + /* Function tracing may start here (via kernel command line) */ + init_function_trace(); + /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -7999,7 +8152,7 @@ __init static int tracer_alloc_buffers(void) return ret; } -void __init trace_init(void) +void __init early_trace_init(void) { if (tracepoint_printk) { tracepoint_print_iter = @@ -8010,6 +8163,10 @@ void __init trace_init(void) static_key_enable(&tracepoint_printk_key.key); } tracer_alloc_buffers(); +} + +void __init trace_init(void) +{ trace_event_init(); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index ae1cce91fead25a065899109e426a6cc1e597d28..291a1bca5748870011c70e639fe802785945a0a6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -262,6 +262,9 @@ struct trace_array { #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; +#ifdef CONFIG_DYNAMIC_FTRACE + struct list_head func_probes; +#endif /* function tracing enabled */ int function_enabled; #endif @@ -579,6 +582,8 @@ void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); int tracer_tracing_is_on(struct trace_array *tr); +void tracer_tracing_on(struct trace_array *tr); +void tracer_tracing_off(struct trace_array *tr); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, @@ -696,6 +701,9 @@ extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; +void ftrace_init_trace_array(struct trace_array *tr); +#else +static inline void ftrace_init_trace_array(struct trace_array *tr) { } #endif #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); @@ -880,6 +888,14 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER +struct ftrace_func_command { + struct list_head list; + char *name; + int (*func)(struct trace_array *tr, + struct ftrace_hash *hash, + char *func, char *cmd, + char *params, int enable); +}; extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct trace_array *tr) { @@ -896,6 +912,9 @@ int using_ftrace_ops_list_func(void); void ftrace_init_tracefs(struct trace_array *tr, struct dentry 
*d_tracer); void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d_tracer); +void ftrace_clear_pids(struct trace_array *tr); +int init_function_trace(void); +void ftrace_pid_follow_fork(struct trace_array *tr, bool enable); #else static inline int ftrace_trace_task(struct trace_array *tr) { @@ -914,15 +933,71 @@ ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void ftrace_reset_array_ops(struct trace_array *tr) { } static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } +static inline void ftrace_clear_pids(struct trace_array *tr) { } +static inline int init_function_trace(void) { return 0; } +static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) + +struct ftrace_probe_ops { + void (*func)(unsigned long ip, + unsigned long parent_ip, + struct trace_array *tr, + struct ftrace_probe_ops *ops, + void *data); + int (*init)(struct ftrace_probe_ops *ops, + struct trace_array *tr, + unsigned long ip, void *init_data, + void **data); + void (*free)(struct ftrace_probe_ops *ops, + struct trace_array *tr, + unsigned long ip, void *data); + int (*print)(struct seq_file *m, + unsigned long ip, + struct ftrace_probe_ops *ops, + void *data); +}; + +struct ftrace_func_mapper; +typedef int (*ftrace_mapper_func)(void *data); + +struct ftrace_func_mapper *allocate_ftrace_func_mapper(void); +void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, + unsigned long ip); +int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, + unsigned long ip, void *data); +void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper, + unsigned long ip); +void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, + ftrace_mapper_func free_func); + +extern int +register_ftrace_function_probe(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *ops, void *data); +extern int +unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, + struct ftrace_probe_ops *ops); + +int register_ftrace_command(struct ftrace_func_command *cmd); +int unregister_ftrace_command(struct ftrace_func_command *cmd); + void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent); void ftrace_destroy_filter_files(struct ftrace_ops *ops); #else +struct ftrace_func_command; + +static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) +{ + return -EINVAL; +} +static inline __init int unregister_ftrace_command(char *cmd_name) +{ + return -EINVAL; +} /* * The ops parameter passed in is usually undefined. * This must be a macro. 
@@ -987,11 +1062,13 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 
 #ifdef CONFIG_FUNCTION_TRACER
 # define FUNCTION_FLAGS						\
-		C(FUNCTION,		"function-trace"),
+		C(FUNCTION,		"function-trace"),	\
+		C(FUNC_FORK,		"function-fork"),
 # define FUNCTION_DEFAULT_FLAGS		TRACE_ITER_FUNCTION
 #else
 # define FUNCTION_FLAGS
 # define FUNCTION_DEFAULT_FLAGS		0UL
+# define TRACE_ITER_FUNC_FORK		0UL
 #endif
 
 #ifdef CONFIG_STACKTRACE
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
index e49fbe901cfc64c60734d52a0a7c8db320606b00..16a8cf02eee96ae4fcac5d0641673b6c38f003fd 100644
--- a/kernel/trace/trace_benchmark.c
+++ b/kernel/trace/trace_benchmark.c
@@ -153,10 +153,18 @@ static int benchmark_event_kthread(void *arg)
 		trace_do_benchmark();
 
 		/*
-		 * We don't go to sleep, but let others
-		 * run as well.
+		 * We don't go to sleep, but let others run as well.
+		 * This is basically a "yield()" to let any task that
+		 * wants to run, schedule in, but if the CPU is idle,
+		 * we'll keep burning cycles.
+		 *
+		 * Note the _rcu_qs() version of cond_resched() will
+		 * notify synchronize_rcu_tasks() that this thread has
+		 * passed a quiescent state for rcu_tasks. Otherwise
+		 * this thread will never voluntarily schedule which would
+		 * block synchronize_rcu_tasks() indefinitely.
 		 */
-		cond_resched();
+		cond_resched_rcu_qs();
 	}
 
 	return 0;
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index c203ac4df791f28e074d387ce573cec74bc40cf6..adcdbbeae0108bad436d0af31297dc555ad7f545 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -348,14 +348,14 @@ FTRACE_ENTRY(hwlat, hwlat_entry,
 		__field(	u64,			duration	)
 		__field(	u64,			outer_duration	)
 		__field(	u64,			nmi_total_ts	)
-		__field_struct( struct timespec,	timestamp	)
-		__field_desc(	long,	timestamp,	tv_sec		)
+		__field_struct( struct timespec64,	timestamp	)
+		__field_desc(	s64,	timestamp,	tv_sec		)
 		__field_desc(	long,	timestamp,	tv_nsec		)
 		__field(	unsigned int,		nmi_count	)
 		__field(	unsigned int,		seqnum		)
 	),
 
-	F_printk("cnt:%u\tts:%010lu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n",
+	F_printk("cnt:%u\tts:%010llu.%010lu\tinner:%llu\touter:%llunmi-ts:%llu\tnmi-count:%u\n",
 		 __entry->seqnum,
 		 __entry->tv_sec,
 		 __entry->tv_nsec,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 93116549a284366a7670bdd82316b049ec72412e..e7973e10398c83cc9210b9ab6a70c23a07950297 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2460,15 +2460,8 @@ struct event_probe_data {
 	bool			enable;
 };
 
-static void
-event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
+static void update_event_probe(struct event_probe_data *data)
 {
-	struct event_probe_data **pdata = (struct event_probe_data **)_data;
-	struct event_probe_data *data = *pdata;
-
-	if (!data)
-		return;
-
 	if (data->enable)
 		clear_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &data->file->flags);
 	else
@@ -2476,77 +2469,141 @@ event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
 }
 
 static void
-event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data)
+event_enable_probe(unsigned long ip, unsigned long parent_ip,
+		   struct trace_array *tr, struct ftrace_probe_ops *ops,
+		   void *data)
 {
-	struct event_probe_data **pdata = (struct event_probe_data **)_data;
-	struct event_probe_data *data = *pdata;
+	struct ftrace_func_mapper *mapper = data;
+	struct event_probe_data *edata;
+	void **pdata;
 
-	if (!data)
+	pdata = ftrace_func_mapper_find_ip(mapper,
ip); + if (!pdata || !*pdata) + return; + + edata = *pdata; + update_event_probe(edata); +} + +static void +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) +{ + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; + void **pdata; + + pdata = ftrace_func_mapper_find_ip(mapper, ip); + if (!pdata || !*pdata) return; - if (!data->count) + edata = *pdata; + + if (!edata->count) return; /* Skip if the event is in a state we want to switch to */ - if (data->enable == !(data->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) + if (edata->enable == !(edata->file->flags & EVENT_FILE_FL_SOFT_DISABLED)) return; - if (data->count != -1) - (data->count)--; + if (edata->count != -1) + (edata->count)--; - event_enable_probe(ip, parent_ip, _data); + update_event_probe(edata); } static int event_enable_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *_data) + struct ftrace_probe_ops *ops, void *data) { - struct event_probe_data *data = _data; + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; + void **pdata; + + pdata = ftrace_func_mapper_find_ip(mapper, ip); + + if (WARN_ON_ONCE(!pdata || !*pdata)) + return 0; + + edata = *pdata; seq_printf(m, "%ps:", (void *)ip); seq_printf(m, "%s:%s:%s", - data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, - data->file->event_call->class->system, - trace_event_name(data->file->event_call)); + edata->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + edata->file->event_call->class->system, + trace_event_name(edata->file->event_call)); - if (data->count == -1) + if (edata->count == -1) seq_puts(m, ":unlimited\n"); else - seq_printf(m, ":count=%ld\n", data->count); + seq_printf(m, ":count=%ld\n", edata->count); return 0; } static int -event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, - void **_data) +event_enable_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) { - struct event_probe_data **pdata = (struct event_probe_data **)_data; - struct event_probe_data *data = *pdata; + struct ftrace_func_mapper *mapper = *data; + struct event_probe_data *edata = init_data; + int ret; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENODEV; + *data = mapper; + } + + ret = ftrace_func_mapper_add_ip(mapper, ip, edata); + if (ret < 0) + return ret; + + edata->ref++; - data->ref++; + return 0; +} + +static int free_probe_data(void *data) +{ + struct event_probe_data *edata = data; + + edata->ref--; + if (!edata->ref) { + /* Remove the SOFT_MODE flag */ + __ftrace_event_enable_disable(edata->file, 0, 1); + module_put(edata->file->event_call->mod); + kfree(edata); + } return 0; } static void -event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, - void **_data) +event_enable_free(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) { - struct event_probe_data **pdata = (struct event_probe_data **)_data; - struct event_probe_data *data = *pdata; + struct ftrace_func_mapper *mapper = data; + struct event_probe_data *edata; - if (WARN_ON_ONCE(data->ref <= 0)) + if (!ip) { + if (!mapper) + return; + free_ftrace_func_mapper(mapper, free_probe_data); return; - - data->ref--; - if (!data->ref) { - /* Remove the SOFT_MODE flag */ - __ftrace_event_enable_disable(data->file, 0, 1); - module_put(data->file->event_call->mod); - kfree(data); } - *pdata = NULL; + + edata = 
ftrace_func_mapper_remove_ip(mapper, ip); + + if (WARN_ON_ONCE(!edata)) + return; + + if (WARN_ON_ONCE(edata->ref <= 0)) + return; + + free_probe_data(edata); } static struct ftrace_probe_ops event_enable_probe_ops = { @@ -2578,10 +2635,9 @@ static struct ftrace_probe_ops event_disable_count_probe_ops = { }; static int -event_enable_func(struct ftrace_hash *hash, +event_enable_func(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enabled) { - struct trace_array *tr = top_trace_array(); struct trace_event_file *file; struct ftrace_probe_ops *ops; struct event_probe_data *data; @@ -2619,12 +2675,12 @@ event_enable_func(struct ftrace_hash *hash, ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; if (glob[0] == '!') { - unregister_ftrace_function_probe_func(glob+1, ops); - ret = 0; + ret = unregister_ftrace_function_probe_func(glob+1, tr, ops); goto out; } ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) goto out; @@ -2661,7 +2717,8 @@ event_enable_func(struct ftrace_hash *hash, ret = __ftrace_event_enable_disable(file, 1, 1); if (ret < 0) goto out_put; - ret = register_ftrace_function_probe(glob, ops, data); + + ret = register_ftrace_function_probe(glob, tr, ops, data); /* * The above returns on success the # of functions enabled, * but if it didn't find any functions it returns zero. diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 0efa00d80623db3de438a5233a8ee6f84fae65ed..a3bddbfd0874a26bcdbc84b3b4713cb206a0998b 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -267,10 +267,14 @@ static struct tracer function_trace __tracer_data = }; #ifdef CONFIG_DYNAMIC_FTRACE -static void update_traceon_count(void **data, bool on) +static void update_traceon_count(struct ftrace_probe_ops *ops, + unsigned long ip, + struct trace_array *tr, bool on, + void *data) { - long *count = (long *)data; - long old_count = *count; + struct ftrace_func_mapper *mapper = data; + long *count; + long old_count; /* * Tracing gets disabled (or enabled) once per count. @@ -301,23 +305,22 @@ static void update_traceon_count(void **data, bool on) * setting the tracing_on file. But we currently don't care * about that. */ - if (!old_count) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + old_count = *count; + + if (old_count <= 0) return; /* Make sure we see count before checking tracing state */ smp_rmb(); - if (on == !!tracing_is_on()) + if (on == !!tracer_tracing_is_on(tr)) return; if (on) - tracing_on(); + tracer_tracing_on(tr); else - tracing_off(); - - /* unlimited? 
*/ - if (old_count == -1) - return; + tracer_tracing_off(tr); /* Make sure tracing state is visible before updating count */ smp_wmb(); @@ -326,33 +329,41 @@ static void update_traceon_count(void **data, bool on) } static void -ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - update_traceon_count(data, 1); + update_traceon_count(ops, ip, tr, 1, data); } static void -ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - update_traceon_count(data, 0); + update_traceon_count(ops, ip, tr, 0, data); } static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - if (tracing_is_on()) + if (tracer_tracing_is_on(tr)) return; - tracing_on(); + tracer_tracing_on(tr); } static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - if (!tracing_is_on()) + if (!tracer_tracing_is_on(tr)) return; - tracing_off(); + tracer_tracing_off(tr); } /* @@ -364,144 +375,218 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) */ #define STACK_SKIP 4 +static __always_inline void trace_stack(struct trace_array *tr) +{ + unsigned long flags; + int pc; + + local_save_flags(flags); + pc = preempt_count(); + + __trace_stack(tr, flags, STACK_SKIP, pc); +} + static void -ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - trace_dump_stack(STACK_SKIP); + trace_stack(tr); } static void -ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - long *count = (long *)data; + struct ftrace_func_mapper *mapper = data; + long *count; long old_count; long new_count; + if (!tracing_is_on()) + return; + + /* unlimited? */ + if (!mapper) { + trace_stack(tr); + return; + } + + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + /* * Stack traces should only execute the number of times the * user specified in the counter. */ do { - - if (!tracing_is_on()) - return; - old_count = *count; if (!old_count) return; - /* unlimited? 
*/ - if (old_count == -1) { - trace_dump_stack(STACK_SKIP); - return; - } - new_count = old_count - 1; new_count = cmpxchg(count, old_count, new_count); if (new_count == old_count) - trace_dump_stack(STACK_SKIP); + trace_stack(tr); + + if (!tracing_is_on()) + return; } while (new_count != old_count); } -static int update_count(void **data) +static int update_count(struct ftrace_probe_ops *ops, unsigned long ip, + void *data) { - unsigned long *count = (long *)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; - if (!*count) - return 0; + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); - if (*count != -1) + if (count) { + if (*count <= 0) + return 0; (*count)--; + } return 1; } static void -ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_dump_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - if (update_count(data)) + if (update_count(ops, ip, data)) ftrace_dump(DUMP_ALL); } /* Only dump the current CPU buffer. */ static void -ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_cpudump_probe(unsigned long ip, unsigned long parent_ip, + struct trace_array *tr, struct ftrace_probe_ops *ops, + void *data) { - if (update_count(data)) + if (update_count(ops, ip, data)) ftrace_dump(DUMP_ORIG); } static int ftrace_probe_print(const char *name, struct seq_file *m, - unsigned long ip, void *data) + unsigned long ip, struct ftrace_probe_ops *ops, + void *data) { - long count = (long)data; + struct ftrace_func_mapper *mapper = data; + long *count = NULL; seq_printf(m, "%ps:%s", (void *)ip, name); - if (count == -1) - seq_puts(m, ":unlimited\n"); + if (mapper) + count = (long *)ftrace_func_mapper_find_ip(mapper, ip); + + if (count) + seq_printf(m, ":count=%ld\n", *count); else - seq_printf(m, ":count=%ld\n", count); + seq_puts(m, ":unlimited\n"); return 0; } static int ftrace_traceon_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data) + struct ftrace_probe_ops *ops, + void *data) { - return ftrace_probe_print("traceon", m, ip, data); + return ftrace_probe_print("traceon", m, ip, ops, data); } static int ftrace_traceoff_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("traceoff", m, ip, data); + return ftrace_probe_print("traceoff", m, ip, ops, data); } static int ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("stacktrace", m, ip, data); + return ftrace_probe_print("stacktrace", m, ip, ops, data); } static int ftrace_dump_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("dump", m, ip, data); + return ftrace_probe_print("dump", m, ip, ops, data); } static int ftrace_cpudump_print(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data) { - return ftrace_probe_print("cpudump", m, ip, data); + return ftrace_probe_print("cpudump", m, ip, ops, data); +} + + +static int +ftrace_count_init(struct ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *init_data, void **data) +{ + struct ftrace_func_mapper *mapper = *data; + + if (!mapper) { + mapper = allocate_ftrace_func_mapper(); + if (!mapper) + return -ENOMEM; + *data = mapper; + } + + return ftrace_func_mapper_add_ip(mapper, ip, init_data); +} + +static void +ftrace_count_free(struct 
ftrace_probe_ops *ops, struct trace_array *tr, + unsigned long ip, void *data) +{ + struct ftrace_func_mapper *mapper = data; + + if (!ip) { + free_ftrace_func_mapper(mapper, NULL); + return; + } + + ftrace_func_mapper_remove_ip(mapper, ip); } static struct ftrace_probe_ops traceon_count_probe_ops = { .func = ftrace_traceon_count, .print = ftrace_traceon_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops traceoff_count_probe_ops = { .func = ftrace_traceoff_count, .print = ftrace_traceoff_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops stacktrace_count_probe_ops = { .func = ftrace_stacktrace_count, .print = ftrace_stacktrace_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops dump_probe_ops = { .func = ftrace_dump_probe, .print = ftrace_dump_print, + .init = ftrace_count_init, + .free = ftrace_count_free, }; static struct ftrace_probe_ops cpudump_probe_ops = { @@ -525,7 +610,8 @@ static struct ftrace_probe_ops stacktrace_probe_ops = { }; static int -ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, +ftrace_trace_probe_callback(struct trace_array *tr, + struct ftrace_probe_ops *ops, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { @@ -537,10 +623,8 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, if (!enable) return -EINVAL; - if (glob[0] == '!') { - unregister_ftrace_function_probe_func(glob+1, ops); - return 0; - } + if (glob[0] == '!') + return unregister_ftrace_function_probe_func(glob+1, tr, ops); if (!param) goto out_reg; @@ -559,13 +643,13 @@ ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, return ret; out_reg: - ret = register_ftrace_function_probe(glob, ops, count); + ret = register_ftrace_function_probe(glob, tr, ops, count); return ret < 0 ? ret : 0; } static int -ftrace_trace_onoff_callback(struct ftrace_hash *hash, +ftrace_trace_onoff_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -576,24 +660,24 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, else ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, param, enable); } static int -ftrace_stacktrace_callback(struct ftrace_hash *hash, +ftrace_stacktrace_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, param, enable); } static int -ftrace_dump_callback(struct ftrace_hash *hash, +ftrace_dump_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -601,12 +685,12 @@ ftrace_dump_callback(struct ftrace_hash *hash, ops = &dump_probe_ops; /* Only dump once. 
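Taken together, the init/func/free trio above replaces the old single (void **)data counter with one ftrace_func_mapper per probe instance, mapping each traced ip to its own countdown slot. A condensed sketch of the resulting probe shape, not the patch's code; my_probe and my_action are hypothetical, while the mapper and cmpxchg calls are the ones used in the hunks above:

static void
my_probe(unsigned long ip, unsigned long parent_ip,
	 struct trace_array *tr, struct ftrace_probe_ops *ops, void *data)
{
	struct ftrace_func_mapper *mapper = data;
	long *count, old;

	if (!mapper) {			/* registered without a [:count] */
		my_action(tr);
		return;
	}

	count = (long *)ftrace_func_mapper_find_ip(mapper, ip);
	if (!count)
		return;

	do {				/* lock-free countdown, as above */
		old = *count;
		if (!old)
			return;		/* budget for this ip exhausted */
	} while (cmpxchg(count, old, old - 1) != old);

	my_action(tr);
}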
*/ - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, "1", enable); } static int -ftrace_cpudump_callback(struct ftrace_hash *hash, +ftrace_cpudump_callback(struct trace_array *tr, struct ftrace_hash *hash, char *glob, char *cmd, char *param, int enable) { struct ftrace_probe_ops *ops; @@ -614,7 +698,7 @@ ftrace_cpudump_callback(struct ftrace_hash *hash, ops = &cpudump_probe_ops; /* Only dump once. */ - return ftrace_trace_probe_callback(ops, hash, glob, cmd, + return ftrace_trace_probe_callback(tr, ops, hash, glob, cmd, "1", enable); } @@ -687,9 +771,8 @@ static inline int init_func_cmd_traceon(void) } #endif /* CONFIG_DYNAMIC_FTRACE */ -static __init int init_function_trace(void) +__init int init_function_trace(void) { init_func_cmd_traceon(); return register_tracer(&function_trace); } -core_initcall(init_function_trace); diff --git a/kernel/trace/trace_hwlat.c b/kernel/trace/trace_hwlat.c index 21ea6ae77d93fd45f50ed71059fbe4954b9ef2c0..d7c8e4ec3d9d6cd92b6d78a2c66c984cbbc6fb6c 100644 --- a/kernel/trace/trace_hwlat.c +++ b/kernel/trace/trace_hwlat.c @@ -79,12 +79,12 @@ static u64 last_tracing_thresh = DEFAULT_LAT_THRESHOLD * NSEC_PER_USEC; /* Individual latency samples are stored here when detected. */ struct hwlat_sample { - u64 seqnum; /* unique sequence */ - u64 duration; /* delta */ - u64 outer_duration; /* delta (outer loop) */ - u64 nmi_total_ts; /* Total time spent in NMIs */ - struct timespec timestamp; /* wall time */ - int nmi_count; /* # NMIs during this sample */ + u64 seqnum; /* unique sequence */ + u64 duration; /* delta */ + u64 outer_duration; /* delta (outer loop) */ + u64 nmi_total_ts; /* Total time spent in NMIs */ + struct timespec64 timestamp; /* wall time */ + int nmi_count; /* # NMIs during this sample */ }; /* keep the global state somewhere. 
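The hwlat_sample change above is the y2038 side of this series: struct timespec keeps seconds in a long, which truncates on 32-bit, while timespec64 does not. The hunks that follow convert the producer to ktime_get_real_ts64() and cast tv_sec at the print sites. The pattern, as a minimal sketch in which my_sample is a hypothetical stand-in for the hwlat sample:

	struct timespec64 ts;

	ktime_get_real_ts64(&ts);	/* wall clock, 64-bit seconds */
	my_sample.timestamp = ts;	/* field is now a timespec64 */

	/* tv_sec is time64_t, so cast for a portable printf format: */
	pr_info("ts:%lld.%09ld\n", (long long)ts.tv_sec, ts.tv_nsec);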
*/ @@ -250,7 +250,7 @@ static int get_sample(void) s.seqnum = hwlat_data.count; s.duration = sample; s.outer_duration = outer_sample; - s.timestamp = CURRENT_TIME; + ktime_get_real_ts64(&s.timestamp); s.nmi_total_ts = nmi_total_ts; s.nmi_count = nmi_count; trace_hwlat_sample(&s); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 5f688cc724f00abcd9d09686adc75fb96e4b5b02..8485f6738a87cba32a070543b4ae7b8c434ad04c 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -25,6 +25,7 @@ #include "trace_probe.h" #define KPROBE_EVENT_SYSTEM "kprobes" +#define KRETPROBE_MAXACTIVE_MAX 4096 /** * Kprobe event core functions @@ -282,6 +283,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, void *addr, const char *symbol, unsigned long offs, + int maxactive, int nargs, bool is_return) { struct trace_kprobe *tk; @@ -309,6 +311,8 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, else tk->rp.kp.pre_handler = kprobe_dispatcher; + tk->rp.maxactive = maxactive; + if (!event || !is_good_name(event)) { ret = -EINVAL; goto error; @@ -598,8 +602,10 @@ static int create_trace_kprobe(int argc, char **argv) { /* * Argument syntax: - * - Add kprobe: p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] - * - Add kretprobe: r[:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] + * - Add kprobe: + * p[:[GRP/]EVENT] [MOD:]KSYM[+OFFS]|KADDR [FETCHARGS] + * - Add kretprobe: + * r[MAXACTIVE][:[GRP/]EVENT] [MOD:]KSYM[+0] [FETCHARGS] * Fetch args: * $retval : fetch return value * $stack : fetch stack address @@ -619,6 +625,7 @@ static int create_trace_kprobe(int argc, char **argv) int i, ret = 0; bool is_return = false, is_delete = false; char *symbol = NULL, *event = NULL, *group = NULL; + int maxactive = 0; char *arg; unsigned long offset = 0; void *addr = NULL; @@ -637,8 +644,28 @@ static int create_trace_kprobe(int argc, char **argv) return -EINVAL; } - if (argv[0][1] == ':') { - event = &argv[0][2]; + event = strchr(&argv[0][1], ':'); + if (event) { + event[0] = '\0'; + event++; + } + if (is_return && isdigit(argv[0][1])) { + ret = kstrtouint(&argv[0][1], 0, &maxactive); + if (ret) { + pr_info("Failed to parse maxactive.\n"); + return ret; + } + /* kretprobes instances are iterated over via a list. The + * maximum should stay reasonable. + */ + if (maxactive > KRETPROBE_MAXACTIVE_MAX) { + pr_info("Maxactive is too big (%d > %d).\n", + maxactive, KRETPROBE_MAXACTIVE_MAX); + return -E2BIG; + } + } + + if (event) { if (strchr(event, '/')) { group = event; event = strchr(group, '/') + 1; @@ -681,10 +708,6 @@ static int create_trace_kprobe(int argc, char **argv) return -EINVAL; } if (isdigit(argv[1][0])) { - if (is_return) { - pr_info("Return probe point must be a symbol.\n"); - return -EINVAL; - } /* an address specified */ ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { @@ -700,8 +723,9 @@ static int create_trace_kprobe(int argc, char **argv) pr_info("Failed to parse symbol.\n"); return ret; } - if (offset && is_return) { - pr_info("Return probe must be used without offset.\n"); + if (offset && is_return && + !function_offset_within_entry(NULL, symbol, offset)) { + pr_info("Given offset is not valid for return probe.\n"); return -EINVAL; } } @@ -718,8 +742,8 @@ static int create_trace_kprobe(int argc, char **argv) is_return ? 
'r' : 'p', addr); event = buf; } - tk = alloc_trace_kprobe(group, event, addr, symbol, offset, argc, - is_return); + tk = alloc_trace_kprobe(group, event, addr, symbol, offset, maxactive, + argc, is_return); if (IS_ERR(tk)) { pr_info("Failed to allocate trace_probe.(%d)\n", (int)PTR_ERR(tk)); diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 02a4aeb22c4785cc808a7906c970dfa420796cbe..08f9bab8089e74c2acd63e0eaa7ef53f922b88c4 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -4,7 +4,6 @@ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt * */ - #include #include #include @@ -1161,11 +1160,11 @@ trace_hwlat_print(struct trace_iterator *iter, int flags, trace_assign_type(field, entry); - trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%ld.%09ld", + trace_seq_printf(s, "#%-5u inner/outer(us): %4llu/%-5llu ts:%lld.%09ld", field->seqnum, field->duration, field->outer_duration, - field->timestamp.tv_sec, + (long long)field->timestamp.tv_sec, field->timestamp.tv_nsec); if (field->nmi_count) { @@ -1195,10 +1194,10 @@ trace_hwlat_raw(struct trace_iterator *iter, int flags, trace_assign_type(field, iter->ent); - trace_seq_printf(s, "%llu %lld %ld %09ld %u\n", + trace_seq_printf(s, "%llu %lld %lld %09ld %u\n", field->duration, field->outer_duration, - field->timestamp.tv_sec, + (long long)field->timestamp.tv_sec, field->timestamp.tv_nsec, field->seqnum); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 5fb1f2c87e6b846b7f9d32823ef3aede4b28db9e..76aa04d4c9257482181c9b2885530fb4dc417edd 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -35,7 +35,7 @@ unsigned long stack_trace_max_size; arch_spinlock_t stack_trace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; -static DEFINE_PER_CPU(int, trace_active); +DEFINE_PER_CPU(int, disable_stack_tracer); static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; @@ -96,6 +96,14 @@ check_stack(unsigned long ip, unsigned long *stack) if (in_nmi()) return; + /* + * There's a slight chance that we are tracing inside the + * RCU infrastructure, and rcu_irq_enter() will not work + * as expected. + */ + if (unlikely(rcu_irq_enter_disabled())) + return; + local_irq_save(flags); arch_spin_lock(&stack_trace_max_lock); @@ -207,13 +215,12 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { unsigned long stack; - int cpu; preempt_disable_notrace(); - cpu = raw_smp_processor_id(); /* no atomic needed, we only modify this variable by this cpu */ - if (per_cpu(trace_active, cpu)++ != 0) + __this_cpu_inc(disable_stack_tracer); + if (__this_cpu_read(disable_stack_tracer) != 1) goto out; ip += MCOUNT_INSN_SIZE; @@ -221,7 +228,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, check_stack(ip, &stack); out: - per_cpu(trace_active, cpu)--; + __this_cpu_dec(disable_stack_tracer); /* prevent recursion in schedule */ preempt_enable_notrace(); } @@ -253,7 +260,6 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, long *ptr = filp->private_data; unsigned long val, flags; int ret; - int cpu; ret = kstrtoul_from_user(ubuf, count, 10, &val); if (ret) @@ -264,16 +270,15 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, /* * In case we trace inside arch_spin_lock() or after (NMI), * we will cause circular lock, so we also need to increase - * the percpu trace_active here. + * the percpu disable_stack_tracer here. 
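With the r[MAXACTIVE] syntax parsed above, a user can pre-size the kretprobe instance pool (capped at KRETPROBE_MAXACTIVE_MAX, 4096) instead of relying on the NR_CPUS-based default. A minimal userspace sketch, assuming tracefs is reachable under the usual debugfs path and using a hypothetical event name:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* "r64" = kretprobe with 64 preallocated instances */
	const char *cmd = "r64:myopen_ret do_sys_open $retval\n";
	int fd = open("/sys/kernel/debug/tracing/kprobe_events",
		      O_WRONLY | O_APPEND);

	if (fd < 0 || write(fd, cmd, strlen(cmd)) < 0)
		perror("kprobe_events");
	if (fd >= 0)
		close(fd);
	return 0;
}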
*/ - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)++; + __this_cpu_inc(disable_stack_tracer); arch_spin_lock(&stack_trace_max_lock); *ptr = val; arch_spin_unlock(&stack_trace_max_lock); - per_cpu(trace_active, cpu)--; + __this_cpu_dec(disable_stack_tracer); local_irq_restore(flags); return count; @@ -307,12 +312,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { - int cpu; - local_irq_disable(); - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)++; + __this_cpu_inc(disable_stack_tracer); arch_spin_lock(&stack_trace_max_lock); @@ -324,12 +326,9 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { - int cpu; - arch_spin_unlock(&stack_trace_max_lock); - cpu = smp_processor_id(); - per_cpu(trace_active, cpu)--; + __this_cpu_dec(disable_stack_tracer); local_irq_enable(); } diff --git a/kernel/workqueue.c index c0168b7da1eaf22c216147ca5ebd03ef7311dca8..c74bf39ef7643fdc4ecc219aa25b58feabcd91f3 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -3209,9 +3209,8 @@ static int init_worker_pool(struct worker_pool *pool) INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); - init_timer_deferrable(&pool->idle_timer); - pool->idle_timer.function = idle_worker_timeout; - pool->idle_timer.data = (unsigned long)pool; + setup_deferrable_timer(&pool->idle_timer, idle_worker_timeout, + (unsigned long)pool); setup_timer(&pool->mayday_timer, pool_mayday_timeout, (unsigned long)pool); @@ -4735,6 +4734,29 @@ long work_on_cpu(int cpu, long (*fn)(void *), void *arg) return wfc.ret; } EXPORT_SYMBOL_GPL(work_on_cpu); + +/** + * work_on_cpu_safe - run a function in thread context on a particular cpu + * @cpu: the cpu to run on + * @fn: the function to run + * @arg: the function argument + * + * Disables CPU hotplug and calls work_on_cpu(). The caller must not hold + * any locks which would prevent @fn from completing. + * + * Return: The value @fn returns. + */ +long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) +{ + long ret = -ENODEV; + + get_online_cpus(); + if (cpu_online(cpu)) + ret = work_on_cpu(cpu, fn, arg); + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL_GPL(work_on_cpu_safe); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER diff --git a/lib/Kconfig.debug index 97d62c2da6c25dd5721f8c1c75264c83201f7247..e4587ebe52c7ec3c7923c661dea9b8f15b77bd8e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -130,7 +130,8 @@ config DYNAMIC_DEBUG nullarbor:~ # echo -n 'func svc_process -p' > <debugfs>/dynamic_debug/control - See Documentation/dynamic-debug-howto.txt for additional information. + See Documentation/admin-guide/dynamic-debug-howto.rst for additional + information. endmenu # "printk and dmesg options" @@ -356,7 +357,7 @@ config FRAME_POINTER bool "Compile the kernel with frame pointers" depends on DEBUG_KERNEL && \ (CRIS || M68K || FRV || UML || \ - AVR32 || SUPERH || BLACKFIN || MN10300 || METAG) || \ + SUPERH || BLACKFIN || MN10300 || METAG) || \ ARCH_WANT_FRAME_POINTERS default y if (DEBUG_INFO && UML) || ARCH_WANT_FRAME_POINTERS help @@ -404,8 +405,8 @@ config MAGIC_SYSRQ by pressing various keys while holding SysRq (Alt+PrintScreen). It also works on a serial console (on PC hardware at least), if you send a BREAK and then within 5 seconds a command keypress. The - keys are documented in <file:Documentation/sysrq.txt>. Don't say Y - unless you really know what this hack does. + keys are documented in <file:Documentation/admin-guide/sysrq.rst>. 
+ Don't say Y unless you really know what this hack does. config MAGIC_SYSRQ_DEFAULT_ENABLE hex "Enable magic SysRq key functions by default" @@ -414,7 +415,7 @@ config MAGIC_SYSRQ_DEFAULT_ENABLE help Specifies which SysRq key functions are enabled by default. This may be set to 1 or 0 to enable or disable them all, or - to a bitmask as described in Documentation/sysrq.txt. + to a bitmask as described in Documentation/admin-guide/sysrq.rst. config MAGIC_SYSRQ_SERIAL bool "Enable magic SysRq key over serial" @@ -1103,9 +1104,6 @@ config PROVE_LOCKING For more details, see Documentation/locking/lockdep-design.txt. -config PROVE_LOCKING_SMALL - bool - config LOCKDEP bool depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT @@ -1114,6 +1112,9 @@ config LOCKDEP select KALLSYMS select KALLSYMS_ALL +config LOCKDEP_SMALL + bool + config LOCK_STAT bool "Lock usage statistics" depends on DEBUG_KERNEL && TRACE_IRQFLAGS_SUPPORT && STACKTRACE_SUPPORT && LOCKDEP_SUPPORT @@ -1718,19 +1719,21 @@ config LKDTM Documentation/fault-injection/provoke-crashes.txt config TEST_LIST_SORT - bool "Linked list sorting test" - depends on DEBUG_KERNEL + tristate "Linked list sorting test" + depends on DEBUG_KERNEL || m help Enable this to turn on 'list_sort()' function test. This test is - executed only once during system boot, so affects only boot time. + executed only once during system boot (so affects only boot time), + or at module load time. If unsure, say N. config TEST_SORT - bool "Array-based sort test" - depends on DEBUG_KERNEL + tristate "Array-based sort test" + depends on DEBUG_KERNEL || m help - This option enables the self-test function of 'sort()' at boot. + This option enables the self-test function of 'sort()' at boot, + or at module load time. If unsure, say N. 
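The tristate conversions above let the list_sort and sort self-tests run either once at boot when built in, or on demand as loadable modules. The enabling pattern is ordinary module boilerplate; a minimal sketch with a hypothetical test name:

#include <linux/module.h>
#include <linux/printk.h>

static int __init my_selftest_init(void)
{
	pr_info("running selftest\n");
	return 0;	/* return an error here to make module load fail */
}

static void __exit my_selftest_exit(void)
{
}

module_init(my_selftest_init);	/* boot time built-in, load time as module */
module_exit(my_selftest_exit);
MODULE_LICENSE("GPL");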
diff --git a/lib/Makefile b/lib/Makefile index 320ac46a8725b6f9dac0726767a4976a2bd8907c..0166fbc0fa811f8462bc2f3dafcae9408ae922c4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -19,7 +19,7 @@ KCOV_INSTRUMENT_dynamic_debug.o := n lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ idr.o int_sqrt.o extable.o \ - sha1.o chacha20.o md5.o irq_regs.o argv_split.o \ + sha1.o chacha20.o irq_regs.o argv_split.o \ flex_proportions.o ratelimit.o show_mem.o \ is_single_threaded.o plist.o decompress.o kobject_uevent.o \ earlycpio.o seq_buf.o siphash.o \ @@ -41,7 +41,7 @@ obj-y += bcd.o div64.o sort.o parser.o debug_locks.o random32.o \ gcd.o lcm.o list_sort.o uuid.o flex_array.o iov_iter.o clz_ctz.o \ bsearch.o find_bit.o llist.o memweight.o kfifo.o \ percpu-refcount.o percpu_ida.o rhashtable.o reciprocal_div.o \ - once.o refcount.o + once.o refcount.o usercopy.o obj-y += string_helpers.o obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o obj-y += hexdump.o @@ -52,6 +52,7 @@ obj-$(CONFIG_TEST_FIRMWARE) += test_firmware.o obj-$(CONFIG_TEST_HASH) += test_hash.o test_siphash.o obj-$(CONFIG_TEST_KASAN) += test_kasan.o obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o +obj-$(CONFIG_TEST_LIST_SORT) += test_list_sort.o obj-$(CONFIG_TEST_LKM) += test_module.o obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o obj-$(CONFIG_TEST_SORT) += test_sort.o diff --git a/lib/bitmap.c b/lib/bitmap.c index 0b66f0e5eb6bb172ae240fe59dd1eec0d9065493..08c6ef3a2b6fc5134968fd05bfbc8cd0b2ee91aa 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -502,11 +502,11 @@ EXPORT_SYMBOL(bitmap_print_to_pagebuf); * Syntax: range:used_size/group_size * Example: 0-1023:2/256 ==> 0,1,256,257,512,513,768,769 * - * Returns 0 on success, -errno on invalid input strings. - * Error values: - * %-EINVAL: second number in range smaller than first - * %-EINVAL: invalid character in string - * %-ERANGE: bit number specified too large for mask + * Returns: 0 on success, -errno on invalid input strings. Error values: + * + * - ``-EINVAL``: second number in range smaller than first + * - ``-EINVAL``: invalid character in string + * - ``-ERANGE``: bit number specified too large for mask */ static int __bitmap_parselist(const char *buf, unsigned int buflen, int is_user, unsigned long *maskp, @@ -864,14 +864,16 @@ EXPORT_SYMBOL(bitmap_bitremap); * 11 was set in @orig had no affect on @dst. * * Example [2] for bitmap_fold() + bitmap_onto(): - * Let's say @relmap has these ten bits set: + * Let's say @relmap has these ten bits set:: + * * 40 41 42 43 45 48 53 61 74 95 + * * (for the curious, that's 40 plus the first ten terms of the * Fibonacci sequence.) * * Further lets say we use the following code, invoking * bitmap_fold() then bitmap_onto, as suggested above to - * avoid the possibility of an empty @dst result: + * avoid the possibility of an empty @dst result:: * * unsigned long *tmp; // a temporary bitmap's bits * @@ -882,22 +884,26 @@ EXPORT_SYMBOL(bitmap_bitremap); * various @orig's. I list the zero-based positions of each set bit. * The tmp column shows the intermediate result, as computed by * using bitmap_fold() to fold the @orig bitmap modulo ten - * (the weight of @relmap). 
+ * (the weight of @relmap): * + * =============== ============== ================= * @orig tmp @dst * 0 0 40 * 1 1 41 * 9 9 95 - * 10 0 40 (*) + * 10 0 40 [#f1]_ * 1 3 5 7 1 3 5 7 41 43 48 61 * 0 1 2 3 4 0 1 2 3 4 40 41 42 43 45 * 0 9 18 27 0 9 8 7 40 61 74 95 * 0 10 20 30 0 40 * 0 11 22 33 0 1 2 3 40 41 42 43 * 0 12 24 36 0 2 4 6 40 42 45 53 - * 78 102 211 1 2 8 41 42 74 (*) + * 78 102 211 1 2 8 41 42 74 [#f1]_ + * =============== ============== ================= + * + * .. [#f1] * - * (*) For these marked lines, if we hadn't first done bitmap_fold() + * For these marked lines, if we hadn't first done bitmap_fold() * into tmp, then the @dst result would have been empty. * * If either of @orig or @relmap is empty (no set bits), then @dst diff --git a/lib/bug.c b/lib/bug.c index 06edbbef062322f12662f95d726b2d83856ac62c..a6a1137d06db75f94d005a9e646e061847be1c93 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -47,7 +47,7 @@ #include #include -extern const struct bug_entry __start___bug_table[], __stop___bug_table[]; +extern struct bug_entry __start___bug_table[], __stop___bug_table[]; static inline unsigned long bug_addr(const struct bug_entry *bug) { @@ -62,10 +62,10 @@ static inline unsigned long bug_addr(const struct bug_entry *bug) /* Updates are protected by module mutex */ static LIST_HEAD(module_bug_list); -static const struct bug_entry *module_find_bug(unsigned long bugaddr) +static struct bug_entry *module_find_bug(unsigned long bugaddr) { struct module *mod; - const struct bug_entry *bug = NULL; + struct bug_entry *bug = NULL; rcu_read_lock_sched(); list_for_each_entry_rcu(mod, &module_bug_list, bug_list) { @@ -122,15 +122,15 @@ void module_bug_cleanup(struct module *mod) #else -static inline const struct bug_entry *module_find_bug(unsigned long bugaddr) +static inline struct bug_entry *module_find_bug(unsigned long bugaddr) { return NULL; } #endif -const struct bug_entry *find_bug(unsigned long bugaddr) +struct bug_entry *find_bug(unsigned long bugaddr) { - const struct bug_entry *bug; + struct bug_entry *bug; for (bug = __start___bug_table; bug < __stop___bug_table; ++bug) if (bugaddr == bug_addr(bug)) @@ -141,9 +141,9 @@ const struct bug_entry *find_bug(unsigned long bugaddr) enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) { - const struct bug_entry *bug; + struct bug_entry *bug; const char *file; - unsigned line, warning; + unsigned line, warning, once, done; if (!is_valid_bugaddr(bugaddr)) return BUG_TRAP_TYPE_NONE; @@ -164,6 +164,18 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) line = bug->line; #endif warning = (bug->flags & BUGFLAG_WARNING) != 0; + once = (bug->flags & BUGFLAG_ONCE) != 0; + done = (bug->flags & BUGFLAG_DONE) != 0; + + if (warning && once) { + if (done) + return BUG_TRAP_TYPE_WARN; + + /* + * Since this is the only store, concurrency is not an issue. + */ + bug->flags |= BUGFLAG_DONE; + } } if (warning) { diff --git a/lib/cmdline.c b/lib/cmdline.c index 8f13cf73c2ecf916c203ef7e963d250a113c3dee..3c6432df7e63466a24d41dead807c7ef14c0ab86 100644 --- a/lib/cmdline.c +++ b/lib/cmdline.c @@ -15,6 +15,7 @@ #include #include #include +#include /* * If a hyphen was found in get_option, this will handle the @@ -189,3 +190,59 @@ bool parse_option_str(const char *str, const char *option) return false; } + +/* + * Parse a string to get a param value pair. + * You can use " around spaces, but can't escape ". + * Hyphens and underscores equivalent in parameter names. 
+ */ +char *next_arg(char *args, char **param, char **val) +{ + unsigned int i, equals = 0; + int in_quote = 0, quoted = 0; + char *next; + + if (*args == '"') { + args++; + in_quote = 1; + quoted = 1; + } + + for (i = 0; args[i]; i++) { + if (isspace(args[i]) && !in_quote) + break; + if (equals == 0) { + if (args[i] == '=') + equals = i; + } + if (args[i] == '"') + in_quote = !in_quote; + } + + *param = args; + if (!equals) + *val = NULL; + else { + args[equals] = '\0'; + *val = args + equals + 1; + + /* Don't include quotes in value. */ + if (**val == '"') { + (*val)++; + if (args[i-1] == '"') + args[i-1] = '\0'; + } + } + if (quoted && args[i-1] == '"') + args[i-1] = '\0'; + + if (args[i]) { + args[i] = '\0'; + next = args + i + 1; + } else + next = args + i; + + /* Chew up trailing spaces. */ + return skip_spaces(next); + //return next; +} diff --git a/lib/devres.c b/lib/devres.c index cb1464c411a2b4fa88778343bf33208ccd8dae76..78eca713b1d9ad99caa14deb3b688559aaf0fd63 100644 --- a/lib/devres.c +++ b/lib/devres.c @@ -17,7 +17,7 @@ static int devm_ioremap_match(struct device *dev, void *res, void *match_data) /** * devm_ioremap - Managed ioremap() * @dev: Generic device to remap IO address for - * @offset: BUS offset to map + * @offset: Resource address to map * @size: Size of map * * Managed ioremap(). Map is automatically unmapped on driver detach. @@ -45,7 +45,7 @@ EXPORT_SYMBOL(devm_ioremap); /** * devm_ioremap_nocache - Managed ioremap_nocache() * @dev: Generic device to remap IO address for - * @offset: BUS offset to map + * @offset: Resource address to map * @size: Size of map * * Managed ioremap_nocache(). Map is automatically unmapped on driver @@ -74,7 +74,7 @@ EXPORT_SYMBOL(devm_ioremap_nocache); /** * devm_ioremap_wc - Managed ioremap_wc() * @dev: Generic device to remap IO address for - * @offset: BUS offset to map + * @offset: Resource address to map * @size: Size of map * * Managed ioremap_wc(). Map is automatically unmapped on driver detach. 
diff --git a/lib/dma-debug.c b/lib/dma-debug.c index b157b46cc9a69ca830a2cdbc8abcbc9baab4a63e..ea4cc3dde4f1bac9f3b8337fdac95408cb3e8b35 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -942,21 +942,17 @@ static int device_dma_allocations(struct device *dev, struct dma_debug_entry **o unsigned long flags; int count = 0, i; - local_irq_save(flags); - for (i = 0; i < HASH_SIZE; ++i) { - spin_lock(&dma_entry_hash[i].lock); + spin_lock_irqsave(&dma_entry_hash[i].lock, flags); list_for_each_entry(entry, &dma_entry_hash[i].list, list) { if (entry->dev == dev) { count += 1; *out_entry = entry; } } - spin_unlock(&dma_entry_hash[i].lock); + spin_unlock_irqrestore(&dma_entry_hash[i].lock, flags); } - local_irq_restore(flags); - return count; } @@ -1502,7 +1498,7 @@ void debug_dma_alloc_coherent(struct device *dev, size_t size, entry->type = dma_debug_coherent; entry->dev = dev; entry->pfn = page_to_pfn(virt_to_page(virt)); - entry->offset = (size_t) virt & ~PAGE_MASK; + entry->offset = offset_in_page(virt); entry->size = size; entry->dev_addr = dma_addr; entry->direction = DMA_BIDIRECTIONAL; @@ -1518,7 +1514,7 @@ void debug_dma_free_coherent(struct device *dev, size_t size, .type = dma_debug_coherent, .dev = dev, .pfn = page_to_pfn(virt_to_page(virt)), - .offset = (size_t) virt & ~PAGE_MASK, + .offset = offset_in_page(virt), .dev_addr = addr, .size = size, .direction = DMA_BIDIRECTIONAL, diff --git a/lib/fault-inject.c b/lib/fault-inject.c index 6a823a53e357bc83e84476d93cc2c155f0adfee5..4ff157159a0d4d9d746fad5d228e878bdecd5be2 100644 --- a/lib/fault-inject.c +++ b/lib/fault-inject.c @@ -56,7 +56,7 @@ static void fail_dump(struct fault_attr *attr) static bool fail_task(struct fault_attr *attr, struct task_struct *task) { - return !in_interrupt() && task->make_it_fail; + return in_task() && task->make_it_fail; } #define MAX_STACK_TRACE_DEPTH 32 diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 60abc44385b795de7449b3810e9e294b7d2c48fc..f835964c9485f147699609bf2c2dc1ffecb8833b 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -413,7 +413,7 @@ void iov_iter_init(struct iov_iter *i, int direction, size_t count) { /* It will get better. Eventually... 
*/ - if (segment_eq(get_fs(), KERNEL_DS)) { + if (uaccess_kernel()) { direction |= ITER_KVEC; i->type = direction; i->kvec = (struct kvec *)iov; @@ -604,7 +604,7 @@ size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) return 0; } iterate_and_advance(i, bytes, v, - __copy_from_user_nocache((to += v.iov_len) - v.iov_len, + __copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len), memcpy_from_page((to += v.bv_len) - v.bv_len, v.bv_page, v.bv_offset, v.bv_len), @@ -625,7 +625,7 @@ bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) if (unlikely(i->count < bytes)) return false; iterate_all_kinds(i, bytes, v, ({ - if (__copy_from_user_nocache((to += v.iov_len) - v.iov_len, + if (__copy_from_user_inatomic_nocache((to += v.iov_len) - v.iov_len, v.iov_base, v.iov_len)) return false; 0;}), @@ -790,6 +790,8 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) { if (!unroll) return; + if (WARN_ON(unroll > MAX_RW_COUNT)) + return; i->count += unroll; if (unlikely(i->type & ITER_PIPE)) { struct pipe_inode_info *pipe = i->pipe; @@ -798,7 +800,7 @@ void iov_iter_revert(struct iov_iter *i, size_t unroll) while (1) { size_t n = off - pipe->bufs[idx].offset; if (unroll < n) { - off -= (n - unroll); + off -= unroll; break; } unroll -= n; @@ -1028,10 +1030,7 @@ EXPORT_SYMBOL(iov_iter_get_pages); static struct page **get_pages_array(size_t n) { - struct page **p = kmalloc(n * sizeof(struct page *), GFP_KERNEL); - if (!p) - p = vmalloc(n * sizeof(struct page *)); - return p; + return kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL); } static ssize_t pipe_get_pages_alloc(struct iov_iter *i, diff --git a/lib/kobject.c b/lib/kobject.c index 445dcaeb0f56de7882a1b1278a90ef7931f4099c..763d70a189410cb7307b87474801a662f631f806 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -601,12 +601,15 @@ struct kobject *kobject_get(struct kobject *kobj) } EXPORT_SYMBOL(kobject_get); -static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) +struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) { + if (!kobj) + return NULL; if (!kref_get_unless_zero(&kobj->kref)) kobj = NULL; return kobj; } +EXPORT_SYMBOL(kobject_get_unless_zero); /* * kobject_cleanup - free kobject resources. diff --git a/lib/list_sort.c b/lib/list_sort.c index 3fe401067e20ba81c762fb00ae00730de6b9502f..9e9acc37652f2814542438f65d20ea9b3aab6539 100644 --- a/lib/list_sort.c +++ b/lib/list_sort.c @@ -1,6 +1,3 @@ - -#define pr_fmt(fmt) "list_sort_test: " fmt - #include #include #include @@ -145,149 +142,3 @@ void list_sort(void *priv, struct list_head *head, merge_and_restore_back_links(priv, cmp, head, part[max_lev], list); } EXPORT_SYMBOL(list_sort); - -#ifdef CONFIG_TEST_LIST_SORT - -#include -#include - -/* - * The pattern of set bits in the list length determines which cases - * are hit in list_sort(). 
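The get_pages_array() change above is a straight substitution of the kmalloc-then-vmalloc fallback that kvmalloc_array() now encapsulates, gaining an overflow-checked multiplication in the bargain. Before and after, sketched:

	struct page **p;

	/* old: open-coded fallback, no overflow check on n * size */
	p = kmalloc(n * sizeof(struct page *), GFP_KERNEL);
	if (!p)
		p = vmalloc(n * sizeof(struct page *));

	/* new: one call, checked multiply; free with kvfree() either way */
	p = kvmalloc_array(n, sizeof(struct page *), GFP_KERNEL);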
- */ -#define TEST_LIST_LEN (512+128+2) /* not including head */ - -#define TEST_POISON1 0xDEADBEEF -#define TEST_POISON2 0xA324354C - -struct debug_el { - unsigned int poison1; - struct list_head list; - unsigned int poison2; - int value; - unsigned serial; -}; - -/* Array, containing pointers to all elements in the test list */ -static struct debug_el **elts __initdata; - -static int __init check(struct debug_el *ela, struct debug_el *elb) -{ - if (ela->serial >= TEST_LIST_LEN) { - pr_err("error: incorrect serial %d\n", ela->serial); - return -EINVAL; - } - if (elb->serial >= TEST_LIST_LEN) { - pr_err("error: incorrect serial %d\n", elb->serial); - return -EINVAL; - } - if (elts[ela->serial] != ela || elts[elb->serial] != elb) { - pr_err("error: phantom element\n"); - return -EINVAL; - } - if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { - pr_err("error: bad poison: %#x/%#x\n", - ela->poison1, ela->poison2); - return -EINVAL; - } - if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { - pr_err("error: bad poison: %#x/%#x\n", - elb->poison1, elb->poison2); - return -EINVAL; - } - return 0; -} - -static int __init cmp(void *priv, struct list_head *a, struct list_head *b) -{ - struct debug_el *ela, *elb; - - ela = container_of(a, struct debug_el, list); - elb = container_of(b, struct debug_el, list); - - check(ela, elb); - return ela->value - elb->value; -} - -static int __init list_sort_test(void) -{ - int i, count = 1, err = -ENOMEM; - struct debug_el *el; - struct list_head *cur; - LIST_HEAD(head); - - pr_debug("start testing list_sort()\n"); - - elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); - if (!elts) { - pr_err("error: cannot allocate memory\n"); - return err; - } - - for (i = 0; i < TEST_LIST_LEN; i++) { - el = kmalloc(sizeof(*el), GFP_KERNEL); - if (!el) { - pr_err("error: cannot allocate memory\n"); - goto exit; - } - /* force some equivalencies */ - el->value = prandom_u32() % (TEST_LIST_LEN / 3); - el->serial = i; - el->poison1 = TEST_POISON1; - el->poison2 = TEST_POISON2; - elts[i] = el; - list_add_tail(&el->list, &head); - } - - list_sort(NULL, &head, cmp); - - err = -EINVAL; - for (cur = head.next; cur->next != &head; cur = cur->next) { - struct debug_el *el1; - int cmp_result; - - if (cur->next->prev != cur) { - pr_err("error: list is corrupted\n"); - goto exit; - } - - cmp_result = cmp(NULL, cur, cur->next); - if (cmp_result > 0) { - pr_err("error: list is not sorted\n"); - goto exit; - } - - el = container_of(cur, struct debug_el, list); - el1 = container_of(cur->next, struct debug_el, list); - if (cmp_result == 0 && el->serial >= el1->serial) { - pr_err("error: order of equivalent elements not " - "preserved\n"); - goto exit; - } - - if (check(el, el1)) { - pr_err("error: element check failed\n"); - goto exit; - } - count++; - } - if (head.prev != cur) { - pr_err("error: list is corrupted\n"); - goto exit; - } - - - if (count != TEST_LIST_LEN) { - pr_err("error: bad list length %d", count); - goto exit; - } - - err = 0; -exit: - for (i = 0; i < TEST_LIST_LEN; i++) - kfree(elts[i]); - kfree(elts); - return err; -} -late_initcall(list_sort_test); -#endif /* CONFIG_TEST_LIST_SORT */ diff --git a/lib/md5.c b/lib/md5.c deleted file mode 100644 index bb0cd01d356d4263670a06909a2717eb913915a7..0000000000000000000000000000000000000000 --- a/lib/md5.c +++ /dev/null @@ -1,95 +0,0 @@ -#include -#include -#include - -#define F1(x, y, z) (z ^ (x & (y ^ z))) -#define F2(x, y, z) F1(z, x, y) -#define F3(x, y, z) (x ^ y ^ z) -#define F4(x, y, 
z) (y ^ (x | ~z)) - -#define MD5STEP(f, w, x, y, z, in, s) \ - (w += f(x, y, z) + in, w = (w<<s | w>>(32-s)) + x) - -void md5_transform(__u32 *hash, __u32 const *in) -{ - u32 a, b, c, d; - - a = hash[0]; - b = hash[1]; - c = hash[2]; - d = hash[3]; - - MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); - MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); - MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); - MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); - MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); - MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); - MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); - MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); - MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); - MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); - MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); - MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); - MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); - MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); - MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); - MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); - - MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); - MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); - MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); - MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); - MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); - MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); - MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); - MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); - MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); - MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); - MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); - MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); - MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); - MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); - MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); - MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); - - MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); - MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 11); - MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); - MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); - MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); - MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); - MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); - MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); - MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); - MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); - MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); - MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); - MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); - MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); - MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); - MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); - - MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); - MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); - MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); - MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); - MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); - MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); - MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); - MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); - MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); - MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); - MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); - MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); - MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); - MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); - MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); - MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); - - hash[0] += a; - hash[1] += b; - 
hash[2] += c; - hash[3] += d; -} -EXPORT_SYMBOL(md5_transform); diff --git a/lib/nlattr.c b/lib/nlattr.c index b42b8577fc2346045c4d88ede93960bf1cd8f3e7..a7e0b16078dff5035d1494d15f62622fdfe54b66 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -112,6 +112,7 @@ static int validate_nla(const struct nlattr *nla, int maxtype, * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy + * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Attributes with a type exceeding maxtype will be @@ -120,20 +121,23 @@ static int validate_nla(const struct nlattr *nla, int maxtype, * Returns 0 on success or a negative error code. */ int nla_validate(const struct nlattr *head, int len, int maxtype, - const struct nla_policy *policy) + const struct nla_policy *policy, + struct netlink_ext_ack *extack) { const struct nlattr *nla; - int rem, err; + int rem; nla_for_each_attr(nla, head, len, rem) { - err = validate_nla(nla, maxtype, policy); - if (err < 0) - goto errout; + int err = validate_nla(nla, maxtype, policy); + + if (err < 0) { + if (extack) + extack->bad_attr = nla; + return err; + } } - err = 0; -errout: - return err; + return 0; } EXPORT_SYMBOL(nla_validate); @@ -180,7 +184,8 @@ EXPORT_SYMBOL(nla_policy_len); * Returns 0 on success or a negative error code. */ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, - int len, const struct nla_policy *policy) + int len, const struct nla_policy *policy, + struct netlink_ext_ack *extack) { const struct nlattr *nla; int rem, err; @@ -193,8 +198,11 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, if (type > 0 && type <= maxtype) { if (policy) { err = validate_nla(nla, maxtype, policy); - if (err < 0) + if (err < 0) { + if (extack) + extack->bad_attr = nla; goto errout; + } } tb[type] = (struct nlattr *)nla; diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 9ac959ef4cae972374540f34895465507e656d7b..fe03c6d527611937e196395f83f7348da86f8f06 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c @@ -260,6 +260,22 @@ void percpu_ref_switch_to_atomic(struct percpu_ref *ref, spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } +EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic); + +/** + * percpu_ref_switch_to_atomic_sync - switch a percpu_ref to atomic mode + * @ref: percpu_ref to switch to atomic mode + * + * Schedule switching the ref to atomic mode, and wait for the + * switch to complete. Caller must ensure that no other thread + * will switch back to percpu mode. 
+ */ +void percpu_ref_switch_to_atomic_sync(struct percpu_ref *ref) +{ + percpu_ref_switch_to_atomic(ref, NULL); + wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); +} +EXPORT_SYMBOL_GPL(percpu_ref_switch_to_atomic_sync); /** * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode @@ -290,6 +306,7 @@ void percpu_ref_switch_to_percpu(struct percpu_ref *ref) spin_unlock_irqrestore(&percpu_ref_switch_lock, flags); } +EXPORT_SYMBOL_GPL(percpu_ref_switch_to_percpu); /** * percpu_ref_kill_and_confirm - drop the initial ref and schedule confirmation diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 691a9ad48497b02e3b09304d6565165ef2317b16..898e8799841759ff20f1dc3eb25f277ebb3a8119 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -2284,6 +2284,8 @@ static int radix_tree_cpu_dead(unsigned int cpu) void __init radix_tree_init(void) { int ret; + + BUILD_BUG_ON(RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT > 32); radix_tree_node_cachep = kmem_cache_create("radix_tree_node", sizeof(struct radix_tree_node), 0, SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, diff --git a/lib/refcount.c b/lib/refcount.c index aa09ad3c30b0dc37a920c46f0da711f366d29423..9f906783987e2a22813d6fd2a9f395ecccca1aef 100644 --- a/lib/refcount.c +++ b/lib/refcount.c @@ -37,11 +37,29 @@ #include #include +/** + * refcount_add_not_zero - add a value to a refcount unless it is 0 + * @i: the value to add to the refcount + * @r: the refcount + * + * Will saturate at UINT_MAX and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. + * + * Return: false if the passed refcount is 0, true otherwise + */ bool refcount_add_not_zero(unsigned int i, refcount_t *r) { - unsigned int old, new, val = atomic_read(&r->refs); + unsigned int new, val = atomic_read(&r->refs); - for (;;) { + do { if (!val) return false; @@ -51,37 +69,54 @@ bool refcount_add_not_zero(unsigned int i, refcount_t *r) new = val + i; if (new < val) new = UINT_MAX; - old = atomic_cmpxchg_relaxed(&r->refs, val, new); - if (old == val) - break; - val = old; - } + } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); return true; } -EXPORT_SYMBOL_GPL(refcount_add_not_zero); +EXPORT_SYMBOL(refcount_add_not_zero); +/** + * refcount_add - add a value to a refcount + * @i: the value to add to the refcount + * @r: the refcount + * + * Similar to atomic_add(), but will saturate at UINT_MAX and WARN. + * + * Provides no memory ordering, it is assumed the caller has guaranteed the + * object memory to be stable (RCU, etc.). It does provide a control dependency + * and thereby orders future stores. See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_inc(), or one of its variants, should instead be used to + * increment a reference count. 
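The loop rewrite running through these refcount_* conversions swaps the manual compare-and-retry dance around atomic_cmpxchg_relaxed() for atomic_try_cmpxchg_relaxed(), which refreshes the expected value on failure; the for (;;) plus old/val bookkeeping therefore collapses into a do/while. The idiom, isolated as a sketch (saturation handling elided for brevity):

	unsigned int val = atomic_read(&r->refs);
	unsigned int new;

	do {
		if (!val)
			return false;	/* 0 means the object is already dead */
		new = val + 1;		/* saturation check omitted here */
	} while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new));
	/* on each failed attempt, val was reloaded with the current count */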
+ */ void refcount_add(unsigned int i, refcount_t *r) { WARN_ONCE(!refcount_add_not_zero(i, r), "refcount_t: addition on 0; use-after-free.\n"); } -EXPORT_SYMBOL_GPL(refcount_add); +EXPORT_SYMBOL(refcount_add); -/* - * Similar to atomic_inc_not_zero(), will saturate at UINT_MAX and WARN. +/** + * refcount_inc_not_zero - increment a refcount unless it is 0 + * @r: the refcount to increment + * + * Similar to atomic_inc_not_zero(), but will saturate at UINT_MAX and WARN. * * Provides no memory ordering, it is assumed the caller has guaranteed the * object memory to be stable (RCU, etc.). It does provide a control dependency * and thereby orders future stores. See the comment on top. + * + * Return: true if the increment was successful, false otherwise */ bool refcount_inc_not_zero(refcount_t *r) { - unsigned int old, new, val = atomic_read(&r->refs); + unsigned int new, val = atomic_read(&r->refs); - for (;;) { + do { new = val + 1; if (!val) @@ -90,36 +125,57 @@ bool refcount_inc_not_zero(refcount_t *r) if (unlikely(!new)) return true; - old = atomic_cmpxchg_relaxed(&r->refs, val, new); - if (old == val) - break; - - val = old; - } + } while (!atomic_try_cmpxchg_relaxed(&r->refs, &val, new)); WARN_ONCE(new == UINT_MAX, "refcount_t: saturated; leaking memory.\n"); return true; } -EXPORT_SYMBOL_GPL(refcount_inc_not_zero); +EXPORT_SYMBOL(refcount_inc_not_zero); -/* - * Similar to atomic_inc(), will saturate at UINT_MAX and WARN. +/** + * refcount_inc - increment a refcount + * @r: the refcount to increment + * + * Similar to atomic_inc(), but will saturate at UINT_MAX and WARN. * * Provides no memory ordering, it is assumed the caller already has a - * reference on the object, will WARN when this is not so. + * reference on the object. + * + * Will WARN if the refcount is 0, as this represents a possible use-after-free + * condition. */ void refcount_inc(refcount_t *r) { WARN_ONCE(!refcount_inc_not_zero(r), "refcount_t: increment on 0; use-after-free.\n"); } -EXPORT_SYMBOL_GPL(refcount_inc); +EXPORT_SYMBOL(refcount_inc); +/** + * refcount_sub_and_test - subtract from a refcount and test if it is 0 + * @i: amount to subtract from the refcount + * @r: the refcount + * + * Similar to atomic_dec_and_test(), but it will WARN, return false and + * ultimately leak on underflow and will fail to decrement when saturated + * at UINT_MAX. + * + * Provides release memory ordering, such that prior loads and stores are done + * before, and provides a control dependency such that free() must come after. + * See the comment on top. + * + * Use of this function is not recommended for the normal reference counting + * use case in which references are taken and released one at a time. In these + * cases, refcount_dec(), or one of its variants, should instead be used to + * decrement a reference count. 
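The release ordering these comments insist on exists for the canonical put-side pattern, where the free must not be reordered before accesses done under the final reference. Sketched with a hypothetical object type:

struct my_obj {
	refcount_t ref;
	/* payload */
};

static void my_obj_put(struct my_obj *obj)
{
	if (refcount_dec_and_test(&obj->ref))
		kfree(obj);	/* ordered after all prior accesses to obj */
}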
+ * + * Return: true if the resulting refcount is 0, false otherwise + */ bool refcount_sub_and_test(unsigned int i, refcount_t *r) { - unsigned int old, new, val = atomic_read(&r->refs); + unsigned int new, val = atomic_read(&r->refs); - for (;;) { + do { if (unlikely(val == UINT_MAX)) return false; @@ -129,46 +185,51 @@ bool refcount_sub_and_test(unsigned int i, refcount_t *r) return false; } - old = atomic_cmpxchg_release(&r->refs, val, new); - if (old == val) - break; - - val = old; - } + } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); return !new; } -EXPORT_SYMBOL_GPL(refcount_sub_and_test); +EXPORT_SYMBOL(refcount_sub_and_test); -/* +/** + * refcount_dec_and_test - decrement a refcount and test if it is 0 + * @r: the refcount + * * Similar to atomic_dec_and_test(), it will WARN on underflow and fail to * decrement when saturated at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. + * + * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_and_test(refcount_t *r) { return refcount_sub_and_test(1, r); } -EXPORT_SYMBOL_GPL(refcount_dec_and_test); +EXPORT_SYMBOL(refcount_dec_and_test); -/* +/** + * refcount_dec - decrement a refcount + * @r: the refcount + * * Similar to atomic_dec(), it will WARN on underflow and fail to decrement * when saturated at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done * before. */ - void refcount_dec(refcount_t *r) { WARN_ONCE(refcount_dec_and_test(r), "refcount_t: decrement hit 0; leaking memory.\n"); } -EXPORT_SYMBOL_GPL(refcount_dec); +EXPORT_SYMBOL(refcount_dec); -/* +/** + * refcount_dec_if_one - decrement a refcount if it is 1 + * @r: the refcount + * * No atomic_t counterpart, it attempts a 1 -> 0 transition and returns the * success thereof. * @@ -178,24 +239,33 @@ EXPORT_SYMBOL_GPL(refcount_dec); * It can be used like a try-delete operator; this explicit case is provided * and not cmpxchg in generic, because that would allow implementing unsafe * operations. + * + * Return: true if the resulting refcount is 0, false otherwise */ bool refcount_dec_if_one(refcount_t *r) { - return atomic_cmpxchg_release(&r->refs, 1, 0) == 1; + int val = 1; + + return atomic_try_cmpxchg_release(&r->refs, &val, 0); } -EXPORT_SYMBOL_GPL(refcount_dec_if_one); +EXPORT_SYMBOL(refcount_dec_if_one); -/* +/** + * refcount_dec_not_one - decrement a refcount if it is not 1 + * @r: the refcount + * * No atomic_t counterpart, it decrements unless the value is 1, in which case * it will return false. 
* * Was often done like: atomic_add_unless(&var, -1, 1) + * + * Return: true if the decrement operation was successful, false otherwise */ bool refcount_dec_not_one(refcount_t *r) { - unsigned int old, new, val = atomic_read(&r->refs); + unsigned int new, val = atomic_read(&r->refs); - for (;;) { + do { if (unlikely(val == UINT_MAX)) return true; @@ -208,24 +278,27 @@ bool refcount_dec_not_one(refcount_t *r) return true; } - old = atomic_cmpxchg_release(&r->refs, val, new); - if (old == val) - break; - - val = old; - } + } while (!atomic_try_cmpxchg_release(&r->refs, &val, new)); return true; } -EXPORT_SYMBOL_GPL(refcount_dec_not_one); +EXPORT_SYMBOL(refcount_dec_not_one); -/* +/** + * refcount_dec_and_mutex_lock - return holding mutex if able to decrement + * refcount to 0 + * @r: the refcount + * @lock: the mutex to be locked + * * Similar to atomic_dec_and_mutex_lock(), it will WARN on underflow and fail * to decrement when saturated at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. + * + * Return: true and hold mutex if able to decrement refcount to 0, false + * otherwise */ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) { @@ -240,15 +313,23 @@ bool refcount_dec_and_mutex_lock(refcount_t *r, struct mutex *lock) return true; } -EXPORT_SYMBOL_GPL(refcount_dec_and_mutex_lock); +EXPORT_SYMBOL(refcount_dec_and_mutex_lock); -/* +/** + * refcount_dec_and_lock - return holding spinlock if able to decrement + * refcount to 0 + * @r: the refcount + * @lock: the spinlock to be locked + * * Similar to atomic_dec_and_lock(), it will WARN on underflow and fail to * decrement when saturated at UINT_MAX. * * Provides release memory ordering, such that prior loads and stores are done * before, and provides a control dependency such that free() must come after. * See the comment on top. 
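refcount_dec_and_mutex_lock() above returns with the mutex held only when the count actually reached zero, which makes last-reference teardown under a list lock compact. A hedged usage sketch, assuming a hypothetical device object on a list guarded by my_dev_lock:

	/* static DEFINE_MUTEX(my_dev_lock); guards the device list */
	if (refcount_dec_and_mutex_lock(&dev->ref, &my_dev_lock)) {
		list_del(&dev->node);	/* still under my_dev_lock */
		mutex_unlock(&my_dev_lock);
		kfree(dev);
	}
	/* otherwise: not the last reference, the lock was never taken */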
+ * + * Return: true and hold spinlock if able to decrement refcount to 0, false + * otherwise */ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) { @@ -263,5 +344,5 @@ bool refcount_dec_and_lock(refcount_t *r, spinlock_t *lock) return true; } -EXPORT_SYMBOL_GPL(refcount_dec_and_lock); +EXPORT_SYMBOL(refcount_dec_and_lock); diff --git a/lib/rhashtable.c b/lib/rhashtable.c index f8635fd5744259431305b73599949011d67a78b0..d9e7274a04cd98d14456b1bba9ad2e241eae8895 100644 --- a/lib/rhashtable.c +++ b/lib/rhashtable.c @@ -86,16 +86,9 @@ static int alloc_bucket_locks(struct rhashtable *ht, struct bucket_table *tbl, size = min(size, 1U << tbl->nest); if (sizeof(spinlock_t) != 0) { - tbl->locks = NULL; -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE && - gfp == GFP_KERNEL) - tbl->locks = vmalloc(size * sizeof(spinlock_t)); -#endif - if (gfp != GFP_KERNEL) - gfp |= __GFP_NOWARN | __GFP_NORETRY; - - if (!tbl->locks) + if (gfpflags_allow_blocking(gfp)) + tbl->locks = kvmalloc(size * sizeof(spinlock_t), gfp); + else tbl->locks = kmalloc_array(size, sizeof(spinlock_t), gfp); if (!tbl->locks) @@ -535,7 +528,7 @@ static void *rhashtable_lookup_one(struct rhashtable *ht, struct rhash_head *head; int elasticity; - elasticity = ht->elasticity; + elasticity = RHT_ELASTICITY; pprev = rht_bucket_var(tbl, hash); rht_for_each_continue(head, *pprev, tbl, hash) { struct rhlist_head *list; @@ -958,35 +951,20 @@ int rhashtable_init(struct rhashtable *ht, if (params->min_size) ht->p.min_size = roundup_pow_of_two(params->min_size); - if (params->max_size) - ht->p.max_size = rounddown_pow_of_two(params->max_size); + /* Cap total entries at 2^31 to avoid nelems overflow. */ + ht->max_elems = 1u << 31; - if (params->insecure_max_entries) - ht->p.insecure_max_entries = - rounddown_pow_of_two(params->insecure_max_entries); - else - ht->p.insecure_max_entries = ht->p.max_size * 2; + if (params->max_size) { + ht->p.max_size = rounddown_pow_of_two(params->max_size); + if (ht->p.max_size < ht->max_elems / 2) + ht->max_elems = ht->p.max_size * 2; + } - ht->p.min_size = max(ht->p.min_size, HASH_MIN_SIZE); + ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE); if (params->nelem_hint) size = rounded_hashtable_size(&ht->p); - /* The maximum (not average) chain length grows with the - * size of the hash table, at a rate of (log N)/(log log N). - * The value of 16 is selected so that even if the hash - * table grew to 2^32 you would not expect the maximum - * chain length to exceed it unless we are under attack - * (or extremely unlucky). - * - * As this limit is only to detect attacks, we don't need - * to set it to a lower value as you'd need the chain - * length to vastly exceed 16 to have any real effect - * on the system. 
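With insecure_max_entries and insecure_elasticity gone from the rhashtable hunks around here, callers size a table through max_size alone, and the insert-side element cap (max_elems, at most 2^31) is derived in rhashtable_init(). A minimal parameter sketch under those rules; struct my_entry is hypothetical:

static const struct rhashtable_params my_params = {
	.key_len	= sizeof(u32),
	.key_offset	= offsetof(struct my_entry, key),
	.head_offset	= offsetof(struct my_entry, node),
	.max_size	= 1 << 16,	/* caps max_elems at 2 * max_size */
	.automatic_shrinking = true,
};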
- */ - if (!params->insecure_elasticity) - ht->elasticity = 16; - if (params->locks_mul) ht->p.locks_mul = roundup_pow_of_two(params->locks_mul); else diff --git a/lib/sbitmap.c b/lib/sbitmap.c index 60e800e0b5a0d986ec0e2664dd9e4185aa50328a..80aa8d5463faf9f4c39d5c58bb7efdb927836cdb 100644 --- a/lib/sbitmap.c +++ b/lib/sbitmap.c @@ -79,15 +79,15 @@ void sbitmap_resize(struct sbitmap *sb, unsigned int depth) } EXPORT_SYMBOL_GPL(sbitmap_resize); -static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint, - bool wrap) +static int __sbitmap_get_word(unsigned long *word, unsigned long depth, + unsigned int hint, bool wrap) { unsigned int orig_hint = hint; int nr; while (1) { - nr = find_next_zero_bit(&word->word, word->depth, hint); - if (unlikely(nr >= word->depth)) { + nr = find_next_zero_bit(word, depth, hint); + if (unlikely(nr >= depth)) { /* * We started with an offset, and we didn't reset the * offset to 0 in a failure case, so start from 0 to @@ -100,11 +100,11 @@ static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint, return -1; } - if (!test_and_set_bit(nr, &word->word)) + if (!test_and_set_bit(nr, word)) break; hint = nr + 1; - if (hint >= word->depth - 1) + if (hint >= depth - 1) hint = 0; } @@ -119,7 +119,8 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) index = SB_NR_TO_INDEX(sb, alloc_hint); for (i = 0; i < sb->map_nr; i++) { - nr = __sbitmap_get_word(&sb->map[index], + nr = __sbitmap_get_word(&sb->map[index].word, + sb->map[index].depth, SB_NR_TO_BIT(sb, alloc_hint), !round_robin); if (nr != -1) { @@ -141,6 +142,37 @@ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) } EXPORT_SYMBOL_GPL(sbitmap_get); +int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, + unsigned long shallow_depth) +{ + unsigned int i, index; + int nr = -1; + + index = SB_NR_TO_INDEX(sb, alloc_hint); + + for (i = 0; i < sb->map_nr; i++) { + nr = __sbitmap_get_word(&sb->map[index].word, + min(sb->map[index].depth, shallow_depth), + SB_NR_TO_BIT(sb, alloc_hint), true); + if (nr != -1) { + nr += index << sb->shift; + break; + } + + /* Jump to next index. */ + index++; + alloc_hint = index << sb->shift; + + if (index >= sb->map_nr) { + index = 0; + alloc_hint = 0; + } + } + + return nr; +} +EXPORT_SYMBOL_GPL(sbitmap_get_shallow); + bool sbitmap_any_bit_set(const struct sbitmap *sb) { unsigned int i; @@ -342,6 +374,35 @@ int __sbitmap_queue_get(struct sbitmap_queue *sbq) } EXPORT_SYMBOL_GPL(__sbitmap_queue_get); +int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, + unsigned int shallow_depth) +{ + unsigned int hint, depth; + int nr; + + hint = this_cpu_read(*sbq->alloc_hint); + depth = READ_ONCE(sbq->sb.depth); + if (unlikely(hint >= depth)) { + hint = depth ? prandom_u32() % depth : 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + nr = sbitmap_get_shallow(&sbq->sb, hint, shallow_depth); + + if (nr == -1) { + /* If the map is full, a hint won't do us much good. */ + this_cpu_write(*sbq->alloc_hint, 0); + } else if (nr == hint || unlikely(sbq->round_robin)) { + /* Only update the hint if we used it. 
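sbitmap_get_shallow() above behaves like sbitmap_get() except that each word is scanned only up to shallow_depth bits, leaving the upper bits of every word free for unrestricted allocators; __sbitmap_queue_get_shallow() wires the same limit into the per-CPU hint logic shown here. A hedged call sketch, with the fallback policy purely hypothetical:

	int nr = __sbitmap_queue_get_shallow(sbq, 4);	/* bits 0-3 per word */

	if (nr < 0)
		my_wait_and_retry();	/* caller-specific policy on failure */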
*/ + hint = nr + 1; + if (hint >= depth - 1) + hint = 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + + return nr; +} +EXPORT_SYMBOL_GPL(__sbitmap_queue_get_shallow); + static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) { int i, wake_index; diff --git a/lib/string.c b/lib/string.c index ed83562a53ae5c57b7c55232cf703d3e4de90c58..1c1fc9187b054ccae0d55f6678a23971de109718 100644 --- a/lib/string.c +++ b/lib/string.c @@ -131,7 +131,7 @@ EXPORT_SYMBOL(strncpy); * @src: Where to copy the string from * @size: size of destination buffer * - * Compatible with *BSD: the result is always a valid + * Compatible with ``*BSD``: the result is always a valid * NUL-terminated string that fits in the buffer (unless, * of course, the buffer size is zero). It does not pad * out the result like strncpy() does. @@ -656,6 +656,32 @@ int match_string(const char * const *array, size_t n, const char *string) } EXPORT_SYMBOL(match_string); +/** + * __sysfs_match_string - matches given string in an array + * @array: array of strings + * @n: number of strings in the array or -1 for NULL terminated arrays + * @str: string to match with + * + * Returns index of @str in the @array or -EINVAL, just like match_string(). + * Uses sysfs_streq instead of strcmp for matching. + */ +int __sysfs_match_string(const char * const *array, size_t n, const char *str) +{ + const char *item; + int index; + + for (index = 0; index < n; index++) { + item = array[index]; + if (!item) + break; + if (sysfs_streq(item, str)) + return index; + } + + return -EINVAL; +} +EXPORT_SYMBOL(__sysfs_match_string); + #ifndef __HAVE_ARCH_MEMSET /** * memset - Fill a region of memory with the given value diff --git a/lib/test_bpf.c b/lib/test_bpf.c index 0362da0b66c352e4cb3eb96748fe2db4955d6b11..889bc31785bee57901ae969f16a55390db8265c4 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -434,6 +434,41 @@ static int bpf_fill_ld_abs_vlan_push_pop(struct bpf_test *self) return 0; } +static int __bpf_fill_stxdw(struct bpf_test *self, int size) +{ + unsigned int len = BPF_MAXINSNS; + struct bpf_insn *insn; + int i; + + insn = kmalloc_array(len, sizeof(*insn), GFP_KERNEL); + if (!insn) + return -ENOMEM; + + insn[0] = BPF_ALU32_IMM(BPF_MOV, R0, 1); + insn[1] = BPF_ST_MEM(size, R10, -40, 42); + + for (i = 2; i < len - 2; i++) + insn[i] = BPF_STX_XADD(size, R10, R0, -40); + + insn[len - 2] = BPF_LDX_MEM(size, R0, R10, -40); + insn[len - 1] = BPF_EXIT_INSN(); + + self->u.ptr.insns = insn; + self->u.ptr.len = len; + + return 0; +} + +static int bpf_fill_stxw(struct bpf_test *self) +{ + return __bpf_fill_stxdw(self, BPF_W); +} + +static int bpf_fill_stxdw(struct bpf_test *self) +{ + return __bpf_fill_stxdw(self, BPF_DW); +} + static struct bpf_test tests[] = { { "TAX", @@ -4302,6 +4337,41 @@ static struct bpf_test tests[] = { { }, { { 0, 0x22 } }, }, + { + "STX_XADD_W: Test side-effects, r10: 0x12 + 0x10 = 0x22", + .u.insns_int = { + BPF_ALU64_REG(BPF_MOV, R1, R10), + BPF_ALU32_IMM(BPF_MOV, R0, 0x12), + BPF_ST_MEM(BPF_W, R10, -40, 0x10), + BPF_STX_XADD(BPF_W, R10, R0, -40), + BPF_ALU64_REG(BPF_MOV, R0, R10), + BPF_ALU64_REG(BPF_SUB, R0, R1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0 } }, + }, + { + "STX_XADD_W: Test side-effects, r0: 0x12 + 0x10 = 0x22", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0x12), + BPF_ST_MEM(BPF_W, R10, -40, 0x10), + BPF_STX_XADD(BPF_W, R10, R0, -40), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x12 } }, + }, + { + "STX_XADD_W: X + 1 + 1 + 1 + ...", + { }, + INTERNAL, + { }, + { { 0, 4134 } }, + 
.fill_helper = bpf_fill_stxw, + }, { "STX_XADD_DW: Test: 0x12 + 0x10 = 0x22", .u.insns_int = { @@ -4315,6 +4385,41 @@ static struct bpf_test tests[] = { { }, { { 0, 0x22 } }, }, + { + "STX_XADD_DW: Test side-effects, r10: 0x12 + 0x10 = 0x22", + .u.insns_int = { + BPF_ALU64_REG(BPF_MOV, R1, R10), + BPF_ALU32_IMM(BPF_MOV, R0, 0x12), + BPF_ST_MEM(BPF_DW, R10, -40, 0x10), + BPF_STX_XADD(BPF_DW, R10, R0, -40), + BPF_ALU64_REG(BPF_MOV, R0, R10), + BPF_ALU64_REG(BPF_SUB, R0, R1), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0 } }, + }, + { + "STX_XADD_DW: Test side-effects, r0: 0x12 + 0x10 = 0x22", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0x12), + BPF_ST_MEM(BPF_DW, R10, -40, 0x10), + BPF_STX_XADD(BPF_DW, R10, R0, -40), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0x12 } }, + }, + { + "STX_XADD_DW: X + 1 + 1 + 1 + ...", + { }, + INTERNAL, + { }, + { { 0, 4134 } }, + .fill_helper = bpf_fill_stxdw, + }, /* BPF_JMP | BPF_EXIT */ { "JMP_EXIT", @@ -4656,6 +4761,51 @@ static struct bpf_test tests[] = { { }, { { 0, 1 } }, }, + { + /* Mainly testing JIT + imm64 here. */ + "JMP_JGE_X: ldimm64 test 1", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JGE, R1, R2, 2), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xeeeeeeeeU } }, + }, + { + "JMP_JGE_X: ldimm64 test 2", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 0), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JGE, R1, R2, 0), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 0xffffffffU } }, + }, + { + "JMP_JGE_X: ldimm64 test 3", + .u.insns_int = { + BPF_ALU32_IMM(BPF_MOV, R0, 1), + BPF_LD_IMM64(R1, 3), + BPF_LD_IMM64(R2, 2), + BPF_JMP_REG(BPF_JGE, R1, R2, 4), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), + BPF_EXIT_INSN(), + }, + INTERNAL, + { }, + { { 0, 1 } }, + }, /* BPF_JMP | BPF_JNE | BPF_X */ { "JMP_JNE_X: if (3 != 2) return 1", diff --git a/lib/test_list_sort.c b/lib/test_list_sort.c new file mode 100644 index 0000000000000000000000000000000000000000..28e817387b04f1c8862c54deefd4e8309f26437e --- /dev/null +++ b/lib/test_list_sort.c @@ -0,0 +1,150 @@ +#define pr_fmt(fmt) "list_sort_test: " fmt + +#include +#include +#include +#include +#include +#include +#include + +/* + * The pattern of set bits in the list length determines which cases + * are hit in list_sort(). 
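The expected value 4134 in the STX_XADD "X + 1 + 1 + ..." tests above can be checked by hand: __bpf_fill_stxdw() emits len = BPF_MAXINSNS = 4096 instructions, of which slots 0 and 1 set R0 = 1 and store 42 at R10-40, the last two reload the word and exit, and the loop for (i = 2; i < len - 2; i++) emits 4096 - 4 = 4092 XADD instructions, each adding R0 = 1:

	42 + 4092 * 1 = 4134

which matches the { { 0, 4134 } } expectation for both the BPF_W and BPF_DW variants.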
+ */ +#define TEST_LIST_LEN (512+128+2) /* not including head */ + +#define TEST_POISON1 0xDEADBEEF +#define TEST_POISON2 0xA324354C + +struct debug_el { + unsigned int poison1; + struct list_head list; + unsigned int poison2; + int value; + unsigned serial; +}; + +/* Array, containing pointers to all elements in the test list */ +static struct debug_el **elts __initdata; + +static int __init check(struct debug_el *ela, struct debug_el *elb) +{ + if (ela->serial >= TEST_LIST_LEN) { + pr_err("error: incorrect serial %d\n", ela->serial); + return -EINVAL; + } + if (elb->serial >= TEST_LIST_LEN) { + pr_err("error: incorrect serial %d\n", elb->serial); + return -EINVAL; + } + if (elts[ela->serial] != ela || elts[elb->serial] != elb) { + pr_err("error: phantom element\n"); + return -EINVAL; + } + if (ela->poison1 != TEST_POISON1 || ela->poison2 != TEST_POISON2) { + pr_err("error: bad poison: %#x/%#x\n", + ela->poison1, ela->poison2); + return -EINVAL; + } + if (elb->poison1 != TEST_POISON1 || elb->poison2 != TEST_POISON2) { + pr_err("error: bad poison: %#x/%#x\n", + elb->poison1, elb->poison2); + return -EINVAL; + } + return 0; +} + +static int __init cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct debug_el *ela, *elb; + + ela = container_of(a, struct debug_el, list); + elb = container_of(b, struct debug_el, list); + + check(ela, elb); + return ela->value - elb->value; +} + +static int __init list_sort_test(void) +{ + int i, count = 1, err = -ENOMEM; + struct debug_el *el; + struct list_head *cur; + LIST_HEAD(head); + + pr_debug("start testing list_sort()\n"); + + elts = kcalloc(TEST_LIST_LEN, sizeof(*elts), GFP_KERNEL); + if (!elts) { + pr_err("error: cannot allocate memory\n"); + return err; + } + + for (i = 0; i < TEST_LIST_LEN; i++) { + el = kmalloc(sizeof(*el), GFP_KERNEL); + if (!el) { + pr_err("error: cannot allocate memory\n"); + goto exit; + } + /* force some equivalencies */ + el->value = prandom_u32() % (TEST_LIST_LEN / 3); + el->serial = i; + el->poison1 = TEST_POISON1; + el->poison2 = TEST_POISON2; + elts[i] = el; + list_add_tail(&el->list, &head); + } + + list_sort(NULL, &head, cmp); + + err = -EINVAL; + for (cur = head.next; cur->next != &head; cur = cur->next) { + struct debug_el *el1; + int cmp_result; + + if (cur->next->prev != cur) { + pr_err("error: list is corrupted\n"); + goto exit; + } + + cmp_result = cmp(NULL, cur, cur->next); + if (cmp_result > 0) { + pr_err("error: list is not sorted\n"); + goto exit; + } + + el = container_of(cur, struct debug_el, list); + el1 = container_of(cur->next, struct debug_el, list); + if (cmp_result == 0 && el->serial >= el1->serial) { + pr_err("error: order of equivalent elements not " + "preserved\n"); + goto exit; + } + + if (check(el, el1)) { + pr_err("error: element check failed\n"); + goto exit; + } + count++; + } + if (head.prev != cur) { + pr_err("error: list is corrupted\n"); + goto exit; + } + + + if (count != TEST_LIST_LEN) { + pr_err("error: bad list length %d", count); + goto exit; + } + + err = 0; +exit: + for (i = 0; i < TEST_LIST_LEN; i++) + kfree(elts[i]); + kfree(elts); + return err; +} +module_init(list_sort_test); +MODULE_LICENSE("GPL"); diff --git a/lib/test_sort.c b/lib/test_sort.c index 4db3911db50ace76356b6530f03480955226b79a..d389c1cc2f6cf795783c0572771b54bb64f9b751 100644 --- a/lib/test_sort.c +++ b/lib/test_sort.c @@ -1,11 +1,8 @@ #include #include -#include +#include -/* - * A simple boot-time regression test - * License: GPL - */ +/* a simple boot-time regression test */ #define TEST_LEN 
1000 @@ -41,4 +38,6 @@ static int __init test_sort_init(void) kfree(a); return err; } -subsys_initcall(test_sort_init); + +module_init(test_sort_init); +MODULE_LICENSE("GPL"); diff --git a/lib/test_user_copy.c b/lib/test_user_copy.c index 1a8d71a685318ca901090805e98f45e12fdde315..4621db801b233009c096452cbcd137d00574ad77 100644 --- a/lib/test_user_copy.c +++ b/lib/test_user_copy.c @@ -31,7 +31,6 @@ * their capability at compile-time, we just have to opt-out certain archs. */ #if BITS_PER_LONG == 64 || (!(defined(CONFIG_ARM) && !defined(MMU)) && \ - !defined(CONFIG_AVR32) && \ !defined(CONFIG_BLACKFIN) && \ !defined(CONFIG_M32R) && \ !defined(CONFIG_M68K) && \ diff --git a/lib/usercopy.c b/lib/usercopy.c new file mode 100644 index 0000000000000000000000000000000000000000..1b6010a3beb8528a71676e2fdf0cf19924fd889a --- /dev/null +++ b/lib/usercopy.c @@ -0,0 +1,26 @@ +#include + +/* out-of-line parts */ + +#ifndef INLINE_COPY_FROM_USER +unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n) +{ + unsigned long res = n; + if (likely(access_ok(VERIFY_READ, from, n))) + res = raw_copy_from_user(to, from, n); + if (unlikely(res)) + memset(to + (n - res), 0, res); + return res; +} +EXPORT_SYMBOL(_copy_from_user); +#endif + +#ifndef INLINE_COPY_TO_USER +unsigned long _copy_to_user(void *to, const void __user *from, unsigned long n) +{ + if (likely(access_ok(VERIFY_WRITE, to, n))) + n = raw_copy_to_user(to, from, n); + return n; +} +EXPORT_SYMBOL(_copy_to_user); +#endif diff --git a/lib/vsprintf.c b/lib/vsprintf.c index e3bf4e0f10b568ba3d74674642adeff013cb62c8..2d41de3f98a1c9a0e0883b3d73b6980b0110cff1 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -1477,6 +1477,9 @@ int kptr_restrict __read_mostly; * by an extra set of alphanumeric characters that are extended format * specifiers. * + * Please update scripts/checkpatch.pl when adding/removing conversion + * characters. (Search for "check for vsprintf extension"). + * * Right now we handle: * * - 'F' For symbolic function descriptor pointers with offset @@ -1954,13 +1957,13 @@ set_precision(struct printf_spec *spec, int prec) * This function generally follows C99 vsnprintf, but has some * extensions and a few limitations: * - * %n is unsupported - * %p* is handled by pointer() + * - ``%n`` is unsupported + * - ``%p*`` is handled by pointer() * * See pointer() or Documentation/printk-formats.txt for more * extensive description. 
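The out-of-line _copy_from_user() in the new lib/usercopy.c above zeroes the tail of the kernel buffer on a partial fault, so a caller that keeps using the buffer after a failure never sees stale kernel memory. A hedged caller-side sketch, assuming the usual copy_from_user() wrapper on top of _copy_from_user():

	char kbuf[64];
	unsigned long uncopied;

	uncopied = copy_from_user(kbuf, ubuf, sizeof(kbuf));
	if (uncopied) {
		/*
		 * The last `uncopied' bytes could not be read from
		 * userspace; that tail of kbuf has been memset() to
		 * zero rather than left uninitialized.
		 */
		return -EFAULT;
	}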
* - * ** Please update the documentation in both places when making changes ** + * **Please update the documentation in both places when making changes** * * The return value is the number of characters which would * be generated for the given input, excluding the trailing diff --git a/lib/zlib_inflate/inftrees.c b/lib/zlib_inflate/inftrees.c index 3fe6ce5b53e51999b0ad15f97843b93c313cb60f..02894305292663abc7c4ac9b8e1fbaed18396f6b 100644 --- a/lib/zlib_inflate/inftrees.c +++ b/lib/zlib_inflate/inftrees.c @@ -109,7 +109,7 @@ int zlib_inflate_table(codetype type, unsigned short *lens, unsigned codes, *bits = 1; return 0; /* no symbols, but wait for decoding to report error */ } - for (min = 1; min <= MAXBITS; min++) + for (min = 1; min < MAXBITS; min++) if (count[min] != 0) break; if (root < min) root = min; diff --git a/mm/Kconfig b/mm/Kconfig index 9b8fccb969dc0fee5f5e0a623bc5b97a99bbc602..beb7a455915d062f75bce89477961e121f2bca32 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -312,7 +312,6 @@ config NEED_BOUNCE_POOL config NR_QUICK int depends on QUICKLIST - default "2" if AVR32 default "1" config VIRT_TO_BUS diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 79d0fd13b5b3c1a826f472398fd60a1ae1cb5da6..5b0adf1435de7dba61e7d0b34c6a7fd912041fd1 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -42,7 +42,6 @@ config DEBUG_PAGEALLOC_ENABLE_DEFAULT config PAGE_POISONING bool "Poison pages after freeing" - select PAGE_EXTENSION select PAGE_POISONING_NO_SANITY if HIBERNATION ---help--- Fill the pages with poison patterns after free_pages() and verify diff --git a/mm/backing-dev.c b/mm/backing-dev.c index c6f2a37028c205db8143ebe58677c790c66a0faf..f028a9a472fd9b2c7098bce8fe622fd58ba2f140 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -12,8 +12,6 @@ #include #include -static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); - struct backing_dev_info noop_backing_dev_info = { .name = "noop", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, @@ -242,6 +240,8 @@ static __init int bdi_class_init(void) } postcore_initcall(bdi_class_init); +static int bdi_init(struct backing_dev_info *bdi); + static int __init default_bdi_init(void) { int err; @@ -294,6 +294,8 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, memset(wb, 0, sizeof(*wb)); + if (wb != &bdi->wb) + bdi_get(bdi); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); @@ -314,8 +316,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, wb->dirty_sleep = jiffies; wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); - if (!wb->congested) - return -ENOMEM; + if (!wb->congested) { + err = -ENOMEM; + goto out_put_bdi; + } err = fprop_local_init_percpu(&wb->completions, gfp); if (err) @@ -335,9 +339,14 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, fprop_local_destroy_percpu(&wb->completions); out_put_cong: wb_congested_put(wb->congested); +out_put_bdi: + if (wb != &bdi->wb) + bdi_put(bdi); return err; } +static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); + /* * Remove bdi from the global list and shutdown any threads we have running */ @@ -347,10 +356,18 @@ static void wb_shutdown(struct bdi_writeback *wb) spin_lock_bh(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_bh(&wb->work_lock); + /* + * Wait for wb shutdown to finish if someone else is just + * running wb_shutdown(). Otherwise we could proceed to wb / + * bdi destruction before wb_shutdown() is finished. 
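The wait_on_bit() added to wb_shutdown() below closes a race where two concurrent callers could interleave and one would return before teardown finished. Sketched generically with hypothetical bit names (and assuming the usual wake_up_bit() pairing on the waker side; the patch itself pairs the clear with an smp_wmb()), the pattern is:

	if (!test_and_clear_bit(MY_REGISTERED, &obj->state)) {
		/* Lost the race: wait for the winner to finish. */
		wait_on_bit(&obj->state, MY_SHUTTING_DOWN,
			    TASK_UNINTERRUPTIBLE);
		return;
	}
	set_bit(MY_SHUTTING_DOWN, &obj->state);

	/* ... exclusive teardown work ... */

	clear_bit(MY_SHUTTING_DOWN, &obj->state);
	wake_up_bit(&obj->state, MY_SHUTTING_DOWN);	/* wake waiters */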
+ */ + wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE); return; } + set_bit(WB_shutting_down, &wb->state); spin_unlock_bh(&wb->work_lock); + cgwb_remove_from_bdi_list(wb); /* * Drain work list and shutdown the delayed_work. !WB_registered * tells wb_workfn() that @wb is dying and its work_list needs to @@ -359,6 +376,12 @@ static void wb_shutdown(struct bdi_writeback *wb) mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); + /* + * Make sure bit gets cleared after shutdown is finished. Matches with + * the barrier provided by test_and_clear_bit() above. + */ + smp_wmb(); + clear_bit(WB_shutting_down, &wb->state); } static void wb_exit(struct bdi_writeback *wb) @@ -372,6 +395,8 @@ static void wb_exit(struct bdi_writeback *wb) fprop_local_destroy_percpu(&wb->completions); wb_congested_put(wb->congested); + if (wb != &wb->bdi->wb) + bdi_put(wb->bdi); } #ifdef CONFIG_CGROUP_WRITEBACK @@ -381,11 +406,9 @@ static void wb_exit(struct bdi_writeback *wb) /* * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU - * protected. cgwb_release_wait is used to wait for the completion of cgwb - * releases from bdi destruction path. + * protected. */ static DEFINE_SPINLOCK(cgwb_lock); -static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait); /** * wb_congested_get_create - get or create a wb_congested @@ -438,7 +461,7 @@ wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) return NULL; atomic_set(&new_congested->refcnt, 0); - new_congested->bdi = bdi; + new_congested->__bdi = bdi; new_congested->blkcg_id = blkcg_id; goto retry; @@ -466,10 +489,10 @@ void wb_congested_put(struct bdi_writeback_congested *congested) } /* bdi might already have been destroyed leaving @congested unlinked */ - if (congested->bdi) { + if (congested->__bdi) { rb_erase(&congested->rb_node, - &congested->bdi->cgwb_congested_tree); - congested->bdi = NULL; + &congested->__bdi->cgwb_congested_tree); + congested->__bdi = NULL; } spin_unlock_irqrestore(&cgwb_lock, flags); @@ -480,11 +503,6 @@ static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); - struct backing_dev_info *bdi = wb->bdi; - - spin_lock_irq(&cgwb_lock); - list_del_rcu(&wb->bdi_node); - spin_unlock_irq(&cgwb_lock); wb_shutdown(wb); @@ -495,9 +513,6 @@ static void cgwb_release_workfn(struct work_struct *work) percpu_ref_exit(&wb->refcnt); wb_exit(wb); kfree_rcu(wb, rcu); - - if (atomic_dec_and_test(&bdi->usage_cnt)) - wake_up_all(&cgwb_release_wait); } static void cgwb_release(struct percpu_ref *refcnt) @@ -517,6 +532,13 @@ static void cgwb_kill(struct bdi_writeback *wb) percpu_ref_kill(&wb->refcnt); } +static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) +{ + spin_lock_irq(&cgwb_lock); + list_del_rcu(&wb->bdi_node); + spin_unlock_irq(&cgwb_lock); +} + static int cgwb_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { @@ -580,7 +602,6 @@ static int cgwb_create(struct backing_dev_info *bdi, /* we might have raced another instance of this function */ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); if (!ret) { - atomic_inc(&bdi->usage_cnt); list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); @@ -670,7 +691,6 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) 
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); bdi->cgwb_congested_tree = RB_ROOT; - atomic_set(&bdi->usage_cnt, 1); ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { @@ -680,29 +700,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) return ret; } -static void cgwb_bdi_destroy(struct backing_dev_info *bdi) +static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { struct radix_tree_iter iter; void **slot; + struct bdi_writeback *wb; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); - spin_unlock_irq(&cgwb_lock); - /* - * All cgwb's must be shutdown and released before returning. Drain - * the usage counter to wait for all cgwb's ever created on @bdi. - */ - atomic_dec(&bdi->usage_cnt); - wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); - /* - * Grab back our reference so that we hold it when @bdi gets - * re-registered. - */ - atomic_inc(&bdi->usage_cnt); + while (!list_empty(&bdi->wb_list)) { + wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, + bdi_node); + spin_unlock_irq(&cgwb_lock); + wb_shutdown(wb); + spin_lock_irq(&cgwb_lock); + } + spin_unlock_irq(&cgwb_lock); } /** @@ -752,11 +769,18 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi) rb_entry(rbn, struct bdi_writeback_congested, rb_node); rb_erase(rbn, &bdi->cgwb_congested_tree); - congested->bdi = NULL; /* mark @congested unlinked */ + congested->__bdi = NULL; /* mark @congested unlinked */ } spin_unlock_irq(&cgwb_lock); } +static void cgwb_bdi_register(struct backing_dev_info *bdi) +{ + spin_lock_irq(&cgwb_lock); + list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); + spin_unlock_irq(&cgwb_lock); +} + #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) @@ -777,16 +801,26 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) return 0; } -static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } +static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } static void cgwb_bdi_exit(struct backing_dev_info *bdi) { wb_congested_put(bdi->wb_congested); } +static void cgwb_bdi_register(struct backing_dev_info *bdi) +{ + list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); +} + +static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) +{ + list_del_rcu(&wb->bdi_node); +} + #endif /* CONFIG_CGROUP_WRITEBACK */ -int bdi_init(struct backing_dev_info *bdi) +static int bdi_init(struct backing_dev_info *bdi) { int ret; @@ -802,11 +836,8 @@ int bdi_init(struct backing_dev_info *bdi) ret = cgwb_bdi_init(bdi); - list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); - return ret; } -EXPORT_SYMBOL(bdi_init); struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) { @@ -823,22 +854,20 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) } return bdi; } +EXPORT_SYMBOL(bdi_alloc_node); -int bdi_register(struct backing_dev_info *bdi, struct device *parent, - const char *fmt, ...) 
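With bdi_init() now static, and bdi_setup_and_register()/bdi_destroy() removed just below, dynamically allocated bdis follow an alloc/register/put lifecycle. A hedged migration sketch for a former bdi_setup_and_register() user (the name format and instance_id are illustrative):

	struct backing_dev_info *bdi;
	int err;

	bdi = bdi_alloc_node(GFP_KERNEL, NUMA_NO_NODE);
	if (!bdi)
		return -ENOMEM;

	err = bdi_register(bdi, "myfs-%d", instance_id);
	if (err) {
		bdi_put(bdi);
		return err;
	}

	/* ... use bdi ... */

	bdi_put(bdi);	/* final put unregisters (see release_bdi()) and frees */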
+int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { - va_list args; struct device *dev; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; - va_start(args, fmt); - dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); - va_end(args); + dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args); if (IS_ERR(dev)) return PTR_ERR(dev); + cgwb_bdi_register(bdi); bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); @@ -851,20 +880,25 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, trace_writeback_bdi_register(bdi); return 0; } -EXPORT_SYMBOL(bdi_register); +EXPORT_SYMBOL(bdi_register_va); -int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) +int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) { - return bdi_register(bdi, NULL, "%u:%u", MAJOR(dev), MINOR(dev)); + va_list args; + int ret; + + va_start(args, fmt); + ret = bdi_register_va(bdi, fmt, args); + va_end(args); + return ret; } -EXPORT_SYMBOL(bdi_register_dev); +EXPORT_SYMBOL(bdi_register); int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) { int rc; - rc = bdi_register(bdi, NULL, "%u:%u", MAJOR(owner->devt), - MINOR(owner->devt)); + rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt)); if (rc) return rc; /* Leaking owner reference... */ @@ -892,7 +926,7 @@ void bdi_unregister(struct backing_dev_info *bdi) /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); - cgwb_bdi_destroy(bdi); + cgwb_bdi_unregister(bdi); if (bdi->dev) { bdi_debug_unregister(bdi); @@ -906,19 +940,16 @@ void bdi_unregister(struct backing_dev_info *bdi) } } -static void bdi_exit(struct backing_dev_info *bdi) -{ - WARN_ON_ONCE(bdi->dev); - wb_exit(&bdi->wb); - cgwb_bdi_exit(bdi); -} - static void release_bdi(struct kref *ref) { struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); - bdi_exit(bdi); + if (test_bit(WB_registered, &bdi->wb.state)) + bdi_unregister(bdi); + WARN_ON_ONCE(bdi->dev); + wb_exit(&bdi->wb); + cgwb_bdi_exit(bdi); kfree(bdi); } @@ -926,38 +957,7 @@ void bdi_put(struct backing_dev_info *bdi) { kref_put(&bdi->refcnt, release_bdi); } - -void bdi_destroy(struct backing_dev_info *bdi) -{ - bdi_unregister(bdi); - bdi_exit(bdi); -} -EXPORT_SYMBOL(bdi_destroy); - -/* - * For use from filesystems to quickly init and register a bdi associated - * with dirty writeback - */ -int bdi_setup_and_register(struct backing_dev_info *bdi, char *name) -{ - int err; - - bdi->name = name; - bdi->capabilities = 0; - err = bdi_init(bdi); - if (err) - return err; - - err = bdi_register(bdi, NULL, "%.28s-%ld", name, - atomic_long_inc_return(&bdi_seq)); - if (err) { - bdi_destroy(bdi); - return err; - } - - return 0; -} -EXPORT_SYMBOL(bdi_setup_and_register); +EXPORT_SYMBOL(bdi_put); static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), diff --git a/mm/cma.c b/mm/cma.c index a6033e3444304c95adb7c392984cc6600684c7ec..978b4a1441ef7120678e1fc7973b38c8ccf4dd5a 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -53,6 +53,11 @@ unsigned long cma_get_size(const struct cma *cma) return cma->count << PAGE_SHIFT; } +const char *cma_get_name(const struct cma *cma) +{ + return cma->name ? 
cma->name : "(undefined)"; +} + static unsigned long cma_bitmap_aligned_mask(const struct cma *cma, int align_order) { @@ -168,6 +173,7 @@ core_initcall(cma_init_reserved_areas); */ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, unsigned int order_per_bit, + const char *name, struct cma **res_cma) { struct cma *cma; @@ -198,6 +204,13 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, * subsystems (like slab allocator) are available. */ cma = &cma_areas[cma_area_count]; + if (name) { + cma->name = name; + } else { + cma->name = kasprintf(GFP_KERNEL, "cma%d\n", cma_area_count); + if (!cma->name) + return -ENOMEM; + } cma->base_pfn = PFN_DOWN(base); cma->count = size >> PAGE_SHIFT; cma->order_per_bit = order_per_bit; @@ -229,7 +242,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, int __init cma_declare_contiguous(phys_addr_t base, phys_addr_t size, phys_addr_t limit, phys_addr_t alignment, unsigned int order_per_bit, - bool fixed, struct cma **res_cma) + bool fixed, const char *name, struct cma **res_cma) { phys_addr_t memblock_end = memblock_end_of_DRAM(); phys_addr_t highmem_start; @@ -335,7 +348,7 @@ int __init cma_declare_contiguous(phys_addr_t base, base = addr; } - ret = cma_init_reserved_mem(base, size, order_per_bit, res_cma); + ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma); if (ret) goto err; @@ -491,3 +504,17 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count) return true; } + +int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data) +{ + int i; + + for (i = 0; i < cma_area_count; i++) { + int ret = it(&cma_areas[i], data); + + if (ret) + return ret; + } + + return 0; +} diff --git a/mm/cma.h b/mm/cma.h index 17c75a4246c8bbab8b56fe4d562cd85ea670a21f..49861286279d7dfb52b5643b59557aa0f9341d36 100644 --- a/mm/cma.h +++ b/mm/cma.h @@ -11,6 +11,7 @@ struct cma { struct hlist_head mem_head; spinlock_t mem_head_lock; #endif + const char *name; }; extern struct cma cma_areas[MAX_CMA_AREAS]; diff --git a/mm/cma_debug.c b/mm/cma_debug.c index ffc0c3d0ae64a610409d85a5ac5704c2a660ab0c..595b757bef72722fb2715afd9ab0cd1536181c9e 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx) char name[16]; int u32s; - sprintf(name, "cma-%d", idx); + sprintf(name, "cma-%s", cma->name); tmp = debugfs_create_dir(name, cma_debugfs_root); diff --git a/mm/compaction.c b/mm/compaction.c index 81e1eaa2a2cf1bea89767185e9cb9549f2139ca2..613c59e928cb5b07075c197126e01b9cdafff992 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -89,11 +89,6 @@ static void map_pages(struct list_head *list) list_splice(&tmp_list, list); } -static inline bool migrate_async_suitable(int migratetype) -{ - return is_migrate_cma(migratetype) || migratetype == MIGRATE_MOVABLE; -} - #ifdef CONFIG_COMPACTION int PageMovable(struct page *page) @@ -988,13 +983,26 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION -/* Returns true if the page is within a block suitable for migration to */ -static bool suitable_migration_target(struct compact_control *cc, +static bool suitable_migration_source(struct compact_control *cc, struct page *page) { - if (cc->ignore_block_suitable) + int block_mt; + + if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction) return true; + block_mt = get_pageblock_migratetype(page); + + if (cc->migratetype == 
MIGRATE_MOVABLE) + return is_migrate_movable(block_mt); + else + return block_mt == cc->migratetype; +} + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct compact_control *cc, + struct page *page) +{ /* If the page is a large free page, then disallow migration */ if (PageBuddy(page)) { /* @@ -1006,8 +1014,11 @@ static bool suitable_migration_target(struct compact_control *cc, return false; } + if (cc->ignore_block_suitable) + return true; + /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (migrate_async_suitable(get_pageblock_migratetype(page))) + if (is_migrate_movable(get_pageblock_migratetype(page))) return true; /* Otherwise skip the block */ @@ -1242,8 +1253,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, * Async compaction is optimistic to see if the minimum amount * of work satisfies the allocation. */ - if (cc->mode == MIGRATE_ASYNC && - !migrate_async_suitable(get_pageblock_migratetype(page))) + if (!suitable_migration_source(cc, page)) continue; /* Perform the isolation */ @@ -1276,11 +1286,11 @@ static inline bool is_via_compact_memory(int order) return order == -1; } -static enum compact_result __compact_finished(struct zone *zone, struct compact_control *cc, - const int migratetype) +static enum compact_result __compact_finished(struct zone *zone, + struct compact_control *cc) { unsigned int order; - unsigned long watermark; + const int migratetype = cc->migratetype; if (cc->contended || fatal_signal_pending(current)) return COMPACT_CONTENDED; @@ -1308,12 +1318,16 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ if (is_via_compact_memory(cc->order)) return COMPACT_CONTINUE; - /* Compaction run is not finished if the watermark is not met */ - watermark = zone->watermark[cc->alloc_flags & ALLOC_WMARK_MASK]; - - if (!zone_watermark_ok(zone, cc->order, watermark, cc->classzone_idx, - cc->alloc_flags)) - return COMPACT_CONTINUE; + if (cc->finishing_block) { + /* + * We have finished the pageblock, but better check again that + * we really succeeded. + */ + if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) + cc->finishing_block = false; + else + return COMPACT_CONTINUE; + } /* Direct compactor: Is a suitable page free? */ for (order = cc->order; order < MAX_ORDER; order++) { @@ -1335,20 +1349,40 @@ static enum compact_result __compact_finished(struct zone *zone, struct compact_ * other migratetype buddy lists. */ if (find_suitable_fallback(area, order, migratetype, - true, &can_steal) != -1) - return COMPACT_SUCCESS; + true, &can_steal) != -1) { + + /* movable pages are OK in any pageblock */ + if (migratetype == MIGRATE_MOVABLE) + return COMPACT_SUCCESS; + + /* + * We are stealing for a non-movable allocation. Make + * sure we finish compacting the current pageblock + * first so it is as free as possible and we won't + * have to steal another one soon. This only applies + * to sync compaction, as async compaction operates + * on pageblocks of the same migratetype. 
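The is_migrate_movable() used here replaces the removed migrate_async_suitable() and is presumably defined (at the mmzone.h level) along the same lines:

	static inline bool is_migrate_movable(int mt)
	{
		return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE;
	}

The practical effect of suitable_migration_source(): async direct compaction now only isolates source pages from pageblocks whose migratetype matches the allocation, so the pages it frees are more likely to assemble into a block the allocator can actually use.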
+ */ + if (cc->mode == MIGRATE_ASYNC || + IS_ALIGNED(cc->migrate_pfn, + pageblock_nr_pages)) { + return COMPACT_SUCCESS; + } + + cc->finishing_block = true; + return COMPACT_CONTINUE; + } } return COMPACT_NO_SUITABLE_PAGE; } static enum compact_result compact_finished(struct zone *zone, - struct compact_control *cc, - const int migratetype) + struct compact_control *cc) { int ret; - ret = __compact_finished(zone, cc, migratetype); + ret = __compact_finished(zone, cc); trace_mm_compaction_finished(zone, cc->order, ret); if (ret == COMPACT_NO_SUITABLE_PAGE) ret = COMPACT_CONTINUE; @@ -1481,9 +1515,9 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro enum compact_result ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); - const int migratetype = gfpflags_to_migratetype(cc->gfp_mask); const bool sync = cc->mode != MIGRATE_ASYNC; + cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); ret = compaction_suitable(zone, cc->order, cc->alloc_flags, cc->classzone_idx); /* Compaction is likely to fail */ @@ -1533,8 +1567,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro migrate_prep_local(); - while ((ret = compact_finished(zone, cc, migratetype)) == - COMPACT_CONTINUE) { + while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { int err; switch (isolate_migratepages(zone, cc)) { diff --git a/mm/filemap.c b/mm/filemap.c index 1694623a628902b76e11d6983eebd1ea7d33ddf1..6f1be573a5e60fbb0c4a6cbcf85df9b32b0fc673 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -519,7 +519,7 @@ EXPORT_SYMBOL(filemap_write_and_wait); * * Write out and wait upon file offsets lstart->lend, inclusive. * - * Note that `lend' is inclusive (describes the last byte to be written) so + * Note that @lend is inclusive (describes the last byte to be written) so * that this function can be used to write to the very end-of-file (end = -1). */ int filemap_write_and_wait_range(struct address_space *mapping, @@ -1277,12 +1277,14 @@ EXPORT_SYMBOL(find_lock_entry); * * PCG flags modify how the page is returned. * - * FGP_ACCESSED: the page will be marked accessed - * FGP_LOCK: Page is return locked - * FGP_CREAT: If page is not present then a new page is allocated using - * @gfp_mask and added to the page cache and the VM's LRU - * list. The page is returned locked and with an increased - * refcount. Otherwise, %NULL is returned. + * @fgp_flags can be: + * + * - FGP_ACCESSED: the page will be marked accessed + * - FGP_LOCK: Page is return locked + * - FGP_CREAT: If page is not present then a new page is allocated using + * @gfp_mask and added to the page cache and the VM's LRU + * list. The page is returned locked and with an increased + * refcount. Otherwise, NULL is returned. * * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even * if the GFP flags specified for FGP_CREAT are atomic. 
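A hedged usage sketch for the FGP flags documented above (find-or-create, returned locked and marked accessed; mapping and index are assumed to be in scope):

	struct page *page;

	page = pagecache_get_page(mapping, index,
				  FGP_LOCK | FGP_ACCESSED | FGP_CREAT,
				  mapping_gfp_mask(mapping));
	if (!page)
		return -ENOMEM;

	/* Page is locked with an elevated refcount, found or freshly added. */
	/* ... read or modify the page ... */

	unlock_page(page);
	put_page(page);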
@@ -2033,7 +2035,6 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) if (iocb->ki_flags & IOCB_DIRECT) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - struct iov_iter data = *iter; loff_t size; size = i_size_read(inode); @@ -2044,11 +2045,12 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) file_accessed(file); - retval = mapping->a_ops->direct_IO(iocb, &data); + retval = mapping->a_ops->direct_IO(iocb, iter); if (retval >= 0) { iocb->ki_pos += retval; - iov_iter_advance(iter, retval); + count -= retval; } + iov_iter_revert(iter, count - iov_iter_count(iter)); /* * Btrfs can have a short DIO read if we encounter @@ -2059,7 +2061,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) * the rest of the read. Buffered reads will not work for * DAX files, so don't bother trying. */ - if (retval < 0 || !iov_iter_count(iter) || iocb->ki_pos >= size || + if (retval < 0 || !count || iocb->ki_pos >= size || IS_DAX(inode)) goto out; } @@ -2202,12 +2204,12 @@ int filemap_fault(struct vm_fault *vmf) struct file_ra_state *ra = &file->f_ra; struct inode *inode = mapping->host; pgoff_t offset = vmf->pgoff; + pgoff_t max_off; struct page *page; - loff_t size; int ret = 0; - size = round_up(i_size_read(inode), PAGE_SIZE); - if (offset >= size >> PAGE_SHIFT) + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) return VM_FAULT_SIGBUS; /* @@ -2256,8 +2258,8 @@ int filemap_fault(struct vm_fault *vmf) * Found the page and have a reference on it. * We must recheck i_size under page lock. */ - size = round_up(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= size >> PAGE_SHIFT)) { + max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); + if (unlikely(offset >= max_off)) { unlock_page(page); put_page(page); return VM_FAULT_SIGBUS; @@ -2323,7 +2325,7 @@ void filemap_map_pages(struct vm_fault *vmf, struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; - loff_t size; + unsigned long max_idx; struct page *head, *page; rcu_read_lock(); @@ -2369,8 +2371,8 @@ void filemap_map_pages(struct vm_fault *vmf, if (page->mapping != mapping || !PageUptodate(page)) goto unlock; - size = round_up(i_size_read(mapping->host), PAGE_SIZE); - if (page->index >= size >> PAGE_SHIFT) + max_idx = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); + if (page->index >= max_idx) goto unlock; if (file->f_ra.mmap_miss > 0) @@ -2704,7 +2706,6 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) ssize_t written; size_t write_len; pgoff_t end; - struct iov_iter data; write_len = iov_iter_count(from); end = (pos + write_len - 1) >> PAGE_SHIFT; @@ -2719,22 +2720,19 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * about to write. We do this *before* the write so that we can return * without clobbering -EIOCBQUEUED from ->direct_IO(). */ - if (mapping->nrpages) { - written = invalidate_inode_pages2_range(mapping, + written = invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT, end); - /* - * If a page can not be invalidated, return 0 to fall back - * to buffered write. - */ - if (written) { - if (written == -EBUSY) - return 0; - goto out; - } + /* + * If a page can not be invalidated, return 0 to fall back + * to buffered write. 
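The iov_iter_revert() calls introduced in these filemap.c hunks re-synchronize the iterator with the byte count the caller accounts for, since ->direct_IO() may advance the iterator further than the number of bytes it reports. Worked numbers for the write path (illustrative values):

	/*
	 * write_len = 8192 before ->direct_IO(), which returns
	 * written = 4096 (a short write) but may have advanced the
	 * iterator by anything up to 8192 bytes.
	 *
	 *   write_len -= written;              // 8192 - 4096 = 4096
	 *   // say the iterator was fully advanced:
	 *   //   iov_iter_count(from) == 0
	 *   iov_iter_revert(from, write_len - iov_iter_count(from)); // 4096
	 *
	 * Afterwards the iterator sits exactly `written' bytes past
	 * where it started, matching iocb->ki_pos, whatever
	 * ->direct_IO() did internally.
	 */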
+ */ + if (written) { + if (written == -EBUSY) + return 0; + goto out; } - data = *from; - written = mapping->a_ops->direct_IO(iocb, &data); + written = mapping->a_ops->direct_IO(iocb, from); /* * Finally, try again to invalidate clean pages which might have been @@ -2744,20 +2742,19 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from) * so we don't support it 100%. If this invalidation * fails, tough, the write still worked... */ - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, end); - } + invalidate_inode_pages2_range(mapping, + pos >> PAGE_SHIFT, end); if (written > 0) { pos += written; - iov_iter_advance(from, written); + write_len -= written; if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { i_size_write(inode, pos); mark_inode_dirty(inode); } iocb->ki_pos = pos; } + iov_iter_revert(from, write_len - iov_iter_count(from)); out: return written; } @@ -2794,12 +2791,6 @@ ssize_t generic_perform_write(struct file *file, ssize_t written = 0; unsigned int flags = 0; - /* - * Copies from kernel address space cannot fail (NFSD is a big user). - */ - if (!iter_is_iovec(i)) - flags |= AOP_FLAG_UNINTERRUPTIBLE; - do { struct page *page; unsigned long offset; /* Offset into pagecache page */ @@ -3001,7 +2992,7 @@ EXPORT_SYMBOL(generic_file_write_iter); * @gfp_mask: memory allocation flags (and I/O mode) * * The address_space is to try to release any data against the page - * (presumably at page->private). If the release was successful, return `1'. + * (presumably at page->private). If the release was successful, return '1'. * Otherwise return zero. * * This may also be called if PG_fscache is set on a page, indicating that the diff --git a/mm/frame_vector.c b/mm/frame_vector.c index db77dcb38afda3d3720a228e14236fb2e324f929..72ebec18629c2529fc6d2b6faeb46040adcac611 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -200,10 +200,7 @@ struct frame_vector *frame_vector_create(unsigned int nr_frames) * Avoid higher order allocations, use vmalloc instead. It should * be rare anyway. */ - if (size <= PAGE_SIZE) - vec = kmalloc(size, GFP_KERNEL); - else - vec = vmalloc(size); + vec = kvmalloc(size, GFP_KERNEL); if (!vec) return NULL; vec->nr_allocated = nr_frames; diff --git a/mm/gup.c b/mm/gup.c index 04aa405350dce8656db4293a34e95e9bfbe166d8..d9e6fddcc51f06a1286c56a24c510c1a3efa8add 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1189,34 +1189,57 @@ struct page *get_dump_page(unsigned long addr) */ #ifdef CONFIG_HAVE_GENERIC_RCU_GUP +#ifndef gup_get_pte +/* + * We assume that the PTE can be read atomically. If this is not the case for + * your architecture, please provide the helper. + */ +static inline pte_t gup_get_pte(pte_t *ptep) +{ + return READ_ONCE(*ptep); +} +#endif + +static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) +{ + while ((*nr) - nr_start) { + struct page *page = pages[--(*nr)]; + + ClearPageReferenced(page); + put_page(page); + } +} + #ifdef __HAVE_ARCH_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { + struct dev_pagemap *pgmap = NULL; + int nr_start = *nr, ret = 0; pte_t *ptep, *ptem; - int ret = 0; ptem = ptep = pte_offset_map(&pmd, addr); do { - /* - * In the line below we are assuming that the pte can be read - * atomically. If this is not the case for your architecture, - * please wrap this in a helper function! 
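The comment removed here is what became the generic gup_get_pte() above: on architectures where a PTE is wider than one machine word, a plain READ_ONCE() could tear against a concurrent update, so the architecture must override the helper. A hedged sketch of the kind of retry loop x86 PAE has historically used (field names assume a two-word pte_t; this is illustrative, not the arch code from this patch):

	static inline pte_t gup_get_pte(pte_t *ptep)
	{
		pte_t pte;

		do {
			pte.pte_low = ptep->pte_low;
			smp_rmb();
			pte.pte_high = ptep->pte_high;
			smp_rmb();
		} while (unlikely(pte.pte_low != ptep->pte_low));

		return pte;
	}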
- * - * for an example see gup_get_pte in arch/x86/mm/gup.c - */ - pte_t pte = READ_ONCE(*ptep); + pte_t pte = gup_get_pte(ptep); struct page *head, *page; /* * Similar to the PMD case below, NUMA hinting must take slow * path using the pte_protnone check. */ - if (!pte_present(pte) || pte_special(pte) || - pte_protnone(pte) || (write && !pte_write(pte))) + if (pte_protnone(pte)) goto pte_unmap; - if (!arch_pte_access_permitted(pte, write)) + if (!pte_access_permitted(pte, write)) + goto pte_unmap; + + if (pte_devmap(pte)) { + pgmap = get_dev_pagemap(pte_pfn(pte), pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + goto pte_unmap; + } + } else if (pte_special(pte)) goto pte_unmap; VM_BUG_ON(!pfn_valid(pte_pfn(pte))); @@ -1232,6 +1255,9 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } VM_BUG_ON_PAGE(compound_head(page) != head, page); + + put_dev_pagemap(pgmap); + SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -1261,15 +1287,76 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, } #endif /* __HAVE_ARCH_PTE_SPECIAL */ +#ifdef __HAVE_ARCH_PTE_DEVMAP +static int __gup_device_huge(unsigned long pfn, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + int nr_start = *nr; + struct dev_pagemap *pgmap = NULL; + + do { + struct page *page = pfn_to_page(pfn); + + pgmap = get_dev_pagemap(pfn, pgmap); + if (unlikely(!pgmap)) { + undo_dev_pagemap(nr, nr_start, pages); + return 0; + } + SetPageReferenced(page); + pages[*nr] = page; + get_page(page); + put_dev_pagemap(pgmap); + (*nr)++; + pfn++; + } while (addr += PAGE_SIZE, addr != end); + return 1; +} + +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + return __gup_device_huge(fault_pfn, addr, end, pages, nr); +} + +static int __gup_device_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + unsigned long fault_pfn; + + fault_pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + return __gup_device_huge(fault_pfn, addr, end, pages, nr); +} +#else +static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + BUILD_BUG(); + return 0; +} + +static int __gup_device_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, struct page **pages, int *nr) +{ + BUILD_BUG(); + return 0; +} +#endif + static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { struct page *head, *page; int refs; - if (write && !pmd_write(orig)) + if (!pmd_access_permitted(orig, write)) return 0; + if (pmd_devmap(orig)) + return __gup_device_huge_pmd(orig, addr, end, pages, nr); + refs = 0; head = pmd_page(orig); page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); @@ -1293,6 +1380,7 @@ static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr, return 0; } + SetPageReferenced(head); return 1; } @@ -1302,9 +1390,12 @@ static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr, struct page *head, *page; int refs; - if (write && !pud_write(orig)) + if (!pud_access_permitted(orig, write)) return 0; + if (pud_devmap(orig)) + return __gup_device_huge_pud(orig, addr, end, pages, nr); + refs = 0; head = pud_page(orig); page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); @@ -1328,6 +1419,7 @@ static int gup_huge_pud(pud_t 
orig, pud_t *pudp, unsigned long addr, return 0; } + SetPageReferenced(head); return 1; } @@ -1338,9 +1430,10 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, int refs; struct page *head, *page; - if (write && !pgd_write(orig)) + if (!pgd_access_permitted(orig, write)) return 0; + BUILD_BUG_ON(pgd_devmap(orig)); refs = 0; head = pgd_page(orig); page = head + ((addr & ~PGDIR_MASK) >> PAGE_SHIFT); @@ -1364,6 +1457,7 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 0; } + SetPageReferenced(head); return 1; } @@ -1481,7 +1575,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, end = start + len; if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - start, len))) + (void __user *)start, len))) return 0; /* @@ -1520,6 +1614,21 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr; } +#ifndef gup_fast_permitted +/* + * Check if it's allowed to use __get_user_pages_fast() for the range, or + * we need to fall back to the slow version: + */ +bool gup_fast_permitted(unsigned long start, int nr_pages, int write) +{ + unsigned long len, end; + + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; + return end >= start; +} +#endif + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address @@ -1539,11 +1648,14 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { - int nr, ret; + int nr = 0, ret = 0; start &= PAGE_MASK; - nr = __get_user_pages_fast(start, nr_pages, write, pages); - ret = nr; + + if (gup_fast_permitted(start, nr_pages, write)) { + nr = __get_user_pages_fast(start, nr_pages, write, pages); + ret = nr; + } if (nr < nr_pages) { /* Try to get the remaining pages with get_user_pages */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f3c4f9d22821f889104340332eee93c5e124df4d..a84909cf20d36b3d84f00d8529127f78f6b5981d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -715,7 +715,8 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf) } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write) + pmd_t *pmd, pfn_t pfn, pgprot_t prot, bool write, + pgtable_t pgtable) { struct mm_struct *mm = vma->vm_mm; pmd_t entry; @@ -729,6 +730,12 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, entry = pmd_mkyoung(pmd_mkdirty(entry)); entry = maybe_pmd_mkwrite(entry, vma); } + + if (pgtable) { + pgtable_trans_huge_deposit(mm, pmd, pgtable); + atomic_long_inc(&mm->nr_ptes); + } + set_pmd_at(mm, addr, pmd, entry); update_mmu_cache_pmd(vma, addr, pmd); spin_unlock(ptl); @@ -738,6 +745,7 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, pfn_t pfn, bool write) { pgprot_t pgprot = vma->vm_page_prot; + pgtable_t pgtable = NULL; /* * If we had pmd_special, we could avoid all these restrictions, * but we need to be consistent with PTEs and architectures that @@ -752,9 +760,15 @@ int vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; + if (arch_needs_pgtable_deposit()) { + pgtable = pte_alloc_one(vma->vm_mm, addr); + if (!pgtable) + return VM_FAULT_OOM; + } + track_pfn_insert(vma, &pgprot, pfn); - insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write); + insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable); return 
VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd); @@ -1564,9 +1578,6 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, ClearPageDirty(page); unlock_page(page); - if (PageActive(page)) - deactivate_page(page); - if (pmd_young(orig_pmd) || pmd_dirty(orig_pmd)) { pmdp_invalidate(vma, addr, pmd); orig_pmd = pmd_mkold(orig_pmd); @@ -1575,6 +1586,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, set_pmd_at(mm, addr, pmd, orig_pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); } + + mark_page_lazyfree(page); ret = true; out: spin_unlock(ptl); @@ -1612,12 +1625,13 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb->fullmm); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); if (vma_is_dax(vma)) { + if (arch_needs_pgtable_deposit()) + zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else if (is_huge_zero_pmd(orig_pmd)) { - pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); - atomic_long_dec(&tlb->mm->nr_ptes); + zap_deposited_table(tlb->mm, pmd); spin_unlock(ptl); tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { @@ -1626,10 +1640,7 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON_PAGE(page_mapcount(page) < 0, page); VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) { - pgtable_t pgtable; - pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); - pte_free(tlb->mm, pgtable); - atomic_long_dec(&tlb->mm->nr_ptes); + zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { if (arch_needs_pgtable_deposit()) @@ -2145,15 +2156,15 @@ static void freeze_page(struct page *page) { enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; - int ret; + bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_MIGRATION; - ret = try_to_unmap(page, ttu_flags); - VM_BUG_ON_PAGE(ret, page); + unmap_success = try_to_unmap(page, ttu_flags); + VM_BUG_ON_PAGE(!unmap_success, page); } static void unfreeze_page(struct page *page) @@ -2399,7 +2410,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(is_huge_zero_page(page), page); VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); if (PageAnon(head)) { diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 9d26fd9fefe4a1f4ec78279455c3b27f4ddbb875..356df057a2a8d3752b9e058ce74fe7b48142e02b 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -34,8 +34,7 @@ static int hwpoison_inject(void *data, u64 val) if (!hwpoison_filter_enable) goto inject; - if (!PageLRU(hpage) && !PageHuge(p)) - shake_page(hpage, 0); + shake_page(hpage, 0); /* * This implies unable to support non-LRU pages. */ diff --git a/mm/internal.h b/mm/internal.h index 266efaeaa370a46debcc5b6b614a72e33833ac4d..0e4f558412fb195c0b2bd997fbc0ac1b9c1e4566 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -80,12 +80,17 @@ static inline void set_page_refcounted(struct page *page) extern unsigned long highest_memmap_pfn; +/* + * Maximum number of reclaim retries without progress before the OOM + * killer is consider the only way forward. 
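Referring back to the vmf_insert_pfn_pmd() change earlier in this hunk: on architectures where arch_needs_pgtable_deposit() is true (ppc64 book3s is the usual example, hedged), every huge PMD must carry a pre-allocated PTE page so that a later split cannot fail on allocation. The deposit/withdraw pairing, sketched under the assumption that zap_deposited_table() wraps the withdraw side:

	/* insert side (vmf_insert_pfn_pmd() -> insert_pfn_pmd()) */
	pgtable = pte_alloc_one(mm, addr);
	pgtable_trans_huge_deposit(mm, pmd, pgtable);
	atomic_long_inc(&mm->nr_ptes);

	/* teardown side (zap_huge_pmd() -> zap_deposited_table()) */
	pte_free(mm, pgtable_trans_huge_withdraw(mm, pmd));
	atomic_long_dec(&mm->nr_ptes);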
+ */ +#define MAX_RECLAIM_RETRIES 16 + /* * in mm/vmscan.c: */ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); -extern bool pgdat_reclaimable(struct pglist_data *pgdat); /* * in mm/rmap.c: @@ -178,6 +183,7 @@ extern int user_min_free_kbytes; struct compact_control { struct list_head freepages; /* List of free pages to migrate to */ struct list_head migratepages; /* List of pages being migrated */ + struct zone *zone; unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long total_migrate_scanned; @@ -185,17 +191,18 @@ struct compact_control { unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ unsigned long last_migrated_pfn;/* Not yet flushed page being freed */ + const gfp_t gfp_mask; /* gfp mask of a direct compactor */ + int order; /* order a direct compactor needs */ + int migratetype; /* migratetype of direct compactor */ + const unsigned int alloc_flags; /* alloc flags of a direct compactor */ + const int classzone_idx; /* zone index of a direct compactor */ enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool ignore_block_suitable; /* Scan blocks considered unsuitable */ bool direct_compaction; /* False from kcompactd or /proc/... */ bool whole_zone; /* Whole zone should/has been scanned */ - int order; /* order a direct compactor needs */ - const gfp_t gfp_mask; /* gfp mask of a direct compactor */ - const unsigned int alloc_flags; /* alloc flags of a direct compactor */ - const int classzone_idx; /* zone index of a direct compactor */ - struct zone *zone; bool contended; /* Signal lock or sched contention */ + bool finishing_block; /* Finishing current pageblock */ }; unsigned long @@ -505,4 +512,14 @@ extern const struct trace_print_flags pageflag_names[]; extern const struct trace_print_flags vmaflag_names[]; extern const struct trace_print_flags gfpflag_names[]; +static inline bool is_migrate_highatomic(enum migratetype migratetype) +{ + return migratetype == MIGRATE_HIGHATOMIC; +} + +static inline bool is_migrate_highatomic_page(struct page *page) +{ + return get_pageblock_migratetype(page) == MIGRATE_HIGHATOMIC; +} + #endif /* __MM_INTERNAL_H */ diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 98b27195e38b07fc1b6e20c1f9c49db9bc112303..c81549d5c8330f59bec68165127dff1d3aab85bd 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -413,7 +413,7 @@ void kasan_cache_create(struct kmem_cache *cache, size_t *size, *size += sizeof(struct kasan_alloc_meta); /* Add free meta. 
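SLAB_TYPESAFE_BY_RCU, the rename of SLAB_DESTROY_BY_RCU applied throughout this hunk, promises only that the memory stays type-stable across an RCU grace period, not that a given object survives; that is exactly why KASAN must not poison such slabs on free. The canonical reader pattern, sketched with illustrative names:

	cache = kmem_cache_create("conn_cache", sizeof(struct conn), 0,
				  SLAB_TYPESAFE_BY_RCU, NULL);

	/* reader side */
	rcu_read_lock();
	c = hypothetical_lookup(key);		/* any RCU-safe lookup */
	if (c && !refcount_inc_not_zero(&c->ref))
		c = NULL;	/* object was being freed */
	rcu_read_unlock();

	/*
	 * Even with a reference, the slot may have been freed and
	 * reused for a different object, so the key must be rechecked
	 * before c is trusted.
	 */
	if (c && !matches(c, key)) {
		put_conn(c);
		c = NULL;
	}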
*/ - if (cache->flags & SLAB_DESTROY_BY_RCU || cache->ctor || + if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor || cache->object_size < sizeof(struct kasan_free_meta)) { cache->kasan_info.free_meta_offset = *size; *size += sizeof(struct kasan_free_meta); @@ -561,7 +561,7 @@ static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); /* RCU slabs could be legally used after free within the RCU period */ - if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return; kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE); @@ -572,12 +572,13 @@ bool kasan_slab_free(struct kmem_cache *cache, void *object) s8 shadow_byte; /* RCU slabs could be legally used after free within the RCU period */ - if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU)) return false; shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object)); if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) { - kasan_report_double_free(cache, object, shadow_byte); + kasan_report_double_free(cache, object, + __builtin_return_address(1)); return true; } @@ -690,7 +691,7 @@ int kasan_module_alloc(void *addr, size_t size) ret = __vmalloc_node_range(shadow_size, 1, shadow_start, shadow_start + shadow_size, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE, __builtin_return_address(0)); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index dd2dea8eb0771a506c0b510efc79c3fc5253bda5..1229298cce646ddc725c4f1ea9d823a0c71e3cd1 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -99,7 +99,7 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr) void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_double_free(struct kmem_cache *cache, void *object, - s8 shadow); + void *ip); #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB) void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ab42a0803f161c6834b1362aefd5ded1990eb04f..beee0e980e2dd3385b007fbcb119f599207dacc0 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -51,7 +51,13 @@ static const void *find_first_bad_addr(const void *addr, size_t size) return first_bad_addr; } -static void print_error_description(struct kasan_access_info *info) +static bool addr_has_shadow(struct kasan_access_info *info) +{ + return (info->access_addr >= + kasan_shadow_to_mem((void *)KASAN_SHADOW_START)); +} + +static const char *get_shadow_bug_type(struct kasan_access_info *info) { const char *bug_type = "unknown-crash"; u8 *shadow_addr; @@ -98,12 +104,39 @@ static void print_error_description(struct kasan_access_info *info) break; } - pr_err("BUG: KASAN: %s in %pS at addr %p\n", - bug_type, (void *)info->ip, - info->access_addr); - pr_err("%s of size %zu by task %s/%d\n", - info->is_write ? 
"Write" : "Read", - info->access_size, current->comm, task_pid_nr(current)); + return bug_type; +} + +const char *get_wild_bug_type(struct kasan_access_info *info) +{ + const char *bug_type = "unknown-crash"; + + if ((unsigned long)info->access_addr < PAGE_SIZE) + bug_type = "null-ptr-deref"; + else if ((unsigned long)info->access_addr < TASK_SIZE) + bug_type = "user-memory-access"; + else + bug_type = "wild-memory-access"; + + return bug_type; +} + +static const char *get_bug_type(struct kasan_access_info *info) +{ + if (addr_has_shadow(info)) + return get_shadow_bug_type(info); + return get_wild_bug_type(info); +} + +static void print_error_description(struct kasan_access_info *info) +{ + const char *bug_type = get_bug_type(info); + + pr_err("BUG: KASAN: %s in %pS\n", + bug_type, (void *)info->ip); + pr_err("%s of size %zu at addr %p by task %s/%d\n", + info->is_write ? "Write" : "Read", info->access_size, + info->access_addr, current->comm, task_pid_nr(current)); } static inline bool kernel_or_module_addr(const void *addr) @@ -144,9 +177,9 @@ static void kasan_end_report(unsigned long *flags) kasan_enable_current(); } -static void print_track(struct kasan_track *track) +static void print_track(struct kasan_track *track, const char *prefix) { - pr_err("PID = %u\n", track->pid); + pr_err("%s by task %u:\n", prefix, track->pid); if (track->stack) { struct stack_trace trace; @@ -157,59 +190,84 @@ static void print_track(struct kasan_track *track) } } -static void kasan_object_err(struct kmem_cache *cache, void *object) +static struct page *addr_to_page(const void *addr) { - struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); + if ((addr >= (void *)PAGE_OFFSET) && + (addr < high_memory)) + return virt_to_head_page(addr); + return NULL; +} - dump_stack(); - pr_err("Object at %p, in cache %s size: %d\n", object, cache->name, - cache->object_size); +static void describe_object_addr(struct kmem_cache *cache, void *object, + const void *addr) +{ + unsigned long access_addr = (unsigned long)addr; + unsigned long object_addr = (unsigned long)object; + const char *rel_type; + int rel_bytes; - if (!(cache->flags & SLAB_KASAN)) + pr_err("The buggy address belongs to the object at %p\n" + " which belongs to the cache %s of size %d\n", + object, cache->name, cache->object_size); + + if (!addr) return; - pr_err("Allocated:\n"); - print_track(&alloc_info->alloc_track); - pr_err("Freed:\n"); - print_track(&alloc_info->free_track); + if (access_addr < object_addr) { + rel_type = "to the left"; + rel_bytes = object_addr - access_addr; + } else if (access_addr >= object_addr + cache->object_size) { + rel_type = "to the right"; + rel_bytes = access_addr - (object_addr + cache->object_size); + } else { + rel_type = "inside"; + rel_bytes = access_addr - object_addr; + } + + pr_err("The buggy address is located %d bytes %s of\n" + " %d-byte region [%p, %p)\n", + rel_bytes, rel_type, cache->object_size, (void *)object_addr, + (void *)(object_addr + cache->object_size)); } -void kasan_report_double_free(struct kmem_cache *cache, void *object, - s8 shadow) +static void describe_object(struct kmem_cache *cache, void *object, + const void *addr) { - unsigned long flags; + struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object); - kasan_start_report(&flags); - pr_err("BUG: Double free or freeing an invalid pointer\n"); - pr_err("Unexpected shadow byte: 0x%hhX\n", shadow); - kasan_object_err(cache, object); - kasan_end_report(&flags); + if (cache->flags & SLAB_KASAN) { + 
+		print_track(&alloc_info->alloc_track, "Allocated");
+		pr_err("\n");
+		print_track(&alloc_info->free_track, "Freed");
+		pr_err("\n");
+	}
+
+	describe_object_addr(cache, object, addr);
 }
 
-static void print_address_description(struct kasan_access_info *info)
+static void print_address_description(void *addr)
 {
-	const void *addr = info->access_addr;
+	struct page *page = addr_to_page(addr);
 
-	if ((addr >= (void *)PAGE_OFFSET) &&
-		(addr < high_memory)) {
-		struct page *page = virt_to_head_page(addr);
-
-		if (PageSlab(page)) {
-			void *object;
-			struct kmem_cache *cache = page->slab_cache;
-			object = nearest_obj(cache, page,
-						(void *)info->access_addr);
-			kasan_object_err(cache, object);
-			return;
-		}
-		dump_page(page, "kasan: bad access detected");
+	dump_stack();
+	pr_err("\n");
+
+	if (page && PageSlab(page)) {
+		struct kmem_cache *cache = page->slab_cache;
+		void *object = nearest_obj(cache, page, addr);
+
+		describe_object(cache, object, addr);
 	}
 
-	if (kernel_or_module_addr(addr)) {
-		if (!init_task_stack_addr(addr))
-			pr_err("Address belongs to variable %pS\n", addr);
+	if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
+		pr_err("The buggy address belongs to the variable:\n");
+		pr_err(" %pS\n", addr);
+	}
+
+	if (page) {
+		pr_err("The buggy address belongs to the page:\n");
+		dump_page(page, "kasan: bad access detected");
 	}
-	dump_stack();
 }
 
 static bool row_is_guilty(const void *row, const void *guilty)
@@ -264,31 +322,34 @@ static void print_shadow_for_address(const void *addr)
 	}
 }
 
+void kasan_report_double_free(struct kmem_cache *cache, void *object,
+				void *ip)
+{
+	unsigned long flags;
+
+	kasan_start_report(&flags);
+	pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", ip);
+	pr_err("\n");
+	print_address_description(object);
+	pr_err("\n");
+	print_shadow_for_address(object);
+	kasan_end_report(&flags);
+}
+
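Taken together, these helpers change the first lines a user sees. A hypothetical rendering of the new report header, assembled from the format strings above (the symbol, address, task name and PID are invented for illustration; the bug-type string is one of the values get_bug_type() can return):

	BUG: KASAN: use-after-free in foo_read+0x12/0x30
	Read of size 8 at addr ffff8801c44aa700 by task cat/1234

The old code printed the access address in the header and the task on a separate line; folding them into a single "of size ... at addr ... by task" line is what lets kasan_report_error() below share one description path with the double-free report.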
"Write" : "Read", - info->access_size, current->comm, - task_pid_nr(current)); + print_error_description(info); + pr_err("\n"); + + if (!addr_has_shadow(info)) { dump_stack(); } else { - print_error_description(info); - print_address_description(info); + print_address_description((void *)info->access_addr); + pr_err("\n"); print_shadow_for_address(info->first_bad_addr); } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ba40b7f673f4dd44af403c7ed33860c6e2094046..945fd1ca49b5af0bc3b87dbfe8098f0f602775a2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -483,8 +483,7 @@ void __khugepaged_exit(struct mm_struct *mm) static void release_pte_page(struct page *page) { - /* 0 stands for page_is_file_cache(page) == false */ - dec_node_page_state(page, NR_ISOLATED_ANON + 0); + dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); unlock_page(page); putback_lru_page(page); } @@ -532,7 +531,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!PageAnon(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); /* * We can do it before isolate_lru_page because the @@ -550,7 +548,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, * The page must only be referenced by the scanned process * and page swap cache. */ - if (page_count(page) != 1 + !!PageSwapCache(page)) { + if (page_count(page) != 1 + PageSwapCache(page)) { unlock_page(page); result = SCAN_PAGE_COUNT; goto out; @@ -579,8 +577,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, result = SCAN_DEL_PAGE_LRU; goto out; } - /* 0 stands for page_is_file_cache(page) == false */ - inc_node_page_state(page, NR_ISOLATED_ANON + 0); + inc_node_page_state(page, + NR_ISOLATED_ANON + page_is_file_cache(page)); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageLRU(page), page); @@ -614,7 +612,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, spinlock_t *ptl) { pte_t *_pte; - for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + for (_pte = pte; _pte < pte + HPAGE_PMD_NR; + _pte++, page++, address += PAGE_SIZE) { pte_t pteval = *_pte; struct page *src_page; @@ -653,9 +652,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, spin_unlock(ptl); free_page_and_swap_cache(src_page); } - - address += PAGE_SIZE; - page++; + cond_resched(); } } @@ -909,8 +906,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, return false; } /* check if the pmd is still valid */ - if (mm_find_pmd(mm, address) != pmd) + if (mm_find_pmd(mm, address) != pmd) { + trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; + } } if (ret & VM_FAULT_ERROR) { trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); @@ -1183,7 +1182,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, * The page must only be referenced by the scanned process * and page swap cache. */ - if (page_count(page) != 1 + !!PageSwapCache(page)) { + if (page_count(page) != 1 + PageSwapCache(page)) { result = SCAN_PAGE_COUNT; goto out_unmap; } diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 5bf191756a4a07b04ffe1dd792f475e1e0af0492..2d5959c5f7c50469ca3d59da9c62c6c2dd932917 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -95,7 +95,7 @@ void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) { /* TODO: RCU freeing is unsupported for now; hide false positives. 
diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c
index 5bf191756a4a07b04ffe1dd792f475e1e0af0492..2d5959c5f7c50469ca3d59da9c62c6c2dd932917 100644
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -95,7 +95,7 @@ void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object,
 void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size)
 {
 	/* TODO: RCU freeing is unsupported for now; hide false positives. */
-	if (!s->ctor && !(s->flags & SLAB_DESTROY_BY_RCU))
+	if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU))
 		kmemcheck_mark_freed(object, size);
 }
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 19b4f2dea7a591793ff8e18b6eeeafdc5df1de30..d9fc0e4561283d9a351f6dd7c4cfb74ad26ab566 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1933,11 +1933,10 @@ struct page *ksm_might_need_to_copy(struct page *page,
 	return new_page;
 }
 
-int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
+void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct stable_node *stable_node;
 	struct rmap_item *rmap_item;
-	int ret = SWAP_AGAIN;
 	int search_new_forks = 0;
 
 	VM_BUG_ON_PAGE(!PageKsm(page), page);
@@ -1950,7 +1949,7 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 
 	stable_node = page_stable_node(page);
 	if (!stable_node)
-		return ret;
+		return;
 again:
 	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
 		struct anon_vma *anon_vma = rmap_item->anon_vma;
@@ -1978,23 +1977,20 @@ int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
 				continue;
 
-			ret = rwc->rmap_one(page, vma,
-					rmap_item->address, rwc->arg);
-			if (ret != SWAP_AGAIN) {
+			if (!rwc->rmap_one(page, vma,
+					rmap_item->address, rwc->arg)) {
 				anon_vma_unlock_read(anon_vma);
-				goto out;
+				return;
 			}
 			if (rwc->done && rwc->done(page)) {
 				anon_vma_unlock_read(anon_vma);
-				goto out;
+				return;
 			}
 		}
 		anon_vma_unlock_read(anon_vma);
 	}
 	if (!search_new_forks++)
 		goto again;
-out:
-	return ret;
 }
 
 #ifdef CONFIG_MIGRATION
diff --git a/mm/madvise.c b/mm/madvise.c
index 7a2abf0127aef7a9d4879278293d8cab766133e1..25b78ee4fc2c77addde06a22580e02bf05bb3b51 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -411,10 +411,9 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 			ptent = pte_mkold(ptent);
 			ptent = pte_mkclean(ptent);
 			set_pte_at(mm, addr, pte, ptent);
-			if (PageActive(page))
-				deactivate_page(page);
 			tlb_remove_tlb_entry(tlb, pte, addr);
 		}
+		mark_page_lazyfree(page);
 	}
 out:
 	if (nr_swap) {
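With the swap-availability special case removed from madvise_vma() (two hunks below), MADV_FREE now always reaches madvise_free(). For context, a hypothetical userspace sketch of the intended usage pattern; buf and len are invented names:

	#include <sys/mman.h>

	/*
	 * Mark a scratch buffer as disposable: the pages stay mapped, but
	 * under memory pressure the kernel may reclaim them lazily. A later
	 * write finds either the old contents or a fresh zero page.
	 */
	if (madvise(buf, len, MADV_FREE) != 0)
		perror("madvise(MADV_FREE)");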
@@ -606,34 +605,40 @@ static long madvise_remove(struct vm_area_struct *vma,
 
 /*
  * Error injection support for memory error handling.
  */
-static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end)
+static int madvise_inject_error(int behavior,
+		unsigned long start, unsigned long end)
 {
-	struct page *p;
+	struct page *page;
+
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
+
 	for (; start < end; start += PAGE_SIZE <<
-				compound_order(compound_head(p))) {
+				compound_order(compound_head(page))) {
 		int ret;
 
-		ret = get_user_pages_fast(start, 1, 0, &p);
+		ret = get_user_pages_fast(start, 1, 0, &page);
 		if (ret != 1)
 			return ret;
 
-		if (PageHWPoison(p)) {
-			put_page(p);
+		if (PageHWPoison(page)) {
+			put_page(page);
 			continue;
 		}
-		if (bhv == MADV_SOFT_OFFLINE) {
-			pr_info("Soft offlining page %#lx at %#lx\n",
-				page_to_pfn(p), start);
-			ret = soft_offline_page(p, MF_COUNT_INCREASED);
+
+		if (behavior == MADV_SOFT_OFFLINE) {
+			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
+					page_to_pfn(page), start);
+
+			ret = soft_offline_page(page, MF_COUNT_INCREASED);
 			if (ret)
 				return ret;
 			continue;
 		}
-		pr_info("Injecting memory failure for page %#lx at %#lx\n",
-			page_to_pfn(p), start);
-		ret = memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED);
+		pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
+				page_to_pfn(page), start);
+
+		ret = memory_failure(page_to_pfn(page), 0, MF_COUNT_INCREASED);
 		if (ret)
 			return ret;
 	}
@@ -651,13 +656,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	case MADV_WILLNEED:
 		return madvise_willneed(vma, prev, start, end);
 	case MADV_FREE:
-		/*
-		 * XXX: In this implementation, MADV_FREE works like
-		 * MADV_DONTNEED on swapless system or full swap.
-		 */
-		if (get_nr_swap_pages() > 0)
-			return madvise_free(vma, prev, start, end);
-		/* passthrough */
+		return madvise_free(vma, prev, start, end);
 	case MADV_DONTNEED:
 		return madvise_dontneed(vma, prev, start, end);
 	default:
@@ -688,6 +687,10 @@ madvise_behavior_valid(int behavior)
 #endif
 	case MADV_DONTDUMP:
 	case MADV_DODUMP:
+#ifdef CONFIG_MEMORY_FAILURE
+	case MADV_SOFT_OFFLINE:
+	case MADV_HWPOISON:
+#endif
 		return true;
 
 	default:
@@ -761,10 +764,6 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	size_t len;
 	struct blk_plug plug;
 
-#ifdef CONFIG_MEMORY_FAILURE
-	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
-		return madvise_hwpoison(behavior, start, start+len_in);
-#endif
 	if (!madvise_behavior_valid(behavior))
 		return error;
 
@@ -784,6 +783,11 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 	if (end == start)
 		return error;
 
+#ifdef CONFIG_MEMORY_FAILURE
+	if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE)
+		return madvise_inject_error(behavior, start, start + len_in);
+#endif
+
 	write = madvise_need_mmap_write(behavior);
 	if (write) {
 		if (down_write_killable(&current->mm->mmap_sem))
diff --git a/mm/memblock.c b/mm/memblock.c
index 696f06d17c4e89b676f19c3c3a5a4c1908697caf..b049c9b2dba8718a6f57777591c2f2753d8d40ad 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -804,6 +804,18 @@ int __init_memblock memblock_mark_nomap(phys_addr_t base, phys_addr_t size)
 	return memblock_setclr_flag(base, size, 1, MEMBLOCK_NOMAP);
 }
 
+/**
+ * memblock_clear_nomap - Clear flag MEMBLOCK_NOMAP for a specified region.
+ * @base: the base phys addr of the region
+ * @size: the size of the region
+ *
+ * Return 0 on success, -errno on failure.
+ */
+int __init_memblock memblock_clear_nomap(phys_addr_t base, phys_addr_t size)
+{
+	return memblock_setclr_flag(base, size, 0, MEMBLOCK_NOMAP);
+}
+
 /**
  * __next_reserved_mem_region - next function for for_each_reserved_region()
  * @idx: pointer to u64 loop variable
@@ -1531,11 +1543,37 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit)
 			      (phys_addr_t)ULLONG_MAX);
 }
 
+void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size)
+{
+	int start_rgn, end_rgn;
+	int i, ret;
+
+	if (!size)
+		return;
+
+	ret = memblock_isolate_range(&memblock.memory, base, size,
+						&start_rgn, &end_rgn);
+	if (ret)
+		return;
+
+	/* remove all the MAP regions */
+	for (i = memblock.memory.cnt - 1; i >= end_rgn; i--)
+		if (!memblock_is_nomap(&memblock.memory.regions[i]))
+			memblock_remove_region(&memblock.memory, i);
+
+	for (i = start_rgn - 1; i >= 0; i--)
+		if (!memblock_is_nomap(&memblock.memory.regions[i]))
+			memblock_remove_region(&memblock.memory, i);
+
+	/* truncate the reserved regions */
+	memblock_remove_range(&memblock.reserved, 0, base);
+	memblock_remove_range(&memblock.reserved,
+			base + size, (phys_addr_t)ULLONG_MAX);
+}
+
 void __init memblock_mem_limit_remove_map(phys_addr_t limit)
 {
-	struct memblock_type *type = &memblock.memory;
 	phys_addr_t max_addr;
-	int i, ret, start_rgn, end_rgn;
 
 	if (!limit)
 		return;
@@ -1546,19 +1584,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit)
 	if (max_addr == (phys_addr_t)ULLONG_MAX)
 		return;
 
-	ret = memblock_isolate_range(type, max_addr, (phys_addr_t)ULLONG_MAX,
-				&start_rgn, &end_rgn);
-	if (ret)
-		return;
-
-	/* remove all the MAP regions above the limit */
-	for (i = end_rgn - 1; i >= start_rgn; i--) {
-		if (!memblock_is_nomap(&type->regions[i]))
-			memblock_remove_region(type, i);
-	}
-	/* truncate the reserved regions */
-	memblock_remove_range(&memblock.reserved, max_addr,
-			      (phys_addr_t)ULLONG_MAX);
+	memblock_cap_memory_range(0, max_addr);
 }
 
 static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
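memblock_cap_memory_range() generalizes the old limit-based trimming: rather than "discard everything above limit", a caller keeps an arbitrary [base, base + size) window, and NOMAP regions survive the cut. A hypothetical caller sketch (crash_base and crash_size are invented names; capping to a crash-dump kernel's usable window is the kind of use this interface is aimed at):

	/*
	 * Keep only the memory the dump-capture kernel may use. Everything
	 * else is dropped from memblock.memory, while MEMBLOCK_NOMAP
	 * regions (e.g. firmware-reserved ranges) are deliberately kept.
	 */
	memblock_cap_memory_range(crash_base, crash_size);

The rewritten memblock_mem_limit_remove_map() above then reduces to a single call, memblock_cap_memory_range(0, max_addr).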
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2bd7541d7c11231431c060ca6cfe84a89f096fe3..94172089f52fce369c57ce8d3e783a9480d522f2 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -100,24 +100,7 @@ static bool do_memsw_account(void)
 	return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
 }
 
-static const char * const mem_cgroup_stat_names[] = {
-	"cache",
-	"rss",
-	"rss_huge",
-	"mapped_file",
-	"dirty",
-	"writeback",
-	"swap",
-};
-
-static const char * const mem_cgroup_events_names[] = {
-	"pgpgin",
-	"pgpgout",
-	"pgfault",
-	"pgmajfault",
-};
-
-static const char * const mem_cgroup_lru_names[] = {
+static const char *const mem_cgroup_lru_names[] = {
 	"inactive_anon",
 	"active_anon",
 	"inactive_file",
@@ -568,32 +551,15 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
  * common workload, threshold and synchronization as vmstat[] should be
  * implemented.
  */
-static unsigned long
-mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
-{
-	long val = 0;
-	int cpu;
-
-	/* Per-cpu values can be negative, use a signed accumulator */
-	for_each_possible_cpu(cpu)
-		val += per_cpu(memcg->stat->count[idx], cpu);
-	/*
-	 * Summing races with updates, so val may be negative.  Avoid exposing
-	 * transient negative values.
-	 */
-	if (val < 0)
-		val = 0;
-	return val;
-}
-
-static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
-					    enum mem_cgroup_events_index idx)
+static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
+				      enum memcg_event_item event)
 {
 	unsigned long val = 0;
 	int cpu;
 
 	for_each_possible_cpu(cpu)
-		val += per_cpu(memcg->stat->events[idx], cpu);
+		val += per_cpu(memcg->stat->events[event], cpu);
 
 	return val;
 }
@@ -606,23 +572,23 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 	 * counted as CACHE even if it's on ANON LRU.
 	 */
 	if (PageAnon(page))
-		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
-				nr_pages);
-	else
-		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
-				nr_pages);
+		__this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages);
+	else {
+		__this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages);
+		if (PageSwapBacked(page))
+			__this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages);
+	}
 
 	if (compound) {
 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
-				nr_pages);
+		__this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages);
 	}
 
 	/* pagein of a big page is an event. So, ignore page size */
 	if (nr_pages > 0)
-		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
+		__this_cpu_inc(memcg->stat->events[PGPGIN]);
 	else {
-		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
+		__this_cpu_inc(memcg->stat->events[PGPGOUT]);
 		nr_pages = -nr_pages; /* for event */
 	}
 
@@ -1144,6 +1110,28 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
 	return false;
 }
 
+unsigned int memcg1_stats[] = {
+	MEMCG_CACHE,
+	MEMCG_RSS,
+	MEMCG_RSS_HUGE,
+	NR_SHMEM,
+	NR_FILE_MAPPED,
+	NR_FILE_DIRTY,
+	NR_WRITEBACK,
+	MEMCG_SWAP,
+};
+
+static const char *const memcg1_stat_names[] = {
+	"cache",
+	"rss",
+	"rss_huge",
+	"shmem",
+	"mapped_file",
+	"dirty",
+	"writeback",
+	"swap",
+};
+
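memcg1_stats[] and memcg1_stat_names[] are parallel arrays: the first holds counter IDs (a mix of memcg-private MEMCG_* items and generic NR_* node stats), the second the matching cgroup-v1 file labels. Every consumer indexes both with the same i, as the reworked loops below do; a condensed sketch of the pattern:

	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));

	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
			continue;
		seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
			   memcg_page_state(memcg, memcg1_stats[i]) * PAGE_SIZE);
	}

The BUILD_BUG_ON keeps the two arrays from drifting apart at compile time.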
@@ -1188,11 +1176,11 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_cont_cgroup_path(iter->css.cgroup); pr_cont(":"); - for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { + if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account) continue; - pr_cont(" %s:%luKB", mem_cgroup_stat_names[i], - K(mem_cgroup_read_stat(iter, i))); + pr_cont(" %s:%luKB", memcg1_stat_names[i], + K(memcg_page_state(iter, memcg1_stats[i]))); } for (i = 0; i < NR_LRU_LISTS; i++) @@ -1837,7 +1825,7 @@ static void reclaim_high(struct mem_cgroup *memcg, do { if (page_counter_read(&memcg->memory) <= memcg->high) continue; - mem_cgroup_events(memcg, MEMCG_HIGH, 1); + mem_cgroup_event(memcg, MEMCG_HIGH); try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true); } while ((memcg = parent_mem_cgroup(memcg))); } @@ -1928,7 +1916,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (!gfpflags_allow_blocking(gfp_mask)) goto nomem; - mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1); + mem_cgroup_event(mem_over_limit, MEMCG_MAX); nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, gfp_mask, may_swap); @@ -1971,7 +1959,7 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, if (fatal_signal_pending(current)) goto force; - mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1); + mem_cgroup_event(mem_over_limit, MEMCG_OOM); mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages * PAGE_SIZE)); @@ -2381,7 +2369,7 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) head[i].mem_cgroup = head->mem_cgroup; - __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE], + __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE], HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ @@ -2391,7 +2379,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) { int val = (charge) ? 
 	int val = (charge) ? 1 : -1;
-	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
+	this_cpu_add(memcg->stat->count[MEMCG_SWAP], val);
 }
 
 /**
@@ -2725,7 +2713,7 @@ static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
 
 	for_each_mem_cgroup_tree(iter, memcg) {
 		for (i = 0; i < MEMCG_NR_STAT; i++)
-			stat[i] += mem_cgroup_read_stat(iter, i);
+			stat[i] += memcg_page_state(iter, i);
 	}
 }
 
@@ -2738,7 +2726,7 @@ static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
 
 	for_each_mem_cgroup_tree(iter, memcg) {
 		for (i = 0; i < MEMCG_NR_EVENTS; i++)
-			events[i] += mem_cgroup_read_events(iter, i);
+			events[i] += memcg_sum_events(iter, i);
 	}
 }
 
@@ -2750,13 +2738,10 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 		struct mem_cgroup *iter;
 
 		for_each_mem_cgroup_tree(iter, memcg) {
-			val += mem_cgroup_read_stat(iter,
-					MEM_CGROUP_STAT_CACHE);
-			val += mem_cgroup_read_stat(iter,
-					MEM_CGROUP_STAT_RSS);
+			val += memcg_page_state(iter, MEMCG_CACHE);
+			val += memcg_page_state(iter, MEMCG_RSS);
 			if (swap)
-				val += mem_cgroup_read_stat(iter,
-						MEM_CGROUP_STAT_SWAP);
+				val += memcg_page_state(iter, MEMCG_SWAP);
 		}
 	} else {
 		if (!swap)
@@ -3131,6 +3116,21 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
 }
 #endif /* CONFIG_NUMA */
 
+/* Universal VM events cgroup1 shows, original sort order */
+unsigned int memcg1_events[] = {
+	PGPGIN,
+	PGPGOUT,
+	PGFAULT,
+	PGMAJFAULT,
+};
+
+static const char *const memcg1_event_names[] = {
+	"pgpgin",
+	"pgpgout",
+	"pgfault",
+	"pgmajfault",
+};
+
 static int memcg_stat_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
@@ -3138,22 +3138,20 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	struct mem_cgroup *mi;
 	unsigned int i;
 
-	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
-		     MEM_CGROUP_STAT_NSTATS);
-	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
-		     MEM_CGROUP_EVENTS_NSTATS);
+	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
 	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
 
-	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-		if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
+	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
-		seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
-			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
+		seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
+			   memcg_page_state(memcg, memcg1_stats[i]) *
+			   PAGE_SIZE);
 	}
 
-	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
-		seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
-			   mem_cgroup_read_events(memcg, i));
+	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
+		seq_printf(m, "%s %lu\n", memcg1_event_names[i],
+			   memcg_sum_events(memcg, memcg1_events[i]));
 
 	for (i = 0; i < NR_LRU_LISTS; i++)
 		seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
@@ -3171,23 +3169,23 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	seq_printf(m, "hierarchical_memsw_limit %llu\n",
 		   (u64)memsw * PAGE_SIZE);
 
-	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
+	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
 		unsigned long long val = 0;
 
-		if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
+		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
 		for_each_mem_cgroup_tree(mi, memcg)
-			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
-		seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
+			val += memcg_page_state(mi, memcg1_stats[i]) *
+			PAGE_SIZE;
+		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
 	}
 
-	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
+	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
 		unsigned long long val = 0;
 
 		for_each_mem_cgroup_tree(mi, memcg)
-			val += mem_cgroup_read_events(mi, i);
-		seq_printf(m, "total_%s %llu\n",
-			   mem_cgroup_events_names[i], val);
+			val += memcg_sum_events(mi, memcg1_events[i]);
+		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
 	}
 
 	for (i = 0; i < NR_LRU_LISTS; i++) {
@@ -3652,10 +3650,10 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
 	struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
 	struct mem_cgroup *parent;
 
-	*pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
+	*pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
 
 	/* this should eventually include NR_UNSTABLE_NFS */
-	*pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
+	*pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
 	*pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
 						     (1 << LRU_ACTIVE_FILE));
 	*pheadroom = PAGE_COUNTER_MAX;
@@ -4511,33 +4509,29 @@ static int mem_cgroup_move_account(struct page *page,
 	spin_lock_irqsave(&from->move_lock, flags);
 
 	if (!anon && page_mapped(page)) {
-		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
-			       nr_pages);
-		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
-			       nr_pages);
+		__this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages);
+		__this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages);
 	}
 
 	/*
 	 * move_lock grabbed above and caller set from->moving_account, so
-	 * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
+	 * mod_memcg_page_state will serialize updates to PageDirty.
 	 * So mapping should be stable for dirty pages.
 	 */
 	if (!anon && PageDirty(page)) {
 		struct address_space *mapping = page_mapping(page);
 
 		if (mapping_cap_account_dirty(mapping)) {
-			__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
+			__this_cpu_sub(from->stat->count[NR_FILE_DIRTY],
 				       nr_pages);
-			__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
+			__this_cpu_add(to->stat->count[NR_FILE_DIRTY],
 				       nr_pages);
 		}
 	}
 
 	if (PageWriteback(page)) {
-		__this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
-			       nr_pages);
-		__this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
-			       nr_pages);
+		__this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages);
+		__this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages);
 	}
 
 	/*
@@ -5154,7 +5148,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
 			continue;
 		}
 
-		mem_cgroup_events(memcg, MEMCG_OOM, 1);
+		mem_cgroup_event(memcg, MEMCG_OOM);
 		if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
 			break;
 	}
@@ -5167,10 +5161,10 @@ static int memory_events_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
 
-	seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
-	seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
-	seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
-	seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
+	seq_printf(m, "low %lu\n", memcg_sum_events(memcg, MEMCG_LOW));
+	seq_printf(m, "high %lu\n", memcg_sum_events(memcg, MEMCG_HIGH));
+	seq_printf(m, "max %lu\n", memcg_sum_events(memcg, MEMCG_MAX));
+	seq_printf(m, "oom %lu\n", memcg_sum_events(memcg, MEMCG_OOM));
 
 	return 0;
 }
@@ -5197,9 +5191,9 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	tree_events(memcg, events);
 
 	seq_printf(m, "anon %llu\n",
-		   (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
+		   (u64)stat[MEMCG_RSS] * PAGE_SIZE);
 	seq_printf(m, "file %llu\n",
-		   (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
+		   (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
 	seq_printf(m, "kernel_stack %llu\n",
 		   (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
 	seq_printf(m, "slab %llu\n",
@@ -5208,12 +5202,14 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	seq_printf(m, "sock %llu\n",
 		   (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
 
+	seq_printf(m, "shmem %llu\n",
+		   (u64)stat[NR_SHMEM] * PAGE_SIZE);
 	seq_printf(m, "file_mapped %llu\n",
-		   (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
+		   (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
 	seq_printf(m, "file_dirty %llu\n",
-		   (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
+		   (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
 	seq_printf(m, "file_writeback %llu\n",
-		   (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
+		   (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
 
 	for (i = 0; i < NR_LRU_LISTS; i++) {
 		struct mem_cgroup *mi;
@@ -5232,10 +5228,15 @@ static int memory_stat_show(struct seq_file *m, void *v)
 
 	/* Accumulated memory events */
 
-	seq_printf(m, "pgfault %lu\n",
-		   events[MEM_CGROUP_EVENTS_PGFAULT]);
-	seq_printf(m, "pgmajfault %lu\n",
-		   events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
+	seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
+	seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+
+	seq_printf(m, "workingset_refault %lu\n",
+		   stat[WORKINGSET_REFAULT]);
+	seq_printf(m, "workingset_activate %lu\n",
+		   stat[WORKINGSET_ACTIVATE]);
+	seq_printf(m, "workingset_nodereclaim %lu\n",
+		   stat[WORKINGSET_NODERECLAIM]);
 
 	return 0;
 }
@@ -5476,8 +5477,8 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
 
 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 			   unsigned long nr_anon, unsigned long nr_file,
-			   unsigned long nr_huge, unsigned long nr_kmem,
-			   struct page *dummy_page)
+			   unsigned long nr_kmem, unsigned long nr_huge,
+			   unsigned long nr_shmem, struct page *dummy_page)
 {
 	unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
 	unsigned long flags;
@@ -5492,10 +5493,11 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 	}
 
 	local_irq_save(flags);
-	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
-	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
-	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
-	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
+	__this_cpu_sub(memcg->stat->count[MEMCG_RSS], nr_anon);
+	__this_cpu_sub(memcg->stat->count[MEMCG_CACHE], nr_file);
+	__this_cpu_sub(memcg->stat->count[MEMCG_RSS_HUGE], nr_huge);
+	__this_cpu_sub(memcg->stat->count[NR_SHMEM], nr_shmem);
+	__this_cpu_add(memcg->stat->events[PGPGOUT], pgpgout);
 	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 	memcg_check_events(memcg, dummy_page);
 	local_irq_restore(flags);
@@ -5507,6 +5509,7 @@ static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
 static void uncharge_list(struct list_head *page_list)
 {
 	struct mem_cgroup *memcg = NULL;
+	unsigned long nr_shmem = 0;
 	unsigned long nr_anon = 0;
 	unsigned long nr_file = 0;
 	unsigned long nr_huge = 0;
@@ -5525,7 +5528,7 @@ static void uncharge_list(struct list_head *page_list)
 		next = page->lru.next;
 
 		VM_BUG_ON_PAGE(PageLRU(page), page);
-		VM_BUG_ON_PAGE(page_count(page), page);
+		VM_BUG_ON_PAGE(!PageHWPoison(page) && page_count(page), page);
 
 		if (!page->mem_cgroup)
 			continue;
@@ -5539,9 +5542,9 @@ static void uncharge_list(struct list_head *page_list)
 		if (memcg != page->mem_cgroup) {
 			if (memcg) {
-				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-					       nr_huge, nr_kmem, page);
-				pgpgout = nr_anon = nr_file =
-					nr_huge = nr_kmem = 0;
+				uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
+					       nr_kmem, nr_huge, nr_shmem, page);
+				pgpgout = nr_anon = nr_file = nr_kmem = 0;
+				nr_huge = nr_shmem = 0;
 			}
 			memcg = page->mem_cgroup;
 		}
@@ -5555,8 +5558,11 @@ static void uncharge_list(struct list_head *page_list)
 			}
 			if (PageAnon(page))
 				nr_anon += nr_pages;
-			else
+			else {
 				nr_file += nr_pages;
+				if (PageSwapBacked(page))
+					nr_shmem += nr_pages;
+			}
 			pgpgout++;
 		} else {
 			nr_kmem += 1 << compound_order(page);
@@ -5568,7 +5574,7 @@ static void uncharge_list(struct list_head *page_list)
 
 	if (memcg)
 		uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
-			       nr_huge, nr_kmem, page);
+			       nr_kmem, nr_huge, nr_shmem, page);
 }
 
 /**
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 27f7210e7fabd1441d699d328213f95302c79378..2527dfeddb003d245ac2e2bd964134030426f777 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -220,6 +220,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno,
  */
 void shake_page(struct page *p, int access)
 {
+	if (PageHuge(p))
+		return;
+
 	if (!PageSlab(p)) {
 		lru_add_drain_all();
 		if (PageLRU(p))
@@ -322,7 +325,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
  * wrong earlier.
  */
 static void kill_procs(struct list_head *to_kill, int forcekill, int trapno,
-			  int fail, struct page *page, unsigned long pfn,
+			  bool fail, struct page *page, unsigned long pfn,
 			  int flags)
 {
 	struct to_kill *tk, *next;
@@ -536,6 +539,13 @@ static int delete_from_lru_cache(struct page *p)
 		 */
 		ClearPageActive(p);
 		ClearPageUnevictable(p);
+
+		/*
+		 * Poisoned page might never drop its ref count to 0 so we have
+		 * to uncharge it manually from its memcg.
+		 */
+		mem_cgroup_uncharge(p);
+
 		/*
 		 * drop the page count elevated by isolate_lru_page()
 		 */
@@ -904,35 +914,36 @@ EXPORT_SYMBOL_GPL(get_hwpoison_page);
 * Do all that is necessary to remove user space mappings. Unmap
 * the pages and send SIGBUS to the processes if the data was dirty.
 */
-static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
+static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
 				  int trapno, int flags, struct page **hpagep)
 {
-	enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+	enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
 	struct address_space *mapping;
 	LIST_HEAD(tokill);
-	int ret;
+	bool unmap_success;
 	int kill = 1, forcekill;
 	struct page *hpage = *hpagep;
+	bool mlocked = PageMlocked(hpage);
 
 	/*
 	 * Here we are interested only in user-mapped pages, so skip any
 	 * other types of pages.
 	 */
 	if (PageReserved(p) || PageSlab(p))
-		return SWAP_SUCCESS;
+		return true;
 	if (!(PageLRU(hpage) || PageHuge(p)))
-		return SWAP_SUCCESS;
+		return true;
 
 	/*
 	 * This check implies we don't kill processes if their pages
 	 * are in the swap cache early. Those are always late kills.
 	 */
 	if (!page_mapped(hpage))
-		return SWAP_SUCCESS;
+		return true;
 
 	if (PageKsm(p)) {
 		pr_err("Memory failure: %#lx: can't handle KSM pages.\n", pfn);
-		return SWAP_FAIL;
+		return false;
 	}
 
 	if (PageSwapCache(p)) {
@@ -971,11 +982,18 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	if (kill)
 		collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED);
 
-	ret = try_to_unmap(hpage, ttu);
-	if (ret != SWAP_SUCCESS)
+	unmap_success = try_to_unmap(hpage, ttu);
+	if (!unmap_success)
 		pr_err("Memory failure: %#lx: failed to unmap page (mapcount=%d)\n",
 		       pfn, page_mapcount(hpage));
 
+	/*
+	 * try_to_unmap() might put mlocked page in lru cache, so call
+	 * shake_page() again to ensure that it's flushed.
+	 */
+	if (mlocked)
+		shake_page(hpage, 0);
+
 	/*
 	 * Now that the dirty bit has been propagated to the
 	 * struct page and all unmaps done we can decide if
@@ -987,10 +1005,9 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
 	 * any accesses to the poisoned memory.
 	 */
 	forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL);
-	kill_procs(&tokill, forcekill, trapno,
-		      ret != SWAP_SUCCESS, p, pfn, flags);
+	kill_procs(&tokill, forcekill, trapno, !unmap_success, p, pfn, flags);
 
-	return ret;
+	return unmap_success;
 }
 
 static void set_page_hwpoison_huge_page(struct page *hpage)
@@ -1138,22 +1155,14 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 * The check (unnecessarily) ignores LRU pages being isolated and
 	 * walked by the page reclaim code, however that's not a big loss.
 	 */
-	if (!PageHuge(p)) {
-		if (!PageLRU(p))
-			shake_page(p, 0);
-		if (!PageLRU(p)) {
-			/*
-			 * shake_page could have turned it free.
-			 */
-			if (is_free_buddy_page(p)) {
-				if (flags & MF_COUNT_INCREASED)
-					action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
-				else
-					action_result(pfn, MF_MSG_BUDDY_2ND,
-						      MF_DELAYED);
-				return 0;
-			}
-		}
+	shake_page(p, 0);
+	/* shake_page could have turned it free. */
+	if (!PageLRU(p) && is_free_buddy_page(p)) {
+		if (flags & MF_COUNT_INCREASED)
+			action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
+		else
+			action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
+		return 0;
 	}
 
 	lock_page(hpage);
@@ -1230,8 +1239,7 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
 	 * When the raw error page is thp tail page, hpage points to the raw
 	 * page after thp split.
 	 */
-	if (hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)
-	    != SWAP_SUCCESS) {
+	if (!hwpoison_user_mappings(p, pfn, trapno, flags, &hpage)) {
 		action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED);
 		res = -EBUSY;
 		goto out;
@@ -1543,8 +1551,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
 		if (ret == 1 && !PageLRU(page)) {
 			/* Drop page reference which is from __get_any_page() */
 			put_hwpoison_page(page);
-			pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
-				pfn, page->flags);
+			pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
+				pfn, page->flags, &page->flags);
 			return -EIO;
 		}
 	}
@@ -1585,8 +1593,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
 	ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
 				MIGRATE_SYNC, MR_MEMORY_FAILURE);
 	if (ret) {
-		pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
-			pfn, ret, page->flags);
+		pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
+			pfn, ret, page->flags, &page->flags);
 		/*
 		 * We know that soft_offline_huge_page() tries to migrate
 		 * only one hugepage pointed to by hpage, so we need not
@@ -1677,14 +1685,14 @@ static int __soft_offline_page(struct page *page, int flags)
 			if (!list_empty(&pagelist))
 				putback_movable_pages(&pagelist);
 
-			pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
-				pfn, ret, page->flags);
+			pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n",
+				pfn, ret, page->flags, &page->flags);
 			if (ret > 0)
 				ret = -EIO;
 		}
 	} else {
-		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
-			pfn, ret, page_count(page), page->flags);
+		pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx (%pGp)\n",
+			pfn, ret, page_count(page), page->flags, &page->flags);
 	}
 	return ret;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 235ba51b2fbf07ffeeeb6b70d6522b4b0addb3de..6ff5d729ded0ecd3a5607d10248697a786091f7e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4298,7 +4298,7 @@ void __might_fault(const char *file, int line)
 	 * get paged out, therefore we'll never actually fault, and the
 	 * below annotations will generate false positives.
 	 */
-	if (segment_eq(get_fs(), KERNEL_DS))
+	if (uaccess_kernel())
 		return;
 	if (pagefault_disabled())
 		return;
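uaccess_kernel() is the generic spelling of the old open-coded address-limit check; on architectures that use the traditional get_fs()/set_fs() scheme it expands to the very comparison it replaces. A sketch of the equivalence, under that assumption:

	/* old, open-coded */
	if (segment_eq(get_fs(), KERNEL_DS))
		return;

	/* new, via the uaccess header */
	if (uaccess_kernel())
		return;

Routing the check through the helper keeps __might_fault() working on architectures that do not model a conventional segment limit.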
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 6fa7208bcd564ec8fb6bcf25e206aef9bd724ecb..b63d7d1239df22714da632e191ce4d03049daeb0 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1208,7 +1208,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
 		arch_refresh_nodedata(nid, pgdat);
 	} else {
-		/* Reset the nr_zones, order and classzone_idx before reuse */
+		/*
+		 * Reset the nr_zones, order and classzone_idx before reuse.
+		 * Note that kswapd will init kswapd_classzone_idx properly
+		 * when it starts in the near future.
+		 */
 		pgdat->nr_zones = 0;
 		pgdat->kswapd_order = 0;
 		pgdat->kswapd_classzone_idx = 0;
diff --git a/mm/migrate.c b/mm/migrate.c
index ed97c2c14fa80b47ffbf7fa22ec6d4b9b57202b1..89a0a1707f4c67deb77dc466cb6f5c2ecfe19051 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -184,9 +184,9 @@ void putback_movable_pages(struct list_head *l)
 			unlock_page(page);
 			put_page(page);
 		} else {
-			putback_lru_page(page);
 			dec_node_page_state(page, NR_ISOLATED_ANON +
 					page_is_file_cache(page));
+			putback_lru_page(page);
 		}
 	}
 }
@@ -194,7 +194,7 @@ void putback_movable_pages(struct list_head *l)
 /*
  * Restore a potential migration pte to a working pte entry
  */
-static int remove_migration_pte(struct page *page, struct vm_area_struct *vma,
+static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 				 unsigned long addr, void *old)
 {
 	struct page_vma_mapped_walk pvmw = {
@@ -253,7 +253,7 @@ static int remove_migration_pte(struct page *page, struct vm_area_struct *vma,
 		update_mmu_cache(vma, pvmw.address, pvmw.pte);
 	}
 
-	return SWAP_AGAIN;
+	return true;
 }
 
 /*
@@ -1722,9 +1722,6 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
 {
 	int z;
 
-	if (!pgdat_reclaimable(pgdat))
-		return false;
-
 	for (z = pgdat->nr_zones - 1; z >= 0; z--) {
 		struct zone *zone = pgdat->node_zones + z;
 
@@ -1947,7 +1944,8 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 
 	/* Prepare a page as a migration target */
 	__SetPageLocked(new_page);
-	__SetPageSwapBacked(new_page);
+	if (PageSwapBacked(page))
+		__SetPageSwapBacked(new_page);
 
 	/* anon mapping, we can simply copy page->mapping to the new page: */
 	new_page->mapping = page->mapping;
diff --git a/mm/mlock.c b/mm/mlock.c
index 0dd9ca18e19ed7ddb499a480c5831c312791b10a..c483c5c20b4bd12bcca50972c9f74a0dbd3a713e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -123,17 +123,15 @@ static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
  */
 static void __munlock_isolated_page(struct page *page)
 {
-	int ret = SWAP_AGAIN;
-
 	/*
 	 * Optimization: if the page was mapped just once, that's our mapping
 	 * and we don't need to check all the other vmas.
 	 */
 	if (page_mapcount(page) > 1)
-		ret = try_to_munlock(page);
+		try_to_munlock(page);
 
 	/* Did try_to_unlock() succeed or punt? */
-	if (ret != SWAP_MLOCK)
+	if (!PageMlocked(page))
 		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
 
 	putback_lru_page(page);
diff --git a/mm/mmap.c b/mm/mmap.c
index bfbe8856d134f367464e3582d9a432260487777c..f82741e199c0b06d971bd30d4503bc3e6e6f9dc5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1479,7 +1479,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
 		struct user_struct *user = NULL;
 		struct hstate *hs;
 
-		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
+		hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
 		if (!hs)
 			return -EINVAL;
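The one-character mmap_pgoff() fix above is about using the constant from the correct namespace: the value being decoded comes from mmap() flags, so it should be masked with MAP_HUGE_MASK, even though SHM_HUGE_MASK is currently defined to the same 0x3f and behaviour does not change. For reference, a hypothetical userspace request that exercises this path (len is an invented name; 21 == log2(2 MB)):

	#include <sys/mman.h>

	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB |
		       (21 << MAP_HUGE_SHIFT),	/* request 2 MB hugepages */
		       -1, 0);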
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index a7652acd2ab93c2e290f31fbe80e36d157abb609..54ca545629286223a16ef931830a3757b29da77d 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -21,7 +21,7 @@
 #include
 
 /* global SRCU for all MMs */
-static struct srcu_struct srcu;
+DEFINE_STATIC_SRCU(srcu);
 
 /*
  * This function allows mmu_notifier::release callback to delay a call to
@@ -252,12 +252,6 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn,
 
 	BUG_ON(atomic_read(&mm->mm_users) <= 0);
 
-	/*
-	 * Verify that mmu_notifier_init() already run and the global srcu is
-	 * initialized.
-	 */
-	BUG_ON(!srcu.per_cpu_ref);
-
 	ret = -ENOMEM;
 	mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
 	if (unlikely(!mmu_notifier_mm))
@@ -406,9 +400,3 @@ void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
 	mmdrop(mm);
 }
 EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
-
-static int __init mmu_notifier_init(void)
-{
-	return init_srcu_struct(&srcu);
-}
-subsys_initcall(mmu_notifier_init);
diff --git a/mm/nommu.c b/mm/nommu.c
index 2d131b97a85169eb11716874b552dc4cef5b4115..fc184f597d59d9af942f8dc60229b999173fe22f 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -237,12 +237,16 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
 }
 EXPORT_SYMBOL(__vmalloc);
 
+void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags)
+{
+	return __vmalloc(size, flags, PAGE_KERNEL);
+}
+
 void *vmalloc_user(unsigned long size)
 {
 	void *ret;
 
-	ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
-			PAGE_KERNEL);
+	ret = __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
 	if (ret) {
 		struct vm_area_struct *vma;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d083714a2bb924a0303fa7535acf09d7355aedf0..04c9143a86255a179aa40c06c2f36d203608117b 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -685,6 +685,7 @@ void exit_oom_victim(void)
 void oom_killer_enable(void)
 {
 	oom_killer_disabled = false;
+	pr_info("OOM killer enabled.\n");
 }
 
 /**
@@ -721,6 +722,7 @@ bool oom_killer_disable(signed long timeout)
 		oom_killer_enable();
 		return false;
 	}
+	pr_info("OOM killer disabled.\n");
 
 	return true;
 }
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d8ac2a7fb9e7b6db9de3755ab7898095f70f8383..143c1c25d680c272eefd1e2ee7f58a0043d4f8b1 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -650,9 +650,8 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp)
 
 	spin_lock_init(&dom->lock);
 
-	init_timer_deferrable(&dom->period_timer);
-	dom->period_timer.function = writeout_period;
-	dom->period_timer.data = (unsigned long)dom;
+	setup_deferrable_timer(&dom->period_timer, writeout_period,
+			       (unsigned long)dom);
 
 	dom->dirty_limit_tstamp = jiffies;
 
@@ -2353,10 +2352,16 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc)
 
 	if (wbc->nr_to_write <= 0)
 		return 0;
-	if (mapping->a_ops->writepages)
-		ret = mapping->a_ops->writepages(mapping, wbc);
-	else
-		ret = generic_writepages(mapping, wbc);
+	while (1) {
+		if (mapping->a_ops->writepages)
+			ret = mapping->a_ops->writepages(mapping, wbc);
+		else
+			ret = generic_writepages(mapping, wbc);
+		if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL))
+			break;
+		cond_resched();
+		congestion_wait(BLK_RW_ASYNC, HZ/50);
+	}
 	return ret;
 }
 
@@ -2428,7 +2433,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 		inode_attach_wb(inode, page);
 		wb = inode_to_wb(inode);
 
-		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_DIRTY);
+		inc_memcg_page_state(page, NR_FILE_DIRTY);
 		__inc_node_page_state(page, NR_FILE_DIRTY);
 		__inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 		__inc_node_page_state(page, NR_DIRTIED);
@@ -2450,7 +2455,7 @@ void account_page_cleaned(struct page *page, struct address_space *mapping,
 			  struct bdi_writeback *wb)
 {
 	if (mapping_cap_account_dirty(mapping)) {
-		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
+		dec_memcg_page_state(page, NR_FILE_DIRTY);
 		dec_node_page_state(page, NR_FILE_DIRTY);
 		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 		dec_wb_stat(wb, WB_RECLAIMABLE);
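setup_deferrable_timer(), used in the wb_domain_init() hunk above, is shorthand for the three-step open-coded initialization it replaces. A sketch of the equivalence, assuming the pre-timer-conversion callback convention in which handlers take an unsigned long cookie:

	/* old: three statements */
	init_timer_deferrable(&dom->period_timer);
	dom->period_timer.function = writeout_period;
	dom->period_timer.data = (unsigned long)dom;

	/* new: one helper call */
	setup_deferrable_timer(&dom->period_timer, writeout_period,
			       (unsigned long)dom);

Either way the timer stays deferrable: an expiry on an idle CPU waits for the next non-deferrable wakeup rather than forcing one.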
@@ -2707,7 +2712,7 @@ int clear_page_dirty_for_io(struct page *page)
 		 */
 		wb = unlocked_inode_to_wb_begin(inode, &locked);
 		if (TestClearPageDirty(page)) {
-			mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_DIRTY);
+			dec_memcg_page_state(page, NR_FILE_DIRTY);
 			dec_node_page_state(page, NR_FILE_DIRTY);
 			dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 			dec_wb_stat(wb, WB_RECLAIMABLE);
@@ -2754,7 +2759,7 @@ int test_clear_page_writeback(struct page *page)
 		ret = TestClearPageWriteback(page);
 	}
 	if (ret) {
-		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+		dec_memcg_page_state(page, NR_WRITEBACK);
 		dec_node_page_state(page, NR_WRITEBACK);
 		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 		inc_node_page_state(page, NR_WRITTEN);
@@ -2809,7 +2814,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 		ret = TestSetPageWriteback(page);
 	}
 	if (!ret) {
-		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+		inc_memcg_page_state(page, NR_WRITEBACK);
 		inc_node_page_state(page, NR_WRITEBACK);
 		inc_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 	}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f3d603cef2c0c0e5aef09540dd2f8d50da5a808c..f9e450c6b6e414d61b00d5a61be9cdea3b773e1b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -65,6 +65,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 
@@ -1090,14 +1091,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 {
 	int migratetype = 0;
 	int batch_free = 0;
-	unsigned long nr_scanned, flags;
 	bool isolated_pageblocks;
 
-	spin_lock_irqsave(&zone->lock, flags);
+	spin_lock(&zone->lock);
 	isolated_pageblocks = has_isolate_pageblock(zone);
-	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
-	if (nr_scanned)
-		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
 
 	while (count) {
 		struct page *page;
@@ -1142,7 +1139,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 			trace_mm_page_pcpu_drain(page, 0, mt);
 		} while (--count && --batch_free && !list_empty(list));
 	}
-	spin_unlock_irqrestore(&zone->lock, flags);
+	spin_unlock(&zone->lock);
 }
 
 static void free_one_page(struct zone *zone,
@@ -1150,19 +1147,13 @@ static void free_one_page(struct zone *zone,
 				unsigned int order,
 				int migratetype)
 {
-	unsigned long nr_scanned, flags;
-	spin_lock_irqsave(&zone->lock, flags);
-	__count_vm_events(PGFREE, 1 << order);
-	nr_scanned = node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED);
-	if (nr_scanned)
-		__mod_node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED, -nr_scanned);
-
+	spin_lock(&zone->lock);
 	if (unlikely(has_isolate_pageblock(zone) ||
 		is_migrate_isolate(migratetype))) {
 		migratetype = get_pfnblock_migratetype(page, pfn);
 	}
 	__free_one_page(page, pfn, zone, order, migratetype);
-	spin_unlock_irqrestore(&zone->lock, flags);
+	spin_unlock(&zone->lock);
 }
 
 static void __meminit __init_single_page(struct page *page, unsigned long pfn,
@@ -1240,6 +1231,7 @@ void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end)
 
 static void __free_pages_ok(struct page *page, unsigned int order)
 {
+	unsigned long flags;
 	int migratetype;
 	unsigned long pfn = page_to_pfn(page);
 
@@ -1247,7 +1239,10 @@ static void __free_pages_ok(struct page *page, unsigned int order)
 		return;
 
 	migratetype = get_pfnblock_migratetype(page, pfn);
+	local_irq_save(flags);
+	__count_vm_events(PGFREE, 1 << order);
 	free_one_page(page_zone(page), page, pfn, order, migratetype);
+	local_irq_restore(flags);
 }
 
 static void __init __free_pages_boot_core(struct page *page, unsigned int order)
@@ -1695,10 +1690,10 @@ static inline int check_new_page(struct page *page)
 	return 1;
 }
-static inline bool free_pages_prezeroed(bool poisoned)
+static inline bool free_pages_prezeroed(void)
 {
 	return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
-		page_poisoning_enabled() && poisoned;
+		page_poisoning_enabled();
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -1752,17 +1747,10 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
 							unsigned int alloc_flags)
 {
 	int i;
-	bool poisoned = true;
-
-	for (i = 0; i < (1 << order); i++) {
-		struct page *p = page + i;
-
-		if (poisoned)
-			poisoned &= page_is_poisoned(p);
-	}
 
 	post_alloc_hook(page, order, gfp_flags);
 
-	if (!free_pages_prezeroed(poisoned) && (gfp_flags & __GFP_ZERO))
+	if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
 		for (i = 0; i < (1 << order); i++)
 			clear_highpage(page + i);
 
@@ -1844,9 +1832,9 @@ static inline struct page *__rmqueue_cma_fallback(struct zone *zone,
  * Note that start_page and end_pages are not aligned on a pageblock
  * boundary. If alignment is required, use move_freepages_block()
 */
-int move_freepages(struct zone *zone,
+static int move_freepages(struct zone *zone,
 			  struct page *start_page, struct page *end_page,
-			  int migratetype)
+			  int migratetype, int *num_movable)
 {
 	struct page *page;
 	unsigned int order;
@@ -1863,6 +1851,9 @@ int move_freepages(struct zone *zone,
 	VM_BUG_ON(page_zone(start_page) != page_zone(end_page));
 #endif
 
+	if (num_movable)
+		*num_movable = 0;
+
 	for (page = start_page; page <= end_page;) {
 		if (!pfn_valid_within(page_to_pfn(page))) {
 			page++;
@@ -1873,6 +1864,15 @@ int move_freepages(struct zone *zone,
 		VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
 
 		if (!PageBuddy(page)) {
+			/*
+			 * We assume that pages that could be isolated for
+			 * migration are movable. But we don't actually try
+			 * isolating, as that would be expensive.
+			 */
+			if (num_movable &&
+					(PageLRU(page) || __PageMovable(page)))
+				(*num_movable)++;
+
 			page++;
 			continue;
 		}
@@ -1888,7 +1888,7 @@ int move_freepages(struct zone *zone,
 }
 
 int move_freepages_block(struct zone *zone, struct page *page,
-				int migratetype)
+				int migratetype, int *num_movable)
 {
 	unsigned long start_pfn, end_pfn;
 	struct page *start_page, *end_page;
@@ -1905,7 +1905,8 @@ int move_freepages_block(struct zone *zone, struct page *page,
 	if (!zone_spans_pfn(zone, end_pfn))
 		return 0;
 
-	return move_freepages(zone, start_page, end_page, migratetype);
+	return move_freepages(zone, start_page, end_page, migratetype,
+								num_movable);
 }
 
 static void change_pageblock_range(struct page *pageblock_page,
@@ -1955,28 +1956,79 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
 /*
  * This function implements actual steal behaviour. If order is large enough,
  * we can steal whole pageblock. If not, we first move freepages in this
- * pageblock and check whether half of pages are moved or not. If half of
- * pages are moved, we can change migratetype of pageblock and permanently
- * use it's pages as requested migratetype in the future.
+ * pageblock to our migratetype and determine how many already-allocated pages
+ * are there in the pageblock with a compatible migratetype. If at least half
+ * of pages are free or compatible, we can change migratetype of the pageblock
+ * itself, so pages freed in the future will be put on the correct free list.
 */
 static void steal_suitable_fallback(struct zone *zone, struct page *page,
-					  int start_type)
+					int start_type, bool whole_block)
 {
 	unsigned int current_order = page_order(page);
-	int pages;
+	struct free_area *area;
+	int free_pages, movable_pages, alike_pages;
+	int old_block_type;
+
+	old_block_type = get_pageblock_migratetype(page);
+
+	/*
+	 * This can happen due to races and we want to prevent broken
+	 * highatomic accounting.
+	 */
+	if (is_migrate_highatomic(old_block_type))
+		goto single_page;
 
 	/* Take ownership for orders >= pageblock_order */
 	if (current_order >= pageblock_order) {
 		change_pageblock_range(page, current_order, start_type);
-		return;
+		goto single_page;
+	}
+
+	/* We are not allowed to try stealing from the whole block */
+	if (!whole_block)
+		goto single_page;
+
+	free_pages = move_freepages_block(zone, page, start_type,
+						&movable_pages);
+	/*
+	 * Determine how many pages are compatible with our allocation.
+	 * For movable allocation, it's the number of movable pages which
+	 * we just obtained. For other types it's a bit more tricky.
+	 */
+	if (start_type == MIGRATE_MOVABLE) {
+		alike_pages = movable_pages;
+	} else {
+		/*
+		 * If we are falling back a RECLAIMABLE or UNMOVABLE allocation
+		 * to MOVABLE pageblock, consider all non-movable pages as
+		 * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or
+		 * vice versa, be conservative since we can't distinguish the
+		 * exact migratetype of non-movable pages.
+		 */
+		if (old_block_type == MIGRATE_MOVABLE)
+			alike_pages = pageblock_nr_pages
+						- (free_pages + movable_pages);
+		else
+			alike_pages = 0;
 	}
 
-	pages = move_freepages_block(zone, page, start_type);
+	/* moving whole block can fail due to zone boundary conditions */
+	if (!free_pages)
+		goto single_page;
 
-	/* Claim the whole block if over half of it is free */
-	if (pages >= (1 << (pageblock_order-1)) ||
+	/*
+	 * If a sufficient number of pages in the block are either free or of
+	 * comparable migratability as our allocation, claim the whole block.
+	 */
+	if (free_pages + alike_pages >= (1 << (pageblock_order-1)) ||
 			page_group_by_mobility_disabled)
 		set_pageblock_migratetype(page, start_type);
+
+	return;
+
+single_page:
+	area = &zone->free_area[current_order];
+	list_move(&page->lru, &area->free_list[start_type]);
 }
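The claim condition in steal_suitable_fallback() is easiest to see with numbers. A worked example with hypothetical counts, assuming 4 KB pages and pageblock_order == 9, so pageblock_nr_pages == 512 and the threshold (1 << (pageblock_order - 1)) == 256, for a MOVABLE allocation falling back into an UNMOVABLE pageblock:

	/*
	 *   free_pages    = 150   free pages, just moved to the MOVABLE list
	 *   movable_pages = 130   allocated but movable (LRU / __PageMovable)
	 *   alike_pages   = 130   (start_type == MIGRATE_MOVABLE)
	 *
	 *   free_pages + alike_pages = 280 >= 256
	 *
	 * so the whole pageblock is relabelled MIGRATE_MOVABLE. The other
	 * 232 pages are presumed unmovable, but once freed they will land
	 * on movable free lists, which the heuristic accepts as the lesser
	 * fragmentation evil.
	 */

If the sum falls short, only the single stolen page changes lists (the single_page: path) and the pageblock keeps its type.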
 
 /*
@@ -2042,11 +2094,11 @@ static void reserve_highatomic_pageblock(struct page *page, struct zone *zone,
 
 	/* Yoink! */
 	mt = get_pageblock_migratetype(page);
-	if (mt != MIGRATE_HIGHATOMIC &&
-			!is_migrate_isolate(mt) && !is_migrate_cma(mt)) {
+	if (!is_migrate_highatomic(mt) && !is_migrate_isolate(mt)
+	    && !is_migrate_cma(mt)) {
 		zone->nr_reserved_highatomic += pageblock_nr_pages;
 		set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC);
-		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC);
+		move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL);
 	}
 
 out_unlock:
@@ -2100,8 +2152,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			 * from highatomic to ac->migratetype. So we should
 			 * adjust the count once.
 			 */
-			if (get_pageblock_migratetype(page) ==
-							MIGRATE_HIGHATOMIC) {
+			if (is_migrate_highatomic_page(page)) {
 				/*
 				 * It should never happen but changes to
 				 * locking could inadvertently allow a per-cpu
@@ -2124,7 +2175,8 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
 			 * may increase.
*/ set_pageblock_migratetype(page, ac->migratetype); - ret = move_freepages_block(zone, page, ac->migratetype); + ret = move_freepages_block(zone, page, ac->migratetype, + NULL); if (ret) { spin_unlock_irqrestore(&zone->lock, flags); return ret; @@ -2136,8 +2188,13 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, return false; } -/* Remove an element from the buddy allocator from the fallback list */ -static inline struct page * +/* + * Try finding a free buddy page on the fallback list and put it on the free + * list of requested migratetype, possibly along with other pages from the same + * block, depending on fragmentation avoidance heuristics. Returns true if + * fallback was found so that __rmqueue_smallest() can grab it. + */ +static inline bool __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; @@ -2158,33 +2215,17 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal && - get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) - steal_suitable_fallback(zone, page, start_migratetype); - - /* Remove the page from the freelists */ - area->nr_free--; - list_del(&page->lru); - rmv_page_order(page); - expand(zone, page, order, current_order, area, - start_migratetype); - /* - * The pcppage_migratetype may differ from pageblock's - * migratetype depending on the decisions in - * find_suitable_fallback(). This is OK as long as it does not - * differ for MIGRATE_CMA pageblocks. Those can be used as - * fallback only via special __rmqueue_cma_fallback() function - */ - set_pcppage_migratetype(page, start_migratetype); + steal_suitable_fallback(zone, page, start_migratetype, + can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); - return page; + return true; } - return NULL; + return false; } /* @@ -2196,13 +2237,14 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order, { struct page *page; +retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { if (migratetype == MIGRATE_MOVABLE) page = __rmqueue_cma_fallback(zone, order); - if (!page) - page = __rmqueue_fallback(zone, order, migratetype); + if (!page && __rmqueue_fallback(zone, order, migratetype)) + goto retry; } trace_mm_page_alloc_zone_locked(page, order, migratetype); @@ -2219,9 +2261,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, int migratetype, bool cold) { int i, alloced = 0; - unsigned long flags; - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { struct page *page = __rmqueue(zone, order, migratetype); if (unlikely(page == NULL)) @@ -2257,7 +2298,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * pages added to the pcp list. 
*/ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); return alloced; } @@ -2485,25 +2526,22 @@ void free_hot_cold_page(struct page *page, bool cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; + unsigned long flags; unsigned long pfn = page_to_pfn(page); int migratetype; - if (in_interrupt()) { - __free_pages_ok(page, 0); - return; - } - if (!free_pcp_prepare(page)) return; migratetype = get_pfnblock_migratetype(page, pfn); set_pcppage_migratetype(page, migratetype); - preempt_disable(); + local_irq_save(flags); + __count_vm_event(PGFREE); /* * We only track unmovable, reclaimable and movable on pcp lists. * Free ISOLATE pages back to the allocator because they are being - * offlined but treat RESERVE as movable pages so we can get those + * offlined but treat HIGHATOMIC as movable pages so we can get those * areas back if necessary. Otherwise, we may have to free * excessively into the page allocator */ @@ -2515,7 +2553,6 @@ void free_hot_cold_page(struct page *page, bool cold) migratetype = MIGRATE_MOVABLE; } - __count_vm_event(PGFREE); pcp = &this_cpu_ptr(zone->pageset)->pcp; if (!cold) list_add(&page->lru, &pcp->lists[migratetype]); @@ -2529,7 +2566,7 @@ void free_hot_cold_page(struct page *page, bool cold) } out: - preempt_enable(); + local_irq_restore(flags); } /* @@ -2614,7 +2651,7 @@ int __isolate_free_page(struct page *page, unsigned int order) for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) - && mt != MIGRATE_HIGHATOMIC) + && !is_migrate_highatomic(mt)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } @@ -2654,8 +2691,6 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, { struct page *page; - VM_BUG_ON(in_interrupt()); - do { if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, @@ -2686,8 +2721,9 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct list_head *list; bool cold = ((gfp_flags & __GFP_COLD) != 0); struct page *page; + unsigned long flags; - preempt_disable(); + local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); @@ -2695,7 +2731,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); } - preempt_enable(); + local_irq_restore(flags); return page; } @@ -2711,7 +2747,7 @@ struct page *rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page; - if (likely(order == 0) && !in_interrupt()) { + if (likely(order == 0)) { page = rmqueue_pcplist(preferred_zone, zone, order, gfp_flags, migratetype); goto out; @@ -3113,8 +3149,7 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) 
static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || - debug_guardpage_minorder() > 0) + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) return; pr_warn("%s: ", current->comm); @@ -3248,14 +3283,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, enum compact_priority prio, enum compact_result *compact_result) { struct page *page; + unsigned int noreclaim_flag; if (!order) return NULL; - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac, prio); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); if (*compact_result <= COMPACT_INACTIVE) return NULL; @@ -3402,12 +3438,13 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, { struct reclaim_state reclaim_state; int progress; + unsigned int noreclaim_flag; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3417,7 +3454,7 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); cond_resched(); @@ -3524,20 +3561,13 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) return false; } -/* - * Maximum number of reclaim retries without any progress before OOM killer - * is consider as the only way to move forward. - */ -#define MAX_RECLAIM_RETRIES 16 - /* * Checks whether it makes sense to retry the reclaim to make a forward progress * for the given allocation request. - * The reclaim feedback represented by did_some_progress (any progress during - * the last reclaim round) and no_progress_loops (number of reclaim rounds without - * any progress in a row) is considered as well as the reclaimable pages on the - * applicable zone list (with a backoff mechanism which is a function of - * no_progress_loops). + * + * We give up when we either have tried MAX_RECLAIM_RETRIES in a row + * without success, or when we couldn't even meet the watermark if we + * reclaimed all remaining pages on the LRU lists. * * Returns true if a retry is viable or false to enter the oom path. */ @@ -3582,13 +3612,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, bool wmark; available = reclaimable = zone_reclaimable_pages(zone); - available -= DIV_ROUND_UP((*no_progress_loops) * available, - MAX_RECLAIM_RETRIES); available += zone_page_state_snapshot(zone, NR_FREE_PAGES); /* - * Would the allocation succeed if we reclaimed the whole - * available? + * Would the allocation succeed if we reclaimed all + * reclaimable pages? 
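
Stripped of zone iteration and locking, the retry policy described above reduces to two conditions. A minimal sketch with simplified parameters (the kernel feeds the combined page count to __zone_watermark_ok() rather than comparing directly, and the retry-limit constant is moved out of page_alloc.c by this series):

#include <stdbool.h>

#define MAX_RECLAIM_RETRIES 16

static bool should_retry_reclaim(int no_progress_loops,
                                 unsigned long free_pages,
                                 unsigned long reclaimable_pages,
                                 unsigned long min_watermark)
{
        /* condition 1: too many reclaim rounds without any progress */
        if (no_progress_loops > MAX_RECLAIM_RETRIES)
                return false;

        /*
         * Condition 2: even reclaiming every remaining LRU page would
         * not lift us over the watermark.  Note there is no longer a
         * backoff proportional to no_progress_loops applied here.
         */
        return free_pages + reclaimable_pages > min_watermark;
}
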
*/ wmark = __zone_watermark_ok(zone, order, min_wmark, ac_classzone_idx(ac), alloc_flags, available); @@ -3639,6 +3667,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct alloc_context *ac) { bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM; + const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER; struct page *page = NULL; unsigned int alloc_flags; unsigned long did_some_progress; @@ -3706,12 +3735,17 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, /* * For costly allocations, try direct compaction first, as it's likely - * that we have enough base pages and don't need to reclaim. Don't try - * that for allocations that are allowed to ignore watermarks, as the - * ALLOC_NO_WATERMARKS attempt didn't yet happen. + * that we have enough base pages and don't need to reclaim. For non- + * movable high-order allocations, do that as well, as compaction will + * try to prevent permanent fragmentation by migrating from blocks of the + * same migratetype. + * Don't try this for allocations that are allowed to ignore + * watermarks, as the ALLOC_NO_WATERMARKS attempt didn't yet happen. */ - if (can_direct_reclaim && order > PAGE_ALLOC_COSTLY_ORDER && - !gfp_pfmemalloc_allowed(gfp_mask)) { + if (can_direct_reclaim && + (costly_order || + (order > 0 && ac->migratetype != MIGRATE_MOVABLE)) + && !gfp_pfmemalloc_allowed(gfp_mask)) { page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac, INIT_COMPACT_PRIORITY, @@ -3723,7 +3757,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * Checks for costly allocations with __GFP_NORETRY, which * includes THP page fault allocations */ - if (gfp_mask & __GFP_NORETRY) { + if (costly_order && (gfp_mask & __GFP_NORETRY)) { /* * If compaction is deferred for high-order allocations, * it is because sync compaction recently failed. If @@ -3774,7 +3808,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, /* Make sure we know about allocations which stall for too long */ if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask, ac->nodemask, + warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, "page allocation stalls for %ums, order:%u", jiffies_to_msecs(jiffies-alloc_start), order); stall_timeout += 10 * HZ; @@ -3804,7 +3838,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * Do not retry costly high order allocations unless they are * __GFP_REPEAT */ - if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_REPEAT)) + if (costly_order && !(gfp_mask & __GFP_REPEAT)) goto nopage; if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, @@ -3974,10 +4008,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, goto out; /* - * Runtime PM, block IO and its error handling path can deadlock - * because I/O on the device might not complete. + * Apply scoped allocation constraints. This is mainly about GFP_NOFS + * or GFP_NOIO, which has to be inherited by all allocation requests + * from a particular context which has been marked by + * memalloc_no{fs,io}_{save,restore}. */ - alloc_mask = memalloc_noio_flags(gfp_mask); + alloc_mask = current_gfp_context(gfp_mask); ac.spread_dirty_pages = false; /* @@ -4250,7 +4286,8 @@ EXPORT_SYMBOL(free_pages_exact); * nr_free_zone_pages() counts the number of pages which are beyond the * high watermark within all zones at or below a given zone index.
For each * zone, the number of pages is calculated as: - * managed_pages - high_pages + * + * nr_free_zone_pages = managed_pages - high_pages */ static unsigned long nr_free_zone_pages(int offset) { @@ -4512,7 +4549,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif " writeback_tmp:%lukB" " unstable:%lukB" - " pages_scanned:%lu" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -4535,8 +4571,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), K(node_page_state(pgdat, NR_UNSTABLE_NFS)), - node_page_state(pgdat, NR_PAGES_SCANNED), - !pgdat_reclaimable(pgdat) ? "yes" : "no"); + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? + "yes" : "no"); } for_each_populated_zone(zone) { @@ -7431,7 +7467,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .zone = page_zone(pfn_to_page(start)), .mode = MIGRATE_SYNC, .ignore_skip_hint = true, - .gfp_mask = memalloc_noio_flags(gfp_mask), + .gfp_mask = current_gfp_context(gfp_mask), }; INIT_LIST_HEAD(&cc.migratepages); diff --git a/mm/page_ext.c b/mm/page_ext.c index 121dcffc4ec1768a6fc71dda8af878aa0d16cb92..88ccc044b09a41504fb212afdb5ca8a45e842998 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -59,9 +59,6 @@ static struct page_ext_operations *page_ext_ops[] = { &debug_guardpage_ops, -#ifdef CONFIG_PAGE_POISONING - &page_poisoning_ops, -#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif @@ -127,15 +124,12 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. - * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (unlikely(!base)) return NULL; @@ -204,15 +198,12 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) || defined(CONFIG_PAGE_POISONING) +#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. 
- * - * This check is also necessary for ensuring page poisoning - * works as expected when enabled */ if (!section->page_ext) return NULL; diff --git a/mm/page_idle.c b/mm/page_idle.c index b0ee56c56b5850f32f5b3157f9b142788840f50f..1b0f48c62316b3ebf1e09cc7b0a16f413db7e9ae 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -50,7 +50,7 @@ static struct page *page_idle_get_page(unsigned long pfn) return page; } -static int page_idle_clear_pte_refs_one(struct page *page, +static bool page_idle_clear_pte_refs_one(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg) { @@ -84,7 +84,7 @@ static int page_idle_clear_pte_refs_one(struct page *page, */ set_page_young(page); } - return SWAP_AGAIN; + return true; } static void page_idle_clear_pte_refs(struct page *page) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f4e17a57926afffa33fceabf81a742ffc254d914..5092e4ef00c832fefb624a1d9d8f618402ad4402 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -66,7 +66,8 @@ static int set_migratetype_isolate(struct page *page, set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; - nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE); + nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, + NULL); __mod_zone_freepage_state(zone, -nr_pages, migratetype); } @@ -88,7 +89,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (!is_migrate_isolate_page(page)) goto out; /* @@ -120,7 +121,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * pageblock scanning for freepage moving. */ if (!isolated_page) { - nr_pages = move_freepages_block(zone, page, migratetype); + nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } set_pageblock_migratetype(page, migratetype); @@ -205,7 +206,7 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (!page || get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (!page || !is_migrate_isolate_page(page)) continue; unset_migratetype_isolate(page, migratetype); } @@ -262,7 +263,7 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, */ for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + if (page && !is_migrate_isolate_page(page)) break; } page = __first_valid_page(start_pfn, end_pfn - start_pfn); diff --git a/mm/page_poison.c b/mm/page_poison.c index 2e647c65916b91b00177837370e210d71c568f5e..be19e989ccff51f667c9698c9cb8fea3d5303f8e 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -6,7 +6,6 @@ #include #include -static bool __page_poisoning_enabled __read_mostly; static bool want_page_poisoning __read_mostly; static int early_page_poison_param(char *buf) @@ -18,75 +17,22 @@ static int early_page_poison_param(char *buf) early_param("page_poison", early_page_poison_param); bool page_poisoning_enabled(void) -{ - return __page_poisoning_enabled; -} - -static bool need_page_poisoning(void) -{ - return want_page_poisoning; -} - -static void init_page_poisoning(void) { /* - * page poisoning is debug page alloc for some arches. 
If either - * of those options are enabled, enable poisoning + * Assumes that debug_pagealloc_enabled is set before + * free_all_bootmem. + * Page poisoning is debug page alloc for some arches. If + * either of those options is enabled, enable poisoning. */ - if (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC)) { - if (!want_page_poisoning && !debug_pagealloc_enabled()) - return; - } else { - if (!want_page_poisoning) - return; - } - - __page_poisoning_enabled = true; -} - -struct page_ext_operations page_poisoning_ops = { - .need = need_page_poisoning, - .init = init_page_poisoning, -}; - -static inline void set_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -static inline void clear_page_poison(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return; - - __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); -} - -bool page_is_poisoned(struct page *page) -{ - struct page_ext *page_ext; - - page_ext = lookup_page_ext(page); - if (unlikely(!page_ext)) - return false; - - return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); + return (want_page_poisoning || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())); } static void poison_page(struct page *page) { void *addr = kmap_atomic(page); - set_page_poison(page); memset(addr, PAGE_POISON, PAGE_SIZE); kunmap_atomic(addr); } @@ -140,12 +86,13 @@ static void unpoison_page(struct page *page) { void *addr; - if (!page_is_poisoned(page)) - return; - addr = kmap_atomic(page); + /* + * Page poisoning, when enabled, poisons each and every page + * that is freed to buddy. Thus no extra check is done to + * see if a page was poisoned. + */ check_poison_mem(addr, PAGE_SIZE); - clear_page_poison(page); kunmap_atomic(addr); } diff --git a/mm/percpu.c b/mm/percpu.c index 60a6488e9e6d49d5e9c5d4b611a5f5b037342316..e0aa8ae7bde708188e2d6ba84dbdc12dcf11f52e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1284,18 +1284,7 @@ void free_percpu(void __percpu *ptr) } EXPORT_SYMBOL_GPL(free_percpu); -/** - * is_kernel_percpu_address - test whether address is from static percpu area - * @addr: address to test - * - * Test whether @addr belongs to in-kernel static percpu area. Module - * static percpu areas are not considered. For those, use - * is_module_percpu_address(). - * - * RETURNS: - * %true if @addr is from in-kernel static percpu area, %false otherwise. - */ -bool is_kernel_percpu_address(unsigned long addr) +bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr) { #ifdef CONFIG_SMP const size_t static_size = __per_cpu_end - __per_cpu_start; @@ -1304,15 +1293,38 @@ bool is_kernel_percpu_address(unsigned long addr) for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); + void *va = (void *)addr; - if ((void *)addr >= start && (void *)addr < start + static_size) + if (va >= start && va < start + static_size) { + if (can_addr) { + *can_addr = (unsigned long) (va - start); + *can_addr += (unsigned long) + per_cpu_ptr(base, get_boot_cpu_id()); + } return true; - } + } + } #endif /* on UP, can't distinguish from other static vars, always false */ return false; } +/** + * is_kernel_percpu_address - test whether address is from static percpu area + * @addr: address to test + * + * Test whether @addr belongs to in-kernel static percpu area.
Module + * static percpu areas are not considered. For those, use + * is_module_percpu_address(). + * + * RETURNS: + * %true if @addr is from in-kernel static percpu area, %false otherwise. + */ +bool is_kernel_percpu_address(unsigned long addr) +{ + return __is_kernel_percpu_address(addr, NULL); +} + /** * per_cpu_ptr_to_phys - convert translated percpu address to physical address * @addr: the address to be converted to physical address diff --git a/mm/rmap.c b/mm/rmap.c index f6838015810f5610abe039daec170aa1da634422..d405f0e0ee9651b40dceac3f45a851469b576e48 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -430,7 +430,7 @@ static void anon_vma_ctor(void *data) void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), - 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, + 0, SLAB_TYPESAFE_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT, anon_vma_ctor); anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC|SLAB_ACCOUNT); @@ -481,7 +481,7 @@ struct anon_vma *page_get_anon_vma(struct page *page) * If this page is still mapped, then its anon_vma cannot have been * freed. But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: - * SLAB_DESTROY_BY_RCU guarantees that - so the atomic_inc_not_zero() + * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). */ if (!page_mapped(page)) { @@ -724,7 +724,7 @@ struct page_referenced_arg { /* * arg: page_referenced_arg will be passed */ -static int page_referenced_one(struct page *page, struct vm_area_struct *vma, +static bool page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_referenced_arg *pra = arg; @@ -741,7 +741,7 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) { page_vma_mapped_walk_done(&pvmw); pra->vm_flags |= VM_LOCKED; - return SWAP_FAIL; /* To break the loop */ + return false; /* To break the loop */ } if (pvmw.pte) { @@ -781,9 +781,9 @@ static int page_referenced_one(struct page *page, struct vm_area_struct *vma, } if (!pra->mapcount) - return SWAP_SUCCESS; /* To break the loop */ + return false; /* To break the loop */ - return SWAP_AGAIN; + return true; } static bool invalid_page_referenced_vma(struct vm_area_struct *vma, void *arg) @@ -812,7 +812,6 @@ int page_referenced(struct page *page, struct mem_cgroup *memcg, unsigned long *vm_flags) { - int ret; int we_locked = 0; struct page_referenced_arg pra = { .mapcount = total_mapcount(page), @@ -846,7 +845,7 @@ int page_referenced(struct page *page, rwc.invalid_vma = invalid_page_referenced_vma; } - ret = rmap_walk(page, &rwc); + rmap_walk(page, &rwc); *vm_flags = pra.vm_flags; if (we_locked) @@ -855,7 +854,7 @@ int page_referenced(struct page *page, return pra.referenced; } -static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, +static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct page_vma_mapped_walk pvmw = { @@ -908,7 +907,7 @@ static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, } } - return SWAP_AGAIN; + return true; } static bool invalid_mkclean_vma(struct vm_area_struct *vma, void *arg) @@ -1159,7 +1158,7 @@ void page_add_file_rmap(struct page *page, bool compound) goto out; } __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, nr); - mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, nr); + 
mod_memcg_page_state(page, NR_FILE_MAPPED, nr); out: unlock_page_memcg(page); } @@ -1199,7 +1198,7 @@ static void page_remove_file_rmap(struct page *page, bool compound) * pte lock(a spinlock) is held, which implies preemption disabled. */ __mod_node_page_state(page_pgdat(page), NR_FILE_MAPPED, -nr); - mem_cgroup_update_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, -nr); + mod_memcg_page_state(page, NR_FILE_MAPPED, -nr); if (unlikely(PageMlocked(page))) clear_page_mlock(page); @@ -1288,15 +1287,10 @@ void page_remove_rmap(struct page *page, bool compound) */ } -struct rmap_private { - enum ttu_flags flags; - int lazyfreed; -}; - /* * @arg: enum ttu_flags will be passed to this argument */ -static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, +static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; @@ -1307,13 +1301,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, }; pte_t pteval; struct page *subpage; - int ret = SWAP_AGAIN; - struct rmap_private *rp = arg; - enum ttu_flags flags = rp->flags; + bool ret = true; + enum ttu_flags flags = (enum ttu_flags)arg; /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) - return SWAP_AGAIN; + return true; if (flags & TTU_SPLIT_HUGE_PMD) { split_huge_pmd_address(vma, address, @@ -1336,7 +1329,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, */ mlock_vma_page(page); } - ret = SWAP_MLOCK; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1354,7 +1347,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pvmw.pte)) { - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1424,18 +1417,34 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Store the swap location in the pte. * See handle_pte_fault() ... */ - VM_BUG_ON_PAGE(!PageSwapCache(page), page); + if (unlikely(PageSwapBacked(page) != PageSwapCache(page))) { + WARN_ON_ONCE(1); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; + } + + /* MADV_FREE page check */ + if (!PageSwapBacked(page)) { + if (!PageDirty(page)) { + dec_mm_counter(mm, MM_ANONPAGES); + goto discard; + } - if (!PageDirty(page) && (flags & TTU_LZFREE)) { - /* It's a freeable page by MADV_FREE */ - dec_mm_counter(mm, MM_ANONPAGES); - rp->lazyfreed++; - goto discard; + /* + * If the page was redirtied, it cannot be + * discarded. Remap the page to page table. + */ + set_pte_at(mm, address, pvmw.pte, pteval); + SetPageSwapBacked(page); + ret = false; + page_vma_mapped_walk_done(&pvmw); + break; } if (swap_duplicate(entry) < 0) { set_pte_at(mm, address, pvmw.pte, pteval); - ret = SWAP_FAIL; + ret = false; page_vma_mapped_walk_done(&pvmw); break; } @@ -1492,24 +1501,14 @@ static int page_mapcount_is_zero(struct page *page) * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. - * Return values are: * - * SWAP_SUCCESS - we succeeded in removing all mappings - * SWAP_AGAIN - we missed a mapping, try again later - * SWAP_FAIL - the page is unswappable - * SWAP_MLOCK - page is mlocked. + * If unmap is successful, return true. Otherwise, false. 
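
With the SWAP_* codes gone, every ->rmap_one() implementation in this file follows the same bool protocol. A self-contained sketch of the walk loop and callback contract (generic names and types, not the kernel's rmap_walk_control itself):

#include <stdbool.h>
#include <stddef.h>

struct walk_control {
        /* visit one mapping; return false to terminate the walk early */
        bool (*rmap_one)(void *item, void *arg);
        /* optional: return nonzero once there is nothing left to do */
        int (*done)(void *arg);
        void *arg;
};

static void walk(void **items, size_t n, const struct walk_control *wc)
{
        size_t i;

        for (i = 0; i < n; i++) {
                if (!wc->rmap_one(items[i], wc->arg))
                        break;          /* callback asked us to stop */
                if (wc->done && wc->done(wc->arg))
                        break;          /* e.g. mapcount dropped to zero */
        }
}

try_to_unmap() itself then derives success from page_mapcount() after the walk, rather than from the walk's return value.
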
*/ -int try_to_unmap(struct page *page, enum ttu_flags flags) +bool try_to_unmap(struct page *page, enum ttu_flags flags) { - int ret; - struct rmap_private rp = { - .flags = flags, - .lazyfreed = 0, - }; - struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)flags, .done = page_mapcount_is_zero, .anon_lock = page_lock_anon_vma_read, }; @@ -1526,16 +1525,11 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) rwc.invalid_vma = invalid_migration_vma; if (flags & TTU_RMAP_LOCKED) - ret = rmap_walk_locked(page, &rwc); + rmap_walk_locked(page, &rwc); else - ret = rmap_walk(page, &rwc); + rmap_walk(page, &rwc); - if (ret != SWAP_MLOCK && !page_mapcount(page)) { - ret = SWAP_SUCCESS; - if (rp.lazyfreed && !PageDirty(page)) - ret = SWAP_LZFREE; - } - return ret; + return !page_mapcount(page) ? true : false; } static int page_not_mapped(struct page *page) @@ -1550,34 +1544,22 @@ static int page_not_mapped(struct page *page) * Called from munlock code. Checks all of the VMAs mapping the page * to make sure nobody else has this page mlocked. The page will be * returned with PG_mlocked cleared if no other vmas have it mlocked. - * - * Return values are: - * - * SWAP_AGAIN - no vma is holding page mlocked, or, - * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem - * SWAP_FAIL - page cannot be located at present - * SWAP_MLOCK - page is now mlocked. */ -int try_to_munlock(struct page *page) -{ - int ret; - struct rmap_private rp = { - .flags = TTU_MUNLOCK, - .lazyfreed = 0, - }; +void try_to_munlock(struct page *page) +{ struct rmap_walk_control rwc = { .rmap_one = try_to_unmap_one, - .arg = &rp, + .arg = (void *)TTU_MUNLOCK, .done = page_not_mapped, .anon_lock = page_lock_anon_vma_read, }; VM_BUG_ON_PAGE(!PageLocked(page) || PageLRU(page), page); + VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page); - ret = rmap_walk(page, &rwc); - return ret; + rmap_walk(page, &rwc); } void __put_anon_vma(struct anon_vma *anon_vma) @@ -1625,13 +1607,12 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. */ -static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct anon_vma *anon_vma; pgoff_t pgoff_start, pgoff_end; struct anon_vma_chain *avc; - int ret = SWAP_AGAIN; if (locked) { anon_vma = page_anon_vma(page); @@ -1641,7 +1622,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, anon_vma = rmap_walk_anon_lock(page, rwc); } if (!anon_vma) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1655,8 +1636,7 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) break; if (rwc->done && rwc->done(page)) break; @@ -1664,7 +1644,6 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, if (!locked) anon_vma_unlock_read(anon_vma); - return ret; } /* @@ -1680,13 +1659,12 @@ static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, * vm_flags for that VMA. That should be OK, because that vma shouldn't be * LOCKED. 
*/ -static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, +static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, bool locked) { struct address_space *mapping = page_mapping(page); pgoff_t pgoff_start, pgoff_end; struct vm_area_struct *vma; - int ret = SWAP_AGAIN; /* * The page lock not only makes sure that page->mapping cannot @@ -1697,7 +1675,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, VM_BUG_ON_PAGE(!PageLocked(page), page); if (!mapping) - return ret; + return; pgoff_start = page_to_pgoff(page); pgoff_end = pgoff_start + hpage_nr_pages(page) - 1; @@ -1712,8 +1690,7 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) continue; - ret = rwc->rmap_one(page, vma, address, rwc->arg); - if (ret != SWAP_AGAIN) + if (!rwc->rmap_one(page, vma, address, rwc->arg)) goto done; if (rwc->done && rwc->done(page)) goto done; @@ -1722,28 +1699,27 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, done: if (!locked) i_mmap_unlock_read(mapping); - return ret; } -int rmap_walk(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk(struct page *page, struct rmap_walk_control *rwc) { if (unlikely(PageKsm(page))) - return rmap_walk_ksm(page, rwc); + rmap_walk_ksm(page, rwc); else if (PageAnon(page)) - return rmap_walk_anon(page, rwc, false); + rmap_walk_anon(page, rwc, false); else - return rmap_walk_file(page, rwc, false); + rmap_walk_file(page, rwc, false); } /* Like rmap_walk, but caller holds relevant rmap lock */ -int rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) +void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc) { /* no ksm support for now */ VM_BUG_ON_PAGE(PageKsm(page), page); if (PageAnon(page)) - return rmap_walk_anon(page, rwc, true); + rmap_walk_anon(page, rwc, true); else - return rmap_walk_file(page, rwc, true); + rmap_walk_file(page, rwc, true); } #ifdef CONFIG_HUGETLB_PAGE diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 0fd21670b513f735e554b340adc0d86938e32d69..6bb4deb12e78b8e3724c40446069e2cd0731b4a4 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -9,11 +9,12 @@ * as published by the Free Software Foundation; version 2 * of the License. 
*/ +#define pr_fmt(fmt) "rodata_test: " fmt + #include #include const int rodata_test_data = 0xC3; -EXPORT_SYMBOL_GPL(rodata_test_data); void rodata_test(void) { @@ -23,20 +24,20 @@ void rodata_test(void) /* test 1: read the value */ /* If this test fails, some previous testrun has clobbered the state */ if (!rodata_test_data) { - pr_err("rodata_test: test 1 fails (start data)\n"); + pr_err("test 1 fails (start data)\n"); return; } /* test 2: write to the variable; this should fault */ if (!probe_kernel_write((void *)&rodata_test_data, - (void *)&zero, sizeof(zero))) { - pr_err("rodata_test: test data was not read only\n"); + (void *)&zero, sizeof(zero))) { + pr_err("test data was not read only\n"); return; } /* test 3: check the value hasn't changed */ if (rodata_test_data == zero) { - pr_err("rodata_test: test data was changed\n"); + pr_err("test data was changed\n"); return; } @@ -44,13 +45,13 @@ void rodata_test(void) start = (unsigned long)__start_rodata; end = (unsigned long)__end_rodata; if (start & (PAGE_SIZE - 1)) { - pr_err("rodata_test: start of .rodata is not page size aligned\n"); + pr_err("start of .rodata is not page size aligned\n"); return; } if (end & (PAGE_SIZE - 1)) { - pr_err("rodata_test: end of .rodata is not page size aligned\n"); + pr_err("end of .rodata is not page size aligned\n"); return; } - pr_info("rodata_test: all tests were successful\n"); + pr_info("all tests were successful\n"); } diff --git a/mm/slab.c b/mm/slab.c index 807d86c769088681b47f41c0cc0a721307bd16c1..2a31ee3c5814f192234385303a0d86cc77790580 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1728,7 +1728,7 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) freelist = page->freelist; slab_destroy_debugcheck(cachep, page); - if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) + if (unlikely(cachep->flags & SLAB_TYPESAFE_BY_RCU)) call_rcu(&page->rcu_head, kmem_rcu_free); else kmem_freepages(cachep, page); @@ -1924,7 +1924,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, cachep->num = 0; - if (cachep->ctor || flags & SLAB_DESTROY_BY_RCU) + if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU) return false; left = calculate_slab_order(cachep, size, @@ -2030,7 +2030,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (size < 4096 || fls(size - 1) == fls(size-1 + REDZONE_ALIGN + 2 * sizeof(unsigned long long))) flags |= SLAB_RED_ZONE | SLAB_STORE_USER; - if (!(flags & SLAB_DESTROY_BY_RCU)) + if (!(flags & SLAB_TYPESAFE_BY_RCU)) flags |= SLAB_POISON; #endif #endif @@ -3879,7 +3879,12 @@ static int __do_tune_cpucache(struct kmem_cache *cachep, int limit, prev = cachep->cpu_cache; cachep->cpu_cache = cpu_cache; - kick_all_cpus_sync(); + /* + * Without a previous cpu_cache there's no need to synchronize remote + * cpus, so skip the IPIs. 
+ */ + if (prev) + kick_all_cpus_sync(); check_irq_on(); cachep->batchcount = batchcount; diff --git a/mm/slab.h b/mm/slab.h index 65e7c3fcac72790acece0ac140d864151f95f166..9cfcf099709c19cfc8b5070325a0527c763eddaa 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -126,7 +126,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, /* Legal flag mask for kmem_cache_create(), for various configurations */ #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \ - SLAB_DESTROY_BY_RCU | SLAB_DEBUG_OBJECTS ) + SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS ) #if defined(CONFIG_DEBUG_SLAB) #define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) @@ -415,7 +415,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) * back there or track user information then we can * only use the space before that information. */ - if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER)) + if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) return s->inuse; /* * Else we can use all the padding etc for the allocation diff --git a/mm/slab_common.c b/mm/slab_common.c index 09d0e849b07f47d82f5d9a5cda4862517d297cb2..01a0fe2eb33267f8f04f7e90fd79358cd1f41d07 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -39,7 +39,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, * Set of flags that will prevent slab merging */ #define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \ SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ @@ -500,7 +500,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work) struct kmem_cache *s, *s2; /* - * On destruction, SLAB_DESTROY_BY_RCU kmem_caches are put on the + * On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the * @slab_caches_to_rcu_destroy list. The slab pages are freed * through RCU and and the associated kmem_cache are dereferenced * while freeing the pages, so the kmem_caches should be freed only @@ -537,7 +537,7 @@ static int shutdown_cache(struct kmem_cache *s) memcg_unlink_cache(s); list_del(&s->list); - if (s->flags & SLAB_DESTROY_BY_RCU) { + if (s->flags & SLAB_TYPESAFE_BY_RCU) { list_add_tail(&s->list, &slab_caches_to_rcu_destroy); schedule_work(&slab_caches_to_rcu_destroy_work); } else { diff --git a/mm/slob.c b/mm/slob.c index eac04d4357ec6b8d653de4c30c96ffdd97974462..1bae78d71096ad26bbe4575f631f3fad7b694388 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -126,7 +126,7 @@ static inline void clear_slob_page_free(struct page *sp) /* * struct slob_rcu is inserted at the tail of allocated slob blocks, which - * were created with a SLAB_DESTROY_BY_RCU slab. slob_rcu is used to free + * were created with a SLAB_TYPESAFE_BY_RCU slab. slob_rcu is used to free * the block using call_rcu. 
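
The SLAB_DESTROY_BY_RCU to SLAB_TYPESAFE_BY_RCU rename running through these files is cosmetic; the semantics are unchanged: memory stays type-stable across an RCU grace period, but an object may be freed and reused while a reader still holds a pointer. A sketch of the validation pattern this implies, modeled on the page_get_anon_vma() comment earlier in this section (the cache and refcount layout here are illustrative, not kernel structures):

#include <linux/atomic.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct obj {
        atomic_t refcount;      /* zero once the object is logically dead */
        /* ... payload ... */
};

static struct kmem_cache *obj_cachep;

static int __init obj_cache_init(void)
{
        obj_cachep = kmem_cache_create("obj", sizeof(struct obj), 0,
                                       SLAB_TYPESAFE_BY_RCU, NULL);
        return obj_cachep ? 0 : -ENOMEM;
}

/*
 * Under rcu_read_lock() the memory cannot go back to the page
 * allocator, but it may already belong to a *new* obj.  Only a
 * conditional refcount increment makes the reference safe to use.
 */
static struct obj *obj_tryget(struct obj *o)
{
        struct obj *ret = NULL;

        rcu_read_lock();
        if (atomic_inc_not_zero(&o->refcount))
                ret = o;
        rcu_read_unlock();
        return ret;
}
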
*/ struct slob_rcu { @@ -524,7 +524,7 @@ EXPORT_SYMBOL(ksize); int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) { - if (flags & SLAB_DESTROY_BY_RCU) { + if (flags & SLAB_TYPESAFE_BY_RCU) { /* leave room for rcu footer at the end of object */ c->size += sizeof(struct slob_rcu); } @@ -598,7 +598,7 @@ static void kmem_rcu_free(struct rcu_head *head) void kmem_cache_free(struct kmem_cache *c, void *b) { kmemleak_free_recursive(b, c->flags); - if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { + if (unlikely(c->flags & SLAB_TYPESAFE_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); slob_rcu->size = c->size; diff --git a/mm/slub.c b/mm/slub.c index 7f4bc7027ed53536efaaf5663f007bd2442de503..57e5156f02be6bcc23e70ec801e9cc1c3bbdd631 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1687,7 +1687,7 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct page *page) { - if (unlikely(s->flags & SLAB_DESTROY_BY_RCU)) { + if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { struct rcu_head *head; if (need_reserve_slab_rcu) { @@ -2963,7 +2963,7 @@ static __always_inline void slab_free(struct kmem_cache *s, struct page *page, * slab_free_freelist_hook() could have put the items into quarantine. * If so, no need to free them. */ - if (s->flags & SLAB_KASAN && !(s->flags & SLAB_DESTROY_BY_RCU)) + if (s->flags & SLAB_KASAN && !(s->flags & SLAB_TYPESAFE_BY_RCU)) return; do_slab_free(s, page, head, tail, cnt, addr); } @@ -3433,7 +3433,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * the slab may touch the object after free or before allocation * then we should never poison the object itself. */ - if ((flags & SLAB_POISON) && !(flags & SLAB_DESTROY_BY_RCU) && + if ((flags & SLAB_POISON) && !(flags & SLAB_TYPESAFE_BY_RCU) && !s->ctor) s->flags |= __OBJECT_POISON; else @@ -3455,7 +3455,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ s->inuse = size; - if (((flags & (SLAB_DESTROY_BY_RCU | SLAB_POISON)) || + if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || s->ctor)) { /* * Relocate free pointer after the object if it is not @@ -3537,7 +3537,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; - if (need_reserve_slab_rcu && (s->flags & SLAB_DESTROY_BY_RCU)) + if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) s->reserved = sizeof(struct rcu_head); if (!calculate_sizes(s, -1)) @@ -5042,7 +5042,7 @@ SLAB_ATTR_RO(cache_dma); static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); + return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU)); } SLAB_ATTR_RO(destroy_by_rcu); diff --git a/mm/sparse.c b/mm/sparse.c index db6bf3c97ea2cd7e593922ab88840b84d67cdd88..6903c8fc308502eab4ca3570075cd6a881efa724 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -248,10 +248,7 @@ static int __meminit sparse_init_one_section(struct mem_section *ms, unsigned long usemap_size(void) { - unsigned long size_bytes; - size_bytes = roundup(SECTION_BLOCKFLAGS_BITS, 8) / 8; - size_bytes = roundup(size_bytes, sizeof(unsigned long)); - return size_bytes; + return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long); } #ifdef CONFIG_MEMORY_HOTPLUG diff --git a/mm/swap.c b/mm/swap.c index 5dabf444d724db98595567b0f7daed7d53fc877e..98d08b4579faa42b8ef43ba4563ed6cb76f19893 100644 --- a/mm/swap.c +++ 
b/mm/swap.c @@ -46,7 +46,7 @@ int page_cluster; static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs); #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); #endif @@ -97,6 +97,16 @@ static void __put_compound_page(struct page *page) void __put_page(struct page *page) { + if (is_zone_device_page(page)) { + put_dev_pagemap(page->pgmap); + + /* + * The page belongs to the device that created pgmap. Do + * not return it to the page allocator. + */ + return; + } + if (unlikely(PageCompound(page))) __put_compound_page(page); else @@ -561,20 +571,27 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec, } -static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, +static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec, void *arg) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); - int lru = page_lru_base_type(page); + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + !PageUnevictable(page)) { + bool active = PageActive(page); - del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE); + del_page_from_lru_list(page, lruvec, + LRU_INACTIVE_ANON + active); ClearPageActive(page); ClearPageReferenced(page); - add_page_to_lru_list(page, lruvec, lru); + /* + * Lazyfree pages are clean anonymous pages. They have the + * SwapBacked flag cleared, to distinguish them from normal + * anonymous pages. + */ + ClearPageSwapBacked(page); + add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE); - __count_vm_event(PGDEACTIVATE); - update_page_reclaim_stat(lruvec, file, 0); + __count_vm_events(PGLAZYFREE, hpage_nr_pages(page)); + update_page_reclaim_stat(lruvec, 1, 0); } } @@ -604,9 +621,9 @@ void lru_add_drain_cpu(int cpu) if (pagevec_count(pvec)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); - pvec = &per_cpu(lru_deactivate_pvecs, cpu); + pvec = &per_cpu(lru_lazyfree_pvecs, cpu); if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); activate_page_drain(cpu); } @@ -638,22 +655,22 @@ void deactivate_file_page(struct page *page) } /** - * deactivate_page - deactivate a page + * mark_page_lazyfree - make an anon page lazyfree * @page: page to deactivate * - * deactivate_page() moves @page to the inactive list if @page was on the active - * list and was not an unevictable page. This is done to accelerate the reclaim - * of @page. + * mark_page_lazyfree() moves @page to the inactive file list. + * This is done to accelerate the reclaim of @page.
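
mark_page_lazyfree() is the LRU-side backend of madvise(MADV_FREE). From user space the contract is: the kernel may drop the pages at any time under memory pressure, and a later write reinstates the touched page as ordinary anonymous memory. A runnable user-space illustration (MADV_FREE needs Linux 4.5 or later):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_FREE
#define MADV_FREE 8     /* uapi value, in case libc headers are older */
#endif

int main(void)
{
        size_t len = 1UL << 20;
        char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return EXIT_FAILURE;
        }

        memset(p, 0xab, len);           /* dirty all pages */

        /* Contents are now disposable: reclaim may free them without I/O. */
        if (madvise(p, len, MADV_FREE))
                perror("madvise(MADV_FREE)");

        p[0] = 1;       /* redirtied: this one page is no longer lazyfree */

        munmap(p, len);
        return EXIT_SUCCESS;
}
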
*/ -void deactivate_page(struct page *page) +void mark_page_lazyfree(struct page *page) { - if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); + if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) && + !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs); get_page(page); if (!pagevec_add(pvec, page) || PageCompound(page)) - pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); - put_cpu_var(lru_deactivate_pvecs); + pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL); + put_cpu_var(lru_lazyfree_pvecs); } } @@ -693,7 +710,7 @@ void lru_add_drain_all(void) if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) || pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) || pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) || - pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || + pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); queue_work_on(cpu, mm_percpu_wq, work); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index b1ccb58ad397403214a220e4a0ac7901a6b6ae1e..58f6c78f1dad313dc7aa9b2ec31ddab361335fe1 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef CONFIG_SWAP @@ -119,16 +120,18 @@ static int alloc_swap_slot_cache(unsigned int cpu) /* * Do allocation outside swap_slots_cache_mutex - * as vzalloc could trigger reclaim and get_swap_page, + * as kvzalloc could trigger reclaim and get_swap_page, * which can lock swap_slots_cache_mutex. */ - slots = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); + slots = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE, + GFP_KERNEL); if (!slots) return -ENOMEM; - slots_ret = vzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE); + slots_ret = kvzalloc(sizeof(swp_entry_t) * SWAP_SLOTS_CACHE_SIZE, + GFP_KERNEL); if (!slots_ret) { - vfree(slots); + kvfree(slots); return -ENOMEM; } @@ -152,9 +155,9 @@ static int alloc_swap_slot_cache(unsigned int cpu) out: mutex_unlock(&swap_slots_cache_mutex); if (slots) - vfree(slots); + kvfree(slots); if (slots_ret) - vfree(slots_ret); + kvfree(slots_ret); return 0; } @@ -171,7 +174,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, cache->cur = 0; cache->nr = 0; if (free_slots && cache->slots) { - vfree(cache->slots); + kvfree(cache->slots); cache->slots = NULL; } mutex_unlock(&cache->alloc_lock); @@ -186,7 +189,7 @@ static void drain_slots_cache_cpu(unsigned int cpu, unsigned int type, } spin_unlock_irq(&cache->free_lock); if (slots) - vfree(slots); + kvfree(slots); } } @@ -241,8 +244,10 @@ int enable_swap_slots_cache(void) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache", alloc_swap_slot_cache, free_slot_cache); - if (ret < 0) + if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating " + "without swap slots cache.\n", __func__)) goto out_unlock; + swap_slot_cache_initialized = true; __reenable_swap_slots_cache(); out_unlock: diff --git a/mm/swap_state.c b/mm/swap_state.c index 473b71e052a8ed29df7c496af747e2b43491c782..539b8885e3d1d4942dbb905b60b87d40726e3de5 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -360,17 +360,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, /* * We might race against get_swap_page() and stumble * across a SWAP_HAS_CACHE swap_map entry whose page - * has not been brought into the swapcache yet, while - * the other end is scheduled away waiting on discard - * I/O completion at 
scan_swap_map(). - * - * In order to avoid turning this transitory state - * into a permanent loop around this -EEXIST case - * if !CONFIG_PREEMPT and the I/O completion happens - * to be waiting on the CPU waitqueue where we are now - * busy looping, we just conditionally invoke the - * scheduler here, if there are some more important - * tasks to run. + * has not been brought into the swapcache yet. */ cond_resched(); continue; @@ -533,7 +523,7 @@ int init_swap_address_space(unsigned int type, unsigned long nr_pages) unsigned int i, nr; nr = DIV_ROUND_UP(nr_pages, SWAP_ADDRESS_SPACE_PAGES); - spaces = vzalloc(sizeof(struct address_space) * nr); + spaces = kvzalloc(sizeof(struct address_space) * nr, GFP_KERNEL); if (!spaces) return -ENOMEM; for (i = 0; i < nr; i++) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 178130880b908515a105eccf9fa428f7cf61719a..4f6cba1b66322f3500714950ef3a12eeda9e4455 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -335,7 +335,7 @@ static void cluster_list_add_tail(struct swap_cluster_list *list, ci_tail = ci + tail; spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING); cluster_set_next(ci_tail, idx); - unlock_cluster(ci_tail); + spin_unlock(&ci_tail->lock); cluster_set_next_flag(&list->tail, idx, 0); } } @@ -672,6 +672,9 @@ static int scan_swap_map_slots(struct swap_info_struct *si, else goto done; } + si->swap_map[offset] = usage; + inc_cluster_info_page(si, si->cluster_info, offset); + unlock_cluster(ci); if (offset == si->lowest_bit) si->lowest_bit++; @@ -685,9 +688,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si, plist_del(&si->avail_list, &swap_avail_head); spin_unlock(&swap_avail_lock); } - si->swap_map[offset] = usage; - inc_cluster_info_page(si, si->cluster_info, offset); - unlock_cluster(ci); si->cluster_next = offset + 1; slots[n_ret++] = swp_entry(si->type, offset); @@ -1079,8 +1079,6 @@ void swapcache_free_entries(swp_entry_t *entries, int n) p = swap_info_get_cont(entries[i], prev); if (p) swap_entry_free(p, entries[i]); - else - break; prev = p; } if (p) @@ -1111,6 +1109,18 @@ int page_swapcount(struct page *page) return count; } +static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) +{ + int count = 0; + pgoff_t offset = swp_offset(entry); + struct swap_cluster_info *ci; + + ci = lock_cluster_or_swap_info(si, offset); + count = swap_count(si->swap_map[offset]); + unlock_cluster_or_swap_info(si, ci); + return count; +} + /* * How many references to @entry are currently swapped out? * This does not give an exact answer when swap count is continued, @@ -1119,17 +1129,11 @@ int page_swapcount(struct page *page) int __swp_swapcount(swp_entry_t entry) { int count = 0; - pgoff_t offset; struct swap_info_struct *si; - struct swap_cluster_info *ci; si = __swap_info_get(entry); - if (si) { - offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(si, offset); - count = swap_count(si->swap_map[offset]); - unlock_cluster_or_swap_info(si, ci); - } + if (si) + count = swap_swapcount(si, entry); return count; } @@ -1291,7 +1295,8 @@ int free_swap_and_cache(swp_entry_t entry) * Also recheck PageSwapCache now page is locked (above). 
*/ if (PageSwapCache(page) && !PageWriteback(page) && - (!page_mapped(page) || mem_cgroup_swap_full(page))) { + (!page_mapped(page) || mem_cgroup_swap_full(page)) && + !swap_swapcount(p, entry)) { delete_from_swap_cache(page); SetPageDirty(page); } @@ -2265,8 +2270,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; vfree(swap_map); - vfree(cluster_info); - vfree(frontswap_map); + kvfree(cluster_info); + kvfree(frontswap_map); /* Destroy swap account information */ swap_cgroup_swapoff(p->type); exit_swap_address_space(p->type); @@ -2789,7 +2794,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->cluster_next = 1 + (prandom_u32() % p->highest_bit); nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER); - cluster_info = vzalloc(nr_cluster * sizeof(*cluster_info)); + cluster_info = kvzalloc(nr_cluster * sizeof(*cluster_info), + GFP_KERNEL); if (!cluster_info) { error = -ENOMEM; goto bad_swap; @@ -2822,7 +2828,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) } /* frontswap enabled? set up bit-per-page map for frontswap */ if (IS_ENABLED(CONFIG_FRONTSWAP)) - frontswap_map = vzalloc(BITS_TO_LONGS(maxpages) * sizeof(long)); + frontswap_map = kvzalloc(BITS_TO_LONGS(maxpages) * sizeof(long), + GFP_KERNEL); if (p->bdev &&(swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { /* diff --git a/mm/truncate.c b/mm/truncate.c index 6263affdef8866135f28256b5b1f5e598515d204..6479ed2afc53fb9dd8d9719051ea77e7a5b200af 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -67,17 +67,14 @@ static void truncate_exceptional_entry(struct address_space *mapping, /* * Invalidate exceptional entry if easily possible. This handles exceptional - * entries for invalidate_inode_pages() so for DAX it evicts only unlocked and - * clean entries. + * entries for invalidate_inode_pages(). */ static int invalidate_exceptional_entry(struct address_space *mapping, pgoff_t index, void *entry) { - /* Handled by shmem itself */ - if (shmem_mapping(mapping)) + /* Handled by shmem itself, or for DAX we do nothing. */ + if (shmem_mapping(mapping) || dax_mapping(mapping)) return 1; - if (dax_mapping(mapping)) - return dax_invalidate_mapping_entry(mapping, index); clear_shadow_entry(mapping, index, entry); return 1; } @@ -266,9 +263,8 @@ void truncate_inode_pages_range(struct address_space *mapping, pgoff_t index; int i; - cleancache_invalidate_inode(mapping); if (mapping->nrpages == 0 && mapping->nrexceptional == 0) - return; + goto out; /* Offsets within partial pages */ partial_start = lstart & (PAGE_SIZE - 1); @@ -363,7 +359,7 @@ void truncate_inode_pages_range(struct address_space *mapping, * will be released, just zeroed, so we can bail out now. 
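
The control-flow change in the truncate/invalidate functions below has one purpose: the early "nothing to do" returns must still reach cleancache_invalidate_inode(), which now runs once at the end of the function instead of up front, so the exits are rerouted through a single out label. The shape of the fix in isolation (all names here are simplified stand-ins, not kernel code):

struct mapping_sketch {
        unsigned long nrpages;
        unsigned long nrexceptional;
};

static void invalidate_cleancache_sketch(struct mapping_sketch *m)
{
        /* ... notify cleancache that the inode's pages are gone ... */
}

static void truncate_sketch(struct mapping_sketch *mapping)
{
        if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
                goto out;       /* was "return", which skipped the cleanup */

        /* ... walk the radix tree and release pages ... */

out:
        /* reached on every path, exactly once */
        invalidate_cleancache_sketch(mapping);
}
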
*/ if (start >= end) - return; + goto out; index = start; for ( ; ; ) { @@ -410,6 +406,8 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); index++; } + +out: cleancache_invalidate_inode(mapping); } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -623,7 +621,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int ret2 = 0; int did_range_unmap = 0; - cleancache_invalidate_inode(mapping); + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + goto out; + pagevec_init(&pvec, 0); index = start; while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, @@ -686,6 +686,18 @@ int invalidate_inode_pages2_range(struct address_space *mapping, cond_resched(); index++; } + /* + * For DAX we invalidate page tables after invalidating the radix tree. + * We could invalidate page tables while invalidating each entry, but + * that would be expensive. And doing range unmapping beforehand doesn't + * work, as we have no cheap way to find whether a radix tree entry got + * remapped later. + */ + if (dax_mapping(mapping)) { + unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT, + (loff_t)(end - start + 1) << PAGE_SHIFT, 0); + } +out: cleancache_invalidate_inode(mapping); return ret; } diff --git a/mm/usercopy.c b/mm/usercopy.c index d155e12563b139a9879bbc370902ea85a84bd0f3..a9852b24715d25598a7603560ffc042ca0153a2b 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -19,15 +19,9 @@ #include #include #include +#include #include -enum { - BAD_STACK = -1, - NOT_STACK = 0, - GOOD_FRAME, - GOOD_STACK, -}; - /* * Checks if a given pointer and length is contained by the current * stack frame (if possible). @@ -206,17 +200,6 @@ static inline const char *check_heap_object(const void *ptr, unsigned long n, { struct page *page; - /* - * Some architectures (arm64) return true for virt_addr_valid() on - * vmalloced addresses. Work around this by checking for vmalloc - * first. - * - * We also need to check for module addresses explicitly since we - * may copy static data from modules to userspace - */ - if (is_vmalloc_or_module_addr(ptr)) - return NULL; - if (!virt_addr_valid(ptr)) return NULL; diff --git a/mm/util.c b/mm/util.c index 656dc5e37a8721e892d1b2fd0a9f8c1808b17a2d..464df34899031d46058b7cbadc1c3be24ae8dc89 100644 --- a/mm/util.c +++ b/mm/util.c @@ -329,6 +329,64 @@ unsigned long vm_mmap(struct file *file, unsigned long addr, } EXPORT_SYMBOL(vm_mmap); +/** + * kvmalloc_node - attempt to allocate physically contiguous memory, but upon + * failure, fall back to non-contiguous (vmalloc) allocation. + * @size: size of the request. + * @flags: gfp mask for the allocation - must be compatible with (a superset of) GFP_KERNEL. + * @node: numa node to allocate from + * + * Uses kmalloc to get the memory but if the allocation fails then falls back + * to the vmalloc allocator. Use kvfree for freeing the memory. + * + * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. __GFP_REPEAT + * is supported only for large (>32kB) allocations, and it should be used only if + * kmalloc is preferable to the vmalloc fallback, due to visible performance drawbacks. + * + * Any use of gfp flags outside of GFP_KERNEL should be discussed with mm people. + */ +void *kvmalloc_node(size_t size, gfp_t flags, int node) +{ + gfp_t kmalloc_flags = flags; + void *ret; + + /* + * vmalloc uses GFP_KERNEL for some internal allocations (e.g. page tables) + * so the given set of flags has to be compatible. + */
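
Several conversions later in this section (swap slots, swap address spaces, swapon's cluster and frontswap maps) use this helper through wrappers such as kvzalloc(); assuming those wrappers from the series' header changes, which are not part of this hunk, typical usage looks like:

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/types.h>

struct entry {
        u64 key;
        u64 val;
};

static struct entry *alloc_entries(size_t nr)
{
        /*
         * Physically contiguous kmalloc when it is cheap, transparent
         * vmalloc fallback when nr grows large.  No more open-coded
         * "kmalloc, then vmalloc on failure" at every callsite.
         */
        return kvzalloc(nr * sizeof(struct entry), GFP_KERNEL);
}

static void free_entries(struct entry *e)
{
        kvfree(e);      /* correct for both kmalloc and vmalloc memory */
}
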
+ */ + WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL); + + /* + * Make sure that larger requests are not too disruptive - no OOM + * killer and no allocation failure warnings as we have a fallback + */ + if (size > PAGE_SIZE) { + kmalloc_flags |= __GFP_NOWARN; + + /* + * We have to override __GFP_REPEAT by __GFP_NORETRY for !costly + * requests because there is no other way to tell the allocator + * that we want to fail rather than retry endlessly. + */ + if (!(kmalloc_flags & __GFP_REPEAT) || + (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) + kmalloc_flags |= __GFP_NORETRY; + } + + ret = kmalloc_node(size, kmalloc_flags, node); + + /* + * It doesn't really make sense to fallback to vmalloc for sub page + * requests + */ + if (ret || size <= PAGE_SIZE) + return ret; + + return __vmalloc_node_flags_caller(size, node, flags, + __builtin_return_address(0)); +} +EXPORT_SYMBOL(kvmalloc_node); + void kvfree(const void *addr) { if (is_vmalloc_addr(addr)) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0b057628a7ba5c45d722710082ce32df3f7e8e13..34a1c3e46ed72594b499e7f61e8aacdd4c5fe818 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -521,7 +521,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, } } - if (printk_ratelimit()) + if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) pr_warn("vmap allocation for size %lu failed: use vmalloc= to increase size\n", size); kfree(va); @@ -1579,7 +1579,7 @@ void vfree_atomic(const void *addr) * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling * conventions for vfree() arch-depenedent would be a really bad idea) * - * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) + * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node) */ void vfree(const void *addr) { @@ -1658,7 +1658,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page **pages; unsigned int nr_pages, array_size, i; const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; - const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN; + const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN; nr_pages = get_vm_area_size(area) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1786,6 +1786,13 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. + * + * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_REPEAT + * and __GFP_NOFAIL are not supported + * + * Any use of gfp flags outside of GFP_KERNEL should be consulted + * with mm people. 
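Stepping back to kvmalloc_node() above: it centralizes the kmalloc-then-vmalloc dance that callers used to open-code. A sketch of the before/after for a typical GFP_KERNEL user (function names illustrative):

	/* before: hand-rolled fallback, duplicated across the tree */
	static void *table_alloc_old(size_t sz)
	{
		void *p = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);

		return p ? p : vmalloc(sz);
	}

	/* after: one call; the size and reclaim-modifier heuristics above
	 * are applied consistently for everyone */
	static void *table_alloc_new(size_t sz)
	{
		return kvmalloc(sz, GFP_KERNEL);
	}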
+ * */ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, @@ -1809,6 +1816,13 @@ static inline void *__vmalloc_node_flags(unsigned long size, node, __builtin_return_address(0)); } + +void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags, + void *caller) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -1821,7 +1835,7 @@ static inline void *__vmalloc_node_flags(unsigned long size, void *vmalloc(unsigned long size) { return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_HIGHMEM); + GFP_KERNEL); } EXPORT_SYMBOL(vmalloc); @@ -1838,7 +1852,7 @@ EXPORT_SYMBOL(vmalloc); void *vzalloc(unsigned long size) { return __vmalloc_node_flags(size, NUMA_NO_NODE, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc); @@ -1855,7 +1869,7 @@ void *vmalloc_user(unsigned long size) void *ret; ret = __vmalloc_node(size, SHMLBA, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL, NUMA_NO_NODE, __builtin_return_address(0)); if (ret) { @@ -1879,7 +1893,7 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -1899,7 +1913,7 @@ EXPORT_SYMBOL(vmalloc_node); void *vzalloc_node(unsigned long size, int node) { return __vmalloc_node_flags(size, node, - GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); } EXPORT_SYMBOL(vzalloc_node); @@ -1921,7 +1935,7 @@ EXPORT_SYMBOL(vzalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC, NUMA_NO_NODE, __builtin_return_address(0)); } diff --git a/mm/vmscan.c b/mm/vmscan.c index bc8031ef994d57a1d1622468f8df6d745853562b..8ad39bbc79e67eaff24c42201ae2470ffc21d0a7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -97,8 +97,13 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; - /* Can cgroups be reclaimed below their normal consumption range? */ - unsigned int may_thrash:1; + /* + * Cgroups are not reclaimed below their configured memory.low, + * unless we threaten to OOM. If any cgroups are skipped due to + * memory.low and nothing was reclaimed, go back for memory.low. + */ + unsigned int memcg_low_reclaim:1; + unsigned int memcg_low_skipped:1; unsigned int hibernation_mode:1; @@ -230,12 +235,6 @@ unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) return nr; } -bool pgdat_reclaimable(struct pglist_data *pgdat) -{ - return node_page_state_snapshot(pgdat, NR_PAGES_SCANNED) < - pgdat_reclaimable_pages(pgdat) * 6; -} - /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector @@ -912,7 +911,8 @@ static void page_check_dirty_writeback(struct page *page, * Anonymous pages are not handled by flushers and must be written * from reclaim context. 
Do not stall reclaim based on them */ - if (!page_is_file_cache(page)) { + if (!page_is_file_cache(page) || + (PageAnon(page) && !PageSwapBacked(page))) { *dirty = false; *writeback = false; return; @@ -972,8 +972,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; bool dirty, writeback; - bool lazyfree = false; - int ret = SWAP_SUCCESS; cond_resched(); @@ -988,13 +986,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; if (unlikely(!page_evictable(page))) - goto cull_mlocked; + goto activate_locked; if (!sc->may_unmap && page_mapped(page)) goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ - if (page_mapped(page) || PageSwapCache(page)) + if ((page_mapped(page) || PageSwapCache(page)) && + !(PageAnon(page) && !PageSwapBacked(page))) sc->nr_scanned++; may_enter_fs = (sc->gfp_mask & __GFP_FS) || @@ -1120,13 +1119,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* * Anonymous process memory has backing store? * Try to allocate it some swap space here. + * Lazyfree page could be freed directly */ - if (PageAnon(page) && !PageSwapCache(page)) { + if (PageAnon(page) && PageSwapBacked(page) && + !PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; if (!add_to_swap(page, page_list)) goto activate_locked; - lazyfree = true; may_enter_fs = 1; /* Adding to swap updated mapping */ @@ -1143,21 +1143,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, * The page is mapped into the page tables of one or more * processes. Try to unmap it here. */ - if (page_mapped(page) && mapping) { - switch (ret = try_to_unmap(page, lazyfree ? - (ttu_flags | TTU_BATCH_FLUSH | TTU_LZFREE) : - (ttu_flags | TTU_BATCH_FLUSH))) { - case SWAP_FAIL: + if (page_mapped(page)) { + if (!try_to_unmap(page, ttu_flags | TTU_BATCH_FLUSH)) { nr_unmap_fail++; goto activate_locked; - case SWAP_AGAIN: - goto keep_locked; - case SWAP_MLOCK: - goto cull_mlocked; - case SWAP_LZFREE: - goto lazyfree; - case SWAP_SUCCESS: - ; /* try to free the page below */ } } @@ -1267,10 +1256,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, } } -lazyfree: - if (!mapping || !__remove_mapping(mapping, page, true)) - goto keep_locked; + if (PageAnon(page) && !PageSwapBacked(page)) { + /* follow __remove_mapping for reference */ + if (!page_ref_freeze(page, 1)) + goto keep_locked; + if (PageDirty(page)) { + page_ref_unfreeze(page, 1); + goto keep_locked; + } + count_vm_event(PGLAZYFREED); + } else if (!mapping || !__remove_mapping(mapping, page, true)) + goto keep_locked; /* * At this point, we have no other references and there is * no way to pick any more up (removed from LRU, removed @@ -1280,9 +1277,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ __ClearPageLocked(page); free_it: - if (ret == SWAP_LZFREE) - count_vm_event(PGLAZYFREED); - nr_reclaimed++; /* @@ -1292,20 +1286,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, list_add(&page->lru, &free_pages); continue; -cull_mlocked: - if (PageSwapCache(page)) - try_to_free_swap(page); - unlock_page(page); - list_add(&page->lru, &ret_pages); - continue; - activate_locked: /* Not a candidate for swapping, so reclaim swap space. 
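All of the MADV_FREE handling introduced in the shrink_page_list() rework above keys off a single predicate: anonymous but no longer swap backed. The patch open-codes the test; written as a hypothetical helper it reads:

	/* MADV_FREE ("lazyfree") pages: anonymous, yet !PageSwapBacked.
	 * If still clean at reclaim time they are dropped with no swap I/O
	 * and counted as PGLAZYFREED; a dirty one has been written to again
	 * by userspace and must be kept. */
	static inline bool page_is_lazyfree(struct page *page)
	{
		return PageAnon(page) && !PageSwapBacked(page);
	}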
*/ - if (PageSwapCache(page) && mem_cgroup_swap_full(page)) + if (PageSwapCache(page) && (mem_cgroup_swap_full(page) || + PageMlocked(page))) try_to_free_swap(page); VM_BUG_ON_PAGE(PageActive(page), page); - SetPageActive(page); - pgactivate++; + if (!PageMlocked(page)) { + SetPageActive(page); + pgactivate++; + } keep_locked: unlock_page(page); keep: @@ -1354,7 +1344,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, NULL, true); + TTU_IGNORE_ACCESS, NULL, true); list_splice(&clean_pages, page_list); mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret); return ret; @@ -1459,7 +1449,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, * * Appropriate locks must be held before calling this function. * - * @nr_to_scan: The number of pages to look through on the list. + * @nr_to_scan: The number of eligible pages to look through on the list. * @lruvec: The LRU vector to pull pages from. * @dst: The temp list to put pages on to. * @nr_scanned: The number of pages that were scanned. @@ -1478,12 +1468,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long nr_taken = 0; unsigned long nr_zone_taken[MAX_NR_ZONES] = { 0 }; unsigned long nr_skipped[MAX_NR_ZONES] = { 0, }; - unsigned long skipped = 0, total_skipped = 0; - unsigned long scan, nr_pages; + unsigned long skipped = 0; + unsigned long scan, total_scan, nr_pages; LIST_HEAD(pages_skipped); - for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan && - !list_empty(src);) { + scan = 0; + for (total_scan = 0; + scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src); + total_scan++) { struct page *page; page = lru_to_page(src); @@ -1498,11 +1490,12 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, } /* - * Account for scanned and skipped separetly to avoid the pgdat - * being prematurely marked unreclaimable by pgdat_reclaimable. + * Do not count skipped pages because that makes the function + * return with no isolated pages if the LRU mostly contains + * ineligible pages. This causes the VM to not reclaim any + * pages, triggering a premature OOM. */ scan++; - switch (__isolate_lru_page(page, mode)) { case 0: nr_pages = hpage_nr_pages(page); @@ -1531,6 +1524,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (!list_empty(&pages_skipped)) { int zid; + list_splice(&pages_skipped, src); for (zid = 0; zid < MAX_NR_ZONES; zid++) { if (!nr_skipped[zid]) continue; @@ -1538,19 +1532,10 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, __count_zid_vm_events(PGSCAN_SKIP, zid, nr_skipped[zid]); skipped += nr_skipped[zid]; } - - /* - * Account skipped pages as a partial scan as the pgdat may be - * close to unreclaimable. If the LRU list is empty, account - * skipped pages as a full scan. - */ - total_skipped = list_empty(src) ? 
skipped : skipped >> 2; - - list_splice(&pages_skipped, src); } - *nr_scanned = scan + total_skipped; + *nr_scanned = total_scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, - scan, skipped, nr_taken, mode, lru); + total_scan, skipped, nr_taken, mode, lru); update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } @@ -1750,7 +1735,6 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, reclaim_stat->recent_scanned[file] += nr_taken; if (global_reclaim(sc)) { - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); if (current_is_kswapd()) __count_vm_events(PGSCAN_KSWAPD, nr_scanned); else @@ -1761,7 +1745,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, TTU_UNMAP, + nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0, &stat, false); spin_lock_irq(&pgdat->lru_lock); @@ -1953,8 +1937,6 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); reclaim_stat->recent_scanned[file] += nr_taken; - if (global_reclaim(sc)) - __mod_node_page_state(pgdat, NR_PAGES_SCANNED, nr_scanned); __count_vm_events(PGREFILL, nr_scanned); spin_unlock_irq(&pgdat->lru_lock); @@ -2033,6 +2015,8 @@ static void shrink_active_list(unsigned long nr_to_scan, * Both inactive lists should also be large enough that each inactive * page has a chance to be referenced again before it is reclaimed. * + * If that fails and refaulting is observed, the inactive list grows. + * * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages * on this LRU, maintained by the pageout code. A zone->inactive_ratio * of 3 means 3:1 or 25% of the pages are kept on the inactive list. @@ -2049,12 +2033,15 @@ static void shrink_active_list(unsigned long nr_to_scan, * 10TB 320 32GB */ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, - struct scan_control *sc, bool trace) + struct mem_cgroup *memcg, + struct scan_control *sc, bool actual_reclaim) { - unsigned long inactive_ratio; - unsigned long inactive, active; - enum lru_list inactive_lru = file * LRU_FILE; enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + enum lru_list inactive_lru = file * LRU_FILE; + unsigned long inactive, active; + unsigned long inactive_ratio; + unsigned long refaults; unsigned long gb; /* @@ -2067,27 +2054,42 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx); active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx); - gb = (inactive + active) >> (30 - PAGE_SHIFT); - if (gb) - inactive_ratio = int_sqrt(10 * gb); + if (memcg) + refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); else - inactive_ratio = 1; + refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); - if (trace) - trace_mm_vmscan_inactive_list_is_low(lruvec_pgdat(lruvec)->node_id, - sc->reclaim_idx, - lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, - lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, - inactive_ratio, file); + /* + * When refaults are being observed, it means a new workingset + * is being established. Disable active list protection to get + * rid of the stale workingset quickly. 
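When no refaults are being observed, the code below falls back to the size-based target. A few worked values of inactive_ratio = int_sqrt(10 * gb), consistent with the table in the comment further up:

	/*  total LRU size    int_sqrt(10 * gb)    meaning
	 *       < 1GB               1             1:1 active:inactive
	 *         1GB               3             ~25% kept inactive
	 *         4GB               6
	 *        64GB              25
	 *         1TB             101
	 */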
+ */ + if (file && actual_reclaim && lruvec->refaults != refaults) { + inactive_ratio = 0; + } else { + gb = (inactive + active) >> (30 - PAGE_SHIFT); + if (gb) + inactive_ratio = int_sqrt(10 * gb); + else + inactive_ratio = 1; + } + + if (actual_reclaim) + trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx, + lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive, + lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active, + inactive_ratio, file); return inactive * inactive_ratio < active; } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, - struct lruvec *lruvec, struct scan_control *sc) + struct lruvec *lruvec, struct mem_cgroup *memcg, + struct scan_control *sc) { if (is_active_lru(lru)) { - if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true)) + if (inactive_list_is_low(lruvec, is_file_lru(lru), + memcg, sc, true)) shrink_active_list(nr_to_scan, lruvec, sc, lru); return 0; } @@ -2123,30 +2125,8 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, unsigned long anon_prio, file_prio; enum scan_balance scan_balance; unsigned long anon, file; - bool force_scan = false; unsigned long ap, fp; enum lru_list lru; - bool some_scanned; - int pass; - - /* - * If the zone or memcg is small, nr[l] can be 0. This - * results in no scanning on this priority and a potential - * priority drop. Global direct reclaim can go to the next - * zone and tends to have no problems. Global kswapd is for - * zone balancing and it needs to scan a minimum amount. When - * reclaiming for a memcg, a priority drop can cause high - * latencies, so it's better to scan a minimum amount there as - * well. - */ - if (current_is_kswapd()) { - if (!pgdat_reclaimable(pgdat)) - force_scan = true; - if (!mem_cgroup_online(memcg)) - force_scan = true; - } - if (!global_reclaim(sc)) - force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || mem_cgroup_get_nr_swap_pages(memcg) <= 0) { @@ -2218,7 +2198,7 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, * lruvec even if it has plenty of old anonymous pages unless the * system is under heavy pressure. */ - if (!inactive_list_is_low(lruvec, true, sc, false) && + if (!inactive_list_is_low(lruvec, true, memcg, sc, false) && lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { scan_balance = SCAN_FILE; goto out; @@ -2277,55 +2257,48 @@ static void get_scan_count(struct lruvec *lruvec, struct mem_cgroup *memcg, fraction[1] = fp; denominator = ap + fp + 1; out: - some_scanned = false; - /* Only use force_scan on second pass. */ - for (pass = 0; !some_scanned && pass < 2; pass++) { - *lru_pages = 0; - for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; - unsigned long scan; - - size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); - scan = size >> sc->priority; - - if (!scan && pass && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); - - switch (scan_balance) { - case SCAN_EQUAL: - /* Scan lists relative to size */ - break; - case SCAN_FRACT: - /* - * Scan types proportional to swappiness and - * their relative recent reclaim efficiency. 
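The SCAN_FRACT arithmetic itself survives the restructuring below unchanged; a worked example with illustrative numbers:

	/* With fraction[anon] = ap = 300, fraction[file] = fp = 100,
	 * denominator = ap + fp + 1 = 401, and a base target of 32 pages: */
	nr_anon = div64_u64(32 * 300, 401);	/* = 23 pages */
	nr_file = div64_u64(32 * 100, 401);	/* = 7 pages  */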
- */ - scan = div64_u64(scan * fraction[file], - denominator); - break; - case SCAN_FILE: - case SCAN_ANON: - /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) { - size = 0; - scan = 0; - } - break; - default: - /* Look ma, no brain */ - BUG(); - } + *lru_pages = 0; + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; - *lru_pages += size; - nr[lru] = scan; + size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + scan = size >> sc->priority; + /* + * If the cgroup's already been deleted, make sure to + * scrape out the remaining cache. + */ + if (!scan && !mem_cgroup_online(memcg)) + scan = min(size, SWAP_CLUSTER_MAX); + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: /* - * Skip the second pass and don't force_scan, - * if we found something to scan. + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. */ - some_scanned |= !!scan; + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) { + size = 0; + scan = 0; + } + break; + default: + /* Look ma, no brain */ + BUG(); } + + *lru_pages += size; + nr[lru] = scan; } } @@ -2376,7 +2349,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc nr[lru] -= nr_to_scan; nr_reclaimed += shrink_list(lru, nr_to_scan, - lruvec, sc); + lruvec, memcg, sc); } } @@ -2443,7 +2416,7 @@ static void shrink_node_memcg(struct pglist_data *pgdat, struct mem_cgroup *memc * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_list_is_low(lruvec, false, sc, true)) + if (inactive_list_is_low(lruvec, false, memcg, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); } @@ -2557,9 +2530,11 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long scanned; if (mem_cgroup_low(root, memcg)) { - if (!sc->may_thrash) + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; continue; - mem_cgroup_events(memcg, MEMCG_LOW, 1); + } + mem_cgroup_event(memcg, MEMCG_LOW); } reclaimed = sc->nr_reclaimed; @@ -2620,6 +2595,15 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, sc->nr_scanned - nr_scanned, sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; + return reclaimable; } @@ -2694,10 +2678,6 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) GFP_KERNEL | __GFP_HARDWALL)) continue; - if (sc->priority != DEF_PRIORITY && - !pgdat_reclaimable(zone->zone_pgdat)) - continue; /* Let kswapd poll it */ - /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. 
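Returning to get_scan_count() above: the restructured loop keeps one special case, offline cgroups, and a worked example with illustrative numbers shows why it is needed:

	/* A deleted cgroup holding 1000 pages of cache, scanned at
	 * DEF_PRIORITY (12):
	 *
	 *	scan = 1000 >> 12 = 0	(nothing would ever be reclaimed)
	 *
	 * so for !mem_cgroup_online(memcg) the target is raised to
	 *
	 *	scan = min(1000, SWAP_CLUSTER_MAX) = 32
	 *
	 * letting repeated passes scrape the remaining cache out. */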
@@ -2752,6 +2732,25 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) sc->gfp_mask = orig_mask; } +static void snapshot_refaults(struct mem_cgroup *root_memcg, pg_data_t *pgdat) +{ + struct mem_cgroup *memcg; + + memcg = mem_cgroup_iter(root_memcg, NULL, NULL); + do { + unsigned long refaults; + struct lruvec *lruvec; + + if (memcg) + refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE); + else + refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE); + + lruvec = mem_cgroup_lruvec(pgdat, memcg); + lruvec->refaults = refaults; + } while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL))); +} + /* * This is the main entry point to direct page reclaim. * @@ -2772,6 +2771,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int initial_priority = sc->priority; + pg_data_t *last_pgdat; + struct zoneref *z; + struct zone *zone; retry: delayacct_freepages_start(); @@ -2798,6 +2800,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->may_writepage = 1; } while (--sc->priority >= 0); + last_pgdat = NULL; + for_each_zone_zonelist_nodemask(zone, z, zonelist, sc->reclaim_idx, + sc->nodemask) { + if (zone->zone_pgdat == last_pgdat) + continue; + last_pgdat = zone->zone_pgdat; + snapshot_refaults(sc->target_mem_cgroup, zone->zone_pgdat); + } + delayacct_freepages_end(); if (sc->nr_reclaimed) @@ -2808,16 +2819,17 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, return 1; /* Untapped cgroup reserves? Don't OOM, retry. */ - if (!sc->may_thrash) { + if (sc->memcg_low_skipped) { sc->priority = initial_priority; - sc->may_thrash = 1; + sc->memcg_low_reclaim = 1; + sc->memcg_low_skipped = 0; goto retry; } return 0; } -static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +static bool allow_direct_reclaim(pg_data_t *pgdat) { struct zone *zone; unsigned long pfmemalloc_reserve = 0; @@ -2825,10 +2837,15 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) int i; bool wmark_ok; + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; + for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; - if (!managed_zone(zone) || - pgdat_reclaimable_pages(pgdat) == 0) + if (!managed_zone(zone)) + continue; + + if (!zone_reclaimable_pages(zone)) continue; pfmemalloc_reserve += min_wmark_pages(zone); @@ -2905,7 +2922,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, /* Throttle based on the first usable node */ pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + if (allow_direct_reclaim(pgdat)) goto out; break; } @@ -2927,14 +2944,14 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, */ if (!(gfp_mask & __GFP_FS)) { wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat), HZ); + allow_direct_reclaim(pgdat), HZ); goto check_pending; } /* Throttle until kswapd wakes the process */ wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, - pfmemalloc_watermark_ok(pgdat)); + allow_direct_reclaim(pgdat)); check_pending: if (fatal_signal_pending(current)) @@ -2950,7 +2967,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, unsigned long nr_reclaimed; struct scan_control sc = { .nr_to_reclaim = SWAP_CLUSTER_MAX, - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .reclaim_idx = gfp_zone(gfp_mask), .order = order, .nodemask = nodemask, @@ -3028,9 +3045,10 @@ unsigned long 
try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, struct zonelist *zonelist; unsigned long nr_reclaimed; int nid; + unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + .gfp_mask = (current_gfp_context(gfp_mask) & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), .reclaim_idx = MAX_NR_ZONES - 1, .target_mem_cgroup = memcg, @@ -3054,9 +3072,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, sc.gfp_mask, sc.reclaim_idx); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); nr_reclaimed = do_try_to_free_pages(zonelist, &sc); - current->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -3076,7 +3094,7 @@ static void age_active_anon(struct pglist_data *pgdat, do { struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg); - if (inactive_list_is_low(lruvec, false, sc, true)) + if (inactive_list_is_low(lruvec, false, memcg, sc, true)) shrink_active_list(SWAP_CLUSTER_MAX, lruvec, sc, LRU_ACTIVE_ANON); @@ -3084,22 +3102,44 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } -static bool zone_balanced(struct zone *zone, int order, int classzone_idx) +/* + * Returns true if there is an eligible zone balanced for the request order + * and classzone_idx + */ +static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) { - unsigned long mark = high_wmark_pages(zone); + int i; + unsigned long mark = -1; + struct zone *zone; - if (!zone_watermark_ok_safe(zone, order, mark, classzone_idx)) - return false; + for (i = 0; i <= classzone_idx; i++) { + zone = pgdat->node_zones + i; + + if (!managed_zone(zone)) + continue; + + mark = high_wmark_pages(zone); + if (zone_watermark_ok_safe(zone, order, mark, classzone_idx)) + return true; + } /* - * If any eligible zone is balanced then the node is not considered - * to be congested or dirty + * If a node has no populated zone within classzone_idx, it does not + * need balancing by definition. This can happen if a zone-restricted + * allocation tries to wake a remote kswapd. */ - clear_bit(PGDAT_CONGESTED, &zone->zone_pgdat->flags); - clear_bit(PGDAT_DIRTY, &zone->zone_pgdat->flags); - clear_bit(PGDAT_WRITEBACK, &zone->zone_pgdat->flags); + if (mark == -1) + return true; - return true; + return false; +} + +/* Clear pgdat state for congested, dirty or under writeback. */ +static void clear_pgdat_congested(pg_data_t *pgdat) +{ + clear_bit(PGDAT_CONGESTED, &pgdat->flags); + clear_bit(PGDAT_DIRTY, &pgdat->flags); + clear_bit(PGDAT_WRITEBACK, &pgdat->flags); } /* @@ -3110,11 +3150,9 @@ static bool zone_balanced(struct zone *zone, int order, int classzone_idx) */ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) { - int i; - /* * The throttled processes are normally woken up in balance_pgdat() as - * soon as pfmemalloc_watermark_ok() is true. But there is a potential + * soon as allow_direct_reclaim() is true. But there is a potential * race between when kswapd checks the watermarks and a process gets * throttled. 
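pgdat_balanced() above inverts the old per-zone requirement: one balanced eligible zone is now enough for the whole node. A condensed sketch of the caller pattern it enables (prepare_kswapd_sleep() below uses exactly this shape):

	if (pgdat_balanced(pgdat, order, classzone_idx)) {
		clear_pgdat_congested(pgdat);	/* node-wide state, cleared once */
		return true;			/* kswapd may go to sleep */
	}
	return false;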
There is also a potential race if processes get * throttled, kswapd wakes, a large process exits thereby balancing the @@ -3128,17 +3166,16 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx) if (waitqueue_active(&pgdat->pfmemalloc_wait)) wake_up_all(&pgdat->pfmemalloc_wait); - for (i = 0; i <= classzone_idx; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!managed_zone(zone)) - continue; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return true; - if (!zone_balanced(zone, order, classzone_idx)) - return false; + if (pgdat_balanced(pgdat, order, classzone_idx)) { + clear_pgdat_congested(pgdat); + return true; } - return true; + return false; } /* @@ -3214,9 +3251,9 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) count_vm_event(PAGEOUTRUN); do { + unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; - sc.nr_reclaimed = 0; sc.reclaim_idx = classzone_idx; /* @@ -3241,23 +3278,12 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) } /* - * Only reclaim if there are no eligible zones. Check from - * high to low zone as allocations prefer higher zones. - * Scanning from low to high zone would allow congestion to be - * cleared during a very small window when a small low - * zone was balanced even under extreme pressure when the - * overall node may be congested. Note that sc.reclaim_idx - * is not used as buffer_heads_over_limit may have adjusted - * it. + * Only reclaim if there are no eligible zones. Note that + * sc.reclaim_idx is not used as buffer_heads_over_limit may + * have adjusted it. */ - for (i = classzone_idx; i >= 0; i--) { - zone = pgdat->node_zones + i; - if (!managed_zone(zone)) - continue; - - if (zone_balanced(zone, sc.order, classzone_idx)) - goto out; - } + if (pgdat_balanced(pgdat, sc.order, classzone_idx)) + goto out; /* * Do some background aging of the anon list, to give @@ -3271,7 +3297,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * If we're getting trouble reclaiming, start doing writepage * even in laptop mode. */ - if (sc.priority < DEF_PRIORITY - 2 || !pgdat_reclaimable(pgdat)) + if (sc.priority < DEF_PRIORITY - 2) sc.may_writepage = 1; /* Call soft limit reclaim before calling shrink_node. */ @@ -3295,7 +3321,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * able to safely make forward progress. Wake them */ if (waitqueue_active(&pgdat->pfmemalloc_wait) && - pfmemalloc_watermark_ok(pgdat)) + allow_direct_reclaim(pgdat)) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ @@ -3306,11 +3332,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * Raise priority if scanning rate is too low or there was no * progress in reclaiming pages */ - if (raise_priority || !sc.nr_reclaimed) + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); + if (!sc.nr_reclaimed) + pgdat->kswapd_failures++; + out: + snapshot_refaults(NULL, pgdat); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller @@ -3320,6 +3351,22 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) return sc.order; } +/* + * pgdat->kswapd_classzone_idx is the highest zone index that a recent + * allocation request woke kswapd for. 
When kswapd has not woken recently, + * the value is MAX_NR_ZONES which is not a valid index. This compares a + * given classzone and returns it or the highest classzone index kswapd + * was recently woken for. + */ +static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat, + enum zone_type classzone_idx) +{ + if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES) + return classzone_idx; + + return max(pgdat->kswapd_classzone_idx, classzone_idx); +} + static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order, unsigned int classzone_idx) { @@ -3331,7 +3378,13 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - /* Try to sleep for a short interval */ + /* + * Try to sleep for a short interval. Note that kcompactd will only be + * woken if it is possible to sleep for a short interval. This is + * deliberate on the assumption that if reclaim cannot keep an + * eligible zone balanced that it's also unlikely that compaction will + * succeed. + */ if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) { /* * Compaction records what page blocks it recently failed to @@ -3355,7 +3408,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o * the previous request that slept prematurely. */ if (remaining) { - pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order); } @@ -3409,7 +3462,8 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o */ static int kswapd(void *p) { - unsigned int alloc_order, reclaim_order, classzone_idx; + unsigned int alloc_order, reclaim_order; + unsigned int classzone_idx = MAX_NR_ZONES - 1; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -3439,20 +3493,23 @@ static int kswapd(void *p) tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; set_freezable(); - pgdat->kswapd_order = alloc_order = reclaim_order = 0; - pgdat->kswapd_classzone_idx = classzone_idx = 0; + pgdat->kswapd_order = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; for ( ; ; ) { bool ret; + alloc_order = reclaim_order = pgdat->kswapd_order; + classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx); + kswapd_try_sleep: kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order, classzone_idx); /* Read the new order and classzone_idx */ alloc_order = reclaim_order = pgdat->kswapd_order; - classzone_idx = pgdat->kswapd_classzone_idx; + classzone_idx = kswapd_classzone_idx(pgdat, 0); pgdat->kswapd_order = 0; - pgdat->kswapd_classzone_idx = 0; + pgdat->kswapd_classzone_idx = MAX_NR_ZONES; ret = try_to_freeze(); if (kthread_should_stop()) @@ -3478,9 +3535,6 @@ static int kswapd(void *p) reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); if (reclaim_order < alloc_order) goto kswapd_try_sleep; - - alloc_order = reclaim_order = pgdat->kswapd_order; - classzone_idx = pgdat->kswapd_classzone_idx; } tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD); @@ -3496,7 +3550,6 @@ static int kswapd(void *p) void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; - int z; if (!managed_zone(zone)) return; @@ -3504,22 +3557,20 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL)) return; pgdat = zone->zone_pgdat; - 
pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx, classzone_idx); + pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, + classzone_idx); pgdat->kswapd_order = max(pgdat->kswapd_order, order); if (!waitqueue_active(&pgdat->kswapd_wait)) return; - /* Only wake kswapd if all zones are unbalanced */ - for (z = 0; z <= classzone_idx; z++) { - zone = pgdat->node_zones + z; - if (!managed_zone(zone)) - continue; + /* Hopeless node, leave it to direct reclaim */ + if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES) + return; - if (zone_balanced(zone, order, classzone_idx)) - return; - } + if (pgdat_balanced(pgdat, order, classzone_idx)) + return; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order); wake_up_interruptible(&pgdat->kswapd_wait); } @@ -3548,8 +3599,9 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); struct task_struct *p = current; unsigned long nr_reclaimed; + unsigned int noreclaim_flag; - p->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); lockdep_set_current_reclaim_state(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3558,7 +3610,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) p->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + memalloc_noreclaim_restore(noreclaim_flag); return nr_reclaimed; } @@ -3723,9 +3775,10 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in struct task_struct *p = current; struct reclaim_state reclaim_state; int classzone_idx = gfp_zone(gfp_mask); + unsigned int noreclaim_flag; struct scan_control sc = { .nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX), - .gfp_mask = (gfp_mask = memalloc_noio_flags(gfp_mask)), + .gfp_mask = (gfp_mask = current_gfp_context(gfp_mask)), .order = order, .priority = NODE_RECLAIM_PRIORITY, .may_writepage = !!(node_reclaim_mode & RECLAIM_WRITE), @@ -3740,7 +3793,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in * and we also need to be able to write out pages for RECLAIM_WRITE * and RECLAIM_UNMAP. */ - p->flags |= PF_MEMALLOC | PF_SWAPWRITE; + noreclaim_flag = memalloc_noreclaim_save(); + p->flags |= PF_SWAPWRITE; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3756,7 +3810,8 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; - current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); + current->flags &= ~PF_SWAPWRITE; + memalloc_noreclaim_restore(noreclaim_flag); lockdep_clear_current_reclaim_state(); return sc.nr_reclaimed >= nr_pages; } @@ -3779,9 +3834,6 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order) sum_zone_node_page_state(pgdat->node_id, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages) return NODE_RECLAIM_FULL; - if (!pgdat_reclaimable(pgdat)) - return NODE_RECLAIM_FULL; - /* * Do not scan if the allocation should not be delayed. 
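Several call sites above, __node_reclaim() and shrink_all_memory() among them, move from toggling PF_MEMALLOC by hand to the save/restore helpers. The benefit is safe nesting: the restore puts back whatever state an outer scope had already set. A minimal sketch of the pattern:

	unsigned int noreclaim_flag;

	noreclaim_flag = memalloc_noreclaim_save();	/* sets PF_MEMALLOC */
	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
	memalloc_noreclaim_restore(noreclaim_flag);	/* restores the prior
							 * value - a no-op when an
							 * outer scope already held
							 * PF_MEMALLOC */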
*/ diff --git a/mm/vmstat.c b/mm/vmstat.c index 809025ed97ea0eee97573a32ba2764c63ee2dffd..76f73670200ac1d34b5f17388df8356fadffaf71 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -954,7 +954,6 @@ const char * const vmstat_text[] = { "nr_unevictable", "nr_isolated_anon", "nr_isolated_file", - "nr_pages_scanned", "workingset_refault", "workingset_activate", "workingset_nodereclaim", @@ -992,6 +991,7 @@ const char * const vmstat_text[] = { "pgfree", "pgactivate", "pgdeactivate", + "pglazyfree", "pgfault", "pgmajfault", @@ -1124,8 +1124,12 @@ static void frag_stop(struct seq_file *m, void *arg) { } -/* Walk all the zones in a node and print using a callback */ +/* + * Walk zones in a node and print using a callback. + * If @assert_populated is true, only use callback for zones that are populated. + */ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, + bool assert_populated, void (*print)(struct seq_file *m, pg_data_t *, struct zone *)) { struct zone *zone; @@ -1133,7 +1137,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, unsigned long flags; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) + if (assert_populated && !populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); @@ -1161,7 +1165,7 @@ static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, static int frag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, frag_show_print); + walk_zones_in_node(m, pgdat, true, frag_show_print); return 0; } @@ -1202,7 +1206,7 @@ static int pagetypeinfo_showfree(struct seq_file *m, void *arg) seq_printf(m, "%6d ", order); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showfree_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showfree_print); return 0; } @@ -1254,7 +1258,7 @@ static int pagetypeinfo_showblockcount(struct seq_file *m, void *arg) for (mtype = 0; mtype < MIGRATE_TYPES; mtype++) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showblockcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showblockcount_print); return 0; } @@ -1280,7 +1284,7 @@ static void pagetypeinfo_showmixedcount(struct seq_file *m, pg_data_t *pgdat) seq_printf(m, "%12s ", migratetype_names[mtype]); seq_putc(m, '\n'); - walk_zones_in_node(m, pgdat, pagetypeinfo_showmixedcount_print); + walk_zones_in_node(m, pgdat, true, pagetypeinfo_showmixedcount_print); #endif /* CONFIG_PAGE_OWNER */ } @@ -1355,8 +1359,6 @@ static bool is_zone_first_populated(pg_data_t *pgdat, struct zone *zone) return zone == compare; } - /* The zone must be somewhere! 
*/ - WARN_ON_ONCE(1); return false; } @@ -1378,7 +1380,6 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n node_scanned %lu" "\n spanned %lu" "\n present %lu" "\n managed %lu", @@ -1386,23 +1387,28 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), - node_page_state(zone->zone_pgdat, NR_PAGES_SCANNED), zone->spanned_pages, zone->present_pages, zone->managed_pages); - for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) - seq_printf(m, "\n %-12s %lu", vmstat_text[i], - zone_page_state(zone, i)); - seq_printf(m, "\n protection: (%ld", zone->lowmem_reserve[0]); for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) seq_printf(m, ", %ld", zone->lowmem_reserve[i]); - seq_printf(m, - ")" - "\n pagesets"); + seq_putc(m, ')'); + + /* If unpopulated, no other information is useful */ + if (!populated_zone(zone)) { + seq_putc(m, '\n'); + return; + } + + for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) + seq_printf(m, "\n %-12s %lu", vmstat_text[i], + zone_page_state(zone, i)); + + seq_printf(m, "\n pagesets"); for_each_online_cpu(i) { struct per_cpu_pageset *pageset; @@ -1425,19 +1431,22 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n node_unreclaimable: %u" "\n start_pfn: %lu" "\n node_inactive_ratio: %u", - !pgdat_reclaimable(zone->zone_pgdat), + pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, zone->zone_start_pfn, zone->zone_pgdat->inactive_ratio); seq_putc(m, '\n'); } /* - * Output information about zones in @pgdat. + * Output information about zones in @pgdat. All zones are printed regardless + * of whether they are populated or not: lowmem_reserve_ratio operates on the + * set of all zones and userspace would not be aware of such zones if they are + * suppressed here (zoneinfo displays the effect of lowmem_reserve_ratio). */ static int zoneinfo_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, zoneinfo_show_print); + walk_zones_in_node(m, pgdat, false, zoneinfo_show_print); return 0; } @@ -1586,22 +1595,9 @@ int vmstat_refresh(struct ctl_table *table, int write, for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { val = atomic_long_read(&vm_zone_stat[i]); if (val < 0) { - switch (i) { - case NR_PAGES_SCANNED: - /* - * This is often seen to go negative in - * recent kernels, but not to go permanently - * negative. Whilst it would be nicer not to - * have exceptions, rooting them out would be - * another task, of rather low priority. 
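The special case being removed here papered over a property worth keeping in mind: a folded counter can look negative only transiently, because per-CPU deltas are folded lazily. A worked illustration:

	/* A global counter of -3 can coexist with unfolded per-CPU deltas
	 * of +2 and +4 (true value: +3). vmstat_refresh() folds the deltas
	 * before checking, so a value that is still negative afterwards is
	 * a genuine accounting bug - and is now always reported. */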
- */ - break; - default: - pr_warn("%s: %s %ld\n", - __func__, vmstat_text[i], val); - err = -EINVAL; - break; - } + pr_warn("%s: %s %ld\n", + __func__, vmstat_text[i], val); + err = -EINVAL; } } if (err) @@ -1768,8 +1764,7 @@ void __init init_mm_internals(void) { int ret __maybe_unused; - mm_percpu_wq = alloc_workqueue("mm_percpu_wq", - WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + mm_percpu_wq = alloc_workqueue("mm_percpu_wq", WQ_MEM_RECLAIM, 0); #ifdef CONFIG_SMP ret = cpuhp_setup_state_nocalls(CPUHP_MM_VMSTAT_DEAD, "mm/vmstat:dead", @@ -1857,7 +1852,7 @@ static int unusable_show(struct seq_file *m, void *arg) if (!node_state(pgdat->node_id, N_MEMORY)) return 0; - walk_zones_in_node(m, pgdat, unusable_show_print); + walk_zones_in_node(m, pgdat, true, unusable_show_print); return 0; } @@ -1909,7 +1904,7 @@ static int extfrag_show(struct seq_file *m, void *arg) { pg_data_t *pgdat = (pg_data_t *)arg; - walk_zones_in_node(m, pgdat, extfrag_show_print); + walk_zones_in_node(m, pgdat, true, extfrag_show_print); return 0; } diff --git a/mm/workingset.c b/mm/workingset.c index eda05c71fa49e6e1e4f93a4029ddef04a4f8ab4c..b8c9ab6784795a0e7f8eff82a9a071d481e91a5f 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -269,7 +269,6 @@ bool workingset_refault(void *shadow) lruvec = mem_cgroup_lruvec(pgdat, memcg); refault = atomic_long_read(&lruvec->inactive_age); active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES); - rcu_read_unlock(); /* * The unsigned subtraction here gives an accurate distance @@ -290,11 +289,15 @@ bool workingset_refault(void *shadow) refault_distance = (refault - eviction) & EVICTION_MASK; inc_node_state(pgdat, WORKINGSET_REFAULT); + inc_memcg_state(memcg, WORKINGSET_REFAULT); if (refault_distance <= active_file) { inc_node_state(pgdat, WORKINGSET_ACTIVATE); + inc_memcg_state(memcg, WORKINGSET_ACTIVATE); + rcu_read_unlock(); return true; } + rcu_read_unlock(); return false; } @@ -472,6 +475,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); + inc_memcg_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, workingset_update_node, mapping); diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c index 5945f7e19c679cb882a5a5716a1de33bed50a8f1..40d3d72beb5384efee121bd4319a02b6d0821522 100644 --- a/net/6lowpan/core.c +++ b/net/6lowpan/core.c @@ -23,10 +23,18 @@ int lowpan_register_netdevice(struct net_device *dev, { int i, ret; - dev->addr_len = EUI64_ADDR_LEN; + switch (lltype) { + case LOWPAN_LLTYPE_IEEE802154: + dev->addr_len = EUI64_ADDR_LEN; + break; + + case LOWPAN_LLTYPE_BTLE: + dev->addr_len = ETH_ALEN; + break; + } + dev->type = ARPHRD_6LOWPAN; dev->mtu = IPV6_MIN_MTU; - dev->priv_flags |= IFF_NO_QUEUE; lowpan_dev(dev)->lltype = lltype; diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c index 79f1fa22509a350b35340897c1cb137357734713..6b1042e216565a85ecc1b1e6301172750c07a8ae 100644 --- a/net/6lowpan/iphc.c +++ b/net/6lowpan/iphc.c @@ -278,6 +278,23 @@ lowpan_iphc_ctx_get_by_mcast_addr(const struct net_device *dev, return ret; } +static void lowpan_iphc_uncompress_lladdr(const struct net_device *dev, + struct in6_addr *ipaddr, + const void *lladdr) +{ + switch (dev->addr_len) { + case ETH_ALEN: + lowpan_iphc_uncompress_eui48_lladdr(ipaddr, lladdr); + break; + case EUI64_ADDR_LEN: + lowpan_iphc_uncompress_eui64_lladdr(ipaddr, lladdr); + break; + default: + WARN_ON_ONCE(1); + 
break; + } +} + /* Uncompress address function for source and * destination address(non-multicast). * @@ -320,7 +337,7 @@ static int lowpan_iphc_uncompress_addr(struct sk_buff *skb, lowpan_iphc_uncompress_802154_lladdr(ipaddr, lladdr); break; default: - lowpan_iphc_uncompress_eui64_lladdr(ipaddr, lladdr); + lowpan_iphc_uncompress_lladdr(dev, ipaddr, lladdr); break; } break; @@ -381,7 +398,7 @@ static int lowpan_iphc_uncompress_ctx_addr(struct sk_buff *skb, lowpan_iphc_uncompress_802154_lladdr(ipaddr, lladdr); break; default: - lowpan_iphc_uncompress_eui64_lladdr(ipaddr, lladdr); + lowpan_iphc_uncompress_lladdr(dev, ipaddr, lladdr); break; } ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen); @@ -666,6 +683,8 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, switch (iphc1 & (LOWPAN_IPHC_M | LOWPAN_IPHC_DAC)) { case LOWPAN_IPHC_M | LOWPAN_IPHC_DAC: + skb->pkt_type = PACKET_BROADCAST; + spin_lock_bh(&lowpan_dev(dev)->ctx.lock); ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid)); if (!ci) { @@ -681,11 +700,15 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); break; case LOWPAN_IPHC_M: + skb->pkt_type = PACKET_BROADCAST; + /* multicast */ err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr, iphc1 & LOWPAN_IPHC_DAM_MASK); break; case LOWPAN_IPHC_DAC: + skb->pkt_type = PACKET_HOST; + spin_lock_bh(&lowpan_dev(dev)->ctx.lock); ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid)); if (!ci) { @@ -701,6 +724,8 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev, spin_unlock_bh(&lowpan_dev(dev)->ctx.lock); break; default: + skb->pkt_type = PACKET_HOST; + err = lowpan_iphc_uncompress_addr(skb, dev, &hdr.daddr, iphc1 & LOWPAN_IPHC_DAM_MASK, daddr); @@ -802,6 +827,21 @@ lowpan_iphc_compress_ctx_802154_lladdr(const struct in6_addr *ipaddr, return lladdr_compress; } +static bool lowpan_iphc_addr_equal(const struct net_device *dev, + const struct lowpan_iphc_ctx *ctx, + const struct in6_addr *ipaddr, + const void *lladdr) +{ + struct in6_addr tmp = {}; + + lowpan_iphc_uncompress_lladdr(dev, &tmp, lladdr); + + if (ctx) + ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); + + return ipv6_addr_equal(&tmp, ipaddr); +} + static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct net_device *dev, const struct in6_addr *ipaddr, const struct lowpan_iphc_ctx *ctx, @@ -819,13 +859,7 @@ static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct net_device *dev, } break; default: - /* check for SAM/DAM = 11 */ - memcpy(&tmp.s6_addr[8], lladdr, EUI64_ADDR_LEN); - /* second bit-flip (Universe/Local) is done according RFC2464 */ - tmp.s6_addr[8] ^= 0x02; - /* context information are always used */ - ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen); - if (ipv6_addr_equal(&tmp, ipaddr)) { + if (lowpan_iphc_addr_equal(dev, ctx, ipaddr, lladdr)) { dam = LOWPAN_IPHC_DAM_11; goto out; } @@ -921,11 +955,12 @@ static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct net_device *dev, } break; default: - if (is_addr_mac_addr_based(ipaddr, lladdr)) { - dam = LOWPAN_IPHC_DAM_11; /* 0-bits */ + if (lowpan_iphc_addr_equal(dev, NULL, ipaddr, lladdr)) { + dam = LOWPAN_IPHC_DAM_11; pr_debug("address compression 0 bits\n"); goto out; } + break; } diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e97ab824e368cc16f9609acd70d5337866eb2936..953b6728bd00c8ca7a4a20f2d2036c6f8f27f8e3 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -562,8 +562,7 @@ 
static int vlan_dev_init(struct net_device *dev) NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC | NETIF_F_ALL_FCOE; - dev->features |= real_dev->vlan_features | NETIF_F_LLTX | - NETIF_F_GSO_SOFTWARE; + dev->features |= dev->hw_features | NETIF_F_LLTX; dev->gso_max_size = real_dev->gso_max_size; dev->gso_max_segs = real_dev->gso_max_segs; if (dev->features & NETIF_F_VLAN_FEATURES) @@ -627,11 +626,18 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev, { struct net_device *real_dev = vlan_dev_priv(dev)->real_dev; netdev_features_t old_features = features; + netdev_features_t lower_features; - features = netdev_intersect_features(features, real_dev->vlan_features); - features |= NETIF_F_RXCSUM; - features = netdev_intersect_features(features, real_dev->features); + lower_features = netdev_intersect_features((real_dev->vlan_features | + NETIF_F_RXCSUM), + real_dev->features); + /* Add HW_CSUM setting to preserve user ability to control + * checksum offload on the vlan device. + */ + if (lower_features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)) + lower_features |= NETIF_F_HW_CSUM; + features = netdev_intersect_features(features, lower_features); features |= old_features & (NETIF_F_SOFT_FEATURES | NETIF_F_GSO_SOFTWARE); features |= NETIF_F_LLTX; diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c index 1270207f3d7c9dd6fde0e1f7839acfe18a4d82c9..9c94aad153b308d4ca3e1fb098f045365895a218 100644 --- a/net/8021q/vlan_netlink.c +++ b/net/8021q/vlan_netlink.c @@ -35,7 +35,8 @@ static inline int vlan_validate_qos_map(struct nlattr *attr) { if (!attr) return 0; - return nla_validate_nested(attr, IFLA_VLAN_QOS_MAX, vlan_map_policy); + return nla_validate_nested(attr, IFLA_VLAN_QOS_MAX, vlan_map_policy, + NULL); } static int vlan_validate(struct nlattr *tb[], struct nlattr *data[]) diff --git a/net/9p/Kconfig b/net/9p/Kconfig index a75174a337237d5f113ca8c1616a01a46e72ac59..e6014e0e51f7a212dd48cdefcb26dffb44b2fba4 100644 --- a/net/9p/Kconfig +++ b/net/9p/Kconfig @@ -22,6 +22,15 @@ config NET_9P_VIRTIO This builds support for a transports between guest partitions and a host partition. +config NET_9P_XEN + depends on XEN + select XEN_XENBUS_FRONTEND + tristate "9P Xen Transport" + help + This builds support for a transport for 9pfs between + two Xen domains. 
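The new Kconfig entry builds trans_xen.c, added below. Like the other 9p transports, the module hands the client core a struct p9_trans_module under a name that mounts can select with trans=xen. A sketch of the init path, using the file's own p9_xen_trans (the xenbus frontend driver structure falls past the end of this excerpt, so its name here is an assumption):

	static int __init p9_trans_xen_init(void)
	{
		/* after this, 9p mounts with trans=xen reach this module */
		v9fs_register_trans(&p9_xen_trans);

		/* probe 9pfs devices offered by the backend domain */
		return xenbus_register_frontend(&xen_9pfs_front_driver);
	}
	module_init(p9_trans_xen_init);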
+ + config NET_9P_RDMA depends on INET && INFINIBAND && INFINIBAND_ADDR_TRANS tristate "9P RDMA Transport (Experimental)" diff --git a/net/9p/Makefile b/net/9p/Makefile index a0874cc1f718bcadbb604458127bebcf5664a2c5..697ea7caf46605d2b015200fb1dd126c92caaa30 100644 --- a/net/9p/Makefile +++ b/net/9p/Makefile @@ -1,4 +1,5 @@ obj-$(CONFIG_NET_9P) := 9pnet.o +obj-$(CONFIG_NET_9P_XEN) += 9pnet_xen.o obj-$(CONFIG_NET_9P_VIRTIO) += 9pnet_virtio.o obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o @@ -14,5 +15,8 @@ obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o 9pnet_virtio-objs := \ trans_virtio.o \ +9pnet_xen-objs := \ + trans_xen.o \ + 9pnet_rdma-objs := \ trans_rdma.o \ diff --git a/net/9p/client.c b/net/9p/client.c index 3ce672af1596cfdb8fbd67ce558366e7997b7695..1218fb3b52dad115e4343c465ea891aed38d34c0 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -592,9 +592,8 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, ename = &req->rc->sdata[req->rc->offset]; if (len > inline_len) { /* We have error in external buffer */ - err = copy_from_iter(ename + inline_len, - len - inline_len, uidata); - if (err != len - inline_len) { + if (!copy_from_iter_full(ename + inline_len, + len - inline_len, uidata)) { err = -EFAULT; goto out_err; } @@ -2101,6 +2100,10 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) trace_9p_protocol_dump(clnt, req->rc); goto free_and_error; } + if (rsize < count) { + pr_err("bogus RREADDIR count (%d > %d)\n", count, rsize); + count = rsize; + } p9_debug(P9_DEBUG_9P, "<<< RREADDIR count %d\n", count); diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 16d28756598789504f9a30a07476256a2146b2e3..16e10680518c4cba784296ff96ddcc2643cabf82 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -74,7 +74,7 @@ pdu_write_u(struct p9_fcall *pdu, struct iov_iter *from, size_t size) { size_t len = min(pdu->capacity - pdu->size, size); struct iov_iter i = *from; - if (copy_from_iter(&pdu->sdata[pdu->size], len, &i) != len) + if (!copy_from_iter_full(&pdu->sdata[pdu->size], len, &i)) len = 0; pdu->size += len; diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c new file mode 100644 index 0000000000000000000000000000000000000000..71e85643b3f96db09de3330263f556fedba65257 --- /dev/null +++ b/net/9p/trans_xen.c @@ -0,0 +1,545 @@ +/* + * linux/fs/9p/trans_xen + * + * Xen transport layer. + * + * Copyright (C) 2017 by Stefano Stabellini + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/9pfs.h>
+
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/rwlock.h>
+#include <net/9p/9p.h>
+#include <net/9p/client.h>
+#include <net/9p/transport.h>
+
+#define XEN_9PFS_NUM_RINGS 2
+#define XEN_9PFS_RING_ORDER 6
+#define XEN_9PFS_RING_SIZE XEN_FLEX_RING_SIZE(XEN_9PFS_RING_ORDER)
+
+struct xen_9pfs_header {
+	uint32_t size;
+	uint8_t id;
+	uint16_t tag;
+
+	/* uint8_t sdata[]; */
+} __attribute__((packed));
+
+/* One per ring, more than one per 9pfs share */
+struct xen_9pfs_dataring {
+	struct xen_9pfs_front_priv *priv;
+
+	struct xen_9pfs_data_intf *intf;
+	grant_ref_t ref;
+	int evtchn;
+	int irq;
+	/* protect a ring from concurrent accesses */
+	spinlock_t lock;
+
+	struct xen_9pfs_data data;
+	wait_queue_head_t wq;
+	struct work_struct work;
+};
+
+/* One per 9pfs share */
+struct xen_9pfs_front_priv {
+	struct list_head list;
+	struct xenbus_device *dev;
+	char *tag;
+	struct p9_client *client;
+
+	int num_rings;
+	struct xen_9pfs_dataring *rings;
+};
+
+static LIST_HEAD(xen_9pfs_devs);
+static DEFINE_RWLOCK(xen_9pfs_lock);
+
+/* We don't currently allow canceling of requests */
+static int p9_xen_cancel(struct p9_client *client, struct p9_req_t *req)
+{
+	return 1;
+}
+
+static int p9_xen_create(struct p9_client *client, const char *addr, char *args)
+{
+	struct xen_9pfs_front_priv *priv;
+
+	read_lock(&xen_9pfs_lock);
+	list_for_each_entry(priv, &xen_9pfs_devs, list) {
+		if (!strcmp(priv->tag, addr)) {
+			priv->client = client;
+			read_unlock(&xen_9pfs_lock);
+			return 0;
+		}
+	}
+	read_unlock(&xen_9pfs_lock);
+	return -EINVAL;
+}
+
+static void p9_xen_close(struct p9_client *client)
+{
+	struct xen_9pfs_front_priv *priv;
+
+	read_lock(&xen_9pfs_lock);
+	list_for_each_entry(priv, &xen_9pfs_devs, list) {
+		if (priv->client == client) {
+			priv->client = NULL;
+			read_unlock(&xen_9pfs_lock);
+			return;
+		}
+	}
+	read_unlock(&xen_9pfs_lock);
+}
+
+static bool p9_xen_write_todo(struct xen_9pfs_dataring *ring, RING_IDX size)
+{
+	RING_IDX cons, prod;
+
+	cons = ring->intf->out_cons;
+	prod = ring->intf->out_prod;
+	virt_mb();
+
+	return XEN_9PFS_RING_SIZE -
+		xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE) >= size;
+}
+
+static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req)
+{
+	struct xen_9pfs_front_priv *priv = NULL;
+	RING_IDX cons, prod, masked_cons, masked_prod;
+	unsigned long flags;
+	u32 size = p9_req->tc->size;
+	struct xen_9pfs_dataring *ring;
+	int num;
+
+	read_lock(&xen_9pfs_lock);
+	list_for_each_entry(priv, &xen_9pfs_devs, list) {
+		if (priv->client == client)
+			break;
+	}
+	read_unlock(&xen_9pfs_lock);
+	if (!priv || priv->client != client)
+		return -EINVAL;
+
+	num = p9_req->tc->tag % priv->num_rings;
+	ring = &priv->rings[num];
+
+again:
+	while (wait_event_interruptible(ring->wq,
+					p9_xen_write_todo(ring, size)) != 0)
+		;
+
+	spin_lock_irqsave(&ring->lock, flags);
+	cons = ring->intf->out_cons;
+	prod = ring->intf->out_prod;
+	virt_mb();
+
+	if (XEN_9PFS_RING_SIZE - xen_9pfs_queued(prod, cons,
+						 XEN_9PFS_RING_SIZE) < size) {
+		spin_unlock_irqrestore(&ring->lock, flags);
+		goto again;
+	}
+
+	masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE);
+	masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE);
+
+	xen_9pfs_write_packet(ring->data.out, p9_req->tc->sdata, size,
+			      &masked_prod, masked_cons,
XEN_9PFS_RING_SIZE); + + p9_req->status = REQ_STATUS_SENT; + virt_wmb(); /* write ring before updating pointer */ + prod += size; + ring->intf->out_prod = prod; + spin_unlock_irqrestore(&ring->lock, flags); + notify_remote_via_irq(ring->irq); + + return 0; +} + +static void p9_xen_response(struct work_struct *work) +{ + struct xen_9pfs_front_priv *priv; + struct xen_9pfs_dataring *ring; + RING_IDX cons, prod, masked_cons, masked_prod; + struct xen_9pfs_header h; + struct p9_req_t *req; + int status; + + ring = container_of(work, struct xen_9pfs_dataring, work); + priv = ring->priv; + + while (1) { + cons = ring->intf->in_cons; + prod = ring->intf->in_prod; + virt_rmb(); + + if (xen_9pfs_queued(prod, cons, XEN_9PFS_RING_SIZE) < + sizeof(h)) { + notify_remote_via_irq(ring->irq); + return; + } + + masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE); + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); + + /* First, read just the header */ + xen_9pfs_read_packet(&h, ring->data.in, sizeof(h), + masked_prod, &masked_cons, + XEN_9PFS_RING_SIZE); + + req = p9_tag_lookup(priv->client, h.tag); + if (!req || req->status != REQ_STATUS_SENT) { + dev_warn(&priv->dev->dev, "Wrong req tag=%x\n", h.tag); + cons += h.size; + virt_mb(); + ring->intf->in_cons = cons; + continue; + } + + memcpy(req->rc, &h, sizeof(h)); + req->rc->offset = 0; + + masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); + /* Then, read the whole packet (including the header) */ + xen_9pfs_read_packet(req->rc->sdata, ring->data.in, h.size, + masked_prod, &masked_cons, + XEN_9PFS_RING_SIZE); + + virt_mb(); + cons += h.size; + ring->intf->in_cons = cons; + + status = (req->status != REQ_STATUS_ERROR) ? + REQ_STATUS_RCVD : REQ_STATUS_ERROR; + + p9_client_cb(priv->client, req, status); + } +} + +static irqreturn_t xen_9pfs_front_event_handler(int irq, void *r) +{ + struct xen_9pfs_dataring *ring = r; + + if (!ring || !ring->priv->client) { + /* ignore spurious interrupt */ + return IRQ_HANDLED; + } + + wake_up_interruptible(&ring->wq); + schedule_work(&ring->work); + + return IRQ_HANDLED; +} + +static struct p9_trans_module p9_xen_trans = { + .name = "xen", + .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT), + .def = 1, + .create = p9_xen_create, + .close = p9_xen_close, + .request = p9_xen_request, + .cancel = p9_xen_cancel, + .owner = THIS_MODULE, +}; + +static const struct xenbus_device_id xen_9pfs_front_ids[] = { + { "9pfs" }, + { "" } +}; + +static void xen_9pfs_front_free(struct xen_9pfs_front_priv *priv) +{ + int i, j; + + write_lock(&xen_9pfs_lock); + list_del(&priv->list); + write_unlock(&xen_9pfs_lock); + + for (i = 0; i < priv->num_rings; i++) { + if (!priv->rings[i].intf) + break; + if (priv->rings[i].irq > 0) + unbind_from_irqhandler(priv->rings[i].irq, priv->dev); + if (priv->rings[i].data.in) { + for (j = 0; j < (1 << XEN_9PFS_RING_ORDER); j++) { + grant_ref_t ref; + + ref = priv->rings[i].intf->ref[j]; + gnttab_end_foreign_access(ref, 0, 0); + } + free_pages((unsigned long)priv->rings[i].data.in, + XEN_9PFS_RING_ORDER - + (PAGE_SHIFT - XEN_PAGE_SHIFT)); + } + gnttab_end_foreign_access(priv->rings[i].ref, 0, 0); + free_page((unsigned long)priv->rings[i].intf); + } + kfree(priv->rings); + kfree(priv->tag); + kfree(priv); +} + +static int xen_9pfs_front_remove(struct xenbus_device *dev) +{ + struct xen_9pfs_front_priv *priv = dev_get_drvdata(&dev->dev); + + dev_set_drvdata(&dev->dev, NULL); + xen_9pfs_front_free(priv); + return 0; +} + +static int xen_9pfs_front_alloc_dataring(struct xenbus_device *dev, + 
struct xen_9pfs_dataring *ring) +{ + int i = 0; + int ret = -ENOMEM; + void *bytes = NULL; + + init_waitqueue_head(&ring->wq); + spin_lock_init(&ring->lock); + INIT_WORK(&ring->work, p9_xen_response); + + ring->intf = (struct xen_9pfs_data_intf *)get_zeroed_page(GFP_KERNEL); + if (!ring->intf) + return ret; + ret = gnttab_grant_foreign_access(dev->otherend_id, + virt_to_gfn(ring->intf), 0); + if (ret < 0) + goto out; + ring->ref = ret; + bytes = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + XEN_9PFS_RING_ORDER - (PAGE_SHIFT - XEN_PAGE_SHIFT)); + if (!bytes) { + ret = -ENOMEM; + goto out; + } + for (; i < (1 << XEN_9PFS_RING_ORDER); i++) { + ret = gnttab_grant_foreign_access( + dev->otherend_id, virt_to_gfn(bytes) + i, 0); + if (ret < 0) + goto out; + ring->intf->ref[i] = ret; + } + ring->intf->ring_order = XEN_9PFS_RING_ORDER; + ring->data.in = bytes; + ring->data.out = bytes + XEN_9PFS_RING_SIZE; + + ret = xenbus_alloc_evtchn(dev, &ring->evtchn); + if (ret) + goto out; + ring->irq = bind_evtchn_to_irqhandler(ring->evtchn, + xen_9pfs_front_event_handler, + 0, "xen_9pfs-frontend", ring); + if (ring->irq >= 0) + return 0; + + xenbus_free_evtchn(dev, ring->evtchn); + ret = ring->irq; +out: + if (bytes) { + for (i--; i >= 0; i--) + gnttab_end_foreign_access(ring->intf->ref[i], 0, 0); + free_pages((unsigned long)bytes, + XEN_9PFS_RING_ORDER - + (PAGE_SHIFT - XEN_PAGE_SHIFT)); + } + gnttab_end_foreign_access(ring->ref, 0, 0); + free_page((unsigned long)ring->intf); + return ret; +} + +static int xen_9pfs_front_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int ret, i; + struct xenbus_transaction xbt; + struct xen_9pfs_front_priv *priv = NULL; + char *versions; + unsigned int max_rings, max_ring_order, len = 0; + + versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len); + if (!len) + return -EINVAL; + if (strcmp(versions, "1")) { + kfree(versions); + return -EINVAL; + } + kfree(versions); + max_rings = xenbus_read_unsigned(dev->otherend, "max-rings", 0); + if (max_rings < XEN_9PFS_NUM_RINGS) + return -EINVAL; + max_ring_order = xenbus_read_unsigned(dev->otherend, + "max-ring-page-order", 0); + if (max_ring_order < XEN_9PFS_RING_ORDER) + return -EINVAL; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->dev = dev; + priv->num_rings = XEN_9PFS_NUM_RINGS; + priv->rings = kcalloc(priv->num_rings, sizeof(*priv->rings), + GFP_KERNEL); + if (!priv->rings) { + kfree(priv); + return -ENOMEM; + } + + for (i = 0; i < priv->num_rings; i++) { + priv->rings[i].priv = priv; + ret = xen_9pfs_front_alloc_dataring(dev, &priv->rings[i]); + if (ret < 0) + goto error; + } + + again: + ret = xenbus_transaction_start(&xbt); + if (ret) { + xenbus_dev_fatal(dev, ret, "starting transaction"); + goto error; + } + ret = xenbus_printf(xbt, dev->nodename, "version", "%u", 1); + if (ret) + goto error_xenbus; + ret = xenbus_printf(xbt, dev->nodename, "num-rings", "%u", + priv->num_rings); + if (ret) + goto error_xenbus; + for (i = 0; i < priv->num_rings; i++) { + char str[16]; + + BUILD_BUG_ON(XEN_9PFS_NUM_RINGS > 9); + sprintf(str, "ring-ref%u", i); + ret = xenbus_printf(xbt, dev->nodename, str, "%d", + priv->rings[i].ref); + if (ret) + goto error_xenbus; + + sprintf(str, "event-channel-%u", i); + ret = xenbus_printf(xbt, dev->nodename, str, "%u", + priv->rings[i].evtchn); + if (ret) + goto error_xenbus; + } + priv->tag = xenbus_read(xbt, dev->nodename, "tag", NULL); + if (!priv->tag) { + ret = -EINVAL; + goto error_xenbus; + } + ret = 
xenbus_transaction_end(xbt, 0);
+	if (ret) {
+		if (ret == -EAGAIN)
+			goto again;
+		xenbus_dev_fatal(dev, ret, "completing transaction");
+		goto error;
+	}
+
+	write_lock(&xen_9pfs_lock);
+	list_add_tail(&priv->list, &xen_9pfs_devs);
+	write_unlock(&xen_9pfs_lock);
+	dev_set_drvdata(&dev->dev, priv);
+	xenbus_switch_state(dev, XenbusStateInitialised);
+
+	return 0;
+
+ error_xenbus:
+	xenbus_transaction_end(xbt, 1);
+	xenbus_dev_fatal(dev, ret, "writing xenstore");
+ error:
+	dev_set_drvdata(&dev->dev, NULL);
+	xen_9pfs_front_free(priv);
+	return ret;
+}
+
+static int xen_9pfs_front_resume(struct xenbus_device *dev)
+{
+	dev_warn(&dev->dev, "suspend/resume unsupported\n");
+	return 0;
+}
+
+static void xen_9pfs_front_changed(struct xenbus_device *dev,
+				   enum xenbus_state backend_state)
+{
+	switch (backend_state) {
+	case XenbusStateReconfiguring:
+	case XenbusStateReconfigured:
+	case XenbusStateInitialising:
+	case XenbusStateInitialised:
+	case XenbusStateUnknown:
+		break;
+
+	case XenbusStateInitWait:
+		break;
+
+	case XenbusStateConnected:
+		xenbus_switch_state(dev, XenbusStateConnected);
+		break;
+
+	case XenbusStateClosed:
+		if (dev->state == XenbusStateClosed)
+			break;
+		/* Missed the backend's CLOSING state -- fallthrough */
+	case XenbusStateClosing:
+		xenbus_frontend_closed(dev);
+		break;
+	}
+}
+
+static struct xenbus_driver xen_9pfs_front_driver = {
+	.ids = xen_9pfs_front_ids,
+	.probe = xen_9pfs_front_probe,
+	.remove = xen_9pfs_front_remove,
+	.resume = xen_9pfs_front_resume,
+	.otherend_changed = xen_9pfs_front_changed,
+};
+
+int p9_trans_xen_init(void)
+{
+	if (!xen_domain())
+		return -ENODEV;
+
+	pr_info("Initialising Xen transport for 9pfs\n");
+
+	v9fs_register_trans(&p9_xen_trans);
+	return xenbus_register_frontend(&xen_9pfs_front_driver);
+}
+module_init(p9_trans_xen_init);
+
+void p9_trans_xen_exit(void)
+{
+	v9fs_unregister_trans(&p9_xen_trans);
+	return xenbus_unregister_driver(&xen_9pfs_front_driver);
+}
+module_exit(p9_trans_xen_exit);
diff --git a/net/Makefile b/net/Makefile
index 9b681550e3a3ea3c6146ac67572b6c97a28c9d2c..9086ffbb508514c1e4fb1a5d2d04d6c6b1cf5bea 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -12,7 +12,7 @@ obj-$(CONFIG_NET) += $(tmp-y)
 # LLC has to be linked before the files in net/802/
 obj-$(CONFIG_LLC) += llc/
-obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/
+obj-$(CONFIG_NET) += ethernet/ 802/ sched/ netlink/ bpf/
 obj-$(CONFIG_NETFILTER) += netfilter/
 obj-$(CONFIG_INET) += ipv4/
 obj-$(CONFIG_XFRM) += xfrm/
diff --git a/net/atm/clip.c b/net/atm/clip.c
index 53b4ac09e7b7d5d6a57f049dc1653c961b052622..ec527b62f79db1a4712d6fd654d2f899255de8bc 100644
--- a/net/atm/clip.c
+++ b/net/atm/clip.c
@@ -106,7 +106,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc)
 	entry->expires = jiffies - 1;	/* force resolution or expiration */
 	error = neigh_update(entry->neigh, NULL, NUD_NONE,
-			     NEIGH_UPDATE_F_ADMIN);
+			     NEIGH_UPDATE_F_ADMIN, 0);
 	if (error)
 		pr_crit("neigh_update failed with %d\n", error);
 	goto out;
@@ -481,7 +481,7 @@ static int clip_setentry(struct atm_vcc *vcc, __be32 ip)
 		link_vcc(clip_vcc, entry);
 	}
 	error = neigh_update(neigh, llc_oui, NUD_PERMANENT,
-			     NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN);
+			     NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN, 0);
 	neigh_release(neigh);
 	return error;
 }
diff --git a/net/atm/common.c b/net/atm/common.c
index 9613381f5db04e28ff66749706d1d61d82f77b88..f06422f4108d209fde356457c453164f2f4d7289 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -62,21 +62,16 @@ static void vcc_remove_socket(struct
sock *sk) write_unlock_irq(&vcc_sklist_lock); } -static struct sk_buff *alloc_tx(struct atm_vcc *vcc, unsigned int size) +static bool vcc_tx_ready(struct atm_vcc *vcc, unsigned int size) { - struct sk_buff *skb; struct sock *sk = sk_atm(vcc); if (sk_wmem_alloc_get(sk) && !atm_may_send(vcc, size)) { pr_debug("Sorry: wmem_alloc = %d, size = %d, sndbuf = %d\n", sk_wmem_alloc_get(sk), size, sk->sk_sndbuf); - return NULL; + return false; } - while (!(skb = alloc_skb(size, GFP_KERNEL))) - schedule(); - pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize); - atomic_add(skb->truesize, &sk->sk_wmem_alloc); - return skb; + return true; } static void vcc_sock_destruct(struct sock *sk) @@ -606,7 +601,7 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size) eff = (size+3) & ~3; /* align to word boundary */ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); error = 0; - while (!(skb = alloc_tx(vcc, eff))) { + while (!vcc_tx_ready(vcc, eff)) { if (m->msg_flags & MSG_DONTWAIT) { error = -EAGAIN; break; @@ -628,6 +623,15 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, size_t size) finish_wait(sk_sleep(sk), &wait); if (error) goto out; + + skb = alloc_skb(eff, GFP_KERNEL); + if (!skb) { + error = -ENOMEM; + goto out; + } + pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize); + atomic_add(skb->truesize, &sk->sk_wmem_alloc); + skb->dev = NULL; /* for paths shared with net_device interfaces */ ATM_SKB(skb)->atm_options = vcc->atm_options; if (!copy_from_iter_full(skb_put(skb, size), size, &m->msg_iter)) { diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c index 71343d0fec94b55f7318ec8578abc956148f7791..495ba7cdcb0451c997656116a300a49b7e43a089 100644 --- a/net/batman-adv/bat_iv_ogm.c +++ b/net/batman-adv/bat_iv_ogm.c @@ -679,15 +679,11 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, { struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface); struct batadv_forw_packet *forw_packet_aggr; + struct sk_buff *skb; unsigned char *skb_buff; unsigned int skb_size; atomic_t *queue_left = own_packet ? 
NULL : &bat_priv->batman_queue_left; - forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing, - queue_left, bat_priv); - if (!forw_packet_aggr) - return; - if (atomic_read(&bat_priv->aggregated_ogms) && packet_len < BATADV_MAX_AGGREGATION_BYTES) skb_size = BATADV_MAX_AGGREGATION_BYTES; @@ -696,9 +692,14 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff, skb_size += ETH_HLEN; - forw_packet_aggr->skb = netdev_alloc_skb_ip_align(NULL, skb_size); - if (!forw_packet_aggr->skb) { - batadv_forw_packet_free(forw_packet_aggr, true); + skb = netdev_alloc_skb_ip_align(NULL, skb_size); + if (!skb) + return; + + forw_packet_aggr = batadv_forw_packet_alloc(if_incoming, if_outgoing, + queue_left, bat_priv, skb); + if (!forw_packet_aggr) { + kfree_skb(skb); return; } diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c index ba8420d8a992db2c14936f568f761869afd921c0..d07e89ec84677d22a74891406ab6fa5ee6087011 100644 --- a/net/batman-adv/bridge_loop_avoidance.c +++ b/net/batman-adv/bridge_loop_avoidance.c @@ -395,7 +395,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, ether_addr_copy(ethhdr->h_source, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_send_claim(): CLAIM %pM on vid %d\n", mac, - BATADV_PRINT_VID(vid)); + batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_UNCLAIM: /* unclaim frame @@ -404,7 +404,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, ether_addr_copy(hw_src, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_send_claim(): UNCLAIM %pM on vid %d\n", mac, - BATADV_PRINT_VID(vid)); + batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_ANNOUNCE: /* announcement frame @@ -413,7 +413,7 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, ether_addr_copy(hw_src, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_send_claim(): ANNOUNCE of %pM on vid %d\n", - ethhdr->h_source, BATADV_PRINT_VID(vid)); + ethhdr->h_source, batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_REQUEST: /* request frame @@ -425,14 +425,14 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac, batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_send_claim(): REQUEST of %pM to %pM on vid %d\n", ethhdr->h_source, ethhdr->h_dest, - BATADV_PRINT_VID(vid)); + batadv_print_vid(vid)); break; case BATADV_CLAIM_TYPE_LOOPDETECT: ether_addr_copy(ethhdr->h_source, mac); batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_send_claim(): LOOPDETECT of %pM to %pM on vid %d\n", ethhdr->h_source, ethhdr->h_dest, - BATADV_PRINT_VID(vid)); + batadv_print_vid(vid)); break; } @@ -475,9 +475,9 @@ static void batadv_bla_loopdetect_report(struct work_struct *work) batadv_info(bat_priv->soft_iface, "Possible loop on VLAN %d detected which can't be handled by BLA - please check your network setup!\n", - BATADV_PRINT_VID(backbone_gw->vid)); + batadv_print_vid(backbone_gw->vid)); snprintf(vid_str, sizeof(vid_str), "%d", - BATADV_PRINT_VID(backbone_gw->vid)); + batadv_print_vid(backbone_gw->vid)); vid_str[sizeof(vid_str) - 1] = 0; batadv_throw_uevent(bat_priv, BATADV_UEV_BLA, BATADV_UEV_LOOPDETECT, @@ -510,7 +510,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig, batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_get_backbone_gw(): not found (%pM, %d), creating new entry\n", - orig, BATADV_PRINT_VID(vid)); + orig, batadv_print_vid(vid)); entry = kzalloc(sizeof(*entry), GFP_ATOMIC); if (!entry) @@ -719,7 +719,7 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, 
batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_add_claim(): adding new entry %pM, vid %d to hash ...\n", - mac, BATADV_PRINT_VID(vid)); + mac, batadv_print_vid(vid)); kref_get(&claim->refcount); hash_added = batadv_hash_add(bat_priv->bla.claim_hash, @@ -739,8 +739,8 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv, goto claim_free_ref; batadv_dbg(BATADV_DBG_BLA, bat_priv, - "bla_add_claim(): changing ownership for %pM, vid %d\n", - mac, BATADV_PRINT_VID(vid)); + "bla_add_claim(): changing ownership for %pM, vid %d to gw %pM\n", + mac, batadv_print_vid(vid), backbone_gw->orig); remove_crc = true; } @@ -809,7 +809,7 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv, return; batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_del_claim(): %pM, vid %d\n", - mac, BATADV_PRINT_VID(vid)); + mac, batadv_print_vid(vid)); batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim, batadv_choose_claim, claim); @@ -849,7 +849,7 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, batadv_dbg(BATADV_DBG_BLA, bat_priv, "handle_announce(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n", - BATADV_PRINT_VID(vid), backbone_gw->orig, crc); + batadv_print_vid(vid), backbone_gw->orig, crc); spin_lock_bh(&backbone_gw->crc_lock); backbone_crc = backbone_gw->crc; @@ -859,7 +859,7 @@ static bool batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr, batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv, "handle_announce(): CRC FAILED for %pM/%d (my = %#.4x, sent = %#.4x)\n", backbone_gw->orig, - BATADV_PRINT_VID(backbone_gw->vid), + batadv_print_vid(backbone_gw->vid), backbone_crc, crc); batadv_bla_send_request(backbone_gw); @@ -904,7 +904,7 @@ static bool batadv_handle_request(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_BLA, bat_priv, "handle_request(): REQUEST vid %d (sent by %pM)...\n", - BATADV_PRINT_VID(vid), ethhdr->h_source); + batadv_print_vid(vid), ethhdr->h_source); batadv_bla_answer_request(bat_priv, primary_if, vid); return true; @@ -941,7 +941,7 @@ static bool batadv_handle_unclaim(struct batadv_priv *bat_priv, /* this must be an UNCLAIM frame */ batadv_dbg(BATADV_DBG_BLA, bat_priv, "handle_unclaim(): UNCLAIM %pM on vid %d (sent by %pM)...\n", - claim_addr, BATADV_PRINT_VID(vid), backbone_gw->orig); + claim_addr, batadv_print_vid(vid), backbone_gw->orig); batadv_bla_del_claim(bat_priv, claim_addr, vid); batadv_backbone_gw_put(backbone_gw); @@ -1161,7 +1161,7 @@ static bool batadv_bla_process_claim(struct batadv_priv *bat_priv, if (ret == 1) batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_process_claim(): received a claim frame from another group. From: %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n", - ethhdr->h_source, BATADV_PRINT_VID(vid), hw_src, + ethhdr->h_source, batadv_print_vid(vid), hw_src, hw_dst); if (ret < 2) @@ -1197,7 +1197,7 @@ static bool batadv_bla_process_claim(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_process_claim(): ERROR - this looks like a claim frame, but is useless. 
eth src %pM on vid %d ...(hw_src %pM, hw_dst %pM)\n", - ethhdr->h_source, BATADV_PRINT_VID(vid), hw_src, hw_dst); + ethhdr->h_source, batadv_print_vid(vid), hw_src, hw_dst); return true; } @@ -1295,10 +1295,13 @@ static void batadv_bla_purge_claims(struct batadv_priv *bat_priv, goto skip; batadv_dbg(BATADV_DBG_BLA, bat_priv, - "bla_purge_claims(): %pM, vid %d, time out\n", - claim->addr, claim->vid); + "bla_purge_claims(): timed out.\n"); purge_now: + batadv_dbg(BATADV_DBG_BLA, bat_priv, + "bla_purge_claims(): %pM, vid %d\n", + claim->addr, claim->vid); + batadv_handle_unclaim(bat_priv, primary_if, backbone_gw->orig, claim->addr, claim->vid); @@ -1846,6 +1849,13 @@ bool batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb, /* possible optimization: race for a claim */ /* No claim exists yet, claim it for us! */ + + batadv_dbg(BATADV_DBG_BLA, bat_priv, + "bla_rx(): Unclaimed MAC %pM found. Claim it. Local: %s\n", + ethhdr->h_source, + batadv_is_my_client(bat_priv, + ethhdr->h_source, vid) ? + "yes" : "no"); batadv_handle_claim(bat_priv, primary_if, primary_if->net_dev->dev_addr, ethhdr->h_source, vid); @@ -1963,10 +1973,22 @@ bool batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb, /* if yes, the client has roamed and we have * to unclaim it. */ - batadv_handle_unclaim(bat_priv, primary_if, - primary_if->net_dev->dev_addr, - ethhdr->h_source, vid); - goto allow; + if (batadv_has_timed_out(claim->lasttime, 100)) { + /* only unclaim if the last claim entry is + * older than 100 ms to make sure we really + * have a roaming client here. + */ + batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_tx(): Roaming client %pM detected. Unclaim it.\n", + ethhdr->h_source); + batadv_handle_unclaim(bat_priv, primary_if, + primary_if->net_dev->dev_addr, + ethhdr->h_source, vid); + goto allow; + } else { + batadv_dbg(BATADV_DBG_BLA, bat_priv, "bla_tx(): Race for claim %pM detected. Drop packet.\n", + ethhdr->h_source); + goto handled; + } } /* check if it is a multicast/broadcast frame */ @@ -2042,7 +2064,7 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset) backbone_crc = backbone_gw->crc; spin_unlock_bh(&backbone_gw->crc_lock); seq_printf(seq, " * %pM on %5d by %pM [%c] (%#.4x)\n", - claim->addr, BATADV_PRINT_VID(claim->vid), + claim->addr, batadv_print_vid(claim->vid), backbone_gw->orig, (is_own ? 'x' : ' '), backbone_crc); @@ -2274,7 +2296,7 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, " * %pM on %5d %4i.%03is (%#.4x)\n", backbone_gw->orig, - BATADV_PRINT_VID(backbone_gw->vid), secs, + batadv_print_vid(backbone_gw->vid), secs, msecs, backbone_crc); } rcu_read_unlock(); @@ -2449,3 +2471,52 @@ int batadv_bla_backbone_dump(struct sk_buff *msg, struct netlink_callback *cb) return ret; } + +#ifdef CONFIG_BATMAN_ADV_DAT +/** + * batadv_bla_check_claim - check if address is claimed + * + * @bat_priv: the bat priv with all the soft interface information + * @addr: mac address of which the claim status is checked + * @vid: the VLAN ID + * + * addr is checked if this address is claimed by the local device itself. 
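+ * (Editor's note: within this patch the only callers are the DAT ARP
+ * snooping paths in distributed-arp-table.c, which use this check so
+ * that only the backbone gw owning a claim answers or forwards a
+ * given ARP frame.)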
+ * + * Return: true if bla is disabled or the mac is claimed by the device, + * false if the device addr is already claimed by another gateway + */ +bool batadv_bla_check_claim(struct batadv_priv *bat_priv, + u8 *addr, unsigned short vid) +{ + struct batadv_bla_claim search_claim; + struct batadv_bla_claim *claim = NULL; + struct batadv_hard_iface *primary_if = NULL; + bool ret = true; + + if (!atomic_read(&bat_priv->bridge_loop_avoidance)) + return ret; + + primary_if = batadv_primary_if_get_selected(bat_priv); + if (!primary_if) + return ret; + + /* First look if the mac address is claimed */ + ether_addr_copy(search_claim.addr, addr); + search_claim.vid = vid; + + claim = batadv_claim_hash_find(bat_priv, &search_claim); + + /* If there is a claim and we are not owner of the claim, + * return false. + */ + if (claim) { + if (!batadv_compare_eth(claim->backbone_gw->orig, + primary_if->net_dev->dev_addr)) + ret = false; + batadv_claim_put(claim); + } + + batadv_hardif_put(primary_if); + return ret; +} +#endif diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h index e157986bd01cf989dc70c93bae7a4fdf17cf3c4f..234775748b8eae9477f802804f004e50d2a4840c 100644 --- a/net/batman-adv/bridge_loop_avoidance.h +++ b/net/batman-adv/bridge_loop_avoidance.h @@ -69,6 +69,10 @@ void batadv_bla_status_update(struct net_device *net_dev); int batadv_bla_init(struct batadv_priv *bat_priv); void batadv_bla_free(struct batadv_priv *bat_priv); int batadv_bla_claim_dump(struct sk_buff *msg, struct netlink_callback *cb); +#ifdef CONFIG_BATMAN_ADV_DAT +bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr, + unsigned short vid); +#endif #define BATADV_BLA_CRC_INIT 0 #else /* ifdef CONFIG_BATMAN_ADV_BLA */ @@ -145,6 +149,13 @@ static inline int batadv_bla_backbone_dump(struct sk_buff *msg, return -EOPNOTSUPP; } +static inline +bool batadv_bla_check_claim(struct batadv_priv *bat_priv, u8 *addr, + unsigned short vid) +{ + return true; +} + #endif /* ifdef CONFIG_BATMAN_ADV_BLA */ #endif /* ifndef _NET_BATMAN_ADV_BLA_H_ */ diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c index 1bfd1dbc2feba7bf6c16004ea8ca85ed6066c929..013e970eff393e0550aa250f7e72c27301071552 100644 --- a/net/batman-adv/distributed-arp-table.c +++ b/net/batman-adv/distributed-arp-table.c @@ -43,6 +43,7 @@ #include #include +#include "bridge_loop_avoidance.h" #include "hard-interface.h" #include "hash.h" #include "log.h" @@ -330,7 +331,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, batadv_dbg(BATADV_DBG_DAT, bat_priv, "Entry updated: %pI4 %pM (vid: %d)\n", &dat_entry->ip, dat_entry->mac_addr, - BATADV_PRINT_VID(vid)); + batadv_print_vid(vid)); goto out; } @@ -356,7 +357,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip, } batadv_dbg(BATADV_DBG_DAT, bat_priv, "New entry added: %pI4 %pM (vid: %d)\n", - &dat_entry->ip, dat_entry->mac_addr, BATADV_PRINT_VID(vid)); + &dat_entry->ip, dat_entry->mac_addr, batadv_print_vid(vid)); out: if (dat_entry) @@ -835,7 +836,7 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, " * %15pI4 %14pM %4i %6i:%02i\n", &dat_entry->ip, dat_entry->mac_addr, - BATADV_PRINT_VID(dat_entry->vid), + batadv_print_vid(dat_entry->vid), last_seen_mins, last_seen_secs); } rcu_read_unlock(); @@ -1002,6 +1003,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv, bool ret = false; struct batadv_dat_entry *dat_entry = NULL; 
struct sk_buff *skb_new;
+	struct net_device *soft_iface = bat_priv->soft_iface;
 	int hdr_size = 0;
 	unsigned short vid;
@@ -1040,16 +1042,30 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
 		goto out;
 	}
+	/* If BLA is enabled, only send ARP replies if we have claimed
+	 * the destination for the ARP request or if no other backbone
+	 * gw belonging to our backbone has claimed the destination.
+	 */
+	if (!batadv_bla_check_claim(bat_priv,
+				    dat_entry->mac_addr, vid)) {
+		batadv_dbg(BATADV_DBG_DAT, bat_priv,
+			   "Device %pM claimed by another backbone gw. Don't send ARP reply!",
+			   dat_entry->mac_addr);
+		ret = true;
+		goto out;
+	}
+
 	skb_new = batadv_dat_arp_create_reply(bat_priv, ip_dst, ip_src,
 					      dat_entry->mac_addr,
 					      hw_src, vid);
 	if (!skb_new)
 		goto out;
-	skb_new->protocol = eth_type_trans(skb_new,
-					   bat_priv->soft_iface);
-	bat_priv->stats.rx_packets++;
-	bat_priv->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size;
+	skb_new->protocol = eth_type_trans(skb_new, soft_iface);
+
+	soft_iface->stats.rx_packets++;
+	soft_iface->stats.rx_bytes += skb->len + ETH_HLEN + hdr_size;
 	netif_rx(skb_new);
 	batadv_dbg(BATADV_DBG_DAT, bat_priv, "ARP request replied locally\n");
@@ -1188,6 +1204,7 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
 bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
 					 struct sk_buff *skb, int hdr_size)
 {
+	struct batadv_dat_entry *dat_entry = NULL;
 	u16 type;
 	__be32 ip_src, ip_dst;
 	u8 *hw_src, *hw_dst;
@@ -1210,12 +1227,41 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
 	hw_dst = batadv_arp_hw_dst(skb, hdr_size);
 	ip_dst = batadv_arp_ip_dst(skb, hdr_size);
+	/* If ip_dst is already in cache and has the right mac address,
+	 * drop this frame if this ARP reply is destined for us because it's
+	 * most probably an ARP reply generated by another node of the DHT.
+	 * We have most probably already received a reply earlier. Delivering
+	 * this frame would lead to a doubled receive of an ARP reply.
+	 */
+	dat_entry = batadv_dat_entry_hash_find(bat_priv, ip_src, vid);
+	if (dat_entry && batadv_compare_eth(hw_src, dat_entry->mac_addr)) {
+		batadv_dbg(BATADV_DBG_DAT, bat_priv, "Doubled ARP reply removed: ARP MSG = [src: %pM-%pI4 dst: %pM-%pI4]; dat_entry: %pM-%pI4\n",
+			   hw_src, &ip_src, hw_dst, &ip_dst,
+			   dat_entry->mac_addr, &dat_entry->ip);
+		dropped = true;
+		goto out;
+	}
+
 	/* Update our internal cache with both the IP addresses the node got
 	 * within the ARP reply
 	 */
 	batadv_dat_entry_add(bat_priv, ip_src, hw_src, vid);
 	batadv_dat_entry_add(bat_priv, ip_dst, hw_dst, vid);
+	/* If BLA is enabled, only forward ARP replies if we have claimed the
+	 * source of the ARP reply or if no other gw of the same backbone has
+	 * already claimed that client. This prevents different gateways of
+	 * the same backbone from all forwarding the ARP reply, which would
+	 * lead to multiple replies in the backbone.
+	 */
+	if (!batadv_bla_check_claim(bat_priv, hw_src, vid)) {
+		batadv_dbg(BATADV_DBG_DAT, bat_priv,
+			   "Device %pM claimed by another backbone gw.
Drop ARP reply.\n", + hw_src); + dropped = true; + goto out; + } + /* if this REPLY is directed to a client of mine, let's deliver the * packet to the interface */ @@ -1228,6 +1274,8 @@ bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv, out: if (dropped) kfree_skb(skb); + if (dat_entry) + batadv_dat_entry_put(dat_entry); /* if dropped == false -> deliver to the interface */ return dropped; } @@ -1256,7 +1304,7 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv, /* If this packet is an ARP_REQUEST and the node already has the * information that it is going to ask, then the packet can be dropped */ - if (forw_packet->num_packets) + if (batadv_forw_packet_is_rebroadcast(forw_packet)) goto out; vid = batadv_dat_get_vid(forw_packet->skb, &hdr_size); diff --git a/net/batman-adv/log.h b/net/batman-adv/log.h index 7a2b9f4da07830103a8f00e4c3b488b0367a9dc2..65ce97efa6b5946175f64d300573a532f28387f8 100644 --- a/net/batman-adv/log.h +++ b/net/batman-adv/log.h @@ -73,9 +73,10 @@ __printf(2, 3); /* possibly ratelimited debug output */ #define _batadv_dbg(type, bat_priv, ratelimited, fmt, arg...) \ do { \ - if (atomic_read(&(bat_priv)->log_level) & (type) && \ + struct batadv_priv *__batpriv = (bat_priv); \ + if (atomic_read(&__batpriv->log_level) & (type) && \ (!(ratelimited) || net_ratelimit())) \ - batadv_debug_log(bat_priv, fmt, ## arg); \ + batadv_debug_log(__batpriv, fmt, ## arg); \ } \ while (0) #else /* !CONFIG_BATMAN_ADV_DEBUG */ diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c index 5000c540614d0c0a866857e5245ff3f365d14223..fb381fb26a66196942401ec6cb636c66e648a36a 100644 --- a/net/batman-adv/main.c +++ b/net/batman-adv/main.c @@ -516,6 +516,9 @@ static void batadv_recv_handler_init(void) BUILD_BUG_ON(sizeof(struct batadv_tvlv_tt_change) != 12); BUILD_BUG_ON(sizeof(struct batadv_tvlv_roam_adv) != 8); + i = FIELD_SIZEOF(struct sk_buff, cb); + BUILD_BUG_ON(sizeof(struct batadv_skb_cb) > i); + /* broadcast packet */ batadv_rx_handler[BATADV_BCAST] = batadv_recv_bcast_packet; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 57a8103dbce7f00fb58ff5ae252e12a0f6453aec..810f7d026f544027991af1714c169342792e1a68 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -24,7 +24,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2017.0" +#define BATADV_SOURCE_VERSION "2017.1" #endif /* B.A.T.M.A.N. parameters */ @@ -193,6 +193,7 @@ enum batadv_uev_type { #include #include +#include "packet.h" #include "types.h" struct net_device; @@ -200,8 +201,19 @@ struct packet_type; struct seq_file; struct sk_buff; -#define BATADV_PRINT_VID(vid) (((vid) & BATADV_VLAN_HAS_TAG) ? 
\
-			       (int)((vid) & VLAN_VID_MASK) : -1)
+/**
+ * batadv_print_vid - return printable version of vid information
+ * @vid: the VLAN identifier
+ *
+ * Return: -1 when no VLAN is used, VLAN id otherwise
+ */
+static inline int batadv_print_vid(unsigned short vid)
+{
+	if (vid & BATADV_VLAN_HAS_TAG)
+		return (int)(vid & VLAN_VID_MASK);
+	else
+		return -1;
+}
 extern struct list_head batadv_hardif_list;
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index 952ba81a565b611ee0a83fc0bbfb719b54187407..d327670641ac336a14f0ecc85dd848d4952e8e6e 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -494,9 +494,8 @@ static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv)
 	if (!bridged)
 		goto update;
-#if !IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING)
-	pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n");
-#endif
+	if (!IS_ENABLED(CONFIG_BRIDGE_IGMP_SNOOPING))
+		pr_warn_once("No bridge IGMP snooping compiled - multicast optimizations disabled\n");
 	querier4.exists = br_multicast_has_querier_anywhere(dev, ETH_P_IP);
 	querier4.shadowing = br_multicast_has_querier_adjacent(dev, ETH_P_IP);
@@ -671,7 +670,6 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
 	return 0;
 }
-#if IS_ENABLED(CONFIG_IPV6)
 /**
  * batadv_mcast_is_report_ipv6 - check for MLD reports
  * @skb: the ethernet frame destined for the mesh
@@ -736,7 +734,6 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
 	return 0;
 }
-#endif
 /**
  * batadv_mcast_forw_mode_check - check for optimized forwarding potential
@@ -765,11 +762,12 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
 	case ETH_P_IP:
 		return batadv_mcast_forw_mode_check_ipv4(bat_priv, skb,
 							 is_unsnoopable);
-#if IS_ENABLED(CONFIG_IPV6)
 	case ETH_P_IPV6:
+		if (!IS_ENABLED(CONFIG_IPV6))
+			return -EINVAL;
+
 		return batadv_mcast_forw_mode_check_ipv6(bat_priv, skb,
 							 is_unsnoopable);
-#endif
 	default:
 		return -EINVAL;
 	}
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 7fd740b6e36dfb0e11c67c283cec68a5cfd1b5f6..e1ebe14ee2a6e21cc8d6b4a42552cae4bd15061f 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -941,15 +941,17 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
 	struct batadv_priv *bat_priv = netdev_priv(recv_if->soft_iface);
 	struct batadv_unicast_packet *unicast_packet;
 	struct batadv_unicast_4addr_packet *unicast_4addr_packet;
-	u8 *orig_addr;
-	struct batadv_orig_node *orig_node = NULL;
+	u8 *orig_addr, *orig_addr_gw;
+	struct batadv_orig_node *orig_node = NULL, *orig_node_gw = NULL;
 	int check, hdr_size = sizeof(*unicast_packet);
 	enum batadv_subtype subtype;
-	bool is4addr;
+	struct ethhdr *ethhdr;
 	int ret = NET_RX_DROP;
+	bool is4addr, is_gw;
 	unicast_packet = (struct batadv_unicast_packet *)skb->data;
 	unicast_4addr_packet = (struct batadv_unicast_4addr_packet *)skb->data;
+	ethhdr = eth_hdr(skb);
 	is4addr = unicast_packet->packet_type == BATADV_UNICAST_4ADDR;
 	/* the caller function should have already pulled 2 bytes */
@@ -972,6 +974,23 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
 	/* packet for me */
 	if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) {
+		/* If this is a unicast packet from another backbone gw,
+		 * drop it.
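+		 * (Editor's note: with bridge loop avoidance, backbone
+		 * gws share a common LAN segment, so such a frame should
+		 * reach us directly via the backbone; also accepting the
+		 * copy travelling through the mesh would duplicate it.)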
+ */ + orig_addr_gw = ethhdr->h_source; + orig_node_gw = batadv_orig_hash_find(bat_priv, orig_addr_gw); + if (orig_node_gw) { + is_gw = batadv_bla_is_backbone_gw(skb, orig_node_gw, + hdr_size); + batadv_orig_node_put(orig_node_gw); + if (is_gw) { + batadv_dbg(BATADV_DBG_BLA, bat_priv, + "recv_unicast_packet(): Dropped unicast pkt received from another backbone gw %pM.\n", + orig_addr_gw); + return NET_RX_DROP; + } + } + if (is4addr) { subtype = unicast_4addr_packet->subtype; batadv_dat_inc_counter(bat_priv, subtype); diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c index 1489ec27daff5548b072e88648f5cca192f74afa..403df596a73d28afa8b31b46a2c0649052ce3db0 100644 --- a/net/batman-adv/send.c +++ b/net/batman-adv/send.c @@ -482,6 +482,7 @@ void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet, * @if_outgoing: The (optional) if_outgoing to be grabbed * @queue_left: The (optional) queue counter to decrease * @bat_priv: The bat_priv for the mesh of this forw_packet + * @skb: The raw packet this forwarding packet shall contain * * Allocates a forwarding packet and tries to get a reference to the * (optional) if_incoming, if_outgoing and queue_left. If queue_left @@ -493,7 +494,8 @@ struct batadv_forw_packet * batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, atomic_t *queue_left, - struct batadv_priv *bat_priv) + struct batadv_priv *bat_priv, + struct sk_buff *skb) { struct batadv_forw_packet *forw_packet; const char *qname; @@ -525,7 +527,7 @@ batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming, INIT_HLIST_NODE(&forw_packet->list); INIT_HLIST_NODE(&forw_packet->cleanup_list); - forw_packet->skb = NULL; + forw_packet->skb = skb; forw_packet->queue_left = queue_left; forw_packet->if_incoming = if_incoming; forw_packet->if_outgoing = if_outgoing; @@ -756,22 +758,23 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, if (!primary_if) goto err; + newskb = skb_copy(skb, GFP_ATOMIC); + if (!newskb) { + batadv_hardif_put(primary_if); + goto err; + } + forw_packet = batadv_forw_packet_alloc(primary_if, NULL, &bat_priv->bcast_queue_left, - bat_priv); + bat_priv, newskb); batadv_hardif_put(primary_if); if (!forw_packet) - goto err; - - newskb = skb_copy(skb, GFP_ATOMIC); - if (!newskb) goto err_packet_free; /* as we have a copy now, it is safe to decrease the TTL */ bcast_packet = (struct batadv_bcast_packet *)newskb->data; bcast_packet->ttl--; - forw_packet->skb = newskb; forw_packet->own = own_packet; INIT_DELAYED_WORK(&forw_packet->delayed_work, @@ -781,11 +784,60 @@ int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv, return NETDEV_TX_OK; err_packet_free: - batadv_forw_packet_free(forw_packet, true); + kfree_skb(newskb); err: return NETDEV_TX_BUSY; } +/** + * batadv_forw_packet_bcasts_left - check if a retransmission is necessary + * @forw_packet: the forwarding packet to check + * @hard_iface: the interface to check on + * + * Checks whether a given packet has any (re)transmissions left on the provided + * interface. + * + * hard_iface may be NULL: In that case the number of transmissions this skb had + * so far is compared with the maximum amount of retransmissions independent of + * any interface instead. + * + * Return: True if (re)transmissions are left, false otherwise. 
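+ * (Editor's note: the counter consulted here is the new num_bcasts
+ * field in struct batadv_skb_cb; batadv_interface_tx() now zeroes the
+ * control block for every frame entering the soft interface, so the
+ * count starts at zero.)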
+ */ +static bool +batadv_forw_packet_bcasts_left(struct batadv_forw_packet *forw_packet, + struct batadv_hard_iface *hard_iface) +{ + unsigned int max; + + if (hard_iface) + max = hard_iface->num_bcasts; + else + max = BATADV_NUM_BCASTS_MAX; + + return BATADV_SKB_CB(forw_packet->skb)->num_bcasts < max; +} + +/** + * batadv_forw_packet_bcasts_inc - increment retransmission counter of a packet + * @forw_packet: the packet to increase the counter for + */ +static void +batadv_forw_packet_bcasts_inc(struct batadv_forw_packet *forw_packet) +{ + BATADV_SKB_CB(forw_packet->skb)->num_bcasts++; +} + +/** + * batadv_forw_packet_is_rebroadcast - check packet for previous transmissions + * @forw_packet: the packet to check + * + * Return: True if this packet was transmitted before, false otherwise. + */ +bool batadv_forw_packet_is_rebroadcast(struct batadv_forw_packet *forw_packet) +{ + return BATADV_SKB_CB(forw_packet->skb)->num_bcasts > 0; +} + static void batadv_send_outstanding_bcast_packet(struct work_struct *work) { struct batadv_hard_iface *hard_iface; @@ -826,7 +878,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) if (hard_iface->soft_iface != soft_iface) continue; - if (forw_packet->num_packets >= hard_iface->num_bcasts) + if (!batadv_forw_packet_bcasts_left(forw_packet, hard_iface)) continue; if (forw_packet->own) { @@ -884,10 +936,10 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work) } rcu_read_unlock(); - forw_packet->num_packets++; + batadv_forw_packet_bcasts_inc(forw_packet); /* if we still have some more bcasts to send */ - if (forw_packet->num_packets < BATADV_NUM_BCASTS_MAX) { + if (batadv_forw_packet_bcasts_left(forw_packet, NULL)) { batadv_forw_packet_bcast_queue(bat_priv, forw_packet, send_time); return; diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h index f21166d1032360a1febe3cebdfc9ee0e9958bb9c..a16b34f473ef02e46e642b46d4b31f588d89ea67 100644 --- a/net/batman-adv/send.h +++ b/net/batman-adv/send.h @@ -34,11 +34,13 @@ struct batadv_forw_packet * batadv_forw_packet_alloc(struct batadv_hard_iface *if_incoming, struct batadv_hard_iface *if_outgoing, atomic_t *queue_left, - struct batadv_priv *bat_priv); + struct batadv_priv *bat_priv, + struct sk_buff *skb); bool batadv_forw_packet_steal(struct batadv_forw_packet *packet, spinlock_t *l); void batadv_forw_packet_ogmv1_queue(struct batadv_priv *bat_priv, struct batadv_forw_packet *forw_packet, unsigned long send_time); +bool batadv_forw_packet_is_rebroadcast(struct batadv_forw_packet *forw_packet); int batadv_send_skb_to_orig(struct sk_buff *skb, struct batadv_orig_node *orig_node, diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c index d042c99af028e2083307de1ba8978f2061fee45d..b25789abf7b9e10aec7af1dfc41a5c9ff805284a 100644 --- a/net/batman-adv/soft-interface.c +++ b/net/batman-adv/soft-interface.c @@ -64,28 +64,6 @@ #include "sysfs.h" #include "translation-table.h" -static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd); -static void batadv_get_drvinfo(struct net_device *dev, - struct ethtool_drvinfo *info); -static u32 batadv_get_msglevel(struct net_device *dev); -static void batadv_set_msglevel(struct net_device *dev, u32 value); -static u32 batadv_get_link(struct net_device *dev); -static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data); -static void batadv_get_ethtool_stats(struct net_device *dev, - struct ethtool_stats *stats, u64 *data); -static int batadv_get_sset_count(struct 
net_device *dev, int stringset); - -static const struct ethtool_ops batadv_ethtool_ops = { - .get_settings = batadv_get_settings, - .get_drvinfo = batadv_get_drvinfo, - .get_msglevel = batadv_get_msglevel, - .set_msglevel = batadv_set_msglevel, - .get_link = batadv_get_link, - .get_strings = batadv_get_strings, - .get_ethtool_stats = batadv_get_ethtool_stats, - .get_sset_count = batadv_get_sset_count, -}; - int batadv_skb_head_push(struct sk_buff *skb, unsigned int len) { int result; @@ -140,7 +118,7 @@ static u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx) static struct net_device_stats *batadv_interface_stats(struct net_device *dev) { struct batadv_priv *bat_priv = netdev_priv(dev); - struct net_device_stats *stats = &bat_priv->stats; + struct net_device_stats *stats = &dev->stats; stats->tx_packets = batadv_sum_counter(bat_priv, BATADV_CNT_TX); stats->tx_bytes = batadv_sum_counter(bat_priv, BATADV_CNT_TX_BYTES); @@ -230,6 +208,9 @@ static int batadv_interface_tx(struct sk_buff *skb, if (atomic_read(&bat_priv->mesh_state) != BATADV_MESH_ACTIVE) goto dropped; + /* reset control block to avoid left overs from previous users */ + memset(skb->cb, 0, sizeof(struct batadv_skb_cb)); + netif_trans_update(soft_iface); vid = batadv_get_vid(skb, 0); ethhdr = eth_hdr(skb); @@ -947,6 +928,98 @@ static const struct net_device_ops batadv_netdev_ops = { .ndo_del_slave = batadv_softif_slave_del, }; +static void batadv_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *info) +{ + strlcpy(info->driver, "B.A.T.M.A.N. advanced", sizeof(info->driver)); + strlcpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version)); + strlcpy(info->fw_version, "N/A", sizeof(info->fw_version)); + strlcpy(info->bus_info, "batman", sizeof(info->bus_info)); +} + +/* Inspired by drivers/net/ethernet/dlink/sundance.c:1702 + * Declare each description string in struct.name[] to get fixed sized buffer + * and compile time checking for strings longer than ETH_GSTRING_LEN. 
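+ * (Editor's note: the order of these strings must stay in sync with
+ * enum batadv_counters, since batadv_get_ethtool_stats() fills the
+ * stats array by counter index.)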
+ */ +static const struct { + const char name[ETH_GSTRING_LEN]; +} batadv_counters_strings[] = { + { "tx" }, + { "tx_bytes" }, + { "tx_dropped" }, + { "rx" }, + { "rx_bytes" }, + { "forward" }, + { "forward_bytes" }, + { "mgmt_tx" }, + { "mgmt_tx_bytes" }, + { "mgmt_rx" }, + { "mgmt_rx_bytes" }, + { "frag_tx" }, + { "frag_tx_bytes" }, + { "frag_rx" }, + { "frag_rx_bytes" }, + { "frag_fwd" }, + { "frag_fwd_bytes" }, + { "tt_request_tx" }, + { "tt_request_rx" }, + { "tt_response_tx" }, + { "tt_response_rx" }, + { "tt_roam_adv_tx" }, + { "tt_roam_adv_rx" }, +#ifdef CONFIG_BATMAN_ADV_DAT + { "dat_get_tx" }, + { "dat_get_rx" }, + { "dat_put_tx" }, + { "dat_put_rx" }, + { "dat_cached_reply_tx" }, +#endif +#ifdef CONFIG_BATMAN_ADV_NC + { "nc_code" }, + { "nc_code_bytes" }, + { "nc_recode" }, + { "nc_recode_bytes" }, + { "nc_buffer" }, + { "nc_decode" }, + { "nc_decode_bytes" }, + { "nc_decode_failed" }, + { "nc_sniffed" }, +#endif +}; + +static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data) +{ + if (stringset == ETH_SS_STATS) + memcpy(data, batadv_counters_strings, + sizeof(batadv_counters_strings)); +} + +static void batadv_get_ethtool_stats(struct net_device *dev, + struct ethtool_stats *stats, u64 *data) +{ + struct batadv_priv *bat_priv = netdev_priv(dev); + int i; + + for (i = 0; i < BATADV_CNT_NUM; i++) + data[i] = batadv_sum_counter(bat_priv, i); +} + +static int batadv_get_sset_count(struct net_device *dev, int stringset) +{ + if (stringset == ETH_SS_STATS) + return BATADV_CNT_NUM; + + return -EOPNOTSUPP; +} + +static const struct ethtool_ops batadv_ethtool_ops = { + .get_drvinfo = batadv_get_drvinfo, + .get_link = ethtool_op_get_link, + .get_strings = batadv_get_strings, + .get_ethtool_stats = batadv_get_ethtool_stats, + .get_sset_count = batadv_get_sset_count, +}; + /** * batadv_softif_free - Deconstructor of batadv_soft_interface * @dev: Device to cleanup and remove @@ -971,8 +1044,6 @@ static void batadv_softif_free(struct net_device *dev) */ static void batadv_softif_init_early(struct net_device *dev) { - struct batadv_priv *priv = netdev_priv(dev); - ether_setup(dev); dev->netdev_ops = &batadv_netdev_ops; @@ -989,8 +1060,6 @@ static void batadv_softif_init_early(struct net_device *dev) eth_hw_addr_random(dev); dev->ethtool_ops = &batadv_ethtool_ops; - - memset(priv, 0, sizeof(*priv)); } struct net_device *batadv_softif_create(struct net *net, const char *name) @@ -1083,118 +1152,3 @@ struct rtnl_link_ops batadv_link_ops __read_mostly = { .setup = batadv_softif_init_early, .dellink = batadv_softif_destroy_netlink, }; - -/* ethtool */ -static int batadv_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) -{ - cmd->supported = 0; - cmd->advertising = 0; - ethtool_cmd_speed_set(cmd, SPEED_10); - cmd->duplex = DUPLEX_FULL; - cmd->port = PORT_TP; - cmd->phy_address = 0; - cmd->transceiver = XCVR_INTERNAL; - cmd->autoneg = AUTONEG_DISABLE; - cmd->maxtxpkt = 0; - cmd->maxrxpkt = 0; - - return 0; -} - -static void batadv_get_drvinfo(struct net_device *dev, - struct ethtool_drvinfo *info) -{ - strlcpy(info->driver, "B.A.T.M.A.N. 
advanced", sizeof(info->driver)); - strlcpy(info->version, BATADV_SOURCE_VERSION, sizeof(info->version)); - strlcpy(info->fw_version, "N/A", sizeof(info->fw_version)); - strlcpy(info->bus_info, "batman", sizeof(info->bus_info)); -} - -static u32 batadv_get_msglevel(struct net_device *dev) -{ - return -EOPNOTSUPP; -} - -static void batadv_set_msglevel(struct net_device *dev, u32 value) -{ -} - -static u32 batadv_get_link(struct net_device *dev) -{ - return 1; -} - -/* Inspired by drivers/net/ethernet/dlink/sundance.c:1702 - * Declare each description string in struct.name[] to get fixed sized buffer - * and compile time checking for strings longer than ETH_GSTRING_LEN. - */ -static const struct { - const char name[ETH_GSTRING_LEN]; -} batadv_counters_strings[] = { - { "tx" }, - { "tx_bytes" }, - { "tx_dropped" }, - { "rx" }, - { "rx_bytes" }, - { "forward" }, - { "forward_bytes" }, - { "mgmt_tx" }, - { "mgmt_tx_bytes" }, - { "mgmt_rx" }, - { "mgmt_rx_bytes" }, - { "frag_tx" }, - { "frag_tx_bytes" }, - { "frag_rx" }, - { "frag_rx_bytes" }, - { "frag_fwd" }, - { "frag_fwd_bytes" }, - { "tt_request_tx" }, - { "tt_request_rx" }, - { "tt_response_tx" }, - { "tt_response_rx" }, - { "tt_roam_adv_tx" }, - { "tt_roam_adv_rx" }, -#ifdef CONFIG_BATMAN_ADV_DAT - { "dat_get_tx" }, - { "dat_get_rx" }, - { "dat_put_tx" }, - { "dat_put_rx" }, - { "dat_cached_reply_tx" }, -#endif -#ifdef CONFIG_BATMAN_ADV_NC - { "nc_code" }, - { "nc_code_bytes" }, - { "nc_recode" }, - { "nc_recode_bytes" }, - { "nc_buffer" }, - { "nc_decode" }, - { "nc_decode_bytes" }, - { "nc_decode_failed" }, - { "nc_sniffed" }, -#endif -}; - -static void batadv_get_strings(struct net_device *dev, u32 stringset, u8 *data) -{ - if (stringset == ETH_SS_STATS) - memcpy(data, batadv_counters_strings, - sizeof(batadv_counters_strings)); -} - -static void batadv_get_ethtool_stats(struct net_device *dev, - struct ethtool_stats *stats, u64 *data) -{ - struct batadv_priv *bat_priv = netdev_priv(dev); - int i; - - for (i = 0; i < BATADV_CNT_NUM; i++) - data[i] = batadv_sum_counter(bat_priv, i); -} - -static int batadv_get_sset_count(struct net_device *dev, int stringset) -{ - if (stringset == ETH_SS_STATS) - return BATADV_CNT_NUM; - - return -EOPNOTSUPP; -} diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c index c94ebdecdc3d123f71c5af89783623e58d2f323d..556f9a865ddfb5e488c65e9edd708fad187f56b1 100644 --- a/net/batman-adv/tp_meter.c +++ b/net/batman-adv/tp_meter.c @@ -873,8 +873,8 @@ static int batadv_tp_send(void *arg) /* something went wrong during the preparation/transmission */ if (unlikely(err && err != BATADV_TP_REASON_CANT_SEND)) { batadv_dbg(BATADV_DBG_TP_METER, bat_priv, - "Meter: batadv_tp_send() cannot send packets (%d)\n", - err); + "Meter: %s() cannot send packets (%d)\n", + __func__, err); /* ensure nobody else tries to stop the thread now */ if (atomic_dec_and_test(&tp_vars->sending)) tp_vars->reason = err; @@ -979,7 +979,8 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst, if (!tp_vars) { spin_unlock_bh(&bat_priv->tp_list_lock); batadv_dbg(BATADV_DBG_TP_METER, bat_priv, - "Meter: batadv_tp_start cannot allocate list elements\n"); + "Meter: %s cannot allocate list elements\n", + __func__); batadv_tp_batctl_error_notify(BATADV_TP_REASON_MEMORY_ERROR, dst, bat_priv, session_cookie); return; diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c index 6077a87d46f0f781ac72dcc3cc1d9f84c814ad19..e75b4937b497401284bc553182737a2f7af54605 100644 --- 
a/net/batman-adv/translation-table.c +++ b/net/batman-adv/translation-table.c @@ -617,7 +617,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, - BATADV_PRINT_VID(tt_global->common.vid), message); + batadv_print_vid(tt_global->common.vid), message); batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt, batadv_choose_tt, &tt_global->common); @@ -671,7 +671,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, if (tt_local->common.flags & BATADV_TT_CLIENT_PENDING) { batadv_dbg(BATADV_DBG_TT, bat_priv, "Re-adding pending client %pM (vid: %d)\n", - addr, BATADV_PRINT_VID(vid)); + addr, batadv_print_vid(vid)); /* whatever the reason why the PENDING flag was set, * this is a client which was enqueued to be removed in * this orig_interval. Since it popped up again, the @@ -684,7 +684,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, if (tt_local->common.flags & BATADV_TT_CLIENT_ROAM) { batadv_dbg(BATADV_DBG_TT, bat_priv, "Roaming client %pM (vid: %d) came back to its original location\n", - addr, BATADV_PRINT_VID(vid)); + addr, batadv_print_vid(vid)); /* the ROAM flag is set because this client roamed away * and the node got a roaming_advertisement message. Now * that the client popped up again at its original @@ -716,7 +716,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, if (!vlan) { net_ratelimited_function(batadv_info, soft_iface, "adding TT local entry %pM to non-existent VLAN %d\n", - addr, BATADV_PRINT_VID(vid)); + addr, batadv_print_vid(vid)); kmem_cache_free(batadv_tl_cache, tt_local); tt_local = NULL; goto out; @@ -724,7 +724,7 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr, batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new local tt entry: %pM (vid: %d, ttvn: %d)\n", - addr, BATADV_PRINT_VID(vid), + addr, batadv_print_vid(vid), (u8)atomic_read(&bat_priv->tt.vn)); ether_addr_copy(tt_local->common.addr, addr); @@ -1097,7 +1097,7 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset) seq_printf(seq, " * %pM %4i [%c%c%c%c%c%c] %3u.%03u (%#.8x)\n", tt_common_entry->addr, - BATADV_PRINT_VID(tt_common_entry->vid), + batadv_print_vid(tt_common_entry->vid), ((tt_common_entry->flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), no_purge ? 
'P' : '.', @@ -1296,7 +1296,7 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_TT, bat_priv, "Local tt entry (%pM, vid: %d) pending to be removed: %s\n", tt_local_entry->common.addr, - BATADV_PRINT_VID(tt_local_entry->common.vid), message); + batadv_print_vid(tt_local_entry->common.vid), message); } /** @@ -1727,7 +1727,7 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_TT, bat_priv, "Creating new global tt entry: %pM (vid: %d, via %pM)\n", - common->addr, BATADV_PRINT_VID(common->vid), + common->addr, batadv_print_vid(common->vid), orig_node->orig); ret = true; @@ -1835,7 +1835,7 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv, if (!vlan) { seq_printf(seq, " * Cannot retrieve VLAN %d for originator %pM\n", - BATADV_PRINT_VID(tt_common_entry->vid), + batadv_print_vid(tt_common_entry->vid), best_entry->orig_node->orig); goto print_list; } @@ -1844,7 +1844,7 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv, seq_printf(seq, " %c %pM %4i (%3u) via %pM (%3u) (%#.8x) [%c%c%c%c]\n", '*', tt_global_entry->common.addr, - BATADV_PRINT_VID(tt_global_entry->common.vid), + batadv_print_vid(tt_global_entry->common.vid), best_entry->ttvn, best_entry->orig_node->orig, last_ttvn, vlan->tt.crc, ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), @@ -1867,7 +1867,7 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv, if (!vlan) { seq_printf(seq, " + Cannot retrieve VLAN %d for originator %pM\n", - BATADV_PRINT_VID(tt_common_entry->vid), + batadv_print_vid(tt_common_entry->vid), orig_entry->orig_node->orig); continue; } @@ -1876,7 +1876,7 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv, seq_printf(seq, " %c %pM %4d (%3u) via %pM (%3u) (%#.8x) [%c%c%c%c]\n", '+', tt_global_entry->common.addr, - BATADV_PRINT_VID(tt_global_entry->common.vid), + batadv_print_vid(tt_global_entry->common.vid), orig_entry->ttvn, orig_entry->orig_node->orig, last_ttvn, vlan->tt.crc, ((flags & BATADV_TT_CLIENT_ROAM) ? 'R' : '.'), @@ -2213,7 +2213,7 @@ batadv_tt_global_del_orig_node(struct batadv_priv *bat_priv, "Deleting %pM from global tt entry %pM (vid: %d): %s\n", orig_node->orig, tt_global_entry->common.addr, - BATADV_PRINT_VID(vid), message); + batadv_print_vid(vid), message); _batadv_tt_global_del_orig_entry(tt_global_entry, orig_entry); } @@ -2253,12 +2253,13 @@ batadv_tt_global_del_roaming(struct batadv_priv *bat_priv, /* its the last one, mark for roaming. */ tt_global_entry->common.flags |= BATADV_TT_CLIENT_ROAM; tt_global_entry->roam_at = jiffies; - } else + } else { /* there is another entry, we can simply delete this * one and can still use the other one. */ batadv_tt_global_del_orig_node(bat_priv, tt_global_entry, orig_node, message); + } } /** @@ -2314,10 +2315,11 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv, /* local entry exists, case 2: client roamed to us. 
*/ batadv_tt_global_del_orig_list(tt_global_entry); batadv_tt_global_free(bat_priv, tt_global_entry, message); - } else + } else { /* no local entry exists, case 1: check for roaming */ batadv_tt_global_del_roaming(bat_priv, tt_global_entry, orig_node, message); + } out: if (tt_global_entry) @@ -2375,7 +2377,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, - BATADV_PRINT_VID(vid), message); + batadv_print_vid(vid), message); hlist_del_rcu(&tt_common_entry->hash_entry); batadv_tt_global_entry_put(tt_global); } @@ -2435,7 +2437,7 @@ static void batadv_tt_global_purge(struct batadv_priv *bat_priv) batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting global tt entry %pM (vid: %d): %s\n", tt_global->common.addr, - BATADV_PRINT_VID(tt_global->common.vid), + batadv_print_vid(tt_global->common.vid), msg); hlist_del_rcu(&tt_common->hash_entry); @@ -3650,7 +3652,7 @@ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client, batadv_dbg(BATADV_DBG_TT, bat_priv, "Sending ROAMING_ADV to %pM (client %pM, vid: %d)\n", - orig_node->orig, client, BATADV_PRINT_VID(vid)); + orig_node->orig, client, batadv_print_vid(vid)); batadv_inc_counter(bat_priv, BATADV_CNT_TT_ROAM_ADV_TX); @@ -3773,7 +3775,7 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv) batadv_dbg(BATADV_DBG_TT, bat_priv, "Deleting local tt entry (%pM, vid: %d): pending\n", tt_common->addr, - BATADV_PRINT_VID(tt_common->vid)); + batadv_print_vid(tt_common->vid)); batadv_tt_local_size_dec(bat_priv, tt_common->vid); hlist_del_rcu(&tt_common->hash_entry); @@ -4017,7 +4019,7 @@ bool batadv_tt_add_temporary_global_entry(struct batadv_priv *bat_priv, batadv_dbg(BATADV_DBG_TT, bat_priv, "Added temporary global client (addr: %pM, vid: %d, orig: %pM)\n", - addr, BATADV_PRINT_VID(vid), orig_node->orig); + addr, batadv_print_vid(vid), orig_node->orig); ret = true; out: return ret; diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 246f21b4973bc39d0678273ad831da1f5b7e0df3..ea43a64492479809fe6bdf95b436792078f50e9f 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1000,7 +1000,6 @@ struct batadv_priv_bat_v { * struct batadv_priv - per mesh interface data * @mesh_state: current status of the mesh (inactive/active/deactivating) * @soft_iface: net device which holds this struct as private data - * @stats: structure holding the data for the ndo_get_stats() call * @bat_counters: mesh internal traffic statistic counters (see batadv_counters) * @aggregated_ogms: bool indicating whether OGM aggregation is enabled * @bonding: bool indicating whether traffic bonding is enabled @@ -1055,7 +1054,6 @@ struct batadv_priv_bat_v { struct batadv_priv { atomic_t mesh_state; struct net_device *soft_iface; - struct net_device_stats stats; u64 __percpu *bat_counters; /* Per cpu counters */ atomic_t aggregated_ogms; atomic_t bonding; @@ -1377,9 +1375,11 @@ struct batadv_nc_packet { * relevant to batman-adv in the skb->cb buffer in skbs. 
* @decoded: Marks a skb as decoded, which is checked when searching for coding * opportunities in network-coding.c + * @num_bcasts: Counter for broadcast packet retransmissions */ struct batadv_skb_cb { bool decoded; + unsigned int num_bcasts; }; /** @@ -1392,7 +1392,7 @@ struct batadv_skb_cb { * @skb: bcast packet's skb buffer * @packet_len: size of aggregated OGM packet inside the skb buffer * @direct_link_flags: direct link flags for aggregated OGM packets - * @num_packets: counter for bcast packet retransmission + * @num_packets: counter for aggregated OGMv1 packets * @delayed_work: work queue callback item for packet sending * @if_incoming: pointer to incoming hard-iface or primary iface if * locally generated packet diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c index d491529332f4568b532759331bd8f88653573ba0..608959989f8eddbfc9b97279a7fba61fd7381d04 100644 --- a/net/bluetooth/6lowpan.c +++ b/net/bluetooth/6lowpan.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -38,7 +39,6 @@ struct skb_cb { struct in6_addr addr; struct in6_addr gw; struct l2cap_chan *chan; - int status; }; #define lowpan_cb(skb) ((struct skb_cb *)((skb)->cb)) @@ -64,7 +64,7 @@ struct lowpan_peer { struct l2cap_chan *chan; /* peer addresses in various formats */ - unsigned char eui64_addr[EUI64_ADDR_LEN]; + unsigned char lladdr[ETH_ALEN]; struct in6_addr peer_addr; }; @@ -270,28 +270,20 @@ static int give_skb_to_upper(struct sk_buff *skb, struct net_device *dev) } static int iphc_decompress(struct sk_buff *skb, struct net_device *netdev, - struct l2cap_chan *chan) + struct lowpan_peer *peer) { - const u8 *saddr, *daddr; + const u8 *saddr; struct lowpan_btle_dev *dev; - struct lowpan_peer *peer; dev = lowpan_btle_dev(netdev); - rcu_read_lock(); - peer = __peer_lookup_chan(dev, chan); - rcu_read_unlock(); - if (!peer) - return -EINVAL; - - saddr = peer->eui64_addr; - daddr = dev->netdev->dev_addr; + saddr = peer->lladdr; - return lowpan_header_decompress(skb, netdev, daddr, saddr); + return lowpan_header_decompress(skb, netdev, netdev->dev_addr, saddr); } static int recv_pkt(struct sk_buff *skb, struct net_device *dev, - struct l2cap_chan *chan) + struct lowpan_peer *peer) { struct sk_buff *local_skb; int ret; @@ -344,8 +336,9 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, local_skb->dev = dev; - ret = iphc_decompress(local_skb, dev, chan); + ret = iphc_decompress(local_skb, dev, peer); if (ret < 0) { + BT_DBG("iphc_decompress failed: %d", ret); kfree_skb(local_skb); goto drop; } @@ -365,6 +358,7 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev, consume_skb(local_skb); consume_skb(skb); } else { + BT_DBG("unknown packet type"); goto drop; } @@ -390,7 +384,7 @@ static int chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) if (!dev || !dev->netdev) return -ENOENT; - err = recv_pkt(skb, dev->netdev, chan); + err = recv_pkt(skb, dev->netdev, peer); if (err) { BT_DBG("recv pkt %d", err); err = -EAGAIN; @@ -399,37 +393,6 @@ static int chan_recv_cb(struct l2cap_chan *chan, struct sk_buff *skb) return err; } -static u8 get_addr_type_from_eui64(u8 byte) -{ - /* Is universal(0) or local(1) bit */ - return ((byte & 0x02) ? 
BDADDR_LE_RANDOM : BDADDR_LE_PUBLIC); -} - -static void copy_to_bdaddr(struct in6_addr *ip6_daddr, bdaddr_t *addr) -{ - u8 *eui64 = ip6_daddr->s6_addr + 8; - - addr->b[0] = eui64[7]; - addr->b[1] = eui64[6]; - addr->b[2] = eui64[5]; - addr->b[3] = eui64[2]; - addr->b[4] = eui64[1]; - addr->b[5] = eui64[0]; -} - -static void convert_dest_bdaddr(struct in6_addr *ip6_daddr, - bdaddr_t *addr, u8 *addr_type) -{ - copy_to_bdaddr(ip6_daddr, addr); - - /* We need to toggle the U/L bit that we got from IPv6 address - * so that we get the proper address and type of the BD address. - */ - addr->b[5] ^= 0x02; - - *addr_type = get_addr_type_from_eui64(addr->b[5]); -} - static int setup_header(struct sk_buff *skb, struct net_device *netdev, bdaddr_t *peer_addr, u8 *peer_addr_type) { @@ -437,8 +400,7 @@ static int setup_header(struct sk_buff *skb, struct net_device *netdev, struct ipv6hdr *hdr; struct lowpan_btle_dev *dev; struct lowpan_peer *peer; - bdaddr_t addr, *any = BDADDR_ANY; - u8 *daddr = any->b; + u8 *daddr; int err, status = 0; hdr = ipv6_hdr(skb); @@ -449,34 +411,24 @@ static int setup_header(struct sk_buff *skb, struct net_device *netdev, if (ipv6_addr_is_multicast(&ipv6_daddr)) { lowpan_cb(skb)->chan = NULL; + daddr = NULL; } else { - u8 addr_type; + BT_DBG("dest IP %pI6c", &ipv6_daddr); - /* Get destination BT device from skb. - * If there is no such peer then discard the packet. + /* The packet might be sent to 6lowpan interface + * because of routing (either via default route + * or user set route) so get peer according to + * the destination address. */ - convert_dest_bdaddr(&ipv6_daddr, &addr, &addr_type); - - BT_DBG("dest addr %pMR type %d IP %pI6c", &addr, - addr_type, &ipv6_daddr); - - peer = peer_lookup_ba(dev, &addr, addr_type); + peer = peer_lookup_dst(dev, &ipv6_daddr, skb); if (!peer) { - /* The packet might be sent to 6lowpan interface - * because of routing (either via default route - * or user set route) so get peer according to - * the destination address. - */ - peer = peer_lookup_dst(dev, &ipv6_daddr, skb); - if (!peer) { - BT_DBG("no such peer %pMR found", &addr); - return -ENOENT; - } + BT_DBG("no such peer"); + return -ENOENT; } - daddr = peer->eui64_addr; - *peer_addr = addr; - *peer_addr_type = addr_type; + daddr = peer->lladdr; + *peer_addr = peer->chan->dst; + *peer_addr_type = peer->chan->dst_type; lowpan_cb(skb)->chan = peer->chan; status = 1; @@ -527,15 +479,8 @@ static int send_pkt(struct l2cap_chan *chan, struct sk_buff *skb, return 0; } - if (!err) - err = lowpan_cb(skb)->status; - - if (err < 0) { - if (err == -EAGAIN) - netdev->stats.tx_dropped++; - else - netdev->stats.tx_errors++; - } + if (err < 0) + netdev->stats.tx_errors++; return err; } @@ -647,9 +592,9 @@ static void netdev_setup(struct net_device *dev) { dev->hard_header_len = 0; dev->needed_tailroom = 0; - dev->flags = IFF_RUNNING | IFF_POINTOPOINT | - IFF_MULTICAST; + dev->flags = IFF_RUNNING | IFF_MULTICAST; dev->watchdog_timeo = 0; + dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN; dev->netdev_ops = &netdev_ops; dev->header_ops = &header_ops; @@ -660,34 +605,6 @@ static struct device_type bt_type = { .name = "bluetooth", }; -static void set_addr(u8 *eui, u8 *addr, u8 addr_type) -{ - /* addr is the BT address in little-endian format */ - eui[0] = addr[5]; - eui[1] = addr[4]; - eui[2] = addr[3]; - eui[3] = 0xFF; - eui[4] = 0xFE; - eui[5] = addr[2]; - eui[6] = addr[1]; - eui[7] = addr[0]; - - /* Universal/local bit set, BT 6lowpan draft ch. 
3.2.1 */ - if (addr_type == BDADDR_LE_PUBLIC) - eui[0] &= ~0x02; - else - eui[0] |= 0x02; - - BT_DBG("type %d addr %*phC", addr_type, 8, eui); -} - -static void set_dev_addr(struct net_device *netdev, bdaddr_t *addr, - u8 addr_type) -{ - netdev->addr_assign_type = NET_ADDR_PERM; - set_addr(netdev->dev_addr, addr->b, addr_type); -} - static void ifup(struct net_device *netdev) { int err; @@ -746,16 +663,9 @@ static struct l2cap_chan *chan_create(void) return chan; } -static void set_ip_addr_bits(u8 addr_type, u8 *addr) -{ - if (addr_type == BDADDR_LE_PUBLIC) - *addr |= 0x02; - else - *addr &= ~0x02; -} - static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, - struct lowpan_btle_dev *dev) + struct lowpan_btle_dev *dev, + bool new_netdev) { struct lowpan_peer *peer; @@ -766,19 +676,9 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, peer->chan = chan; memset(&peer->peer_addr, 0, sizeof(struct in6_addr)); - /* RFC 2464 ch. 5 */ - peer->peer_addr.s6_addr[0] = 0xFE; - peer->peer_addr.s6_addr[1] = 0x80; - set_addr((u8 *)&peer->peer_addr.s6_addr + 8, chan->dst.b, - chan->dst_type); - - memcpy(&peer->eui64_addr, (u8 *)&peer->peer_addr.s6_addr + 8, - EUI64_ADDR_LEN); + baswap((void *)peer->lladdr, &chan->dst); - /* IPv6 address needs to have the U/L bit set properly so toggle - * it back here. - */ - set_ip_addr_bits(chan->dst_type, (u8 *)&peer->peer_addr.s6_addr + 8); + lowpan_iphc_uncompress_eui48_lladdr(&peer->peer_addr, peer->lladdr); spin_lock(&devices_lock); INIT_LIST_HEAD(&peer->list); @@ -786,7 +686,8 @@ static struct l2cap_chan *add_peer_chan(struct l2cap_chan *chan, spin_unlock(&devices_lock); /* Notifying peers about us needs to be done without locks held */ - INIT_DELAYED_WORK(&dev->notify_peers, do_notify_peers); + if (new_netdev) + INIT_DELAYED_WORK(&dev->notify_peers, do_notify_peers); schedule_delayed_work(&dev->notify_peers, msecs_to_jiffies(100)); return peer->chan; @@ -803,7 +704,8 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev) if (!netdev) return -ENOMEM; - set_dev_addr(netdev, &chan->src, chan->src_type); + netdev->addr_assign_type = NET_ADDR_PERM; + baswap((void *)netdev->dev_addr, &chan->src); netdev->netdev_ops = &netdev_ops; SET_NETDEV_DEV(netdev, &chan->conn->hcon->hdev->dev); @@ -843,6 +745,7 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_btle_dev **dev) static inline void chan_ready_cb(struct l2cap_chan *chan) { struct lowpan_btle_dev *dev; + bool new_netdev = false; dev = lookup_dev(chan->conn); @@ -853,12 +756,13 @@ static inline void chan_ready_cb(struct l2cap_chan *chan) l2cap_chan_del(chan, -ENOENT); return; } + new_netdev = true; } if (!try_module_get(THIS_MODULE)) return; - add_peer_chan(chan, dev); + add_peer_chan(chan, dev, new_netdev); ifup(dev->netdev); } @@ -964,26 +868,28 @@ static struct sk_buff *chan_alloc_skb_cb(struct l2cap_chan *chan, static void chan_suspend_cb(struct l2cap_chan *chan) { - struct sk_buff *skb = chan->data; + struct lowpan_btle_dev *dev; - BT_DBG("chan %p conn %p skb %p", chan, chan->conn, skb); + BT_DBG("chan %p suspend", chan); - if (!skb) + dev = lookup_dev(chan->conn); + if (!dev || !dev->netdev) return; - lowpan_cb(skb)->status = -EAGAIN; + netif_stop_queue(dev->netdev); } static void chan_resume_cb(struct l2cap_chan *chan) { - struct sk_buff *skb = chan->data; + struct lowpan_btle_dev *dev; - BT_DBG("chan %p conn %p skb %p", chan, chan->conn, skb); + BT_DBG("chan %p resume", chan); - if (!skb) + dev = lookup_dev(chan->conn); + if (!dev || 
!dev->netdev) return; - lowpan_cb(skb)->status = 0; + netif_wake_queue(dev->netdev); } static long chan_get_sndtimeo_cb(struct l2cap_chan *chan) diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig index 06c31b9a68b0bce3cc4f28224c3c23864f78e7c9..68f951b3e85aa725485bef77de63f162f0241d97 100644 --- a/net/bluetooth/Kconfig +++ b/net/bluetooth/Kconfig @@ -13,6 +13,7 @@ menuconfig BT select CRYPTO_CMAC select CRYPTO_ECB select CRYPTO_SHA256 + select CRYPTO_ECDH help Bluetooth is low-cost, low-power, short-range wireless technology. It was designed as a replacement for cables and other short-range diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile index 4bfaa19a557382311369b229b0d26846d5a968dc..5d0a113e2e4049e6043099ddcf1fdd50c22ce0ab 100644 --- a/net/bluetooth/Makefile +++ b/net/bluetooth/Makefile @@ -13,7 +13,7 @@ bluetooth_6lowpan-y := 6lowpan.o bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \ hci_sock.o hci_sysfs.o l2cap_core.o l2cap_sock.o smp.o lib.o \ - ecc.o hci_request.o mgmt_util.o + ecdh_helper.o hci_request.o mgmt_util.o bluetooth-$(CONFIG_BT_BREDR) += sco.o bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 69e1f7d362a8b71ed61cc12c66e8b81837712447..42d0997e2fbb78af58970504baf0d855d72a964a 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -159,12 +159,17 @@ void bt_accept_enqueue(struct sock *parent, struct sock *sk) BT_DBG("parent %p, sk %p", parent, sk); sock_hold(sk); + lock_sock(sk); list_add_tail(&bt_sk(sk)->accept_q, &bt_sk(parent)->accept_q); bt_sk(sk)->parent = parent; + release_sock(sk); parent->sk_ack_backlog++; } EXPORT_SYMBOL(bt_accept_enqueue); +/* Calling function must hold the sk lock. + * bt_sk(sk)->parent must be non-NULL meaning sk is in the parent list. + */ void bt_accept_unlink(struct sock *sk) { BT_DBG("sk %p state %d", sk, sk->sk_state); @@ -183,11 +188,32 @@ struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock) BT_DBG("parent %p", parent); +restart: list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) { sk = (struct sock *)s; + /* Prevent early freeing of sk due to unlink and sock_kill */ + sock_hold(sk); lock_sock(sk); + /* Check sk has not already been unlinked via + * bt_accept_unlink() due to serialisation caused by sk locking + */ + if (!bt_sk(sk)->parent) { + BT_DBG("sk %p, already unlinked", sk); + release_sock(sk); + sock_put(sk); + + /* Restart the loop as sk is no longer in the list + * and also avoid a potential infinite loop because + * list_for_each_entry_safe() is not thread safe. 
+ */ + goto restart; + } + + /* sk is safely in the parent list so reduce reference count */ + sock_put(sk); + /* FIXME: Is this check still needed */ if (sk->sk_state == BT_CLOSED) { bt_accept_unlink(sk); diff --git a/net/bluetooth/amp.c b/net/bluetooth/amp.c index 02a4ccc04e1ebba5301f8d2bfec55229d428ffd8..ebcab5bbadd7eb174f8bb2aff10b4e0d5283e182 100644 --- a/net/bluetooth/amp.c +++ b/net/bluetooth/amp.c @@ -263,7 +263,7 @@ void amp_read_loc_assoc_frag(struct hci_dev *hdev, u8 phy_handle) struct hci_cp_read_local_amp_assoc cp; struct amp_assoc *loc_assoc = &hdev->loc_assoc; struct hci_request req; - int err = 0; + int err; BT_DBG("%s handle %d", hdev->name, phy_handle); @@ -282,7 +282,7 @@ void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr) { struct hci_cp_read_local_amp_assoc cp; struct hci_request req; - int err = 0; + int err; memset(&hdev->loc_assoc, 0, sizeof(struct amp_assoc)); memset(&cp, 0, sizeof(cp)); @@ -292,7 +292,7 @@ void amp_read_loc_assoc(struct hci_dev *hdev, struct amp_mgr *mgr) set_bit(READ_LOC_AMP_ASSOC, &mgr->state); hci_req_init(&req, hdev); hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); - hci_req_run_skb(&req, read_local_amp_assoc_complete); + err = hci_req_run_skb(&req, read_local_amp_assoc_complete); if (err < 0) a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID); } @@ -303,7 +303,7 @@ void amp_read_loc_assoc_final_data(struct hci_dev *hdev, struct hci_cp_read_local_amp_assoc cp; struct amp_mgr *mgr = hcon->amp_mgr; struct hci_request req; - int err = 0; + int err; cp.phy_handle = hcon->handle; cp.len_so_far = cpu_to_le16(0); @@ -314,7 +314,7 @@ void amp_read_loc_assoc_final_data(struct hci_dev *hdev, /* Read Local AMP Assoc final link information data */ hci_req_init(&req, hdev); hci_req_add(&req, HCI_OP_READ_LOCAL_AMP_ASSOC, sizeof(cp), &cp); - hci_req_run_skb(&req, read_local_amp_assoc_complete); + err = hci_req_run_skb(&req, read_local_amp_assoc_complete); if (err < 0) a2mp_send_getampassoc_rsp(hdev, A2MP_STATUS_INVALID_CTRL_ID); } diff --git a/net/bluetooth/ecc.c b/net/bluetooth/ecc.c deleted file mode 100644 index e1709f8467acacb216241ec03f845a0043464c3b..0000000000000000000000000000000000000000 --- a/net/bluetooth/ecc.c +++ /dev/null @@ -1,816 +0,0 @@ -/* - * Copyright (c) 2013, Kenneth MacKay - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include - -#include "ecc.h" - -/* 256-bit curve */ -#define ECC_BYTES 32 - -#define MAX_TRIES 16 - -/* Number of u64's needed */ -#define NUM_ECC_DIGITS (ECC_BYTES / 8) - -struct ecc_point { - u64 x[NUM_ECC_DIGITS]; - u64 y[NUM_ECC_DIGITS]; -}; - -typedef struct { - u64 m_low; - u64 m_high; -} uint128_t; - -#define CURVE_P_32 { 0xFFFFFFFFFFFFFFFFull, 0x00000000FFFFFFFFull, \ - 0x0000000000000000ull, 0xFFFFFFFF00000001ull } - -#define CURVE_G_32 { \ - { 0xF4A13945D898C296ull, 0x77037D812DEB33A0ull, \ - 0xF8BCE6E563A440F2ull, 0x6B17D1F2E12C4247ull }, \ - { 0xCBB6406837BF51F5ull, 0x2BCE33576B315ECEull, \ - 0x8EE7EB4A7C0F9E16ull, 0x4FE342E2FE1A7F9Bull } \ -} - -#define CURVE_N_32 { 0xF3B9CAC2FC632551ull, 0xBCE6FAADA7179E84ull, \ - 0xFFFFFFFFFFFFFFFFull, 0xFFFFFFFF00000000ull } - -static u64 curve_p[NUM_ECC_DIGITS] = CURVE_P_32; -static struct ecc_point curve_g = CURVE_G_32; -static u64 curve_n[NUM_ECC_DIGITS] = CURVE_N_32; - -static void vli_clear(u64 *vli) -{ - int i; - - for (i = 0; i < NUM_ECC_DIGITS; i++) - vli[i] = 0; -} - -/* Returns true if vli == 0, false otherwise. */ -static bool vli_is_zero(const u64 *vli) -{ - int i; - - for (i = 0; i < NUM_ECC_DIGITS; i++) { - if (vli[i]) - return false; - } - - return true; -} - -/* Returns nonzero if bit bit of vli is set. */ -static u64 vli_test_bit(const u64 *vli, unsigned int bit) -{ - return (vli[bit / 64] & ((u64) 1 << (bit % 64))); -} - -/* Counts the number of 64-bit "digits" in vli. */ -static unsigned int vli_num_digits(const u64 *vli) -{ - int i; - - /* Search from the end until we find a non-zero digit. - * We do it in reverse because we expect that most digits will - * be nonzero. - */ - for (i = NUM_ECC_DIGITS - 1; i >= 0 && vli[i] == 0; i--); - - return (i + 1); -} - -/* Counts the number of bits required for vli. */ -static unsigned int vli_num_bits(const u64 *vli) -{ - unsigned int i, num_digits; - u64 digit; - - num_digits = vli_num_digits(vli); - if (num_digits == 0) - return 0; - - digit = vli[num_digits - 1]; - for (i = 0; digit; i++) - digit >>= 1; - - return ((num_digits - 1) * 64 + i); -} - -/* Sets dest = src. */ -static void vli_set(u64 *dest, const u64 *src) -{ - int i; - - for (i = 0; i < NUM_ECC_DIGITS; i++) - dest[i] = src[i]; -} - -/* Returns sign of left - right. */ -static int vli_cmp(const u64 *left, const u64 *right) -{ - int i; - - for (i = NUM_ECC_DIGITS - 1; i >= 0; i--) { - if (left[i] > right[i]) - return 1; - else if (left[i] < right[i]) - return -1; - } - - return 0; -} - -/* Computes result = in << c, returning carry. Can modify in place - * (if result == in). 0 < shift < 64. - */ -static u64 vli_lshift(u64 *result, const u64 *in, - unsigned int shift) -{ - u64 carry = 0; - int i; - - for (i = 0; i < NUM_ECC_DIGITS; i++) { - u64 temp = in[i]; - - result[i] = (temp << shift) | carry; - carry = temp >> (64 - shift); - } - - return carry; -} - -/* Computes vli = vli >> 1. 
*/ -static void vli_rshift1(u64 *vli) -{ - u64 *end = vli; - u64 carry = 0; - - vli += NUM_ECC_DIGITS; - - while (vli-- > end) { - u64 temp = *vli; - *vli = (temp >> 1) | carry; - carry = temp << 63; - } -} - -/* Computes result = left + right, returning carry. Can modify in place. */ -static u64 vli_add(u64 *result, const u64 *left, - const u64 *right) -{ - u64 carry = 0; - int i; - - for (i = 0; i < NUM_ECC_DIGITS; i++) { - u64 sum; - - sum = left[i] + right[i] + carry; - if (sum != left[i]) - carry = (sum < left[i]); - - result[i] = sum; - } - - return carry; -} - -/* Computes result = left - right, returning borrow. Can modify in place. */ -static u64 vli_sub(u64 *result, const u64 *left, const u64 *right) -{ - u64 borrow = 0; - int i; - - for (i = 0; i < NUM_ECC_DIGITS; i++) { - u64 diff; - - diff = left[i] - right[i] - borrow; - if (diff != left[i]) - borrow = (diff > left[i]); - - result[i] = diff; - } - - return borrow; -} - -static uint128_t mul_64_64(u64 left, u64 right) -{ - u64 a0 = left & 0xffffffffull; - u64 a1 = left >> 32; - u64 b0 = right & 0xffffffffull; - u64 b1 = right >> 32; - u64 m0 = a0 * b0; - u64 m1 = a0 * b1; - u64 m2 = a1 * b0; - u64 m3 = a1 * b1; - uint128_t result; - - m2 += (m0 >> 32); - m2 += m1; - - /* Overflow */ - if (m2 < m1) - m3 += 0x100000000ull; - - result.m_low = (m0 & 0xffffffffull) | (m2 << 32); - result.m_high = m3 + (m2 >> 32); - - return result; -} - -static uint128_t add_128_128(uint128_t a, uint128_t b) -{ - uint128_t result; - - result.m_low = a.m_low + b.m_low; - result.m_high = a.m_high + b.m_high + (result.m_low < a.m_low); - - return result; -} - -static void vli_mult(u64 *result, const u64 *left, const u64 *right) -{ - uint128_t r01 = { 0, 0 }; - u64 r2 = 0; - unsigned int i, k; - - /* Compute each digit of result in sequence, maintaining the - * carries. - */ - for (k = 0; k < NUM_ECC_DIGITS * 2 - 1; k++) { - unsigned int min; - - if (k < NUM_ECC_DIGITS) - min = 0; - else - min = (k + 1) - NUM_ECC_DIGITS; - - for (i = min; i <= k && i < NUM_ECC_DIGITS; i++) { - uint128_t product; - - product = mul_64_64(left[i], right[k - i]); - - r01 = add_128_128(r01, product); - r2 += (r01.m_high < product.m_high); - } - - result[k] = r01.m_low; - r01.m_low = r01.m_high; - r01.m_high = r2; - r2 = 0; - } - - result[NUM_ECC_DIGITS * 2 - 1] = r01.m_low; -} - -static void vli_square(u64 *result, const u64 *left) -{ - uint128_t r01 = { 0, 0 }; - u64 r2 = 0; - int i, k; - - for (k = 0; k < NUM_ECC_DIGITS * 2 - 1; k++) { - unsigned int min; - - if (k < NUM_ECC_DIGITS) - min = 0; - else - min = (k + 1) - NUM_ECC_DIGITS; - - for (i = min; i <= k && i <= k - i; i++) { - uint128_t product; - - product = mul_64_64(left[i], left[k - i]); - - if (i < k - i) { - r2 += product.m_high >> 63; - product.m_high = (product.m_high << 1) | - (product.m_low >> 63); - product.m_low <<= 1; - } - - r01 = add_128_128(r01, product); - r2 += (r01.m_high < product.m_high); - } - - result[k] = r01.m_low; - r01.m_low = r01.m_high; - r01.m_high = r2; - r2 = 0; - } - - result[NUM_ECC_DIGITS * 2 - 1] = r01.m_low; -} - -/* Computes result = (left + right) % mod. - * Assumes that left < mod and right < mod, result != mod. - */ -static void vli_mod_add(u64 *result, const u64 *left, const u64 *right, - const u64 *mod) -{ - u64 carry; - - carry = vli_add(result, left, right); - - /* result > mod (result = mod + remainder), so subtract mod to - * get remainder. - */ - if (carry || vli_cmp(result, mod) >= 0) - vli_sub(result, result, mod); -} - -/* Computes result = (left - right) % mod. 
- * Assumes that left < mod and right < mod, result != mod. - */ -static void vli_mod_sub(u64 *result, const u64 *left, const u64 *right, - const u64 *mod) -{ - u64 borrow = vli_sub(result, left, right); - - /* In this case, p_result == -diff == (max int) - diff. - * Since -x % d == d - x, we can get the correct result from - * result + mod (with overflow). - */ - if (borrow) - vli_add(result, result, mod); -} - -/* Computes result = product % curve_p - from http://www.nsa.gov/ia/_files/nist-routines.pdf */ -static void vli_mmod_fast(u64 *result, const u64 *product) -{ - u64 tmp[NUM_ECC_DIGITS]; - int carry; - - /* t */ - vli_set(result, product); - - /* s1 */ - tmp[0] = 0; - tmp[1] = product[5] & 0xffffffff00000000ull; - tmp[2] = product[6]; - tmp[3] = product[7]; - carry = vli_lshift(tmp, tmp, 1); - carry += vli_add(result, result, tmp); - - /* s2 */ - tmp[1] = product[6] << 32; - tmp[2] = (product[6] >> 32) | (product[7] << 32); - tmp[3] = product[7] >> 32; - carry += vli_lshift(tmp, tmp, 1); - carry += vli_add(result, result, tmp); - - /* s3 */ - tmp[0] = product[4]; - tmp[1] = product[5] & 0xffffffff; - tmp[2] = 0; - tmp[3] = product[7]; - carry += vli_add(result, result, tmp); - - /* s4 */ - tmp[0] = (product[4] >> 32) | (product[5] << 32); - tmp[1] = (product[5] >> 32) | (product[6] & 0xffffffff00000000ull); - tmp[2] = product[7]; - tmp[3] = (product[6] >> 32) | (product[4] << 32); - carry += vli_add(result, result, tmp); - - /* d1 */ - tmp[0] = (product[5] >> 32) | (product[6] << 32); - tmp[1] = (product[6] >> 32); - tmp[2] = 0; - tmp[3] = (product[4] & 0xffffffff) | (product[5] << 32); - carry -= vli_sub(result, result, tmp); - - /* d2 */ - tmp[0] = product[6]; - tmp[1] = product[7]; - tmp[2] = 0; - tmp[3] = (product[4] >> 32) | (product[5] & 0xffffffff00000000ull); - carry -= vli_sub(result, result, tmp); - - /* d3 */ - tmp[0] = (product[6] >> 32) | (product[7] << 32); - tmp[1] = (product[7] >> 32) | (product[4] << 32); - tmp[2] = (product[4] >> 32) | (product[5] << 32); - tmp[3] = (product[6] << 32); - carry -= vli_sub(result, result, tmp); - - /* d4 */ - tmp[0] = product[7]; - tmp[1] = product[4] & 0xffffffff00000000ull; - tmp[2] = product[5]; - tmp[3] = product[6] & 0xffffffff00000000ull; - carry -= vli_sub(result, result, tmp); - - if (carry < 0) { - do { - carry += vli_add(result, result, curve_p); - } while (carry < 0); - } else { - while (carry || vli_cmp(curve_p, result) != 1) - carry -= vli_sub(result, result, curve_p); - } -} - -/* Computes result = (left * right) % curve_p. */ -static void vli_mod_mult_fast(u64 *result, const u64 *left, const u64 *right) -{ - u64 product[2 * NUM_ECC_DIGITS]; - - vli_mult(product, left, right); - vli_mmod_fast(result, product); -} - -/* Computes result = left^2 % curve_p. */ -static void vli_mod_square_fast(u64 *result, const u64 *left) -{ - u64 product[2 * NUM_ECC_DIGITS]; - - vli_square(product, left); - vli_mmod_fast(result, product); -} - -#define EVEN(vli) (!(vli[0] & 1)) -/* Computes result = (1 / p_input) % mod. All VLIs are the same size. 
- * See "From Euclid's GCD to Montgomery Multiplication to the Great Divide" - * https://labs.oracle.com/techrep/2001/smli_tr-2001-95.pdf - */ -static void vli_mod_inv(u64 *result, const u64 *input, const u64 *mod) -{ - u64 a[NUM_ECC_DIGITS], b[NUM_ECC_DIGITS]; - u64 u[NUM_ECC_DIGITS], v[NUM_ECC_DIGITS]; - u64 carry; - int cmp_result; - - if (vli_is_zero(input)) { - vli_clear(result); - return; - } - - vli_set(a, input); - vli_set(b, mod); - vli_clear(u); - u[0] = 1; - vli_clear(v); - - while ((cmp_result = vli_cmp(a, b)) != 0) { - carry = 0; - - if (EVEN(a)) { - vli_rshift1(a); - - if (!EVEN(u)) - carry = vli_add(u, u, mod); - - vli_rshift1(u); - if (carry) - u[NUM_ECC_DIGITS - 1] |= 0x8000000000000000ull; - } else if (EVEN(b)) { - vli_rshift1(b); - - if (!EVEN(v)) - carry = vli_add(v, v, mod); - - vli_rshift1(v); - if (carry) - v[NUM_ECC_DIGITS - 1] |= 0x8000000000000000ull; - } else if (cmp_result > 0) { - vli_sub(a, a, b); - vli_rshift1(a); - - if (vli_cmp(u, v) < 0) - vli_add(u, u, mod); - - vli_sub(u, u, v); - if (!EVEN(u)) - carry = vli_add(u, u, mod); - - vli_rshift1(u); - if (carry) - u[NUM_ECC_DIGITS - 1] |= 0x8000000000000000ull; - } else { - vli_sub(b, b, a); - vli_rshift1(b); - - if (vli_cmp(v, u) < 0) - vli_add(v, v, mod); - - vli_sub(v, v, u); - if (!EVEN(v)) - carry = vli_add(v, v, mod); - - vli_rshift1(v); - if (carry) - v[NUM_ECC_DIGITS - 1] |= 0x8000000000000000ull; - } - } - - vli_set(result, u); -} - -/* ------ Point operations ------ */ - -/* Returns true if p_point is the point at infinity, false otherwise. */ -static bool ecc_point_is_zero(const struct ecc_point *point) -{ - return (vli_is_zero(point->x) && vli_is_zero(point->y)); -} - -/* Point multiplication algorithm using Montgomery's ladder with co-Z - * coordinates. From http://eprint.iacr.org/2011/338.pdf - */ - -/* Double in place */ -static void ecc_point_double_jacobian(u64 *x1, u64 *y1, u64 *z1) -{ - /* t1 = x, t2 = y, t3 = z */ - u64 t4[NUM_ECC_DIGITS]; - u64 t5[NUM_ECC_DIGITS]; - - if (vli_is_zero(z1)) - return; - - vli_mod_square_fast(t4, y1); /* t4 = y1^2 */ - vli_mod_mult_fast(t5, x1, t4); /* t5 = x1*y1^2 = A */ - vli_mod_square_fast(t4, t4); /* t4 = y1^4 */ - vli_mod_mult_fast(y1, y1, z1); /* t2 = y1*z1 = z3 */ - vli_mod_square_fast(z1, z1); /* t3 = z1^2 */ - - vli_mod_add(x1, x1, z1, curve_p); /* t1 = x1 + z1^2 */ - vli_mod_add(z1, z1, z1, curve_p); /* t3 = 2*z1^2 */ - vli_mod_sub(z1, x1, z1, curve_p); /* t3 = x1 - z1^2 */ - vli_mod_mult_fast(x1, x1, z1); /* t1 = x1^2 - z1^4 */ - - vli_mod_add(z1, x1, x1, curve_p); /* t3 = 2*(x1^2 - z1^4) */ - vli_mod_add(x1, x1, z1, curve_p); /* t1 = 3*(x1^2 - z1^4) */ - if (vli_test_bit(x1, 0)) { - u64 carry = vli_add(x1, x1, curve_p); - vli_rshift1(x1); - x1[NUM_ECC_DIGITS - 1] |= carry << 63; - } else { - vli_rshift1(x1); - } - /* t1 = 3/2*(x1^2 - z1^4) = B */ - - vli_mod_square_fast(z1, x1); /* t3 = B^2 */ - vli_mod_sub(z1, z1, t5, curve_p); /* t3 = B^2 - A */ - vli_mod_sub(z1, z1, t5, curve_p); /* t3 = B^2 - 2A = x3 */ - vli_mod_sub(t5, t5, z1, curve_p); /* t5 = A - x3 */ - vli_mod_mult_fast(x1, x1, t5); /* t1 = B * (A - x3) */ - vli_mod_sub(t4, x1, t4, curve_p); /* t4 = B * (A - x3) - y1^4 = y3 */ - - vli_set(x1, z1); - vli_set(z1, y1); - vli_set(y1, t4); -} - -/* Modify (x1, y1) => (x1 * z^2, y1 * z^3) */ -static void apply_z(u64 *x1, u64 *y1, u64 *z) -{ - u64 t1[NUM_ECC_DIGITS]; - - vli_mod_square_fast(t1, z); /* z^2 */ - vli_mod_mult_fast(x1, x1, t1); /* x1 * z^2 */ - vli_mod_mult_fast(t1, t1, z); /* z^3 */ - vli_mod_mult_fast(y1, y1, t1); /* y1 * z^3 */ 
-} - -/* P = (x1, y1) => 2P, (x2, y2) => P' */ -static void xycz_initial_double(u64 *x1, u64 *y1, u64 *x2, u64 *y2, - u64 *p_initial_z) -{ - u64 z[NUM_ECC_DIGITS]; - - vli_set(x2, x1); - vli_set(y2, y1); - - vli_clear(z); - z[0] = 1; - - if (p_initial_z) - vli_set(z, p_initial_z); - - apply_z(x1, y1, z); - - ecc_point_double_jacobian(x1, y1, z); - - apply_z(x2, y2, z); -} - -/* Input P = (x1, y1, Z), Q = (x2, y2, Z) - * Output P' = (x1', y1', Z3), P + Q = (x3, y3, Z3) - * or P => P', Q => P + Q - */ -static void xycz_add(u64 *x1, u64 *y1, u64 *x2, u64 *y2) -{ - /* t1 = X1, t2 = Y1, t3 = X2, t4 = Y2 */ - u64 t5[NUM_ECC_DIGITS]; - - vli_mod_sub(t5, x2, x1, curve_p); /* t5 = x2 - x1 */ - vli_mod_square_fast(t5, t5); /* t5 = (x2 - x1)^2 = A */ - vli_mod_mult_fast(x1, x1, t5); /* t1 = x1*A = B */ - vli_mod_mult_fast(x2, x2, t5); /* t3 = x2*A = C */ - vli_mod_sub(y2, y2, y1, curve_p); /* t4 = y2 - y1 */ - vli_mod_square_fast(t5, y2); /* t5 = (y2 - y1)^2 = D */ - - vli_mod_sub(t5, t5, x1, curve_p); /* t5 = D - B */ - vli_mod_sub(t5, t5, x2, curve_p); /* t5 = D - B - C = x3 */ - vli_mod_sub(x2, x2, x1, curve_p); /* t3 = C - B */ - vli_mod_mult_fast(y1, y1, x2); /* t2 = y1*(C - B) */ - vli_mod_sub(x2, x1, t5, curve_p); /* t3 = B - x3 */ - vli_mod_mult_fast(y2, y2, x2); /* t4 = (y2 - y1)*(B - x3) */ - vli_mod_sub(y2, y2, y1, curve_p); /* t4 = y3 */ - - vli_set(x2, t5); -} - -/* Input P = (x1, y1, Z), Q = (x2, y2, Z) - * Output P + Q = (x3, y3, Z3), P - Q = (x3', y3', Z3) - * or P => P - Q, Q => P + Q - */ -static void xycz_add_c(u64 *x1, u64 *y1, u64 *x2, u64 *y2) -{ - /* t1 = X1, t2 = Y1, t3 = X2, t4 = Y2 */ - u64 t5[NUM_ECC_DIGITS]; - u64 t6[NUM_ECC_DIGITS]; - u64 t7[NUM_ECC_DIGITS]; - - vli_mod_sub(t5, x2, x1, curve_p); /* t5 = x2 - x1 */ - vli_mod_square_fast(t5, t5); /* t5 = (x2 - x1)^2 = A */ - vli_mod_mult_fast(x1, x1, t5); /* t1 = x1*A = B */ - vli_mod_mult_fast(x2, x2, t5); /* t3 = x2*A = C */ - vli_mod_add(t5, y2, y1, curve_p); /* t4 = y2 + y1 */ - vli_mod_sub(y2, y2, y1, curve_p); /* t4 = y2 - y1 */ - - vli_mod_sub(t6, x2, x1, curve_p); /* t6 = C - B */ - vli_mod_mult_fast(y1, y1, t6); /* t2 = y1 * (C - B) */ - vli_mod_add(t6, x1, x2, curve_p); /* t6 = B + C */ - vli_mod_square_fast(x2, y2); /* t3 = (y2 - y1)^2 */ - vli_mod_sub(x2, x2, t6, curve_p); /* t3 = x3 */ - - vli_mod_sub(t7, x1, x2, curve_p); /* t7 = B - x3 */ - vli_mod_mult_fast(y2, y2, t7); /* t4 = (y2 - y1)*(B - x3) */ - vli_mod_sub(y2, y2, y1, curve_p); /* t4 = y3 */ - - vli_mod_square_fast(t7, t5); /* t7 = (y2 + y1)^2 = F */ - vli_mod_sub(t7, t7, t6, curve_p); /* t7 = x3' */ - vli_mod_sub(t6, t7, x1, curve_p); /* t6 = x3' - B */ - vli_mod_mult_fast(t6, t6, t5); /* t6 = (y2 + y1)*(x3' - B) */ - vli_mod_sub(y1, t6, y1, curve_p); /* t2 = y3' */ - - vli_set(x1, t7); -} - -static void ecc_point_mult(struct ecc_point *result, - const struct ecc_point *point, u64 *scalar, - u64 *initial_z, int num_bits) -{ - /* R0 and R1 */ - u64 rx[2][NUM_ECC_DIGITS]; - u64 ry[2][NUM_ECC_DIGITS]; - u64 z[NUM_ECC_DIGITS]; - int i, nb; - - vli_set(rx[1], point->x); - vli_set(ry[1], point->y); - - xycz_initial_double(rx[1], ry[1], rx[0], ry[0], initial_z); - - for (i = num_bits - 2; i > 0; i--) { - nb = !vli_test_bit(scalar, i); - xycz_add_c(rx[1 - nb], ry[1 - nb], rx[nb], ry[nb]); - xycz_add(rx[nb], ry[nb], rx[1 - nb], ry[1 - nb]); - } - - nb = !vli_test_bit(scalar, 0); - xycz_add_c(rx[1 - nb], ry[1 - nb], rx[nb], ry[nb]); - - /* Find final 1/Z value. 
*/ - vli_mod_sub(z, rx[1], rx[0], curve_p); /* X1 - X0 */ - vli_mod_mult_fast(z, z, ry[1 - nb]); /* Yb * (X1 - X0) */ - vli_mod_mult_fast(z, z, point->x); /* xP * Yb * (X1 - X0) */ - vli_mod_inv(z, z, curve_p); /* 1 / (xP * Yb * (X1 - X0)) */ - vli_mod_mult_fast(z, z, point->y); /* yP / (xP * Yb * (X1 - X0)) */ - vli_mod_mult_fast(z, z, rx[1 - nb]); /* Xb * yP / (xP * Yb * (X1 - X0)) */ - /* End 1/Z calculation */ - - xycz_add(rx[nb], ry[nb], rx[1 - nb], ry[1 - nb]); - - apply_z(rx[0], ry[0], z); - - vli_set(result->x, rx[0]); - vli_set(result->y, ry[0]); -} - -static void ecc_bytes2native(const u8 bytes[ECC_BYTES], - u64 native[NUM_ECC_DIGITS]) -{ - int i; - - for (i = 0; i < NUM_ECC_DIGITS; i++) { - const u8 *digit = bytes + 8 * (NUM_ECC_DIGITS - 1 - i); - - native[NUM_ECC_DIGITS - 1 - i] = - ((u64) digit[0] << 0) | - ((u64) digit[1] << 8) | - ((u64) digit[2] << 16) | - ((u64) digit[3] << 24) | - ((u64) digit[4] << 32) | - ((u64) digit[5] << 40) | - ((u64) digit[6] << 48) | - ((u64) digit[7] << 56); - } -} - -static void ecc_native2bytes(const u64 native[NUM_ECC_DIGITS], - u8 bytes[ECC_BYTES]) -{ - int i; - - for (i = 0; i < NUM_ECC_DIGITS; i++) { - u8 *digit = bytes + 8 * (NUM_ECC_DIGITS - 1 - i); - - digit[0] = native[NUM_ECC_DIGITS - 1 - i] >> 0; - digit[1] = native[NUM_ECC_DIGITS - 1 - i] >> 8; - digit[2] = native[NUM_ECC_DIGITS - 1 - i] >> 16; - digit[3] = native[NUM_ECC_DIGITS - 1 - i] >> 24; - digit[4] = native[NUM_ECC_DIGITS - 1 - i] >> 32; - digit[5] = native[NUM_ECC_DIGITS - 1 - i] >> 40; - digit[6] = native[NUM_ECC_DIGITS - 1 - i] >> 48; - digit[7] = native[NUM_ECC_DIGITS - 1 - i] >> 56; - } -} - -bool ecc_make_key(u8 public_key[64], u8 private_key[32]) -{ - struct ecc_point pk; - u64 priv[NUM_ECC_DIGITS]; - unsigned int tries = 0; - - do { - if (tries++ >= MAX_TRIES) - return false; - - get_random_bytes(priv, ECC_BYTES); - - if (vli_is_zero(priv)) - continue; - - /* Make sure the private key is in the range [1, n-1]. */ - if (vli_cmp(curve_n, priv) != 1) - continue; - - ecc_point_mult(&pk, &curve_g, priv, NULL, vli_num_bits(priv)); - } while (ecc_point_is_zero(&pk)); - - ecc_native2bytes(priv, private_key); - ecc_native2bytes(pk.x, public_key); - ecc_native2bytes(pk.y, &public_key[32]); - - return true; -} - -bool ecdh_shared_secret(const u8 public_key[64], const u8 private_key[32], - u8 secret[32]) -{ - u64 priv[NUM_ECC_DIGITS]; - u64 rand[NUM_ECC_DIGITS]; - struct ecc_point product, pk; - - get_random_bytes(rand, ECC_BYTES); - - ecc_bytes2native(public_key, pk.x); - ecc_bytes2native(&public_key[32], pk.y); - ecc_bytes2native(private_key, priv); - - ecc_point_mult(&product, &pk, priv, rand, vli_num_bits(priv)); - - ecc_native2bytes(product.x, secret); - - return !ecc_point_is_zero(&product); -} diff --git a/net/bluetooth/ecc.h b/net/bluetooth/ecc.h deleted file mode 100644 index 8d6a2f4d1905f33df35e1d2380f7e49cfe0f6c5f..0000000000000000000000000000000000000000 --- a/net/bluetooth/ecc.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2013, Kenneth MacKay - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* Create a public/private key pair. - * Outputs: - * public_key - Will be filled in with the public key. - * private_key - Will be filled in with the private key. - * - * Returns true if the key pair was generated successfully, false - * if an error occurred. The keys are with the LSB first. - */ -bool ecc_make_key(u8 public_key[64], u8 private_key[32]); - -/* Compute a shared secret given your secret key and someone else's - * public key. - * Note: It is recommended that you hash the result of ecdh_shared_secret - * before using it for symmetric encryption or HMAC. - * - * Inputs: - * public_key - The public key of the remote party - * private_key - Your private key. - * - * Outputs: - * secret - Will be filled in with the shared secret value. - * - * Returns true if the shared secret was generated successfully, false - * if an error occurred. Both input and output parameters are with the - * LSB first. - */ -bool ecdh_shared_secret(const u8 public_key[64], const u8 private_key[32], - u8 secret[32]); diff --git a/net/bluetooth/ecdh_helper.c b/net/bluetooth/ecdh_helper.c new file mode 100644 index 0000000000000000000000000000000000000000..24d4e60f8c48b8c02532558f7a90bf273f586274 --- /dev/null +++ b/net/bluetooth/ecdh_helper.c @@ -0,0 +1,231 @@ +/* + * ECDH helper functions - KPP wrappings + * + * Copyright (C) 2017 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation; + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + * CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + * + * ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + * COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + * SOFTWARE IS DISCLAIMED. 
+ */ +#include "ecdh_helper.h" + +#include +#include +#include + +struct ecdh_completion { + struct completion completion; + int err; +}; + +static void ecdh_complete(struct crypto_async_request *req, int err) +{ + struct ecdh_completion *res = req->data; + + if (err == -EINPROGRESS) + return; + + res->err = err; + complete(&res->completion); +} + +static inline void swap_digits(u64 *in, u64 *out, unsigned int ndigits) +{ + int i; + + for (i = 0; i < ndigits; i++) + out[i] = __swab64(in[ndigits - 1 - i]); +} + +bool compute_ecdh_secret(const u8 public_key[64], const u8 private_key[32], + u8 secret[32]) +{ + struct crypto_kpp *tfm; + struct kpp_request *req; + struct ecdh p; + struct ecdh_completion result; + struct scatterlist src, dst; + u8 *tmp, *buf; + unsigned int buf_len; + int err = -ENOMEM; + + tmp = kmalloc(64, GFP_KERNEL); + if (!tmp) + return false; + + tfm = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + if (IS_ERR(tfm)) { + pr_err("alg: kpp: Failed to load tfm for kpp: %ld\n", + PTR_ERR(tfm)); + goto free_tmp; + } + + req = kpp_request_alloc(tfm, GFP_KERNEL); + if (!req) + goto free_kpp; + + init_completion(&result.completion); + + /* Security Manager Protocol holds digits in little-endian order + * while the ECC API expects big-endian data + */ + swap_digits((u64 *)private_key, (u64 *)tmp, 4); + p.key = (char *)tmp; + p.key_size = 32; + /* Set curve_id */ + p.curve_id = ECC_CURVE_NIST_P256; + buf_len = crypto_ecdh_key_len(&p); + buf = kmalloc(buf_len, GFP_KERNEL); + if (!buf) { + pr_err("alg: kpp: Failed to allocate %d bytes for buf\n", + buf_len); + goto free_req; + } + crypto_ecdh_encode_key(buf, buf_len, &p); + + /* Set A private Key */ + err = crypto_kpp_set_secret(tfm, (void *)buf, buf_len); + if (err) + goto free_all; + + swap_digits((u64 *)public_key, (u64 *)tmp, 4); /* x */ + swap_digits((u64 *)&public_key[32], (u64 *)&tmp[32], 4); /* y */ + + sg_init_one(&src, tmp, 64); + sg_init_one(&dst, secret, 32); + kpp_request_set_input(req, &src, 64); + kpp_request_set_output(req, &dst, 32); + kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + ecdh_complete, &result); + err = crypto_kpp_compute_shared_secret(req); + if (err == -EINPROGRESS) { + wait_for_completion(&result.completion); + err = result.err; + } + if (err < 0) { + pr_err("alg: ecdh: compute shared secret failed. 
err %d\n", + err); + goto free_all; + } + + swap_digits((u64 *)secret, (u64 *)tmp, 4); + memcpy(secret, tmp, 32); + +free_all: + kzfree(buf); +free_req: + kpp_request_free(req); +free_kpp: + crypto_free_kpp(tfm); +free_tmp: + kfree(tmp); + return (err == 0); +} + +bool generate_ecdh_keys(u8 public_key[64], u8 private_key[32]) +{ + struct crypto_kpp *tfm; + struct kpp_request *req; + struct ecdh p; + struct ecdh_completion result; + struct scatterlist dst; + u8 *tmp, *buf; + unsigned int buf_len; + int err = -ENOMEM; + const unsigned short max_tries = 16; + unsigned short tries = 0; + + tmp = kmalloc(64, GFP_KERNEL); + if (!tmp) + return false; + + tfm = crypto_alloc_kpp("ecdh", CRYPTO_ALG_INTERNAL, 0); + if (IS_ERR(tfm)) { + pr_err("alg: kpp: Failed to load tfm for kpp: %ld\n", + PTR_ERR(tfm)); + goto free_tmp; + } + + req = kpp_request_alloc(tfm, GFP_KERNEL); + if (!req) + goto free_kpp; + + init_completion(&result.completion); + + /* Set curve_id */ + p.curve_id = ECC_CURVE_NIST_P256; + p.key_size = 32; + buf_len = crypto_ecdh_key_len(&p); + buf = kmalloc(buf_len, GFP_KERNEL); + if (!buf) { + pr_err("alg: kpp: Failed to allocate %d bytes for buf\n", + buf_len); + goto free_req; + } + + do { + if (tries++ >= max_tries) + goto free_all; + + /* Set private Key */ + p.key = (char *)private_key; + crypto_ecdh_encode_key(buf, buf_len, &p); + err = crypto_kpp_set_secret(tfm, buf, buf_len); + if (err) + goto free_all; + + sg_init_one(&dst, tmp, 64); + kpp_request_set_input(req, NULL, 0); + kpp_request_set_output(req, &dst, 64); + kpp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + ecdh_complete, &result); + + err = crypto_kpp_generate_public_key(req); + + if (err == -EINPROGRESS) { + wait_for_completion(&result.completion); + err = result.err; + } + + /* Private key is not valid. Regenerate */ + if (err == -EINVAL) + continue; + + if (err < 0) + goto free_all; + else + break; + + } while (true); + + /* Keys are handed back in little endian as expected by Security + * Manager Protocol + */ + swap_digits((u64 *)tmp, (u64 *)public_key, 4); /* x */ + swap_digits((u64 *)&tmp[32], (u64 *)&public_key[32], 4); /* y */ + swap_digits((u64 *)private_key, (u64 *)tmp, 4); + memcpy(private_key, tmp, 32); + +free_all: + kzfree(buf); +free_req: + kpp_request_free(req); +free_kpp: + crypto_free_kpp(tfm); +free_tmp: + kfree(tmp); + return (err == 0); +} diff --git a/net/bluetooth/ecdh_helper.h b/net/bluetooth/ecdh_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..7a423faf76e5657c6a4c860031138f59eef0dcc0 --- /dev/null +++ b/net/bluetooth/ecdh_helper.h @@ -0,0 +1,27 @@ +/* + * ECDH helper functions - KPP wrappings + * + * Copyright (C) 2017 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation; + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY + * CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ * + * ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, + * COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS + * SOFTWARE IS DISCLAIMED. + */ +#include + +bool compute_ecdh_secret(const u8 pub_a[64], const u8 priv_b[32], + u8 secret[32]); +bool generate_ecdh_keys(u8 public_key[64], u8 private_key[32]); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 3ac89e9ace71556c66804b19e8dde8d397eb308f..05686776a5fb78d67dd38b22632ccfd1355b857e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2950,8 +2950,8 @@ struct hci_dev *hci_alloc_dev(void) hdev->le_adv_max_interval = 0x0800; hdev->le_scan_interval = 0x0060; hdev->le_scan_window = 0x0030; - hdev->le_conn_min_interval = 0x0028; - hdev->le_conn_max_interval = 0x0038; + hdev->le_conn_min_interval = 0x0018; + hdev->le_conn_max_interval = 0x0028; hdev->le_conn_latency = 0x0000; hdev->le_supv_timeout = 0x002a; hdev->le_def_tx_len = 0x001b; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index f64d6566021fccab9208518a1c53934b2da7f708..638bf0e1a2e390133fc7bcfa2041fc41a94d413f 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -1680,7 +1680,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; - if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_NOSIGNAL|MSG_ERRQUEUE)) + if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_NOSIGNAL|MSG_ERRQUEUE| + MSG_CMSG_COMPAT)) return -EINVAL; if (len < 4 || len > HCI_MAX_FRAME_SIZE) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index fc7f321a382369f0d5097ce785921b1298a237f0..f88ac99528ce448d8e0f8ea5eb06ee2d7719afab 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -2425,6 +2425,22 @@ static int l2cap_segment_le_sdu(struct l2cap_chan *chan, return 0; } +static void l2cap_le_flowctl_send(struct l2cap_chan *chan) +{ + int sent = 0; + + BT_DBG("chan %p", chan); + + while (chan->tx_credits && !skb_queue_empty(&chan->tx_q)) { + l2cap_do_send(chan, skb_dequeue(&chan->tx_q)); + chan->tx_credits--; + sent++; + } + + BT_DBG("Sent %d credits %u queued %u", sent, chan->tx_credits, + skb_queue_len(&chan->tx_q)); +} + int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) { struct sk_buff *skb; @@ -2458,9 +2474,6 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) if (len > chan->omtu) return -EMSGSIZE; - if (!chan->tx_credits) - return -EAGAIN; - __skb_queue_head_init(&seg_queue); err = l2cap_segment_le_sdu(chan, &seg_queue, msg, len); @@ -2475,10 +2488,7 @@ int l2cap_chan_send(struct l2cap_chan *chan, struct msghdr *msg, size_t len) skb_queue_splice_tail_init(&seg_queue, &chan->tx_q); - while (chan->tx_credits && !skb_queue_empty(&chan->tx_q)) { - l2cap_do_send(chan, skb_dequeue(&chan->tx_q)); - chan->tx_credits--; - } + l2cap_le_flowctl_send(chan); if (!chan->tx_credits) chan->ops->suspend(chan); @@ -5570,10 +5580,8 @@ static inline int l2cap_le_credits(struct l2cap_conn *conn, chan->tx_credits += credits; - while (chan->tx_credits && !skb_queue_empty(&chan->tx_q)) { - l2cap_do_send(chan, skb_dequeue(&chan->tx_q)); - chan->tx_credits--; - } + /* Resume sending */ + l2cap_le_flowctl_send(chan); if (chan->tx_credits) chan->ops->resume(chan); diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index f7eb02f09b54055c9b8c17b6fdcf53ec5041aa11..8ebca9033d60621ba686656242864dcf6a721dd7 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c 
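[Editor's aside] The rfcomm/core.c hunk below converts the DLC reference counter from atomic_t to refcount_t. The gain is that refcount_t saturates instead of wrapping and warns on misuse such as incrementing from zero, so a refcounting bug becomes a loud diagnostic rather than a silent use-after-free. A minimal sketch of the get/put discipline the type enforces; the struct and function names here are hypothetical and not taken from the patch:

#include <linux/refcount.h>
#include <linux/slab.h>

struct obj {
	refcount_t refcnt;
	/* payload */
};

static void obj_get(struct obj *o)
{
	/* WARNs and saturates rather than wrapping past zero */
	refcount_inc(&o->refcnt);
}

static void obj_put(struct obj *o)
{
	/* free exactly once, when the last reference is dropped */
	if (refcount_dec_and_test(&o->refcnt))
		kfree(o);
}

The hunk itself only needs refcount_set() at allocation time and refcount_read() for the debug print; the surrounding get/put paths keep their existing structure.
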
@@ -311,7 +311,7 @@ struct rfcomm_dlc *rfcomm_dlc_alloc(gfp_t prio) skb_queue_head_init(&d->tx_queue); mutex_init(&d->lock); - atomic_set(&d->refcnt, 1); + refcount_set(&d->refcnt, 1); rfcomm_dlc_clear_state(d); @@ -342,7 +342,7 @@ static void rfcomm_dlc_unlink(struct rfcomm_dlc *d) { struct rfcomm_session *s = d->session; - BT_DBG("dlc %p refcnt %d session %p", d, atomic_read(&d->refcnt), s); + BT_DBG("dlc %p refcnt %d session %p", d, refcount_read(&d->refcnt), s); list_del(&d->list); d->session = NULL; diff --git a/net/bluetooth/selftest.c b/net/bluetooth/selftest.c index dc688f13e49612cd74decf8853b8c270803942f3..ee92c925ecc5d3ac42bc041f60924da80672e1b8 100644 --- a/net/bluetooth/selftest.c +++ b/net/bluetooth/selftest.c @@ -26,7 +26,7 @@ #include #include -#include "ecc.h" +#include "ecdh_helper.h" #include "smp.h" #include "selftest.h" @@ -142,18 +142,30 @@ static int __init test_ecdh_sample(const u8 priv_a[32], const u8 priv_b[32], const u8 pub_a[64], const u8 pub_b[64], const u8 dhkey[32]) { - u8 dhkey_a[32], dhkey_b[32]; + u8 *tmp, *dhkey_a, *dhkey_b; + int ret = 0; - ecdh_shared_secret(pub_b, priv_a, dhkey_a); - ecdh_shared_secret(pub_a, priv_b, dhkey_b); - - if (memcmp(dhkey_a, dhkey, 32)) + tmp = kmalloc(64, GFP_KERNEL); + if (!tmp) return -EINVAL; + dhkey_a = &tmp[0]; + dhkey_b = &tmp[32]; + + compute_ecdh_secret(pub_b, priv_a, dhkey_a); + compute_ecdh_secret(pub_a, priv_b, dhkey_b); + + if (memcmp(dhkey_a, dhkey, 32)) { + ret = -EINVAL; + goto out; + } + if (memcmp(dhkey_b, dhkey, 32)) - return -EINVAL; + ret = -EINVAL; - return 0; +out: + kfree(dhkey_a); + return ret; } static char test_ecdh_buffer[32]; diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c index fae391f1871f138c802722d8d8d86d817a97947e..14585edc94397c6f42e5f5fcee2f7f471019819e 100644 --- a/net/bluetooth/smp.c +++ b/net/bluetooth/smp.c @@ -31,7 +31,7 @@ #include #include -#include "ecc.h" +#include "ecdh_helper.h" #include "smp.h" #define SMP_DEV(hdev) \ @@ -569,8 +569,11 @@ int smp_generate_oob(struct hci_dev *hdev, u8 hash[16], u8 rand[16]) smp->debug_key = true; } else { while (true) { + /* Seed private key with random number */ + get_random_bytes(smp->local_sk, 32); + /* Generate local key pair for Secure Connections */ - if (!ecc_make_key(smp->local_pk, smp->local_sk)) + if (!generate_ecdh_keys(smp->local_pk, smp->local_sk)) return -EIO; /* This is unlikely, but we need to check that @@ -1895,8 +1898,11 @@ static u8 sc_send_public_key(struct smp_chan *smp) set_bit(SMP_FLAG_DEBUG_KEY, &smp->flags); } else { while (true) { + /* Seed private key with random number */ + get_random_bytes(smp->local_sk, 32); + /* Generate local key pair for Secure Connections */ - if (!ecc_make_key(smp->local_pk, smp->local_sk)) + if (!generate_ecdh_keys(smp->local_pk, smp->local_sk)) return SMP_UNSPECIFIED; /* This is unlikely, but we need to check that @@ -2670,7 +2676,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb) SMP_DBG("Remote Public Key X: %32phN", smp->remote_pk); SMP_DBG("Remote Public Key Y: %32phN", smp->remote_pk + 32); - if (!ecdh_shared_secret(smp->remote_pk, smp->local_sk, smp->dhkey)) + if (!compute_ecdh_secret(smp->remote_pk, smp->local_sk, smp->dhkey)) return SMP_UNSPECIFIED; SMP_DBG("DHKey %32phN", smp->dhkey); @@ -3483,6 +3489,32 @@ void smp_unregister(struct hci_dev *hdev) #if IS_ENABLED(CONFIG_BT_SELFTEST_SMP) +static inline void swap_digits(u64 *in, u64 *out, unsigned int ndigits) +{ + int i; + + for (i = 0; i < ndigits; i++) + out[i] = __swab64(in[ndigits - 1 - i]); +} + 
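[Editor's aside] The smp.c selftest added below runs a byte-swapped copy of the well-known SMP debug private key through generate_ecdh_keys() and verifies that both halves of the known debug key pair come back. The swap is needed because the Security Manager Protocol stores 256-bit values as four little-endian u64 digits, least-significant digit first, while the kernel ECC code wants the most-significant digit first; swap_digits() above therefore reverses the digit order and byte-swaps each digit, so applying it twice is the identity. A hypothetical round-trip check illustrating that property (illustration only, assuming the surrounding smp.c context; this function is not part of the patch):

static int __init swap_digits_selfcheck(void)
{
	u64 le[4] = { 0x0807060504030201ull, 0x100f0e0d0c0b0a09ull,
		      0x1817161514131211ull, 0x201f1e1d1c1b1a19ull };
	u64 be[4], back[4];

	swap_digits(le, be, 4);   /* reverse digit order, __swab64() each */
	swap_digits(be, back, 4); /* the second pass restores the input */

	return memcmp(le, back, sizeof(le)) ? -EINVAL : 0;
}
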
+static int __init test_debug_key(void) +{ + u8 pk[64], sk[32]; + + swap_digits((u64 *)debug_sk, (u64 *)sk, 4); + + if (!generate_ecdh_keys(pk, sk)) + return -EINVAL; + + if (memcmp(sk, debug_sk, 32)) + return -EINVAL; + + if (memcmp(pk, debug_pk, 64)) + return -EINVAL; + + return 0; +} + static int __init test_ah(struct crypto_cipher *tfm_aes) { const u8 irk[16] = { @@ -3738,6 +3770,12 @@ static int __init run_selftests(struct crypto_cipher *tfm_aes, calltime = ktime_get(); + err = test_debug_key(); + if (err) { + BT_ERR("debug_key test failed"); + goto done; + } + err = test_ah(tfm_aes); if (err) { BT_ERR("smp_ah test failed"); diff --git a/net/bpf/Makefile b/net/bpf/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..27b2992a06925240a8118e8182e8c7f048a73bbf --- /dev/null +++ b/net/bpf/Makefile @@ -0,0 +1 @@ +obj-y := test_run.o diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c new file mode 100644 index 0000000000000000000000000000000000000000..6be41a44d688a44970981fc9c2f1aa35666807fa --- /dev/null +++ b/net/bpf/test_run.c @@ -0,0 +1,173 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include +#include +#include +#include +#include +#include + +static __always_inline u32 bpf_test_run_one(struct bpf_prog *prog, void *ctx) +{ + u32 ret; + + preempt_disable(); + rcu_read_lock(); + ret = BPF_PROG_RUN(prog, ctx); + rcu_read_unlock(); + preempt_enable(); + + return ret; +} + +static u32 bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat, u32 *time) +{ + u64 time_start, time_spent = 0; + u32 ret = 0, i; + + if (!repeat) + repeat = 1; + time_start = ktime_get_ns(); + for (i = 0; i < repeat; i++) { + ret = bpf_test_run_one(prog, ctx); + if (need_resched()) { + if (signal_pending(current)) + break; + time_spent += ktime_get_ns() - time_start; + cond_resched(); + time_start = ktime_get_ns(); + } + } + time_spent += ktime_get_ns() - time_start; + do_div(time_spent, repeat); + *time = time_spent > U32_MAX ? 
U32_MAX : (u32)time_spent; + + return ret; +} + +static int bpf_test_finish(const union bpf_attr *kattr, + union bpf_attr __user *uattr, const void *data, + u32 size, u32 retval, u32 duration) +{ + void __user *data_out = u64_to_user_ptr(kattr->test.data_out); + int err = -EFAULT; + + if (data_out && copy_to_user(data_out, data, size)) + goto out; + if (copy_to_user(&uattr->test.data_size_out, &size, sizeof(size))) + goto out; + if (copy_to_user(&uattr->test.retval, &retval, sizeof(retval))) + goto out; + if (copy_to_user(&uattr->test.duration, &duration, sizeof(duration))) + goto out; + err = 0; +out: + return err; +} + +static void *bpf_test_init(const union bpf_attr *kattr, u32 size, + u32 headroom, u32 tailroom) +{ + void __user *data_in = u64_to_user_ptr(kattr->test.data_in); + void *data; + + if (size < ETH_HLEN || size > PAGE_SIZE - headroom - tailroom) + return ERR_PTR(-EINVAL); + + data = kzalloc(size + headroom + tailroom, GFP_USER); + if (!data) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(data + headroom, data_in, size)) { + kfree(data); + return ERR_PTR(-EFAULT); + } + return data; +} + +int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + bool is_l2 = false, is_direct_pkt_access = false; + u32 size = kattr->test.data_size_in; + u32 repeat = kattr->test.repeat; + u32 retval, duration; + struct sk_buff *skb; + void *data; + int ret; + + data = bpf_test_init(kattr, size, NET_SKB_PAD + NET_IP_ALIGN, + SKB_DATA_ALIGN(sizeof(struct skb_shared_info))); + if (IS_ERR(data)) + return PTR_ERR(data); + + switch (prog->type) { + case BPF_PROG_TYPE_SCHED_CLS: + case BPF_PROG_TYPE_SCHED_ACT: + is_l2 = true; + /* fall through */ + case BPF_PROG_TYPE_LWT_IN: + case BPF_PROG_TYPE_LWT_OUT: + case BPF_PROG_TYPE_LWT_XMIT: + is_direct_pkt_access = true; + break; + default: + break; + } + + skb = build_skb(data, 0); + if (!skb) { + kfree(data); + return -ENOMEM; + } + + skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); + __skb_put(skb, size); + skb->protocol = eth_type_trans(skb, current->nsproxy->net_ns->loopback_dev); + skb_reset_network_header(skb); + + if (is_l2) + __skb_push(skb, ETH_HLEN); + if (is_direct_pkt_access) + bpf_compute_data_end(skb); + retval = bpf_test_run(prog, skb, repeat, &duration); + if (!is_l2) + __skb_push(skb, ETH_HLEN); + size = skb->len; + /* bpf program can never convert linear skb to non-linear */ + if (WARN_ON_ONCE(skb_is_nonlinear(skb))) + size = skb_headlen(skb); + ret = bpf_test_finish(kattr, uattr, skb->data, size, retval, duration); + kfree_skb(skb); + return ret; +} + +int bpf_prog_test_run_xdp(struct bpf_prog *prog, const union bpf_attr *kattr, + union bpf_attr __user *uattr) +{ + u32 size = kattr->test.data_size_in; + u32 repeat = kattr->test.repeat; + struct xdp_buff xdp = {}; + u32 retval, duration; + void *data; + int ret; + + data = bpf_test_init(kattr, size, XDP_PACKET_HEADROOM + NET_IP_ALIGN, 0); + if (IS_ERR(data)) + return PTR_ERR(data); + + xdp.data_hard_start = data; + xdp.data = data + XDP_PACKET_HEADROOM + NET_IP_ALIGN; + xdp.data_end = xdp.data + size; + + retval = bpf_test_run(prog, &xdp, repeat, &duration); + if (xdp.data != data + XDP_PACKET_HEADROOM + NET_IP_ALIGN) + size = xdp.data_end - xdp.data; + ret = bpf_test_finish(kattr, uattr, xdp.data, size, retval, duration); + kfree(data); + return ret; +} diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 90f49a194249ff5fecb3d81a05837ea8b75cc586..430b53e7d941def09220a1c97a2e82d288304595 100644 --- 
a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -123,6 +123,7 @@ static void br_dev_uninit(struct net_device *dev)
 {
 	struct net_bridge *br = netdev_priv(dev);
 
+	br_multicast_dev_del(br);
 	br_multicast_uninit_stats(br);
 	br_vlan_flush(br);
 	free_percpu(br->stats);
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index 6e08b7199dd7442acdbd4f85e5ef6315b121ca06..ab0c7cc8448f4824d69b9260e79ede7aac14dd9e 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -589,6 +589,9 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 			if (unlikely(source != fdb->dst)) {
 				fdb->dst = source;
 				fdb_modified = true;
+				/* Take over HW learned entry */
+				if (unlikely(fdb->added_by_external_learn))
+					fdb->added_by_external_learn = 0;
 			}
 			if (now != fdb->updated)
 				fdb->updated = now;
@@ -854,6 +857,8 @@ static int __br_fdb_add(struct ndmsg *ndm, struct net_bridge *br,
 		br_fdb_update(br, p, addr, vid, true);
 		rcu_read_unlock();
 		local_bh_enable();
+	} else if (ndm->ndm_flags & NTF_EXT_LEARNED) {
+		err = br_fdb_external_learn_add(br, p, addr, vid);
 	} else {
 		spin_lock_bh(&br->hash_lock);
 		err = fdb_add_entry(br, p, addr, ndm->ndm_state,
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index 902af6ba481c999f81ed2fba488d91665e03c02e..48fb17417fac3397e74cb712388076ed1ee87865 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -183,13 +183,23 @@ void br_flood(struct net_bridge *br, struct sk_buff *skb,
 	struct net_bridge_port *p;
 
 	list_for_each_entry_rcu(p, &br->port_list, list) {
-		/* Do not flood unicast traffic to ports that turn it off */
-		if (pkt_type == BR_PKT_UNICAST && !(p->flags & BR_FLOOD))
-			continue;
-		/* Do not flood if mc off, except for traffic we originate */
-		if (pkt_type == BR_PKT_MULTICAST &&
-		    !(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
-			continue;
+		/* Do not flood unicast traffic to ports that turn it off, nor
+		 * other traffic if flood off, except for traffic we originate
+		 */
+		switch (pkt_type) {
+		case BR_PKT_UNICAST:
+			if (!(p->flags & BR_FLOOD))
+				continue;
+			break;
+		case BR_PKT_MULTICAST:
+			if (!(p->flags & BR_MCAST_FLOOD) && skb->dev != br->dev)
+				continue;
+			break;
+		case BR_PKT_BROADCAST:
+			if (!(p->flags & BR_BCAST_FLOOD) && skb->dev != br->dev)
+				continue;
+			break;
+		}
 
 		/* Do not flood to ports that enable proxy ARP */
 		if (p->flags & BR_PROXYARP)
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index 56a2a72e7738c9869e27e77ca13140aed4bd530a..7f8d05cf90656e43211d9682657b788f99736722 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -22,6 +22,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/if_ether.h>
 #include <linux/slab.h>
+#include <net/dsa.h>
 #include <net/sock.h>
 #include <linux/if_vlan.h>
 #include <net/switchdev.h>
@@ -311,7 +312,6 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
 
 	br_fdb_delete_by_port(br, NULL, 0, 1);
 
-	br_multicast_dev_del(br);
 	cancel_delayed_work_sync(&br->gc_work);
 
 	br_sysfs_delbr(br->dev);
@@ -361,7 +361,7 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br,
 	p->path_cost = port_cost(dev);
 	p->priority = 0x8000 >> BR_PORT_BITS;
 	p->port_no = index;
-	p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD;
+	p->flags = BR_LEARNING | BR_FLOOD | BR_MCAST_FLOOD | BR_BCAST_FLOOD;
 	br_init_port(p);
 	br_set_state(p, BR_STATE_DISABLED);
 	br_stp_port_timer_init(p);
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index 056e6ac49d8fc7727fce8fc8320b8db4ac5351d2..b0845480a3ae7f65f347f7ad9399bc6a5d21d806 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -464,7 +464,8 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh,
 	struct net_device *dev;
int err; - err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, NULL); + err = nlmsg_parse(nlh, sizeof(*bpm), tb, MDBA_SET_ENTRY_MAX, NULL, + NULL); if (err < 0) return err; @@ -568,7 +569,8 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br, return ret; } -static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) +static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; @@ -662,7 +664,8 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry) return err; } -static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) +static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_bridge_vlan_group *vg; diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 1f1e62095464f99eaca8de49a772289f057bd943..067cf03134492a33f10982755105e103666cd1ef 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -997,13 +997,10 @@ int br_nf_hook_thresh(unsigned int hook, struct net *net, if (!elem) return okfn(net, sk, skb); - /* We may already have this, but read-locks nest anyway */ - rcu_read_lock(); nf_hook_state_init(&state, hook, NFPROTO_BRIDGE, indev, outdev, sk, net, okfn); ret = nf_hook_slow(skb, &state, elem); - rcu_read_unlock(); if (ret == 1) ret = okfn(net, sk, skb); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index 225ef7d5370166baff69080996cf64ff68b055f6..c5ce7745b230fa3e67b66ba211be00d66cdc52da 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -133,6 +133,8 @@ static inline size_t br_port_info_size(void) + nla_total_size(1) /* IFLA_BRPORT_MCAST_TO_UCAST */ + nla_total_size(1) /* IFLA_BRPORT_LEARNING */ + nla_total_size(1) /* IFLA_BRPORT_UNICAST_FLOOD */ + + nla_total_size(1) /* IFLA_BRPORT_MCAST_FLOOD */ + + nla_total_size(1) /* IFLA_BRPORT_BCAST_FLOOD */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP */ + nla_total_size(1) /* IFLA_BRPORT_PROXYARP_WIFI */ + nla_total_size(1) /* IFLA_BRPORT_VLAN_TUNNEL */ @@ -189,6 +191,8 @@ static int br_port_fill_attrs(struct sk_buff *skb, !!(p->flags & BR_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_MCAST_FLOOD, !!(p->flags & BR_MCAST_FLOOD)) || + nla_put_u8(skb, IFLA_BRPORT_BCAST_FLOOD, + !!(p->flags & BR_BCAST_FLOOD)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP, !!(p->flags & BR_PROXYARP)) || nla_put_u8(skb, IFLA_BRPORT_PROXYARP_WIFI, !!(p->flags & BR_PROXYARP_WIFI)) || @@ -631,6 +635,8 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = { [IFLA_BRPORT_PROXYARP_WIFI] = { .type = NLA_U8 }, [IFLA_BRPORT_MULTICAST_ROUTER] = { .type = NLA_U8 }, [IFLA_BRPORT_MCAST_TO_UCAST] = { .type = NLA_U8 }, + [IFLA_BRPORT_MCAST_FLOOD] = { .type = NLA_U8 }, + [IFLA_BRPORT_BCAST_FLOOD] = { .type = NLA_U8 }, }; /* Change the state of the port and notify spanning tree */ @@ -683,6 +689,7 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[]) br_set_port_flag(p, tb, IFLA_BRPORT_UNICAST_FLOOD, BR_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_FLOOD, BR_MCAST_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_MCAST_TO_UCAST, BR_MULTICAST_TO_UNICAST); + br_set_port_flag(p, tb, IFLA_BRPORT_BCAST_FLOOD, BR_BCAST_FLOOD); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP, BR_PROXYARP); br_set_port_flag(p, tb, IFLA_BRPORT_PROXYARP_WIFI, BR_PROXYARP_WIFI); @@ -748,8 +755,8 @@ int br_setlink(struct net_device *dev, struct 
nlmsghdr *nlh, u16 flags) if (p && protinfo) { if (protinfo->nla_type & NLA_F_NESTED) { - err = nla_parse_nested(tb, IFLA_BRPORT_MAX, - protinfo, br_port_policy); + err = nla_parse_nested(tb, IFLA_BRPORT_MAX, protinfo, + br_port_policy, NULL); if (err) return err; diff --git a/net/bridge/br_netlink_tunnel.c b/net/bridge/br_netlink_tunnel.c index c913491495ab21ade17dec3512bd1d0a829b8e67..3712c7f0e00cd1addd73cdfe4504b687e425d0ec 100644 --- a/net/bridge/br_netlink_tunnel.c +++ b/net/bridge/br_netlink_tunnel.c @@ -227,8 +227,8 @@ int br_parse_vlan_tunnel_info(struct nlattr *attr, memset(tinfo, 0, sizeof(*tinfo)); - err = nla_parse_nested(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX, - attr, vlan_tunnel_policy); + err = nla_parse_nested(tb, IFLA_BRIDGE_VLAN_TUNNEL_MAX, attr, + vlan_tunnel_policy, NULL); if (err < 0) return err; diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c index 79aee759aba5906692d0c715c851cd3dca96801b..5d5d413a6cf8a311ddde9e341caa39a8f3640b35 100644 --- a/net/bridge/br_sysfs_if.c +++ b/net/bridge/br_sysfs_if.c @@ -173,6 +173,7 @@ BRPORT_ATTR_FLAG(unicast_flood, BR_FLOOD); BRPORT_ATTR_FLAG(proxyarp, BR_PROXYARP); BRPORT_ATTR_FLAG(proxyarp_wifi, BR_PROXYARP_WIFI); BRPORT_ATTR_FLAG(multicast_flood, BR_MCAST_FLOOD); +BRPORT_ATTR_FLAG(broadcast_flood, BR_BCAST_FLOOD); #ifdef CONFIG_BRIDGE_IGMP_SNOOPING static ssize_t show_multicast_router(struct net_bridge_port *p, char *buf) @@ -221,6 +222,7 @@ static const struct brport_attribute *brport_attrs[] = { &brport_attr_proxyarp, &brport_attr_proxyarp_wifi, &brport_attr_multicast_flood, + &brport_attr_broadcast_flood, NULL }; diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index 4e0b0c3593250bd8a1be0cdafca49ce7e4684f94..e0bb624c3845eff5d756830504709eeb3d5bf960 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -9,6 +9,7 @@ */ #include #include +#include "../br_private.h" #include #include #include @@ -18,11 +19,30 @@ static unsigned int ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; + struct net_device *dev; if (!skb_make_writable(skb, 0)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_dest, info->mac); + + if (is_multicast_ether_addr(info->mac)) { + if (is_broadcast_ether_addr(info->mac)) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } else { + if (xt_hooknum(par) != NF_BR_BROUTING) + dev = br_port_get_rcu(xt_in(par))->br->dev; + else + dev = xt_in(par); + + if (ether_addr_equal(info->mac, dev->dev_addr)) + skb->pkt_type = PACKET_HOST; + else + skb->pkt_type = PACKET_OTHERHOST; + } + return info->target; } diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 98b9c8e8615ebc6e2ddefd1885a01af3ef58781b..707caea397433b66c887e527f0e4532eaf260880 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -62,10 +62,10 @@ print_ports(const struct sk_buff *skb, uint8_t protocol, int offset) pptr = skb_header_pointer(skb, offset, sizeof(_ports), &_ports); if (pptr == NULL) { - printk(" INCOMPLETE TCP/UDP header"); + pr_cont(" INCOMPLETE TCP/UDP header"); return; } - printk(" SPT=%u DPT=%u", ntohs(pptr->src), ntohs(pptr->dst)); + pr_cont(" SPT=%u DPT=%u", ntohs(pptr->src), ntohs(pptr->dst)); } } @@ -100,11 +100,11 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (ih == NULL) { - printk(" INCOMPLETE IP header"); + pr_cont(" INCOMPLETE IP 
header"); goto out; } - printk(" IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d", - &ih->saddr, &ih->daddr, ih->tos, ih->protocol); + pr_cont(" IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d", + &ih->saddr, &ih->daddr, ih->tos, ih->protocol); print_ports(skb, ih->protocol, ih->ihl*4); goto out; } @@ -120,11 +120,11 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); if (ih == NULL) { - printk(" INCOMPLETE IPv6 header"); + pr_cont(" INCOMPLETE IPv6 header"); goto out; } - printk(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, Next Header=%d", - &ih->saddr, &ih->daddr, ih->priority, ih->nexthdr); + pr_cont(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, Next Header=%d", + &ih->saddr, &ih->daddr, ih->priority, ih->nexthdr); nexthdr = ih->nexthdr; offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr, &frag_off); if (offset_ph == -1) @@ -142,12 +142,12 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph); if (ah == NULL) { - printk(" INCOMPLETE ARP header"); + pr_cont(" INCOMPLETE ARP header"); goto out; } - printk(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d", - ntohs(ah->ar_hrd), ntohs(ah->ar_pro), - ntohs(ah->ar_op)); + pr_cont(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d", + ntohs(ah->ar_hrd), ntohs(ah->ar_pro), + ntohs(ah->ar_op)); /* If it's for Ethernet and the lengths are OK, * then log the ARP payload @@ -161,17 +161,17 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum, ap = skb_header_pointer(skb, sizeof(_arph), sizeof(_arpp), &_arpp); if (ap == NULL) { - printk(" INCOMPLETE ARP payload"); + pr_cont(" INCOMPLETE ARP payload"); goto out; } - printk(" ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC DST=%pM ARP IP DST=%pI4", - ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst); + pr_cont(" ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC DST=%pM ARP IP DST=%pI4", + ap->mac_src, ap->ip_src, + ap->mac_dst, ap->ip_dst); } } out: - printk("\n"); + pr_cont("\n"); spin_unlock_bh(&ebt_log_lock); - } static unsigned int diff --git a/net/bridge/netfilter/ebtable_broute.c b/net/bridge/netfilter/ebtable_broute.c index 8fe36dc3aab29ceac90443f229b9d745b6dfe5d6..2585b100ebbbc332a33209c2b0bf3643eb0dd961 100644 --- a/net/bridge/netfilter/ebtable_broute.c +++ b/net/bridge/netfilter/ebtable_broute.c @@ -65,13 +65,13 @@ static int ebt_broute(struct sk_buff *skb) static int __net_init broute_net_init(struct net *net) { - net->xt.broute_table = ebt_register_table(net, &broute_table); + net->xt.broute_table = ebt_register_table(net, &broute_table, NULL); return PTR_ERR_OR_ZERO(net->xt.broute_table); } static void __net_exit broute_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.broute_table); + ebt_unregister_table(net, net->xt.broute_table, NULL); } static struct pernet_operations broute_net_ops = { diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c index 593a1bdc079e8ee55165bd13dfef23a0689f3362..f22ef7c219137231e13e756b2695b281b90b9dd8 100644 --- a/net/bridge/netfilter/ebtable_filter.c +++ b/net/bridge/netfilter/ebtable_filter.c @@ -93,13 +93,13 @@ static struct nf_hook_ops ebt_ops_filter[] __read_mostly = { static int __net_init frame_filter_net_init(struct net *net) { - net->xt.frame_filter = ebt_register_table(net, &frame_filter); + net->xt.frame_filter = ebt_register_table(net, &frame_filter, ebt_ops_filter); return PTR_ERR_OR_ZERO(net->xt.frame_filter); } static void __net_exit 
frame_filter_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.frame_filter); + ebt_unregister_table(net, net->xt.frame_filter, ebt_ops_filter); } static struct pernet_operations frame_filter_net_ops = { @@ -109,20 +109,11 @@ static struct pernet_operations frame_filter_net_ops = { static int __init ebtable_filter_init(void) { - int ret; - - ret = register_pernet_subsys(&frame_filter_net_ops); - if (ret < 0) - return ret; - ret = nf_register_hooks(ebt_ops_filter, ARRAY_SIZE(ebt_ops_filter)); - if (ret < 0) - unregister_pernet_subsys(&frame_filter_net_ops); - return ret; + return register_pernet_subsys(&frame_filter_net_ops); } static void __exit ebtable_filter_fini(void) { - nf_unregister_hooks(ebt_ops_filter, ARRAY_SIZE(ebt_ops_filter)); unregister_pernet_subsys(&frame_filter_net_ops); } diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c index eb33919821ee12e1b74610c894c22d3d0fa05f2a..2f7a4f314406382a655a94d2ce384226ab906bfb 100644 --- a/net/bridge/netfilter/ebtable_nat.c +++ b/net/bridge/netfilter/ebtable_nat.c @@ -93,13 +93,13 @@ static struct nf_hook_ops ebt_ops_nat[] __read_mostly = { static int __net_init frame_nat_net_init(struct net *net) { - net->xt.frame_nat = ebt_register_table(net, &frame_nat); + net->xt.frame_nat = ebt_register_table(net, &frame_nat, ebt_ops_nat); return PTR_ERR_OR_ZERO(net->xt.frame_nat); } static void __net_exit frame_nat_net_exit(struct net *net) { - ebt_unregister_table(net, net->xt.frame_nat); + ebt_unregister_table(net, net->xt.frame_nat, ebt_ops_nat); } static struct pernet_operations frame_nat_net_ops = { @@ -109,20 +109,11 @@ static struct pernet_operations frame_nat_net_ops = { static int __init ebtable_nat_init(void) { - int ret; - - ret = register_pernet_subsys(&frame_nat_net_ops); - if (ret < 0) - return ret; - ret = nf_register_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat)); - if (ret < 0) - unregister_pernet_subsys(&frame_nat_net_ops); - return ret; + return register_pernet_subsys(&frame_nat_net_ops); } static void __exit ebtable_nat_fini(void) { - nf_unregister_hooks(ebt_ops_nat, ARRAY_SIZE(ebt_ops_nat)); unregister_pernet_subsys(&frame_nat_net_ops); } diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 79b69917f5210c0065ab5c9c37a497ede7cc273b..9ec0c9f908fa712b18bfa25c87aa7bce12cee786 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1157,8 +1157,30 @@ static int do_replace(struct net *net, const void __user *user, return ret; } +static void __ebt_unregister_table(struct net *net, struct ebt_table *table) +{ + int i; + + mutex_lock(&ebt_mutex); + list_del(&table->list); + mutex_unlock(&ebt_mutex); + EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, + ebt_cleanup_entry, net, NULL); + if (table->private->nentries) + module_put(table->me); + vfree(table->private->entries); + if (table->private->chainstack) { + for_each_possible_cpu(i) + vfree(table->private->chainstack[i]); + vfree(table->private->chainstack); + } + vfree(table->private); + kfree(table); +} + struct ebt_table * -ebt_register_table(struct net *net, const struct ebt_table *input_table) +ebt_register_table(struct net *net, const struct ebt_table *input_table, + const struct nf_hook_ops *ops) { struct ebt_table_info *newinfo; struct ebt_table *t, *table; @@ -1238,6 +1260,16 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table) } list_add(&table->list, &net->xt.tables[NFPROTO_BRIDGE]); mutex_unlock(&ebt_mutex); + + if (!ops) + 
return table; + + ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks)); + if (ret) { + __ebt_unregister_table(net, table); + return ERR_PTR(ret); + } + return table; free_unlock: mutex_unlock(&ebt_mutex); @@ -1256,29 +1288,12 @@ ebt_register_table(struct net *net, const struct ebt_table *input_table) return ERR_PTR(ret); } -void ebt_unregister_table(struct net *net, struct ebt_table *table) +void ebt_unregister_table(struct net *net, struct ebt_table *table, + const struct nf_hook_ops *ops) { - int i; - - if (!table) { - BUGPRINT("Request to unregister NULL table!!!\n"); - return; - } - mutex_lock(&ebt_mutex); - list_del(&table->list); - mutex_unlock(&ebt_mutex); - EBT_ENTRY_ITERATE(table->private->entries, table->private->entries_size, - ebt_cleanup_entry, net, NULL); - if (table->private->nentries) - module_put(table->me); - vfree(table->private->entries); - if (table->private->chainstack) { - for_each_possible_cpu(i) - vfree(table->private->chainstack[i]); - vfree(table->private->chainstack); - } - vfree(table->private); - kfree(table); + if (ops) + nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks)); + __ebt_unregister_table(net, table); } /* userspace just supplied us with counters */ @@ -1713,7 +1728,7 @@ static int compat_copy_entry_to_user(struct ebt_entry *e, void __user **dstptr, if (*size < sizeof(*ce)) return -EINVAL; - ce = (struct ebt_entry __user *)*dstptr; + ce = *dstptr; if (copy_to_user(ce, e, sizeof(*ce))) return -EFAULT; diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c index 5974dbc1ea240fa6b8b2d109891ab211a8d556ff..bb63c9aed55d22af0f9d6538267e50900701a351 100644 --- a/net/bridge/netfilter/nft_meta_bridge.c +++ b/net/bridge/netfilter/nft_meta_bridge.c @@ -111,7 +111,7 @@ nft_meta_bridge_select_ops(const struct nft_ctx *ctx, static struct nft_expr_type nft_meta_bridge_type __read_mostly = { .family = NFPROTO_BRIDGE, .name = "meta", - .select_ops = &nft_meta_bridge_select_ops, + .select_ops = nft_meta_bridge_select_ops, .policy = nft_meta_policy, .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 206dc266ecd237c2874d25352dd631e3bc31b002..346ef6b00b8f05b62edc911d06c01692624596d9 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -375,11 +375,7 @@ static int nft_reject_bridge_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); - int icmp_code, err; - - err = nft_reject_bridge_validate(ctx, expr, NULL); - if (err < 0) - return err; + int icmp_code; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; diff --git a/net/can/af_can.c b/net/can/af_can.c index 5488e4a6ccd062e6f6e7e2b841dde5ef055d4337..b6406fe33c76d1e7ca5f439e9b7409ad3a680e6d 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -2,7 +2,7 @@ * af_can.c - Protocol family CAN core module * (used by different CAN protocol modules) * - * Copyright (c) 2002-2007 Volkswagen Group Electronic Research + * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -75,20 +75,12 @@ static int stats_timer __read_mostly = 1; module_param(stats_timer, int, S_IRUGO); MODULE_PARM_DESC(stats_timer, "enable timer for statistics (default:on)"); -/* receive filters subscribed for 'all' CAN devices */ -struct dev_rcv_lists can_rx_alldev_list; -static DEFINE_SPINLOCK(can_rcvlists_lock); - static struct kmem_cache *rcv_cache __read_mostly; /* table of registered CAN protocols */ static const struct can_proto *proto_tab[CAN_NPROTO] __read_mostly; static DEFINE_MUTEX(proto_tab_lock); -struct timer_list can_stattimer; /* timer for statistics update */ -struct s_stats can_stats; /* packet statistics */ -struct s_pstats can_pstats; /* receive list statistics */ - static atomic_t skbcounter = ATOMIC_INIT(0); /* @@ -145,9 +137,6 @@ static int can_create(struct net *net, struct socket *sock, int protocol, if (protocol < 0 || protocol >= CAN_NPROTO) return -EINVAL; - if (!net_eq(net, &init_net)) - return -EAFNOSUPPORT; - cp = can_get_proto(protocol); #ifdef CONFIG_MODULES @@ -228,6 +217,7 @@ int can_send(struct sk_buff *skb, int loop) { struct sk_buff *newskb = NULL; struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + struct s_stats *can_stats = dev_net(skb->dev)->can.can_stats; int err = -EINVAL; if (skb->len == CAN_MTU) { @@ -316,8 +306,8 @@ int can_send(struct sk_buff *skb, int loop) netif_rx_ni(newskb); /* update statistics */ - can_stats.tx_frames++; - can_stats.tx_frames_delta++; + can_stats->tx_frames++; + can_stats->tx_frames_delta++; return 0; @@ -331,10 +321,11 @@ EXPORT_SYMBOL(can_send); * af_can rx path */ -static struct dev_rcv_lists *find_dev_rcv_lists(struct net_device *dev) +static struct dev_rcv_lists *find_dev_rcv_lists(struct net *net, + struct net_device *dev) { if (!dev) - return &can_rx_alldev_list; + return net->can.can_rx_alldev_list; else return (struct dev_rcv_lists *)dev->ml_priv; } @@ -467,13 +458,14 @@ static struct hlist_head *find_rcv_list(canid_t *can_id, canid_t *mask, * -ENOMEM on missing cache mem to create subscription entry * -ENODEV unknown device */ -int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, - void (*func)(struct sk_buff *, void *), void *data, - char *ident, struct sock *sk) +int can_rx_register(struct net *net, struct net_device *dev, canid_t can_id, + canid_t mask, void (*func)(struct sk_buff *, void *), + void *data, char *ident, struct sock *sk) { struct receiver *r; struct hlist_head *rl; struct dev_rcv_lists *d; + struct s_pstats *can_pstats = net->can.can_pstats; int err = 0; /* insert new receiver (dev,canid,mask) -> (func,data) */ @@ -481,13 +473,16 @@ int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, if (dev && dev->type != ARPHRD_CAN) return -ENODEV; + if (dev && !net_eq(net, dev_net(dev))) + return -ENODEV; + r = kmem_cache_alloc(rcv_cache, GFP_KERNEL); if (!r) return -ENOMEM; - spin_lock(&can_rcvlists_lock); + spin_lock(&net->can.can_rcvlists_lock); - d = find_dev_rcv_lists(dev); + d = find_dev_rcv_lists(net, dev); if (d) { rl = find_rcv_list(&can_id, &mask, d); @@ -502,15 +497,15 @@ int can_rx_register(struct net_device *dev, canid_t can_id, canid_t mask, hlist_add_head_rcu(&r->list, rl); d->entries++; - can_pstats.rcv_entries++; - if (can_pstats.rcv_entries_max < can_pstats.rcv_entries) - can_pstats.rcv_entries_max = can_pstats.rcv_entries; + can_pstats->rcv_entries++; + if (can_pstats->rcv_entries_max < can_pstats->rcv_entries) + can_pstats->rcv_entries_max = 
can_pstats->rcv_entries; } else { kmem_cache_free(rcv_cache, r); err = -ENODEV; } - spin_unlock(&can_rcvlists_lock); + spin_unlock(&net->can.can_rcvlists_lock); return err; } @@ -540,19 +535,24 @@ static void can_rx_delete_receiver(struct rcu_head *rp) * Description: * Removes subscription entry depending on given (subscription) values. */ -void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, - void (*func)(struct sk_buff *, void *), void *data) +void can_rx_unregister(struct net *net, struct net_device *dev, canid_t can_id, + canid_t mask, void (*func)(struct sk_buff *, void *), + void *data) { struct receiver *r = NULL; struct hlist_head *rl; + struct s_pstats *can_pstats = net->can.can_pstats; struct dev_rcv_lists *d; if (dev && dev->type != ARPHRD_CAN) return; - spin_lock(&can_rcvlists_lock); + if (dev && !net_eq(net, dev_net(dev))) + return; + + spin_lock(&net->can.can_rcvlists_lock); - d = find_dev_rcv_lists(dev); + d = find_dev_rcv_lists(net, dev); if (!d) { pr_err("BUG: receive list not found for " "dev %s, id %03X, mask %03X\n", @@ -588,8 +588,8 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, hlist_del_rcu(&r->list); d->entries--; - if (can_pstats.rcv_entries > 0) - can_pstats.rcv_entries--; + if (can_pstats->rcv_entries > 0) + can_pstats->rcv_entries--; /* remove device structure requested by NETDEV_UNREGISTER */ if (d->remove_on_zero_entries && !d->entries) { @@ -598,7 +598,7 @@ void can_rx_unregister(struct net_device *dev, canid_t can_id, canid_t mask, } out: - spin_unlock(&can_rcvlists_lock); + spin_unlock(&net->can.can_rcvlists_lock); /* schedule the receiver item for deletion */ if (r) { @@ -683,11 +683,13 @@ static int can_rcv_filter(struct dev_rcv_lists *d, struct sk_buff *skb) static void can_receive(struct sk_buff *skb, struct net_device *dev) { struct dev_rcv_lists *d; + struct net *net = dev_net(dev); + struct s_stats *can_stats = net->can.can_stats; int matches; /* update statistics */ - can_stats.rx_frames++; - can_stats.rx_frames_delta++; + can_stats->rx_frames++; + can_stats->rx_frames_delta++; /* create non-zero unique skb identifier together with *skb */ while (!(can_skb_prv(skb)->skbcnt)) @@ -696,10 +698,10 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) rcu_read_lock(); /* deliver the packet to sockets listening on all devices */ - matches = can_rcv_filter(&can_rx_alldev_list, skb); + matches = can_rcv_filter(net->can.can_rx_alldev_list, skb); /* find receive list for this device */ - d = find_dev_rcv_lists(dev); + d = find_dev_rcv_lists(net, dev); if (d) matches += can_rcv_filter(d, skb); @@ -709,8 +711,8 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) consume_skb(skb); if (matches > 0) { - can_stats.matches++; - can_stats.matches_delta++; + can_stats->matches++; + can_stats->matches_delta++; } } @@ -719,9 +721,6 @@ static int can_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (unlikely(!net_eq(dev_net(dev), &init_net))) - goto drop; - if (WARN_ONCE(dev->type != ARPHRD_CAN || skb->len != CAN_MTU || cfd->len > CAN_MAX_DLEN, @@ -743,9 +742,6 @@ static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, { struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - if (unlikely(!net_eq(dev_net(dev), &init_net))) - goto drop; - if (WARN_ONCE(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU || cfd->len > CANFD_MAX_DLEN, @@ -835,9 +831,6 @@ static int can_notifier(struct notifier_block 
*nb, unsigned long msg, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct dev_rcv_lists *d; - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; - if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; @@ -855,7 +848,7 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg, break; case NETDEV_UNREGISTER: - spin_lock(&can_rcvlists_lock); + spin_lock(&dev_net(dev)->can.can_rcvlists_lock); d = dev->ml_priv; if (d) { @@ -869,7 +862,7 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg, pr_err("can: notifier: receive list not found for dev " "%s\n", dev->name); - spin_unlock(&can_rcvlists_lock); + spin_unlock(&dev_net(dev)->can.can_rcvlists_lock); break; } @@ -877,6 +870,59 @@ static int can_notifier(struct notifier_block *nb, unsigned long msg, return NOTIFY_DONE; } +static int can_pernet_init(struct net *net) +{ + net->can.can_rcvlists_lock = + __SPIN_LOCK_UNLOCKED(net->can.can_rcvlists_lock); + net->can.can_rx_alldev_list = + kzalloc(sizeof(struct dev_rcv_lists), GFP_KERNEL); + + net->can.can_stats = kzalloc(sizeof(struct s_stats), GFP_KERNEL); + net->can.can_pstats = kzalloc(sizeof(struct s_pstats), GFP_KERNEL); + + if (IS_ENABLED(CONFIG_PROC_FS)) { + /* the statistics are updated every second (timer triggered) */ + if (stats_timer) { + setup_timer(&net->can.can_stattimer, can_stat_update, + (unsigned long)net); + mod_timer(&net->can.can_stattimer, + round_jiffies(jiffies + HZ)); + } + net->can.can_stats->jiffies_init = jiffies; + can_init_proc(net); + } + + return 0; +} + +static void can_pernet_exit(struct net *net) +{ + struct net_device *dev; + + if (IS_ENABLED(CONFIG_PROC_FS)) { + can_remove_proc(net); + if (stats_timer) + del_timer_sync(&net->can.can_stattimer); + } + + /* remove created dev_rcv_lists from still registered CAN devices */ + rcu_read_lock(); + for_each_netdev_rcu(net, dev) { + if (dev->type == ARPHRD_CAN && dev->ml_priv) { + struct dev_rcv_lists *d = dev->ml_priv; + + BUG_ON(d->entries); + kfree(d); + dev->ml_priv = NULL; + } + } + rcu_read_unlock(); + + kfree(net->can.can_rx_alldev_list); + kfree(net->can.can_stats); + kfree(net->can.can_pstats); +} + /* * af_can module init/exit functions */ @@ -902,6 +948,11 @@ static struct notifier_block can_netdev_notifier __read_mostly = { .notifier_call = can_notifier, }; +static struct pernet_operations can_pernet_ops __read_mostly = { + .init = can_pernet_init, + .exit = can_pernet_exit, +}; + static __init int can_init(void) { /* check for correct padding to be able to use the structs similarly */ @@ -912,21 +963,12 @@ static __init int can_init(void) pr_info("can: controller area network core (" CAN_VERSION_STRING ")\n"); - memset(&can_rx_alldev_list, 0, sizeof(can_rx_alldev_list)); - rcv_cache = kmem_cache_create("can_receiver", sizeof(struct receiver), 0, 0, NULL); if (!rcv_cache) return -ENOMEM; - if (IS_ENABLED(CONFIG_PROC_FS)) { - if (stats_timer) { - /* the statistics are updated every second (timer triggered) */ - setup_timer(&can_stattimer, can_stat_update, 0); - mod_timer(&can_stattimer, round_jiffies(jiffies + HZ)); - } - can_init_proc(); - } + register_pernet_subsys(&can_pernet_ops); /* protocol register */ sock_register(&can_family_ops); @@ -939,34 +981,13 @@ static __init int can_init(void) static __exit void can_exit(void) { - struct net_device *dev; - - if (IS_ENABLED(CONFIG_PROC_FS)) { - if (stats_timer) - del_timer_sync(&can_stattimer); - - can_remove_proc(); - } - /* protocol unregister */ dev_remove_pack(&canfd_packet); dev_remove_pack(&can_packet); 
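For reference, the can_pernet_init()/can_pernet_exit() pair above is the standard mechanism for moving module-global state into per-network-namespace state. A minimal, self-contained sketch of that registration pattern follows; the demo_* names are illustrative, not from the patch, and real per-net state lives in the struct net "can" member as seen in the hunks.

#include <linux/module.h>
#include <net/net_namespace.h>

static int __net_init demo_pernet_init(struct net *net)
{
	/* allocate and initialize this namespace's private state here */
	return 0;
}

static void __net_exit demo_pernet_exit(struct net *net)
{
	/* undo demo_pernet_init(); runs when the namespace is torn down */
}

static struct pernet_operations demo_pernet_ops = {
	.init = demo_pernet_init,
	.exit = demo_pernet_exit,
};

static int __init demo_init(void)
{
	/* .init is called for init_net and for every namespace created
	 * later, so the module no longer needs globals or net_eq() checks.
	 */
	return register_pernet_subsys(&demo_pernet_ops);
}

static void __exit demo_exit(void)
{
	unregister_pernet_subsys(&demo_pernet_ops);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");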
unregister_netdevice_notifier(&can_netdev_notifier); sock_unregister(PF_CAN); - /* remove created dev_rcv_lists from still registered CAN devices */ - rcu_read_lock(); - for_each_netdev_rcu(&init_net, dev) { - if (dev->type == ARPHRD_CAN && dev->ml_priv) { - - struct dev_rcv_lists *d = dev->ml_priv; - - BUG_ON(d->entries); - kfree(d); - dev->ml_priv = NULL; - } - } - rcu_read_unlock(); + unregister_pernet_subsys(&can_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ diff --git a/net/can/af_can.h b/net/can/af_can.h index b86f5129e8385fe84ef671bb914e8e05c2977ca0..d0ef45bb2a72748e95f4ed0c0156e723ea01b7a3 100644 --- a/net/can/af_can.h +++ b/net/can/af_can.h @@ -110,18 +110,9 @@ struct s_pstats { unsigned long rcv_entries_max; }; -/* receive filters subscribed for 'all' CAN devices */ -extern struct dev_rcv_lists can_rx_alldev_list; - /* function prototypes for the CAN networklayer procfs (proc.c) */ -void can_init_proc(void); -void can_remove_proc(void); +void can_init_proc(struct net *net); +void can_remove_proc(struct net *net); void can_stat_update(unsigned long data); -/* structures and variables from af_can.c needed in proc.c for reading */ -extern struct timer_list can_stattimer; /* timer for statistics update */ -extern struct s_stats can_stats; /* packet statistics */ -extern struct s_pstats can_pstats; /* receive list statistics */ -extern struct hlist_head can_rx_dev_list; /* rx dispatcher structures */ - #endif /* AF_CAN_H */ diff --git a/net/can/bcm.c b/net/can/bcm.c index 95d13b233c65161cf3595a8b0036207f5c2892e3..65432633a25000361c720058f8c3e8499fff9529 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1,7 +1,7 @@ /* * bcm.c - Broadcast Manager to filter/send (cyclic) CAN content * - * Copyright (c) 2002-2016 Volkswagen Group Electronic Research + * Copyright (c) 2002-2017 Volkswagen Group Electronic Research * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -77,7 +77,7 @@ (CAN_EFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG) : \ (CAN_SFF_MASK | CAN_EFF_FLAG | CAN_RTR_FLAG)) -#define CAN_BCM_VERSION "20161123" +#define CAN_BCM_VERSION "20170425" MODULE_DESCRIPTION("PF_CAN broadcast manager protocol"); MODULE_LICENSE("Dual BSD/GPL"); @@ -118,8 +118,6 @@ struct bcm_op { struct net_device *rx_reg_dev; }; -static struct proc_dir_entry *proc_dir; - struct bcm_sock { struct sock sk; int bound; @@ -149,7 +147,8 @@ static inline ktime_t bcm_timeval_to_ktime(struct bcm_timeval tv) /* * procfs functions */ -static char *bcm_proc_getifname(char *result, int ifindex) +#if IS_ENABLED(CONFIG_PROC_FS) +static char *bcm_proc_getifname(struct net *net, char *result, int ifindex) { struct net_device *dev; @@ -157,7 +156,7 @@ static char *bcm_proc_getifname(char *result, int ifindex) return "any"; rcu_read_lock(); - dev = dev_get_by_index_rcu(&init_net, ifindex); + dev = dev_get_by_index_rcu(net, ifindex); if (dev) strcpy(result, dev->name); else @@ -170,7 +169,8 @@ static char *bcm_proc_getifname(char *result, int ifindex) static int bcm_proc_show(struct seq_file *m, void *v) { char ifname[IFNAMSIZ]; - struct sock *sk = (struct sock *)m->private; + struct net *net = m->private; + struct sock *sk = (struct sock *)PDE_DATA(m->file->f_inode); struct bcm_sock *bo = bcm_sk(sk); struct bcm_op *op; @@ -178,7 +178,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) seq_printf(m, " / sk %pK", sk); seq_printf(m, " / bo %pK", bo); seq_printf(m, " / dropped %lu", bo->dropped_usr_msgs); - seq_printf(m, " / bound %s", bcm_proc_getifname(ifname, bo->ifindex)); + seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex)); seq_printf(m, " <<<\n"); list_for_each_entry(op, &bo->rx_ops, list) { @@ -190,7 +190,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) continue; seq_printf(m, "rx_op: %03X %-5s ", op->can_id, - bcm_proc_getifname(ifname, op->ifindex)); + bcm_proc_getifname(net, ifname, op->ifindex)); if (op->flags & CAN_FD_FRAME) seq_printf(m, "(%u)", op->nframes); @@ -219,7 +219,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) list_for_each_entry(op, &bo->tx_ops, list) { seq_printf(m, "tx_op: %03X %s ", op->can_id, - bcm_proc_getifname(ifname, op->ifindex)); + bcm_proc_getifname(net, ifname, op->ifindex)); if (op->flags & CAN_FD_FRAME) seq_printf(m, "(%u) ", op->nframes); @@ -242,7 +242,7 @@ static int bcm_proc_show(struct seq_file *m, void *v) static int bcm_proc_open(struct inode *inode, struct file *file) { - return single_open(file, bcm_proc_show, PDE_DATA(inode)); + return single_open_net(inode, file, bcm_proc_show); } static const struct file_operations bcm_proc_fops = { @@ -252,6 +252,7 @@ static const struct file_operations bcm_proc_fops = { .llseek = seq_lseek, .release = single_release, }; +#endif /* CONFIG_PROC_FS */ /* * bcm_can_tx - send the (next) CAN frame to the appropriate CAN interface @@ -267,7 +268,7 @@ static void bcm_can_tx(struct bcm_op *op) if (!op->ifindex) return; - dev = dev_get_by_index(&init_net, op->ifindex); + dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (!dev) { /* RFC: should this bcm_op remove itself here? 
*/ return; @@ -764,8 +765,8 @@ static void bcm_remove_op(struct bcm_op *op) static void bcm_rx_unreg(struct net_device *dev, struct bcm_op *op) { if (op->rx_reg_dev == dev) { - can_rx_unregister(dev, op->can_id, REGMASK(op->can_id), - bcm_rx_handler, op); + can_rx_unregister(dev_net(dev), dev, op->can_id, + REGMASK(op->can_id), bcm_rx_handler, op); /* mark as removed subscription */ op->rx_reg_dev = NULL; @@ -800,7 +801,7 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, if (op->rx_reg_dev) { struct net_device *dev; - dev = dev_get_by_index(&init_net, + dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (dev) { bcm_rx_unreg(dev, op); @@ -808,7 +809,8 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, } } } else - can_rx_unregister(NULL, op->can_id, + can_rx_unregister(sock_net(op->sk), NULL, + op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); @@ -1220,9 +1222,10 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (ifindex) { struct net_device *dev; - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index(sock_net(sk), ifindex); if (dev) { - err = can_rx_register(dev, op->can_id, + err = can_rx_register(sock_net(sk), dev, + op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, "bcm", sk); @@ -1232,7 +1235,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } } else - err = can_rx_register(NULL, op->can_id, + err = can_rx_register(sock_net(sk), NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op, "bcm", sk); if (err) { @@ -1272,7 +1275,7 @@ static int bcm_tx_send(struct msghdr *msg, int ifindex, struct sock *sk, return err; } - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) { kfree_skb(skb); return -ENODEV; @@ -1337,7 +1340,7 @@ static int bcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (ifindex) { struct net_device *dev; - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) return -ENODEV; @@ -1418,7 +1421,7 @@ static int bcm_notifier(struct notifier_block *nb, unsigned long msg, struct bcm_op *op; int notify_enodev = 0; - if (!net_eq(dev_net(dev), &init_net)) + if (!net_eq(dev_net(dev), sock_net(sk))) return NOTIFY_DONE; if (dev->type != ARPHRD_CAN) @@ -1490,6 +1493,7 @@ static int bcm_init(struct sock *sk) static int bcm_release(struct socket *sock) { struct sock *sk = sock->sk; + struct net *net = sock_net(sk); struct bcm_sock *bo; struct bcm_op *op, *next; @@ -1521,23 +1525,25 @@ static int bcm_release(struct socket *sock) if (op->rx_reg_dev) { struct net_device *dev; - dev = dev_get_by_index(&init_net, op->ifindex); + dev = dev_get_by_index(net, op->ifindex); if (dev) { bcm_rx_unreg(dev, op); dev_put(dev); } } } else - can_rx_unregister(NULL, op->can_id, + can_rx_unregister(net, NULL, op->can_id, REGMASK(op->can_id), bcm_rx_handler, op); bcm_remove_op(op); } +#if IS_ENABLED(CONFIG_PROC_FS) /* remove procfs entry */ - if (proc_dir && bo->bcm_proc_read) - remove_proc_entry(bo->procname, proc_dir); + if (net->can.bcmproc_dir && bo->bcm_proc_read) + remove_proc_entry(bo->procname, net->can.bcmproc_dir); +#endif /* CONFIG_PROC_FS */ /* remove device reference */ if (bo->bound) { @@ -1560,6 +1566,7 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); + struct net *net = 
sock_net(sk); int ret = 0; if (len < sizeof(*addr)) @@ -1576,7 +1583,7 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, if (addr->can_ifindex) { struct net_device *dev; - dev = dev_get_by_index(&init_net, addr->can_ifindex); + dev = dev_get_by_index(net, addr->can_ifindex); if (!dev) { ret = -ENODEV; goto fail; @@ -1595,17 +1602,19 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, bo->ifindex = 0; } - if (proc_dir) { +#if IS_ENABLED(CONFIG_PROC_FS) + if (net->can.bcmproc_dir) { /* unique socket address as filename */ sprintf(bo->procname, "%lu", sock_i_ino(sk)); bo->bcm_proc_read = proc_create_data(bo->procname, 0644, - proc_dir, + net->can.bcmproc_dir, &bcm_proc_fops, sk); if (!bo->bcm_proc_read) { ret = -ENOMEM; goto fail; } } +#endif /* CONFIG_PROC_FS */ bo->bound = 1; @@ -1686,6 +1695,30 @@ static const struct can_proto bcm_can_proto = { .prot = &bcm_proto, }; +static int canbcm_pernet_init(struct net *net) +{ +#if IS_ENABLED(CONFIG_PROC_FS) + /* create /proc/net/can-bcm directory */ + net->can.bcmproc_dir = proc_net_mkdir(net, "can-bcm", net->proc_net); +#endif /* CONFIG_PROC_FS */ + + return 0; +} + +static void canbcm_pernet_exit(struct net *net) +{ +#if IS_ENABLED(CONFIG_PROC_FS) + /* remove /proc/net/can-bcm directory */ + if (net->can.bcmproc_dir) + remove_proc_entry("can-bcm", net->proc_net); +#endif /* CONFIG_PROC_FS */ +} + +static struct pernet_operations canbcm_pernet_ops __read_mostly = { + .init = canbcm_pernet_init, + .exit = canbcm_pernet_exit, +}; + static int __init bcm_module_init(void) { int err; @@ -1698,17 +1731,14 @@ static int __init bcm_module_init(void) return err; } - /* create /proc/net/can-bcm directory */ - proc_dir = proc_mkdir("can-bcm", init_net.proc_net); + register_pernet_subsys(&canbcm_pernet_ops); return 0; } static void __exit bcm_module_exit(void) { can_proto_unregister(&bcm_can_proto); - - if (proc_dir) - remove_proc_entry("can-bcm", init_net.proc_net); + unregister_pernet_subsys(&canbcm_pernet_ops); } module_init(bcm_module_init); diff --git a/net/can/gw.c b/net/can/gw.c index 7056a1a2bb70098e691ce557f05e5bc1f27cb42f..29748d844c3fac1bda35d494cdc1474fe81849a9 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -1,7 +1,7 @@ /* * gw.c - CAN frame Gateway/Router/Bridge with netlink interface * - * Copyright (c) 2011 Volkswagen Group Electronic Research + * Copyright (c) 2017 Volkswagen Group Electronic Research * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -59,7 +59,7 @@ #include #include -#define CAN_GW_VERSION "20130117" +#define CAN_GW_VERSION "20170425" #define CAN_GW_NAME "can-gw" MODULE_DESCRIPTION("PF_CAN netlink gateway"); @@ -79,9 +79,7 @@ MODULE_PARM_DESC(max_hops, __stringify(CGW_MAX_HOPS) " hops, " "default: " __stringify(CGW_DEFAULT_HOPS) ")"); -static HLIST_HEAD(cgw_list); static struct notifier_block notifier; - static struct kmem_cache *cgw_cache __read_mostly; /* structure that contains the (on-the-fly) CAN frame modifications */ @@ -438,16 +436,16 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) gwj->handled_frames++; } -static inline int cgw_register_filter(struct cgw_job *gwj) +static inline int cgw_register_filter(struct net *net, struct cgw_job *gwj) { - return can_rx_register(gwj->src.dev, gwj->ccgw.filter.can_id, + return can_rx_register(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj, "gw", NULL); } -static inline void cgw_unregister_filter(struct cgw_job *gwj) +static inline void cgw_unregister_filter(struct net *net, struct cgw_job *gwj) { - can_rx_unregister(gwj->src.dev, gwj->ccgw.filter.can_id, + can_rx_unregister(net, gwj->src.dev, gwj->ccgw.filter.can_id, gwj->ccgw.filter.can_mask, can_can_gw_rcv, gwj); } @@ -455,9 +453,8 @@ static int cgw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct net *net = dev_net(dev); - if (!net_eq(dev_net(dev), &init_net)) - return NOTIFY_DONE; if (dev->type != ARPHRD_CAN) return NOTIFY_DONE; @@ -468,11 +465,11 @@ static int cgw_notifier(struct notifier_block *nb, ASSERT_RTNL(); - hlist_for_each_entry_safe(gwj, nx, &cgw_list, list) { + hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->src.dev == dev || gwj->dst.dev == dev) { hlist_del(&gwj->list); - cgw_unregister_filter(gwj); + cgw_unregister_filter(net, gwj); kmem_cache_free(cgw_cache, gwj); } } @@ -592,12 +589,13 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, /* Dump information about all CAN gateway jobs, in response to RTM_GETROUTE */ static int cgw_dump_jobs(struct sk_buff *skb, struct netlink_callback *cb) { + struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; int idx = 0; int s_idx = cb->args[0]; rcu_read_lock(); - hlist_for_each_entry_rcu(gwj, &cgw_list, list) { + hlist_for_each_entry_rcu(gwj, &net->can.cgw_list, list) { if (idx < s_idx) goto cont; @@ -641,7 +639,7 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, memset(mod, 0, sizeof(*mod)); err = nlmsg_parse(nlh, sizeof(struct rtcanmsg), tb, CGW_MAX, - cgw_policy); + cgw_policy, NULL); if (err < 0) return err; @@ -809,8 +807,10 @@ static int cgw_parse_attr(struct nlmsghdr *nlh, struct cf_mod *mod, return 0; } -static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) +static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { + struct net *net = sock_net(skb->sk); struct rtcanmsg *r; struct cgw_job *gwj; struct cf_mod mod; @@ -841,7 +841,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) ASSERT_RTNL(); /* check for updating an existing job with identical uid */ - hlist_for_each_entry(gwj, &cgw_list, list) { + hlist_for_each_entry(gwj, &net->can.cgw_list, list) { if (gwj->mod.uid != mod.uid) continue; @@ -879,7 +879,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) err 
= -ENODEV; - gwj->src.dev = __dev_get_by_index(&init_net, gwj->ccgw.src_idx); + gwj->src.dev = __dev_get_by_index(net, gwj->ccgw.src_idx); if (!gwj->src.dev) goto out; @@ -887,7 +887,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) if (gwj->src.dev->type != ARPHRD_CAN) goto out; - gwj->dst.dev = __dev_get_by_index(&init_net, gwj->ccgw.dst_idx); + gwj->dst.dev = __dev_get_by_index(net, gwj->ccgw.dst_idx); if (!gwj->dst.dev) goto out; @@ -897,9 +897,9 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) ASSERT_RTNL(); - err = cgw_register_filter(gwj); + err = cgw_register_filter(net, gwj); if (!err) - hlist_add_head_rcu(&gwj->list, &cgw_list); + hlist_add_head_rcu(&gwj->list, &net->can.cgw_list); out: if (err) kmem_cache_free(cgw_cache, gwj); @@ -907,22 +907,24 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } -static void cgw_remove_all_jobs(void) +static void cgw_remove_all_jobs(struct net *net) { struct cgw_job *gwj = NULL; struct hlist_node *nx; ASSERT_RTNL(); - hlist_for_each_entry_safe(gwj, nx, &cgw_list, list) { + hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { hlist_del(&gwj->list); - cgw_unregister_filter(gwj); + cgw_unregister_filter(net, gwj); kmem_cache_free(cgw_cache, gwj); } } -static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) +static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { + struct net *net = sock_net(skb->sk); struct cgw_job *gwj = NULL; struct hlist_node *nx; struct rtcanmsg *r; @@ -951,7 +953,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) /* two interface indices both set to 0 => remove all entries */ if (!ccgw.src_idx && !ccgw.dst_idx) { - cgw_remove_all_jobs(); + cgw_remove_all_jobs(net); return 0; } @@ -960,7 +962,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) ASSERT_RTNL(); /* remove only the first matching entry */ - hlist_for_each_entry_safe(gwj, nx, &cgw_list, list) { + hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { if (gwj->flags != r->flags) continue; @@ -983,7 +985,7 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) continue; hlist_del(&gwj->list); - cgw_unregister_filter(gwj); + cgw_unregister_filter(net, gwj); kmem_cache_free(cgw_cache, gwj); err = 0; break; @@ -992,6 +994,24 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } +static int __net_init cangw_pernet_init(struct net *net) +{ + INIT_HLIST_HEAD(&net->can.cgw_list); + return 0; +} + +static void __net_exit cangw_pernet_exit(struct net *net) +{ + rtnl_lock(); + cgw_remove_all_jobs(net); + rtnl_unlock(); +} + +static struct pernet_operations cangw_pernet_ops = { + .init = cangw_pernet_init, + .exit = cangw_pernet_exit, +}; + static __init int cgw_module_init(void) { /* sanitize given module parameter */ @@ -1000,6 +1020,7 @@ static __init int cgw_module_init(void) pr_info("can: netlink gateway (rev " CAN_GW_VERSION ") max_hops=%d\n", max_hops); + register_pernet_subsys(&cangw_pernet_ops); cgw_cache = kmem_cache_create("can_gw", sizeof(struct cgw_job), 0, 0, NULL); @@ -1029,10 +1050,7 @@ static __exit void cgw_module_exit(void) unregister_netdevice_notifier(¬ifier); - rtnl_lock(); - cgw_remove_all_jobs(); - rtnl_unlock(); - + unregister_pernet_subsys(&cangw_pernet_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ kmem_cache_destroy(cgw_cache); diff --git a/net/can/proc.c b/net/can/proc.c index 
85ef7bb0f1768fdc34e41baae297893e4758d16c..83045f00c63c147dd4664c76b17a98a9e040c45e 100644 --- a/net/can/proc.c +++ b/net/can/proc.c @@ -62,17 +62,6 @@ #define CAN_PROC_RCVLIST_EFF "rcvlist_eff" #define CAN_PROC_RCVLIST_ERR "rcvlist_err" -static struct proc_dir_entry *can_dir; -static struct proc_dir_entry *pde_version; -static struct proc_dir_entry *pde_stats; -static struct proc_dir_entry *pde_reset_stats; -static struct proc_dir_entry *pde_rcvlist_all; -static struct proc_dir_entry *pde_rcvlist_fil; -static struct proc_dir_entry *pde_rcvlist_inv; -static struct proc_dir_entry *pde_rcvlist_sff; -static struct proc_dir_entry *pde_rcvlist_eff; -static struct proc_dir_entry *pde_rcvlist_err; - static int user_reset; static const char rx_list_name[][8] = { @@ -86,21 +75,23 @@ static const char rx_list_name[][8] = { * af_can statistics stuff */ -static void can_init_stats(void) +static void can_init_stats(struct net *net) { + struct s_stats *can_stats = net->can.can_stats; + struct s_pstats *can_pstats = net->can.can_pstats; /* * This memset function is called from a timer context (when * can_stattimer is active which is the default) OR in a process * context (reading the proc_fs when can_stattimer is disabled). */ - memset(&can_stats, 0, sizeof(can_stats)); - can_stats.jiffies_init = jiffies; + memset(can_stats, 0, sizeof(struct s_stats)); + can_stats->jiffies_init = jiffies; - can_pstats.stats_reset++; + can_pstats->stats_reset++; if (user_reset) { user_reset = 0; - can_pstats.user_reset++; + can_pstats->user_reset++; } } @@ -126,64 +117,66 @@ static unsigned long calc_rate(unsigned long oldjif, unsigned long newjif, void can_stat_update(unsigned long data) { + struct net *net = (struct net *)data; + struct s_stats *can_stats = net->can.can_stats; unsigned long j = jiffies; /* snapshot */ /* restart counting in timer context on user request */ if (user_reset) - can_init_stats(); + can_init_stats(net); /* restart counting on jiffies overflow */ - if (j < can_stats.jiffies_init) - can_init_stats(); + if (j < can_stats->jiffies_init) + can_init_stats(net); /* prevent overflow in calc_rate() */ - if (can_stats.rx_frames > (ULONG_MAX / HZ)) - can_init_stats(); + if (can_stats->rx_frames > (ULONG_MAX / HZ)) + can_init_stats(net); /* prevent overflow in calc_rate() */ - if (can_stats.tx_frames > (ULONG_MAX / HZ)) - can_init_stats(); + if (can_stats->tx_frames > (ULONG_MAX / HZ)) + can_init_stats(net); /* matches overflow - very improbable */ - if (can_stats.matches > (ULONG_MAX / 100)) - can_init_stats(); + if (can_stats->matches > (ULONG_MAX / 100)) + can_init_stats(net); /* calc total values */ - if (can_stats.rx_frames) - can_stats.total_rx_match_ratio = (can_stats.matches * 100) / - can_stats.rx_frames; + if (can_stats->rx_frames) + can_stats->total_rx_match_ratio = (can_stats->matches * 100) / + can_stats->rx_frames; - can_stats.total_tx_rate = calc_rate(can_stats.jiffies_init, j, - can_stats.tx_frames); - can_stats.total_rx_rate = calc_rate(can_stats.jiffies_init, j, - can_stats.rx_frames); + can_stats->total_tx_rate = calc_rate(can_stats->jiffies_init, j, + can_stats->tx_frames); + can_stats->total_rx_rate = calc_rate(can_stats->jiffies_init, j, + can_stats->rx_frames); /* calc current values */ - if (can_stats.rx_frames_delta) - can_stats.current_rx_match_ratio = - (can_stats.matches_delta * 100) / - can_stats.rx_frames_delta; + if (can_stats->rx_frames_delta) + can_stats->current_rx_match_ratio = + (can_stats->matches_delta * 100) / + can_stats->rx_frames_delta; - can_stats.current_tx_rate 
= calc_rate(0, HZ, can_stats.tx_frames_delta); - can_stats.current_rx_rate = calc_rate(0, HZ, can_stats.rx_frames_delta); + can_stats->current_tx_rate = calc_rate(0, HZ, can_stats->tx_frames_delta); + can_stats->current_rx_rate = calc_rate(0, HZ, can_stats->rx_frames_delta); /* check / update maximum values */ - if (can_stats.max_tx_rate < can_stats.current_tx_rate) - can_stats.max_tx_rate = can_stats.current_tx_rate; + if (can_stats->max_tx_rate < can_stats->current_tx_rate) + can_stats->max_tx_rate = can_stats->current_tx_rate; - if (can_stats.max_rx_rate < can_stats.current_rx_rate) - can_stats.max_rx_rate = can_stats.current_rx_rate; + if (can_stats->max_rx_rate < can_stats->current_rx_rate) + can_stats->max_rx_rate = can_stats->current_rx_rate; - if (can_stats.max_rx_match_ratio < can_stats.current_rx_match_ratio) - can_stats.max_rx_match_ratio = can_stats.current_rx_match_ratio; + if (can_stats->max_rx_match_ratio < can_stats->current_rx_match_ratio) + can_stats->max_rx_match_ratio = can_stats->current_rx_match_ratio; /* clear values for 'current rate' calculation */ - can_stats.tx_frames_delta = 0; - can_stats.rx_frames_delta = 0; - can_stats.matches_delta = 0; + can_stats->tx_frames_delta = 0; + can_stats->rx_frames_delta = 0; + can_stats->matches_delta = 0; /* restart timer (one second) */ - mod_timer(&can_stattimer, round_jiffies(jiffies + HZ)); + mod_timer(&net->can.can_stattimer, round_jiffies(jiffies + HZ)); } /* @@ -217,57 +210,61 @@ static void can_print_recv_banner(struct seq_file *m) static int can_stats_proc_show(struct seq_file *m, void *v) { + struct net *net = m->private; + struct s_stats *can_stats = net->can.can_stats; + struct s_pstats *can_pstats = net->can.can_pstats; + seq_putc(m, '\n'); - seq_printf(m, " %8ld transmitted frames (TXF)\n", can_stats.tx_frames); - seq_printf(m, " %8ld received frames (RXF)\n", can_stats.rx_frames); - seq_printf(m, " %8ld matched frames (RXMF)\n", can_stats.matches); + seq_printf(m, " %8ld transmitted frames (TXF)\n", can_stats->tx_frames); + seq_printf(m, " %8ld received frames (RXF)\n", can_stats->rx_frames); + seq_printf(m, " %8ld matched frames (RXMF)\n", can_stats->matches); seq_putc(m, '\n'); - if (can_stattimer.function == can_stat_update) { + if (net->can.can_stattimer.function == can_stat_update) { seq_printf(m, " %8ld %% total match ratio (RXMR)\n", - can_stats.total_rx_match_ratio); + can_stats->total_rx_match_ratio); seq_printf(m, " %8ld frames/s total tx rate (TXR)\n", - can_stats.total_tx_rate); + can_stats->total_tx_rate); seq_printf(m, " %8ld frames/s total rx rate (RXR)\n", - can_stats.total_rx_rate); + can_stats->total_rx_rate); seq_putc(m, '\n'); seq_printf(m, " %8ld %% current match ratio (CRXMR)\n", - can_stats.current_rx_match_ratio); + can_stats->current_rx_match_ratio); seq_printf(m, " %8ld frames/s current tx rate (CTXR)\n", - can_stats.current_tx_rate); + can_stats->current_tx_rate); seq_printf(m, " %8ld frames/s current rx rate (CRXR)\n", - can_stats.current_rx_rate); + can_stats->current_rx_rate); seq_putc(m, '\n'); seq_printf(m, " %8ld %% max match ratio (MRXMR)\n", - can_stats.max_rx_match_ratio); + can_stats->max_rx_match_ratio); seq_printf(m, " %8ld frames/s max tx rate (MTXR)\n", - can_stats.max_tx_rate); + can_stats->max_tx_rate); seq_printf(m, " %8ld frames/s max rx rate (MRXR)\n", - can_stats.max_rx_rate); + can_stats->max_rx_rate); seq_putc(m, '\n'); } seq_printf(m, " %8ld current receive list entries (CRCV)\n", - can_pstats.rcv_entries); + can_pstats->rcv_entries); seq_printf(m, " %8ld maximum 
receive list entries (MRCV)\n", - can_pstats.rcv_entries_max); + can_pstats->rcv_entries_max); - if (can_pstats.stats_reset) + if (can_pstats->stats_reset) seq_printf(m, "\n %8ld statistic resets (STR)\n", - can_pstats.stats_reset); + can_pstats->stats_reset); - if (can_pstats.user_reset) + if (can_pstats->user_reset) seq_printf(m, " %8ld user statistic resets (USTR)\n", - can_pstats.user_reset); + can_pstats->user_reset); seq_putc(m, '\n'); return 0; @@ -275,7 +272,7 @@ static int can_stats_proc_show(struct seq_file *m, void *v) static int can_stats_proc_open(struct inode *inode, struct file *file) { - return single_open(file, can_stats_proc_show, NULL); + return single_open_net(inode, file, can_stats_proc_show); } static const struct file_operations can_stats_proc_fops = { @@ -288,25 +285,28 @@ static const struct file_operations can_stats_proc_fops = { static int can_reset_stats_proc_show(struct seq_file *m, void *v) { + struct net *net = m->private; + struct s_pstats *can_pstats = net->can.can_pstats; + struct s_stats *can_stats = net->can.can_stats; + user_reset = 1; - if (can_stattimer.function == can_stat_update) { + if (net->can.can_stattimer.function == can_stat_update) { seq_printf(m, "Scheduled statistic reset #%ld.\n", - can_pstats.stats_reset + 1); - + can_pstats->stats_reset + 1); } else { - if (can_stats.jiffies_init != jiffies) - can_init_stats(); + if (can_stats->jiffies_init != jiffies) + can_init_stats(net); seq_printf(m, "Performed statistic reset #%ld.\n", - can_pstats.stats_reset); + can_pstats->stats_reset); } return 0; } static int can_reset_stats_proc_open(struct inode *inode, struct file *file) { - return single_open(file, can_reset_stats_proc_show, NULL); + return single_open_net(inode, file, can_reset_stats_proc_show); } static const struct file_operations can_reset_stats_proc_fops = { @@ -325,7 +325,7 @@ static int can_version_proc_show(struct seq_file *m, void *v) static int can_version_proc_open(struct inode *inode, struct file *file) { - return single_open(file, can_version_proc_show, NULL); + return single_open_net(inode, file, can_version_proc_show); } static const struct file_operations can_version_proc_fops = { @@ -351,20 +351,21 @@ static inline void can_rcvlist_proc_show_one(struct seq_file *m, int idx, static int can_rcvlist_proc_show(struct seq_file *m, void *v) { /* double cast to prevent GCC warning */ - int idx = (int)(long)m->private; + int idx = (int)(long)PDE_DATA(m->file->f_inode); struct net_device *dev; struct dev_rcv_lists *d; + struct net *net = m->private; seq_printf(m, "\nreceive list '%s':\n", rx_list_name[idx]); rcu_read_lock(); /* receive list for 'all' CAN devices (dev == NULL) */ - d = &can_rx_alldev_list; + d = net->can.can_rx_alldev_list; can_rcvlist_proc_show_one(m, idx, NULL, d); /* receive list for registered CAN devices */ - for_each_netdev_rcu(&init_net, dev) { + for_each_netdev_rcu(net, dev) { if (dev->type == ARPHRD_CAN && dev->ml_priv) can_rcvlist_proc_show_one(m, idx, dev, dev->ml_priv); } @@ -377,7 +378,7 @@ static int can_rcvlist_proc_show(struct seq_file *m, void *v) static int can_rcvlist_proc_open(struct inode *inode, struct file *file) { - return single_open(file, can_rcvlist_proc_show, PDE_DATA(inode)); + return single_open_net(inode, file, can_rcvlist_proc_show); } static const struct file_operations can_rcvlist_proc_fops = { @@ -417,6 +418,7 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) { struct net_device *dev; struct dev_rcv_lists *d; + struct net *net = m->private; /* RX_SFF */ 
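/* A minimal sketch (not part of this patch; foo_proc_show and the
 * "%8ld rx frames" format are hypothetical) of the conversion pattern
 * used for every show handler in this file: single_open_net() stores
 * the owning struct net in the seq_file's private pointer, so each
 * handler derives its per-namespace state from m->private instead of
 * the former file-scope globals:
 *
 *	static int foo_proc_show(struct seq_file *m, void *v)
 *	{
 *		struct net *net = m->private;
 *		struct s_stats *stats = net->can.can_stats;
 *
 *		seq_printf(m, "%8ld rx frames\n", stats->rx_frames);
 *		return 0;
 *	}
 */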
seq_puts(m, "\nreceive list 'rx_sff':\n"); @@ -424,11 +426,11 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) rcu_read_lock(); /* sff receive list for 'all' CAN devices (dev == NULL) */ - d = &can_rx_alldev_list; + d = net->can.can_rx_alldev_list; can_rcvlist_proc_show_array(m, NULL, d->rx_sff, ARRAY_SIZE(d->rx_sff)); /* sff receive list for registered CAN devices */ - for_each_netdev_rcu(&init_net, dev) { + for_each_netdev_rcu(net, dev) { if (dev->type == ARPHRD_CAN && dev->ml_priv) { d = dev->ml_priv; can_rcvlist_proc_show_array(m, dev, d->rx_sff, @@ -444,7 +446,7 @@ static int can_rcvlist_sff_proc_show(struct seq_file *m, void *v) static int can_rcvlist_sff_proc_open(struct inode *inode, struct file *file) { - return single_open(file, can_rcvlist_sff_proc_show, NULL); + return single_open_net(inode, file, can_rcvlist_sff_proc_show); } static const struct file_operations can_rcvlist_sff_proc_fops = { @@ -460,6 +462,7 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) { struct net_device *dev; struct dev_rcv_lists *d; + struct net *net = m->private; /* RX_EFF */ seq_puts(m, "\nreceive list 'rx_eff':\n"); @@ -467,11 +470,11 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) rcu_read_lock(); /* eff receive list for 'all' CAN devices (dev == NULL) */ - d = &can_rx_alldev_list; + d = net->can.can_rx_alldev_list; can_rcvlist_proc_show_array(m, NULL, d->rx_eff, ARRAY_SIZE(d->rx_eff)); /* eff receive list for registered CAN devices */ - for_each_netdev_rcu(&init_net, dev) { + for_each_netdev_rcu(net, dev) { if (dev->type == ARPHRD_CAN && dev->ml_priv) { d = dev->ml_priv; can_rcvlist_proc_show_array(m, dev, d->rx_eff, @@ -487,7 +490,7 @@ static int can_rcvlist_eff_proc_show(struct seq_file *m, void *v) static int can_rcvlist_eff_proc_open(struct inode *inode, struct file *file) { - return single_open(file, can_rcvlist_eff_proc_show, NULL); + return single_open_net(inode, file, can_rcvlist_eff_proc_show); } static const struct file_operations can_rcvlist_eff_proc_fops = { @@ -498,82 +501,86 @@ static const struct file_operations can_rcvlist_eff_proc_fops = { .release = single_release, }; -/* - * proc utility functions - */ - -static void can_remove_proc_readentry(const char *name) -{ - if (can_dir) - remove_proc_entry(name, can_dir); -} - /* * can_init_proc - create main CAN proc directory and procfs entries */ -void can_init_proc(void) +void can_init_proc(struct net *net) { /* create /proc/net/can directory */ - can_dir = proc_mkdir("can", init_net.proc_net); + net->can.proc_dir = proc_net_mkdir(net, "can", net->proc_net); - if (!can_dir) { - pr_info("can: failed to create /proc/net/can.\n"); + if (!net->can.proc_dir) { + printk(KERN_INFO "can: failed to create /proc/net/can . 
" + "CONFIG_PROC_FS missing?\n"); return; } /* own procfs entries from the AF_CAN core */ - pde_version = proc_create(CAN_PROC_VERSION, 0644, can_dir, - &can_version_proc_fops); - pde_stats = proc_create(CAN_PROC_STATS, 0644, can_dir, - &can_stats_proc_fops); - pde_reset_stats = proc_create(CAN_PROC_RESET_STATS, 0644, can_dir, - &can_reset_stats_proc_fops); - pde_rcvlist_err = proc_create_data(CAN_PROC_RCVLIST_ERR, 0644, can_dir, - &can_rcvlist_proc_fops, (void *)RX_ERR); - pde_rcvlist_all = proc_create_data(CAN_PROC_RCVLIST_ALL, 0644, can_dir, - &can_rcvlist_proc_fops, (void *)RX_ALL); - pde_rcvlist_fil = proc_create_data(CAN_PROC_RCVLIST_FIL, 0644, can_dir, - &can_rcvlist_proc_fops, (void *)RX_FIL); - pde_rcvlist_inv = proc_create_data(CAN_PROC_RCVLIST_INV, 0644, can_dir, - &can_rcvlist_proc_fops, (void *)RX_INV); - pde_rcvlist_eff = proc_create(CAN_PROC_RCVLIST_EFF, 0644, can_dir, - &can_rcvlist_eff_proc_fops); - pde_rcvlist_sff = proc_create(CAN_PROC_RCVLIST_SFF, 0644, can_dir, - &can_rcvlist_sff_proc_fops); + net->can.pde_version = proc_create(CAN_PROC_VERSION, 0644, + net->can.proc_dir, + &can_version_proc_fops); + net->can.pde_stats = proc_create(CAN_PROC_STATS, 0644, + net->can.proc_dir, + &can_stats_proc_fops); + net->can.pde_reset_stats = proc_create(CAN_PROC_RESET_STATS, 0644, + net->can.proc_dir, + &can_reset_stats_proc_fops); + net->can.pde_rcvlist_err = proc_create_data(CAN_PROC_RCVLIST_ERR, 0644, + net->can.proc_dir, + &can_rcvlist_proc_fops, + (void *)RX_ERR); + net->can.pde_rcvlist_all = proc_create_data(CAN_PROC_RCVLIST_ALL, 0644, + net->can.proc_dir, + &can_rcvlist_proc_fops, + (void *)RX_ALL); + net->can.pde_rcvlist_fil = proc_create_data(CAN_PROC_RCVLIST_FIL, 0644, + net->can.proc_dir, + &can_rcvlist_proc_fops, + (void *)RX_FIL); + net->can.pde_rcvlist_inv = proc_create_data(CAN_PROC_RCVLIST_INV, 0644, + net->can.proc_dir, + &can_rcvlist_proc_fops, + (void *)RX_INV); + net->can.pde_rcvlist_eff = proc_create(CAN_PROC_RCVLIST_EFF, 0644, + net->can.proc_dir, + &can_rcvlist_eff_proc_fops); + net->can.pde_rcvlist_sff = proc_create(CAN_PROC_RCVLIST_SFF, 0644, + net->can.proc_dir, + &can_rcvlist_sff_proc_fops); } /* * can_remove_proc - remove procfs entries and main CAN proc directory */ -void can_remove_proc(void) +void can_remove_proc(struct net *net) { - if (pde_version) - can_remove_proc_readentry(CAN_PROC_VERSION); + if (net->can.pde_version) + remove_proc_entry(CAN_PROC_VERSION, net->can.proc_dir); - if (pde_stats) - can_remove_proc_readentry(CAN_PROC_STATS); + if (net->can.pde_stats) + remove_proc_entry(CAN_PROC_STATS, net->can.proc_dir); - if (pde_reset_stats) - can_remove_proc_readentry(CAN_PROC_RESET_STATS); + if (net->can.pde_reset_stats) + remove_proc_entry(CAN_PROC_RESET_STATS, net->can.proc_dir); - if (pde_rcvlist_err) - can_remove_proc_readentry(CAN_PROC_RCVLIST_ERR); + if (net->can.pde_rcvlist_err) + remove_proc_entry(CAN_PROC_RCVLIST_ERR, net->can.proc_dir); - if (pde_rcvlist_all) - can_remove_proc_readentry(CAN_PROC_RCVLIST_ALL); + if (net->can.pde_rcvlist_all) + remove_proc_entry(CAN_PROC_RCVLIST_ALL, net->can.proc_dir); - if (pde_rcvlist_fil) - can_remove_proc_readentry(CAN_PROC_RCVLIST_FIL); + if (net->can.pde_rcvlist_fil) + remove_proc_entry(CAN_PROC_RCVLIST_FIL, net->can.proc_dir); - if (pde_rcvlist_inv) - can_remove_proc_readentry(CAN_PROC_RCVLIST_INV); + if (net->can.pde_rcvlist_inv) + remove_proc_entry(CAN_PROC_RCVLIST_INV, net->can.proc_dir); - if (pde_rcvlist_eff) - can_remove_proc_readentry(CAN_PROC_RCVLIST_EFF); + if (net->can.pde_rcvlist_eff) + 
remove_proc_entry(CAN_PROC_RCVLIST_EFF, net->can.proc_dir); - if (pde_rcvlist_sff) - can_remove_proc_readentry(CAN_PROC_RCVLIST_SFF); + if (net->can.pde_rcvlist_sff) + remove_proc_entry(CAN_PROC_RCVLIST_SFF, net->can.proc_dir); - if (can_dir) - remove_proc_entry("can", init_net.proc_net); + if (net->can.proc_dir) + remove_proc_entry("can", net->proc_net); } diff --git a/net/can/raw.c b/net/can/raw.c index 6dc546a06673ff41fc121c546ebd0567bb0da05f..864c80dbdb72d11cdd24183cec3769222b339ccc 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -181,20 +181,21 @@ static void raw_rcv(struct sk_buff *oskb, void *data) kfree_skb(skb); } -static int raw_enable_filters(struct net_device *dev, struct sock *sk, - struct can_filter *filter, int count) +static int raw_enable_filters(struct net *net, struct net_device *dev, + struct sock *sk, struct can_filter *filter, + int count) { int err = 0; int i; for (i = 0; i < count; i++) { - err = can_rx_register(dev, filter[i].can_id, + err = can_rx_register(net, dev, filter[i].can_id, filter[i].can_mask, raw_rcv, sk, "raw", sk); if (err) { /* clean up successfully registered filters */ while (--i >= 0) - can_rx_unregister(dev, filter[i].can_id, + can_rx_unregister(net, dev, filter[i].can_id, filter[i].can_mask, raw_rcv, sk); break; @@ -204,57 +205,62 @@ static int raw_enable_filters(struct net_device *dev, struct sock *sk, return err; } -static int raw_enable_errfilter(struct net_device *dev, struct sock *sk, - can_err_mask_t err_mask) +static int raw_enable_errfilter(struct net *net, struct net_device *dev, + struct sock *sk, can_err_mask_t err_mask) { int err = 0; if (err_mask) - err = can_rx_register(dev, 0, err_mask | CAN_ERR_FLAG, + err = can_rx_register(net, dev, 0, err_mask | CAN_ERR_FLAG, raw_rcv, sk, "raw", sk); return err; } -static void raw_disable_filters(struct net_device *dev, struct sock *sk, - struct can_filter *filter, int count) +static void raw_disable_filters(struct net *net, struct net_device *dev, + struct sock *sk, struct can_filter *filter, + int count) { int i; for (i = 0; i < count; i++) - can_rx_unregister(dev, filter[i].can_id, filter[i].can_mask, - raw_rcv, sk); + can_rx_unregister(net, dev, filter[i].can_id, + filter[i].can_mask, raw_rcv, sk); } -static inline void raw_disable_errfilter(struct net_device *dev, +static inline void raw_disable_errfilter(struct net *net, + struct net_device *dev, struct sock *sk, can_err_mask_t err_mask) { if (err_mask) - can_rx_unregister(dev, 0, err_mask | CAN_ERR_FLAG, + can_rx_unregister(net, dev, 0, err_mask | CAN_ERR_FLAG, raw_rcv, sk); } -static inline void raw_disable_allfilters(struct net_device *dev, +static inline void raw_disable_allfilters(struct net *net, + struct net_device *dev, struct sock *sk) { struct raw_sock *ro = raw_sk(sk); - raw_disable_filters(dev, sk, ro->filter, ro->count); - raw_disable_errfilter(dev, sk, ro->err_mask); + raw_disable_filters(net, dev, sk, ro->filter, ro->count); + raw_disable_errfilter(net, dev, sk, ro->err_mask); } -static int raw_enable_allfilters(struct net_device *dev, struct sock *sk) +static int raw_enable_allfilters(struct net *net, struct net_device *dev, + struct sock *sk) { struct raw_sock *ro = raw_sk(sk); int err; - err = raw_enable_filters(dev, sk, ro->filter, ro->count); + err = raw_enable_filters(net, dev, sk, ro->filter, ro->count); if (!err) { - err = raw_enable_errfilter(dev, sk, ro->err_mask); + err = raw_enable_errfilter(net, dev, sk, ro->err_mask); if (err) - raw_disable_filters(dev, sk, ro->filter, ro->count); + raw_disable_filters(net, 
dev, sk, ro->filter, + ro->count); } return err; @@ -267,7 +273,7 @@ static int raw_notifier(struct notifier_block *nb, struct raw_sock *ro = container_of(nb, struct raw_sock, notifier); struct sock *sk = &ro->sk; - if (!net_eq(dev_net(dev), &init_net)) + if (!net_eq(dev_net(dev), sock_net(sk))) return NOTIFY_DONE; if (dev->type != ARPHRD_CAN) @@ -282,7 +288,7 @@ static int raw_notifier(struct notifier_block *nb, lock_sock(sk); /* remove current filters & unregister */ if (ro->bound) - raw_disable_allfilters(dev, sk); + raw_disable_allfilters(dev_net(dev), dev, sk); if (ro->count > 1) kfree(ro->filter); @@ -358,13 +364,13 @@ static int raw_release(struct socket *sock) if (ro->ifindex) { struct net_device *dev; - dev = dev_get_by_index(&init_net, ro->ifindex); + dev = dev_get_by_index(sock_net(sk), ro->ifindex); if (dev) { - raw_disable_allfilters(dev, sk); + raw_disable_allfilters(dev_net(dev), dev, sk); dev_put(dev); } } else - raw_disable_allfilters(NULL, sk); + raw_disable_allfilters(sock_net(sk), NULL, sk); } if (ro->count > 1) @@ -404,7 +410,7 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) if (addr->can_ifindex) { struct net_device *dev; - dev = dev_get_by_index(&init_net, addr->can_ifindex); + dev = dev_get_by_index(sock_net(sk), addr->can_ifindex); if (!dev) { err = -ENODEV; goto out; @@ -420,13 +426,13 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) ifindex = dev->ifindex; /* filters set by default/setsockopt */ - err = raw_enable_allfilters(dev, sk); + err = raw_enable_allfilters(sock_net(sk), dev, sk); dev_put(dev); } else { ifindex = 0; /* filters set by default/setsockopt */ - err = raw_enable_allfilters(NULL, sk); + err = raw_enable_allfilters(sock_net(sk), NULL, sk); } if (!err) { @@ -435,13 +441,15 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) if (ro->ifindex) { struct net_device *dev; - dev = dev_get_by_index(&init_net, ro->ifindex); + dev = dev_get_by_index(sock_net(sk), + ro->ifindex); if (dev) { - raw_disable_allfilters(dev, sk); + raw_disable_allfilters(dev_net(dev), + dev, sk); dev_put(dev); } } else - raw_disable_allfilters(NULL, sk); + raw_disable_allfilters(sock_net(sk), NULL, sk); } ro->ifindex = ifindex; ro->bound = 1; @@ -517,15 +525,16 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, lock_sock(sk); if (ro->bound && ro->ifindex) - dev = dev_get_by_index(&init_net, ro->ifindex); + dev = dev_get_by_index(sock_net(sk), ro->ifindex); if (ro->bound) { /* (try to) register the new filters */ if (count == 1) - err = raw_enable_filters(dev, sk, &sfilter, 1); + err = raw_enable_filters(sock_net(sk), dev, sk, + &sfilter, 1); else - err = raw_enable_filters(dev, sk, filter, - count); + err = raw_enable_filters(sock_net(sk), dev, sk, + filter, count); if (err) { if (count > 1) kfree(filter); @@ -533,7 +542,8 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, } /* remove old filter registrations */ - raw_disable_filters(dev, sk, ro->filter, ro->count); + raw_disable_filters(sock_net(sk), dev, sk, ro->filter, + ro->count); } /* remove old filter space */ @@ -569,18 +579,20 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, lock_sock(sk); if (ro->bound && ro->ifindex) - dev = dev_get_by_index(&init_net, ro->ifindex); + dev = dev_get_by_index(sock_net(sk), ro->ifindex); /* remove current error mask */ if (ro->bound) { /* (try to) register the new err_mask */ - err = raw_enable_errfilter(dev, sk, err_mask); + err = 
raw_enable_errfilter(sock_net(sk), dev, sk, + err_mask); if (err) goto out_err; /* remove old err_mask registration */ - raw_disable_errfilter(dev, sk, ro->err_mask); + raw_disable_errfilter(sock_net(sk), dev, sk, + ro->err_mask); } /* link new err_mask to the socket */ @@ -741,7 +753,7 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) return -EINVAL; } - dev = dev_get_by_index(&init_net, ifindex); + dev = dev_get_by_index(sock_net(sk), ifindex); if (!dev) return -ENXIO; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 108533859a53292cde61a3cedd052a2579684e87..4fd02831beed20fb3d98475c7da2a0b4f2818513 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -45,6 +45,17 @@ bool libceph_compatible(void *data) } EXPORT_SYMBOL(libceph_compatible); +static int param_get_supported_features(char *buffer, + const struct kernel_param *kp) +{ + return sprintf(buffer, "0x%llx", CEPH_FEATURES_SUPPORTED_DEFAULT); +} +static const struct kernel_param_ops param_ops_supported_features = { + .get = param_get_supported_features, +}; +module_param_cb(supported_features, &param_ops_supported_features, NULL, + S_IRUGO); + /* * find filename portion of a path (/foo/bar/baz -> baz) */ @@ -187,7 +198,7 @@ void *ceph_kvmalloc(size_t size, gfp_t flags) return ptr; } - return __vmalloc(size, flags | __GFP_HIGHMEM, PAGE_KERNEL); + return __vmalloc(size, flags, PAGE_KERNEL); } @@ -596,9 +607,7 @@ EXPORT_SYMBOL(ceph_client_gid); /* * create a fresh client instance */ -struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, - u64 supported_features, - u64 required_features) +struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private) { struct ceph_client *client; struct ceph_entity_addr *myaddr = NULL; @@ -615,14 +624,12 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private, init_waitqueue_head(&client->auth_wq); client->auth_err = 0; - if (!ceph_test_opt(client, NOMSGAUTH)) - required_features |= CEPH_FEATURE_MSG_AUTH; - client->extra_mon_dispatch = NULL; - client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT | - supported_features; - client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT | - required_features; + client->supported_features = CEPH_FEATURES_SUPPORTED_DEFAULT; + client->required_features = CEPH_FEATURES_REQUIRED_DEFAULT; + + if (!ceph_test_opt(client, NOMSGAUTH)) + client->required_features |= CEPH_FEATURE_MSG_AUTH; /* msgr */ if (ceph_test_opt(client, MYIP)) diff --git a/net/ceph/cls_lock_client.c b/net/ceph/cls_lock_client.c index b9233b9903990bd38721213ce287a8045debd8c7..08ada893f01e6acb61b5f91425539a33d7b2c0d4 100644 --- a/net/ceph/cls_lock_client.c +++ b/net/ceph/cls_lock_client.c @@ -179,6 +179,57 @@ int ceph_cls_break_lock(struct ceph_osd_client *osdc, } EXPORT_SYMBOL(ceph_cls_break_lock); +int ceph_cls_set_cookie(struct ceph_osd_client *osdc, + struct ceph_object_id *oid, + struct ceph_object_locator *oloc, + char *lock_name, u8 type, char *old_cookie, + char *tag, char *new_cookie) +{ + int cookie_op_buf_size; + int name_len = strlen(lock_name); + int old_cookie_len = strlen(old_cookie); + int tag_len = strlen(tag); + int new_cookie_len = strlen(new_cookie); + void *p, *end; + struct page *cookie_op_page; + int ret; + + cookie_op_buf_size = name_len + sizeof(__le32) + + old_cookie_len + sizeof(__le32) + + tag_len + sizeof(__le32) + + new_cookie_len + sizeof(__le32) + + sizeof(u8) + CEPH_ENCODING_START_BLK_LEN; + if (cookie_op_buf_size > PAGE_SIZE) + return 
-E2BIG; + + cookie_op_page = alloc_page(GFP_NOIO); + if (!cookie_op_page) + return -ENOMEM; + + p = page_address(cookie_op_page); + end = p + cookie_op_buf_size; + + /* encode cls_lock_set_cookie_op struct */ + ceph_start_encoding(&p, 1, 1, + cookie_op_buf_size - CEPH_ENCODING_START_BLK_LEN); + ceph_encode_string(&p, end, lock_name, name_len); + ceph_encode_8(&p, type); + ceph_encode_string(&p, end, old_cookie, old_cookie_len); + ceph_encode_string(&p, end, tag, tag_len); + ceph_encode_string(&p, end, new_cookie, new_cookie_len); + + dout("%s lock_name %s type %d old_cookie %s tag %s new_cookie %s\n", + __func__, lock_name, type, old_cookie, tag, new_cookie); + ret = ceph_osdc_call(osdc, oid, oloc, "lock", "set_cookie", + CEPH_OSD_FLAG_WRITE, cookie_op_page, + cookie_op_buf_size, NULL, NULL); + + dout("%s: status %d\n", __func__, ret); + __free_page(cookie_op_page); + return ret; +} +EXPORT_SYMBOL(ceph_cls_set_cookie); + void ceph_free_lockers(struct ceph_locker *lockers, u32 num_lockers) { int i; diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c index c62b2b029a6e6fe66fa3440551d0602fc8c4123e..71ba13927b3d1726bd488403de8274478a956091 100644 --- a/net/ceph/debugfs.c +++ b/net/ceph/debugfs.c @@ -62,7 +62,8 @@ static int osdmap_show(struct seq_file *s, void *p) return 0; down_read(&osdc->lock); - seq_printf(s, "epoch %d flags 0x%x\n", map->epoch, map->flags); + seq_printf(s, "epoch %u barrier %u flags 0x%x\n", map->epoch, + osdc->epoch_barrier, map->flags); for (n = rb_first(&map->pg_pools); n; n = rb_next(n)) { struct ceph_pg_pool_info *pi = @@ -177,9 +178,7 @@ static void dump_request(struct seq_file *s, struct ceph_osd_request *req) seq_printf(s, "%llu\t", req->r_tid); dump_target(s, &req->r_t); - seq_printf(s, "\t%d\t%u'%llu", req->r_attempts, - le32_to_cpu(req->r_replay_version.epoch), - le64_to_cpu(req->r_replay_version.version)); + seq_printf(s, "\t%d", req->r_attempts); for (i = 0; i < req->r_num_ops; i++) { struct ceph_osd_req_op *op = &req->r_ops[i]; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index f76bb333261384257490b0f5125207028e8352aa..5766a6c896c4fa7290fffdbee239701ca4e2eeef 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1386,8 +1386,9 @@ static void prepare_write_keepalive(struct ceph_connection *con) dout("prepare_write_keepalive %p\n", con); con_out_kvec_reset(con); if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { - struct timespec now = CURRENT_TIME; + struct timespec now; + ktime_get_real_ts(&now); con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); ceph_encode_timespec(&con->out_temp_keepalive2, &now); con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), @@ -3176,8 +3177,9 @@ bool ceph_con_keepalive_expired(struct ceph_connection *con, { if (interval > 0 && (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2)) { - struct timespec now = CURRENT_TIME; + struct timespec now; struct timespec ts; + ktime_get_real_ts(&now); jiffies_to_timespec(interval, &ts); ts = timespec_add(con->last_keepalive_ack, ts); return timespec_compare(&now, &ts) >= 0; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index e15ea9e4c4955fbd697e545cedfdb7f7925c347e..924f07c36ddbc2864c7428723766d0a64729c7c4 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -961,6 +961,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc, truncate_size, truncate_seq); } + req->r_abort_on_full = true; req->r_flags = flags; req->r_base_oloc.pool = layout->pool_id; req->r_base_oloc.pool_ns = 
ceph_try_get_string(layout->pool_ns); @@ -1005,7 +1006,7 @@ static bool osd_registered(struct ceph_osd *osd) */ static void osd_init(struct ceph_osd *osd) { - atomic_set(&osd->o_ref, 1); + refcount_set(&osd->o_ref, 1); RB_CLEAR_NODE(&osd->o_node); osd->o_requests = RB_ROOT; osd->o_linger_requests = RB_ROOT; @@ -1050,9 +1051,9 @@ static struct ceph_osd *create_osd(struct ceph_osd_client *osdc, int onum) static struct ceph_osd *get_osd(struct ceph_osd *osd) { - if (atomic_inc_not_zero(&osd->o_ref)) { - dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1, - atomic_read(&osd->o_ref)); + if (refcount_inc_not_zero(&osd->o_ref)) { + dout("get_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref)-1, + refcount_read(&osd->o_ref)); return osd; } else { dout("get_osd %p FAIL\n", osd); @@ -1062,9 +1063,9 @@ static struct ceph_osd *get_osd(struct ceph_osd *osd) static void put_osd(struct ceph_osd *osd) { - dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), - atomic_read(&osd->o_ref) - 1); - if (atomic_dec_and_test(&osd->o_ref)) { + dout("put_osd %p %d -> %d\n", osd, refcount_read(&osd->o_ref), + refcount_read(&osd->o_ref) - 1); + if (refcount_dec_and_test(&osd->o_ref)) { osd_cleanup(osd); kfree(osd); } @@ -1297,8 +1298,9 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, __pool_full(pi); WARN_ON(pi->id != t->base_oloc.pool); - return (t->flags & CEPH_OSD_FLAG_READ && pauserd) || - (t->flags & CEPH_OSD_FLAG_WRITE && pausewr); + return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || + ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || + (osdc->osdmap->epoch < osdc->epoch_barrier); } enum calc_target_result { @@ -1503,9 +1505,10 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) ceph_encode_32(&p, req->r_flags); ceph_encode_timespec(p, &req->r_mtime); p += sizeof(struct ceph_timespec); - /* aka reassert_version */ - memcpy(p, &req->r_replay_version, sizeof(req->r_replay_version)); - p += sizeof(req->r_replay_version); + + /* reassert_version */ + memset(p, 0, sizeof(struct ceph_eversion)); + p += sizeof(struct ceph_eversion); /* oloc */ ceph_start_encoding(&p, 5, 4, @@ -1626,6 +1629,7 @@ static void maybe_request_map(struct ceph_osd_client *osdc) ceph_monc_renew_subs(&osdc->client->monc); } +static void complete_request(struct ceph_osd_request *req, int err); static void send_map_check(struct ceph_osd_request *req); static void __submit_request(struct ceph_osd_request *req, bool wrlocked) @@ -1635,6 +1639,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) enum calc_target_result ct_res; bool need_send = false; bool promoted = false; + bool need_abort = false; WARN_ON(req->r_tid); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); @@ -1650,8 +1655,13 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) goto promote; } - if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && - ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { + if (osdc->osdmap->epoch < osdc->epoch_barrier) { + dout("req %p epoch %u barrier %u\n", req, osdc->osdmap->epoch, + osdc->epoch_barrier); + req->r_t.paused = true; + maybe_request_map(osdc); + } else if ((req->r_flags & CEPH_OSD_FLAG_WRITE) && + ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR)) { dout("req %p pausewr\n", req); req->r_t.paused = true; maybe_request_map(osdc); @@ -1669,6 +1679,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) pr_warn_ratelimited("FULL or reached pool quota\n"); req->r_t.paused = true; maybe_request_map(osdc); + if 
(req->r_abort_on_full) + need_abort = true; } else if (!osd_homeless(osd)) { need_send = true; } else { @@ -1685,6 +1697,8 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) link_request(osd, req); if (need_send) send_request(req); + else if (need_abort) + complete_request(req, -ENOSPC); mutex_unlock(&osd->lock); if (ct_res == CALC_TARGET_POOL_DNE) @@ -1799,6 +1813,97 @@ static void abort_request(struct ceph_osd_request *req, int err) complete_request(req, err); } +static void update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) +{ + if (likely(eb > osdc->epoch_barrier)) { + dout("updating epoch_barrier from %u to %u\n", + osdc->epoch_barrier, eb); + osdc->epoch_barrier = eb; + /* Request map if we're not to the barrier yet */ + if (eb > osdc->osdmap->epoch) + maybe_request_map(osdc); + } +} + +void ceph_osdc_update_epoch_barrier(struct ceph_osd_client *osdc, u32 eb) +{ + down_read(&osdc->lock); + if (unlikely(eb > osdc->epoch_barrier)) { + up_read(&osdc->lock); + down_write(&osdc->lock); + update_epoch_barrier(osdc, eb); + up_write(&osdc->lock); + } else { + up_read(&osdc->lock); + } +} +EXPORT_SYMBOL(ceph_osdc_update_epoch_barrier); + +/* + * Drop all pending requests that are stalled waiting on a full condition to + * clear, and complete them with ENOSPC as the return code. Set the + * osdc->epoch_barrier to the latest map epoch that we've seen if any were + * cancelled. + */ +static void ceph_osdc_abort_on_full(struct ceph_osd_client *osdc) +{ + struct rb_node *n; + bool victims = false; + + dout("enter abort_on_full\n"); + + if (!ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) && !have_pool_full(osdc)) + goto out; + + /* Scan list and see if there is anything to abort */ + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + struct rb_node *m; + + m = rb_first(&osd->o_requests); + while (m) { + struct ceph_osd_request *req = rb_entry(m, + struct ceph_osd_request, r_node); + m = rb_next(m); + + if (req->r_abort_on_full) { + victims = true; + break; + } + } + if (victims) + break; + } + + if (!victims) + goto out; + + /* + * Update the barrier to current epoch if it's behind that point, + * since we know we have some calls to be aborted in the tree. 
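+ *
+ * Out-of-file users move the barrier through the exported helper; a
+ * hypothetical caller (illustration only, the reply->epoch field is
+ * assumed) would look like:
+ *
+ *	ceph_osdc_update_epoch_barrier(osdc, le32_to_cpu(reply->epoch));
+ *
+ * The helper takes osdc->lock read-side first and upgrades to the
+ * write side only when the barrier actually has to advance.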
+ */ + update_epoch_barrier(osdc, osdc->osdmap->epoch); + + for (n = rb_first(&osdc->osds); n; n = rb_next(n)) { + struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node); + struct rb_node *m; + + m = rb_first(&osd->o_requests); + while (m) { + struct ceph_osd_request *req = rb_entry(m, + struct ceph_osd_request, r_node); + m = rb_next(m); + + if (req->r_abort_on_full && + (ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || + pool_full(osdc, req->r_t.target_oloc.pool))) + abort_request(req, -ENOSPC); + } + } +out: + dout("return abort_on_full barrier=%u\n", osdc->epoch_barrier); +} + static void check_pool_dne(struct ceph_osd_request *req) { struct ceph_osd_client *osdc = req->r_osdc; @@ -3252,11 +3357,13 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) pausewr = ceph_osdmap_flag(osdc, CEPH_OSDMAP_PAUSEWR) || ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || have_pool_full(osdc); - if (was_pauserd || was_pausewr || pauserd || pausewr) + if (was_pauserd || was_pausewr || pauserd || pausewr || + osdc->osdmap->epoch < osdc->epoch_barrier) maybe_request_map(osdc); kick_requests(osdc, &need_resend, &need_resend_linger); + ceph_osdc_abort_on_full(osdc); ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP, osdc->osdmap->epoch); up_write(&osdc->lock); @@ -3574,7 +3681,7 @@ ceph_osdc_watch(struct ceph_osd_client *osdc, ceph_oid_copy(&lreq->t.base_oid, oid); ceph_oloc_copy(&lreq->t.base_oloc, oloc); lreq->t.flags = CEPH_OSD_FLAG_WRITE; - lreq->mtime = CURRENT_TIME; + ktime_get_real_ts(&lreq->mtime); lreq->reg_req = alloc_linger_request(lreq); if (!lreq->reg_req) { @@ -3632,7 +3739,7 @@ int ceph_osdc_unwatch(struct ceph_osd_client *osdc, ceph_oid_copy(&req->r_base_oid, &lreq->t.base_oid); ceph_oloc_copy(&req->r_base_oloc, &lreq->t.base_oloc); req->r_flags = CEPH_OSD_FLAG_WRITE; - req->r_mtime = CURRENT_TIME; + ktime_get_real_ts(&req->r_mtime); osd_req_op_watch_init(req, 0, lreq->linger_id, CEPH_OSD_WATCH_OP_UNWATCH); @@ -4126,7 +4233,7 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) close_osd(osd); } up_write(&osdc->lock); - WARN_ON(atomic_read(&osdc->homeless_osd.o_ref) != 1); + WARN_ON(refcount_read(&osdc->homeless_osd.o_ref) != 1); osd_cleanup(&osdc->homeless_osd); WARN_ON(!list_empty(&osdc->osd_lru)); diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c index 6864007e64fc3236f6d118f8a4a1869255bbba92..ce09f73be759c562cb9ffe093eb1c44350c8adde 100644 --- a/net/ceph/pagelist.c +++ b/net/ceph/pagelist.c @@ -16,7 +16,7 @@ static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl) void ceph_pagelist_release(struct ceph_pagelist *pl) { - if (!atomic_dec_and_test(&pl->refcnt)) + if (!refcount_dec_and_test(&pl->refcnt)) return; ceph_pagelist_unmap_tail(pl); while (!list_empty(&pl->head)) { diff --git a/net/ceph/snapshot.c b/net/ceph/snapshot.c index 705414e78ae0b05d2d1b8d5d8f8e8fbb6007bfb4..e14a5d038656f004da16aa044fdecca9d094a71b 100644 --- a/net/ceph/snapshot.c +++ b/net/ceph/snapshot.c @@ -49,7 +49,7 @@ struct ceph_snap_context *ceph_create_snap_context(u32 snap_count, if (!snapc) return NULL; - atomic_set(&snapc->nref, 1); + refcount_set(&snapc->nref, 1); snapc->num_snaps = snap_count; return snapc; @@ -59,7 +59,7 @@ EXPORT_SYMBOL(ceph_create_snap_context); struct ceph_snap_context *ceph_get_snap_context(struct ceph_snap_context *sc) { if (sc) - atomic_inc(&sc->nref); + refcount_inc(&sc->nref); return sc; } EXPORT_SYMBOL(ceph_get_snap_context); @@ -68,7 +68,7 @@ void ceph_put_snap_context(struct ceph_snap_context *sc) { if (!sc) return; - if 
(atomic_dec_and_test(&sc->nref)) { + if (refcount_dec_and_test(&sc->nref)) { /*printk(" deleting snap_context %p\n", sc);*/ kfree(sc); } diff --git a/net/core/datagram.c b/net/core/datagram.c index f4947e737f34a0d5dafb2e8232260f46a43fbc27..db1866f2ffcf6e3ed505ce5c1c5a4f11e6a908bd 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -256,8 +256,12 @@ struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags, } spin_unlock_irqrestore(&queue->lock, cpu_flags); - } while (sk_can_busy_loop(sk) && - sk_busy_loop(sk, flags & MSG_DONTWAIT)); + + if (!sk_can_busy_loop(sk)) + break; + + sk_busy_loop(sk, flags & MSG_DONTWAIT); + } while (!skb_queue_empty(&sk->sk_receive_queue)); error = -EAGAIN; @@ -760,7 +764,7 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (msg_data_left(msg) < chunk) { if (__skb_checksum_complete(skb)) - goto csum_error; + return -EINVAL; if (skb_copy_datagram_msg(skb, hlen, msg, chunk)) goto fault; } else { @@ -768,15 +772,16 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, if (skb_copy_and_csum_datagram(skb, hlen, &msg->msg_iter, chunk, &csum)) goto fault; - if (csum_fold(csum)) - goto csum_error; + + if (csum_fold(csum)) { + iov_iter_revert(&msg->msg_iter, chunk); + return -EINVAL; + } + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) netdev_rx_csum_fault(skb->dev); } return 0; -csum_error: - iov_iter_revert(&msg->msg_iter, chunk); - return -EINVAL; fault: return -EFAULT; } diff --git a/net/core/dev.c b/net/core/dev.c index 533a6d6f60920a01e8a9cef8fdb408950b914b05..96cf83da0d66b2db07d5504c4c3aec7da735ea46 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include #include @@ -95,6 +96,7 @@ #include #include #include +#include #include #include #include @@ -2450,6 +2452,9 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) { unsigned long flags; + if (unlikely(!skb)) + return; + if (likely(atomic_read(&skb->users) == 1)) { smp_rmb(); atomic_set(&skb->users, 0); @@ -2972,6 +2977,9 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device __skb_linearize(skb)) goto out_kfree_skb; + if (validate_xmit_xfrm(skb, features)) + goto out_kfree_skb; + /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. @@ -3441,6 +3449,7 @@ EXPORT_SYMBOL(netdev_max_backlog); int netdev_tstamp_prequeue __read_mostly = 1; int netdev_budget __read_mostly = 300; +unsigned int __read_mostly netdev_budget_usecs = 2000; int weight_p __read_mostly = 64; /* old backlog weight */ int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */ int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */ @@ -4227,7 +4236,7 @@ static int __netif_receive_skb(struct sk_buff *skb) int ret; if (sk_memalloc_socks() && skb_pfmemalloc(skb)) { - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; /* * PFMEMALLOC skbs are special, they should @@ -4238,15 +4247,134 @@ static int __netif_receive_skb(struct sk_buff *skb) * Use PF_MEMALLOC as this saves us from propagating the allocation * context down to all allocation sites. 
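 *
 * In sketch form, the scoped no-reclaim pattern that replaces the
 * open-coded flag twiddling below is:
 *
 *	unsigned int noreclaim_flag = memalloc_noreclaim_save();
 *	...allocations here run with PF_MEMALLOC, dipping into reserves
 *	   without recursing into reclaim...
 *	memalloc_noreclaim_restore(noreclaim_flag);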
*/ - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); ret = __netif_receive_skb_core(skb, true); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + memalloc_noreclaim_restore(noreclaim_flag); } else ret = __netif_receive_skb_core(skb, false); return ret; } +static struct static_key generic_xdp_needed __read_mostly; + +static int generic_xdp_install(struct net_device *dev, struct netdev_xdp *xdp) +{ + struct bpf_prog *new = xdp->prog; + int ret = 0; + + switch (xdp->command) { + case XDP_SETUP_PROG: { + struct bpf_prog *old = rtnl_dereference(dev->xdp_prog); + + rcu_assign_pointer(dev->xdp_prog, new); + if (old) + bpf_prog_put(old); + + if (old && !new) { + static_key_slow_dec(&generic_xdp_needed); + } else if (new && !old) { + static_key_slow_inc(&generic_xdp_needed); + dev_disable_lro(dev); + } + break; + } + + case XDP_QUERY_PROG: + xdp->prog_attached = !!rcu_access_pointer(dev->xdp_prog); + break; + + default: + ret = -EINVAL; + break; + } + + return ret; +} + +static u32 netif_receive_generic_xdp(struct sk_buff *skb, + struct bpf_prog *xdp_prog) +{ + struct xdp_buff xdp; + u32 act = XDP_DROP; + void *orig_data; + int hlen, off; + u32 mac_len; + + /* Reinjected packets coming from act_mirred or similar should + * not get XDP generic processing. + */ + if (skb_cloned(skb)) + return XDP_PASS; + + if (skb_linearize(skb)) + goto do_drop; + + /* The XDP program wants to see the packet starting at the MAC + * header. + */ + mac_len = skb->data - skb_mac_header(skb); + hlen = skb_headlen(skb) + mac_len; + xdp.data = skb->data - mac_len; + xdp.data_end = xdp.data + hlen; + xdp.data_hard_start = skb->data - skb_headroom(skb); + orig_data = xdp.data; + + act = bpf_prog_run_xdp(xdp_prog, &xdp); + + off = xdp.data - orig_data; + if (off > 0) + __skb_pull(skb, off); + else if (off < 0) + __skb_push(skb, -off); + + switch (act) { + case XDP_TX: + __skb_push(skb, mac_len); + /* fall through */ + case XDP_PASS: + break; + + default: + bpf_warn_invalid_xdp_action(act); + /* fall through */ + case XDP_ABORTED: + trace_xdp_exception(skb->dev, xdp_prog, act); + /* fall through */ + case XDP_DROP: + do_drop: + kfree_skb(skb); + break; + } + + return act; +} + +/* When doing generic XDP we have to bypass the qdisc layer and the + * network taps in order to match in-driver-XDP behavior. 
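+ *
+ * Concretely, an XDP_TX verdict from the generic hook picks a queue
+ * via netdev_pick_tx(), transmits under HARD_TX_LOCK(), and frees the
+ * skb (tracing an XDP_TX exception) when the queue is stopped or the
+ * xmit does not complete, roughly what a native XDP driver does in
+ * its own TX path.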
+ */ +static void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog) +{ + struct net_device *dev = skb->dev; + struct netdev_queue *txq; + bool free_skb = true; + int cpu, rc; + + txq = netdev_pick_tx(dev, skb, NULL); + cpu = smp_processor_id(); + HARD_TX_LOCK(dev, txq, cpu); + if (!netif_xmit_stopped(txq)) { + rc = netdev_start_xmit(skb, dev, txq, 0); + if (dev_xmit_complete(rc)) + free_skb = false; + } + HARD_TX_UNLOCK(dev, txq); + if (free_skb) { + trace_xdp_exception(dev, xdp_prog, XDP_TX); + kfree_skb(skb); + } +} + static int netif_receive_skb_internal(struct sk_buff *skb) { int ret; @@ -4258,6 +4386,21 @@ static int netif_receive_skb_internal(struct sk_buff *skb) rcu_read_lock(); + if (static_key_false(&generic_xdp_needed)) { + struct bpf_prog *xdp_prog = rcu_dereference(skb->dev->xdp_prog); + + if (xdp_prog) { + u32 act = netif_receive_generic_xdp(skb, xdp_prog); + + if (act != XDP_PASS) { + rcu_read_unlock(); + if (act == XDP_TX) + generic_xdp_tx(skb, xdp_prog); + return NET_RX_DROP; + } + } + } + #ifdef CONFIG_RPS if (static_key_false(&rps_needed)) { struct rps_dev_flow voidflow, *rflow = &voidflow; @@ -4490,7 +4633,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff enum gro_result ret; int grow; - if (!(skb->dev->features & NETIF_F_GRO)) + if (netif_elide_gro(skb->dev)) goto normal; if (skb->csum_bad) @@ -5060,27 +5203,28 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock) do_softirq(); } -bool sk_busy_loop(struct sock *sk, int nonblock) +void napi_busy_loop(unsigned int napi_id, + bool (*loop_end)(void *, unsigned long), + void *loop_end_arg) { - unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0; + unsigned long start_time = loop_end ? busy_loop_current_time() : 0; int (*napi_poll)(struct napi_struct *napi, int budget); void *have_poll_lock = NULL; struct napi_struct *napi; - int rc; restart: - rc = false; napi_poll = NULL; rcu_read_lock(); - napi = napi_by_id(sk->sk_napi_id); + napi = napi_by_id(napi_id); if (!napi) goto out; preempt_disable(); for (;;) { - rc = 0; + int work = 0; + local_bh_disable(); if (!napi_poll) { unsigned long val = READ_ONCE(napi->state); @@ -5098,16 +5242,15 @@ bool sk_busy_loop(struct sock *sk, int nonblock) have_poll_lock = netpoll_poll_lock(napi); napi_poll = napi->poll; } - rc = napi_poll(napi, BUSY_POLL_BUDGET); - trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); + work = napi_poll(napi, BUSY_POLL_BUDGET); + trace_napi_poll(napi, work, BUSY_POLL_BUDGET); count: - if (rc > 0) - __NET_ADD_STATS(sock_net(sk), - LINUX_MIB_BUSYPOLLRXPACKETS, rc); + if (work > 0) + __NET_ADD_STATS(dev_net(napi->dev), + LINUX_MIB_BUSYPOLLRXPACKETS, work); local_bh_enable(); - if (nonblock || !skb_queue_empty(&sk->sk_receive_queue) || - busy_loop_timeout(end_time)) + if (!loop_end || loop_end(loop_end_arg, start_time)) break; if (unlikely(need_resched())) { @@ -5116,9 +5259,8 @@ bool sk_busy_loop(struct sock *sk, int nonblock) preempt_enable(); rcu_read_unlock(); cond_resched(); - rc = !skb_queue_empty(&sk->sk_receive_queue); - if (rc || busy_loop_timeout(end_time)) - return rc; + if (loop_end(loop_end_arg, start_time)) + return; goto restart; } cpu_relax(); @@ -5126,12 +5268,10 @@ bool sk_busy_loop(struct sock *sk, int nonblock) if (napi_poll) busy_poll_stop(napi, have_poll_lock); preempt_enable(); - rc = !skb_queue_empty(&sk->sk_receive_queue); out: rcu_read_unlock(); - return rc; } -EXPORT_SYMBOL(sk_busy_loop); +EXPORT_SYMBOL(napi_busy_loop); #endif /* CONFIG_NET_RX_BUSY_POLL */ @@ 
-5143,10 +5283,10 @@ static void napi_hash_add(struct napi_struct *napi) spin_lock(&napi_hash_lock); - /* 0..NR_CPUS+1 range is reserved for sender_cpu use */ + /* 0..NR_CPUS range is reserved for sender_cpu use */ do { - if (unlikely(++napi_gen_id < NR_CPUS + 1)) - napi_gen_id = NR_CPUS + 1; + if (unlikely(++napi_gen_id < MIN_NAPI_ID)) + napi_gen_id = MIN_NAPI_ID; } while (napi_by_id(napi_gen_id)); napi->napi_id = napi_gen_id; @@ -5310,7 +5450,8 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) static __latent_entropy void net_rx_action(struct softirq_action *h) { struct softnet_data *sd = this_cpu_ptr(&softnet_data); - unsigned long time_limit = jiffies + 2; + unsigned long time_limit = jiffies + + usecs_to_jiffies(netdev_budget_usecs); int budget = netdev_budget; LIST_HEAD(list); LIST_HEAD(repoll); @@ -6714,13 +6855,16 @@ EXPORT_SYMBOL(dev_change_proto_down); /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device + * @extack: netlink extended ack * @fd: new program fd or negative value to clear * @flags: xdp-related flags * * Set or clear a bpf program for a device */ -int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) +int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, + int fd, u32 flags) { + int (*xdp_op)(struct net_device *dev, struct netdev_xdp *xdp); const struct net_device_ops *ops = dev->netdev_ops; struct bpf_prog *prog = NULL; struct netdev_xdp xdp; @@ -6728,14 +6872,16 @@ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) ASSERT_RTNL(); - if (!ops->ndo_xdp) - return -EOPNOTSUPP; + xdp_op = ops->ndo_xdp; + if (!xdp_op || (flags & XDP_FLAGS_SKB_MODE)) + xdp_op = generic_xdp_install; + if (fd >= 0) { if (flags & XDP_FLAGS_UPDATE_IF_NOEXIST) { memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_QUERY_PROG; - err = ops->ndo_xdp(dev, &xdp); + err = xdp_op(dev, &xdp); if (err < 0) return err; if (xdp.prog_attached) @@ -6749,9 +6895,10 @@ int dev_change_xdp_fd(struct net_device *dev, int fd, u32 flags) memset(&xdp, 0, sizeof(xdp)); xdp.command = XDP_SETUP_PROG; + xdp.extack = extack; xdp.prog = prog; - err = ops->ndo_xdp(dev, &xdp); + err = xdp_op(dev, &xdp); if (err < 0 && prog) bpf_prog_put(prog); @@ -7102,13 +7249,10 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, else netif_dormant_off(dev); - if (netif_carrier_ok(rootdev)) { - if (!netif_carrier_ok(dev)) - netif_carrier_on(dev); - } else { - if (netif_carrier_ok(dev)) - netif_carrier_off(dev); - } + if (netif_carrier_ok(rootdev)) + netif_carrier_on(dev); + else + netif_carrier_off(dev); } EXPORT_SYMBOL(netif_stacked_transfer_operstate); @@ -7121,12 +7265,10 @@ static int netif_alloc_rx_queues(struct net_device *dev) BUG_ON(count < 1); - rx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!rx) { - rx = vzalloc(sz); - if (!rx) - return -ENOMEM; - } + rx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); + if (!rx) + return -ENOMEM; + dev->_rx = rx; for (i = 0; i < count; i++) @@ -7163,12 +7305,10 @@ static int netif_alloc_netdev_queues(struct net_device *dev) if (count < 1 || count > 0xffff) return -EINVAL; - tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!tx) { - tx = vzalloc(sz); - if (!tx) - return -ENOMEM; - } + tx = kvzalloc(sz, GFP_KERNEL | __GFP_REPEAT); + if (!tx) + return -ENOMEM; + dev->_tx = tx; netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); @@ -7702,9 +7842,7 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, /* ensure 32-byte 
alignment of whole construct */ alloc_size += NETDEV_ALIGN - 1; - p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); - if (!p) - p = vzalloc(alloc_size); + p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_REPEAT); if (!p) return NULL; @@ -7791,6 +7929,7 @@ EXPORT_SYMBOL(alloc_netdev_mqs); void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; + struct bpf_prog *prog; might_sleep(); netif_free_tx_queues(dev); @@ -7809,6 +7948,12 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; + prog = rcu_dereference_protected(dev->xdp_prog, 1); + if (prog) { + bpf_prog_put(prog); + static_key_slow_dec(&generic_xdp_needed); + } + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); diff --git a/net/core/devlink.c b/net/core/devlink.c index e9c1e6acfb6d196d4373dcc36bcf76577952dd32..b0b87a292e7ccac2221a2425510b3c28e1df97f7 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -1397,10 +1397,10 @@ static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, u32 seq, int flags) { const struct devlink_ops *ops = devlink->ops; + u8 inline_mode, encap_mode; void *hdr; int err = 0; u16 mode; - u8 inline_mode; hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd); if (!hdr) @@ -1429,6 +1429,15 @@ static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink, goto nla_put_failure; } + if (ops->eswitch_encap_mode_get) { + err = ops->eswitch_encap_mode_get(devlink, &encap_mode); + if (err) + goto nla_put_failure; + err = nla_put_u8(msg, DEVLINK_ATTR_ESWITCH_ENCAP_MODE, encap_mode); + if (err) + goto nla_put_failure; + } + genlmsg_end(msg, hdr); return 0; @@ -1468,9 +1477,9 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, { struct devlink *devlink = info->user_ptr[0]; const struct devlink_ops *ops = devlink->ops; - u16 mode; - u8 inline_mode; + u8 inline_mode, encap_mode; int err = 0; + u16 mode; if (!ops) return -EOPNOTSUPP; @@ -1494,7 +1503,695 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb, return err; } + if (info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]) { + if (!ops->eswitch_encap_mode_set) + return -EOPNOTSUPP; + encap_mode = nla_get_u8(info->attrs[DEVLINK_ATTR_ESWITCH_ENCAP_MODE]); + err = ops->eswitch_encap_mode_set(devlink, encap_mode); + if (err) + return err; + } + + return 0; +} + +int devlink_dpipe_match_put(struct sk_buff *skb, + struct devlink_dpipe_match *match) +{ + struct devlink_dpipe_header *header = match->header; + struct devlink_dpipe_field *field = &header->fields[match->field_id]; + struct nlattr *match_attr; + + match_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_MATCH); + if (!match_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_MATCH_TYPE, match->type) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, match->header_index) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || + nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) + goto nla_put_failure; + + nla_nest_end(skb, match_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, match_attr); + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_match_put); + +static int devlink_dpipe_matches_put(struct devlink_dpipe_table *table, + struct sk_buff *skb) +{ + struct nlattr *matches_attr; + + matches_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE_MATCHES); + if (!matches_attr) + 
return -EMSGSIZE; + + if (table->table_ops->matches_dump(table->priv, skb)) + goto nla_put_failure; + + nla_nest_end(skb, matches_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, matches_attr); + return -EMSGSIZE; +} + +int devlink_dpipe_action_put(struct sk_buff *skb, + struct devlink_dpipe_action *action) +{ + struct devlink_dpipe_header *header = action->header; + struct devlink_dpipe_field *field = &header->fields[action->field_id]; + struct nlattr *action_attr; + + action_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_ACTION); + if (!action_attr) + return -EMSGSIZE; + + if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_ACTION_TYPE, action->type) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_INDEX, action->header_index) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || + nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) + goto nla_put_failure; + + nla_nest_end(skb, action_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, action_attr); + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_action_put); + +static int devlink_dpipe_actions_put(struct devlink_dpipe_table *table, + struct sk_buff *skb) +{ + struct nlattr *actions_attr; + + actions_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE_ACTIONS); + if (!actions_attr) + return -EMSGSIZE; + + if (table->table_ops->actions_dump(table->priv, skb)) + goto nla_put_failure; + + nla_nest_end(skb, actions_attr); + return 0; + +nla_put_failure: + nla_nest_cancel(skb, actions_attr); + return -EMSGSIZE; +} + +static int devlink_dpipe_table_put(struct sk_buff *skb, + struct devlink_dpipe_table *table) +{ + struct nlattr *table_attr; + + table_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLE); + if (!table_attr) + return -EMSGSIZE; + + if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_TABLE_NAME, table->name) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_SIZE, table->size, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (nla_put_u8(skb, DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED, + table->counters_enabled)) + goto nla_put_failure; + + if (devlink_dpipe_matches_put(table, skb)) + goto nla_put_failure; + + if (devlink_dpipe_actions_put(table, skb)) + goto nla_put_failure; + + nla_nest_end(skb, table_attr); return 0; + +nla_put_failure: + nla_nest_cancel(skb, table_attr); + return -EMSGSIZE; +} + +static int devlink_dpipe_send_and_alloc_skb(struct sk_buff **pskb, + struct genl_info *info) +{ + int err; + + if (*pskb) { + err = genlmsg_reply(*pskb, info); + if (err) + return err; + } + *pskb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!*pskb) + return -ENOMEM; + return 0; +} + +static int devlink_dpipe_tables_fill(struct genl_info *info, + enum devlink_command cmd, int flags, + struct list_head *dpipe_tables, + const char *table_name) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_dpipe_table *table; + struct nlattr *tables_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + bool incomplete; + void *hdr; + int i; + int err; + + table = list_first_entry(dpipe_tables, + struct devlink_dpipe_table, list); +start_again: + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + tables_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_TABLES); + if (!tables_attr) + goto 
nla_put_failure; + + i = 0; + incomplete = false; + list_for_each_entry_from(table, dpipe_tables, list) { + if (!table_name) { + err = devlink_dpipe_table_put(skb, table); + if (err) { + if (!i) + goto err_table_put; + incomplete = true; + break; + } + } else { + if (!strcmp(table->name, table_name)) { + err = devlink_dpipe_table_put(skb, table); + if (err) + break; + } + } + i++; + } + + nla_nest_end(skb, tables_attr); + genlmsg_end(skb, hdr); + if (incomplete) + goto start_again; + +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_table_put: +err_skb_send_alloc: + genlmsg_cancel(skb, hdr); + nlmsg_free(skb); + return err; +} + +static int devlink_nl_cmd_dpipe_table_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const char *table_name = NULL; + + if (info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]) + table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); + + return devlink_dpipe_tables_fill(info, DEVLINK_CMD_DPIPE_TABLE_GET, 0, + &devlink->dpipe_table_list, + table_name); +} + +static int devlink_dpipe_value_put(struct sk_buff *skb, + struct devlink_dpipe_value *value) +{ + if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE, + value->value_size, value->value)) + return -EMSGSIZE; + if (value->mask) + if (nla_put(skb, DEVLINK_ATTR_DPIPE_VALUE_MASK, + value->value_size, value->mask)) + return -EMSGSIZE; + if (value->mapping_valid) + if (nla_put_u32(skb, DEVLINK_ATTR_DPIPE_VALUE_MAPPING, + value->mapping_value)) + return -EMSGSIZE; + return 0; +} + +static int devlink_dpipe_action_value_put(struct sk_buff *skb, + struct devlink_dpipe_value *value) +{ + if (!value->action) + return -EINVAL; + if (devlink_dpipe_action_put(skb, value->action)) + return -EMSGSIZE; + if (devlink_dpipe_value_put(skb, value)) + return -EMSGSIZE; + return 0; +} + +static int devlink_dpipe_action_values_put(struct sk_buff *skb, + struct devlink_dpipe_value *values, + unsigned int values_count) +{ + struct nlattr *action_attr; + int i; + int err; + + for (i = 0; i < values_count; i++) { + action_attr = nla_nest_start(skb, + DEVLINK_ATTR_DPIPE_ACTION_VALUE); + if (!action_attr) + return -EMSGSIZE; + err = devlink_dpipe_action_value_put(skb, &values[i]); + if (err) + goto err_action_value_put; + nla_nest_end(skb, action_attr); + } + return 0; + +err_action_value_put: + nla_nest_cancel(skb, action_attr); + return err; +} + +static int devlink_dpipe_match_value_put(struct sk_buff *skb, + struct devlink_dpipe_value *value) +{ + if (!value->match) + return -EINVAL; + if (devlink_dpipe_match_put(skb, value->match)) + return -EMSGSIZE; + if (devlink_dpipe_value_put(skb, value)) + return -EMSGSIZE; + return 0; +} + +static int devlink_dpipe_match_values_put(struct sk_buff *skb, + struct devlink_dpipe_value *values, + unsigned int values_count) +{ + struct nlattr *match_attr; + int i; + int err; + + for (i = 0; i < values_count; i++) { + match_attr = nla_nest_start(skb, + DEVLINK_ATTR_DPIPE_MATCH_VALUE); + if (!match_attr) + return -EMSGSIZE; + err = devlink_dpipe_match_value_put(skb, &values[i]); + if (err) + goto err_match_value_put; + nla_nest_end(skb, match_attr); + } + return 0; + +err_match_value_put: + nla_nest_cancel(skb, match_attr); + return err; +} + +static int devlink_dpipe_entry_put(struct sk_buff 
*skb, + struct devlink_dpipe_entry *entry) +{ + struct nlattr *entry_attr, *matches_attr, *actions_attr; + int err; + + entry_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_ENTRY); + if (!entry_attr) + return -EMSGSIZE; + + if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_INDEX, entry->index, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (entry->counter_valid) + if (nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_ENTRY_COUNTER, + entry->counter, DEVLINK_ATTR_PAD)) + goto nla_put_failure; + + matches_attr = nla_nest_start(skb, + DEVLINK_ATTR_DPIPE_ENTRY_MATCH_VALUES); + if (!matches_attr) + goto nla_put_failure; + + err = devlink_dpipe_match_values_put(skb, entry->match_values, + entry->match_values_count); + if (err) { + nla_nest_cancel(skb, matches_attr); + goto err_match_values_put; + } + nla_nest_end(skb, matches_attr); + + actions_attr = nla_nest_start(skb, + DEVLINK_ATTR_DPIPE_ENTRY_ACTION_VALUES); + if (!actions_attr) + goto nla_put_failure; + + err = devlink_dpipe_action_values_put(skb, entry->action_values, + entry->action_values_count); + if (err) { + nla_nest_cancel(skb, actions_attr); + goto err_action_values_put; + } + nla_nest_end(skb, actions_attr); + + nla_nest_end(skb, entry_attr); + return 0; + +nla_put_failure: + err = -EMSGSIZE; +err_match_values_put: +err_action_values_put: + nla_nest_cancel(skb, entry_attr); + return err; +} + +static struct devlink_dpipe_table * +devlink_dpipe_table_find(struct list_head *dpipe_tables, + const char *table_name) +{ + struct devlink_dpipe_table *table; + + list_for_each_entry_rcu(table, dpipe_tables, list) { + if (!strcmp(table->name, table_name)) + return table; + } + return NULL; +} + +int devlink_dpipe_entry_ctx_prepare(struct devlink_dpipe_dump_ctx *dump_ctx) +{ + struct devlink *devlink; + int err; + + err = devlink_dpipe_send_and_alloc_skb(&dump_ctx->skb, + dump_ctx->info); + if (err) + return err; + + dump_ctx->hdr = genlmsg_put(dump_ctx->skb, + dump_ctx->info->snd_portid, + dump_ctx->info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, + dump_ctx->cmd); + if (!dump_ctx->hdr) + goto nla_put_failure; + + devlink = dump_ctx->info->user_ptr[0]; + if (devlink_nl_put_handle(dump_ctx->skb, devlink)) + goto nla_put_failure; + dump_ctx->nest = nla_nest_start(dump_ctx->skb, + DEVLINK_ATTR_DPIPE_ENTRIES); + if (!dump_ctx->nest) + goto nla_put_failure; + return 0; + +nla_put_failure: + genlmsg_cancel(dump_ctx->skb, dump_ctx->hdr); + nlmsg_free(dump_ctx->skb); + return -EMSGSIZE; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_prepare); + +int devlink_dpipe_entry_ctx_append(struct devlink_dpipe_dump_ctx *dump_ctx, + struct devlink_dpipe_entry *entry) +{ + return devlink_dpipe_entry_put(dump_ctx->skb, entry); +} +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_append); + +int devlink_dpipe_entry_ctx_close(struct devlink_dpipe_dump_ctx *dump_ctx) +{ + nla_nest_end(dump_ctx->skb, dump_ctx->nest); + genlmsg_end(dump_ctx->skb, dump_ctx->hdr); + return 0; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_entry_ctx_close); + +static int devlink_dpipe_entries_fill(struct genl_info *info, + enum devlink_command cmd, int flags, + struct devlink_dpipe_table *table) +{ + struct devlink_dpipe_dump_ctx dump_ctx; + struct nlmsghdr *nlh; + int err; + + dump_ctx.skb = NULL; + dump_ctx.cmd = cmd; + dump_ctx.info = info; + + err = table->table_ops->entries_dump(table->priv, + table->counters_enabled, + &dump_ctx); + if (err) + goto err_entries_dump; + +send_done: + nlh = nlmsg_put(dump_ctx.skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err 
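+	/* A minimal sketch of how a driver's entries_dump callback is meant
+	 * to drive the dump context helpers exported above; the my_drv_*
+	 * names and the single-entry fill are hypothetical:
+	 *
+	 *	static int my_drv_entries_dump(void *priv, bool counters_enabled,
+	 *				       struct devlink_dpipe_dump_ctx *dump_ctx)
+	 *	{
+	 *		struct devlink_dpipe_entry entry = {};
+	 *		int err;
+	 *
+	 *		err = devlink_dpipe_entry_ctx_prepare(dump_ctx);
+	 *		if (err)
+	 *			return err;
+	 *		my_drv_fill_entry(priv, &entry, counters_enabled);
+	 *		err = devlink_dpipe_entry_ctx_append(dump_ctx, &entry);
+	 *		if (err)
+	 *			return err;
+	 *		return devlink_dpipe_entry_ctx_close(dump_ctx);
+	 *	}
+	 */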
= devlink_dpipe_send_and_alloc_skb(&dump_ctx.skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + return genlmsg_reply(dump_ctx.skb, info); + +err_entries_dump: +err_skb_send_alloc: + genlmsg_cancel(dump_ctx.skb, dump_ctx.hdr); + nlmsg_free(dump_ctx.skb); + return err; +} + +static int devlink_nl_cmd_dpipe_entries_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_dpipe_table *table; + const char *table_name; + + if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]) + return -EINVAL; + + table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + if (!table) + return -EINVAL; + + if (!table->table_ops->entries_dump) + return -EINVAL; + + return devlink_dpipe_entries_fill(info, DEVLINK_CMD_DPIPE_ENTRIES_GET, + 0, table); +} + +static int devlink_dpipe_fields_put(struct sk_buff *skb, + const struct devlink_dpipe_header *header) +{ + struct devlink_dpipe_field *field; + struct nlattr *field_attr; + int i; + + for (i = 0; i < header->fields_count; i++) { + field = &header->fields[i]; + field_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_FIELD); + if (!field_attr) + return -EMSGSIZE; + if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_FIELD_NAME, field->name) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_ID, field->id) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_BITWIDTH, field->bitwidth) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_FIELD_MAPPING_TYPE, field->mapping_type)) + goto nla_put_failure; + nla_nest_end(skb, field_attr); + } + return 0; + +nla_put_failure: + nla_nest_cancel(skb, field_attr); + return -EMSGSIZE; +} + +static int devlink_dpipe_header_put(struct sk_buff *skb, + struct devlink_dpipe_header *header) +{ + struct nlattr *fields_attr, *header_attr; + int err; + + header_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADER); + if (!header_attr) + return -EMSGSIZE; + + if (nla_put_string(skb, DEVLINK_ATTR_DPIPE_HEADER_NAME, header->name) || + nla_put_u32(skb, DEVLINK_ATTR_DPIPE_HEADER_ID, header->id) || + nla_put_u8(skb, DEVLINK_ATTR_DPIPE_HEADER_GLOBAL, header->global)) + goto nla_put_failure; + + fields_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADER_FIELDS); + if (!fields_attr) + goto nla_put_failure; + + err = devlink_dpipe_fields_put(skb, header); + if (err) { + nla_nest_cancel(skb, fields_attr); + goto nla_put_failure; + } + nla_nest_end(skb, fields_attr); + nla_nest_end(skb, header_attr); + return 0; + +nla_put_failure: + err = -EMSGSIZE; + nla_nest_cancel(skb, header_attr); + return err; +} + +static int devlink_dpipe_headers_fill(struct genl_info *info, + enum devlink_command cmd, int flags, + struct devlink_dpipe_headers * + dpipe_headers) +{ + struct devlink *devlink = info->user_ptr[0]; + struct nlattr *headers_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + void *hdr; + int i, j; + int err; + + i = 0; +start_again: + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) + return -EMSGSIZE; + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + headers_attr = nla_nest_start(skb, DEVLINK_ATTR_DPIPE_HEADERS); + if (!headers_attr) + goto nla_put_failure; + + j = 0; + for (; i < dpipe_headers->headers_count; i++) { + err = devlink_dpipe_header_put(skb, dpipe_headers->headers[i]); + if (err) { + if (!j) + goto err_table_put; + break; + } + j++; 
+ } + nla_nest_end(skb, headers_attr); + genlmsg_end(skb, hdr); + if (i != dpipe_headers->headers_count) + goto start_again; + +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_table_put: +err_skb_send_alloc: + genlmsg_cancel(skb, hdr); + nlmsg_free(skb); + return err; +} + +static int devlink_nl_cmd_dpipe_headers_get(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + + if (!devlink->dpipe_headers) + return -EOPNOTSUPP; + return devlink_dpipe_headers_fill(info, DEVLINK_CMD_DPIPE_HEADERS_GET, + 0, devlink->dpipe_headers); +} + +static int devlink_dpipe_table_counters_set(struct devlink *devlink, + const char *table_name, + bool enable) +{ + struct devlink_dpipe_table *table; + + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + if (!table) + return -EINVAL; + + if (table->counter_control_extern) + return -EOPNOTSUPP; + + if (!(table->counters_enabled ^ enable)) + return 0; + + table->counters_enabled = enable; + if (table->table_ops->counters_set_update) + table->table_ops->counters_set_update(table->priv, enable); + return 0; +} + +static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + const char *table_name; + bool counters_enable; + + if (!info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME] || + !info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]) + return -EINVAL; + + table_name = nla_data(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_NAME]); + counters_enable = !!nla_get_u8(info->attrs[DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED]); + + return devlink_dpipe_table_counters_set(devlink, table_name, + counters_enable); } static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { @@ -1512,6 +2209,9 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_SB_TC_INDEX] = { .type = NLA_U16 }, [DEVLINK_ATTR_ESWITCH_MODE] = { .type = NLA_U16 }, [DEVLINK_ATTR_ESWITCH_INLINE_MODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, + [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, + [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, }; static const struct genl_ops devlink_nl_ops[] = { @@ -1644,6 +2344,34 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_DPIPE_TABLE_GET, + .doit = devlink_nl_cmd_dpipe_table_get, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_DPIPE_ENTRIES_GET, + .doit = devlink_nl_cmd_dpipe_entries_get, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_DPIPE_HEADERS_GET, + .doit = devlink_nl_cmd_dpipe_headers_get, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_DPIPE_TABLE_COUNTERS_SET, + .doit = devlink_nl_cmd_dpipe_table_counters_set, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, }; static struct genl_family devlink_nl_family 
__ro_after_init = {
@@ -1680,6 +2408,7 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
 	devlink_net_set(devlink, &init_net);
 	INIT_LIST_HEAD(&devlink->port_list);
 	INIT_LIST_HEAD(&devlink->sb_list);
+	INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list);
 	return devlink;
 }
 EXPORT_SYMBOL_GPL(devlink_alloc);
@@ -1880,6 +2609,133 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index)
 }
 EXPORT_SYMBOL_GPL(devlink_sb_unregister);
 
+/**
+ * devlink_dpipe_headers_register - register dpipe headers
+ *
+ * @devlink: devlink
+ * @dpipe_headers: dpipe header array
+ *
+ * Register the headers supported by hardware.
+ */
+int devlink_dpipe_headers_register(struct devlink *devlink,
+				   struct devlink_dpipe_headers *dpipe_headers)
+{
+	mutex_lock(&devlink_mutex);
+	devlink->dpipe_headers = dpipe_headers;
+	mutex_unlock(&devlink_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register);
+
+/**
+ * devlink_dpipe_headers_unregister - unregister dpipe headers
+ *
+ * @devlink: devlink
+ *
+ * Unregister the headers supported by hardware.
+ */
+void devlink_dpipe_headers_unregister(struct devlink *devlink)
+{
+	mutex_lock(&devlink_mutex);
+	devlink->dpipe_headers = NULL;
+	mutex_unlock(&devlink_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister);
+
+/**
+ * devlink_dpipe_table_counter_enabled - check if counter allocation is
+ *					 required
+ * @devlink: devlink
+ * @table_name: table name
+ *
+ * Used by the driver to check if counter allocation is required.
+ * After counter allocation is turned on the table entries
+ * are updated to include counter statistics.
+ *
+ * From that point on, the driver must respect the counter
+ * state so that each entry added to the table is added
+ * with a counter.
+ */
+bool devlink_dpipe_table_counter_enabled(struct devlink *devlink,
+					 const char *table_name)
+{
+	struct devlink_dpipe_table *table;
+	bool enabled;
+
+	rcu_read_lock();
+	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+					 table_name);
+	enabled = false;
+	if (table)
+		enabled = table->counters_enabled;
+	rcu_read_unlock();
+	return enabled;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_table_counter_enabled);
+
+/**
+ * devlink_dpipe_table_register - register dpipe table
+ *
+ * @devlink: devlink
+ * @table_name: table name
+ * @table_ops: table ops
+ * @priv: driver private data, passed back to the table ops
+ * @size: table size, in entries
+ * @counter_control_extern: external control for counters
+ */
+int devlink_dpipe_table_register(struct devlink *devlink,
+				 const char *table_name,
+				 struct devlink_dpipe_table_ops *table_ops,
+				 void *priv, u64 size,
+				 bool counter_control_extern)
+{
+	struct devlink_dpipe_table *table;
+
+	if (devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name))
+		return -EEXIST;
+
+	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return -ENOMEM;
+
+	table->name = table_name;
+	table->table_ops = table_ops;
+	table->priv = priv;
+	table->size = size;
+	table->counter_control_extern = counter_control_extern;
+
+	mutex_lock(&devlink_mutex);
+	list_add_tail_rcu(&table->list, &devlink->dpipe_table_list);
+	mutex_unlock(&devlink_mutex);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_dpipe_table_register);
+
+/**
+ * devlink_dpipe_table_unregister - unregister dpipe table
+ *
+ * @devlink: devlink
+ * @table_name: table name
+ */
+void devlink_dpipe_table_unregister(struct devlink *devlink,
+				    const char *table_name)
+{
+	struct devlink_dpipe_table *table;
+
+	mutex_lock(&devlink_mutex);
+	table = devlink_dpipe_table_find(&devlink->dpipe_table_list,
+					 table_name);
+
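+	/* A minimal registration sketch; the my_* definitions below are
+	 * hypothetical and only illustrate the API added above:
+	 *
+	 *	static struct devlink_dpipe_field my_fields[] = {
+	 *		{ .name = "dst_ip", .id = 0, .bitwidth = 32,
+	 *		  .mapping_type = DEVLINK_DPIPE_FIELD_MAPPING_TYPE_NONE, },
+	 *	};
+	 *
+	 *	static struct devlink_dpipe_header my_header = {
+	 *		.name = "my_ipv4", .id = 0, .global = false,
+	 *		.fields = my_fields,
+	 *		.fields_count = ARRAY_SIZE(my_fields),
+	 *	};
+	 *
+	 *	static struct devlink_dpipe_header *my_header_ptrs[] = {
+	 *		&my_header,
+	 *	};
+	 *
+	 *	static struct devlink_dpipe_headers my_headers = {
+	 *		.headers = my_header_ptrs,
+	 *		.headers_count = ARRAY_SIZE(my_header_ptrs),
+	 *	};
+	 *
+	 *	devlink_dpipe_headers_register(devlink, &my_headers);
+	 *	devlink_dpipe_table_register(devlink, "my_route_table",
+	 *				     &my_table_ops, priv, MY_TABLE_SIZE,
+	 *				     false);
+	 *
+	 * and, once counters have been switched on, every new entry is
+	 * expected to carry one:
+	 *
+	 *	if (devlink_dpipe_table_counter_enabled(devlink, "my_route_table"))
+	 *		my_drv_entry_counter_alloc(...);
+	 */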
if (!table) + goto unlock; + list_del_rcu(&table->list); + mutex_unlock(&devlink_mutex); + kfree_rcu(table, rcu); + return; +unlock: + mutex_unlock(&devlink_mutex); +} +EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index fb55327dcfeabdaf3eeecc3a8d176ae215612649..70ccda233bd1f1aab18535e6d9d0419bb9a1a23b 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -412,9 +412,8 @@ static int __init init_net_drop_monitor(void) for_each_possible_cpu(cpu) { data = &per_cpu(dm_cpu_data, cpu); INIT_WORK(&data->dm_alert_work, send_dm_alert); - init_timer(&data->send_timer); - data->send_timer.data = (unsigned long)data; - data->send_timer.function = sched_send_work; + setup_timer(&data->send_timer, sched_send_work, + (unsigned long)data); spin_lock_init(&data->lock); reset_per_cpu_data(data); } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index aecb2c7241b697e79628fdb79467f5087b2bbf9f..03111a2d6653b8f06427540eb5864b255af4ead3 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -90,6 +90,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_GSO_UDP_TUNNEL_CSUM_BIT] = "tx-udp_tnl-csum-segmentation", [NETIF_F_GSO_PARTIAL_BIT] = "tx-gso-partial", [NETIF_F_GSO_SCTP_BIT] = "tx-sctp-segmentation", + [NETIF_F_GSO_ESP_BIT] = "tx-esp-segmentation", [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc", [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp", @@ -103,12 +104,15 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", [NETIF_F_HW_TC_BIT] = "hw-tc-offload", + [NETIF_F_HW_ESP_BIT] = "esp-hw-offload", + [NETIF_F_HW_ESP_TX_CSUM_BIT] = "esp-tx-csum-hw-offload", }; static const char rss_hash_func_strings[ETH_RSS_HASH_FUNCS_COUNT][ETH_GSTRING_LEN] = { [ETH_RSS_HASH_TOP_BIT] = "toeplitz", [ETH_RSS_HASH_XOR_BIT] = "xor", + [ETH_RSS_HASH_CRC32_BIT] = "crc32", }; static const char diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index b6791d94841d56cf8b1027d3ba2d71dd21302caf..f21c4d3aeae0cf59a8d9239cf1e581e6d136c740 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -23,6 +23,20 @@ static const struct fib_kuid_range fib_kuid_range_unset = { KUIDT_INIT(~0), }; +bool fib_rule_matchall(const struct fib_rule *rule) +{ + if (rule->iifindex || rule->oifindex || rule->mark || rule->tun_id || + rule->flags) + return false; + if (rule->suppress_ifgroup != -1 || rule->suppress_prefixlen != -1) + return false; + if (!uid_eq(rule->uid_range.start, fib_kuid_range_unset.start) || + !uid_eq(rule->uid_range.end, fib_kuid_range_unset.end)) + return false; + return true; +} +EXPORT_SYMBOL_GPL(fib_rule_matchall); + int fib_default_rule_add(struct fib_rules_ops *ops, u32 pref, u32 table, u32 flags) { @@ -354,7 +368,8 @@ static int rule_exists(struct fib_rules_ops *ops, struct fib_rule_hdr *frh, return 0; } -int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) +int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -372,7 +387,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) goto errout; } - err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy); + err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); if (err 
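+	/* nlmsg_parse() now takes the handler's netlink_ext_ack, so malformed
+	 * attributes can be reported back to the sender with a message and
+	 * the offending attribute's offset rather than a bare errno.
+	 */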
< 0) goto errout; @@ -425,6 +440,7 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) if (tb[FRA_TUN_ID]) rule->tun_id = nla_get_be64(tb[FRA_TUN_ID]); + err = -EINVAL; if (tb[FRA_L3MDEV]) { #ifdef CONFIG_NET_L3_MASTER_DEV rule->l3mdev = nla_get_u8(tb[FRA_L3MDEV]); @@ -446,7 +462,6 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) else rule->suppress_ifgroup = -1; - err = -EINVAL; if (tb[FRA_GOTO]) { if (rule->action != FR_ACT_GOTO) goto errout_free; @@ -547,7 +562,8 @@ int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh) } EXPORT_SYMBOL_GPL(fib_nl_newrule); -int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) +int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_rule_hdr *frh = nlmsg_data(nlh); @@ -566,7 +582,7 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) goto errout; } - err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy); + err = nlmsg_parse(nlh, sizeof(*frh), tb, FRA_MAX, ops->policy, extack); if (err < 0) goto errout; @@ -576,8 +592,10 @@ int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh) if (tb[FRA_UID_RANGE]) { range = nla_get_kuid_range(tb); - if (!uid_range_set(&range)) + if (!uid_range_set(&range)) { + err = -EINVAL; goto errout; + } } else { range = fib_kuid_range_unset; } diff --git a/net/core/filter.c b/net/core/filter.c index ebaeaf2e46e8bd0171379604930d232f205afd07..a253a6197e6b37a7ae2fe451c646b01c861a3e22 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -52,6 +53,7 @@ #include #include #include +#include /** * sk_filter_trim_cap - run a packet through a socket filter @@ -91,7 +93,12 @@ int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap) rcu_read_lock(); filter = rcu_dereference(sk->sk_filter); if (filter) { - unsigned int pkt_len = bpf_prog_run_save_cb(filter->prog, skb); + struct sock *save_sk = skb->sk; + unsigned int pkt_len; + + skb->sk = sk; + pkt_len = bpf_prog_run_save_cb(filter->prog, skb); + skb->sk = save_sk; err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM; } rcu_read_unlock(); @@ -348,7 +355,8 @@ static bool convert_bpf_extensions(struct sock_filter *fp, * @new_prog: buffer where converted program will be stored * @new_len: pointer to store length of converted program * - * Remap 'sock_filter' style BPF instruction set to 'sock_filter_ext' style. + * Remap 'sock_filter' style classic BPF (cBPF) instruction set to 'bpf_insn' + * style extended BPF (eBPF). 
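+ * As an illustration (a sketch, not code from this file), a classic
+ * filter that accepts IPv4 frames and drops everything else, built with
+ * BPF_STMT()/BPF_JUMP() and ETH_P_IP from the uapi headers, could be:
+ *
+ *	struct sock_filter prog[] = {
+ *		BPF_STMT(BPF_LD  | BPF_H | BPF_ABS, 12),
+ *		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, ETH_P_IP, 0, 1),
+ *		BPF_STMT(BPF_RET | BPF_K, 0xffff),
+ *		BPF_STMT(BPF_RET | BPF_K, 0),
+ *	};
+ *
+ * bpf_convert_filter() rewrites each such sock_filter into one or more
+ * equivalent eBPF instructions.
+ *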
* Conversion workflow: * * 1) First pass for calculating the new program length: @@ -928,7 +936,7 @@ static void sk_filter_release_rcu(struct rcu_head *rcu) */ static void sk_filter_release(struct sk_filter *fp) { - if (atomic_dec_and_test(&fp->refcnt)) + if (refcount_dec_and_test(&fp->refcnt)) call_rcu(&fp->rcu, sk_filter_release_rcu); } @@ -943,20 +951,27 @@ void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) /* try to charge the socket memory if there is space available * return true on success */ -bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) +static bool __sk_filter_charge(struct sock *sk, struct sk_filter *fp) { u32 filter_size = bpf_prog_size(fp->prog->len); /* same check as in sock_kmalloc() */ if (filter_size <= sysctl_optmem_max && atomic_read(&sk->sk_omem_alloc) + filter_size < sysctl_optmem_max) { - atomic_inc(&fp->refcnt); atomic_add(filter_size, &sk->sk_omem_alloc); return true; } return false; } +bool sk_filter_charge(struct sock *sk, struct sk_filter *fp) +{ + bool ret = __sk_filter_charge(sk, fp); + if (ret) + refcount_inc(&fp->refcnt); + return ret; +} + static struct bpf_prog *bpf_migrate_filter(struct bpf_prog *fp) { struct sock_filter *old_prog; @@ -1179,12 +1194,12 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk) return -ENOMEM; fp->prog = prog; - atomic_set(&fp->refcnt, 0); - if (!sk_filter_charge(sk, fp)) { + if (!__sk_filter_charge(sk, fp)) { kfree(fp); return -ENOMEM; } + refcount_set(&fp->refcnt, 1); old_fp = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); @@ -2599,6 +2614,36 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) +{ + return skb->sk ? sock_gen_cookie(skb->sk) : 0; +} + +static const struct bpf_func_proto bpf_get_socket_cookie_proto = { + .func = bpf_get_socket_cookie, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + +BPF_CALL_1(bpf_get_socket_uid, struct sk_buff *, skb) +{ + struct sock *sk = sk_to_full_sk(skb->sk); + kuid_t kuid; + + if (!sk || !sk_fullsock(sk)) + return overflowuid; + kuid = sock_net_uid(sock_net(sk), sk); + return from_kuid_munged(sock_net(sk)->user_ns, kuid); +} + +static const struct bpf_func_proto bpf_get_socket_uid_proto = { + .func = bpf_get_socket_uid, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -2633,6 +2678,10 @@ sk_filter_func_proto(enum bpf_func_id func_id) switch (func_id) { case BPF_FUNC_skb_load_bytes: return &bpf_skb_load_bytes_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; default: return bpf_base_func_proto(func_id); } @@ -2692,6 +2741,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id) return &bpf_get_smp_processor_id_proto; case BPF_FUNC_skb_under_cgroup: return &bpf_skb_under_cgroup_proto; + case BPF_FUNC_get_socket_cookie: + return &bpf_get_socket_cookie_proto; + case BPF_FUNC_get_socket_uid: + return &bpf_get_socket_uid_proto; default: return bpf_base_func_proto(func_id); } @@ -2715,12 +2768,7 @@ xdp_func_proto(enum bpf_func_id func_id) static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id) { - switch (func_id) { - case BPF_FUNC_skb_load_bytes: - return &bpf_skb_load_bytes_proto; - default: - return bpf_base_func_proto(func_id); - } + return 
sk_filter_func_proto(func_id); } static const struct bpf_func_proto * @@ -3154,6 +3202,19 @@ static u32 bpf_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_MOV64_REG(si->dst_reg, si->dst_reg); else *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +#endif + break; + + case offsetof(struct __sk_buff, napi_id): +#if defined(CONFIG_NET_RX_BUSY_POLL) + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, napi_id) != 4); + + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->src_reg, + offsetof(struct sk_buff, napi_id)); + *insn++ = BPF_JMP_IMM(BPF_JGE, si->dst_reg, MIN_NAPI_ID, 1); + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); +#else + *insn++ = BPF_MOV64_IMM(si->dst_reg, 0); #endif break; } @@ -3252,111 +3313,55 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } -static const struct bpf_verifier_ops sk_filter_ops = { +const struct bpf_verifier_ops sk_filter_prog_ops = { .get_func_proto = sk_filter_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, }; -static const struct bpf_verifier_ops tc_cls_act_ops = { +const struct bpf_verifier_ops tc_cls_act_prog_ops = { .get_func_proto = tc_cls_act_func_proto, .is_valid_access = tc_cls_act_is_valid_access, .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, + .test_run = bpf_prog_test_run_skb, }; -static const struct bpf_verifier_ops xdp_ops = { +const struct bpf_verifier_ops xdp_prog_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, + .test_run = bpf_prog_test_run_xdp, }; -static const struct bpf_verifier_ops cg_skb_ops = { +const struct bpf_verifier_ops cg_skb_prog_ops = { .get_func_proto = cg_skb_func_proto, .is_valid_access = sk_filter_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, + .test_run = bpf_prog_test_run_skb, }; -static const struct bpf_verifier_ops lwt_inout_ops = { +const struct bpf_verifier_ops lwt_inout_prog_ops = { .get_func_proto = lwt_inout_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, + .test_run = bpf_prog_test_run_skb, }; -static const struct bpf_verifier_ops lwt_xmit_ops = { +const struct bpf_verifier_ops lwt_xmit_prog_ops = { .get_func_proto = lwt_xmit_func_proto, .is_valid_access = lwt_is_valid_access, .convert_ctx_access = bpf_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, + .test_run = bpf_prog_test_run_skb, }; -static const struct bpf_verifier_ops cg_sock_ops = { +const struct bpf_verifier_ops cg_sock_prog_ops = { .get_func_proto = bpf_base_func_proto, .is_valid_access = sock_filter_is_valid_access, .convert_ctx_access = sock_filter_convert_ctx_access, }; -static struct bpf_prog_type_list sk_filter_type __ro_after_init = { - .ops = &sk_filter_ops, - .type = BPF_PROG_TYPE_SOCKET_FILTER, -}; - -static struct bpf_prog_type_list sched_cls_type __ro_after_init = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_CLS, -}; - -static struct bpf_prog_type_list sched_act_type __ro_after_init = { - .ops = &tc_cls_act_ops, - .type = BPF_PROG_TYPE_SCHED_ACT, -}; - -static struct bpf_prog_type_list xdp_type __ro_after_init = { - .ops = &xdp_ops, - .type = BPF_PROG_TYPE_XDP, -}; - -static struct bpf_prog_type_list cg_skb_type __ro_after_init = { - .ops = &cg_skb_ops, - .type = BPF_PROG_TYPE_CGROUP_SKB, -}; - -static struct bpf_prog_type_list lwt_in_type __ro_after_init = { - .ops = &lwt_inout_ops, - .type = BPF_PROG_TYPE_LWT_IN, -}; - -static struct 
bpf_prog_type_list lwt_out_type __ro_after_init = { - .ops = &lwt_inout_ops, - .type = BPF_PROG_TYPE_LWT_OUT, -}; - -static struct bpf_prog_type_list lwt_xmit_type __ro_after_init = { - .ops = &lwt_xmit_ops, - .type = BPF_PROG_TYPE_LWT_XMIT, -}; - -static struct bpf_prog_type_list cg_sock_type __ro_after_init = { - .ops = &cg_sock_ops, - .type = BPF_PROG_TYPE_CGROUP_SOCK -}; - -static int __init register_sk_filter_ops(void) -{ - bpf_register_prog_type(&sk_filter_type); - bpf_register_prog_type(&sched_cls_type); - bpf_register_prog_type(&sched_act_type); - bpf_register_prog_type(&xdp_type); - bpf_register_prog_type(&cg_skb_type); - bpf_register_prog_type(&cg_sock_type); - bpf_register_prog_type(&lwt_in_type); - bpf_register_prog_type(&lwt_out_type); - bpf_register_prog_type(&lwt_xmit_type); - - return 0; -} -late_initcall(register_sk_filter_ops); - int sk_detach_filter(struct sock *sk) { int ret = -ENOENT; diff --git a/net/core/flow.c b/net/core/flow.c index f765c11d8df567d704998185482c3d220280c148..f7f5d1932a2720767dd31f4033f196815ff08447 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -47,7 +47,7 @@ struct flow_flush_info { static struct kmem_cache *flow_cachep __read_mostly; -#define flow_cache_hash_size(cache) (1 << (cache)->hash_shift) +#define flow_cache_hash_size(cache) (1U << (cache)->hash_shift) #define FLOW_HASH_RND_PERIOD (10 * 60 * HZ) static void flow_cache_new_hashrnd(unsigned long arg) @@ -99,7 +99,8 @@ static void flow_cache_gc_task(struct work_struct *work) } static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, - int deleted, struct list_head *gc_list, + unsigned int deleted, + struct list_head *gc_list, struct netns_xfrm *xfrm) { if (deleted) { @@ -114,17 +115,18 @@ static void flow_cache_queue_garbage(struct flow_cache_percpu *fcp, static void __flow_cache_shrink(struct flow_cache *fc, struct flow_cache_percpu *fcp, - int shrink_to) + unsigned int shrink_to) { struct flow_cache_entry *fle; struct hlist_node *tmp; LIST_HEAD(gc_list); - int i, deleted = 0; + unsigned int deleted = 0; struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, flow_cache_global); + unsigned int i; for (i = 0; i < flow_cache_hash_size(fc); i++) { - int saved = 0; + unsigned int saved = 0; hlist_for_each_entry_safe(fle, tmp, &fcp->hash_table[i], u.hlist) { @@ -145,7 +147,7 @@ static void __flow_cache_shrink(struct flow_cache *fc, static void flow_cache_shrink(struct flow_cache *fc, struct flow_cache_percpu *fcp) { - int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); + unsigned int shrink_to = fc->low_watermark / flow_cache_hash_size(fc); __flow_cache_shrink(fc, fcp, shrink_to); } @@ -161,7 +163,7 @@ static void flow_new_hash_rnd(struct flow_cache *fc, static u32 flow_hash_code(struct flow_cache *fc, struct flow_cache_percpu *fcp, const struct flowi *key, - size_t keysize) + unsigned int keysize) { const u32 *k = (const u32 *) key; const u32 length = keysize * sizeof(flow_compare_t) / sizeof(u32); @@ -174,7 +176,7 @@ static u32 flow_hash_code(struct flow_cache *fc, * important assumptions that we can here, such as alignment. 
*/ static int flow_key_compare(const struct flowi *key1, const struct flowi *key2, - size_t keysize) + unsigned int keysize) { const flow_compare_t *k1, *k1_lim, *k2; @@ -199,7 +201,7 @@ flow_cache_lookup(struct net *net, const struct flowi *key, u16 family, u8 dir, struct flow_cache_percpu *fcp; struct flow_cache_entry *fle, *tfle; struct flow_cache_object *flo; - size_t keysize; + unsigned int keysize; unsigned int hash; local_bh_disable(); @@ -295,9 +297,10 @@ static void flow_cache_flush_tasklet(unsigned long data) struct flow_cache_entry *fle; struct hlist_node *tmp; LIST_HEAD(gc_list); - int i, deleted = 0; + unsigned int deleted = 0; struct netns_xfrm *xfrm = container_of(fc, struct netns_xfrm, flow_cache_global); + unsigned int i; fcp = this_cpu_ptr(fc->percpu); for (i = 0; i < flow_cache_hash_size(fc); i++) { @@ -327,7 +330,7 @@ static void flow_cache_flush_tasklet(unsigned long data) static int flow_cache_percpu_empty(struct flow_cache *fc, int cpu) { struct flow_cache_percpu *fcp; - int i; + unsigned int i; fcp = per_cpu_ptr(fc->percpu, cpu); for (i = 0; i < flow_cache_hash_size(fc); i++) @@ -402,12 +405,12 @@ void flow_cache_flush_deferred(struct net *net) static int flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) { struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); - size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc); + unsigned int sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc); if (!fcp->hash_table) { fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); if (!fcp->hash_table) { - pr_err("NET: failed to allocate flow cache sz %zu\n", sz); + pr_err("NET: failed to allocate flow cache sz %u\n", sz); return -ENOMEM; } fcp->hash_rnd_recalc = 1; diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index d98d4998213da6103665d62d5a85613631236f19..28d94bce4df8eb046dd6a03103c9249b29e74cd8 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -113,6 +113,235 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, } EXPORT_SYMBOL(__skb_flow_get_ports); +enum flow_dissect_ret { + FLOW_DISSECT_RET_OUT_GOOD, + FLOW_DISSECT_RET_OUT_BAD, + FLOW_DISSECT_RET_OUT_PROTO_AGAIN, +}; + +static enum flow_dissect_ret +__skb_flow_dissect_mpls(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, int nhoff, int hlen) +{ + struct flow_dissector_key_keyid *key_keyid; + struct mpls_label *hdr, _hdr[2]; + u32 entry, label; + + if (!dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY) && + !dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) + return FLOW_DISSECT_RET_OUT_GOOD; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, + hlen, &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + entry = ntohl(hdr[0].entry); + label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT; + + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_MPLS)) { + struct flow_dissector_key_mpls *key_mpls; + + key_mpls = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS, + target_container); + key_mpls->mpls_label = label; + key_mpls->mpls_ttl = (entry & MPLS_LS_TTL_MASK) + >> MPLS_LS_TTL_SHIFT; + key_mpls->mpls_tc = (entry & MPLS_LS_TC_MASK) + >> MPLS_LS_TC_SHIFT; + key_mpls->mpls_bos = (entry & MPLS_LS_S_MASK) + >> MPLS_LS_S_SHIFT; + } + + if (label == MPLS_LABEL_ENTROPY) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_MPLS_ENTROPY, + target_container); + 
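+		/* Per RFC 6790, the Entropy Label Indicator
+		 * (MPLS_LABEL_ENTROPY, label 7) is immediately followed by
+		 * the entropy label itself, which is why the second label
+		 * stack entry, hdr[1], is read here.
+		 */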
key_keyid->keyid = hdr[1].entry & htonl(MPLS_LS_LABEL_MASK); + } + return FLOW_DISSECT_RET_OUT_GOOD; +} + +static enum flow_dissect_ret +__skb_flow_dissect_arp(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, void *data, int nhoff, int hlen) +{ + struct flow_dissector_key_arp *key_arp; + struct { + unsigned char ar_sha[ETH_ALEN]; + unsigned char ar_sip[4]; + unsigned char ar_tha[ETH_ALEN]; + unsigned char ar_tip[4]; + } *arp_eth, _arp_eth; + const struct arphdr *arp; + struct arphdr _arp; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ARP)) + return FLOW_DISSECT_RET_OUT_GOOD; + + arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, + hlen, &_arp); + if (!arp) + return FLOW_DISSECT_RET_OUT_BAD; + + if (arp->ar_hrd != htons(ARPHRD_ETHER) || + arp->ar_pro != htons(ETH_P_IP) || + arp->ar_hln != ETH_ALEN || + arp->ar_pln != 4 || + (arp->ar_op != htons(ARPOP_REPLY) && + arp->ar_op != htons(ARPOP_REQUEST))) + return FLOW_DISSECT_RET_OUT_BAD; + + arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), + sizeof(_arp_eth), data, + hlen, &_arp_eth); + if (!arp_eth) + return FLOW_DISSECT_RET_OUT_BAD; + + key_arp = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_ARP, + target_container); + + memcpy(&key_arp->sip, arp_eth->ar_sip, sizeof(key_arp->sip)); + memcpy(&key_arp->tip, arp_eth->ar_tip, sizeof(key_arp->tip)); + + /* Only store the lower byte of the opcode; + * this covers ARPOP_REPLY and ARPOP_REQUEST. + */ + key_arp->op = ntohs(arp->ar_op) & 0xff; + + ether_addr_copy(key_arp->sha, arp_eth->ar_sha); + ether_addr_copy(key_arp->tha, arp_eth->ar_tha); + + return FLOW_DISSECT_RET_OUT_GOOD; +} + +static enum flow_dissect_ret +__skb_flow_dissect_gre(const struct sk_buff *skb, + struct flow_dissector_key_control *key_control, + struct flow_dissector *flow_dissector, + void *target_container, void *data, + __be16 *p_proto, int *p_nhoff, int *p_hlen, + unsigned int flags) +{ + struct flow_dissector_key_keyid *key_keyid; + struct gre_base_hdr *hdr, _hdr; + int offset = 0; + u16 gre_ver; + + hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), + data, *p_hlen, &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + /* Only look inside GRE without routing */ + if (hdr->flags & GRE_ROUTING) + return FLOW_DISSECT_RET_OUT_GOOD; + + /* Only look inside GRE for version 0 and 1 */ + gre_ver = ntohs(hdr->flags & GRE_VERSION); + if (gre_ver > 1) + return FLOW_DISSECT_RET_OUT_GOOD; + + *p_proto = hdr->protocol; + if (gre_ver) { + /* Version1 must be PPTP, and check the flags */ + if (!(*p_proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) + return FLOW_DISSECT_RET_OUT_GOOD; + } + + offset += sizeof(struct gre_base_hdr); + + if (hdr->flags & GRE_CSUM) + offset += sizeof(((struct gre_full_hdr *) 0)->csum) + + sizeof(((struct gre_full_hdr *) 0)->reserved1); + + if (hdr->flags & GRE_KEY) { + const __be32 *keyid; + __be32 _keyid; + + keyid = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_keyid), + data, *p_hlen, &_keyid); + if (!keyid) + return FLOW_DISSECT_RET_OUT_BAD; + + if (dissector_uses_key(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID)) { + key_keyid = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_GRE_KEYID, + target_container); + if (gre_ver == 0) + key_keyid->keyid = *keyid; + else + key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; + } + offset += sizeof(((struct gre_full_hdr *) 0)->key); + } + + if (hdr->flags & GRE_SEQ) + offset += sizeof(((struct pptp_gre_header *) 0)->seq); + + if (gre_ver == 0) { 
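+		/* ETH_P_TEB (transparent Ethernet bridging) means the GRE
+		 * payload is a complete Ethernet frame, so the inner Ethernet
+		 * header is parsed here before dissection continues with its
+		 * ethertype.
+		 */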
+ if (*p_proto == htons(ETH_P_TEB)) { + const struct ethhdr *eth; + struct ethhdr _eth; + + eth = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_eth), + data, *p_hlen, &_eth); + if (!eth) + return FLOW_DISSECT_RET_OUT_BAD; + *p_proto = eth->h_proto; + offset += sizeof(*eth); + + /* Cap headers that we access via pointers at the + * end of the Ethernet header as our maximum alignment + * at that point is only 2 bytes. + */ + if (NET_IP_ALIGN) + *p_hlen = *p_nhoff + offset; + } + } else { /* version 1, must be PPTP */ + u8 _ppp_hdr[PPP_HDRLEN]; + u8 *ppp_hdr; + + if (hdr->flags & GRE_ACK) + offset += sizeof(((struct pptp_gre_header *) 0)->ack); + + ppp_hdr = __skb_header_pointer(skb, *p_nhoff + offset, + sizeof(_ppp_hdr), + data, *p_hlen, _ppp_hdr); + if (!ppp_hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + switch (PPP_PROTOCOL(ppp_hdr)) { + case PPP_IP: + *p_proto = htons(ETH_P_IP); + break; + case PPP_IPV6: + *p_proto = htons(ETH_P_IPV6); + break; + default: + /* Could probably catch some more like MPLS */ + break; + } + + offset += PPP_HDRLEN; + } + + *p_nhoff += offset; + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return FLOW_DISSECT_RET_OUT_GOOD; + + return FLOW_DISSECT_RET_OUT_PROTO_AGAIN; +} + /** * __skb_flow_dissect - extract the flow_keys struct and return it * @skb: sk_buff to extract the flow from, can be NULL if the rest are specified @@ -138,12 +367,10 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; struct flow_dissector_key_addrs *key_addrs; - struct flow_dissector_key_arp *key_arp; struct flow_dissector_key_ports *key_ports; struct flow_dissector_key_icmp *key_icmp; struct flow_dissector_key_tags *key_tags; struct flow_dissector_key_vlan *key_vlan; - struct flow_dissector_key_keyid *key_keyid; bool skip_vlan = false; u8 ip_proto = 0; bool ret; @@ -181,7 +408,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, memcpy(key_eth_addrs, ð->h_dest, sizeof(*key_eth_addrs)); } -again: +proto_again: switch (proto) { case htons(ETH_P_IP): { const struct iphdr *iph; @@ -284,7 +511,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, proto = vlan->h_vlan_encapsulated_proto; nhoff += sizeof(*vlan); if (skip_vlan) - goto again; + goto proto_again; } skip_vlan = true; @@ -307,7 +534,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, } } - goto again; + goto proto_again; } case htons(ETH_P_PPP_SES): { struct { @@ -349,31 +576,17 @@ bool __skb_flow_dissect(const struct sk_buff *skb, } case htons(ETH_P_MPLS_UC): - case htons(ETH_P_MPLS_MC): { - struct mpls_label *hdr, _hdr[2]; + case htons(ETH_P_MPLS_MC): mpls: - hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, - hlen, &_hdr); - if (!hdr) - goto out_bad; - - if ((ntohl(hdr[0].entry) & MPLS_LS_LABEL_MASK) >> - MPLS_LS_LABEL_SHIFT == MPLS_LABEL_ENTROPY) { - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_MPLS_ENTROPY)) { - key_keyid = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_MPLS_ENTROPY, - target_container); - key_keyid->keyid = hdr[1].entry & - htonl(MPLS_LS_LABEL_MASK); - } - + switch (__skb_flow_dissect_mpls(skb, flow_dissector, + target_container, data, + nhoff, hlen)) { + case FLOW_DISSECT_RET_OUT_GOOD: goto out_good; + case FLOW_DISSECT_RET_OUT_BAD: + default: + goto out_bad; } - - goto out_good; - } - case htons(ETH_P_FCOE): if ((hlen - nhoff) < FCOE_HEADER_LEN) goto out_bad; @@ -382,177 +595,33 @@ bool __skb_flow_dissect(const 
struct sk_buff *skb, goto out_good; case htons(ETH_P_ARP): - case htons(ETH_P_RARP): { - struct { - unsigned char ar_sha[ETH_ALEN]; - unsigned char ar_sip[4]; - unsigned char ar_tha[ETH_ALEN]; - unsigned char ar_tip[4]; - } *arp_eth, _arp_eth; - const struct arphdr *arp; - struct arphdr _arp; - - arp = __skb_header_pointer(skb, nhoff, sizeof(_arp), data, - hlen, &_arp); - if (!arp) - goto out_bad; - - if (arp->ar_hrd != htons(ARPHRD_ETHER) || - arp->ar_pro != htons(ETH_P_IP) || - arp->ar_hln != ETH_ALEN || - arp->ar_pln != 4 || - (arp->ar_op != htons(ARPOP_REPLY) && - arp->ar_op != htons(ARPOP_REQUEST))) - goto out_bad; - - arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp), - sizeof(_arp_eth), data, - hlen, - &_arp_eth); - if (!arp_eth) + case htons(ETH_P_RARP): + switch (__skb_flow_dissect_arp(skb, flow_dissector, + target_container, data, + nhoff, hlen)) { + case FLOW_DISSECT_RET_OUT_GOOD: + goto out_good; + case FLOW_DISSECT_RET_OUT_BAD: + default: goto out_bad; - - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_ARP)) { - - key_arp = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_ARP, - target_container); - - memcpy(&key_arp->sip, arp_eth->ar_sip, - sizeof(key_arp->sip)); - memcpy(&key_arp->tip, arp_eth->ar_tip, - sizeof(key_arp->tip)); - - /* Only store the lower byte of the opcode; - * this covers ARPOP_REPLY and ARPOP_REQUEST. - */ - key_arp->op = ntohs(arp->ar_op) & 0xff; - - ether_addr_copy(key_arp->sha, arp_eth->ar_sha); - ether_addr_copy(key_arp->tha, arp_eth->ar_tha); } - - goto out_good; - } - default: goto out_bad; } ip_proto_again: switch (ip_proto) { - case IPPROTO_GRE: { - struct gre_base_hdr *hdr, _hdr; - u16 gre_ver; - int offset = 0; - - hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); - if (!hdr) + case IPPROTO_GRE: + switch (__skb_flow_dissect_gre(skb, key_control, flow_dissector, + target_container, data, + &proto, &nhoff, &hlen, flags)) { + case FLOW_DISSECT_RET_OUT_GOOD: + goto out_good; + case FLOW_DISSECT_RET_OUT_BAD: goto out_bad; - - /* Only look inside GRE without routing */ - if (hdr->flags & GRE_ROUTING) - break; - - /* Only look inside GRE for version 0 and 1 */ - gre_ver = ntohs(hdr->flags & GRE_VERSION); - if (gre_ver > 1) - break; - - proto = hdr->protocol; - if (gre_ver) { - /* Version1 must be PPTP, and check the flags */ - if (!(proto == GRE_PROTO_PPP && (hdr->flags & GRE_KEY))) - break; - } - - offset += sizeof(struct gre_base_hdr); - - if (hdr->flags & GRE_CSUM) - offset += sizeof(((struct gre_full_hdr *)0)->csum) + - sizeof(((struct gre_full_hdr *)0)->reserved1); - - if (hdr->flags & GRE_KEY) { - const __be32 *keyid; - __be32 _keyid; - - keyid = __skb_header_pointer(skb, nhoff + offset, sizeof(_keyid), - data, hlen, &_keyid); - if (!keyid) - goto out_bad; - - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_GRE_KEYID)) { - key_keyid = skb_flow_dissector_target(flow_dissector, - FLOW_DISSECTOR_KEY_GRE_KEYID, - target_container); - if (gre_ver == 0) - key_keyid->keyid = *keyid; - else - key_keyid->keyid = *keyid & GRE_PPTP_KEY_MASK; - } - offset += sizeof(((struct gre_full_hdr *)0)->key); - } - - if (hdr->flags & GRE_SEQ) - offset += sizeof(((struct pptp_gre_header *)0)->seq); - - if (gre_ver == 0) { - if (proto == htons(ETH_P_TEB)) { - const struct ethhdr *eth; - struct ethhdr _eth; - - eth = __skb_header_pointer(skb, nhoff + offset, - sizeof(_eth), - data, hlen, &_eth); - if (!eth) - goto out_bad; - proto = eth->h_proto; - offset += sizeof(*eth); - - /* Cap headers that we access 
via pointers at the - * end of the Ethernet header as our maximum alignment - * at that point is only 2 bytes. - */ - if (NET_IP_ALIGN) - hlen = (nhoff + offset); - } - } else { /* version 1, must be PPTP */ - u8 _ppp_hdr[PPP_HDRLEN]; - u8 *ppp_hdr; - - if (hdr->flags & GRE_ACK) - offset += sizeof(((struct pptp_gre_header *)0)->ack); - - ppp_hdr = __skb_header_pointer(skb, nhoff + offset, - sizeof(_ppp_hdr), - data, hlen, _ppp_hdr); - if (!ppp_hdr) - goto out_bad; - - switch (PPP_PROTOCOL(ppp_hdr)) { - case PPP_IP: - proto = htons(ETH_P_IP); - break; - case PPP_IPV6: - proto = htons(ETH_P_IPV6); - break; - default: - /* Could probably catch some more like MPLS */ - break; - } - - offset += PPP_HDRLEN; + case FLOW_DISSECT_RET_OUT_PROTO_AGAIN: + goto proto_again; } - - nhoff += offset; - key_control->flags |= FLOW_DIS_ENCAPSULATION; - if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) - goto out_good; - - goto again; - } case NEXTHDR_HOP: case NEXTHDR_ROUTING: case NEXTHDR_DEST: { diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index c98bbfbd26b8645bb2a273bdee61f285606e215d..814e58a3ce8b6a16a3fa60917b359f0fd3bf969d 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -13,7 +13,7 @@ int gro_cells_receive(struct gro_cells *gcells, struct sk_buff *skb) struct net_device *dev = skb->dev; struct gro_cell *cell; - if (!gcells->cells || skb_cloned(skb) || !(dev->features & NETIF_F_GRO)) + if (!gcells->cells || skb_cloned(skb) || netif_elide_gro(dev)) return netif_rx(skb); cell = this_cpu_ptr(gcells->cells); diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c index 0cfe7b0216c3522a8d05d404423de939ad2cc2d5..b3bc0a31af9f3bd2d21fd12b9b2885d9b1573a4d 100644 --- a/net/core/lwt_bpf.c +++ b/net/core/lwt_bpf.c @@ -209,7 +209,8 @@ static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, int ret; u32 fd; - ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy); + ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy, + NULL); if (ret < 0) return ret; @@ -249,7 +250,7 @@ static int bpf_build_state(struct nlattr *nla, if (family != AF_INET && family != AF_INET6) return -EAFNOSUPPORT; - ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy); + ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, NULL); if (ret < 0) return ret; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 6df9f8fabf0ca5d2ced3070406900b7ec28a7924..cfae3d5fe11f10035394aea48fb5332244058a03 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -162,7 +162,6 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining) struct rtnexthop *rtnh = (struct rtnexthop *)attr; struct nlattr *nla_entype; struct nlattr *attrs; - struct nlattr *nla; u16 encap_type; int attrlen; @@ -170,7 +169,6 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining) attrlen = rtnh_attrlen(rtnh); if (attrlen > 0) { attrs = rtnh_attrs(rtnh); - nla = nla_find(attrs, attrlen, RTA_ENCAP); nla_entype = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); if (nla_entype) { @@ -205,7 +203,7 @@ int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) { const struct lwtunnel_encap_ops *ops; struct nlattr *nest; - int ret = -EINVAL; + int ret; if (!lwtstate) return 0; @@ -214,8 +212,11 @@ int lwtunnel_fill_encap(struct sk_buff *skb, struct lwtunnel_state *lwtstate) lwtstate->type > LWTUNNEL_ENCAP_MAX) return 0; - ret = -EOPNOTSUPP; nest = nla_nest_start(skb, RTA_ENCAP); + if (!nest) + return -EMSGSIZE; + + ret = -EOPNOTSUPP; rcu_read_lock(); ops = 
rcu_dereference(lwtun_encaps[lwtstate->type]); if (likely(ops && ops->fill_encap)) diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 4526cbd7e28a1fcdecfc06a41985fd4d19634457..58b0bcc125b5559f299dc2f195deccb5c43d0844 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -52,8 +52,9 @@ do { \ #define PNEIGH_HASHMASK 0xF static void neigh_timer_handler(unsigned long arg); -static void __neigh_notify(struct neighbour *n, int type, int flags); -static void neigh_update_notify(struct neighbour *neigh); +static void __neigh_notify(struct neighbour *n, int type, int flags, + u32 pid); +static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid); static int pneigh_ifdown(struct neigh_table *tbl, struct net_device *dev); #ifdef CONFIG_PROC_FS @@ -99,7 +100,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) if (neigh->parms->neigh_cleanup) neigh->parms->neigh_cleanup(neigh); - __neigh_notify(neigh, RTM_DELNEIGH, 0); + __neigh_notify(neigh, RTM_DELNEIGH, 0, 0); call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); neigh_release(neigh); } @@ -949,7 +950,7 @@ static void neigh_timer_handler(unsigned long arg) } if (notify) - neigh_update_notify(neigh); + neigh_update_notify(neigh, 0); neigh_release(neigh); } @@ -1073,7 +1074,7 @@ static void neigh_update_hhs(struct neighbour *neigh) */ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, - u32 flags) + u32 flags, u32 nlmsg_pid) { u8 old; int err; @@ -1230,7 +1231,7 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, write_unlock_bh(&neigh->lock); if (notify) - neigh_update_notify(neigh); + neigh_update_notify(neigh, nlmsg_pid); return err; } @@ -1261,7 +1262,7 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl, lladdr || !dev->addr_len); if (neigh) neigh_update(neigh, lladdr, NUD_STALE, - NEIGH_UPDATE_F_OVERRIDE); + NEIGH_UPDATE_F_OVERRIDE, 0); return neigh; } EXPORT_SYMBOL(neigh_event_ns); @@ -1589,7 +1590,8 @@ static struct neigh_table *neigh_find_table(int family) return tbl; } -static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) +static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; @@ -1639,14 +1641,16 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN); + NEIGH_UPDATE_F_ADMIN, + NETLINK_CB(skb).portid); neigh_release(neigh); out: return err; } -static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) +static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { int flags = NEIGH_UPDATE_F_ADMIN | NEIGH_UPDATE_F_OVERRIDE; struct net *net = sock_net(skb->sk); @@ -1659,7 +1663,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) int err; ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) goto out; @@ -1730,7 +1734,8 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh) neigh_event_send(neigh, NULL); err = 0; } else - err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); + err = neigh_update(neigh, lladdr, ndm->ndm_state, flags, + NETLINK_CB(skb).portid); neigh_release(neigh); out: @@ -1933,7 +1938,8 @@ static const struct nla_policy nl_ntbl_parm_policy[NDTPA_MAX+1] = { [NDTPA_LOCKTIME] = { .type = NLA_U64 }, }; -static int neightbl_set(struct sk_buff 
*skb, struct nlmsghdr *nlh) +static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct neigh_table *tbl; @@ -1943,7 +1949,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) int err, tidx; err = nlmsg_parse(nlh, sizeof(*ndtmsg), tb, NDTA_MAX, - nl_neightbl_policy); + nl_neightbl_policy, extack); if (err < 0) goto errout; @@ -1981,7 +1987,7 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh) int i, ifindex = 0; err = nla_parse_nested(tbp, NDTPA_MAX, tb[NDTA_PARMS], - nl_ntbl_parm_policy); + nl_ntbl_parm_policy, extack); if (err < 0) goto errout_tbl_lock; @@ -2230,10 +2236,10 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn, return -EMSGSIZE; } -static void neigh_update_notify(struct neighbour *neigh) +static void neigh_update_notify(struct neighbour *neigh, u32 nlmsg_pid) { call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, neigh); - __neigh_notify(neigh, RTM_NEWNEIGH, 0); + __neigh_notify(neigh, RTM_NEWNEIGH, 0, nlmsg_pid); } static bool neigh_master_filtered(struct net_device *dev, int master_idx) @@ -2272,7 +2278,7 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, unsigned int flags = NLM_F_MULTI; int err; - err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL); + err = nlmsg_parse(nlh, sizeof(struct ndmsg), tb, NDA_MAX, NULL, NULL); if (!err) { if (tb[NDA_IFINDEX]) filter_idx = nla_get_u32(tb[NDA_IFINDEX]); @@ -2831,7 +2837,8 @@ static inline size_t neigh_nlmsg_size(void) + nla_total_size(4); /* NDA_PROBES */ } -static void __neigh_notify(struct neighbour *n, int type, int flags) +static void __neigh_notify(struct neighbour *n, int type, int flags, + u32 pid) { struct net *net = dev_net(n->dev); struct sk_buff *skb; @@ -2841,7 +2848,7 @@ static void __neigh_notify(struct neighbour *n, int type, int flags) if (skb == NULL) goto errout; - err = neigh_fill_info(skb, n, 0, 0, type, flags); + err = neigh_fill_info(skb, n, pid, 0, type, flags); if (err < 0) { /* -EMSGSIZE implies BUG in neigh_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -2857,7 +2864,7 @@ static void __neigh_notify(struct neighbour *n, int type, int flags) void neigh_app_ns(struct neighbour *n) { - __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST); + __neigh_notify(n, RTM_GETNEIGH, NLM_F_REQUEST, 0); } EXPORT_SYMBOL(neigh_app_ns); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 652468ff65b79d5c9d45ece0d3289de480bbc603..1934efd4a9d4986f3497abc40968fd08c91896b3 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -35,7 +35,8 @@ LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); struct net init_net = { - .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), + .count = ATOMIC_INIT(1), + .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), }; EXPORT_SYMBOL(init_net); @@ -571,7 +572,8 @@ static const struct nla_policy rtnl_net_policy[NETNSA_MAX + 1] = { [NETNSA_FD] = { .type = NLA_U32 }, }; -static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; @@ -579,7 +581,7 @@ static int rtnl_net_newid(struct sk_buff *skb, struct nlmsghdr *nlh) int nsid, err; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, - rtnl_net_policy); + rtnl_net_policy, extack); if (err < 0) return err; if 
(!tb[NETNSA_NSID]) @@ -644,7 +646,8 @@ static int rtnl_net_fill(struct sk_buff *skb, u32 portid, u32 seq, int flags, return -EMSGSIZE; } -static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[NETNSA_MAX + 1]; @@ -653,7 +656,7 @@ static int rtnl_net_getid(struct sk_buff *skb, struct nlmsghdr *nlh) int err, id; err = nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, NETNSA_MAX, - rtnl_net_policy); + rtnl_net_policy, extack); if (err < 0) return err; if (tb[NETNSA_PID]) diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 9424673009c14e0fb288b8e4041dba596b37ee8d..29be2466970cd670daa7a8abdd54929c9af39026 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -105,15 +105,21 @@ static void queue_process(struct work_struct *work) while ((skb = skb_dequeue(&npinfo->txq))) { struct net_device *dev = skb->dev; struct netdev_queue *txq; + unsigned int q_index; if (!netif_device_present(dev) || !netif_running(dev)) { kfree_skb(skb); continue; } - txq = skb_get_tx_queue(dev, skb); - local_irq_save(flags); + /* check if skb->queue_mapping is still valid */ + q_index = skb_get_queue_mapping(skb); + if (unlikely(q_index >= dev->real_num_tx_queues)) { + q_index = q_index % dev->real_num_tx_queues; + skb_set_queue_mapping(skb, q_index); + } + txq = netdev_get_tx_queue(dev, q_index); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (netif_xmit_frozen_or_stopped(txq) || netpoll_start_xmit(skb, dev, txq) != NETDEV_TX_OK) { diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 0f9275ee55958156a6cbac3f0d2b1ff54c3c89a5..1c4810919a0a35900d45a659de0cd780b7e500d3 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -11,6 +11,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include #include #include #include diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index c4e84c55824085b343679cc295a81f7b35e3acaf..bcb0f610ee422a12ad6b8ae41763296ba0fa3c6a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -896,15 +896,13 @@ static size_t rtnl_port_size(const struct net_device *dev, return port_self_size; } -static size_t rtnl_xdp_size(const struct net_device *dev) +static size_t rtnl_xdp_size(void) { size_t xdp_size = nla_total_size(0) + /* nest IFLA_XDP */ - nla_total_size(1); /* XDP_ATTACHED */ + nla_total_size(1) + /* XDP_ATTACHED */ + nla_total_size(4); /* XDP_FLAGS */ - if (!dev->netdev_ops->ndo_xdp) - return 0; - else - return xdp_size; + return xdp_size; } static noinline size_t if_nlmsg_size(const struct net_device *dev, @@ -943,7 +941,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */ + nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */ - + rtnl_xdp_size(dev) /* IFLA_XDP */ + + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(1); /* IFLA_PROTO_DOWN */ } @@ -1056,7 +1054,7 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) return err; } - if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name)) + if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) return -EMSGSIZE; return 0; @@ -1251,23 +1249,35 @@ static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev) static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) { - struct netdev_xdp xdp_op = {}; struct nlattr *xdp; + 
u32 xdp_flags = 0; + u8 val = 0; int err; - if (!dev->netdev_ops->ndo_xdp) - return 0; xdp = nla_nest_start(skb, IFLA_XDP); if (!xdp) return -EMSGSIZE; - xdp_op.command = XDP_QUERY_PROG; - err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); - if (err) - goto err_cancel; - err = nla_put_u8(skb, IFLA_XDP_ATTACHED, xdp_op.prog_attached); + if (rcu_access_pointer(dev->xdp_prog)) { + xdp_flags = XDP_FLAGS_SKB_MODE; + val = 1; + } else if (dev->netdev_ops->ndo_xdp) { + struct netdev_xdp xdp_op = {}; + + xdp_op.command = XDP_QUERY_PROG; + err = dev->netdev_ops->ndo_xdp(dev, &xdp_op); + if (err) + goto err_cancel; + val = xdp_op.prog_attached; + } + err = nla_put_u8(skb, IFLA_XDP_ATTACHED, val); if (err) goto err_cancel; + if (xdp_flags) { + err = nla_put_u32(skb, IFLA_XDP_FLAGS, xdp_flags); + if (err) + goto err_cancel; + } nla_nest_end(skb, xdp); return 0; @@ -1515,7 +1525,8 @@ static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla const struct rtnl_link_ops *ops = NULL; struct nlattr *linfo[IFLA_INFO_MAX + 1]; - if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla, ifla_info_policy) < 0) + if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla, + ifla_info_policy, NULL) < 0) return NULL; if (linfo[IFLA_INFO_KIND]) { @@ -1592,8 +1603,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) hdrlen = nlmsg_len(cb->nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); - if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) { - + if (nlmsg_parse(cb->nlh, hdrlen, tb, IFLA_MAX, + ifla_policy, NULL) >= 0) { if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); @@ -1640,9 +1651,10 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) return skb->len; } -int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len) +int rtnl_nla_parse_ifla(struct nlattr **tb, const struct nlattr *head, int len, + struct netlink_ext_ack *exterr) { - return nla_parse(tb, IFLA_MAX, head, len, ifla_policy); + return nla_parse(tb, IFLA_MAX, head, len, ifla_policy, exterr); } EXPORT_SYMBOL(rtnl_nla_parse_ifla); @@ -1907,6 +1919,7 @@ static int do_set_master(struct net_device *dev, int ifindex) #define DO_SETLINK_NOTIFY 0x03 static int do_setlink(const struct sk_buff *skb, struct net_device *dev, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, struct nlattr **tb, char *ifname, int status) { const struct net_device_ops *ops = dev->netdev_ops; @@ -2078,7 +2091,7 @@ static int do_setlink(const struct sk_buff *skb, goto errout; } err = nla_parse_nested(vfinfo, IFLA_VF_MAX, attr, - ifla_vf_policy); + ifla_vf_policy, NULL); if (err < 0) goto errout; err = do_setvfinfo(dev, vfinfo); @@ -2106,7 +2119,7 @@ static int do_setlink(const struct sk_buff *skb, goto errout; } err = nla_parse_nested(port, IFLA_PORT_MAX, attr, - ifla_port_policy); + ifla_port_policy, NULL); if (err < 0) goto errout; if (!port[IFLA_PORT_VF]) { @@ -2126,7 +2139,8 @@ static int do_setlink(const struct sk_buff *skb, struct nlattr *port[IFLA_PORT_MAX+1]; err = nla_parse_nested(port, IFLA_PORT_MAX, - tb[IFLA_PORT_SELF], ifla_port_policy); + tb[IFLA_PORT_SELF], ifla_port_policy, + NULL); if (err < 0) goto errout; @@ -2170,7 +2184,7 @@ static int do_setlink(const struct sk_buff *skb, u32 xdp_flags = 0; err = nla_parse_nested(xdp, IFLA_XDP_MAX, tb[IFLA_XDP], - ifla_xdp_policy); + ifla_xdp_policy, NULL); if (err < 0) goto errout; @@ -2188,7 +2202,7 @@ static int do_setlink(const struct sk_buff *skb, } if (xdp[IFLA_XDP_FD]) { 
- err = dev_change_xdp_fd(dev, + err = dev_change_xdp_fd(dev, extack, nla_get_s32(xdp[IFLA_XDP_FD]), xdp_flags); if (err) @@ -2210,7 +2224,8 @@ static int do_setlink(const struct sk_buff *skb, return err; } -static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -2219,7 +2234,8 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *tb[IFLA_MAX+1]; char ifname[IFNAMSIZ]; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, + extack); if (err < 0) goto errout; @@ -2246,7 +2262,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) if (err < 0) goto errout; - err = do_setlink(skb, dev, ifm, tb, ifname, 0); + err = do_setlink(skb, dev, ifm, extack, tb, ifname, 0); errout: return err; } @@ -2303,7 +2319,8 @@ int rtnl_delete_link(struct net_device *dev) } EXPORT_SYMBOL_GPL(rtnl_delete_link); -static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *dev; @@ -2312,7 +2329,7 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) struct nlattr *tb[IFLA_MAX+1]; int err; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; @@ -2407,6 +2424,7 @@ EXPORT_SYMBOL(rtnl_create_link); static int rtnl_group_changelink(const struct sk_buff *skb, struct net *net, int group, struct ifinfomsg *ifm, + struct netlink_ext_ack *extack, struct nlattr **tb) { struct net_device *dev, *aux; @@ -2414,7 +2432,7 @@ static int rtnl_group_changelink(const struct sk_buff *skb, for_each_netdev_safe(net, dev, aux) { if (dev->group == group) { - err = do_setlink(skb, dev, ifm, tb, NULL, 0); + err = do_setlink(skb, dev, ifm, extack, tb, NULL, 0); if (err < 0) return err; } @@ -2423,7 +2441,8 @@ static int rtnl_group_changelink(const struct sk_buff *skb, return 0; } -static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); const struct rtnl_link_ops *ops; @@ -2441,7 +2460,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) #ifdef CONFIG_MODULES replay: #endif - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; @@ -2472,7 +2491,8 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) if (tb[IFLA_LINKINFO]) { err = nla_parse_nested(linkinfo, IFLA_INFO_MAX, - tb[IFLA_LINKINFO], ifla_info_policy); + tb[IFLA_LINKINFO], ifla_info_policy, + NULL); if (err < 0) return err; } else @@ -2497,7 +2517,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) if (ops->maxtype && linkinfo[IFLA_INFO_DATA]) { err = nla_parse_nested(attr, ops->maxtype, linkinfo[IFLA_INFO_DATA], - ops->policy); + ops->policy, NULL); if (err < 0) return err; data = attr; @@ -2515,7 +2535,8 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) err = nla_parse_nested(slave_attr, m_ops->slave_maxtype, linkinfo[IFLA_INFO_SLAVE_DATA], - 
m_ops->slave_policy); + m_ops->slave_policy, + NULL); if (err < 0) return err; slave_data = slave_attr; @@ -2557,14 +2578,15 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) status |= DO_SETLINK_NOTIFY; } - return do_setlink(skb, dev, ifm, tb, ifname, status); + return do_setlink(skb, dev, ifm, extack, tb, ifname, + status); } if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { if (ifm->ifi_index == 0 && tb[IFLA_GROUP]) return rtnl_group_changelink(skb, net, nla_get_u32(tb[IFLA_GROUP]), - ifm, tb); + ifm, extack, tb); return -ENODEV; } @@ -2673,7 +2695,8 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh) } } -static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh) +static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -2684,7 +2707,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh) int err; u32 ext_filter_mask = 0; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; @@ -2734,7 +2757,7 @@ static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) hdrlen = nlmsg_len(nlh) < sizeof(struct ifinfomsg) ? sizeof(struct rtgenmsg) : sizeof(struct ifinfomsg); - if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy) >= 0) { + if (nlmsg_parse(nlh, hdrlen, tb, IFLA_MAX, ifla_policy, NULL) >= 0) { if (tb[IFLA_EXT_MASK]) ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); } @@ -2955,7 +2978,8 @@ static int fdb_vid_parse(struct nlattr *vlan_attr, u16 *p_vid) return 0; } -static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; @@ -2965,7 +2989,7 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh) u16 vid; int err; - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) return err; @@ -3055,7 +3079,8 @@ int ndo_dflt_fdb_del(struct ndmsg *ndm, } EXPORT_SYMBOL(ndo_dflt_fdb_del); -static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ndmsg *ndm; @@ -3068,7 +3093,7 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh) if (!netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); + err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL, extack); if (err < 0) return err; @@ -3203,8 +3228,8 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb) int err = 0; int fidx = 0; - if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, IFLA_MAX, - ifla_policy) == 0) { + if (nlmsg_parse(cb->nlh, sizeof(struct ifinfomsg), tb, + IFLA_MAX, ifla_policy, NULL) == 0) { if (tb[IFLA_MASTER]) br_idx = nla_get_u32(tb[IFLA_MASTER]); } @@ -3498,7 +3523,8 @@ static int rtnl_bridge_notify(struct net_device *dev) return err; } -static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -3572,7 +3598,8 @@ static int rtnl_bridge_setlink(struct sk_buff 
*skb, struct nlmsghdr *nlh) return err; } -static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_bridge_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifinfomsg *ifm; @@ -3940,7 +3967,8 @@ static size_t if_nlmsg_stats_size(const struct net_device *dev, return size; } -static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnl_stats_get(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *dev = NULL; @@ -4046,7 +4074,8 @@ static int rtnl_stats_dump(struct sk_buff *skb, struct netlink_callback *cb) /* Process one rtnetlink message. */ -static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); rtnl_doit_func doit; @@ -4101,7 +4130,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (doit == NULL) return -EOPNOTSUPP; - return doit(skb, nlh); + return doit(skb, nlh, extack); } static void rtnetlink_rcv(struct sk_buff *skb) @@ -4116,22 +4145,16 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { - case NETDEV_UP: - case NETDEV_DOWN: - case NETDEV_PRE_UP: - case NETDEV_POST_INIT: - case NETDEV_REGISTER: - case NETDEV_CHANGE: - case NETDEV_PRE_TYPE_CHANGE: - case NETDEV_GOING_DOWN: - case NETDEV_UNREGISTER: - case NETDEV_UNREGISTER_FINAL: - case NETDEV_RELEASE: - case NETDEV_JOIN: - case NETDEV_BONDING_INFO: + case NETDEV_REBOOT: + case NETDEV_CHANGENAME: + case NETDEV_FEAT_CHANGE: + case NETDEV_BONDING_FAILOVER: + case NETDEV_NOTIFY_PEERS: + case NETDEV_RESEND_IGMP: + case NETDEV_CHANGEINFODATA: + rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); break; default: - rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL); break; } return NOTIFY_DONE; @@ -4185,6 +4208,7 @@ void __init rtnetlink_init(void) rtnl_register(PF_UNSPEC, RTM_GETADDR, NULL, rtnl_dump_all, NULL); rtnl_register(PF_UNSPEC, RTM_GETROUTE, NULL, rtnl_dump_all, NULL); + rtnl_register(PF_UNSPEC, RTM_GETNETCONF, NULL, rtnl_dump_all, NULL); rtnl_register(PF_BRIDGE, RTM_NEWNEIGH, rtnl_fdb_add, NULL, NULL); rtnl_register(PF_BRIDGE, RTM_DELNEIGH, rtnl_fdb_del, NULL, NULL); diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c index d28da7d363f170f35d88623e2b864f04a67c3de5..ae35cce3a40d70387bee815798933aa43a0e6d84 100644 --- a/net/core/secure_seq.c +++ b/net/core/secure_seq.c @@ -24,9 +24,13 @@ static siphash_key_t ts_secret __read_mostly; static __always_inline void net_secret_init(void) { - net_get_random_once(&ts_secret, sizeof(ts_secret)); net_get_random_once(&net_secret, sizeof(net_secret)); } + +static __always_inline void ts_secret_init(void) +{ + net_get_random_once(&ts_secret, sizeof(ts_secret)); +} #endif #ifdef CONFIG_INET @@ -47,7 +51,7 @@ static u32 seq_scale(u32 seq) #endif #if IS_ENABLED(CONFIG_IPV6) -static u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr) +u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr) { const struct { struct in6_addr saddr; @@ -60,12 +64,14 @@ static u32 secure_tcpv6_ts_off(const __be32 *saddr, const __be32 *daddr) if (sysctl_tcp_timestamps != 1) return 0; + ts_secret_init(); return siphash(&combined, offsetofend(typeof(combined), daddr), &ts_secret); } 
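/*
 * Editor's note -- illustrative sketch, not part of the patch above: with
 * the ISN and timestamp-offset calculations now split into separate
 * helpers and ts_secret_init()/net_secret_init() made independent, a
 * caller such as the IPv6 TCP input path would be expected to invoke them
 * separately, along these lines (secure_tcpv6_seq/secure_tcpv6_ts_off are
 * the functions exported here; the ip6h/th variables are hypothetical):
 *
 *	u32 seq   = secure_tcpv6_seq(ip6h->saddr.s6_addr32,
 *				     ip6h->daddr.s6_addr32,
 *				     th->source, th->dest);
 *	u32 tsoff = secure_tcpv6_ts_off(ip6h->saddr.s6_addr32,
 *				        ip6h->daddr.s6_addr32);
 *
 * Lazy per-secret initialization means the timestamp secret is only
 * generated once a timestamp offset is actually requested.
 */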
+EXPORT_SYMBOL(secure_tcpv6_ts_off); -u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, - __be16 sport, __be16 dport, u32 *tsoff) +u32 secure_tcpv6_seq(const __be32 *saddr, const __be32 *daddr, + __be16 sport, __be16 dport) { const struct { struct in6_addr saddr; @@ -78,14 +84,14 @@ u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr, .sport = sport, .dport = dport }; - u64 hash; + u32 hash; + net_secret_init(); hash = siphash(&combined, offsetofend(typeof(combined), dport), &net_secret); - *tsoff = secure_tcpv6_ts_off(saddr, daddr); return seq_scale(hash); } -EXPORT_SYMBOL(secure_tcpv6_sequence_number); +EXPORT_SYMBOL(secure_tcpv6_seq); u32 secure_ipv6_port_ephemeral(const __be32 *saddr, const __be32 *daddr, __be16 dport) @@ -107,30 +113,30 @@ EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #ifdef CONFIG_INET -static u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr) +u32 secure_tcp_ts_off(__be32 saddr, __be32 daddr) { if (sysctl_tcp_timestamps != 1) return 0; + ts_secret_init(); return siphash_2u32((__force u32)saddr, (__force u32)daddr, &ts_secret); } -/* secure_tcp_sequence_number(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d), +/* secure_tcp_seq_and_tsoff(a, b, 0, d) == secure_ipv4_port_ephemeral(a, b, d), * but fortunately, `sport' cannot be 0 in any circumstances. If this changes, * it would be easy enough to have the former function use siphash_4u32, passing * the arguments as separate u32. */ - -u32 secure_tcp_sequence_number(__be32 saddr, __be32 daddr, - __be16 sport, __be16 dport, u32 *tsoff) +u32 secure_tcp_seq(__be32 saddr, __be32 daddr, + __be16 sport, __be16 dport) { - u64 hash; + u32 hash; + net_secret_init(); hash = siphash_3u32((__force u32)saddr, (__force u32)daddr, (__force u32)sport << 16 | (__force u32)dport, &net_secret); - *tsoff = secure_tcp_ts_off(saddr, daddr); return seq_scale(hash); } diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9f781092fda9cb8cac22b0743b4bc7666a3bd91a..346d3e85dfbc2eca1ded0442ecb78d31e1768523 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -1576,6 +1576,8 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len) skb_set_tail_pointer(skb, len); } + if (!skb->sk || skb->destructor == sock_edemux) + skb_condense(skb); return 0; } EXPORT_SYMBOL(___pskb_trim); @@ -1980,7 +1982,6 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, .pages = pages, .partial = partial, .nr_pages_max = MAX_SKB_FRAGS, - .flags = flags, .ops = &nosteal_pipe_buf_ops, .spd_release = sock_spd_release, }; @@ -3082,22 +3083,32 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb, if (sg && csum && (mss != GSO_BY_FRAGS)) { if (!(features & NETIF_F_GSO_PARTIAL)) { struct sk_buff *iter; + unsigned int frag_len; if (!list_skb || !net_gso_ok(features, skb_shinfo(head_skb)->gso_type)) goto normal; - /* Split the buffer at the frag_list pointer. - * This is based on the assumption that all - * buffers in the chain excluding the last - * containing the same amount of data. + /* If we get here then all the required + * GSO features except frag_list are supported. + * Try to split the SKB to multiple GSO SKBs + * with no frag_list. + * Currently we can do that only when the buffers don't + * have a linear part and all the buffers except + * the last are of the same length. 
*/ + frag_len = list_skb->len; skb_walk_frags(head_skb, iter) { - if (skb_headlen(iter)) + if (frag_len != iter->len && iter->next) + goto normal; + if (skb_headlen(iter) && !iter->head_frag) goto normal; len -= iter->len; } + + if (len != frag_len) + goto normal; } /* GSO partial only requires that we trim off any excess that @@ -3807,6 +3818,7 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb, serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING; serr->ee.ee_info = tstype; serr->opt_stats = opt_stats; + serr->header.h4.iif = skb->dev ? skb->dev->ifindex : 0; if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) { serr->ee.ee_data = skb_shinfo(skb)->tskey; if (sk->sk_protocol == IPPROTO_TCP && diff --git a/net/core/sock.c b/net/core/sock.c index 2c4f574168fbdcebc0cc82c0c8a36214992d6224..79c6aee6af9b817bd7086f04ae8f46342a3bf4b6 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -102,6 +102,7 @@ #include #include #include +#include #include #include #include @@ -247,12 +248,66 @@ static const char *const af_family_kern_slock_key_strings[AF_MAX+1] = { static const char *const af_family_kern_clock_key_strings[AF_MAX+1] = { _sock_locks("k-clock-") }; +static const char *const af_family_rlock_key_strings[AF_MAX+1] = { + "rlock-AF_UNSPEC", "rlock-AF_UNIX" , "rlock-AF_INET" , + "rlock-AF_AX25" , "rlock-AF_IPX" , "rlock-AF_APPLETALK", + "rlock-AF_NETROM", "rlock-AF_BRIDGE" , "rlock-AF_ATMPVC" , + "rlock-AF_X25" , "rlock-AF_INET6" , "rlock-AF_ROSE" , + "rlock-AF_DECnet", "rlock-AF_NETBEUI" , "rlock-AF_SECURITY" , + "rlock-AF_KEY" , "rlock-AF_NETLINK" , "rlock-AF_PACKET" , + "rlock-AF_ASH" , "rlock-AF_ECONET" , "rlock-AF_ATMSVC" , + "rlock-AF_RDS" , "rlock-AF_SNA" , "rlock-AF_IRDA" , + "rlock-AF_PPPOX" , "rlock-AF_WANPIPE" , "rlock-AF_LLC" , + "rlock-27" , "rlock-28" , "rlock-AF_CAN" , + "rlock-AF_TIPC" , "rlock-AF_BLUETOOTH", "rlock-AF_IUCV" , + "rlock-AF_RXRPC" , "rlock-AF_ISDN" , "rlock-AF_PHONET" , + "rlock-AF_IEEE802154", "rlock-AF_CAIF" , "rlock-AF_ALG" , + "rlock-AF_NFC" , "rlock-AF_VSOCK" , "rlock-AF_KCM" , + "rlock-AF_QIPCRTR", "rlock-AF_SMC" , "rlock-AF_MAX" +}; +static const char *const af_family_wlock_key_strings[AF_MAX+1] = { + "wlock-AF_UNSPEC", "wlock-AF_UNIX" , "wlock-AF_INET" , + "wlock-AF_AX25" , "wlock-AF_IPX" , "wlock-AF_APPLETALK", + "wlock-AF_NETROM", "wlock-AF_BRIDGE" , "wlock-AF_ATMPVC" , + "wlock-AF_X25" , "wlock-AF_INET6" , "wlock-AF_ROSE" , + "wlock-AF_DECnet", "wlock-AF_NETBEUI" , "wlock-AF_SECURITY" , + "wlock-AF_KEY" , "wlock-AF_NETLINK" , "wlock-AF_PACKET" , + "wlock-AF_ASH" , "wlock-AF_ECONET" , "wlock-AF_ATMSVC" , + "wlock-AF_RDS" , "wlock-AF_SNA" , "wlock-AF_IRDA" , + "wlock-AF_PPPOX" , "wlock-AF_WANPIPE" , "wlock-AF_LLC" , + "wlock-27" , "wlock-28" , "wlock-AF_CAN" , + "wlock-AF_TIPC" , "wlock-AF_BLUETOOTH", "wlock-AF_IUCV" , + "wlock-AF_RXRPC" , "wlock-AF_ISDN" , "wlock-AF_PHONET" , + "wlock-AF_IEEE802154", "wlock-AF_CAIF" , "wlock-AF_ALG" , + "wlock-AF_NFC" , "wlock-AF_VSOCK" , "wlock-AF_KCM" , + "wlock-AF_QIPCRTR", "wlock-AF_SMC" , "wlock-AF_MAX" +}; +static const char *const af_family_elock_key_strings[AF_MAX+1] = { + "elock-AF_UNSPEC", "elock-AF_UNIX" , "elock-AF_INET" , + "elock-AF_AX25" , "elock-AF_IPX" , "elock-AF_APPLETALK", + "elock-AF_NETROM", "elock-AF_BRIDGE" , "elock-AF_ATMPVC" , + "elock-AF_X25" , "elock-AF_INET6" , "elock-AF_ROSE" , + "elock-AF_DECnet", "elock-AF_NETBEUI" , "elock-AF_SECURITY" , + "elock-AF_KEY" , "elock-AF_NETLINK" , "elock-AF_PACKET" , + "elock-AF_ASH" , "elock-AF_ECONET" , "elock-AF_ATMSVC" , + "elock-AF_RDS" , "elock-AF_SNA" , 
"elock-AF_IRDA" , + "elock-AF_PPPOX" , "elock-AF_WANPIPE" , "elock-AF_LLC" , + "elock-27" , "elock-28" , "elock-AF_CAN" , + "elock-AF_TIPC" , "elock-AF_BLUETOOTH", "elock-AF_IUCV" , + "elock-AF_RXRPC" , "elock-AF_ISDN" , "elock-AF_PHONET" , + "elock-AF_IEEE802154", "elock-AF_CAIF" , "elock-AF_ALG" , + "elock-AF_NFC" , "elock-AF_VSOCK" , "elock-AF_KCM" , + "elock-AF_QIPCRTR", "elock-AF_SMC" , "elock-AF_MAX" +}; /* - * sk_callback_lock locking rules are per-address-family, + * sk_callback_lock and sk queues locking rules are per-address-family, * so split the lock classes by using a per-AF key: */ static struct lock_class_key af_callback_keys[AF_MAX]; +static struct lock_class_key af_rlock_keys[AF_MAX]; +static struct lock_class_key af_wlock_keys[AF_MAX]; +static struct lock_class_key af_elock_keys[AF_MAX]; static struct lock_class_key af_kern_callback_keys[AF_MAX]; /* Take into consideration the size of the struct sk_buff overhead in the @@ -318,14 +373,14 @@ EXPORT_SYMBOL_GPL(sk_clear_memalloc); int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { int ret; - unsigned long pflags = current->flags; + unsigned int noreclaim_flag; /* these should have been dropped before queueing */ BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); - current->flags |= PF_MEMALLOC; + noreclaim_flag = memalloc_noreclaim_save(); ret = sk->sk_backlog_rcv(sk, skb); - tsk_restore_flags(current, pflags, PF_MEMALLOC); + memalloc_noreclaim_restore(noreclaim_flag); return ret; } @@ -1029,6 +1084,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, union { int val; + u64 val64; struct linger ling; struct timeval tm; } v; @@ -1259,6 +1315,40 @@ int sock_getsockopt(struct socket *sock, int level, int optname, v.val = sk->sk_incoming_cpu; break; + case SO_MEMINFO: + { + u32 meminfo[SK_MEMINFO_VARS]; + + if (get_user(len, optlen)) + return -EFAULT; + + sk_get_meminfo(sk, meminfo); + + len = min_t(unsigned int, len, sizeof(meminfo)); + if (copy_to_user(optval, &meminfo, len)) + return -EFAULT; + + goto lenout; + } + +#ifdef CONFIG_NET_RX_BUSY_POLL + case SO_INCOMING_NAPI_ID: + v.val = READ_ONCE(sk->sk_napi_id); + + /* aggregate non-NAPI IDs down to 0 */ + if (v.val < MIN_NAPI_ID) + v.val = 0; + + break; +#endif + + case SO_COOKIE: + lv = sizeof(u64); + if (len < lv) + return -EINVAL; + v.val64 = sock_gen_cookie(sk); + break; + default: /* We implement the SO_SNDLOWAT etc to not be settable * (1003.1g 7). 
@@ -1483,6 +1573,27 @@ void sk_free(struct sock *sk) } EXPORT_SYMBOL(sk_free); +static void sk_init_common(struct sock *sk) +{ + skb_queue_head_init(&sk->sk_receive_queue); + skb_queue_head_init(&sk->sk_write_queue); + skb_queue_head_init(&sk->sk_error_queue); + + rwlock_init(&sk->sk_callback_lock); + lockdep_set_class_and_name(&sk->sk_receive_queue.lock, + af_rlock_keys + sk->sk_family, + af_family_rlock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_write_queue.lock, + af_wlock_keys + sk->sk_family, + af_family_wlock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_error_queue.lock, + af_elock_keys + sk->sk_family, + af_family_elock_key_strings[sk->sk_family]); + lockdep_set_class_and_name(&sk->sk_callback_lock, + af_callback_keys + sk->sk_family, + af_family_clock_key_strings[sk->sk_family]); +} + /** * sk_clone_lock - clone a socket, and lock its clone * @sk: the socket to clone @@ -1516,13 +1627,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) */ atomic_set(&newsk->sk_wmem_alloc, 1); atomic_set(&newsk->sk_omem_alloc, 0); - skb_queue_head_init(&newsk->sk_receive_queue); - skb_queue_head_init(&newsk->sk_write_queue); - - rwlock_init(&newsk->sk_callback_lock); - lockdep_set_class_and_name(&newsk->sk_callback_lock, - af_callback_keys + newsk->sk_family, - af_family_clock_key_strings[newsk->sk_family]); + sk_init_common(newsk); newsk->sk_dst_cache = NULL; newsk->sk_dst_pending_confirm = 0; @@ -1533,7 +1638,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK; sock_reset_flag(newsk, SOCK_DONE); - skb_queue_head_init(&newsk->sk_error_queue); filter = rcu_dereference_protected(newsk->sk_filter, 1); if (filter != NULL) @@ -2466,10 +2570,7 @@ EXPORT_SYMBOL(sk_stop_timer); void sock_init_data(struct socket *sock, struct sock *sk) { - skb_queue_head_init(&sk->sk_receive_queue); - skb_queue_head_init(&sk->sk_write_queue); - skb_queue_head_init(&sk->sk_error_queue); - + sk_init_common(sk); sk->sk_send_head = NULL; init_timer(&sk->sk_timer); @@ -2521,7 +2622,7 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT; - sk->sk_stamp = ktime_set(-1L, 0); + sk->sk_stamp = SK_DEFAULT_STAMP; #ifdef CONFIG_NET_RX_BUSY_POLL sk->sk_napi_id = 0; @@ -2802,6 +2903,21 @@ void sk_common_release(struct sock *sk) } EXPORT_SYMBOL(sk_common_release); +void sk_get_meminfo(const struct sock *sk, u32 *mem) +{ + memset(mem, 0, sizeof(*mem) * SK_MEMINFO_VARS); + + mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); + mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; + mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); + mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; + mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; + mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; + mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); + mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; + mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); +} + #ifdef CONFIG_PROC_FS #define PROTO_INUSE_NR 64 /* should be enough for the first time */ struct prot_inuse { @@ -3142,3 +3258,14 @@ static int __init proto_init(void) subsys_initcall(proto_init); #endif /* PROC_FS */ + +#ifdef CONFIG_NET_RX_BUSY_POLL +bool sk_busy_loop_end(void *p, unsigned long start_time) +{ + struct sock *sk = p; + + return !skb_queue_empty(&sk->sk_receive_queue) || + sk_busy_loop_timeout(sk, start_time); +} +EXPORT_SYMBOL(sk_busy_loop_end); +#endif /* 
CONFIG_NET_RX_BUSY_POLL */ diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c index 6b10573cc9faa790fe261b452b85f3b774c3ec21..217f4e3b82f6edca841b66089ab0772bd8d391e7 100644 --- a/net/core/sock_diag.c +++ b/net/core/sock_diag.c @@ -19,7 +19,7 @@ static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh); static DEFINE_MUTEX(sock_diag_table_mutex); static struct workqueue_struct *broadcast_wq; -static u64 sock_gen_cookie(struct sock *sk) +u64 sock_gen_cookie(struct sock *sk) { while (1) { u64 res = atomic64_read(&sk->sk_cookie); @@ -59,15 +59,7 @@ int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype) { u32 mem[SK_MEMINFO_VARS]; - mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk); - mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf; - mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk); - mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf; - mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc; - mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued; - mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc); - mem[SK_MEMINFO_BACKLOG] = sk->sk_backlog.len; - mem[SK_MEMINFO_DROPS] = atomic_read(&sk->sk_drops); + sk_get_meminfo(sk, mem); return nla_put(skb, attrtype, sizeof(mem), &mem); } @@ -246,7 +238,8 @@ static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } -static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { int ret; diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 9a1a352fd1ebe598e4925bcda037dc0e4a2288bc..eed1ebf7f29d0fac552074b127e5636fecede65f 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -13,9 +13,9 @@ static DEFINE_SPINLOCK(reuseport_lock); -static struct sock_reuseport *__reuseport_alloc(u16 max_socks) +static struct sock_reuseport *__reuseport_alloc(unsigned int max_socks) { - size_t size = sizeof(struct sock_reuseport) + + unsigned int size = sizeof(struct sock_reuseport) + sizeof(struct sock *) * max_socks; struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC); diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 7f9cc400eca08c01c9014476aa4daf0852505b20..ea23254b2457cf15eeae495130dab437090f8e2e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -452,6 +452,14 @@ static struct ctl_table net_core_table[] = { .extra1 = &one, .extra2 = &max_skb_frags, }, + { + .procname = "netdev_budget_usecs", + .data = &netdev_budget_usecs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, { } }; diff --git a/net/core/utils.c b/net/core/utils.c index 6592d7bbed394086a8ba8efcb370fb1d75db4449..93066bd0305ab73dbee756125e2cb77b7f46aa44 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -26,9 +26,11 @@ #include #include #include +#include #include #include +#include #include #include @@ -51,7 +53,7 @@ EXPORT_SYMBOL(net_ratelimit); __be32 in_aton(const char *str) { - unsigned long l; + unsigned int l; unsigned int val; int i; @@ -300,6 +302,107 @@ int in6_pton(const char *src, int srclen, } EXPORT_SYMBOL(in6_pton); +static int inet4_pton(const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in *addr4 = (struct sockaddr_in *)addr; + int srclen = strlen(src); + + if (srclen > INET_ADDRSTRLEN) + return -EINVAL; + + if (in4_pton(src, srclen, (u8 *)&addr4->sin_addr.s_addr, + '\n', NULL) == 0) + return -EINVAL; + + addr4->sin_family = AF_INET; 
+ addr4->sin_port = htons(port_num); + + return 0; +} + +static int inet6_pton(struct net *net, const char *src, u16 port_num, + struct sockaddr_storage *addr) +{ + struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)addr; + const char *scope_delim; + int srclen = strlen(src); + + if (srclen > INET6_ADDRSTRLEN) + return -EINVAL; + + if (in6_pton(src, srclen, (u8 *)&addr6->sin6_addr.s6_addr, + '%', &scope_delim) == 0) + return -EINVAL; + + if (ipv6_addr_type(&addr6->sin6_addr) & IPV6_ADDR_LINKLOCAL && + src + srclen != scope_delim && *scope_delim == '%') { + struct net_device *dev; + char scope_id[16]; + size_t scope_len = min_t(size_t, sizeof(scope_id) - 1, + src + srclen - scope_delim - 1); + + memcpy(scope_id, scope_delim + 1, scope_len); + scope_id[scope_len] = '\0'; + + dev = dev_get_by_name(net, scope_id); + if (dev) { + addr6->sin6_scope_id = dev->ifindex; + dev_put(dev); + } else if (kstrtouint(scope_id, 0, &addr6->sin6_scope_id)) { + return -EINVAL; + } + } + + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(port_num); + + return 0; +} + +/** + * inet_pton_with_scope - convert an IPv4/IPv6 and port to socket address + * @net: net namespace (used for scope handling) + * @af: address family, AF_INET, AF_INET6 or AF_UNSPEC for either + * @src: the start of the address string + * @port: the start of the port string (or NULL for none) + * @addr: output socket address + * + * Return zero on success, return errno when any error occurs. + */ +int inet_pton_with_scope(struct net *net, __kernel_sa_family_t af, + const char *src, const char *port, struct sockaddr_storage *addr) +{ + u16 port_num; + int ret = -EINVAL; + + if (port) { + if (kstrtou16(port, 0, &port_num)) + return -EINVAL; + } else { + port_num = 0; + } + + switch (af) { + case AF_INET: + ret = inet4_pton(src, port_num, addr); + break; + case AF_INET6: + ret = inet6_pton(net, src, port_num, addr); + break; + case AF_UNSPEC: + ret = inet4_pton(src, port_num, addr); + if (ret) + ret = inet6_pton(net, src, port_num, addr); + break; + default: + pr_err("unexpected address family %d\n", af); + }; + + return ret; +} +EXPORT_SYMBOL(inet_pton_with_scope); + void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr) { diff --git a/net/dcb/dcbnl.c b/net/dcb/dcbnl.c index 3202d75329b504b6442441becc31157e0c2be7a5..93106120f98770699d28e8b82701b132144baa87 100644 --- a/net/dcb/dcbnl.c +++ b/net/dcb/dcbnl.c @@ -245,8 +245,7 @@ static int dcbnl_getpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, return -EOPNOTSUPP; ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX, - tb[DCB_ATTR_PFC_CFG], - dcbnl_pfc_up_nest); + tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; @@ -304,7 +303,7 @@ static int dcbnl_getcap(struct net_device *netdev, struct nlmsghdr *nlh, return -EOPNOTSUPP; ret = nla_parse_nested(data, DCB_CAP_ATTR_MAX, tb[DCB_ATTR_CAP], - dcbnl_cap_nest); + dcbnl_cap_nest, NULL); if (ret) return ret; @@ -348,7 +347,7 @@ static int dcbnl_getnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, return -EOPNOTSUPP; ret = nla_parse_nested(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], - dcbnl_numtcs_nest); + dcbnl_numtcs_nest, NULL); if (ret) return ret; @@ -393,7 +392,7 @@ static int dcbnl_setnumtcs(struct net_device *netdev, struct nlmsghdr *nlh, return -EOPNOTSUPP; ret = nla_parse_nested(data, DCB_NUMTCS_ATTR_MAX, tb[DCB_ATTR_NUMTCS], - dcbnl_numtcs_nest); + dcbnl_numtcs_nest, NULL); if (ret) return ret; @@ -452,7 +451,7 @@ static int dcbnl_getapp(struct 
net_device *netdev, struct nlmsghdr *nlh, return -EINVAL; ret = nla_parse_nested(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], - dcbnl_app_nest); + dcbnl_app_nest, NULL); if (ret) return ret; @@ -520,7 +519,7 @@ static int dcbnl_setapp(struct net_device *netdev, struct nlmsghdr *nlh, return -EINVAL; ret = nla_parse_nested(app_tb, DCB_APP_ATTR_MAX, tb[DCB_ATTR_APP], - dcbnl_app_nest); + dcbnl_app_nest, NULL); if (ret) return ret; @@ -577,8 +576,8 @@ static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, !netdev->dcbnl_ops->getpgbwgcfgrx) return -EOPNOTSUPP; - ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX, - tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest); + ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], + dcbnl_pg_nest, NULL); if (ret) return ret; @@ -597,8 +596,8 @@ static int __dcbnl_pg_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, data = pg_tb[DCB_PG_ATTR_TC_ALL]; else data = pg_tb[i]; - ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX, - data, dcbnl_tc_param_nest); + ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX, data, + dcbnl_tc_param_nest, NULL); if (ret) goto err_pg; @@ -735,8 +734,7 @@ static int dcbnl_setpfccfg(struct net_device *netdev, struct nlmsghdr *nlh, return -EOPNOTSUPP; ret = nla_parse_nested(data, DCB_PFC_UP_ATTR_MAX, - tb[DCB_ATTR_PFC_CFG], - dcbnl_pfc_up_nest); + tb[DCB_ATTR_PFC_CFG], dcbnl_pfc_up_nest, NULL); if (ret) return ret; @@ -791,8 +789,8 @@ static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, !netdev->dcbnl_ops->setpgbwgcfgrx) return -EOPNOTSUPP; - ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX, - tb[DCB_ATTR_PG_CFG], dcbnl_pg_nest); + ret = nla_parse_nested(pg_tb, DCB_PG_ATTR_MAX, tb[DCB_ATTR_PG_CFG], + dcbnl_pg_nest, NULL); if (ret) return ret; @@ -801,7 +799,7 @@ static int __dcbnl_pg_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, continue; ret = nla_parse_nested(param_tb, DCB_TC_ATTR_PARAM_MAX, - pg_tb[i], dcbnl_tc_param_nest); + pg_tb[i], dcbnl_tc_param_nest, NULL); if (ret) return ret; @@ -889,8 +887,8 @@ static int dcbnl_bcn_getcfg(struct net_device *netdev, struct nlmsghdr *nlh, !netdev->dcbnl_ops->getbcncfg) return -EOPNOTSUPP; - ret = nla_parse_nested(bcn_tb, DCB_BCN_ATTR_MAX, - tb[DCB_ATTR_BCN], dcbnl_bcn_nest); + ret = nla_parse_nested(bcn_tb, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], + dcbnl_bcn_nest, NULL); if (ret) return ret; @@ -948,9 +946,8 @@ static int dcbnl_bcn_setcfg(struct net_device *netdev, struct nlmsghdr *nlh, !netdev->dcbnl_ops->setbcnrp) return -EOPNOTSUPP; - ret = nla_parse_nested(data, DCB_BCN_ATTR_MAX, - tb[DCB_ATTR_BCN], - dcbnl_pfc_up_nest); + ret = nla_parse_nested(data, DCB_BCN_ATTR_MAX, tb[DCB_ATTR_BCN], + dcbnl_pfc_up_nest, NULL); if (ret) return ret; @@ -1424,8 +1421,8 @@ static int dcbnl_ieee_set(struct net_device *netdev, struct nlmsghdr *nlh, if (!tb[DCB_ATTR_IEEE]) return -EINVAL; - err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX, - tb[DCB_ATTR_IEEE], dcbnl_ieee_policy); + err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], + dcbnl_ieee_policy, NULL); if (err) return err; @@ -1508,8 +1505,8 @@ static int dcbnl_ieee_del(struct net_device *netdev, struct nlmsghdr *nlh, if (!tb[DCB_ATTR_IEEE]) return -EINVAL; - err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX, - tb[DCB_ATTR_IEEE], dcbnl_ieee_policy); + err = nla_parse_nested(ieee, DCB_ATTR_IEEE_MAX, tb[DCB_ATTR_IEEE], + dcbnl_ieee_policy, NULL); if (err) return err; @@ -1581,8 +1578,8 @@ static int dcbnl_getfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, if 
(!tb[DCB_ATTR_FEATCFG]) return -EINVAL; - ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], - dcbnl_featcfg_nest); + ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX, + tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) return ret; @@ -1625,8 +1622,8 @@ static int dcbnl_setfeatcfg(struct net_device *netdev, struct nlmsghdr *nlh, if (!tb[DCB_ATTR_FEATCFG]) return -EINVAL; - ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX, tb[DCB_ATTR_FEATCFG], - dcbnl_featcfg_nest); + ret = nla_parse_nested(data, DCB_FEATCFG_ATTR_MAX, + tb[DCB_ATTR_FEATCFG], dcbnl_featcfg_nest, NULL); if (ret) goto err; @@ -1699,7 +1696,8 @@ static const struct reply_func reply_funcs[DCB_CMD_MAX+1] = { [DCB_CMD_CEE_GET] = { RTM_GETDCB, dcbnl_cee_get }, }; -static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh) +static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct net_device *netdev; @@ -1715,7 +1713,7 @@ static int dcb_doit(struct sk_buff *skb, struct nlmsghdr *nlh) return -EPERM; ret = nlmsg_parse(nlh, sizeof(*dcb), tb, DCB_ATTR_MAX, - dcbnl_rtnl_policy); + dcbnl_rtnl_policy, extack); if (ret < 0) return ret; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index b99168b0fabf2a8c65defdd0b93d362630774e1a..f75482bdee9a0e5cc9ef89806b7d2d67ddb38ba1 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -951,7 +951,7 @@ static struct proto dccp_v4_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp_request_sock_ops, .twsk_prot = &dccp_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index d9b6a4e403e701fd9b9ecf92bac496e45570054e..840f14aaa01635e0f6fb62232b67814736c84b5d 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1014,7 +1014,7 @@ static struct proto dccp_v6_prot = { .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .rsk_prot = &dccp6_request_sock_ops, .twsk_prot = &dccp6_timewait_sock_ops, .h.hashinfo = &dccp_hashinfo, diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 7de5b40a5d0d1245ad995877f779e0d87d1cf398..405483a07efc7ac2efcfe86e285a7673547c9691 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -132,6 +132,7 @@ Version 0.0.6 2.1.110 07-aug-98 Eduardo Marcelo Serrat #include #include #include +#include #include #include #include @@ -1469,18 +1470,18 @@ static int __dn_setsockopt(struct socket *sock, int level,int optname, char __us case DSO_NODELAY: if (optlen != sizeof(int)) return -EINVAL; - if (scp->nonagle == 2) + if (scp->nonagle == TCP_NAGLE_CORK) return -EINVAL; - scp->nonagle = (u.val == 0) ? 0 : 1; + scp->nonagle = (u.val == 0) ? 0 : TCP_NAGLE_OFF; /* if (scp->nonagle == 1) { Push pending frames } */ break; case DSO_CORK: if (optlen != sizeof(int)) return -EINVAL; - if (scp->nonagle == 1) + if (scp->nonagle == TCP_NAGLE_OFF) return -EINVAL; - scp->nonagle = (u.val == 0) ? 0 : 2; + scp->nonagle = (u.val == 0) ? 
0 : TCP_NAGLE_CORK; /* if (scp->nonagle == 0) { Push pending frames } */ break; @@ -1608,14 +1609,14 @@ static int __dn_getsockopt(struct socket *sock, int level,int optname, char __us case DSO_NODELAY: if (r_len > sizeof(int)) r_len = sizeof(int); - val = (scp->nonagle == 1); + val = (scp->nonagle == TCP_NAGLE_OFF); r_data = &val; break; case DSO_CORK: if (r_len > sizeof(int)) r_len = sizeof(int); - val = (scp->nonagle == 2); + val = (scp->nonagle == TCP_NAGLE_CORK); r_data = &val; break; @@ -2360,7 +2361,8 @@ MODULE_AUTHOR("Linux DECnet Project Team"); MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(PF_DECnet); -static char banner[] __initdata = KERN_INFO "NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n"; +static const char banner[] __initconst = KERN_INFO +"NET4: DECnet for Linux: V.2.5.68s (C) 1995-2003 Linux DECnet Project Team\n"; static int __init decnet_init(void) { diff --git a/net/decnet/dn_dev.c b/net/decnet/dn_dev.c index 8fdd9f492b0ea2201db70cfe83ebe1f3384c7b04..9017a9a73ab5885002b9d63047e6bd5a7b91cc4b 100644 --- a/net/decnet/dn_dev.c +++ b/net/decnet/dn_dev.c @@ -565,7 +565,8 @@ static const struct nla_policy dn_ifa_policy[IFA_MAX+1] = { [IFA_FLAGS] = { .type = NLA_U32 }, }; -static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) +static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; @@ -581,7 +582,8 @@ static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) if (!net_eq(net, &init_net)) goto errout; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy, + extack); if (err < 0) goto errout; @@ -609,7 +611,8 @@ static int dn_nl_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } -static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) +static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; @@ -625,7 +628,8 @@ static int dn_nl_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) if (!net_eq(net, &init_net)) return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, dn_ifa_policy, + extack); if (err < 0) return err; diff --git a/net/decnet/dn_fib.c b/net/decnet/dn_fib.c index 7af0ba6157a108578182eda3ec75ec7bd0c27859..f9058ebeb635c2efff457bf73d7895129b443cde 100644 --- a/net/decnet/dn_fib.c +++ b/net/decnet/dn_fib.c @@ -501,7 +501,8 @@ static inline u32 rtm_get_table(struct nlattr *attrs[], u8 table) return table; } -static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) +static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct dn_fib_table *tb; @@ -515,7 +516,8 @@ static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) if (!net_eq(net, &init_net)) return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy); + err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy, + extack); if (err < 0) return err; @@ -526,7 +528,8 @@ static int dn_fib_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) return tb->delete(tb, r, attrs, nlh, &NETLINK_CB(skb)); } -static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) +static int dn_fib_rtm_newroute(struct 
sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct dn_fib_table *tb; @@ -540,7 +543,8 @@ static int dn_fib_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) if (!net_eq(net, &init_net)) return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy); + err = nlmsg_parse(nlh, sizeof(*r), attrs, RTA_MAX, rtm_dn_policy, + extack); if (err < 0) return err; diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 482730cd8a562e048b08f275551361ca813a8792..eeb5fc561f800f042023f95e88f0fe5fd64d3f52 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -110,7 +110,7 @@ struct neigh_table dn_neigh_table = { static int dn_neigh_construct(struct neighbour *neigh) { struct net_device *dev = neigh->dev; - struct dn_neigh *dn = (struct dn_neigh *)neigh; + struct dn_neigh *dn = container_of(neigh, struct dn_neigh, n); struct dn_dev *dn_db; struct neigh_parms *parms; @@ -339,7 +339,7 @@ int dn_to_neigh_output(struct net *net, struct sock *sk, struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct dn_route *rt = (struct dn_route *) dst; struct neighbour *neigh = rt->n; - struct dn_neigh *dn = (struct dn_neigh *)neigh; + struct dn_neigh *dn = container_of(neigh, struct dn_neigh, n); struct dn_dev *dn_db; bool use_long; @@ -391,7 +391,7 @@ int dn_neigh_router_hello(struct net *net, struct sock *sk, struct sk_buff *skb) neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1); - dn = (struct dn_neigh *)neigh; + dn = container_of(neigh, struct dn_neigh, n); if (neigh) { write_lock(&neigh->lock); @@ -451,7 +451,7 @@ int dn_neigh_endnode_hello(struct net *net, struct sock *sk, struct sk_buff *skb neigh = __neigh_lookup(&dn_neigh_table, &src, skb->dev, 1); - dn = (struct dn_neigh *)neigh; + dn = container_of(neigh, struct dn_neigh, n); if (neigh) { write_lock(&neigh->lock); @@ -510,7 +510,7 @@ static void neigh_elist_cb(struct neighbour *neigh, void *_info) if (neigh->dev != s->dev) return; - dn = (struct dn_neigh *) neigh; + dn = container_of(neigh, struct dn_neigh, n); if (!(dn->flags & (DN_NDFLAG_R1|DN_NDFLAG_R2))) return; @@ -549,7 +549,7 @@ int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n) static inline void dn_neigh_format_entry(struct seq_file *seq, struct neighbour *n) { - struct dn_neigh *dn = (struct dn_neigh *) n; + struct dn_neigh *dn = container_of(n, struct dn_neigh, n); char buf[DN_ASCBUF_LEN]; read_lock(&n->lock); diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index b1dc096d22f8c83e771b1df68d7815661ac51bae..4b9518a0d2489494ed88b6808f95803cf1b543ad 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -1640,7 +1640,8 @@ const struct nla_policy rtm_dn_policy[RTA_MAX + 1] = { /* * This is called by both endnodes and routers now. 
*/ -static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) +static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct rtmsg *rtm = nlmsg_data(nlh); @@ -1654,7 +1655,8 @@ static int dn_cache_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) if (!net_eq(net, &init_net)) return -EINVAL; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_dn_policy); + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_dn_policy, + extack); if (err < 0) return err; diff --git a/net/decnet/netfilter/dn_rtmsg.c b/net/decnet/netfilter/dn_rtmsg.c index 85f2fdc360c27b21cb5b9c486e2d05ffb1d557d7..1ed81ac6dd1a28b79dff9f9f4e8c1d0f99306a33 100644 --- a/net/decnet/netfilter/dn_rtmsg.c +++ b/net/decnet/netfilter/dn_rtmsg.c @@ -96,7 +96,7 @@ static unsigned int dnrmg_hook(void *priv, } -#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err)); return; } while (0) +#define RCV_SKB_FAIL(err) do { netlink_ack(skb, nlh, (err), NULL); return; } while (0) static inline void dnrmg_receive_user_skb(struct sk_buff *skb) { @@ -134,7 +134,7 @@ static int __init dn_rtmsg_init(void) return -ENOMEM; } - rv = nf_register_hook(&dnrmg_ops); + rv = nf_register_net_hook(&init_net, &dnrmg_ops); if (rv) { netlink_kernel_release(dnrmg); } @@ -144,7 +144,7 @@ static int __init dn_rtmsg_init(void) static void __exit dn_rtmsg_fini(void) { - nf_unregister_hook(&dnrmg_ops); + nf_unregister_net_hook(&init_net, &dnrmg_ops); netlink_kernel_release(dnrmg); } diff --git a/net/dsa/Kconfig b/net/dsa/Kconfig index 9649238eef404095a89d34a006dc0b504bc47038..81a0868edb1d619a6ade13b7817db90e7a2d69a2 100644 --- a/net/dsa/Kconfig +++ b/net/dsa/Kconfig @@ -6,7 +6,7 @@ config HAVE_NET_DSA config NET_DSA tristate "Distributed Switch Architecture" - depends on HAVE_NET_DSA + depends on HAVE_NET_DSA && MAY_USE_DEVLINK select NET_SWITCHDEV select PHYLIB ---help--- @@ -31,4 +31,10 @@ config NET_DSA_TAG_TRAILER config NET_DSA_TAG_QCA bool +config NET_DSA_TAG_MTK + bool + +config NET_DSA_TAG_LAN9303 + bool + endif diff --git a/net/dsa/Makefile b/net/dsa/Makefile index 31d343796251da06c979b3428f275f5911dfb2ab..0b747d75e65a265dc5059586e250a2b97ae3f067 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,6 +1,6 @@ # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o slave.o dsa2.o switch.o +dsa_core-y += dsa.o slave.o dsa2.o switch.o legacy.o # tagging formats dsa_core-$(CONFIG_NET_DSA_TAG_BRCM) += tag_brcm.o @@ -8,3 +8,5 @@ dsa_core-$(CONFIG_NET_DSA_TAG_DSA) += tag_dsa.o dsa_core-$(CONFIG_NET_DSA_TAG_EDSA) += tag_edsa.o dsa_core-$(CONFIG_NET_DSA_TAG_TRAILER) += tag_trailer.o dsa_core-$(CONFIG_NET_DSA_TAG_QCA) += tag_qca.o +dsa_core-$(CONFIG_NET_DSA_TAG_MTK) += tag_mtk.o +dsa_core-$(CONFIG_NET_DSA_TAG_LAN9303) += tag_lan9303.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index b6d4f6a23f06c9d794a5eedc4c9f79810d5b06e5..26130ae438da53f3f99a4e9fa712a572f40ca779 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -14,15 +14,17 @@ #include #include #include -#include #include #include #include #include #include +#include #include #include #include +#include +#include #include "dsa_priv.h" static struct sk_buff *dsa_slave_notag_xmit(struct sk_buff *skb, @@ -52,63 +54,16 @@ const struct dsa_device_ops *dsa_device_ops[DSA_TAG_LAST] = { #endif #ifdef CONFIG_NET_DSA_TAG_QCA [DSA_TAG_PROTO_QCA] = &qca_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_MTK + [DSA_TAG_PROTO_MTK] = &mtk_netdev_ops, +#endif +#ifdef CONFIG_NET_DSA_TAG_LAN9303 + 
[DSA_TAG_PROTO_LAN9303] = &lan9303_netdev_ops, #endif [DSA_TAG_PROTO_NONE] = &none_ops, }; -/* switch driver registration ***********************************************/ -static DEFINE_MUTEX(dsa_switch_drivers_mutex); -static LIST_HEAD(dsa_switch_drivers); - -void register_switch_driver(struct dsa_switch_driver *drv) -{ - mutex_lock(&dsa_switch_drivers_mutex); - list_add_tail(&drv->list, &dsa_switch_drivers); - mutex_unlock(&dsa_switch_drivers_mutex); -} -EXPORT_SYMBOL_GPL(register_switch_driver); - -void unregister_switch_driver(struct dsa_switch_driver *drv) -{ - mutex_lock(&dsa_switch_drivers_mutex); - list_del_init(&drv->list); - mutex_unlock(&dsa_switch_drivers_mutex); -} -EXPORT_SYMBOL_GPL(unregister_switch_driver); - -static const struct dsa_switch_ops * -dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, - const char **_name, void **priv) -{ - const struct dsa_switch_ops *ret; - struct list_head *list; - const char *name; - - ret = NULL; - name = NULL; - - mutex_lock(&dsa_switch_drivers_mutex); - list_for_each(list, &dsa_switch_drivers) { - const struct dsa_switch_ops *ops; - struct dsa_switch_driver *drv; - - drv = list_entry(list, struct dsa_switch_driver, list); - ops = drv->ops; - - name = ops->probe(parent, host_dev, sw_addr, priv); - if (name != NULL) { - ret = ops; - break; - } - } - mutex_unlock(&dsa_switch_drivers_mutex); - - *_name = name; - - return ret; -} - -/* basic switch operations **************************************************/ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, struct dsa_port *dport, int port) { @@ -140,23 +95,6 @@ int dsa_cpu_dsa_setup(struct dsa_switch *ds, struct device *dev, return 0; } -static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev) -{ - struct dsa_port *dport; - int ret, port; - - for (port = 0; port < ds->num_ports; port++) { - if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) - continue; - - dport = &ds->ports[port]; - ret = dsa_cpu_dsa_setup(ds, dev, dport, port); - if (ret) - return ret; - } - return 0; -} - const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol) { const struct dsa_device_ops *ops; @@ -206,168 +144,6 @@ void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds) master->ethtool_ops = ds->dst->master_orig_ethtool_ops; } -static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) -{ - const struct dsa_switch_ops *ops = ds->ops; - struct dsa_switch_tree *dst = ds->dst; - struct dsa_chip_data *cd = ds->cd; - bool valid_name_found = false; - int index = ds->index; - int i, ret; - - /* - * Validate supplied switch configuration. - */ - for (i = 0; i < ds->num_ports; i++) { - char *name; - - name = cd->port_names[i]; - if (name == NULL) - continue; - - if (!strcmp(name, "cpu")) { - if (dst->cpu_switch) { - netdev_err(dst->master_netdev, - "multiple cpu ports?!\n"); - return -EINVAL; - } - dst->cpu_switch = ds; - dst->cpu_port = i; - ds->cpu_port_mask |= 1 << i; - } else if (!strcmp(name, "dsa")) { - ds->dsa_port_mask |= 1 << i; - } else { - ds->enabled_port_mask |= 1 << i; - } - valid_name_found = true; - } - - if (!valid_name_found && i == ds->num_ports) - return -EINVAL; - - /* Make the built-in MII bus mask match the number of ports, - * switch drivers can override this later - */ - ds->phys_mii_mask = ds->enabled_port_mask; - - /* - * If the CPU connects to this switch, set the switch tree - * tagging protocol to the preferred tagging format of this - * switch. 
- */ - if (dst->cpu_switch == ds) { - enum dsa_tag_protocol tag_protocol; - - tag_protocol = ops->get_tag_protocol(ds); - dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); - if (IS_ERR(dst->tag_ops)) - return PTR_ERR(dst->tag_ops); - - dst->rcv = dst->tag_ops->rcv; - } - - memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable)); - - /* - * Do basic register setup. - */ - ret = ops->setup(ds); - if (ret < 0) - return ret; - - ret = dsa_switch_register_notifier(ds); - if (ret) - return ret; - - if (ops->set_addr) { - ret = ops->set_addr(ds, dst->master_netdev->dev_addr); - if (ret < 0) - return ret; - } - - if (!ds->slave_mii_bus && ops->phy_read) { - ds->slave_mii_bus = devm_mdiobus_alloc(parent); - if (!ds->slave_mii_bus) - return -ENOMEM; - dsa_slave_mii_bus_init(ds); - - ret = mdiobus_register(ds->slave_mii_bus); - if (ret < 0) - return ret; - } - - /* - * Create network devices for physical switch ports. - */ - for (i = 0; i < ds->num_ports; i++) { - ds->ports[i].dn = cd->port_dn[i]; - - if (!(ds->enabled_port_mask & (1 << i))) - continue; - - ret = dsa_slave_create(ds, parent, i, cd->port_names[i]); - if (ret < 0) - netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n", - index, i, cd->port_names[i], ret); - } - - /* Perform configuration of the CPU and DSA ports */ - ret = dsa_cpu_dsa_setups(ds, parent); - if (ret < 0) - netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n", - index); - - ret = dsa_cpu_port_ethtool_setup(ds); - if (ret) - return ret; - - return 0; -} - -static struct dsa_switch * -dsa_switch_setup(struct dsa_switch_tree *dst, int index, - struct device *parent, struct device *host_dev) -{ - struct dsa_chip_data *cd = dst->pd->chip + index; - const struct dsa_switch_ops *ops; - struct dsa_switch *ds; - int ret; - const char *name; - void *priv; - - /* - * Probe for switch model. - */ - ops = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv); - if (!ops) { - netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n", - index); - return ERR_PTR(-EINVAL); - } - netdev_info(dst->master_netdev, "[%d]: detected a %s switch\n", - index, name); - - - /* - * Allocate and initialise switch state. - */ - ds = dsa_switch_alloc(parent, DSA_MAX_PORTS); - if (!ds) - return ERR_PTR(-ENOMEM); - - ds->dst = dst; - ds->index = index; - ds->cd = cd; - ds->ops = ops; - ds->priv = priv; - - ret = dsa_switch_setup_one(ds, parent); - if (ret) - return ERR_PTR(ret); - - return ds; -} - void dsa_cpu_dsa_destroy(struct dsa_port *port) { struct device_node *port_dn = port->dn; @@ -376,86 +152,6 @@ void dsa_cpu_dsa_destroy(struct dsa_port *port) of_phy_deregister_fixed_link(port_dn); } -static void dsa_switch_destroy(struct dsa_switch *ds) -{ - int port; - - /* Destroy network devices for physical switch ports. 
*/ - for (port = 0; port < ds->num_ports; port++) { - if (!(ds->enabled_port_mask & (1 << port))) - continue; - - if (!ds->ports[port].netdev) - continue; - - dsa_slave_destroy(ds->ports[port].netdev); - } - - /* Disable configuration of the CPU and DSA ports */ - for (port = 0; port < ds->num_ports; port++) { - if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) - continue; - dsa_cpu_dsa_destroy(&ds->ports[port]); - - /* Clearing a bit which is not set does no harm */ - ds->cpu_port_mask |= ~(1 << port); - ds->dsa_port_mask |= ~(1 << port); - } - - if (ds->slave_mii_bus && ds->ops->phy_read) - mdiobus_unregister(ds->slave_mii_bus); - - dsa_switch_unregister_notifier(ds); -} - -#ifdef CONFIG_PM_SLEEP -int dsa_switch_suspend(struct dsa_switch *ds) -{ - int i, ret = 0; - - /* Suspend slave network devices */ - for (i = 0; i < ds->num_ports; i++) { - if (!dsa_is_port_initialized(ds, i)) - continue; - - ret = dsa_slave_suspend(ds->ports[i].netdev); - if (ret) - return ret; - } - - if (ds->ops->suspend) - ret = ds->ops->suspend(ds); - - return ret; -} -EXPORT_SYMBOL_GPL(dsa_switch_suspend); - -int dsa_switch_resume(struct dsa_switch *ds) -{ - int i, ret = 0; - - if (ds->ops->resume) - ret = ds->ops->resume(ds); - - if (ret) - return ret; - - /* Resume slave network devices */ - for (i = 0; i < ds->num_ports; i++) { - if (!dsa_is_port_initialized(ds, i)) - continue; - - ret = dsa_slave_resume(ds->ports[i].netdev); - if (ret) - return ret; - } - - return 0; -} -EXPORT_SYMBOL_GPL(dsa_switch_resume); -#endif - -/* platform driver init and cleanup *****************************************/ static int dev_is_class(struct device *dev, void *class) { if (dev->class != NULL && !strcmp(dev->class->name, class)) @@ -474,24 +170,6 @@ static struct device *dev_find_class(struct device *parent, char *class) return device_find_child(parent, class, dev_is_class); } -struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev) -{ - struct device *d; - - d = dev_find_class(dev, "mdio_bus"); - if (d != NULL) { - struct mii_bus *bus; - - bus = to_mii_bus(d); - put_device(d); - - return bus; - } - - return NULL; -} -EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus); - struct net_device *dsa_dev_to_net_device(struct device *dev) { struct device *d; @@ -511,456 +189,43 @@ struct net_device *dsa_dev_to_net_device(struct device *dev) } EXPORT_SYMBOL_GPL(dsa_dev_to_net_device); -#ifdef CONFIG_OF -static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, - struct dsa_chip_data *cd, - int chip_index, int port_index, - struct device_node *link) -{ - const __be32 *reg; - int link_sw_addr; - struct device_node *parent_sw; - int len; - - parent_sw = of_get_parent(link); - if (!parent_sw) - return -EINVAL; - - reg = of_get_property(parent_sw, "reg", &len); - if (!reg || (len != sizeof(*reg) * 2)) - return -EINVAL; - - /* - * Get the destination switch number from the second field of its 'reg' - * property, i.e. for "reg = <0x19 1>" sw_addr is '1'. 
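
A worked example of the resulting routing table, for a hypothetical two-chip tree:

/* Assumed device tree fragment:
 *
 *	switch0 { reg = <0x10 0>;
 *		port@5 { reg = <5>; label = "dsa"; link = <&sw1port>; };
 *	};
 *	switch1 { reg = <0x19 1>;
 *		sw1port: port@6 { reg = <6>; label = "dsa"; };
 *	};
 *
 * The link phandle's parent switch carries sw_addr 1 in the second
 * 'reg' cell, so dsa_of_setup_routing_table() records
 *
 *	pd->chip[0].rtable[1] = 5;
 *
 * i.e. frames from switch 0 towards switch 1 egress local port 5.
 * dsa_switch_setup_one() later memcpy()s this into ds->rtable.
 */
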
- */ - link_sw_addr = be32_to_cpup(reg + 1); - - if (link_sw_addr >= pd->nr_chips) - return -EINVAL; - - cd->rtable[link_sw_addr] = port_index; - - return 0; -} - -static int dsa_of_probe_links(struct dsa_platform_data *pd, - struct dsa_chip_data *cd, - int chip_index, int port_index, - struct device_node *port, - const char *port_name) -{ - struct device_node *link; - int link_index; - int ret; - - for (link_index = 0;; link_index++) { - link = of_parse_phandle(port, "link", link_index); - if (!link) - break; - - if (!strcmp(port_name, "dsa") && pd->nr_chips > 1) { - ret = dsa_of_setup_routing_table(pd, cd, chip_index, - port_index, link); - if (ret) - return ret; - } - } - return 0; -} - -static void dsa_of_free_platform_data(struct dsa_platform_data *pd) -{ - int i; - int port_index; - - for (i = 0; i < pd->nr_chips; i++) { - port_index = 0; - while (port_index < DSA_MAX_PORTS) { - kfree(pd->chip[i].port_names[port_index]); - port_index++; - } - - /* Drop our reference to the MDIO bus device */ - if (pd->chip[i].host_dev) - put_device(pd->chip[i].host_dev); - } - kfree(pd->chip); -} - -static int dsa_of_probe(struct device *dev) -{ - struct device_node *np = dev->of_node; - struct device_node *child, *mdio, *ethernet, *port; - struct mii_bus *mdio_bus, *mdio_bus_switch; - struct net_device *ethernet_dev; - struct dsa_platform_data *pd; - struct dsa_chip_data *cd; - const char *port_name; - int chip_index, port_index; - const unsigned int *sw_addr, *port_reg; - u32 eeprom_len; - int ret; - - mdio = of_parse_phandle(np, "dsa,mii-bus", 0); - if (!mdio) - return -EINVAL; - - mdio_bus = of_mdio_find_bus(mdio); - if (!mdio_bus) - return -EPROBE_DEFER; - - ethernet = of_parse_phandle(np, "dsa,ethernet", 0); - if (!ethernet) { - ret = -EINVAL; - goto out_put_mdio; - } - - ethernet_dev = of_find_net_device_by_node(ethernet); - if (!ethernet_dev) { - ret = -EPROBE_DEFER; - goto out_put_mdio; - } - - pd = kzalloc(sizeof(*pd), GFP_KERNEL); - if (!pd) { - ret = -ENOMEM; - goto out_put_ethernet; - } - - dev->platform_data = pd; - pd->of_netdev = ethernet_dev; - pd->nr_chips = of_get_available_child_count(np); - if (pd->nr_chips > DSA_MAX_SWITCHES) - pd->nr_chips = DSA_MAX_SWITCHES; - - pd->chip = kcalloc(pd->nr_chips, sizeof(struct dsa_chip_data), - GFP_KERNEL); - if (!pd->chip) { - ret = -ENOMEM; - goto out_free; - } - - chip_index = -1; - for_each_available_child_of_node(np, child) { - int i; - - chip_index++; - cd = &pd->chip[chip_index]; - - cd->of_node = child; - - /* Initialize the routing table */ - for (i = 0; i < DSA_MAX_SWITCHES; ++i) - cd->rtable[i] = DSA_RTABLE_NONE; - - /* When assigning the host device, increment its refcount */ - cd->host_dev = get_device(&mdio_bus->dev); - - sw_addr = of_get_property(child, "reg", NULL); - if (!sw_addr) - continue; - - cd->sw_addr = be32_to_cpup(sw_addr); - if (cd->sw_addr >= PHY_MAX_ADDR) - continue; - - if (!of_property_read_u32(child, "eeprom-length", &eeprom_len)) - cd->eeprom_len = eeprom_len; - - mdio = of_parse_phandle(child, "mii-bus", 0); - if (mdio) { - mdio_bus_switch = of_mdio_find_bus(mdio); - if (!mdio_bus_switch) { - ret = -EPROBE_DEFER; - goto out_free_chip; - } - - /* Drop the mdio_bus device ref, replacing the host - * device with the mdio_bus_switch device, keeping - * the refcount from of_mdio_find_bus() above. 
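
The reference counting around the host device pairs up as follows:

/* of_mdio_find_bus(mdio)        +1 on the tree MDIO bus (probe-local)
 * get_device(&mdio_bus->dev)    +1 per child chip, stored in cd->host_dev
 * put_device(cd->host_dev)      dropped only when a per-chip "mii-bus"
 *                               replaces it, keeping the reference that
 *                               of_mdio_find_bus() took on the new bus
 * put_device(&mdio_bus->dev)    probe-local reference dropped on exit
 * dsa_of_free_platform_data()   finally releases every cd->host_dev
 */
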
-			 */
-			put_device(cd->host_dev);
-			cd->host_dev = &mdio_bus_switch->dev;
-		}
-
-		for_each_available_child_of_node(child, port) {
-			port_reg = of_get_property(port, "reg", NULL);
-			if (!port_reg)
-				continue;
-
-			port_index = be32_to_cpup(port_reg);
-			if (port_index >= DSA_MAX_PORTS)
-				break;
-
-			port_name = of_get_property(port, "label", NULL);
-			if (!port_name)
-				continue;
-
-			cd->port_dn[port_index] = port;
-
-			cd->port_names[port_index] = kstrdup(port_name,
-							     GFP_KERNEL);
-			if (!cd->port_names[port_index]) {
-				ret = -ENOMEM;
-				goto out_free_chip;
-			}
-
-			ret = dsa_of_probe_links(pd, cd, chip_index,
-						 port_index, port, port_name);
-			if (ret)
-				goto out_free_chip;
-
-		}
-	}
-
-	/* The individual chips hold their own refcount on the mdio bus,
-	 * so drop ours */
-	put_device(&mdio_bus->dev);
-
-	return 0;
-
-out_free_chip:
-	dsa_of_free_platform_data(pd);
-out_free:
-	kfree(pd);
-	dev->platform_data = NULL;
-out_put_ethernet:
-	put_device(&ethernet_dev->dev);
-out_put_mdio:
-	put_device(&mdio_bus->dev);
-	return ret;
-}
-
-static void dsa_of_remove(struct device *dev)
-{
-	struct dsa_platform_data *pd = dev->platform_data;
-
-	if (!dev->of_node)
-		return;
-
-	dsa_of_free_platform_data(pd);
-	put_device(&pd->of_netdev->dev);
-	kfree(pd);
-}
-#else
-static inline int dsa_of_probe(struct device *dev)
-{
-	return 0;
-}
-
-static inline void dsa_of_remove(struct device *dev)
-{
-}
-#endif
-
-static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
-			 struct device *parent, struct dsa_platform_data *pd)
-{
-	int i;
-	unsigned configured = 0;
-
-	dst->pd = pd;
-	dst->master_netdev = dev;
-	dst->cpu_port = -1;
-
-	for (i = 0; i < pd->nr_chips; i++) {
-		struct dsa_switch *ds;
-
-		ds = dsa_switch_setup(dst, i, parent, pd->chip[i].host_dev);
-		if (IS_ERR(ds)) {
-			netdev_err(dev, "[%d]: couldn't create dsa switch instance (error %ld)\n",
-				   i, PTR_ERR(ds));
-			continue;
-		}
-
-		dst->ds[i] = ds;
-
-		++configured;
-	}
-
-	/*
-	 * If no switch was found, exit cleanly
-	 */
-	if (!configured)
-		return -EPROBE_DEFER;
-
-	/*
-	 * If we use a tagging format that doesn't have an ethertype
-	 * field, make sure that all packets from this point on get
-	 * sent to the tag format's receive function.
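
With the receive path centralized in dsa_switch_rcv(), a tagger's rcv() no longer unshares the skb or feeds it to the stack itself; it only validates and strips its tag, points skb->dev at the slave, and returns the skb, or NULL to make the caller drop it. A minimal tagger in the spirit of tag_trailer.c (a sketch; the 4-byte trailer and all 'my_' names are hypothetical):

static struct sk_buff *my_tag_rcv(struct sk_buff *skb, struct net_device *dev,
				  struct packet_type *pt,
				  struct net_device *orig_dev)
{
	struct dsa_switch_tree *dst = dev->dsa_ptr;
	struct dsa_switch *ds = dst->cpu_switch;
	u8 *trailer;
	int source_port;

	if (skb_linearize(skb))
		return NULL;			/* caller kfree_skb()s for us */

	trailer = skb_tail_pointer(skb) - 4;
	source_port = trailer[1] & 7;		/* decode the made-up tag */
	if (source_port >= ds->num_ports || !ds->ports[source_port].netdev)
		return NULL;

	pskb_trim_rcsum(skb, skb->len - 4);	/* strip the tag */
	skb->dev = ds->ports[source_port].netdev;

	return skb;	/* core does eth_type_trans(), stats, netif_receive_skb() */
}
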
- */ - wmb(); - dev->dsa_ptr = (void *)dst; - - return 0; -} - -static int dsa_probe(struct platform_device *pdev) -{ - struct dsa_platform_data *pd = pdev->dev.platform_data; - struct net_device *dev; - struct dsa_switch_tree *dst; - int ret; - - if (pdev->dev.of_node) { - ret = dsa_of_probe(&pdev->dev); - if (ret) - return ret; - - pd = pdev->dev.platform_data; - } - - if (pd == NULL || (pd->netdev == NULL && pd->of_netdev == NULL)) - return -EINVAL; - - if (pd->of_netdev) { - dev = pd->of_netdev; - dev_hold(dev); - } else { - dev = dsa_dev_to_net_device(pd->netdev); - } - if (dev == NULL) { - ret = -EPROBE_DEFER; - goto out; - } - - if (dev->dsa_ptr != NULL) { - dev_put(dev); - ret = -EEXIST; - goto out; - } - - dst = devm_kzalloc(&pdev->dev, sizeof(*dst), GFP_KERNEL); - if (dst == NULL) { - dev_put(dev); - ret = -ENOMEM; - goto out; - } - - platform_set_drvdata(pdev, dst); - - ret = dsa_setup_dst(dst, dev, &pdev->dev, pd); - if (ret) { - dev_put(dev); - goto out; - } - - return 0; - -out: - dsa_of_remove(&pdev->dev); - - return ret; -} - -static void dsa_remove_dst(struct dsa_switch_tree *dst) -{ - int i; - - dst->master_netdev->dsa_ptr = NULL; - - /* If we used a tagging format that doesn't have an ethertype - * field, make sure that all packets from this point get sent - * without the tag and go through the regular receive path. - */ - wmb(); - - for (i = 0; i < dst->pd->nr_chips; i++) { - struct dsa_switch *ds = dst->ds[i]; - - if (ds) - dsa_switch_destroy(ds); - } - - dsa_cpu_port_ethtool_restore(dst->cpu_switch); - - dev_put(dst->master_netdev); -} - -static int dsa_remove(struct platform_device *pdev) -{ - struct dsa_switch_tree *dst = platform_get_drvdata(pdev); - - dsa_remove_dst(dst); - dsa_of_remove(&pdev->dev); - - return 0; -} - -static void dsa_shutdown(struct platform_device *pdev) -{ -} - static int dsa_switch_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct dsa_switch_tree *dst = dev->dsa_ptr; + struct sk_buff *nskb = NULL; if (unlikely(dst == NULL)) { kfree_skb(skb); return 0; } - return dst->rcv(skb, dev, pt, orig_dev); -} - -static struct packet_type dsa_pack_type __read_mostly = { - .type = cpu_to_be16(ETH_P_XDSA), - .func = dsa_switch_rcv, -}; - -#ifdef CONFIG_PM_SLEEP -static int dsa_suspend(struct device *d) -{ - struct platform_device *pdev = to_platform_device(d); - struct dsa_switch_tree *dst = platform_get_drvdata(pdev); - int i, ret = 0; - - for (i = 0; i < dst->pd->nr_chips; i++) { - struct dsa_switch *ds = dst->ds[i]; + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + return 0; - if (ds != NULL) - ret = dsa_switch_suspend(ds); + nskb = dst->rcv(skb, dev, pt, orig_dev); + if (!nskb) { + kfree_skb(skb); + return 0; } - return ret; -} - -static int dsa_resume(struct device *d) -{ - struct platform_device *pdev = to_platform_device(d); - struct dsa_switch_tree *dst = platform_get_drvdata(pdev); - int i, ret = 0; + skb = nskb; + skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; + skb->protocol = eth_type_trans(skb, skb->dev); - for (i = 0; i < dst->pd->nr_chips; i++) { - struct dsa_switch *ds = dst->ds[i]; + skb->dev->stats.rx_packets++; + skb->dev->stats.rx_bytes += skb->len; - if (ds != NULL) - ret = dsa_switch_resume(ds); - } + netif_receive_skb(skb); - return ret; + return 0; } -#endif - -static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume); -static const struct of_device_id dsa_of_match_table[] = { - { .compatible = "marvell,dsa", }, - {} -}; -MODULE_DEVICE_TABLE(of, 
dsa_of_match_table); - -static struct platform_driver dsa_driver = { - .probe = dsa_probe, - .remove = dsa_remove, - .shutdown = dsa_shutdown, - .driver = { - .name = "dsa", - .of_match_table = dsa_of_match_table, - .pm = &dsa_pm_ops, - }, +static struct packet_type dsa_pack_type __read_mostly = { + .type = cpu_to_be16(ETH_P_XDSA), + .func = dsa_switch_rcv, }; static int __init dsa_init_module(void) @@ -971,7 +236,7 @@ static int __init dsa_init_module(void) if (rc) return rc; - rc = platform_driver_register(&dsa_driver); + rc = dsa_legacy_register(); if (rc) return rc; @@ -985,7 +250,7 @@ static void __exit dsa_cleanup_module(void) { dsa_slave_unregister_notifier(); dev_remove_pack(&dsa_pack_type); - platform_driver_unregister(&dsa_driver); + dsa_legacy_unregister(); } module_exit(dsa_cleanup_module); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index 737be6470c7f27ba032d01667e039f3c03c17ae8..033b3bfb63dc1887b15b3e08f00a00f70b706ec4 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -13,16 +13,20 @@ #include #include #include +#include #include #include -#include #include #include +#include #include "dsa_priv.h" static LIST_HEAD(dsa_switch_trees); static DEFINE_MUTEX(dsa2_mutex); +static const struct devlink_ops dsa_devlink_ops = { +}; + static struct dsa_switch_tree *dsa_get_dst(u32 tree) { struct dsa_switch_tree *dst; @@ -222,12 +226,18 @@ static int dsa_dsa_port_apply(struct dsa_port *port, u32 index, return err; } - return 0; + memset(&ds->ports[index].devlink_port, 0, + sizeof(ds->ports[index].devlink_port)); + + return devlink_port_register(ds->devlink, + &ds->ports[index].devlink_port, + index); } static void dsa_dsa_port_unapply(struct dsa_port *port, u32 index, struct dsa_switch *ds) { + devlink_port_unregister(&ds->ports[index].devlink_port); dsa_cpu_dsa_destroy(port); } @@ -245,12 +255,17 @@ static int dsa_cpu_port_apply(struct dsa_port *port, u32 index, ds->cpu_port_mask |= BIT(index); - return 0; + memset(&ds->ports[index].devlink_port, 0, + sizeof(ds->ports[index].devlink_port)); + err = devlink_port_register(ds->devlink, &ds->ports[index].devlink_port, + index); + return err; } static void dsa_cpu_port_unapply(struct dsa_port *port, u32 index, struct dsa_switch *ds) { + devlink_port_unregister(&ds->ports[index].devlink_port); dsa_cpu_dsa_destroy(port); ds->cpu_port_mask &= ~BIT(index); @@ -275,12 +290,23 @@ static int dsa_user_port_apply(struct dsa_port *port, u32 index, return err; } + memset(&ds->ports[index].devlink_port, 0, + sizeof(ds->ports[index].devlink_port)); + err = devlink_port_register(ds->devlink, &ds->ports[index].devlink_port, + index); + if (err) + return err; + + devlink_port_type_eth_set(&ds->ports[index].devlink_port, + ds->ports[index].netdev); + return 0; } static void dsa_user_port_unapply(struct dsa_port *port, u32 index, struct dsa_switch *ds) { + devlink_port_unregister(&ds->ports[index].devlink_port); if (ds->ports[index].netdev) { dsa_slave_destroy(ds->ports[index].netdev); ds->ports[index].netdev = NULL; @@ -301,6 +327,17 @@ static int dsa_ds_apply(struct dsa_switch_tree *dst, struct dsa_switch *ds) */ ds->phys_mii_mask = ds->enabled_port_mask; + /* Add the switch to devlink before calling setup, so that setup can + * add dpipe tables + */ + ds->devlink = devlink_alloc(&dsa_devlink_ops, 0); + if (!ds->devlink) + return -ENOMEM; + + err = devlink_register(ds->devlink, ds->dev); + if (err) + return err; + err = ds->ops->setup(ds); if (err < 0) return err; @@ -381,6 +418,13 @@ static void dsa_ds_unapply(struct dsa_switch_tree *dst, struct 
dsa_switch *ds) mdiobus_unregister(ds->slave_mii_bus); dsa_switch_unregister_notifier(ds); + + if (ds->devlink) { + devlink_unregister(ds->devlink); + devlink_free(ds->devlink); + ds->devlink = NULL; + } + } static int dsa_dst_apply(struct dsa_switch_tree *dst) diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 0706a511244e92ff0174173eb388a41ff59141f4..f4a88e4852138864081c1871fc882803e53247db 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -17,8 +17,9 @@ struct dsa_device_ops { struct sk_buff *(*xmit)(struct sk_buff *skb, struct net_device *dev); - int (*rcv)(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev); + struct sk_buff *(*rcv)(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev); }; struct dsa_slave_priv { @@ -54,6 +55,10 @@ const struct dsa_device_ops *dsa_resolve_tag_protocol(int tag_protocol); int dsa_cpu_port_ethtool_setup(struct dsa_switch *ds); void dsa_cpu_port_ethtool_restore(struct dsa_switch *ds); +/* legacy.c */ +int dsa_legacy_register(void); +void dsa_legacy_unregister(void); + /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; void dsa_slave_mii_bus_init(struct dsa_switch *ds); @@ -85,4 +90,10 @@ extern const struct dsa_device_ops brcm_netdev_ops; /* tag_qca.c */ extern const struct dsa_device_ops qca_netdev_ops; +/* tag_mtk.c */ +extern const struct dsa_device_ops mtk_netdev_ops; + +/* tag_lan9303.c */ +extern const struct dsa_device_ops lan9303_netdev_ops; + #endif diff --git a/net/dsa/legacy.c b/net/dsa/legacy.c new file mode 100644 index 0000000000000000000000000000000000000000..ad345c8b0b0693cc214b212ccbb402859d910134 --- /dev/null +++ b/net/dsa/legacy.c @@ -0,0 +1,818 @@ +/* + * net/dsa/legacy.c - Hardware switch handling + * Copyright (c) 2008-2009 Marvell Semiconductor + * Copyright (c) 2013 Florian Fainelli + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "dsa_priv.h" + +/* switch driver registration ***********************************************/ +static DEFINE_MUTEX(dsa_switch_drivers_mutex); +static LIST_HEAD(dsa_switch_drivers); + +void register_switch_driver(struct dsa_switch_driver *drv) +{ + mutex_lock(&dsa_switch_drivers_mutex); + list_add_tail(&drv->list, &dsa_switch_drivers); + mutex_unlock(&dsa_switch_drivers_mutex); +} +EXPORT_SYMBOL_GPL(register_switch_driver); + +void unregister_switch_driver(struct dsa_switch_driver *drv) +{ + mutex_lock(&dsa_switch_drivers_mutex); + list_del_init(&drv->list); + mutex_unlock(&dsa_switch_drivers_mutex); +} +EXPORT_SYMBOL_GPL(unregister_switch_driver); + +static const struct dsa_switch_ops * +dsa_switch_probe(struct device *parent, struct device *host_dev, int sw_addr, + const char **_name, void **priv) +{ + const struct dsa_switch_ops *ret; + struct list_head *list; + const char *name; + + ret = NULL; + name = NULL; + + mutex_lock(&dsa_switch_drivers_mutex); + list_for_each(list, &dsa_switch_drivers) { + const struct dsa_switch_ops *ops; + struct dsa_switch_driver *drv; + + drv = list_entry(list, struct dsa_switch_driver, list); + ops = drv->ops; + + name = ops->probe(parent, host_dev, sw_addr, priv); + if (name != NULL) { + ret = ops; + break; + } + } + mutex_unlock(&dsa_switch_drivers_mutex); + + *_name = name; + + return ret; +} + +/* basic switch operations **************************************************/ +static int dsa_cpu_dsa_setups(struct dsa_switch *ds, struct device *dev) +{ + struct dsa_port *dport; + int ret, port; + + for (port = 0; port < ds->num_ports; port++) { + if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) + continue; + + dport = &ds->ports[port]; + ret = dsa_cpu_dsa_setup(ds, dev, dport, port); + if (ret) + return ret; + } + return 0; +} + +static int dsa_switch_setup_one(struct dsa_switch *ds, struct device *parent) +{ + const struct dsa_switch_ops *ops = ds->ops; + struct dsa_switch_tree *dst = ds->dst; + struct dsa_chip_data *cd = ds->cd; + bool valid_name_found = false; + int index = ds->index; + int i, ret; + + /* + * Validate supplied switch configuration. + */ + for (i = 0; i < ds->num_ports; i++) { + char *name; + + name = cd->port_names[i]; + if (name == NULL) + continue; + + if (!strcmp(name, "cpu")) { + if (dst->cpu_switch) { + netdev_err(dst->master_netdev, + "multiple cpu ports?!\n"); + return -EINVAL; + } + dst->cpu_switch = ds; + dst->cpu_port = i; + ds->cpu_port_mask |= 1 << i; + } else if (!strcmp(name, "dsa")) { + ds->dsa_port_mask |= 1 << i; + } else { + ds->enabled_port_mask |= 1 << i; + } + valid_name_found = true; + } + + if (!valid_name_found && i == ds->num_ports) + return -EINVAL; + + /* Make the built-in MII bus mask match the number of ports, + * switch drivers can override this later + */ + ds->phys_mii_mask = ds->enabled_port_mask; + + /* + * If the CPU connects to this switch, set the switch tree + * tagging protocol to the preferred tagging format of this + * switch. + */ + if (dst->cpu_switch == ds) { + enum dsa_tag_protocol tag_protocol; + + tag_protocol = ops->get_tag_protocol(ds); + dst->tag_ops = dsa_resolve_tag_protocol(tag_protocol); + if (IS_ERR(dst->tag_ops)) + return PTR_ERR(dst->tag_ops); + + dst->rcv = dst->tag_ops->rcv; + } + + memcpy(ds->rtable, cd->rtable, sizeof(ds->rtable)); + + /* + * Do basic register setup. 
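
For reference, this is the binding the legacy path preserves: a driver registers a dsa_switch_driver whose probe() recognizes the chip behind (host_dev, sw_addr) and returns a model name, and dsa_switch_probe() above takes the first driver that claims the address. An illustrative skeleton (every 'my_' identifier is hypothetical):

static const char *my_drv_probe(struct device *parent, struct device *host_dev,
				int sw_addr, void **priv)
{
	/* read an ID register over the host MDIO bus at sw_addr here,
	 * optionally allocate *priv, and return the model name, or
	 * NULL if the chip is not ours
	 */
	return "MY6060";
}

static const struct dsa_switch_ops my_drv_ops = {
	.probe	= my_drv_probe,
	/* .get_tag_protocol, .setup, .phy_read, ... */
};

static struct dsa_switch_driver my_drv = {
	.ops = &my_drv_ops,
};

static int __init my_drv_init(void)
{
	register_switch_driver(&my_drv);
	return 0;
}
module_init(my_drv_init);
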
+ */ + ret = ops->setup(ds); + if (ret < 0) + return ret; + + ret = dsa_switch_register_notifier(ds); + if (ret) + return ret; + + if (ops->set_addr) { + ret = ops->set_addr(ds, dst->master_netdev->dev_addr); + if (ret < 0) + return ret; + } + + if (!ds->slave_mii_bus && ops->phy_read) { + ds->slave_mii_bus = devm_mdiobus_alloc(parent); + if (!ds->slave_mii_bus) + return -ENOMEM; + dsa_slave_mii_bus_init(ds); + + ret = mdiobus_register(ds->slave_mii_bus); + if (ret < 0) + return ret; + } + + /* + * Create network devices for physical switch ports. + */ + for (i = 0; i < ds->num_ports; i++) { + ds->ports[i].dn = cd->port_dn[i]; + + if (!(ds->enabled_port_mask & (1 << i))) + continue; + + ret = dsa_slave_create(ds, parent, i, cd->port_names[i]); + if (ret < 0) + netdev_err(dst->master_netdev, "[%d]: can't create dsa slave device for port %d(%s): %d\n", + index, i, cd->port_names[i], ret); + } + + /* Perform configuration of the CPU and DSA ports */ + ret = dsa_cpu_dsa_setups(ds, parent); + if (ret < 0) + netdev_err(dst->master_netdev, "[%d] : can't configure CPU and DSA ports\n", + index); + + ret = dsa_cpu_port_ethtool_setup(ds); + if (ret) + return ret; + + return 0; +} + +static struct dsa_switch * +dsa_switch_setup(struct dsa_switch_tree *dst, int index, + struct device *parent, struct device *host_dev) +{ + struct dsa_chip_data *cd = dst->pd->chip + index; + const struct dsa_switch_ops *ops; + struct dsa_switch *ds; + int ret; + const char *name; + void *priv; + + /* + * Probe for switch model. + */ + ops = dsa_switch_probe(parent, host_dev, cd->sw_addr, &name, &priv); + if (!ops) { + netdev_err(dst->master_netdev, "[%d]: could not detect attached switch\n", + index); + return ERR_PTR(-EINVAL); + } + netdev_info(dst->master_netdev, "[%d]: detected a %s switch\n", + index, name); + + + /* + * Allocate and initialise switch state. + */ + ds = dsa_switch_alloc(parent, DSA_MAX_PORTS); + if (!ds) + return ERR_PTR(-ENOMEM); + + ds->dst = dst; + ds->index = index; + ds->cd = cd; + ds->ops = ops; + ds->priv = priv; + + ret = dsa_switch_setup_one(ds, parent); + if (ret) + return ERR_PTR(ret); + + return ds; +} + +static void dsa_switch_destroy(struct dsa_switch *ds) +{ + int port; + + /* Destroy network devices for physical switch ports. 
*/ + for (port = 0; port < ds->num_ports; port++) { + if (!(ds->enabled_port_mask & (1 << port))) + continue; + + if (!ds->ports[port].netdev) + continue; + + dsa_slave_destroy(ds->ports[port].netdev); + } + + /* Disable configuration of the CPU and DSA ports */ + for (port = 0; port < ds->num_ports; port++) { + if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port))) + continue; + dsa_cpu_dsa_destroy(&ds->ports[port]); + + /* Clearing a bit which is not set does no harm */ + ds->cpu_port_mask |= ~(1 << port); + ds->dsa_port_mask |= ~(1 << port); + } + + if (ds->slave_mii_bus && ds->ops->phy_read) + mdiobus_unregister(ds->slave_mii_bus); + + dsa_switch_unregister_notifier(ds); +} + +#ifdef CONFIG_PM_SLEEP +int dsa_switch_suspend(struct dsa_switch *ds) +{ + int i, ret = 0; + + /* Suspend slave network devices */ + for (i = 0; i < ds->num_ports; i++) { + if (!dsa_is_port_initialized(ds, i)) + continue; + + ret = dsa_slave_suspend(ds->ports[i].netdev); + if (ret) + return ret; + } + + if (ds->ops->suspend) + ret = ds->ops->suspend(ds); + + return ret; +} +EXPORT_SYMBOL_GPL(dsa_switch_suspend); + +int dsa_switch_resume(struct dsa_switch *ds) +{ + int i, ret = 0; + + if (ds->ops->resume) + ret = ds->ops->resume(ds); + + if (ret) + return ret; + + /* Resume slave network devices */ + for (i = 0; i < ds->num_ports; i++) { + if (!dsa_is_port_initialized(ds, i)) + continue; + + ret = dsa_slave_resume(ds->ports[i].netdev); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(dsa_switch_resume); +#endif + +/* platform driver init and cleanup *****************************************/ +static int dev_is_class(struct device *dev, void *class) +{ + if (dev->class != NULL && !strcmp(dev->class->name, class)) + return 1; + + return 0; +} + +static struct device *dev_find_class(struct device *parent, char *class) +{ + if (dev_is_class(parent, class)) { + get_device(parent); + return parent; + } + + return device_find_child(parent, class, dev_is_class); +} + +struct mii_bus *dsa_host_dev_to_mii_bus(struct device *dev) +{ + struct device *d; + + d = dev_find_class(dev, "mdio_bus"); + if (d != NULL) { + struct mii_bus *bus; + + bus = to_mii_bus(d); + put_device(d); + + return bus; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(dsa_host_dev_to_mii_bus); + +#ifdef CONFIG_OF +static int dsa_of_setup_routing_table(struct dsa_platform_data *pd, + struct dsa_chip_data *cd, + int chip_index, int port_index, + struct device_node *link) +{ + const __be32 *reg; + int link_sw_addr; + struct device_node *parent_sw; + int len; + + parent_sw = of_get_parent(link); + if (!parent_sw) + return -EINVAL; + + reg = of_get_property(parent_sw, "reg", &len); + if (!reg || (len != sizeof(*reg) * 2)) + return -EINVAL; + + /* + * Get the destination switch number from the second field of its 'reg' + * property, i.e. for "reg = <0x19 1>" sw_addr is '1'. 
+ */ + link_sw_addr = be32_to_cpup(reg + 1); + + if (link_sw_addr >= pd->nr_chips) + return -EINVAL; + + cd->rtable[link_sw_addr] = port_index; + + return 0; +} + +static int dsa_of_probe_links(struct dsa_platform_data *pd, + struct dsa_chip_data *cd, + int chip_index, int port_index, + struct device_node *port, + const char *port_name) +{ + struct device_node *link; + int link_index; + int ret; + + for (link_index = 0;; link_index++) { + link = of_parse_phandle(port, "link", link_index); + if (!link) + break; + + if (!strcmp(port_name, "dsa") && pd->nr_chips > 1) { + ret = dsa_of_setup_routing_table(pd, cd, chip_index, + port_index, link); + if (ret) + return ret; + } + } + return 0; +} + +static void dsa_of_free_platform_data(struct dsa_platform_data *pd) +{ + int i; + int port_index; + + for (i = 0; i < pd->nr_chips; i++) { + port_index = 0; + while (port_index < DSA_MAX_PORTS) { + kfree(pd->chip[i].port_names[port_index]); + port_index++; + } + + /* Drop our reference to the MDIO bus device */ + if (pd->chip[i].host_dev) + put_device(pd->chip[i].host_dev); + } + kfree(pd->chip); +} + +static int dsa_of_probe(struct device *dev) +{ + struct device_node *np = dev->of_node; + struct device_node *child, *mdio, *ethernet, *port; + struct mii_bus *mdio_bus, *mdio_bus_switch; + struct net_device *ethernet_dev; + struct dsa_platform_data *pd; + struct dsa_chip_data *cd; + const char *port_name; + int chip_index, port_index; + const unsigned int *sw_addr, *port_reg; + u32 eeprom_len; + int ret; + + mdio = of_parse_phandle(np, "dsa,mii-bus", 0); + if (!mdio) + return -EINVAL; + + mdio_bus = of_mdio_find_bus(mdio); + if (!mdio_bus) + return -EPROBE_DEFER; + + ethernet = of_parse_phandle(np, "dsa,ethernet", 0); + if (!ethernet) { + ret = -EINVAL; + goto out_put_mdio; + } + + ethernet_dev = of_find_net_device_by_node(ethernet); + if (!ethernet_dev) { + ret = -EPROBE_DEFER; + goto out_put_mdio; + } + + pd = kzalloc(sizeof(*pd), GFP_KERNEL); + if (!pd) { + ret = -ENOMEM; + goto out_put_ethernet; + } + + dev->platform_data = pd; + pd->of_netdev = ethernet_dev; + pd->nr_chips = of_get_available_child_count(np); + if (pd->nr_chips > DSA_MAX_SWITCHES) + pd->nr_chips = DSA_MAX_SWITCHES; + + pd->chip = kcalloc(pd->nr_chips, sizeof(struct dsa_chip_data), + GFP_KERNEL); + if (!pd->chip) { + ret = -ENOMEM; + goto out_free; + } + + chip_index = -1; + for_each_available_child_of_node(np, child) { + int i; + + chip_index++; + cd = &pd->chip[chip_index]; + + cd->of_node = child; + + /* Initialize the routing table */ + for (i = 0; i < DSA_MAX_SWITCHES; ++i) + cd->rtable[i] = DSA_RTABLE_NONE; + + /* When assigning the host device, increment its refcount */ + cd->host_dev = get_device(&mdio_bus->dev); + + sw_addr = of_get_property(child, "reg", NULL); + if (!sw_addr) + continue; + + cd->sw_addr = be32_to_cpup(sw_addr); + if (cd->sw_addr >= PHY_MAX_ADDR) + continue; + + if (!of_property_read_u32(child, "eeprom-length", &eeprom_len)) + cd->eeprom_len = eeprom_len; + + mdio = of_parse_phandle(child, "mii-bus", 0); + if (mdio) { + mdio_bus_switch = of_mdio_find_bus(mdio); + if (!mdio_bus_switch) { + ret = -EPROBE_DEFER; + goto out_free_chip; + } + + /* Drop the mdio_bus device ref, replacing the host + * device with the mdio_bus_switch device, keeping + * the refcount from of_mdio_find_bus() above. 
+			 */
+			put_device(cd->host_dev);
+			cd->host_dev = &mdio_bus_switch->dev;
+		}
+
+		for_each_available_child_of_node(child, port) {
+			port_reg = of_get_property(port, "reg", NULL);
+			if (!port_reg)
+				continue;
+
+			port_index = be32_to_cpup(port_reg);
+			if (port_index >= DSA_MAX_PORTS)
+				break;
+
+			port_name = of_get_property(port, "label", NULL);
+			if (!port_name)
+				continue;
+
+			cd->port_dn[port_index] = port;
+
+			cd->port_names[port_index] = kstrdup(port_name,
+							     GFP_KERNEL);
+			if (!cd->port_names[port_index]) {
+				ret = -ENOMEM;
+				goto out_free_chip;
+			}
+
+			ret = dsa_of_probe_links(pd, cd, chip_index,
+						 port_index, port, port_name);
+			if (ret)
+				goto out_free_chip;
+
+		}
+	}
+
+	/* The individual chips hold their own refcount on the mdio bus,
+	 * so drop ours */
+	put_device(&mdio_bus->dev);
+
+	return 0;
+
+out_free_chip:
+	dsa_of_free_platform_data(pd);
+out_free:
+	kfree(pd);
+	dev->platform_data = NULL;
+out_put_ethernet:
+	put_device(&ethernet_dev->dev);
+out_put_mdio:
+	put_device(&mdio_bus->dev);
+	return ret;
+}
+
+static void dsa_of_remove(struct device *dev)
+{
+	struct dsa_platform_data *pd = dev->platform_data;
+
+	if (!dev->of_node)
+		return;
+
+	dsa_of_free_platform_data(pd);
+	put_device(&pd->of_netdev->dev);
+	kfree(pd);
+}
+#else
+static inline int dsa_of_probe(struct device *dev)
+{
+	return 0;
+}
+
+static inline void dsa_of_remove(struct device *dev)
+{
+}
+#endif
+
+static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
+			 struct device *parent, struct dsa_platform_data *pd)
+{
+	int i;
+	unsigned configured = 0;
+
+	dst->pd = pd;
+	dst->master_netdev = dev;
+	dst->cpu_port = -1;
+
+	for (i = 0; i < pd->nr_chips; i++) {
+		struct dsa_switch *ds;
+
+		ds = dsa_switch_setup(dst, i, parent, pd->chip[i].host_dev);
+		if (IS_ERR(ds)) {
+			netdev_err(dev, "[%d]: couldn't create dsa switch instance (error %ld)\n",
+				   i, PTR_ERR(ds));
+			continue;
+		}
+
+		dst->ds[i] = ds;
+
+		++configured;
+	}
+
+	/*
+	 * If no switch was found, exit cleanly
+	 */
+	if (!configured)
+		return -EPROBE_DEFER;
+
+	/*
+	 * If we use a tagging format that doesn't have an ethertype
+	 * field, make sure that all packets from this point on get
+	 * sent to the tag format's receive function.
+ */ + wmb(); + dev->dsa_ptr = (void *)dst; + + return 0; +} + +static int dsa_probe(struct platform_device *pdev) +{ + struct dsa_platform_data *pd = pdev->dev.platform_data; + struct net_device *dev; + struct dsa_switch_tree *dst; + int ret; + + if (pdev->dev.of_node) { + ret = dsa_of_probe(&pdev->dev); + if (ret) + return ret; + + pd = pdev->dev.platform_data; + } + + if (pd == NULL || (pd->netdev == NULL && pd->of_netdev == NULL)) + return -EINVAL; + + if (pd->of_netdev) { + dev = pd->of_netdev; + dev_hold(dev); + } else { + dev = dsa_dev_to_net_device(pd->netdev); + } + if (dev == NULL) { + ret = -EPROBE_DEFER; + goto out; + } + + if (dev->dsa_ptr != NULL) { + dev_put(dev); + ret = -EEXIST; + goto out; + } + + dst = devm_kzalloc(&pdev->dev, sizeof(*dst), GFP_KERNEL); + if (dst == NULL) { + dev_put(dev); + ret = -ENOMEM; + goto out; + } + + platform_set_drvdata(pdev, dst); + + ret = dsa_setup_dst(dst, dev, &pdev->dev, pd); + if (ret) { + dev_put(dev); + goto out; + } + + return 0; + +out: + dsa_of_remove(&pdev->dev); + + return ret; +} + +static void dsa_remove_dst(struct dsa_switch_tree *dst) +{ + int i; + + dst->master_netdev->dsa_ptr = NULL; + + /* If we used a tagging format that doesn't have an ethertype + * field, make sure that all packets from this point get sent + * without the tag and go through the regular receive path. + */ + wmb(); + + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; + + if (ds) + dsa_switch_destroy(ds); + } + + dsa_cpu_port_ethtool_restore(dst->cpu_switch); + + dev_put(dst->master_netdev); +} + +static int dsa_remove(struct platform_device *pdev) +{ + struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + + dsa_remove_dst(dst); + dsa_of_remove(&pdev->dev); + + return 0; +} + +static void dsa_shutdown(struct platform_device *pdev) +{ +} + +#ifdef CONFIG_PM_SLEEP +static int dsa_suspend(struct device *d) +{ + struct platform_device *pdev = to_platform_device(d); + struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + int i, ret = 0; + + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; + + if (ds != NULL) + ret = dsa_switch_suspend(ds); + } + + return ret; +} + +static int dsa_resume(struct device *d) +{ + struct platform_device *pdev = to_platform_device(d); + struct dsa_switch_tree *dst = platform_get_drvdata(pdev); + int i, ret = 0; + + for (i = 0; i < dst->pd->nr_chips; i++) { + struct dsa_switch *ds = dst->ds[i]; + + if (ds != NULL) + ret = dsa_switch_resume(ds); + } + + return ret; +} +#endif + +static SIMPLE_DEV_PM_OPS(dsa_pm_ops, dsa_suspend, dsa_resume); + +static const struct of_device_id dsa_of_match_table[] = { + { .compatible = "marvell,dsa", }, + {} +}; +MODULE_DEVICE_TABLE(of, dsa_of_match_table); + +static struct platform_driver dsa_driver = { + .probe = dsa_probe, + .remove = dsa_remove, + .shutdown = dsa_shutdown, + .driver = { + .name = "dsa", + .of_match_table = dsa_of_match_table, + .pm = &dsa_pm_ops, + }, +}; + +int dsa_legacy_register(void) +{ + return platform_driver_register(&dsa_driver); +} + +void dsa_legacy_unregister(void) +{ + platform_driver_unregister(&dsa_driver); +} diff --git a/net/dsa/slave.c b/net/dsa/slave.c index c34872e1febc4b75d1b69b18a8a1189405ca30fa..7693182df81e61d14540cd43be3ae7e134eef576 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -419,8 +420,8 @@ static int dsa_slave_vlan_filtering(struct net_device *dev, return 0; } -static int 
dsa_fastest_ageing_time(struct dsa_switch *ds, - unsigned int ageing_time) +static unsigned int dsa_fastest_ageing_time(struct dsa_switch *ds, + unsigned int ageing_time) { int i; @@ -443,9 +444,13 @@ static int dsa_slave_ageing_time(struct net_device *dev, unsigned long ageing_jiffies = clock_t_to_jiffies(attr->u.ageing_time); unsigned int ageing_time = jiffies_to_msecs(ageing_jiffies); - /* bridge skips -EOPNOTSUPP, so skip the prepare phase */ - if (switchdev_trans_ph_prepare(trans)) + if (switchdev_trans_ph_prepare(trans)) { + if (ds->ageing_time_min && ageing_time < ds->ageing_time_min) + return -ERANGE; + if (ds->ageing_time_max && ageing_time > ds->ageing_time_max) + return -ERANGE; return 0; + } /* Keep the fastest ageing time in case of multiple bridges */ p->dp->ageing_time = ageing_time; diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 6456dacf9ae9e0b88585defc026179423a80e4e4..ca6e26e514f089cfa8991ef6a7bd790bd8db14fe 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -1,7 +1,8 @@ /* * Handling of a single switch chip, part of a switch fabric * - * Copyright (c) 2017 Vivien Didelot + * Copyright (c) 2017 Savoir-faire Linux Inc. + * Vivien Didelot * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -19,9 +20,9 @@ static int dsa_switch_bridge_join(struct dsa_switch *ds, if (ds->index == info->sw_index && ds->ops->port_bridge_join) return ds->ops->port_bridge_join(ds, info->port, info->br); - if (ds->index != info->sw_index) - dev_dbg(ds->dev, "crosschip DSA port %d.%d bridged to %s\n", - info->sw_index, info->port, netdev_name(info->br)); + if (ds->index != info->sw_index && ds->ops->crosschip_bridge_join) + return ds->ops->crosschip_bridge_join(ds, info->sw_index, + info->port, info->br); return 0; } @@ -32,9 +33,9 @@ static int dsa_switch_bridge_leave(struct dsa_switch *ds, if (ds->index == info->sw_index && ds->ops->port_bridge_leave) ds->ops->port_bridge_leave(ds, info->port, info->br); - if (ds->index != info->sw_index) - dev_dbg(ds->dev, "crosschip DSA port %d.%d unbridged from %s\n", - info->sw_index, info->port, netdev_name(info->br)); + if (ds->index != info->sw_index && ds->ops->crosschip_bridge_leave) + ds->ops->crosschip_bridge_leave(ds, info->sw_index, info->port, + info->br); return 0; } diff --git a/net/dsa/tag_brcm.c b/net/dsa/tag_brcm.c index 5d925b6b2bb14f78f84a06b84b4fa19bd6846e82..2a9b52c5af86b5308d7d71a3340a0e48768bdb60 100644 --- a/net/dsa/tag_brcm.c +++ b/net/dsa/tag_brcm.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "dsa_priv.h" /* This tag length is 4 bytes, older ones were 6 bytes, we do not @@ -91,23 +92,17 @@ static struct sk_buff *brcm_tag_xmit(struct sk_buff *skb, struct net_device *dev return NULL; } -static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_switch *ds; int source_port; u8 *brcm_tag; - if (unlikely(dst == NULL)) - goto out_drop; - ds = dst->cpu_switch; - skb = skb_unshare(skb, GFP_ATOMIC); - if (skb == NULL) - goto out; - if (unlikely(!pskb_may_pull(skb, BRCM_TAG_LEN))) goto out_drop; @@ -139,22 +134,12 @@ static int brcm_tag_rcv(struct sk_buff *skb, struct net_device *dev, skb->data - ETH_HLEN - BRCM_TAG_LEN, 2 * ETH_ALEN); - skb_push(skb, ETH_HLEN); - 
skb->pkt_type = PACKET_HOST; skb->dev = ds->ports[source_port].netdev; - skb->protocol = eth_type_trans(skb, skb->dev); - - skb->dev->stats.rx_packets++; - skb->dev->stats.rx_bytes += skb->len; - netif_receive_skb(skb); - - return 0; + return skb; out_drop: - kfree_skb(skb); -out: - return 0; + return NULL; } const struct dsa_device_ops brcm_netdev_ops = { diff --git a/net/dsa/tag_dsa.c b/net/dsa/tag_dsa.c index 72579ceea381b7e2bce99a28208810b707434f09..1c6633f0de01909f950a03349b1b4c08c2151839 100644 --- a/net/dsa/tag_dsa.c +++ b/net/dsa/tag_dsa.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "dsa_priv.h" #define DSA_HLEN 4 @@ -67,8 +68,9 @@ static struct sk_buff *dsa_xmit(struct sk_buff *skb, struct net_device *dev) return NULL; } -static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *dsa_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_switch *ds; @@ -76,13 +78,6 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, int source_device; int source_port; - if (unlikely(dst == NULL)) - goto out_drop; - - skb = skb_unshare(skb, GFP_ATOMIC); - if (skb == NULL) - goto out; - if (unlikely(!pskb_may_pull(skb, DSA_HLEN))) goto out_drop; @@ -164,21 +159,11 @@ static int dsa_rcv(struct sk_buff *skb, struct net_device *dev, } skb->dev = ds->ports[source_port].netdev; - skb_push(skb, ETH_HLEN); - skb->pkt_type = PACKET_HOST; - skb->protocol = eth_type_trans(skb, skb->dev); - - skb->dev->stats.rx_packets++; - skb->dev->stats.rx_bytes += skb->len; - netif_receive_skb(skb); - - return 0; + return skb; out_drop: - kfree_skb(skb); -out: - return 0; + return NULL; } const struct dsa_device_ops dsa_netdev_ops = { diff --git a/net/dsa/tag_edsa.c b/net/dsa/tag_edsa.c index 648c051817a1b4a4e64cda67bab8c81288027a4e..d9c668aa5e54682914a0e61b3cb6373a71a0ee8b 100644 --- a/net/dsa/tag_edsa.c +++ b/net/dsa/tag_edsa.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "dsa_priv.h" #define DSA_HLEN 4 @@ -80,8 +81,9 @@ static struct sk_buff *edsa_xmit(struct sk_buff *skb, struct net_device *dev) return NULL; } -static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *edsa_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_switch *ds; @@ -89,13 +91,6 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, int source_device; int source_port; - if (unlikely(dst == NULL)) - goto out_drop; - - skb = skb_unshare(skb, GFP_ATOMIC); - if (skb == NULL) - goto out; - if (unlikely(!pskb_may_pull(skb, EDSA_HLEN))) goto out_drop; @@ -183,21 +178,11 @@ static int edsa_rcv(struct sk_buff *skb, struct net_device *dev, } skb->dev = ds->ports[source_port].netdev; - skb_push(skb, ETH_HLEN); - skb->pkt_type = PACKET_HOST; - skb->protocol = eth_type_trans(skb, skb->dev); - - skb->dev->stats.rx_packets++; - skb->dev->stats.rx_bytes += skb->len; - netif_receive_skb(skb); - - return 0; + return skb; out_drop: - kfree_skb(skb); -out: - return 0; + return NULL; } const struct dsa_device_ops edsa_netdev_ops = { diff --git a/net/dsa/tag_lan9303.c b/net/dsa/tag_lan9303.c new file mode 100644 index 0000000000000000000000000000000000000000..70130ed5c21a9fa5e218e51ab4448c35c9005eba --- /dev/null +++ 
b/net/dsa/tag_lan9303.c @@ -0,0 +1,136 @@ +/* + * Copyright (C) 2017 Pengutronix, Juergen Borleis + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ +#include +#include +#include +#include +#include "dsa_priv.h" + +/* To define the outgoing port and to discover the incoming port a regular + * VLAN tag is used by the LAN9303. But its VID meaning is 'special': + * + * Dest MAC Src MAC TAG Type + * ...| 1 2 3 4 5 6 | 1 2 3 4 5 6 | 1 2 3 4 | 1 2 |... + * |<------->| + * TAG: + * |<------------->| + * | 1 2 | 3 4 | + * TPID VID + * 0x8100 + * + * VID bit 3 indicates a request for an ALR lookup. + * + * If VID bit 3 is zero, then bits 0 and 1 specify the destination port + * (0, 1, 2) or broadcast (3) or the source port (1, 2). + * + * VID bit 4 is used to specify if the STP port state should be overridden. + * Required when no forwarding between the external ports should happen. + */ + +#define LAN9303_TAG_LEN 4 +#define LAN9303_MAX_PORTS 3 + +static struct sk_buff *lan9303_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + u16 *lan9303_tag; + + /* insert a special VLAN tag between the MAC addresses + * and the current ethertype field. + */ + if (skb_cow_head(skb, LAN9303_TAG_LEN) < 0) { + dev_dbg(&dev->dev, + "Cannot make room for the special tag. Dropping packet\n"); + goto out_free; + } + + /* provide 'LAN9303_TAG_LEN' bytes additional space */ + skb_push(skb, LAN9303_TAG_LEN); + + /* make room between MACs and Ether-Type */ + memmove(skb->data, skb->data + LAN9303_TAG_LEN, 2 * ETH_ALEN); + + lan9303_tag = (u16 *)(skb->data + 2 * ETH_ALEN); + lan9303_tag[0] = htons(ETH_P_8021Q); + lan9303_tag[1] = htons(p->dp->index | BIT(4)); + + return skb; +out_free: + kfree_skb(skb); + return NULL; +} + +static struct sk_buff *lan9303_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + u16 *lan9303_tag; + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + unsigned int source_port; + + ds = dst->ds[0]; + + if (unlikely(!ds)) { + dev_warn_ratelimited(&dev->dev, "Dropping packet, due to missing DSA switch device\n"); + return NULL; + } + + if (unlikely(!pskb_may_pull(skb, LAN9303_TAG_LEN))) { + dev_warn_ratelimited(&dev->dev, + "Dropping packet, cannot pull\n"); + return NULL; + } + + /* '->data' points into the middle of our special VLAN tag information: + * + * ~ MAC src | 0x81 | 0x00 | 0xyy | 0xzz | ether type + * ^ + * ->data + */ + lan9303_tag = (u16 *)(skb->data - 2); + + if (lan9303_tag[0] != htons(ETH_P_8021Q)) { + dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid VLAN marker\n"); + return NULL; + } + + source_port = ntohs(lan9303_tag[1]) & 0x3; + + if (source_port >= LAN9303_MAX_PORTS) { + dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid source port\n"); + return NULL; + } + + if (!ds->ports[source_port].netdev) { + dev_warn_ratelimited(&dev->dev, "Dropping packet due to invalid netdev or device\n"); + return NULL; + } + + /* remove the special VLAN tag between the MAC addresses + * and the current ethertype field. 
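
A worked decode of the receive side, for a hypothetical ingress frame:

/* Suppose the switch inserted TPID 0x8100, VID 0x0012 after the MACs
 * (bit 4: STP override, bits 0-1: source port).  With skb->data two
 * bytes into that tag:
 *
 *	lan9303_tag[0] == htons(0x8100)          TPID check passes
 *	lan9303_tag[1] == htons(0x0012)          VID field
 *	source_port    == 0x0012 & 0x3 == 2      external port 2
 *
 * skb_pull_rcsum(skb, 4) plus the memmove() below then reconstruct an
 * ordinary untagged frame before the core re-runs eth_type_trans().
 */
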
+ */ + skb_pull_rcsum(skb, 2 + 2); + memmove(skb->data - ETH_HLEN, skb->data - (ETH_HLEN + LAN9303_TAG_LEN), + 2 * ETH_ALEN); + + /* forward the packet to the dedicated interface */ + skb->dev = ds->ports[source_port].netdev; + + return skb; +} + +const struct dsa_device_ops lan9303_netdev_ops = { + .xmit = lan9303_xmit, + .rcv = lan9303_rcv, +}; diff --git a/net/dsa/tag_mtk.c b/net/dsa/tag_mtk.c new file mode 100644 index 0000000000000000000000000000000000000000..837cdddb53f093c6283451f7626612312008bd0d --- /dev/null +++ b/net/dsa/tag_mtk.c @@ -0,0 +1,100 @@ +/* + * Mediatek DSA Tag support + * Copyright (C) 2017 Landen Chao + * Sean Wang + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include +#include "dsa_priv.h" + +#define MTK_HDR_LEN 4 +#define MTK_HDR_RECV_SOURCE_PORT_MASK GENMASK(2, 0) +#define MTK_HDR_XMIT_DP_BIT_MASK GENMASK(5, 0) + +static struct sk_buff *mtk_tag_xmit(struct sk_buff *skb, + struct net_device *dev) +{ + struct dsa_slave_priv *p = netdev_priv(dev); + u8 *mtk_tag; + + if (skb_cow_head(skb, MTK_HDR_LEN) < 0) + goto out_free; + + skb_push(skb, MTK_HDR_LEN); + + memmove(skb->data, skb->data + MTK_HDR_LEN, 2 * ETH_ALEN); + + /* Build the tag after the MAC Source Address */ + mtk_tag = skb->data + 2 * ETH_ALEN; + mtk_tag[0] = 0; + mtk_tag[1] = (1 << p->dp->index) & MTK_HDR_XMIT_DP_BIT_MASK; + mtk_tag[2] = 0; + mtk_tag[3] = 0; + + return skb; + +out_free: + kfree_skb(skb); + return NULL; +} + +static struct sk_buff *mtk_tag_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev) +{ + struct dsa_switch_tree *dst = dev->dsa_ptr; + struct dsa_switch *ds; + int port; + __be16 *phdr, hdr; + + if (unlikely(!pskb_may_pull(skb, MTK_HDR_LEN))) + goto out_drop; + + /* The MTK header is added by the switch between src addr + * and ethertype at this point, skb->data points to 2 bytes + * after src addr so header should be 2 bytes right before. + */ + phdr = (__be16 *)(skb->data - 2); + hdr = ntohs(*phdr); + + /* Remove MTK tag and recalculate checksum. */ + skb_pull_rcsum(skb, MTK_HDR_LEN); + + memmove(skb->data - ETH_HLEN, + skb->data - ETH_HLEN - MTK_HDR_LEN, + 2 * ETH_ALEN); + + /* This protocol doesn't support cascading multiple + * switches so it's safe to assume the switch is first + * in the tree. 
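
A worked example of the 4-byte header (port numbers hypothetical):

/* Egress through slave port 3: mtk_tag_xmit() above writes, right
 * after the two MAC addresses,
 *
 *	mtk_tag[0..3] = 00 08 00 00      ((1 << 3) & MTK_HDR_XMIT_DP_BIT_MASK)
 *
 * Ingress: the 16 bits just before skb->data echo the source port in
 * their low three bits, so hdr == 0x0003 gives
 *
 *	port = hdr & MTK_HDR_RECV_SOURCE_PORT_MASK == 3
 *
 * and the frame is handed to ds->ports[3].netdev once the header has
 * been pulled with skb_pull_rcsum().
 */
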
+ */ + ds = dst->ds[0]; + if (!ds) + goto out_drop; + + /* Get source port information */ + port = (hdr & MTK_HDR_RECV_SOURCE_PORT_MASK); + if (!ds->ports[port].netdev) + goto out_drop; + + skb->dev = ds->ports[port].netdev; + + return skb; + +out_drop: + return NULL; +} + +const struct dsa_device_ops mtk_netdev_ops = { + .xmit = mtk_tag_xmit, + .rcv = mtk_tag_rcv, +}; diff --git a/net/dsa/tag_qca.c b/net/dsa/tag_qca.c index 30240f343aea8450b13936159b4b9a76126a8977..3ba3f59f7a3433b3731a5b3dc501eb22660d7cce 100644 --- a/net/dsa/tag_qca.c +++ b/net/dsa/tag_qca.c @@ -12,6 +12,7 @@ */ #include +#include #include "dsa_priv.h" #define QCA_HDR_LEN 2 @@ -65,8 +66,9 @@ static struct sk_buff *qca_tag_xmit(struct sk_buff *skb, struct net_device *dev) return NULL; } -static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_switch *ds; @@ -74,13 +76,6 @@ static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, int port; __be16 *phdr, hdr; - if (unlikely(!dst)) - goto out_drop; - - skb = skb_unshare(skb, GFP_ATOMIC); - if (!skb) - goto out; - if (unlikely(!pskb_may_pull(skb, QCA_HDR_LEN))) goto out_drop; @@ -114,22 +109,12 @@ static int qca_tag_rcv(struct sk_buff *skb, struct net_device *dev, goto out_drop; /* Update skb & forward the frame accordingly */ - skb_push(skb, ETH_HLEN); - skb->pkt_type = PACKET_HOST; skb->dev = ds->ports[port].netdev; - skb->protocol = eth_type_trans(skb, skb->dev); - - skb->dev->stats.rx_packets++; - skb->dev->stats.rx_bytes += skb->len; - netif_receive_skb(skb); - - return 0; + return skb; out_drop: - kfree_skb(skb); -out: - return 0; + return NULL; } const struct dsa_device_ops qca_netdev_ops = { diff --git a/net/dsa/tag_trailer.c b/net/dsa/tag_trailer.c index 26f977176978085af9c034319c754a1ac7501d4c..aafc2fc74c3067dd05c67c409e40e8ceef33cf0c 100644 --- a/net/dsa/tag_trailer.c +++ b/net/dsa/tag_trailer.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "dsa_priv.h" static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) @@ -57,22 +58,17 @@ static struct sk_buff *trailer_xmit(struct sk_buff *skb, struct net_device *dev) return nskb; } -static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, - struct packet_type *pt, struct net_device *orig_dev) +static struct sk_buff *trailer_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, + struct net_device *orig_dev) { struct dsa_switch_tree *dst = dev->dsa_ptr; struct dsa_switch *ds; u8 *trailer; int source_port; - if (unlikely(dst == NULL)) - goto out_drop; ds = dst->cpu_switch; - skb = skb_unshare(skb, GFP_ATOMIC); - if (skb == NULL) - goto out; - if (skb_linearize(skb)) goto out_drop; @@ -88,21 +84,11 @@ static int trailer_rcv(struct sk_buff *skb, struct net_device *dev, pskb_trim_rcsum(skb, skb->len - 4); skb->dev = ds->ports[source_port].netdev; - skb_push(skb, ETH_HLEN); - skb->pkt_type = PACKET_HOST; - skb->protocol = eth_type_trans(skb, skb->dev); - - skb->dev->stats.rx_packets++; - skb->dev->stats.rx_bytes += skb->len; - netif_receive_skb(skb); - - return 0; + return skb; out_drop: - kfree_skb(skb); -out: - return 0; + return NULL; } const struct dsa_device_ops trailer_netdev_ops = { diff --git a/net/hsr/hsr_netlink.c b/net/hsr/hsr_netlink.c index 
1ab30e7d3f99e19c5e54fd9747cfce7a5c1f559b..81dac16933fc05a19224a7c9ef450d34dea1be91 100644 --- a/net/hsr/hsr_netlink.c +++ b/net/hsr/hsr_netlink.c @@ -350,7 +350,7 @@ static int hsr_get_node_status(struct sk_buff *skb_in, struct genl_info *info) return 0; invalid: - netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL); + netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL); return 0; nla_put_failure: @@ -432,7 +432,7 @@ static int hsr_get_node_list(struct sk_buff *skb_in, struct genl_info *info) return 0; invalid: - netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL); + netlink_ack(skb_in, nlmsg_hdr(skb_in), -EINVAL, NULL); return 0; nla_put_failure: diff --git a/net/ieee802154/nl802154.c b/net/ieee802154/nl802154.c index fc60cd061f3966a2381803a8e4d85206770253dc..99f6c254ea777c8786421d069f8f2414d0eb34e5 100644 --- a/net/ieee802154/nl802154.c +++ b/net/ieee802154/nl802154.c @@ -249,8 +249,7 @@ nl802154_prepare_wpan_dev_dump(struct sk_buff *skb, if (!cb->args[0]) { err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize, genl_family_attrbuf(&nl802154_fam), - nl802154_fam.maxattr, - nl802154_policy); + nl802154_fam.maxattr, nl802154_policy, NULL); if (err) goto out_unlock; @@ -562,8 +561,8 @@ static int nl802154_dump_wpan_phy_parse(struct sk_buff *skb, struct nl802154_dump_wpan_phy_state *state) { struct nlattr **tb = genl_family_attrbuf(&nl802154_fam); - int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize, - tb, nl802154_fam.maxattr, nl802154_policy); + int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl802154_fam.hdrsize, tb, + nl802154_fam.maxattr, nl802154_policy, NULL); /* TODO check if we can handle error here, * we have no backward compatibility @@ -1308,7 +1307,7 @@ ieee802154_llsec_parse_dev_addr(struct nlattr *nla, struct nlattr *attrs[NL802154_DEV_ADDR_ATTR_MAX + 1]; if (!nla || nla_parse_nested(attrs, NL802154_DEV_ADDR_ATTR_MAX, nla, - nl802154_dev_addr_policy)) + nl802154_dev_addr_policy, NULL)) return -EINVAL; if (!attrs[NL802154_DEV_ADDR_ATTR_PAN_ID] || @@ -1348,7 +1347,7 @@ ieee802154_llsec_parse_key_id(struct nlattr *nla, struct nlattr *attrs[NL802154_KEY_ID_ATTR_MAX + 1]; if (!nla || nla_parse_nested(attrs, NL802154_KEY_ID_ATTR_MAX, nla, - nl802154_key_id_policy)) + nl802154_key_id_policy, NULL)) return -EINVAL; if (!attrs[NL802154_KEY_ID_ATTR_MODE]) @@ -1565,7 +1564,7 @@ static int nl802154_add_llsec_key(struct sk_buff *skb, struct genl_info *info) if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], - nl802154_key_policy)) + nl802154_key_policy, info->extack)) return -EINVAL; if (!attrs[NL802154_KEY_ATTR_USAGE_FRAMES] || @@ -1615,7 +1614,7 @@ static int nl802154_del_llsec_key(struct sk_buff *skb, struct genl_info *info) if (nla_parse_nested(attrs, NL802154_KEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_KEY], - nl802154_key_policy)) + nl802154_key_policy, info->extack)) return -EINVAL; if (ieee802154_llsec_parse_key_id(attrs[NL802154_KEY_ATTR_ID], &id) < 0) @@ -1729,8 +1728,8 @@ ieee802154_llsec_parse_device(struct nlattr *nla, { struct nlattr *attrs[NL802154_DEV_ATTR_MAX + 1]; - if (!nla || nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, nla, - nl802154_dev_policy)) + if (!nla || nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, + nla, nl802154_dev_policy, NULL)) return -EINVAL; memset(dev, 0, sizeof(*dev)); @@ -1783,7 +1782,7 @@ static int nl802154_del_llsec_dev(struct sk_buff *skb, struct genl_info *info) if (nla_parse_nested(attrs, NL802154_DEV_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVICE], - nl802154_dev_policy)) + 
nl802154_dev_policy, info->extack)) return -EINVAL; if (!attrs[NL802154_DEV_ATTR_EXTENDED_ADDR]) @@ -1911,7 +1910,7 @@ static int nl802154_add_llsec_devkey(struct sk_buff *skb, struct genl_info *info if (!info->attrs[NL802154_ATTR_SEC_DEVKEY] || nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], - nl802154_devkey_policy) < 0) + nl802154_devkey_policy, info->extack) < 0) return -EINVAL; if (!attrs[NL802154_DEVKEY_ATTR_FRAME_COUNTER] || @@ -1943,7 +1942,7 @@ static int nl802154_del_llsec_devkey(struct sk_buff *skb, struct genl_info *info if (nla_parse_nested(attrs, NL802154_DEVKEY_ATTR_MAX, info->attrs[NL802154_ATTR_SEC_DEVKEY], - nl802154_devkey_policy)) + nl802154_devkey_policy, info->extack)) return -EINVAL; if (!attrs[NL802154_DEVKEY_ATTR_EXTENDED_ADDR]) @@ -2063,8 +2062,8 @@ llsec_parse_seclevel(struct nlattr *nla, struct ieee802154_llsec_seclevel *sl) { struct nlattr *attrs[NL802154_SECLEVEL_ATTR_MAX + 1]; - if (!nla || nla_parse_nested(attrs, NL802154_SECLEVEL_ATTR_MAX, nla, - nl802154_seclevel_policy)) + if (!nla || nla_parse_nested(attrs, NL802154_SECLEVEL_ATTR_MAX, + nla, nl802154_seclevel_policy, NULL)) return -EINVAL; memset(sl, 0, sizeof(*sl)); diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index c6d4238ff94a8a329bf44bf59b30fb09fb07f707..f83de23a30e7e77303d011dee35eb92342adab5c 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -11,7 +11,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp_rate.o tcp_recovery.o \ tcp_offload.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ - fib_frontend.o fib_semantics.o fib_trie.o \ + fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6b1fc6e4278ef4f1cba58412977918af31d73e62..f3dad16613437c0c7ac3e9c7518a0929cddb3ca7 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1343,6 +1343,9 @@ struct sk_buff **inet_gro_receive(struct sk_buff **head, struct sk_buff *skb) if (*(u8 *)iph != 0x45) goto out_unlock; + if (ip_is_fragment(iph)) + goto out_unlock; + if (unlikely(ip_fast_csum((u8 *)iph, 5))) goto out_unlock; @@ -1599,8 +1602,9 @@ static const struct net_protocol igmp_protocol = { }; #endif -static const struct net_protocol tcp_protocol = { +static struct net_protocol tcp_protocol = { .early_demux = tcp_v4_early_demux, + .early_demux_handler = tcp_v4_early_demux, .handler = tcp_v4_rcv, .err_handler = tcp_v4_err, .no_policy = 1, @@ -1608,8 +1612,9 @@ static const struct net_protocol tcp_protocol = { .icmp_strict_tag_validation = 1, }; -static const struct net_protocol udp_protocol = { +static struct net_protocol udp_protocol = { .early_demux = udp_v4_early_demux, + .early_demux_handler = udp_v4_early_demux, .handler = udp_rcv, .err_handler = udp_err, .no_policy = 1, @@ -1720,6 +1725,8 @@ static __net_init int inet_init_net(struct net *net) net->ipv4.sysctl_ip_default_ttl = IPDEFTTL; net->ipv4.sysctl_ip_dynaddr = 0; net->ipv4.sysctl_ip_early_demux = 1; + net->ipv4.sysctl_udp_early_demux = 1; + net->ipv4.sysctl_tcp_early_demux = 1; #ifdef CONFIG_SYSCTL net->ipv4.sysctl_ip_prot_sock = PROT_SOCK; #endif diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 51b27ae09fbd725bcd8030982e5850215ac4ce5c..0937b34c27cacb2dec73a67a76ff11fe26722500 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -872,7 +872,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb) skb->pkt_type 
!= PACKET_HOST) state = NUD_STALE; neigh_update(n, sha, state, - override ? NEIGH_UPDATE_F_OVERRIDE : 0); + override ? NEIGH_UPDATE_F_OVERRIDE : 0, 0); neigh_release(n); } @@ -1033,7 +1033,7 @@ static int arp_req_set(struct net *net, struct arpreq *r, err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, NEIGH_UPDATE_F_OVERRIDE | - NEIGH_UPDATE_F_ADMIN); + NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); } return err; @@ -1084,7 +1084,7 @@ static int arp_invalidate(struct net_device *dev, __be32 ip) if (neigh->nud_state & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| - NEIGH_UPDATE_F_ADMIN); + NEIGH_UPDATE_F_ADMIN, 0); neigh_release(neigh); } diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index cebedd545e5e2863afcfe116309725e2cd57206c..df14815a3b8ce74aeb613458ffaf5f6eee4a263d 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -571,7 +571,8 @@ static int ip_mc_config(struct sock *sk, bool join, const struct in_ifaddr *ifa) return ret; } -static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) +static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; @@ -582,7 +583,8 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, + extack); if (err < 0) goto errout; @@ -752,7 +754,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, struct in_device *in_dev; int err; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv4_policy, + NULL); if (err < 0) goto errout; @@ -843,7 +846,8 @@ static struct in_ifaddr *find_matching_ifa(struct in_ifaddr *ifa) return NULL; } -static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) +static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct in_ifaddr *ifa; @@ -1192,6 +1196,18 @@ static int inet_gifconf(struct net_device *dev, char __user *buf, int len) return done; } +static __be32 in_dev_select_addr(const struct in_device *in_dev, + int scope) +{ + for_primary_ifa(in_dev) { + if (ifa->ifa_scope != RT_SCOPE_LINK && + ifa->ifa_scope <= scope) + return ifa->ifa_local; + } endfor_ifa(in_dev); + + return 0; +} + __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) { __be32 addr = 0; @@ -1228,13 +1244,9 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) if (master_idx && (dev = dev_get_by_index_rcu(net, master_idx)) && (in_dev = __in_dev_get_rcu(dev))) { - for_primary_ifa(in_dev) { - if (ifa->ifa_scope != RT_SCOPE_LINK && - ifa->ifa_scope <= scope) { - addr = ifa->ifa_local; - goto out_unlock; - } - } endfor_ifa(in_dev); + addr = in_dev_select_addr(in_dev, scope); + if (addr) + goto out_unlock; } /* Not loopback addresses on loopback should be preferred @@ -1249,13 +1261,9 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope) if (!in_dev) continue; - for_primary_ifa(in_dev) { - if (ifa->ifa_scope != RT_SCOPE_LINK && - ifa->ifa_scope <= scope) { - addr = ifa->ifa_local; - goto out_unlock; - } - } endfor_ifa(in_dev); + addr = in_dev_select_addr(in_dev, scope); + if (addr) + goto out_unlock; } out_unlock: 
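
[ The recurring change across these net/ipv4 hunks is threading the new
struct netlink_ext_ack *extack from each rtnetlink doit handler down into
nlmsg_parse()/nla_parse_nested(). A minimal sketch of what that enables:
foo_rtm_newaddr() is a hypothetical handler, not part of this series, and
NL_SET_ERR_MSG() is the helper the extended-ACK infrastructure provides
for attaching a human-readable error to the netlink ACK:

static int foo_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[IFA_MAX + 1];
	int err;

	/* Policy violations are now annotated in extack for user space. */
	err = nlmsg_parse(nlh, sizeof(struct ifaddrmsg), tb, IFA_MAX,
			  ifa_ipv4_policy, extack);
	if (err < 0)
		return err;

	if (!tb[IFA_LOCAL]) {
		NL_SET_ERR_MSG(extack, "Missing IFA_LOCAL attribute");
		return -EINVAL;
	}

	return 0;
}
]
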
rcu_read_unlock(); @@ -1713,7 +1721,7 @@ static int inet_validate_link_af(const struct net_device *dev, if (dev && !__in_dev_get_rtnl(dev)) return -EAFNOSUPPORT; - err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy); + err = nla_parse_nested(tb, IFLA_INET_MAX, nla, inet_af_policy, NULL); if (err < 0) return err; @@ -1741,7 +1749,7 @@ static int inet_set_link_af(struct net_device *dev, const struct nlattr *nla) if (!in_dev) return -EAFNOSUPPORT; - if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL) < 0) + if (nla_parse_nested(tb, IFLA_INET_MAX, nla, NULL, NULL) < 0) BUG(); if (tb[IFLA_INET_CONF]) { @@ -1798,6 +1806,9 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; + if (!devconf) + goto out; + if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, IPV4_DEVCONF(*devconf, FORWARDING)) < 0) @@ -1819,6 +1830,7 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0) goto nla_put_failure; +out: nlmsg_end(skb, nlh); return 0; @@ -1827,8 +1839,8 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex, return -EMSGSIZE; } -void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, - struct ipv4_devconf *devconf) +void inet_netconf_notify_devconf(struct net *net, int event, int type, + int ifindex, struct ipv4_devconf *devconf) { struct sk_buff *skb; int err = -ENOBUFS; @@ -1838,7 +1850,7 @@ void inet_netconf_notify_devconf(struct net *net, int type, int ifindex, goto errout; err = inet_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, - RTM_NEWNETCONF, 0, type); + event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); @@ -1861,7 +1873,8 @@ static const struct nla_policy devconf_ipv4_policy[NETCONFA_MAX+1] = { }; static int inet_netconf_get_devconf(struct sk_buff *in_skb, - struct nlmsghdr *nlh) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX+1]; @@ -1874,7 +1887,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb, int err; err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, - devconf_ipv4_policy); + devconf_ipv4_policy, extack); if (err < 0) goto errout; @@ -2017,10 +2030,12 @@ static void inet_forward_change(struct net *net) IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on; IPV4_DEVCONF_DFLT(net, FORWARDING) = on; - inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); - inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); @@ -2033,7 +2048,8 @@ static void inet_forward_change(struct net *net) in_dev = __in_dev_get_rtnl(dev); if (in_dev) { IN_DEV_CONF_SET(in_dev, FORWARDING, on); - inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, dev->ifindex, &in_dev->cnf); } } @@ -2078,19 +2094,22 @@ static int devinet_conf_proc(struct ctl_table *ctl, int write, if (i == IPV4_DEVCONF_RP_FILTER - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); - inet_netconf_notify_devconf(net, NETCONFA_RP_FILTER, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_RP_FILTER, 
ifindex, cnf); } if (i == IPV4_DEVCONF_PROXY_ARP - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); - inet_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_PROXY_NEIGH, ifindex, cnf); } if (i == IPV4_DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN - 1 && new_value != old_value) { ifindex = devinet_conf_ifindex(net, cnf); - inet_netconf_notify_devconf(net, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, ifindex, cnf); } } @@ -2125,7 +2144,7 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, container_of(cnf, struct in_device, cnf); if (*valp) dev_disable_lro(idev->dev); - inet_netconf_notify_devconf(net, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_FORWARDING, idev->dev->ifindex, cnf); @@ -2133,7 +2152,8 @@ static int devinet_sysctl_forward(struct ctl_table *ctl, int write, rtnl_unlock(); rt_cache_flush(net); } else - inet_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv4.devconf_dflt); } @@ -2255,7 +2275,8 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, p->sysctl = t; - inet_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p); + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, + ifindex, p); return 0; free: @@ -2264,16 +2285,18 @@ static int __devinet_sysctl_register(struct net *net, char *dev_name, return -ENOBUFS; } -static void __devinet_sysctl_unregister(struct ipv4_devconf *cnf) +static void __devinet_sysctl_unregister(struct net *net, + struct ipv4_devconf *cnf, int ifindex) { struct devinet_sysctl_table *t = cnf->sysctl; - if (!t) - return; + if (t) { + cnf->sysctl = NULL; + unregister_net_sysctl_table(t->sysctl_header); + kfree(t); + } - cnf->sysctl = NULL; - unregister_net_sysctl_table(t->sysctl_header); - kfree(t); + inet_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL); } static int devinet_sysctl_register(struct in_device *idev) @@ -2295,7 +2318,9 @@ static int devinet_sysctl_register(struct in_device *idev) static void devinet_sysctl_unregister(struct in_device *idev) { - __devinet_sysctl_unregister(&idev->cnf); + struct net *net = dev_net(idev->dev); + + __devinet_sysctl_unregister(net, &idev->cnf, idev->dev->ifindex); neigh_sysctl_unregister(idev->arp_parms); } @@ -2370,9 +2395,9 @@ static __net_init int devinet_init_net(struct net *net) #ifdef CONFIG_SYSCTL err_reg_ctl: - __devinet_sysctl_unregister(dflt); + __devinet_sysctl_unregister(net, dflt, NETCONFA_IFINDEX_DEFAULT); err_reg_dflt: - __devinet_sysctl_unregister(all); + __devinet_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: if (tbl != ctl_forward_entry) kfree(tbl); @@ -2394,8 +2419,10 @@ static __net_exit void devinet_exit_net(struct net *net) tbl = net->ipv4.forw_hdr->ctl_table_arg; unregister_net_sysctl_table(net->ipv4.forw_hdr); - __devinet_sysctl_unregister(net->ipv4.devconf_dflt); - __devinet_sysctl_unregister(net->ipv4.devconf_all); + __devinet_sysctl_unregister(net, net->ipv4.devconf_dflt, + NETCONFA_IFINDEX_DEFAULT); + __devinet_sysctl_unregister(net, net->ipv4.devconf_all, + NETCONFA_IFINDEX_ALL); kfree(tbl); #endif kfree(net->ipv4.devconf_dflt); diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index b1e24446e2972444835dd85c02434f3b089ba552..65cc02bd82bc87991a29135a58c059c7916a5a35 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -152,21 
+152,28 @@ static void esp_output_restore_header(struct sk_buff *skb) } static struct ip_esp_hdr *esp_output_set_extra(struct sk_buff *skb, + struct xfrm_state *x, struct ip_esp_hdr *esph, struct esp_output_extra *extra) { - struct xfrm_state *x = skb_dst(skb)->xfrm; - /* For ESN we move the header forward by 4 bytes to * accomodate the high bits. We will move it back after * encryption. */ if ((x->props.flags & XFRM_STATE_ESN)) { + __u32 seqhi; + struct xfrm_offload *xo = xfrm_offload(skb); + + if (xo) + seqhi = xo->seq.hi; + else + seqhi = XFRM_SKB_CB(skb)->seq.output.hi; + extra->esphoff = (unsigned char *)esph - skb_transport_header(skb); esph = (struct ip_esp_hdr *)((unsigned char *)esph - 4); extra->seqhi = esph->spi; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + esph->seq_no = htonl(seqhi); } esph->spi = x->id.spi; @@ -198,98 +205,56 @@ static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) tail[plen - 1] = proto; } -static int esp_output(struct xfrm_state *x, struct sk_buff *skb) +static void esp_output_udp_encap(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { - struct esp_output_extra *extra; - int err = -ENOMEM; - struct ip_esp_hdr *esph; - struct crypto_aead *aead; - struct aead_request *req; - struct scatterlist *sg, *dsg; - struct sk_buff *trailer; - struct page *page; - void *tmp; - u8 *iv; - u8 *tail; - u8 *vaddr; - int blksize; - int clen; - int alen; - int plen; - int ivlen; - int tfclen; - int nfrags; - int assoclen; - int extralen; - int tailen; - __be64 seqno; - __u8 proto = *skb_mac_header(skb); - - /* skb is pure payload to encrypt */ - - aead = x->data; - alen = crypto_aead_authsize(aead); - ivlen = crypto_aead_ivsize(aead); - - tfclen = 0; - if (x->tfcpad) { - struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); - u32 padto; - - padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached)); - if (skb->len < padto) - tfclen = padto - skb->len; + int encap_type; + struct udphdr *uh; + __be32 *udpdata32; + __be16 sport, dport; + struct xfrm_encap_tmpl *encap = x->encap; + struct ip_esp_hdr *esph = esp->esph; + + spin_lock_bh(&x->lock); + sport = encap->encap_sport; + dport = encap->encap_dport; + encap_type = encap->encap_type; + spin_unlock_bh(&x->lock); + + uh = (struct udphdr *)esph; + uh->source = sport; + uh->dest = dport; + uh->len = htons(skb->len + esp->tailen + - skb_transport_offset(skb)); + uh->check = 0; + + switch (encap_type) { + default: + case UDP_ENCAP_ESPINUDP: + esph = (struct ip_esp_hdr *)(uh + 1); + break; + case UDP_ENCAP_ESPINUDP_NON_IKE: + udpdata32 = (__be32 *)(uh + 1); + udpdata32[0] = udpdata32[1] = 0; + esph = (struct ip_esp_hdr *)(udpdata32 + 2); + break; } - blksize = ALIGN(crypto_aead_blocksize(aead), 4); - clen = ALIGN(skb->len + 2 + tfclen, blksize); - plen = clen - skb->len - tfclen; - tailen = tfclen + plen + alen; - assoclen = sizeof(*esph); - extralen = 0; - if (x->props.flags & XFRM_STATE_ESN) { - extralen += sizeof(*extra); - assoclen += sizeof(__be32); - } + *skb_mac_header(skb) = IPPROTO_UDP; + esp->esph = esph; +} - *skb_mac_header(skb) = IPPROTO_ESP; - esph = ip_esp_hdr(skb); +int esp_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) +{ + u8 *tail; + u8 *vaddr; + int nfrags; + struct page *page; + struct sk_buff *trailer; + int tailen = esp->tailen; /* this is non-NULL only with UDP Encapsulation */ - if (x->encap) { - struct xfrm_encap_tmpl *encap = x->encap; - struct udphdr *uh; - __be32 *udpdata32; - __be16 sport, dport; - int 
encap_type; - - spin_lock_bh(&x->lock); - sport = encap->encap_sport; - dport = encap->encap_dport; - encap_type = encap->encap_type; - spin_unlock_bh(&x->lock); - - uh = (struct udphdr *)esph; - uh->source = sport; - uh->dest = dport; - uh->len = htons(skb->len + tailen - - skb_transport_offset(skb)); - uh->check = 0; - - switch (encap_type) { - default: - case UDP_ENCAP_ESPINUDP: - esph = (struct ip_esp_hdr *)(uh + 1); - break; - case UDP_ENCAP_ESPINUDP_NON_IKE: - udpdata32 = (__be32 *)(uh + 1); - udpdata32[0] = udpdata32[1] = 0; - esph = (struct ip_esp_hdr *)(udpdata32 + 2); - break; - } - - *skb_mac_header(skb) = IPPROTO_UDP; - } + if (x->encap) + esp_output_udp_encap(x, skb, esp); if (!skb_cloned(skb)) { if (tailen <= skb_availroom(skb)) { @@ -304,6 +269,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) struct sock *sk = skb->sk; struct page_frag *pfrag = &x->xfrag; + esp->inplace = false; + allocsize = ALIGN(tailen, L1_CACHE_BYTES); spin_lock_bh(&x->lock); @@ -320,10 +287,12 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) tail = vaddr + pfrag->offset; - esp_output_fill_trailer(tail, tfclen, plen, proto); + esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); kunmap_atomic(vaddr); + spin_unlock_bh(&x->lock); + nfrags = skb_shinfo(skb)->nr_frags; __skb_fill_page_desc(skb, nfrags, page, pfrag->offset, @@ -339,107 +308,113 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) if (sk) atomic_add(tailen, &sk->sk_wmem_alloc); - skb_push(skb, -skb_network_offset(skb)); - - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - esph->spi = x->id.spi; - - tmp = esp_alloc_tmp(aead, nfrags + 2, extralen); - if (!tmp) { - spin_unlock_bh(&x->lock); - err = -ENOMEM; - goto error; - } - - extra = esp_tmp_extra(tmp); - iv = esp_tmp_iv(aead, tmp, extralen); - req = esp_tmp_req(aead, iv); - sg = esp_req_sg(aead, req); - dsg = &sg[nfrags]; - - esph = esp_output_set_extra(skb, esph, extra); - - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + clen + alen); - - allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES); - - if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { - spin_unlock_bh(&x->lock); - err = -ENOMEM; - goto error; - } - - skb_shinfo(skb)->nr_frags = 1; - - page = pfrag->page; - get_page(page); - /* replace page frags in skb with new page */ - __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len); - pfrag->offset = pfrag->offset + allocsize; - - sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1); - skb_to_sgvec(skb, dsg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + clen + alen); - - spin_unlock_bh(&x->lock); - - goto skip_cow2; + goto out; } } cow: - err = skb_cow_data(skb, tailen, &trailer); - if (err < 0) - goto error; - nfrags = err; + nfrags = skb_cow_data(skb, tailen, &trailer); + if (nfrags < 0) + goto out; tail = skb_tail_pointer(trailer); - esph = ip_esp_hdr(skb); + esp->esph = ip_esp_hdr(skb); skip_cow: - esp_output_fill_trailer(tail, tfclen, plen, proto); + esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); + pskb_put(skb, trailer, tailen); - pskb_put(skb, trailer, clen - skb->len + alen); - skb_push(skb, -skb_network_offset(skb)); - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - esph->spi = x->id.spi; +out: + return nfrags; +} +EXPORT_SYMBOL_GPL(esp_output_head); - tmp = esp_alloc_tmp(aead, nfrags, extralen); - if (!tmp) { - err = -ENOMEM; - goto error; +int esp_output_tail(struct xfrm_state 
*x, struct sk_buff *skb, struct esp_info *esp) +{ + u8 *iv; + int alen; + void *tmp; + int ivlen; + int assoclen; + int extralen; + struct page *page; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct aead_request *req; + struct scatterlist *sg, *dsg; + struct esp_output_extra *extra; + int err = -ENOMEM; + + assoclen = sizeof(struct ip_esp_hdr); + extralen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + extralen += sizeof(*extra); + assoclen += sizeof(__be32); } + aead = x->data; + alen = crypto_aead_authsize(aead); + ivlen = crypto_aead_ivsize(aead); + + tmp = esp_alloc_tmp(aead, esp->nfrags + 2, extralen); + if (!tmp) + goto error; + extra = esp_tmp_extra(tmp); iv = esp_tmp_iv(aead, tmp, extralen); req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); - dsg = sg; - esph = esp_output_set_extra(skb, esph, extra); + if (esp->inplace) + dsg = sg; + else + dsg = &sg[esp->nfrags]; - sg_init_table(sg, nfrags); + esph = esp_output_set_extra(skb, x, esp->esph, extra); + esp->esph = esph; + + sg_init_table(sg, esp->nfrags); skb_to_sgvec(skb, sg, (unsigned char *)esph - skb->data, - assoclen + ivlen + clen + alen); + assoclen + ivlen + esp->clen + alen); + + if (!esp->inplace) { + int allocsize; + struct page_frag *pfrag = &x->xfrag; + + allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES); + + spin_lock_bh(&x->lock); + if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { + spin_unlock_bh(&x->lock); + goto error; + } + + skb_shinfo(skb)->nr_frags = 1; + + page = pfrag->page; + get_page(page); + /* replace page frags in skb with new page */ + __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len); + pfrag->offset = pfrag->offset + allocsize; + spin_unlock_bh(&x->lock); + + sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1); + skb_to_sgvec(skb, dsg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + esp->clen + alen); + } -skip_cow2: if ((x->props.flags & XFRM_STATE_ESN)) aead_request_set_callback(req, 0, esp_output_done_esn, skb); else aead_request_set_callback(req, 0, esp_output_done, skb); - aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv); + aead_request_set_crypt(req, sg, dsg, ivlen + esp->clen, iv); aead_request_set_ad(req, assoclen); - seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + - ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); - memset(iv, 0, ivlen); - memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8), + memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&esp->seqno + 8 - min(ivlen, 8), min(ivlen, 8)); ESP_SKB_CB(skb)->tmp = tmp; @@ -465,11 +440,63 @@ static int esp_output(struct xfrm_state *x, struct sk_buff *skb) error: return err; } +EXPORT_SYMBOL_GPL(esp_output_tail); -static int esp_input_done2(struct sk_buff *skb, int err) +static int esp_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int alen; + int blksize; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct esp_info esp; + + esp.inplace = true; + + esp.proto = *skb_mac_header(skb); + *skb_mac_header(skb) = IPPROTO_ESP; + + /* skb is pure payload to encrypt */ + + aead = x->data; + alen = crypto_aead_authsize(aead); + + esp.tfclen = 0; + if (x->tfcpad) { + struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); + u32 padto; + + padto = min(x->tfcpad, esp4_get_mtu(x, dst->child_mtu_cached)); + if (skb->len < padto) + esp.tfclen = padto - skb->len; + } + blksize = ALIGN(crypto_aead_blocksize(aead), 4); + esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize); + esp.plen = esp.clen - skb->len - esp.tfclen; + esp.tailen = esp.tfclen + esp.plen + 
alen; + + esp.esph = ip_esp_hdr(skb); + + esp.nfrags = esp_output_head(x, skb, &esp); + if (esp.nfrags < 0) + return esp.nfrags; + + esph = esp.esph; + esph->spi = x->id.spi; + + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + esp.seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + + ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); + + skb_push(skb, -skb_network_offset(skb)); + + return esp_output_tail(x, skb, &esp); +} + +int esp_input_done2(struct sk_buff *skb, int err) { const struct iphdr *iph; struct xfrm_state *x = xfrm_input_state(skb); + struct xfrm_offload *xo = xfrm_offload(skb); struct crypto_aead *aead = x->data; int alen = crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); @@ -478,7 +505,8 @@ static int esp_input_done2(struct sk_buff *skb, int err) u8 nexthdr[2]; int padlen; - kfree(ESP_SKB_CB(skb)->tmp); + if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) + kfree(ESP_SKB_CB(skb)->tmp); if (unlikely(err)) goto out; @@ -549,6 +577,7 @@ static int esp_input_done2(struct sk_buff *skb, int err) out: return err; } +EXPORT_SYMBOL_GPL(esp_input_done2); static void esp_input_done(struct crypto_async_request *base, int err) { @@ -751,13 +780,17 @@ static int esp_init_aead(struct xfrm_state *x) char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; + u32 mask = 0; err = -ENAMETOOLONG; if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) goto error; - aead = crypto_alloc_aead(aead_name, 0, 0); + if (x->xso.offload_handle) + mask |= CRYPTO_ALG_ASYNC; + + aead = crypto_alloc_aead(aead_name, 0, mask); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -787,6 +820,7 @@ static int esp_init_authenc(struct xfrm_state *x) char authenc_name[CRYPTO_MAX_ALG_NAME]; unsigned int keylen; int err; + u32 mask = 0; err = -EINVAL; if (!x->ealg) @@ -812,7 +846,10 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; } - aead = crypto_alloc_aead(authenc_name, 0, 0); + if (x->xso.offload_handle) + mask |= CRYPTO_ALG_ASYNC; + + aead = crypto_alloc_aead(authenc_name, 0, mask); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -931,7 +968,7 @@ static const struct xfrm_type esp_type = .destructor = esp_destroy, .get_mtu = esp4_get_mtu, .input = esp_input, - .output = esp_output + .output = esp_output, }; static struct xfrm4_protocol esp4_protocol = { diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 1de4426324064ff991399e7ef0f0b98c68386a5c..e0666016a7642c017b4693d32723245adc484982 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -43,27 +43,31 @@ static struct sk_buff **esp4_gro_receive(struct sk_buff **head, if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) goto out; - err = secpath_set(skb); - if (err) - goto out; + xo = xfrm_offload(skb); + if (!xo || !(xo->flags & CRYPTO_DONE)) { + err = secpath_set(skb); + if (err) + goto out; - if (skb->sp->len == XFRM_MAX_DEPTH) - goto out; + if (skb->sp->len == XFRM_MAX_DEPTH) + goto out; - x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, - (xfrm_address_t *)&ip_hdr(skb)->daddr, - spi, IPPROTO_ESP, AF_INET); - if (!x) - goto out; + x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ip_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET); + if (!x) + goto out; - skb->sp->xvec[skb->sp->len++] = x; - skb->sp->olen++; + skb->sp->xvec[skb->sp->len++] = x; + skb->sp->olen++; - xo = xfrm_offload(skb); - if (!xo) { - xfrm_state_put(x); - goto out; + xo = xfrm_offload(skb); 
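
[ For reference while reading the esp_output() split above: esp_output_head()
and esp_output_tail() communicate through the new struct esp_info. The sketch
below is reconstructed from the fields actually used in esp4.c and
esp4_offload.c in this series; the authoritative definition lives in
include/net/esp.h and may differ in field order or additional members:

struct esp_info {
	struct ip_esp_hdr *esph;	/* ESP header position in the skb */
	__be64 seqno;			/* 64-bit sequence number fed into the IV */
	int tfclen;			/* TFC padding requested via x->tfcpad */
	int tailen;			/* tfclen + plen + ICV size (alen) */
	int plen;			/* padding up to the cipher block size */
	int clen;			/* resulting ciphertext length */
	int nfrags;			/* return value of esp_output_head() */
	__u8 proto;			/* inner protocol, stored in the trailer */
	bool inplace;			/* encrypt in place vs. fresh page frags */
};
]
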
+ if (!xo) { + xfrm_state_put(x); + goto out; + } } + xo->flags |= XFRM_GRO; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = NULL; @@ -84,19 +88,214 @@ static struct sk_buff **esp4_gro_receive(struct sk_buff **head, return NULL; } +static void esp4_gso_encap(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ip_esp_hdr *esph; + struct iphdr *iph = ip_hdr(skb); + struct xfrm_offload *xo = xfrm_offload(skb); + int proto = iph->protocol; + + skb_push(skb, -skb_network_offset(skb)); + esph = ip_esp_hdr(skb); + *skb_mac_header(skb) = IPPROTO_ESP; + + esph->spi = x->id.spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + + xo->proto = proto; +} + +static struct sk_buff *esp4_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + __u32 seq; + int err = 0; + struct sk_buff *skb2; + struct xfrm_state *x; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct sk_buff *segs = ERR_PTR(-EINVAL); + netdev_features_t esp_features = features; + struct xfrm_offload *xo = xfrm_offload(skb); + + if (!xo) + goto out; + + seq = xo->seq.low; + + x = skb->sp->xvec[skb->sp->len - 1]; + aead = x->data; + esph = ip_esp_hdr(skb); + + if (esph->spi != x->id.spi) + goto out; + + if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) + goto out; + + __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); + + skb->encap_hdr_csum = 1; + + if (!(features & NETIF_F_HW_ESP)) + esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); + + segs = x->outer_mode->gso_segment(x, skb, esp_features); + if (IS_ERR_OR_NULL(segs)) + goto out; + + __skb_pull(skb, skb->data - skb_mac_header(skb)); + + skb2 = segs; + do { + struct sk_buff *nskb = skb2->next; + + xo = xfrm_offload(skb2); + xo->flags |= XFRM_GSO_SEGMENT; + xo->seq.low = seq; + xo->seq.hi = xfrm_replay_seqhi(x, seq); + + if(!(features & NETIF_F_HW_ESP)) + xo->flags |= CRYPTO_FALLBACK; + + x->outer_mode->xmit(x, skb2); + + err = x->type_offload->xmit(x, skb2, esp_features); + if (err) { + kfree_skb_list(segs); + return ERR_PTR(err); + } + + if (!skb_is_gso(skb2)) + seq++; + else + seq += skb_shinfo(skb2)->gso_segs; + + skb_push(skb2, skb2->mac_len); + skb2 = nskb; + } while (skb2); + +out: + return segs; +} + +static int esp_input_tail(struct xfrm_state *x, struct sk_buff *skb) +{ + struct crypto_aead *aead = x->data; + + if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead))) + return -EINVAL; + + skb->ip_summed = CHECKSUM_NONE; + + return esp_input_done2(skb, 0); +} + +static int esp_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) +{ + int err; + int alen; + int blksize; + struct xfrm_offload *xo; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct esp_info esp; + bool hw_offload = true; + + esp.inplace = true; + + xo = xfrm_offload(skb); + + if (!xo) + return -EINVAL; + + if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || + (x->xso.dev != skb->dev)) { + xo->flags |= CRYPTO_FALLBACK; + hw_offload = false; + } + + esp.proto = xo->proto; + + /* skb is pure payload to encrypt */ + + aead = x->data; + alen = crypto_aead_authsize(aead); + + esp.tfclen = 0; + /* XXX: Add support for tfc padding here. 
*/ + + blksize = ALIGN(crypto_aead_blocksize(aead), 4); + esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize); + esp.plen = esp.clen - skb->len - esp.tfclen; + esp.tailen = esp.tfclen + esp.plen + alen; + + esp.esph = ip_esp_hdr(skb); + + + if (!hw_offload || (hw_offload && !skb_is_gso(skb))) { + esp.nfrags = esp_output_head(x, skb, &esp); + if (esp.nfrags < 0) + return esp.nfrags; + } + + esph = esp.esph; + esph->spi = x->id.spi; + + skb_push(skb, -skb_network_offset(skb)); + + if (xo->flags & XFRM_GSO_SEGMENT) { + esph->seq_no = htonl(xo->seq.low); + } else { + ip_hdr(skb)->tot_len = htons(skb->len); + ip_send_check(ip_hdr(skb)); + } + + if (hw_offload) + return 0; + + esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); + + err = esp_output_tail(x, skb, &esp); + if (err < 0) + return err; + + secpath_reset(skb); + + return 0; +} + static const struct net_offload esp4_offload = { .callbacks = { .gro_receive = esp4_gro_receive, + .gso_segment = esp4_gso_segment, }, }; +static const struct xfrm_type_offload esp_type_offload = { + .description = "ESP4 OFFLOAD", + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, + .input_tail = esp_input_tail, + .xmit = esp_xmit, + .encap = esp4_gso_encap, +}; + static int __init esp4_offload_init(void) { + if (xfrm_register_type_offload(&esp_type_offload, AF_INET) < 0) { + pr_info("%s: can't add xfrm type offload\n", __func__); + return -EAGAIN; + } + return inet_add_offload(&esp4_offload, IPPROTO_ESP); } static void __exit esp4_offload_exit(void) { + if (xfrm_unregister_type_offload(&esp_type_offload, AF_INET) < 0) + pr_info("%s: can't remove xfrm type offload\n", __func__); + inet_del_offload(&esp4_offload, IPPROTO_ESP); } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 8f2133ffc2ff1b94871408a5f934cb938d3462b5..39bd1edee67649af86eb582c0ea478713360a506 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -632,7 +632,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, int err, remaining; struct rtmsg *rtm; - err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy); + err = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipv4_policy, + NULL); if (err < 0) goto errout; @@ -709,7 +710,8 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, return err; } -static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) +static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; @@ -731,7 +733,8 @@ static int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) return err; } -static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) +static int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct fib_config cfg; @@ -1127,7 +1130,8 @@ static void fib_disable_ip(struct net_device *dev, unsigned long event, { if (fib_sync_down_dev(dev, event, force)) fib_flush(dev_net(dev)); - rt_cache_flush(dev_net(dev)); + else + rt_cache_flush(dev_net(dev)); arp_ifdown(dev); } diff --git a/net/ipv4/fib_notifier.c b/net/ipv4/fib_notifier.c new file mode 100644 index 0000000000000000000000000000000000000000..e0714d975947160565dfb1bd9a86bc7a592c73b1 --- /dev/null +++ b/net/ipv4/fib_notifier.c @@ -0,0 +1,86 @@ +#include +#include +#include +#include +#include +#include +#include + +static ATOMIC_NOTIFIER_HEAD(fib_chain); + +int call_fib_notifier(struct notifier_block *nb, 
struct net *net, + enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + info->net = net; + return nb->notifier_call(nb, event_type, info); +} + +int call_fib_notifiers(struct net *net, enum fib_event_type event_type, + struct fib_notifier_info *info) +{ + net->ipv4.fib_seq++; + info->net = net; + return atomic_notifier_call_chain(&fib_chain, event_type, info); +} + +static unsigned int fib_seq_sum(void) +{ + unsigned int fib_seq = 0; + struct net *net; + + rtnl_lock(); + for_each_net(net) + fib_seq += net->ipv4.fib_seq; + rtnl_unlock(); + + return fib_seq; +} + +static bool fib_dump_is_consistent(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb), + unsigned int fib_seq) +{ + atomic_notifier_chain_register(&fib_chain, nb); + if (fib_seq == fib_seq_sum()) + return true; + atomic_notifier_chain_unregister(&fib_chain, nb); + if (cb) + cb(nb); + return false; +} + +#define FIB_DUMP_MAX_RETRIES 5 +int register_fib_notifier(struct notifier_block *nb, + void (*cb)(struct notifier_block *nb)) +{ + int retries = 0; + + do { + unsigned int fib_seq = fib_seq_sum(); + struct net *net; + + /* Mutex semantics guarantee that every change done to + * FIB tries before we read the change sequence counter + * is now visible to us. + */ + rcu_read_lock(); + for_each_net_rcu(net) { + fib_rules_notify(net, nb); + fib_notify(net, nb); + } + rcu_read_unlock(); + + if (fib_dump_is_consistent(nb, cb, fib_seq)) + return 0; + } while (++retries < FIB_DUMP_MAX_RETRIES); + + return -EBUSY; +} +EXPORT_SYMBOL(register_fib_notifier); + +int unregister_fib_notifier(struct notifier_block *nb) +{ + return atomic_notifier_chain_unregister(&fib_chain, nb); +} +EXPORT_SYMBOL(unregister_fib_notifier); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 2e50062f642d61bdc0c0893de4e4ff84b5be6c8d..778ecf977eb2bd7b7b3808691aaf818fc4bd680d 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -47,6 +47,27 @@ struct fib4_rule { #endif }; +static bool fib4_rule_matchall(const struct fib_rule *rule) +{ + struct fib4_rule *r = container_of(rule, struct fib4_rule, common); + + if (r->dst_len || r->src_len || r->tos) + return false; + return fib_rule_matchall(rule); +} + +bool fib4_rule_default(const struct fib_rule *rule) +{ + if (!fib4_rule_matchall(rule) || rule->action != FR_ACT_TO_TBL || + rule->l3mdev) + return false; + if (rule->table != RT_TABLE_LOCAL && rule->table != RT_TABLE_MAIN && + rule->table != RT_TABLE_DEFAULT) + return false; + return true; +} +EXPORT_SYMBOL_GPL(fib4_rule_default); + int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { @@ -164,12 +185,36 @@ static struct fib_table *fib_empty_table(struct net *net) return NULL; } +static int call_fib_rule_notifier(struct notifier_block *nb, struct net *net, + enum fib_event_type event_type, + struct fib_rule *rule) +{ + struct fib_rule_notifier_info info = { + .rule = rule, + }; + + return call_fib_notifier(nb, net, event_type, &info.info); +} + static int call_fib_rule_notifiers(struct net *net, - enum fib_event_type event_type) + enum fib_event_type event_type, + struct fib_rule *rule) +{ + struct fib_rule_notifier_info info = { + .rule = rule, + }; + + return call_fib_notifiers(net, event_type, &info.info); +} + +/* Called with rcu_read_lock() */ +void fib_rules_notify(struct net *net, struct notifier_block *nb) { - struct fib_notifier_info info; + struct fib_rules_ops *ops = net->ipv4.rules_ops; + struct fib_rule *rule; - return call_fib_notifiers(net, event_type, 
&info); + list_for_each_entry_rcu(rule, &ops->rules_list, list) + call_fib_rule_notifier(nb, net, FIB_EVENT_RULE_ADD, rule); } static const struct nla_policy fib4_rule_policy[FRA_MAX+1] = { @@ -228,7 +273,7 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, rule4->tos = frh->tos; net->ipv4.fib_has_custom_rules = true; - call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_ADD, rule); err = 0; errout: @@ -250,7 +295,7 @@ static int fib4_rule_delete(struct fib_rule *rule) net->ipv4.fib_num_tclassid_users--; #endif net->ipv4.fib_has_custom_rules = true; - call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL); + call_fib_rule_notifiers(net, FIB_EVENT_RULE_DEL, rule); errout: return err; } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 317026a39cfa2b49bf06182d89a11af0fa2688af..da449ddb8cc172bd9091c00057a69a095f98b56d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -57,7 +57,6 @@ static unsigned int fib_info_cnt; static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; #ifdef CONFIG_IP_ROUTE_MULTIPATH -u32 fib_multipath_secret __read_mostly; #define for_nexthops(fi) { \ int nhsel; const struct fib_nh *nh; \ @@ -576,9 +575,6 @@ static void fib_rebalance(struct fib_info *fi) atomic_set(&nexthop_nh->nh_upper_bound, upper_bound); } endfor_nexthops(fi); - - net_get_random_once(&fib_multipath_secret, - sizeof(fib_multipath_secret)); } static inline void fib_add_weight(struct fib_info *fi, @@ -1641,7 +1637,7 @@ void fib_select_multipath(struct fib_result *res, int hash) #endif void fib_select_path(struct net *net, struct fib_result *res, - struct flowi4 *fl4, int mp_hash) + struct flowi4 *fl4, const struct sk_buff *skb) { bool oif_check; @@ -1650,10 +1646,9 @@ void fib_select_path(struct net *net, struct fib_result *res, #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi->fib_nhs > 1 && oif_check) { - if (mp_hash < 0) - mp_hash = get_hash_from_flowi4(fl4) >> 1; + int h = fib_multipath_hash(res->fi, fl4, skb); - fib_select_multipath(res, mp_hash); + fib_select_multipath(res, h); } else #endif diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 2f0d8233950faeac91f287644bc9476f19f74578..1201409ba1dcb18ee028003b065410b87bf4a602 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -84,43 +84,6 @@ #include #include "fib_lookup.h" -static unsigned int fib_seq_sum(void) -{ - unsigned int fib_seq = 0; - struct net *net; - - rtnl_lock(); - for_each_net(net) - fib_seq += net->ipv4.fib_seq; - rtnl_unlock(); - - return fib_seq; -} - -static ATOMIC_NOTIFIER_HEAD(fib_chain); - -static int call_fib_notifier(struct notifier_block *nb, struct net *net, - enum fib_event_type event_type, - struct fib_notifier_info *info) -{ - info->net = net; - return nb->notifier_call(nb, event_type, info); -} - -static void fib_rules_notify(struct net *net, struct notifier_block *nb, - enum fib_event_type event_type) -{ -#ifdef CONFIG_IP_MULTIPLE_TABLES - struct fib_notifier_info info; - - if (net->ipv4.fib_has_custom_rules) - call_fib_notifier(nb, net, event_type, &info); -#endif -} - -static void fib_notify(struct net *net, struct notifier_block *nb, - enum fib_event_type event_type); - static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_info *fi, @@ -137,62 +100,6 @@ static int call_fib_entry_notifier(struct notifier_block *nb, struct net *net, return call_fib_notifier(nb, net, event_type, &info.info); } -static 
bool fib_dump_is_consistent(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb), - unsigned int fib_seq) -{ - atomic_notifier_chain_register(&fib_chain, nb); - if (fib_seq == fib_seq_sum()) - return true; - atomic_notifier_chain_unregister(&fib_chain, nb); - if (cb) - cb(nb); - return false; -} - -#define FIB_DUMP_MAX_RETRIES 5 -int register_fib_notifier(struct notifier_block *nb, - void (*cb)(struct notifier_block *nb)) -{ - int retries = 0; - - do { - unsigned int fib_seq = fib_seq_sum(); - struct net *net; - - /* Mutex semantics guarantee that every change done to - * FIB tries before we read the change sequence counter - * is now visible to us. - */ - rcu_read_lock(); - for_each_net_rcu(net) { - fib_rules_notify(net, nb, FIB_EVENT_RULE_ADD); - fib_notify(net, nb, FIB_EVENT_ENTRY_ADD); - } - rcu_read_unlock(); - - if (fib_dump_is_consistent(nb, cb, fib_seq)) - return 0; - } while (++retries < FIB_DUMP_MAX_RETRIES); - - return -EBUSY; -} -EXPORT_SYMBOL(register_fib_notifier); - -int unregister_fib_notifier(struct notifier_block *nb) -{ - return atomic_notifier_chain_unregister(&fib_chain, nb); -} -EXPORT_SYMBOL(unregister_fib_notifier); - -int call_fib_notifiers(struct net *net, enum fib_event_type event_type, - struct fib_notifier_info *info) -{ - net->ipv4.fib_seq++; - info->net = net; - return atomic_notifier_call_chain(&fib_chain, event_type, info); -} - static int call_fib_entry_notifiers(struct net *net, enum fib_event_type event_type, u32 dst, int dst_len, struct fib_info *fi, @@ -1995,8 +1902,7 @@ int fib_table_flush(struct net *net, struct fib_table *tb) } static void fib_leaf_notify(struct net *net, struct key_vector *l, - struct fib_table *tb, struct notifier_block *nb, - enum fib_event_type event_type) + struct fib_table *tb, struct notifier_block *nb) { struct fib_alias *fa; @@ -2012,22 +1918,21 @@ static void fib_leaf_notify(struct net *net, struct key_vector *l, if (tb->tb_id != fa->tb_id) continue; - call_fib_entry_notifier(nb, net, event_type, l->key, + call_fib_entry_notifier(nb, net, FIB_EVENT_ENTRY_ADD, l->key, KEYLENGTH - fa->fa_slen, fi, fa->fa_tos, fa->fa_type, fa->tb_id); } } static void fib_table_notify(struct net *net, struct fib_table *tb, - struct notifier_block *nb, - enum fib_event_type event_type) + struct notifier_block *nb) { struct trie *t = (struct trie *)tb->tb_data; struct key_vector *l, *tp = t->kv; t_key key = 0; while ((l = leaf_walk_rcu(&tp, key)) != NULL) { - fib_leaf_notify(net, l, tb, nb, event_type); + fib_leaf_notify(net, l, tb, nb); key = l->key + 1; /* stop in case of wrap around */ @@ -2036,8 +1941,7 @@ static void fib_table_notify(struct net *net, struct fib_table *tb, } } -static void fib_notify(struct net *net, struct notifier_block *nb, - enum fib_event_type event_type) +void fib_notify(struct net *net, struct notifier_block *nb) { unsigned int h; @@ -2046,7 +1950,7 @@ static void fib_notify(struct net *net, struct notifier_block *nb, struct fib_table *tb; hlist_for_each_entry_rcu(tb, head, tb_hlist) - fib_table_notify(net, tb, nb, event_type); + fib_table_notify(net, tb, nb); } } diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index fc310db2708bf6c9e96befe413e89ac931818f74..43318b5f56474bc15253e74e156962dd2c8df01f 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -464,22 +464,6 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) local_bh_enable(); } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - -/* Source and destination is swapped. 
See ip_multipath_icmp_hash */ -static int icmp_multipath_hash_skb(const struct sk_buff *skb) -{ - const struct iphdr *iph = ip_hdr(skb); - - return fib_multipath_hash(iph->daddr, iph->saddr); -} - -#else - -#define icmp_multipath_hash_skb(skb) (-1) - -#endif - static struct rtable *icmp_route_lookup(struct net *net, struct flowi4 *fl4, struct sk_buff *skb_in, @@ -505,8 +489,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); - rt = __ip_route_output_key_hash(net, fl4, - icmp_multipath_hash_skb(skb_in)); + rt = __ip_route_output_key_hash(net, fl4, skb_in); if (IS_ERR(rt)) return rt; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 5e313c1ac94fc88eca5fe3a0e9e46e551e955ff0..1054d330bf9df3189a21dbb08e27c0e6ad136775 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -794,6 +794,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, /* listeners have SOCK_RCU_FREE, not the children */ sock_reset_flag(newsk, SOCK_RCU_FREE); + inet_sk(newsk)->mc_list = NULL; + newsk->sk_mark = inet_rsk(req)->ir_mark; atomic64_set(&newsk->sk_cookie, atomic64_read(&inet_rsk(req)->ir_cookie)); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 8bea74298173f5198e7b3083b5afb5e1398df977..e9a59d2d91d4061f299065173d9212eefe72ef89 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -678,11 +678,7 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) /* no more locks than number of hash buckets */ nblocks = min(nblocks, hashinfo->ehash_mask + 1); - hashinfo->ehash_locks = kmalloc_array(nblocks, locksz, - GFP_KERNEL | __GFP_NOWARN); - if (!hashinfo->ehash_locks) - hashinfo->ehash_locks = vmalloc(nblocks * locksz); - + hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); if (!hashinfo->ehash_locks) return -ENOMEM; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index c9c1cb635d9afd0c5ccdec4402aa2aac29cc889a..e90c80a548ad8f61b8542fdd8a2a55a2562a0e07 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -829,7 +829,8 @@ static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[]) static int ipgre_netlink_parms(struct net_device *dev, struct nlattr *data[], struct nlattr *tb[], - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm *parms, + __u32 *fwmark) { struct ip_tunnel *t = netdev_priv(dev); @@ -886,6 +887,9 @@ static int ipgre_netlink_parms(struct net_device *dev, t->ignore_df = !!nla_get_u8(data[IFLA_GRE_IGNORE_DF]); } + if (data[IFLA_GRE_FWMARK]) + *fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); + return 0; } @@ -957,6 +961,7 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, { struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + __u32 fwmark = 0; int err; if (ipgre_netlink_encap_parms(data, &ipencap)) { @@ -967,31 +972,32 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, return err; } - err = ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; - return ip_tunnel_newlink(dev, tb, &p); + return ip_tunnel_newlink(dev, tb, &p, fwmark); } static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { + struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + __u32 fwmark = t->fwmark; int err; if (ipgre_netlink_encap_parms(data, 
&ipencap)) { - struct ip_tunnel *t = netdev_priv(dev); err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - err = ipgre_netlink_parms(dev, data, tb, &p); + err = ipgre_netlink_parms(dev, data, tb, &p, &fwmark); if (err < 0) return err; - return ip_tunnel_changelink(dev, tb, &p); + return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t ipgre_get_size(const struct net_device *dev) @@ -1029,6 +1035,8 @@ static size_t ipgre_get_size(const struct net_device *dev) nla_total_size(0) + /* IFLA_GRE_IGNORE_DF */ nla_total_size(1) + + /* IFLA_GRE_FWMARK */ + nla_total_size(4) + 0; } @@ -1049,7 +1057,8 @@ static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_TTL, p->iph.ttl) || nla_put_u8(skb, IFLA_GRE_TOS, p->iph.tos) || nla_put_u8(skb, IFLA_GRE_PMTUDISC, - !!(p->iph.frag_off & htons(IP_DF)))) + !!(p->iph.frag_off & htons(IP_DF))) || + nla_put_u32(skb, IFLA_GRE_FWMARK, t->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, @@ -1093,6 +1102,7 @@ static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_GRE_COLLECT_METADATA] = { .type = NLA_FLAG }, [IFLA_GRE_IGNORE_DF] = { .type = NLA_U8 }, + [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ipgre_link_ops __read_mostly = { diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index d6feabb0351607f282e1f78f159c0ccb88bcec96..fa2dc8f692c631f1ff7fe814c3ee27f0de2a41d8 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -313,6 +313,7 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); struct rtable *rt; struct net_device *dev = skb->dev; + void (*edemux)(struct sk_buff *skb); /* if ingress device is enslaved to an L3 master device pass the * skb to its handler for processing @@ -329,8 +330,8 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) int protocol = iph->protocol; ipprot = rcu_dereference(inet_protos[protocol]); - if (ipprot && ipprot->early_demux) { - ipprot->early_demux(skb); + if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) { + edemux(skb); /* must reload iph, skb->head might have changed */ iph = ip_hdr(skb); } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index ebd953bc5607f3b25fffddcb26a5c65e5490cb2b..ec4fe3d4b5c9c17d53d7464b42b82a564ae48e54 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -330,7 +330,6 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, sent to multicast group to reach destination designated router. */ struct ip_ra_chain __rcu *ip_ra_chain; -static DEFINE_SPINLOCK(ip_ra_lock); static void ip_ra_destroy_rcu(struct rcu_head *head) @@ -352,21 +351,17 @@ int ip_ra_control(struct sock *sk, unsigned char on, new_ra = on ? 
kmalloc(sizeof(*new_ra), GFP_KERNEL) : NULL; - spin_lock_bh(&ip_ra_lock); for (rap = &ip_ra_chain; - (ra = rcu_dereference_protected(*rap, - lockdep_is_held(&ip_ra_lock))) != NULL; + (ra = rtnl_dereference(*rap)) != NULL; rap = &ra->next) { if (ra->sk == sk) { if (on) { - spin_unlock_bh(&ip_ra_lock); kfree(new_ra); return -EADDRINUSE; } /* dont let ip_call_ra_chain() use sk again */ ra->sk = NULL; RCU_INIT_POINTER(*rap, ra->next); - spin_unlock_bh(&ip_ra_lock); if (ra->destructor) ra->destructor(sk); @@ -380,17 +375,14 @@ int ip_ra_control(struct sock *sk, unsigned char on, return 0; } } - if (!new_ra) { - spin_unlock_bh(&ip_ra_lock); + if (!new_ra) return -ENOBUFS; - } new_ra->sk = sk; new_ra->destructor = destructor; RCU_INIT_POINTER(new_ra->next, ra); rcu_assign_pointer(*rap, new_ra); sock_hold(sk); - spin_unlock_bh(&ip_ra_lock); return 0; } @@ -488,16 +480,15 @@ static bool ipv4_datagram_support_cmsg(const struct sock *sk, return false; /* Support IP_PKTINFO on tstamp packets if requested, to correlate - * timestamp with egress dev. Not possible for packets without dev + * timestamp with egress dev. Not possible for packets without iif * or without payload (SOF_TIMESTAMPING_OPT_TSONLY). */ - if ((!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG)) || - (!skb->dev)) + info = PKTINFO_SKB_CB(skb); + if (!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_CMSG) || + !info->ipi_ifindex) return false; - info = PKTINFO_SKB_CB(skb); info->ipi_spec_dst.s_addr = ip_hdr(skb)->saddr; - info->ipi_ifindex = skb->dev->ifindex; return true; } @@ -591,6 +582,7 @@ static bool setsockopt_needs_rtnl(int optname) case MCAST_LEAVE_GROUP: case MCAST_LEAVE_SOURCE_GROUP: case MCAST_UNBLOCK_SOURCE: + case IP_ROUTER_ALERT: return true; } return false; diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 823abaef006bd353cf0466f14dd3abaa26f80c07..b878ecbc0608fb433ae858b70e6e0101aa20fc4e 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -293,7 +293,8 @@ static struct net_device *__ip_tunnel_create(struct net *net, static inline void init_tunnel_flow(struct flowi4 *fl4, int proto, __be32 daddr, __be32 saddr, - __be32 key, __u8 tos, int oif) + __be32 key, __u8 tos, int oif, + __u32 mark) { memset(fl4, 0, sizeof(*fl4)); fl4->flowi4_oif = oif; @@ -302,6 +303,7 @@ static inline void init_tunnel_flow(struct flowi4 *fl4, fl4->flowi4_tos = tos; fl4->flowi4_proto = proto; fl4->fl4_gre_key = key; + fl4->flowi4_mark = mark; } static int ip_tunnel_bind_dev(struct net_device *dev) @@ -322,7 +324,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev) init_tunnel_flow(&fl4, iph->protocol, iph->daddr, iph->saddr, tunnel->parms.o_key, - RT_TOS(iph->tos), tunnel->parms.link); + RT_TOS(iph->tos), tunnel->parms.link, + tunnel->fwmark); rt = ip_route_output_key(tunnel->net, &fl4); if (!IS_ERR(rt)) { @@ -578,7 +581,7 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, u8 proto) tos = ipv6_get_dsfield((const struct ipv6hdr *)inner_iph); } init_tunnel_flow(&fl4, proto, key->u.ipv4.dst, key->u.ipv4.src, 0, - RT_TOS(tos), tunnel->parms.link); + RT_TOS(tos), tunnel->parms.link, tunnel->fwmark); if (tunnel->encap.type != TUNNEL_ENCAP_NONE) goto tx_error; rt = ip_route_output_key(tunnel->net, &fl4); @@ -707,7 +710,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, } init_tunnel_flow(&fl4, protocol, dst, tnl_params->saddr, - tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link); + tunnel->parms.o_key, RT_TOS(tos), tunnel->parms.link, + tunnel->fwmark); if (ip_tunnel_encap(skb, tunnel, &protocol, 
&fl4) < 0) goto tx_error; @@ -795,7 +799,8 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, struct ip_tunnel *t, struct net_device *dev, struct ip_tunnel_parm *p, - bool set_mtu) + bool set_mtu, + __u32 fwmark) { ip_tunnel_del(itn, t); t->parms.iph.saddr = p->iph.saddr; @@ -812,10 +817,11 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn, t->parms.iph.tos = p->iph.tos; t->parms.iph.frag_off = p->iph.frag_off; - if (t->parms.link != p->link) { + if (t->parms.link != p->link || t->fwmark != fwmark) { int mtu; t->parms.link = p->link; + t->fwmark = fwmark; mtu = ip_tunnel_bind_dev(dev); if (set_mtu) dev->mtu = mtu; @@ -893,7 +899,7 @@ int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd) if (t) { err = 0; - ip_tunnel_update(itn, t, dev, p, true); + ip_tunnel_update(itn, t, dev, p, true, 0); } else { err = -ENOENT; } @@ -1066,7 +1072,7 @@ void ip_tunnel_delete_net(struct ip_tunnel_net *itn, struct rtnl_link_ops *ops) EXPORT_SYMBOL_GPL(ip_tunnel_delete_net); int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p) + struct ip_tunnel_parm *p, __u32 fwmark) { struct ip_tunnel *nt; struct net *net = dev_net(dev); @@ -1087,6 +1093,7 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], nt->net = net; nt->parms = *p; + nt->fwmark = fwmark; err = register_netdevice(dev); if (err) goto out; @@ -1105,7 +1112,7 @@ int ip_tunnel_newlink(struct net_device *dev, struct nlattr *tb[], EXPORT_SYMBOL_GPL(ip_tunnel_newlink); int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], - struct ip_tunnel_parm *p) + struct ip_tunnel_parm *p, __u32 fwmark) { struct ip_tunnel *t; struct ip_tunnel *tunnel = netdev_priv(dev); @@ -1137,7 +1144,7 @@ int ip_tunnel_changelink(struct net_device *dev, struct nlattr *tb[], } } - ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU]); + ip_tunnel_update(itn, t, dev, p, !tb[IFLA_MTU], fwmark); return 0; } EXPORT_SYMBOL_GPL(ip_tunnel_changelink); diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index a31f47ccaad90deadb686b2a9091694c8c1ecb9d..baf196eaf1d81809a9264fb94daa58eae51f1bff 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -235,7 +235,7 @@ static int ip_tun_build_state(struct nlattr *attr, struct nlattr *tb[LWTUNNEL_IP_MAX + 1]; int err; - err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy); + err = nla_parse_nested(tb, LWTUNNEL_IP_MAX, attr, ip_tun_policy, NULL); if (err < 0) return err; @@ -332,7 +332,8 @@ static int ip6_tun_build_state(struct nlattr *attr, struct nlattr *tb[LWTUNNEL_IP6_MAX + 1]; int err; - err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy); + err = nla_parse_nested(tb, LWTUNNEL_IP6_MAX, attr, ip6_tun_policy, + NULL); if (err < 0) return err; diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c index 8b14f1404c8f7315495877a34ad6757aee4232a8..4ec9affb2252408b0c218ad63b17a891ff2ab8ed 100644 --- a/net/ipv4/ip_vti.c +++ b/net/ipv4/ip_vti.c @@ -471,7 +471,8 @@ static int vti_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) } static void vti_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm *parms, + __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -497,24 +498,29 @@ static void vti_netlink_parms(struct nlattr *data[], if (data[IFLA_VTI_REMOTE]) parms->iph.daddr = nla_get_in_addr(data[IFLA_VTI_REMOTE]); + if (data[IFLA_VTI_FWMARK]) + *fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } static int vti_newlink(struct net *src_net, struct 
net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct ip_tunnel_parm parms; + __u32 fwmark = 0; - vti_netlink_parms(data, &parms); - return ip_tunnel_newlink(dev, tb, &parms); + vti_netlink_parms(data, &parms, &fwmark); + return ip_tunnel_newlink(dev, tb, &parms, fwmark); } static int vti_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { + struct ip_tunnel *t = netdev_priv(dev); + __u32 fwmark = t->fwmark; struct ip_tunnel_parm p; - vti_netlink_parms(data, &p); - return ip_tunnel_changelink(dev, tb, &p); + vti_netlink_parms(data, &p, &fwmark); + return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t vti_get_size(const struct net_device *dev) @@ -530,6 +536,8 @@ static size_t vti_get_size(const struct net_device *dev) nla_total_size(4) + /* IFLA_VTI_REMOTE */ nla_total_size(4) + + /* IFLA_VTI_FWMARK */ + nla_total_size(4) + 0; } @@ -538,11 +546,13 @@ static int vti_fill_info(struct sk_buff *skb, const struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm *p = &t->parms; - nla_put_u32(skb, IFLA_VTI_LINK, p->link); - nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key); - nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key); - nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr); - nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr); + if (nla_put_u32(skb, IFLA_VTI_LINK, p->link) || + nla_put_be32(skb, IFLA_VTI_IKEY, p->i_key) || + nla_put_be32(skb, IFLA_VTI_OKEY, p->o_key) || + nla_put_in_addr(skb, IFLA_VTI_LOCAL, p->iph.saddr) || + nla_put_in_addr(skb, IFLA_VTI_REMOTE, p->iph.daddr) || + nla_put_u32(skb, IFLA_VTI_FWMARK, t->fwmark)) + return -EMSGSIZE; return 0; } @@ -553,6 +563,7 @@ static const struct nla_policy vti_policy[IFLA_VTI_MAX + 1] = { [IFLA_VTI_OKEY] = { .type = NLA_U32 }, [IFLA_VTI_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) }, [IFLA_VTI_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) }, + [IFLA_VTI_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vti_link_ops __read_mostly = { diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index dfb2ab2dd3c84d93b8d77df41d1160d26f162fc3..c3b12b1c71621b942dd4f9ad1d60ab5bb3c66e20 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 00d4229b6954262e556f7a8fd9c072638d49a8d6..1e441c6f216025339caf5d82a04ea3570566641a 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -390,7 +390,8 @@ static int ipip_tunnel_validate(struct nlattr *tb[], struct nlattr *data[]) } static void ipip_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms, bool *collect_md) + struct ip_tunnel_parm *parms, bool *collect_md, + __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -428,6 +429,9 @@ static void ipip_netlink_parms(struct nlattr *data[], if (data[IFLA_IPTUN_COLLECT_METADATA]) *collect_md = true; + + if (data[IFLA_IPTUN_FWMARK]) + *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -470,6 +474,7 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; + __u32 fwmark = 0; if (ipip_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); @@ -478,26 +483,27 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, return err; } - ipip_netlink_parms(data, &p, &t->collect_md); - return 
ip_tunnel_newlink(dev, tb, &p); + ipip_netlink_parms(data, &p, &t->collect_md, &fwmark); + return ip_tunnel_newlink(dev, tb, &p, fwmark); } static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { + struct ip_tunnel *t = netdev_priv(dev); struct ip_tunnel_parm p; struct ip_tunnel_encap ipencap; bool collect_md; + __u32 fwmark = t->fwmark; if (ipip_netlink_encap_parms(data, &ipencap)) { - struct ip_tunnel *t = netdev_priv(dev); int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; } - ipip_netlink_parms(data, &p, &collect_md); + ipip_netlink_parms(data, &p, &collect_md, &fwmark); if (collect_md) return -EINVAL; @@ -505,7 +511,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) return -EINVAL; - return ip_tunnel_changelink(dev, tb, &p); + return ip_tunnel_changelink(dev, tb, &p, fwmark); } static size_t ipip_get_size(const struct net_device *dev) @@ -535,6 +541,8 @@ static size_t ipip_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + + /* IFLA_IPTUN_FWMARK */ + nla_total_size(4) + 0; } @@ -550,7 +558,8 @@ static int ipip_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_TOS, parm->iph.tos) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, - !!(parm->iph.frag_off & htons(IP_DF)))) + !!(parm->iph.frag_off & htons(IP_DF))) || + nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, @@ -585,6 +594,7 @@ static const struct nla_policy ipip_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, + [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ipip_link_ops __read_mostly = { diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index c0317c940bcdc303015f500b52198e0862440e17..3a02d52ed50ec54ce3f9f3eb07dc9f4133535a80 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -631,7 +631,7 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, in_dev = __in_dev_get_rtnl(dev); if (in_dev) { IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; - inet_netconf_notify_devconf(dev_net(dev), + inet_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in_dev->cnf); ip_rt_multicast_event(in_dev); @@ -820,8 +820,8 @@ static int vif_add(struct net *net, struct mr_table *mrt, return -EADDRNOTAVAIL; } IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; - inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex, - &in_dev->cnf); + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, + dev->ifindex, &in_dev->cnf); ip_rt_multicast_event(in_dev); /* Fill in the VIF structures */ @@ -1278,18 +1278,18 @@ static void mrtsock_destruct(struct sock *sk) struct net *net = sock_net(sk); struct mr_table *mrt; - rtnl_lock(); + ASSERT_RTNL(); ipmr_for_each_table(mrt, net) { if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; - inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); RCU_INIT_POINTER(mrt->mroute_sk, NULL); mroute_clean_tables(mrt, false); } } - rtnl_unlock(); } /* Socket options and virtual interface manipulation. 
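Note: ip_ra_control() must now be called with RTNL already held (raw_close() below takes rtnl_lock() around it), which is why mrtsock_destruct() can ASSERT_RTNL() instead of taking the lock itself, and why the MRT_DONE path below no longer drops the lock around the ip_ra_control() call.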
The whole @@ -1344,7 +1344,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, if (ret == 0) { rcu_assign_pointer(mrt->mroute_sk, sk); IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; - inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + inet_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv4.devconf_all); } @@ -1353,13 +1354,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, if (sk != rcu_access_pointer(mrt->mroute_sk)) { ret = -EACCES; } else { - /* We need to unlock here because mrtsock_destruct takes - * care of rtnl itself and we can't change that due to - * the IP_ROUTER_ALERT setsockopt which runs without it. - */ - rtnl_unlock(); ret = ip_ra_control(sk, 0, NULL); - goto out; + goto out_unlock; } break; case MRT_ADD_VIF: @@ -1470,7 +1466,6 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, } out_unlock: rtnl_unlock(); -out: return ret; } @@ -2428,7 +2423,8 @@ static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc) /* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, struct mfcctl *mfcc, int *mrtsock, - struct mr_table **mrtret) + struct mr_table **mrtret, + struct netlink_ext_ack *extack) { struct net_device *dev = NULL; u32 tblid = RT_TABLE_DEFAULT; @@ -2437,7 +2433,8 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, struct rtmsg *rtm; int ret, rem; - ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy); + ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy, + extack); if (ret < 0) goto out; rtm = nlmsg_data(nlh); @@ -2496,7 +2493,8 @@ static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh, } /* takes care of both newroute and delroute */ -static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh) +static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); int ret, mrtsock, parent; @@ -2505,7 +2503,7 @@ static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh) mrtsock = 0; tbl = NULL; - ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl); + ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack); if (ret < 0) return ret; diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 6241a81fd7f5a3df8fb3cf251bfdd407dda6a1f6..0bc3c3d73e61e00a04d73f32800256a62c8943a5 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -309,8 +309,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, */ for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; - struct arpt_entry *e - = (struct arpt_entry *)(entry0 + pos); + struct arpt_entry *e = entry0 + pos; if (!(valid_hooks & (1 << hook))) continue; @@ -354,14 +353,12 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if (pos == oldpos) goto next; - e = (struct arpt_entry *) - (entry0 + pos); + e = entry0 + pos; } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; - e = (struct arpt_entry *) - (entry0 + pos + size); + e = entry0 + pos + size; if (pos + size >= newinfo->size) return 0; e->counters.pcnt = pos; @@ -376,16 +373,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = (struct 
arpt_entry *) - (entry0 + newpos); + e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; if (newpos >= newinfo->size) return 0; } - e = (struct arpt_entry *) - (entry0 + newpos); + e = entry0 + newpos; e->counters.pcnt = pos; pos = newpos; } @@ -562,8 +557,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0, XT_ERROR_TARGET) == 0) ++newinfo->stacksize; } - if (ret != 0) - goto out_free; ret = -EINVAL; if (i != repl->num_entries) @@ -683,7 +676,7 @@ static int copy_entries_to_user(unsigned int total_size, for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ const struct xt_entry_target *t; - e = (struct arpt_entry *)(loc_cpu_entry + off); + e = loc_cpu_entry + off; if (copy_to_user(userptr + off, e, sizeof(*e))) { ret = -EFAULT; goto free_counters; @@ -1130,7 +1123,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, int h; origsize = *size; - de = (struct arpt_entry *)*dstptr; + de = *dstptr; memcpy(de, e, sizeof(struct arpt_entry)); memcpy(&de->counters, &e->counters, sizeof(e->counters)); @@ -1324,7 +1317,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, int ret; origsize = *size; - ce = (struct compat_arpt_entry __user *)*dstptr; + ce = *dstptr; if (copy_to_user(ce, e, sizeof(struct arpt_entry)) != 0 || copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])) != 0) diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 384b85713e062881d3d2554b0828f08ff28f443d..2a55a40211cbfb94a9ade41c33678bba4be2a7e4 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -382,7 +382,7 @@ mark_source_chains(const struct xt_table_info *newinfo, to 0 as we leave), and comefrom to save source hook bitmask */ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; - struct ipt_entry *e = (struct ipt_entry *)(entry0 + pos); + struct ipt_entry *e = entry0 + pos; if (!(valid_hooks & (1 << hook))) continue; @@ -424,14 +424,12 @@ mark_source_chains(const struct xt_table_info *newinfo, if (pos == oldpos) goto next; - e = (struct ipt_entry *) - (entry0 + pos); + e = entry0 + pos; } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; - e = (struct ipt_entry *) - (entry0 + pos + size); + e = entry0 + pos + size; if (pos + size >= newinfo->size) return 0; e->counters.pcnt = pos; @@ -446,16 +444,14 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = (struct ipt_entry *) - (entry0 + newpos); + e = entry0 + newpos; } else { /* ... 
this is a fallthru */ newpos = pos + e->next_offset; if (newpos >= newinfo->size) return 0; } - e = (struct ipt_entry *) - (entry0 + newpos); + e = entry0 + newpos; e->counters.pcnt = pos; pos = newpos; } @@ -834,7 +830,7 @@ copy_entries_to_user(unsigned int total_size, const struct xt_entry_match *m; const struct xt_entry_target *t; - e = (struct ipt_entry *)(loc_cpu_entry + off); + e = loc_cpu_entry + off; if (copy_to_user(userptr + off, e, sizeof(*e))) { ret = -EFAULT; goto free_counters; @@ -1229,7 +1225,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, int ret = 0; origsize = *size; - ce = (struct compat_ipt_entry __user *)*dstptr; + ce = *dstptr; if (copy_to_user(ce, e, sizeof(struct ipt_entry)) != 0 || copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])) != 0) @@ -1366,7 +1362,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, struct xt_entry_match *ematch; origsize = *size; - de = (struct ipt_entry *)*dstptr; + de = *dstptr; memcpy(de, e, sizeof(struct ipt_entry)); memcpy(&de->counters, &e->counters, sizeof(e->counters)); diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 9b8841316e7b94e375cc52d0dfd7f9fe89205195..038f293c23766de6cd77f361269743498c2dd18e 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -40,8 +41,8 @@ MODULE_DESCRIPTION("Xtables: CLUSTERIP target"); struct clusterip_config { struct list_head list; /* list of all configs */ - atomic_t refcount; /* reference count */ - atomic_t entries; /* number of entries/rules + refcount_t refcount; /* reference count */ + refcount_t entries; /* number of entries/rules * referencing us */ __be32 clusterip; /* the IP address */ @@ -77,7 +78,7 @@ struct clusterip_net { static inline void clusterip_config_get(struct clusterip_config *c) { - atomic_inc(&c->refcount); + refcount_inc(&c->refcount); } @@ -89,7 +90,7 @@ static void clusterip_config_rcu_free(struct rcu_head *head) static inline void clusterip_config_put(struct clusterip_config *c) { - if (atomic_dec_and_test(&c->refcount)) + if (refcount_dec_and_test(&c->refcount)) call_rcu_bh(&c->rcu, clusterip_config_rcu_free); } @@ -103,7 +104,7 @@ clusterip_config_entry_put(struct clusterip_config *c) struct clusterip_net *cn = net_generic(net, clusterip_net_id); local_bh_disable(); - if (atomic_dec_and_lock(&c->entries, &cn->lock)) { + if (refcount_dec_and_lock(&c->entries, &cn->lock)) { list_del_rcu(&c->list); spin_unlock(&cn->lock); local_bh_enable(); @@ -149,10 +150,10 @@ clusterip_config_find_get(struct net *net, __be32 clusterip, int entry) c = NULL; else #endif - if (unlikely(!atomic_inc_not_zero(&c->refcount))) + if (unlikely(!refcount_inc_not_zero(&c->refcount))) c = NULL; else if (entry) - atomic_inc(&c->entries); + refcount_inc(&c->entries); } rcu_read_unlock_bh(); @@ -188,8 +189,8 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip, clusterip_config_init_nodelist(c, i); c->hash_mode = i->hash_mode; c->hash_initval = i->hash_initval; - atomic_set(&c->refcount, 1); - atomic_set(&c->entries, 1); + refcount_set(&c->refcount, 1); + refcount_set(&c->entries, 1); spin_lock_bh(&cn->lock); if (__clusterip_config_find(net, ip)) { diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index 3240a2614e82bd82c674aac84ab140c91670f163..af2b69b6895f5ae8e326b27e05efa01cc1405e1b 100644 --- 
a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -293,12 +293,16 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_ECN); synproxy_send_client_synack(net, skb, th, &opts); - return NF_DROP; - + consume_skb(skb); + return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ - synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq)); - return NF_DROP; + if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) { + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } } return XT_CONTINUE; @@ -367,10 +371,13 @@ static unsigned int ipv4_synproxy_hook(void *priv, * number match the one of first SYN. */ if (synproxy_recv_client_ack(net, skb, th, &opts, - ntohl(th->seq) + 1)) + ntohl(th->seq) + 1)) { this_cpu_inc(snet->stats->cookie_retrans); - - return NF_DROP; + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } } synproxy->isn = ntohl(th->ack_seq); @@ -409,19 +416,56 @@ static unsigned int ipv4_synproxy_hook(void *priv, return NF_ACCEPT; } +static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { + { + .hook = ipv4_synproxy_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv4_synproxy_hook, + .pf = NFPROTO_IPV4, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + static int synproxy_tg4_check(const struct xt_tgchk_param *par) { + struct synproxy_net *snet = synproxy_pernet(par->net); const struct ipt_entry *e = par->entryinfo; + int err; if (e->ip.proto != IPPROTO_TCP || e->ip.invflags & XT_INV_PROTO) return -EINVAL; - return nf_ct_netns_get(par->net, par->family); + err = nf_ct_netns_get(par->net, par->family); + if (err) + return err; + + if (snet->hook_ref4 == 0) { + err = nf_register_net_hooks(par->net, ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); + if (err) { + nf_ct_netns_put(par->net, par->family); + return err; + } + } + + snet->hook_ref4++; + return err; } static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par) { + struct synproxy_net *snet = synproxy_pernet(par->net); + + snet->hook_ref4--; + if (snet->hook_ref4 == 0) + nf_unregister_net_hooks(par->net, ipv4_synproxy_ops, + ARRAY_SIZE(ipv4_synproxy_ops)); nf_ct_netns_put(par->net, par->family); } @@ -436,46 +480,14 @@ static struct xt_target synproxy_tg4_reg __read_mostly = { .me = THIS_MODULE, }; -static struct nf_hook_ops ipv4_synproxy_ops[] __read_mostly = { - { - .hook = ipv4_synproxy_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, - { - .hook = ipv4_synproxy_hook, - .pf = NFPROTO_IPV4, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, -}; - static int __init synproxy_tg4_init(void) { - int err; - - err = nf_register_hooks(ipv4_synproxy_ops, - ARRAY_SIZE(ipv4_synproxy_ops)); - if (err < 0) - goto err1; - - err = xt_register_target(&synproxy_tg4_reg); - if (err < 0) - goto err2; - - return 0; - -err2: - nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); -err1: - return err; + return xt_register_target(&synproxy_tg4_reg); } static void __exit synproxy_tg4_exit(void) { xt_unregister_target(&synproxy_tg4_reg); - nf_unregister_hooks(ipv4_synproxy_ops, ARRAY_SIZE(ipv4_synproxy_ops)); } module_init(synproxy_tg4_init); diff --git a/net/ipv4/netfilter/nf_dup_ipv4.c b/net/ipv4/netfilter/nf_dup_ipv4.c index 
f0dbff05fc28174de694ecad1d1adcde63c314ec..39895b9ddeb9601307f073298ab690ae1f9972d4 100644 --- a/net/ipv4/netfilter/nf_dup_ipv4.c +++ b/net/ipv4/netfilter/nf_dup_ipv4.c @@ -69,8 +69,7 @@ void nf_dup_ipv4(struct net *net, struct sk_buff *skb, unsigned int hooknum, #if IS_ENABLED(CONFIG_NF_CONNTRACK) /* Avoid counting cloned packets towards the original connection. */ nf_reset(skb); - nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW); - nf_conntrack_get(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif /* * If we are in PREROUTING/INPUT, decrease the TTL to mitigate potential diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c index 6f5e8d01b876933a68e5f6cf8b2a48f8c4e17262..feedd759ca8043c3eff37e22d0f1c8301b229da2 100644 --- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c @@ -264,13 +264,7 @@ nf_nat_ipv4_fn(void *priv, struct sk_buff *skb, if (!ct) return NF_ACCEPT; - /* Don't try to NAT if this packet is not conntracked */ - if (nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; + nat = nfct_nat(ct); switch (ctinfo) { case IP_CT_RELATED: diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c index ea91058b5f6f4245a84179f81a8641756f60efff..dc1dea15c1b4fb0d7ad3ad02779cde147e52c2cf 100644 --- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c +++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c @@ -37,7 +37,6 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, NF_CT_ASSERT(hooknum == NF_INET_POST_ROUTING); ct = nf_ct_get(skb, &ctinfo); - nat = nfct_nat(ct); NF_CT_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED || ctinfo == IP_CT_RELATED_REPLY)); @@ -56,7 +55,9 @@ nf_nat_masquerade_ipv4(struct sk_buff *skb, unsigned int hooknum, return NF_DROP; } - nat->masq_index = out->ifindex; + nat = nf_ct_nat_ext_add(ct); + if (nat) + nat->masq_index = out->ifindex; /* Transfer from original range. */ memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c index b3ca21b2ba9b638550b552c85e297234ccf39caa..8a69363b48846c628994e54c92a354ca46f71ebc 100644 --- a/net/ipv4/netfilter/nf_nat_pptp.c +++ b/net/ipv4/netfilter/nf_nat_pptp.c @@ -49,9 +49,14 @@ static void pptp_nat_expected(struct nf_conn *ct, const struct nf_ct_pptp_master *ct_pptp_info; const struct nf_nat_pptp *nat_pptp_info; struct nf_nat_range range; + struct nf_conn_nat *nat; + nat = nf_ct_nat_ext_add(ct); + if (WARN_ON_ONCE(!nat)) + return; + + nat_pptp_info = &nat->help.nat_pptp_info; ct_pptp_info = nfct_help_data(master); - nat_pptp_info = &nfct_nat(master)->help.nat_pptp_info; /* And here goes the grand finale of corrosion... 
*/ if (exp->dir == IP_CT_DIR_ORIGINAL) { @@ -120,13 +125,17 @@ pptp_outbound_pkt(struct sk_buff *skb, { struct nf_ct_pptp_master *ct_pptp_info; + struct nf_conn_nat *nat = nfct_nat(ct); struct nf_nat_pptp *nat_pptp_info; u_int16_t msg; __be16 new_callid; unsigned int cid_off; + if (WARN_ON_ONCE(!nat)) + return NF_DROP; + + nat_pptp_info = &nat->help.nat_pptp_info; ct_pptp_info = nfct_help_data(ct); - nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; new_callid = ct_pptp_info->pns_call_id; @@ -177,11 +186,11 @@ pptp_outbound_pkt(struct sk_buff *skb, ntohs(REQ_CID(pptpReq, cid_off)), ntohs(new_callid)); /* mangle packet */ - if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, - cid_off + sizeof(struct pptp_pkt_hdr) + - sizeof(struct PptpControlHeader), - sizeof(new_callid), (char *)&new_callid, - sizeof(new_callid)) == 0) + if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, + cid_off + sizeof(struct pptp_pkt_hdr) + + sizeof(struct PptpControlHeader), + sizeof(new_callid), (char *)&new_callid, + sizeof(new_callid))) return NF_DROP; return NF_ACCEPT; } @@ -191,11 +200,15 @@ pptp_exp_gre(struct nf_conntrack_expect *expect_orig, struct nf_conntrack_expect *expect_reply) { const struct nf_conn *ct = expect_orig->master; + struct nf_conn_nat *nat = nfct_nat(ct); struct nf_ct_pptp_master *ct_pptp_info; struct nf_nat_pptp *nat_pptp_info; + if (WARN_ON_ONCE(!nat)) + return; + + nat_pptp_info = &nat->help.nat_pptp_info; ct_pptp_info = nfct_help_data(ct); - nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; /* save original PAC call ID in nat_info */ nat_pptp_info->pac_call_id = ct_pptp_info->pac_call_id; @@ -223,11 +236,15 @@ pptp_inbound_pkt(struct sk_buff *skb, union pptp_ctrl_union *pptpReq) { const struct nf_nat_pptp *nat_pptp_info; + struct nf_conn_nat *nat = nfct_nat(ct); u_int16_t msg; __be16 new_pcid; unsigned int pcid_off; - nat_pptp_info = &nfct_nat(ct)->help.nat_pptp_info; + if (WARN_ON_ONCE(!nat)) + return NF_DROP; + + nat_pptp_info = &nat->help.nat_pptp_info; new_pcid = nat_pptp_info->pns_call_id; switch (msg = ntohs(ctlh->messageType)) { @@ -271,11 +288,11 @@ pptp_inbound_pkt(struct sk_buff *skb, pr_debug("altering peer call id from 0x%04x to 0x%04x\n", ntohs(REQ_CID(pptpReq, pcid_off)), ntohs(new_pcid)); - if (nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, - pcid_off + sizeof(struct pptp_pkt_hdr) + - sizeof(struct PptpControlHeader), - sizeof(new_pcid), (char *)&new_pcid, - sizeof(new_pcid)) == 0) + if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, + pcid_off + sizeof(struct pptp_pkt_hdr) + + sizeof(struct PptpControlHeader), + sizeof(new_pcid), (char *)&new_pcid, + sizeof(new_pcid))) return NF_DROP; return NF_ACCEPT; } diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index 53e49f5011d3ce482c5180edc47da7928db0f224..d5b1e0b3f6876995b6987aa47515ece9c7beaca3 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -827,8 +827,8 @@ static unsigned char snmp_object_decode(struct asn1_ctx *ctx, return 1; } -static unsigned char snmp_request_decode(struct asn1_ctx *ctx, - struct snmp_request *request) +static unsigned char noinline_for_stack +snmp_request_decode(struct asn1_ctx *ctx, struct snmp_request *request) { unsigned int cls, con, tag; unsigned char *end; @@ -920,10 +920,10 @@ static inline void mangle_address(unsigned char *begin, } } -static unsigned char snmp_trap_decode(struct asn1_ctx *ctx, - struct snmp_v1_trap *trap, - const struct oct1_map *map, - __sum16 *check) +static 
unsigned char noinline_for_stack +snmp_trap_decode(struct asn1_ctx *ctx, struct snmp_v1_trap *trap, + const struct oct1_map *map, + __sum16 *check) { unsigned int cls, con, tag, len; unsigned char *end; @@ -998,18 +998,6 @@ static unsigned char snmp_trap_decode(struct asn1_ctx *ctx, * *****************************************************************************/ -static void hex_dump(const unsigned char *buf, size_t len) -{ - size_t i; - - for (i = 0; i < len; i++) { - if (i && !(i % 16)) - printk("\n"); - printk("%02x ", *(buf + i)); - } - printk("\n"); -} - /* * Parse and mangle SNMP message according to mapping. * (And this is the fucking 'basic' method). @@ -1026,7 +1014,8 @@ static int snmp_parse_mangle(unsigned char *msg, struct snmp_object *obj; if (debug > 1) - hex_dump(msg, len); + print_hex_dump(KERN_DEBUG, "", DUMP_PREFIX_NONE, 16, 1, + msg, len, 0); asn1_open(&ctx, msg, len); diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c index 146d86105183e1a456a0f17ed6bb5371aa1e8f76..7cd8d0d918f82e275e0ecd31c1cea8ec8fcf345d 100644 --- a/net/ipv4/netfilter/nf_reject_ipv4.c +++ b/net/ipv4/netfilter/nf_reject_ipv4.c @@ -104,7 +104,6 @@ EXPORT_SYMBOL_GPL(nf_reject_ip_tcphdr_put); void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) { struct sk_buff *nskb; - const struct iphdr *oiph; struct iphdr *niph; const struct tcphdr *oth; struct tcphdr _oth; @@ -116,8 +115,6 @@ void nf_send_reset(struct net *net, struct sk_buff *oldskb, int hook) if (skb_rtable(oldskb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) return; - oiph = ip_hdr(oldskb); - nskb = alloc_skb(sizeof(struct iphdr) + sizeof(struct tcphdr) + LL_MAX_HEADER, GFP_ATOMIC); if (!nskb) diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index a83d558e1aaed62b6bce05d9cce36405b0c210ef..e9293bdebba04f5b329cd4fd4a6b5b676ade5988 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -139,7 +139,7 @@ struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb, * SNAT-ted connection. 
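* Untracked packets now carry a NULL conntrack entry with
* ctinfo == IP_CT_UNTRACKED (see the nf_dup_ipv4 change above),
* so the plain ct check below already excludes them and the old
* !nf_ct_is_untracked(ct) test is redundant.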
*/ ct = nf_ct_get(skb, &ctinfo); - if (ct && !nf_ct_is_untracked(ct) && + if (ct && ((iph->protocol != IPPROTO_ICMP && ctinfo == IP_CT_ESTABLISHED_REPLY) || (iph->protocol == IPPROTO_ICMP && diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index 2981291910dd2cac2d508fcde89083afc22affd4..de3681df2ce71c7341d96dd5d2540ab110ac80eb 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -90,7 +90,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv->result, pkt, + nft_fib_store_result(dest, priv, pkt, nft_in(pkt)->ifindex); return; } @@ -99,7 +99,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, if (ipv4_is_zeronet(iph->saddr)) { if (ipv4_is_lbcast(iph->daddr) || ipv4_is_local_multicast(iph->daddr)) { - nft_fib_store_result(dest, priv->result, pkt, + nft_fib_store_result(dest, priv, pkt, get_ifindex(pkt->skb->dev)); return; } @@ -212,7 +212,7 @@ nft_fib4_select_ops(const struct nft_ctx *ctx, static struct nft_expr_type nft_fib4_type __read_mostly = { .name = "fib", - .select_ops = &nft_fib4_select_ops, + .select_ops = nft_fib4_select_ops, .policy = nft_fib_policy, .maxattr = NFTA_FIB_MAX, .family = NFPROTO_IPV4, diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 69cf49e8356d0184f774840c9dc96560f2ae2f2b..fa44e752a9a3f8eb9957314149ae15e6df10465a 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -199,7 +199,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TW", LINUX_MIB_TIMEWAITED), SNMP_MIB_ITEM("TWRecycled", LINUX_MIB_TIMEWAITRECYCLED), SNMP_MIB_ITEM("TWKilled", LINUX_MIB_TIMEWAITKILLED), - SNMP_MIB_ITEM("PAWSPassive", LINUX_MIB_PAWSPASSIVEREJECTED), SNMP_MIB_ITEM("PAWSActive", LINUX_MIB_PAWSACTIVEREJECTED), SNMP_MIB_ITEM("PAWSEstab", LINUX_MIB_PAWSESTABREJECTED), SNMP_MIB_ITEM("DelayedACKs", LINUX_MIB_DELAYEDACKS), @@ -282,6 +281,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPFastOpenPassiveFail", LINUX_MIB_TCPFASTOPENPASSIVEFAIL), SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW), SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD), + SNMP_MIB_ITEM("TCPFastOpenBlackhole", LINUX_MIB_TCPFASTOPENBLACKHOLE), SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES), SNMP_MIB_ITEM("BusyPollRxPackets", LINUX_MIB_BUSYPOLLRXPACKETS), SNMP_MIB_ITEM("TCPAutoCorking", LINUX_MIB_TCPAUTOCORKING), diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index 4b7c0ec65251ef40577a2d5e360fcbaed391a566..32a691b7ce2c7e79eab6491b52457a11e666f7d3 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -28,7 +28,7 @@ #include #include -const struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; +struct net_protocol __rcu *inet_protos[MAX_INET_PROTOS] __read_mostly; const struct net_offload __rcu *inet_offloads[MAX_INET_PROTOS] __read_mostly; EXPORT_SYMBOL(inet_offloads); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 8119e1f66e036ad2a8372bf24dd943c7d9631d8e..bdffad875691ce7240040cfaf188eb4cf3ec7307 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -358,6 +358,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, rt->dst.dev->mtu); return -EMSGSIZE; } + if (length < sizeof(struct iphdr)) + return -EINVAL; + if (flags&MSG_PROBE) goto out; @@ -682,7 +685,9 @@ static void raw_close(struct sock *sk, long timeout) /* * 
Raw sockets may have direct kernel references. Kill them. */ + rtnl_lock(); ip_ra_control(sk, 0, NULL); + rtnl_unlock(); sk_common_release(sk); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index acd69cfe2951ed89213403b3e9556da6ddd50ba8..655d9eebe43e16a59102edcd3ea4bc177c6b341d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1250,15 +1250,11 @@ static void set_class_tag(struct rtable *rt, u32 tag) static unsigned int ipv4_default_advmss(const struct dst_entry *dst) { - unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS); + unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr); + unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size, + ip_rt_min_advmss); - if (advmss == 0) { - advmss = max_t(unsigned int, dst->dev->mtu - 40, - ip_rt_min_advmss); - if (advmss > 65535 - 40) - advmss = 65535 - 40; - } - return advmss; + return min(advmss, IPV4_MAX_PMTU - header_size); } static unsigned int ipv4_mtu(const struct dst_entry *dst) @@ -1734,45 +1730,97 @@ static int __mkroute_input(struct sk_buff *skb, } #ifdef CONFIG_IP_ROUTE_MULTIPATH - /* To make ICMP packets follow the right flow, the multipath hash is - * calculated from the inner IP addresses in reverse order. + * calculated from the inner IP addresses. */ -static int ip_multipath_icmp_hash(struct sk_buff *skb) +static void ip_multipath_l3_keys(const struct sk_buff *skb, + struct flow_keys *hash_keys) { const struct iphdr *outer_iph = ip_hdr(skb); - struct icmphdr _icmph; + const struct iphdr *inner_iph; const struct icmphdr *icmph; struct iphdr _inner_iph; - const struct iphdr *inner_iph; + struct icmphdr _icmph; + + hash_keys->addrs.v4addrs.src = outer_iph->saddr; + hash_keys->addrs.v4addrs.dst = outer_iph->daddr; + if (likely(outer_iph->protocol != IPPROTO_ICMP)) + return; if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0)) - goto standard_hash; + return; icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph), &_icmph); if (!icmph) - goto standard_hash; + return; if (icmph->type != ICMP_DEST_UNREACH && icmph->type != ICMP_REDIRECT && icmph->type != ICMP_TIME_EXCEEDED && - icmph->type != ICMP_PARAMETERPROB) { - goto standard_hash; - } + icmph->type != ICMP_PARAMETERPROB) + return; inner_iph = skb_header_pointer(skb, outer_iph->ihl * 4 + sizeof(_icmph), sizeof(_inner_iph), &_inner_iph); if (!inner_iph) - goto standard_hash; + return; + hash_keys->addrs.v4addrs.src = inner_iph->saddr; + hash_keys->addrs.v4addrs.dst = inner_iph->daddr; +} - return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr); +/* if skb is set it will be used and fl4 can be NULL */ +int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4, + const struct sk_buff *skb) +{ + struct net *net = fi->fib_net; + struct flow_keys hash_keys; + u32 mhash; -standard_hash: - return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr); -} + switch (net->ipv4.sysctl_fib_multipath_hash_policy) { + case 0: + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + if (skb) { + ip_multipath_l3_keys(skb, &hash_keys); + } else { + hash_keys.addrs.v4addrs.src = fl4->saddr; + hash_keys.addrs.v4addrs.dst = fl4->daddr; + } + break; + case 1: + /* skb is currently provided only when forwarding */ + if (skb) { + unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; + struct flow_keys keys; + + /* short-circuit if we already have L4 hash present */ + if (skb->l4_hash) + return skb_get_hash_raw(skb) >> 1; + memset(&hash_keys, 0, sizeof(hash_keys)); + 
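/* policy 1 = L4 hash: dissect no deeper than the first
+ * encapsulation header and hash the resulting 5-tuple
+ * (addresses, ports, IP protocol) gathered below. */
+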
skb_flow_dissect_flow_keys(skb, &keys, flag); + hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src; + hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst; + hash_keys.ports.src = keys.ports.src; + hash_keys.ports.dst = keys.ports.dst; + hash_keys.basic.ip_proto = keys.basic.ip_proto; + } else { + memset(&hash_keys, 0, sizeof(hash_keys)); + hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS; + hash_keys.addrs.v4addrs.src = fl4->saddr; + hash_keys.addrs.v4addrs.dst = fl4->daddr; + hash_keys.ports.src = fl4->fl4_sport; + hash_keys.ports.dst = fl4->fl4_dport; + hash_keys.basic.ip_proto = fl4->flowi4_proto; + } + break; + } + mhash = flow_hash_from_keys(&hash_keys); + return mhash >> 1; +} +EXPORT_SYMBOL_GPL(fib_multipath_hash); #endif /* CONFIG_IP_ROUTE_MULTIPATH */ static int ip_mkroute_input(struct sk_buff *skb, @@ -1782,12 +1830,8 @@ static int ip_mkroute_input(struct sk_buff *skb, { #ifdef CONFIG_IP_ROUTE_MULTIPATH if (res->fi && res->fi->fib_nhs > 1) { - int h; + int h = fib_multipath_hash(res->fi, NULL, skb); - if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP)) - h = ip_multipath_icmp_hash(skb); - else - h = fib_multipath_hash(saddr, daddr); fib_select_multipath(res, h); } #endif @@ -2203,7 +2247,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, */ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, - int mp_hash) + const struct sk_buff *skb) { struct net_device *dev_out = NULL; __u8 tos = RT_FL_TOS(fl4); @@ -2359,13 +2403,14 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, } /* L3 master device is the loopback for that domain */ - dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev; + dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(res)) ? : + net->loopback_dev; fl4->flowi4_oif = dev_out->ifindex; flags |= RTCF_LOCAL; goto make_route; } - fib_select_path(net, &res, fl4, mp_hash); + fib_select_path(net, &res, fl4, skb); dev_out = FIB_RES_DEV(res); fl4->flowi4_oif = dev_out->ifindex; @@ -2585,7 +2630,8 @@ static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, return -EMSGSIZE; } -static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) +static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct rtmsg *rtm; @@ -2601,7 +2647,8 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) u32 table_id = RT_TABLE_MAIN; kuid_t uid; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, + extack); if (err < 0) goto errout; @@ -2619,10 +2666,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) skb_reset_mac_header(skb); skb_reset_network_header(skb); - /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ - ip_hdr(skb)->protocol = IPPROTO_UDP; - skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); - src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; @@ -2632,6 +2675,15 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) else uid = (iif ? INVALID_UID : current_uid()); + /* Bugfix: need to give ip_route_input enough of an IP header to + * not gag. 
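+ * Filling in saddr/daddr as well lets fib_multipath_hash()
+ * (above) compute a meaningful hash from this synthesized
+ * skb on the input path.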
+ */ + ip_hdr(skb)->protocol = IPPROTO_UDP; + ip_hdr(skb)->saddr = src; + ip_hdr(skb)->daddr = dst; + + skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); + memset(&fl4, 0, sizeof(fl4)); fl4.daddr = dst; fl4.saddr = src; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 496b97e17aaf7ed2cf41cef303cb0696927f66ac..0257d965f11119acf8c55888d6e672d171ef5f08 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -203,7 +204,7 @@ EXPORT_SYMBOL_GPL(__cookie_v4_check); struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, - struct dst_entry *dst) + struct dst_entry *dst, u32 tsoff) { struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; @@ -213,6 +214,7 @@ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, NULL, &own_req); if (child) { atomic_set(&req->rsk_refcnt, 1); + tcp_sk(child)->tsoffset = tsoff; sock_rps_save_rxhash(child, skb); inet_csk_reqsk_queue_add(sk, req, child); } else { @@ -292,6 +294,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) struct rtable *rt; __u8 rcv_wscale; struct flowi4 fl4; + u32 tsoff = 0; if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst) goto out; @@ -311,6 +314,11 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(skb, &tcp_opt, 0, NULL); + if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { + tsoff = secure_tcp_ts_off(ip_hdr(skb)->daddr, ip_hdr(skb)->saddr); + tcp_opt.rcv_tsecr -= tsoff; + } + if (!cookie_timestamp_decode(&tcp_opt)) goto out; @@ -381,7 +389,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst); - ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst); + ret = tcp_get_cookie_sock(sk, skb, req, &rt->dst, tsoff); /* ip_queue_xmit() depends on our flow being setup * Normal sockets get it right from inet_csk_route_child_sock() */ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index d6880a6149ee80c6c75f4fe75b46a9d18d204d5d..86957e9cd6c6748ac00aa0307154bb131c43f1da 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -24,6 +24,7 @@ #include #include #include +#include static int zero; static int one = 1; @@ -294,6 +295,74 @@ static int proc_tcp_fastopen_key(struct ctl_table *ctl, int write, return ret; } +static void proc_configure_early_demux(int enabled, int protocol) +{ + struct net_protocol *ipprot; +#if IS_ENABLED(CONFIG_IPV6) + struct inet6_protocol *ip6prot; +#endif + + rcu_read_lock(); + + ipprot = rcu_dereference(inet_protos[protocol]); + if (ipprot) + ipprot->early_demux = enabled ? ipprot->early_demux_handler : + NULL; + +#if IS_ENABLED(CONFIG_IPV6) + ip6prot = rcu_dereference(inet6_protos[protocol]); + if (ip6prot) + ip6prot->early_demux = enabled ? 
ip6prot->early_demux_handler : + NULL; +#endif + rcu_read_unlock(); +} + +static int proc_tcp_early_demux(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (write && !ret) { + int enabled = init_net.ipv4.sysctl_tcp_early_demux; + + proc_configure_early_demux(enabled, IPPROTO_TCP); + } + + return ret; +} + +static int proc_udp_early_demux(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 0; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + + if (write && !ret) { + int enabled = init_net.ipv4.sysctl_udp_early_demux; + + proc_configure_early_demux(enabled, IPPROTO_UDP); + } + + return ret; +} + +static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table, + int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + int ret; + + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (write && ret == 0) + tcp_fastopen_active_timeout_reset(); + return ret; +} + static struct ctl_table ipv4_table[] = { { .procname = "tcp_timestamps", @@ -343,6 +412,14 @@ static struct ctl_table ipv4_table[] = { .maxlen = ((TCP_FASTOPEN_KEY_LENGTH * 2) + 10), .proc_handler = proc_tcp_fastopen_key, }, + { + .procname = "tcp_fastopen_blackhole_timeout_sec", + .data = &sysctl_tcp_fastopen_blackhole_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tfo_blackhole_detect_timeout, + .extra1 = &zero, + }, { .procname = "tcp_abort_on_overflow", .data = &sysctl_tcp_abort_on_overflow, @@ -749,6 +826,20 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, + { + .procname = "udp_early_demux", + .data = &init_net.ipv4.sysctl_udp_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_udp_early_demux + }, + { + .procname = "tcp_early_demux", + .data = &init_net.ipv4.sysctl_tcp_early_demux, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_tcp_early_demux + }, { .procname = "ip_default_ttl", .data = &init_net.ipv4.sysctl_ip_default_ttl, @@ -980,13 +1071,6 @@ static struct ctl_table ipv4_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, - { - .procname = "tcp_tw_recycle", - .data = &init_net.ipv4.tcp_death_row.sysctl_tw_recycle, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec - }, { .procname = "tcp_max_syn_backlog", .data = &init_net.ipv4.sysctl_max_syn_backlog, @@ -1004,6 +1088,15 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = &zero, .extra2 = &one, }, + { + .procname = "fib_multipath_hash_policy", + .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, #endif { .procname = "ip_unprivileged_port_start", diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 40ba4249a58677671b68bf495f32f15b7c5f62d7..1e4c76d2b8278ba71d6cc2cf7ebfe483e241f76e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -533,7 +533,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) if (tp->urg_data & TCP_URG_VALID) mask |= POLLPRI; - } else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { /* Active TCP fastopen socket with defer_connect * Return POLLOUT so application can call write() * in order for kernel to generate SYN+data @@ -2296,6 +2296,7 @@ int 
tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); tcp_write_queue_purge(sk); + tcp_fastopen_active_disable_ofo_check(sk); skb_rbtree_purge(&tp->out_of_order_queue); inet->inet_dport = 0; @@ -2394,7 +2395,7 @@ static int tcp_repair_options_est(struct tcp_sock *tp, u16 snd_wscale = opt.opt_val & 0xFFFF; u16 rcv_wscale = opt.opt_val >> 16; - if (snd_wscale > 14 || rcv_wscale > 14) + if (snd_wscale > TCP_MAX_WSCALE || rcv_wscale > TCP_MAX_WSCALE) return -EFBIG; tp->rx_opt.snd_wscale = snd_wscale; @@ -2471,7 +2472,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, /* Values greater than interface MTU won't take effect. However * at the point when this call is done we typically don't yet * know which interface is going to be used */ - if (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW) { + if (val && (val < TCP_MIN_MSS || val > MAX_TCP_WINDOW)) { err = -EINVAL; break; } @@ -2852,7 +2853,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_ssthresh = tp->snd_ssthresh; info->tcpi_advmss = tp->advmss; - info->tcpi_rcv_rtt = jiffies_to_usecs(tp->rcv_rtt_est.rtt)>>3; + info->tcpi_rcv_rtt = tp->rcv_rtt_est.rtt_us >> 3; info->tcpi_rcv_space = tp->rcvq_space.space; info->tcpi_total_retrans = tp->total_retrans; diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 79c4817abc94d08265edb2dfa995e3e479148a16..6e3c512054a60715e8e2d16ffedd12cba6a3d2d9 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -168,12 +168,8 @@ void tcp_assign_congestion_control(struct sock *sk) } out: rcu_read_unlock(); + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); - /* Clear out private data before diag gets it and - * the ca has not been initialized. - */ - if (ca->get_info) - memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); if (ca->flags & TCP_CONG_NEEDS_ECN) INET_ECN_xmit(sk); else @@ -200,11 +196,10 @@ static void tcp_reinit_congestion_control(struct sock *sk, tcp_cleanup_congestion_control(sk); icsk->icsk_ca_ops = ca; icsk->icsk_ca_setsockopt = 1; + memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); - if (sk->sk_state != TCP_CLOSE) { - memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv)); + if (sk->sk_state != TCP_CLOSE) tcp_init_congestion_control(sk); - } } /* Manage refcounts on socket close. 
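(Related: icsk_ca_priv is now cleared unconditionally above, so stale congestion-control state can no longer be exposed, e.g. via tcp_diag, before ->init() runs.)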
*/ diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index c99230efcd52d2233ba22fbc117101c8e8cee113..0683ba447d775b6101a929a6aca3eb255cff8932 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -72,7 +72,7 @@ MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); module_param(hystart, int, 0644); MODULE_PARM_DESC(hystart, "turn on/off hybrid slow start algorithm"); module_param(hystart_detect, int, 0644); -MODULE_PARM_DESC(hystart_detect, "hyrbrid slow start detection mechanisms" +MODULE_PARM_DESC(hystart_detect, "hybrid slow start detection mechanisms" " 1: packet-train 2: delay 3: both packet-train and delay"); module_param(hystart_low_window, int, 0644); MODULE_PARM_DESC(hystart_low_window, "lower bound cwnd for hybrid slow start"); diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 8ea4e9787f82ba65cd07b4c2b663df76fe4eb143..4af82b914dd4bbdc47e37cf1cf70f206bd186db5 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -341,6 +341,13 @@ bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, cookie->len = -1; return false; } + + /* Firewall blackhole issue check */ + if (tcp_fastopen_active_should_disable(sk)) { + cookie->len = -1; + return false; + } + if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) { cookie->len = -1; return true; @@ -380,3 +387,98 @@ bool tcp_fastopen_defer_connect(struct sock *sk, int *err) return false; } EXPORT_SYMBOL(tcp_fastopen_defer_connect); + +/* + * The following code block is to deal with middle box issues with TFO: + * Middlebox firewall issues can potentially cause server's data being + * blackholed after a successful 3WHS using TFO. + * The proposed solution is to disable active TFO globally under the + * following circumstances: + * 1. client side TFO socket receives out of order FIN + * 2. client side TFO socket receives out of order RST + * We disable active side TFO globally for 1hr at first. Then if it + * happens again, we disable it for 2h, then 4h, 8h, ... + * And we reset the timeout back to 1hr when we see a successful active + * TFO connection with data exchanges. 
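+ *
+ * Worked example with the default 1 hour timeout: the disable
+ * period doubles on each new event (1h, 2h, 4h, ...) and the
+ * multiplier below is capped at 2^6, i.e. at most 64 hours.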
+ */ + +/* Default to 1hr */ +unsigned int sysctl_tcp_fastopen_blackhole_timeout __read_mostly = 60 * 60; +static atomic_t tfo_active_disable_times __read_mostly = ATOMIC_INIT(0); +static unsigned long tfo_active_disable_stamp __read_mostly; + +/* Disable active TFO and record current jiffies and + * tfo_active_disable_times + */ +void tcp_fastopen_active_disable(struct sock *sk) +{ + atomic_inc(&tfo_active_disable_times); + tfo_active_disable_stamp = jiffies; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENBLACKHOLE); +} + +/* Reset tfo_active_disable_times to 0 */ +void tcp_fastopen_active_timeout_reset(void) +{ + atomic_set(&tfo_active_disable_times, 0); +} + +/* Calculate timeout for tfo active disable + * Return true if we are still in the active TFO disable period + * Return false if timeout already expired and we should use active TFO + */ +bool tcp_fastopen_active_should_disable(struct sock *sk) +{ + int tfo_da_times = atomic_read(&tfo_active_disable_times); + int multiplier; + unsigned long timeout; + + if (!tfo_da_times) + return false; + + /* Limit timout to max: 2^6 * initial timeout */ + multiplier = 1 << min(tfo_da_times - 1, 6); + timeout = multiplier * sysctl_tcp_fastopen_blackhole_timeout * HZ; + if (time_before(jiffies, tfo_active_disable_stamp + timeout)) + return true; + + /* Mark check bit so we can check for successful active TFO + * condition and reset tfo_active_disable_times + */ + tcp_sk(sk)->syn_fastopen_ch = 1; + return false; +} + +/* Disable active TFO if FIN is the only packet in the ofo queue + * and no data is received. + * Also check if we can reset tfo_active_disable_times if data is + * received successfully on a marked active TFO sockets opened on + * a non-loopback interface + */ +void tcp_fastopen_active_disable_ofo_check(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct rb_node *p; + struct sk_buff *skb; + struct dst_entry *dst; + + if (!tp->syn_fastopen) + return; + + if (!tp->data_segs_in) { + p = rb_first(&tp->out_of_order_queue); + if (p && !rb_next(p)) { + skb = rb_entry(p, struct sk_buff, rbnode); + if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) { + tcp_fastopen_active_disable(sk); + return; + } + } + } else if (tp->syn_fastopen_ch && + atomic_read(&tfo_active_disable_times)) { + dst = sk_dst_get(sk); + if (!(dst && dst->dev && (dst->dev->flags & IFF_LOOPBACK))) + tcp_fastopen_active_timeout_reset(); + dst_release(dst); + } +} diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 659d1baefb2bba36d96e412eb7ca5a02996fb6dd..5a3ad09e2786fb41ad12681d09938c645b69866d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,7 +85,6 @@ int sysctl_tcp_dsack __read_mostly = 1; int sysctl_tcp_app_win __read_mostly = 31; int sysctl_tcp_adv_win_scale __read_mostly = 1; EXPORT_SYMBOL(sysctl_tcp_adv_win_scale); -EXPORT_SYMBOL(sysctl_tcp_timestamps); /* rfc5961 challenge ack rate limiting */ int sysctl_tcp_challenge_ack_limit = 1000; @@ -442,7 +441,8 @@ void tcp_init_buffer_space(struct sock *sk) tcp_sndbuf_expand(sk); tp->rcvq_space.space = tp->rcv_wnd; - tp->rcvq_space.time = tcp_time_stamp; + skb_mstamp_get(&tp->tcp_mstamp); + tp->rcvq_space.time = tp->tcp_mstamp; tp->rcvq_space.seq = tp->copied_seq; maxwin = tcp_full_space(sk); @@ -518,7 +518,7 @@ EXPORT_SYMBOL(tcp_initialize_rcv_mss); */ static void tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) { - u32 new_sample = tp->rcv_rtt_est.rtt; + u32 new_sample = tp->rcv_rtt_est.rtt_us; long m = sample; if (m == 0) @@ -548,21 +548,23 @@ static void 
tcp_rcv_rtt_update(struct tcp_sock *tp, u32 sample, int win_dep) new_sample = m << 3; } - if (tp->rcv_rtt_est.rtt != new_sample) - tp->rcv_rtt_est.rtt = new_sample; + tp->rcv_rtt_est.rtt_us = new_sample; } static inline void tcp_rcv_rtt_measure(struct tcp_sock *tp) { - if (tp->rcv_rtt_est.time == 0) + u32 delta_us; + + if (tp->rcv_rtt_est.time.v64 == 0) goto new_measure; if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq)) return; - tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rcv_rtt_est.time, 1); + delta_us = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcv_rtt_est.time); + tcp_rcv_rtt_update(tp, delta_us, 1); new_measure: tp->rcv_rtt_est.seq = tp->rcv_nxt + tp->rcv_wnd; - tp->rcv_rtt_est.time = tcp_time_stamp; + tp->rcv_rtt_est.time = tp->tcp_mstamp; } static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, @@ -572,7 +574,10 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk, if (tp->rx_opt.rcv_tsecr && (TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) - tcp_rcv_rtt_update(tp, tcp_time_stamp - tp->rx_opt.rcv_tsecr, 0); + tcp_rcv_rtt_update(tp, + jiffies_to_usecs(tcp_time_stamp - + tp->rx_opt.rcv_tsecr), + 0); } /* @@ -585,8 +590,8 @@ void tcp_rcv_space_adjust(struct sock *sk) int time; int copied; - time = tcp_time_stamp - tp->rcvq_space.time; - if (time < (tp->rcv_rtt_est.rtt >> 3) || tp->rcv_rtt_est.rtt == 0) + time = skb_mstamp_us_delta(&tp->tcp_mstamp, &tp->rcvq_space.time); + if (time < (tp->rcv_rtt_est.rtt_us >> 3) || tp->rcv_rtt_est.rtt_us == 0) return; /* Number of bytes copied to user in last RTT */ @@ -642,7 +647,7 @@ void tcp_rcv_space_adjust(struct sock *sk) new_measure: tp->rcvq_space.seq = tp->copied_seq; - tp->rcvq_space.time = tcp_time_stamp; + tp->rcvq_space.time = tp->tcp_mstamp; } /* There is something which you must keep in mind when you analyze the @@ -1131,7 +1136,6 @@ struct tcp_sacktag_state { */ struct skb_mstamp first_sackt; struct skb_mstamp last_sackt; - struct skb_mstamp ack_time; /* Timestamp when the S/ACK was received */ struct rate_sample *rate; int flag; }; @@ -1214,8 +1218,7 @@ static u8 tcp_sacktag_one(struct sock *sk, return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { - tcp_rack_advance(tp, sacked, end_seq, - xmit_time, &state->ack_time); + tcp_rack_advance(tp, sacked, end_seq, xmit_time); if (sacked & TCPCB_SACKED_RETRANS) { /* If the segment is not tagged as lost, @@ -2760,8 +2763,7 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked) return false; } -static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, - const struct skb_mstamp *ack_time) +static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag) { struct tcp_sock *tp = tcp_sk(sk); @@ -2769,7 +2771,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, if (sysctl_tcp_recovery & TCP_RACK_LOSS_DETECTION) { u32 prior_retrans = tp->retrans_out; - tcp_rack_mark_lost(sk, ack_time); + tcp_rack_mark_lost(sk); if (prior_retrans > tp->retrans_out) *ack_flag |= FLAG_LOST_RETRANS; } @@ -2788,8 +2790,7 @@ static void tcp_rack_identify_loss(struct sock *sk, int *ack_flag, * tcp_xmit_retransmit_queue(). 
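* (The ack_time argument is dropped throughout: RACK now takes
* its receive timestamp from tp->tcp_mstamp, which is sampled
* once per incoming packet.)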
*/ static void tcp_fastretrans_alert(struct sock *sk, const int acked, - bool is_dupack, int *ack_flag, int *rexmit, - const struct skb_mstamp *ack_time) + bool is_dupack, int *ack_flag, int *rexmit) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); @@ -2857,11 +2858,11 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, tcp_try_keep_open(sk); return; } - tcp_rack_identify_loss(sk, ack_flag, ack_time); + tcp_rack_identify_loss(sk, ack_flag); break; case TCP_CA_Loss: tcp_process_loss(sk, flag, is_dupack, rexmit); - tcp_rack_identify_loss(sk, ack_flag, ack_time); + tcp_rack_identify_loss(sk, ack_flag); if (!(icsk->icsk_ca_state == TCP_CA_Open || (*ack_flag & FLAG_LOST_RETRANS))) return; @@ -2877,7 +2878,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked, if (icsk->icsk_ca_state <= TCP_CA_Disorder) tcp_try_undo_dsack(sk); - tcp_rack_identify_loss(sk, ack_flag, ack_time); + tcp_rack_identify_loss(sk, ack_flag); if (!tcp_time_to_recover(sk, flag)) { tcp_try_to_open(sk, flag); return; @@ -3059,8 +3060,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, { const struct inet_connection_sock *icsk = inet_csk(sk); struct skb_mstamp first_ackt, last_ackt; - struct skb_mstamp *now = &sack->ack_time; struct tcp_sock *tp = tcp_sk(sk); + struct skb_mstamp *now = &tp->tcp_mstamp; u32 prior_sacked = tp->sacked_out; u32 reord = tp->packets_out; bool fully_acked = true; @@ -3120,8 +3121,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets, tp->delivered += acked_pcount; if (!tcp_skb_spurious_retrans(tp, skb)) tcp_rack_advance(tp, sacked, scb->end_seq, - &skb->skb_mstamp, - &sack->ack_time); + &skb->skb_mstamp); } if (sacked & TCPCB_LOST) tp->lost_out -= acked_pcount; @@ -3576,8 +3576,6 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (after(ack, tp->snd_nxt)) goto invalid_ack; - skb_mstamp_get(&sack_state.ack_time); - if (icsk->icsk_pending == ICSK_TIME_LOSS_PROBE) tcp_rearm_rto(sk); @@ -3647,8 +3645,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) if (tcp_ack_is_dubious(sk, flag)) { is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP)); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, - &sack_state.ack_time); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); } if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3660,8 +3657,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_schedule_loss_probe(sk); delivered = tp->delivered - delivered; /* freshly ACKed or SACKed */ lost = tp->lost - lost; /* freshly marked lost */ - tcp_rate_gen(sk, delivered, lost, &sack_state.ack_time, - sack_state.rate); + tcp_rate_gen(sk, delivered, lost, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); return 1; @@ -3669,8 +3665,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) no_queue: /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, - &sack_state.ack_time); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); /* If this ack opens up a zero window, clear backoff. It was * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. 
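/* A minimal standalone sketch (hypothetical helper, not part of the
 * patch) of the skb_mstamp pattern this series converts the receiver
 * side to: tp->tcp_mstamp is sampled once per incoming packet and
 * microsecond deltas are derived from it, instead of the old
 * jiffies-based tcp_time_stamp arithmetic. This is how
 * tcp_rcv_rtt_measure() and tcp_rcv_space_adjust() above compute
 * their samples.
 */
static u32 rtt_sample_us(const struct tcp_sock *tp,
			 const struct skb_mstamp *then)
{
	/* skb_mstamp_us_delta() returns (t1 - t0) in microseconds */
	return skb_mstamp_us_delta(&tp->tcp_mstamp, then);
}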
@@ -3691,11 +3686,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) * If data was DSACKed, see if we can undo a cwnd reduction. */ if (TCP_SKB_CB(skb)->sacked) { - skb_mstamp_get(&sack_state.ack_time); flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit, - &sack_state.ack_time); + tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit); tcp_xmit_recovery(sk, rexmit); } @@ -3768,11 +3761,12 @@ void tcp_parse_options(const struct sk_buff *skb, !estab && sysctl_tcp_window_scaling) { __u8 snd_wscale = *(__u8 *)ptr; opt_rx->wscale_ok = 1; - if (snd_wscale > 14) { - net_info_ratelimited("%s: Illegal window scaling value %d >14 received\n", + if (snd_wscale > TCP_MAX_WSCALE) { + net_info_ratelimited("%s: Illegal window scaling value %d > %u received\n", __func__, - snd_wscale); - snd_wscale = 14; + snd_wscale, + TCP_MAX_WSCALE); + snd_wscale = TCP_MAX_WSCALE; } opt_rx->snd_wscale = snd_wscale; } @@ -4007,10 +4001,10 @@ void tcp_reset(struct sock *sk) /* This barrier is coupled with smp_rmb() in tcp_poll() */ smp_wmb(); + tcp_done(sk); + if (!sock_flag(sk, SOCK_DEAD)) sk->sk_error_report(sk); - - tcp_done(sk); } /* @@ -5299,8 +5293,16 @@ static bool tcp_validate_incoming(struct sock *sk, struct sk_buff *skb, if (rst_seq_match) tcp_reset(sk); - else + else { + /* Disable TFO if RST is out-of-order + * and no data has been received + * for current active TFO socket + */ + if (tp->syn_fastopen && !tp->data_segs_in && + sk->sk_state == TCP_ESTABLISHED) + tcp_fastopen_active_disable(sk); tcp_send_challenge_ack(sk, skb); + } goto discard; } @@ -5353,6 +5355,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); + skb_mstamp_get(&tp->tcp_mstamp); if (unlikely(!sk->sk_rx_dst)) inet_csk(sk)->icsk_af_ops->sk_rx_dst_set(sk, skb); /* @@ -5579,10 +5582,6 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb) else tp->pred_flags = 0; - if (!sock_flag(sk, SOCK_DEAD)) { - sk->sk_state_change(sk); - sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); - } } static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack, @@ -5651,6 +5650,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcp_sock *tp = tcp_sk(sk); struct tcp_fastopen_cookie foc = { .len = -1 }; int saved_clamp = tp->rx_opt.mss_clamp; + bool fastopen_fail; tcp_parse_options(skb, &tp->rx_opt, 0, &foc); if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) @@ -5754,10 +5754,15 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tcp_finish_connect(sk, skb); - if ((tp->syn_fastopen || tp->syn_data) && - tcp_rcv_fastopen_synack(sk, skb, &foc)) - return -1; + fastopen_fail = (tp->syn_fastopen || tp->syn_data) && + tcp_rcv_fastopen_synack(sk, skb, &foc); + if (!sock_flag(sk, SOCK_DEAD)) { + sk->sk_state_change(sk); + sk_wake_async(sk, SOCK_WAKE_IO, POLL_OUT); + } + if (fastopen_fail) + return -1; if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) { @@ -5911,6 +5916,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) case TCP_SYN_SENT: tp->rx_opt.saw_tstamp = 0; + skb_mstamp_get(&tp->tcp_mstamp); queued = tcp_rcv_synsent_state_process(sk, skb, th); if (queued >= 0) return queued; @@ -5922,6 +5928,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) return 0; } + skb_mstamp_get(&tp->tcp_mstamp); tp->rx_opt.saw_tstamp = 0; req = 
tp->fastopen_rsk; if (req) { @@ -6041,9 +6048,16 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) break; } - if (tp->linger2 < 0 || - (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && - after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt))) { + if (tp->linger2 < 0) { + tcp_done(sk); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); + return 1; + } + if (TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(skb)->seq && + after(TCP_SKB_CB(skb)->end_seq - th->fin, tp->rcv_nxt)) { + /* Receive out of order FIN after close() */ + if (tp->syn_fastopen && th->fin) + tcp_fastopen_active_disable(sk); tcp_done(sk); NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA); return 1; @@ -6332,37 +6346,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (security_inet_conn_request(sk, skb, req)) goto drop_and_free; - if (isn && tmp_opt.tstamp_ok) - af_ops->init_seq(skb, &tcp_rsk(req)->ts_off); + if (tmp_opt.tstamp_ok) + tcp_rsk(req)->ts_off = af_ops->init_ts_off(skb); if (!want_cookie && !isn) { - /* VJ's idea. We save last timestamp seen - * from the destination in peer table, when entering - * state TIME-WAIT, and check against it before - * accepting new connection request. - * - * If "isn" is not zero, this request hit alive - * timewait bucket, so that all the necessary checks - * are made in the function processing timewait state. - */ - if (net->ipv4.tcp_death_row.sysctl_tw_recycle) { - bool strict; - - dst = af_ops->route_req(sk, &fl, req, &strict); - - if (dst && strict && - !tcp_peer_is_proven(req, dst, true, - tmp_opt.saw_tstamp)) { - NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED); - goto drop_and_release; - } - } /* Kill the following clause, if you dislike this way. */ - else if (!net->ipv4.sysctl_tcp_syncookies && - (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < - (net->ipv4.sysctl_max_syn_backlog >> 2)) && - !tcp_peer_is_proven(req, dst, false, - tmp_opt.saw_tstamp)) { + if (!net->ipv4.sysctl_tcp_syncookies && + (net->ipv4.sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) < + (net->ipv4.sysctl_max_syn_backlog >> 2)) && + !tcp_peer_is_proven(req, dst)) { /* Without syncookies last quarter of * backlog is filled with destinations, * proven to be alive. 
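The tw_recycle logic removed above depended on TCP timestamps being comparable across all connections from one host; the init_seq()/init_ts_off() split makes that assumption permanently false, because the timestamp offset is now randomized per address pair while the ISN still covers the full 4-tuple. A rough userspace sketch of the derivation; hash64() and its key are stand-ins for the kernel's siphash-based secure_tcp_seq()/secure_tcp_ts_off(), and the clock component of the real ISN is omitted:

#include <stdint.h>
#include <stdio.h>

/* Toy keyed hash (FNV-1a folded with a secret), illustration only. */
static uint64_t hash64(const uint32_t *w, unsigned int n, uint64_t key)
{
	uint64_t h = 1469598103934665603ull ^ key;

	while (n--) {
		h ^= *w++;
		h *= 1099511628211ull;
	}
	return h;
}

/* ISN: mixes the full 4-tuple, so it differs per connection. */
static uint32_t init_seq(uint32_t saddr, uint32_t daddr,
			 uint16_t sport, uint16_t dport, uint64_t key)
{
	uint32_t in[3] = { saddr, daddr, ((uint32_t)sport << 16) | dport };

	return (uint32_t)hash64(in, 3, key);
}

/* Timestamp offset: depends only on the address pair, so every
 * connection to one peer shares a randomized timestamp clock while
 * different peers see unrelated clocks.
 */
static uint32_t init_ts_off(uint32_t saddr, uint32_t daddr, uint64_t key)
{
	uint32_t in[2] = { saddr, daddr };

	return (uint32_t)hash64(in, 2, key);
}

int main(void)
{
	uint64_t key = 0x243F6A8885A308D3ull;	/* boot-time secret */

	printf("isn=%08x ts_off=%08x\n",
	       init_seq(0x0a000001, 0x0a000002, 33000, 443, key),
	       init_ts_off(0x0a000001, 0x0a000002, key));
	return 0;
}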
@@ -6375,10 +6367,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, goto drop_and_release; } - isn = af_ops->init_seq(skb, &tcp_rsk(req)->ts_off); + isn = af_ops->init_seq(skb); } if (!dst) { - dst = af_ops->route_req(sk, &fl, req, NULL); + dst = af_ops->route_req(sk, &fl, req); if (!dst) goto drop_and_free; } @@ -6387,7 +6379,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, if (want_cookie) { isn = cookie_init_sequence(af_ops, sk, skb, &req->mss); - tcp_rsk(req)->ts_off = 0; req->cookie_ts = tmp_opt.tstamp_ok; if (!tmp_opt.tstamp_ok) inet_rsk(req)->ecn_ok = 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 575e19dcc01763ef3fa938dea3ea51995b573163..5ab2aac5ca191075383fc75214da816873bb222c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -94,12 +94,18 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key, struct inet_hashinfo tcp_hashinfo; EXPORT_SYMBOL(tcp_hashinfo); -static u32 tcp_v4_init_sequence(const struct sk_buff *skb, u32 *tsoff) +static u32 tcp_v4_init_seq(const struct sk_buff *skb) { - return secure_tcp_sequence_number(ip_hdr(skb)->daddr, - ip_hdr(skb)->saddr, - tcp_hdr(skb)->dest, - tcp_hdr(skb)->source, tsoff); + return secure_tcp_seq(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); +} + +static u32 tcp_v4_init_ts_off(const struct sk_buff *skb) +{ + return secure_tcp_ts_off(ip_hdr(skb)->daddr, + ip_hdr(skb)->saddr); } int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) @@ -145,7 +151,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) struct flowi4 *fl4; struct rtable *rt; int err; - u32 seq; struct ip_options_rcu *inet_opt; struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; @@ -198,10 +203,6 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) tp->write_seq = 0; } - if (tcp_death_row->sysctl_tw_recycle && - !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr) - tcp_fetch_timewait_stamp(sk, &rt->dst); - inet->inet_dport = usin->sin_port; sk_daddr_set(sk, daddr); @@ -236,13 +237,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) rt = NULL; if (likely(!tp->repair)) { - seq = secure_tcp_sequence_number(inet->inet_saddr, - inet->inet_daddr, - inet->inet_sport, - usin->sin_port, - &tp->tsoffset); if (!tp->write_seq) - tp->write_seq = seq; + tp->write_seq = secure_tcp_seq(inet->inet_saddr, + inet->inet_daddr, + inet->inet_sport, + usin->sin_port); + tp->tsoffset = secure_tcp_ts_off(inet->inet_saddr, + inet->inet_daddr); } inet->inet_id = tp->write_seq ^ jiffies; @@ -1217,19 +1218,9 @@ static void tcp_v4_init_req(struct request_sock *req, static struct dst_entry *tcp_v4_route_req(const struct sock *sk, struct flowi *fl, - const struct request_sock *req, - bool *strict) + const struct request_sock *req) { - struct dst_entry *dst = inet_csk_route_req(sk, &fl->u.ip4, req); - - if (strict) { - if (fl->u.ip4.daddr == inet_rsk(req)->ir_rmt_addr) - *strict = true; - else - *strict = false; - } - - return dst; + return inet_csk_route_req(sk, &fl->u.ip4, req); } struct request_sock_ops tcp_request_sock_ops __read_mostly = { @@ -1253,7 +1244,8 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .cookie_init_seq = cookie_v4_init_sequence, #endif .route_req = tcp_v4_route_req, - .init_seq = tcp_v4_init_sequence, + .init_seq = tcp_v4_init_seq, + .init_ts_off = tcp_v4_init_ts_off, .send_synack = tcp_v4_send_synack, }; @@ -1423,8 +1415,6 
@@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) if (!nsk) goto discard; if (nsk != sk) { - sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) { rsk = nsk; goto reset; @@ -1871,6 +1861,9 @@ void tcp_v4_destroy_sock(struct sock *sk) /* Cleanup up the write buffer. */ tcp_write_queue_purge(sk); + /* Check if we want to disable active TFO */ + tcp_fastopen_active_disable_ofo_check(sk); + /* Cleans up our, hopefully empty, out_of_order_queue. */ skb_rbtree_purge(&tp->out_of_order_queue); @@ -2402,7 +2395,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, .h.hashinfo = &tcp_hashinfo, @@ -2466,7 +2459,6 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_tw_reuse = 0; cnt = tcp_hashinfo.ehash_mask + 1; - net->ipv4.tcp_death_row.sysctl_tw_recycle = 0; net->ipv4.tcp_death_row.sysctl_max_tw_buckets = (cnt + 1) / 2; net->ipv4.tcp_death_row.hashinfo = &tcp_hashinfo; diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 046fd3910873306d74207615d6997e1c847ea361..d6fb6c067af4641f232b94e7c590c212648e8173 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -264,13 +264,15 @@ static void tcp_lp_pkts_acked(struct sock *sk, const struct ack_sample *sample) { struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); + u32 delta; if (sample->rtt_us > 0) tcp_lp_rtt_sample(sk, sample->rtt_us); /* calc inference */ - if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) - lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); + delta = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + if ((s32)delta > 0) + lp->inference = 3 * delta; /* test if within inference */ if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c index 0f46e5fe31ad1b6809ada1f70bce7b63df4f8c9c..653bbd67e3a39b68d27d26d17571c00ce2854bfd 100644 --- a/net/ipv4/tcp_metrics.c +++ b/net/ipv4/tcp_metrics.c @@ -45,8 +45,6 @@ struct tcp_metrics_block { struct inetpeer_addr tcpm_saddr; struct inetpeer_addr tcpm_daddr; unsigned long tcpm_stamp; - u32 tcpm_ts; - u32 tcpm_ts_stamp; u32 tcpm_lock; u32 tcpm_vals[TCP_METRIC_MAX_KERNEL + 1]; struct tcp_fastopen_metrics tcpm_fastopen; @@ -123,8 +121,6 @@ static void tcpm_suck_dst(struct tcp_metrics_block *tm, tm->tcpm_vals[TCP_METRIC_SSTHRESH] = dst_metric_raw(dst, RTAX_SSTHRESH); tm->tcpm_vals[TCP_METRIC_CWND] = dst_metric_raw(dst, RTAX_CWND); tm->tcpm_vals[TCP_METRIC_REORDERING] = dst_metric_raw(dst, RTAX_REORDERING); - tm->tcpm_ts = 0; - tm->tcpm_ts_stamp = 0; if (fastopen_clear) { tm->tcpm_fastopen.mss = 0; tm->tcpm_fastopen.syn_loss = 0; @@ -273,48 +269,6 @@ static struct tcp_metrics_block *__tcp_get_metrics_req(struct request_sock *req, return tm; } -static struct tcp_metrics_block *__tcp_get_metrics_tw(struct inet_timewait_sock *tw) -{ - struct tcp_metrics_block *tm; - struct inetpeer_addr saddr, daddr; - unsigned int hash; - struct net *net; - - if (tw->tw_family == AF_INET) { - inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr); - inetpeer_set_addr_v4(&daddr, tw->tw_daddr); - hash = ipv4_addr_hash(tw->tw_daddr); - } -#if IS_ENABLED(CONFIG_IPV6) - else if (tw->tw_family == AF_INET6) { - if (ipv6_addr_v4mapped(&tw->tw_v6_daddr)) { - inetpeer_set_addr_v4(&saddr, tw->tw_rcv_saddr); - inetpeer_set_addr_v4(&daddr, tw->tw_daddr); - 
hash = ipv4_addr_hash(tw->tw_daddr); - } else { - inetpeer_set_addr_v6(&saddr, &tw->tw_v6_rcv_saddr); - inetpeer_set_addr_v6(&daddr, &tw->tw_v6_daddr); - hash = ipv6_addr_hash(&tw->tw_v6_daddr); - } - } -#endif - else - return NULL; - - net = twsk_net(tw); - hash ^= net_hash_mix(net); - hash = hash_32(hash, tcp_metrics_hash_log); - - for (tm = rcu_dereference(tcp_metrics_hash[hash].chain); tm; - tm = rcu_dereference(tm->tcpm_next)) { - if (addr_same(&tm->tcpm_saddr, &saddr) && - addr_same(&tm->tcpm_daddr, &daddr) && - net_eq(tm_net(tm), net)) - break; - } - return tm; -} - static struct tcp_metrics_block *tcp_get_metrics(struct sock *sk, struct dst_entry *dst, bool create) @@ -573,8 +527,7 @@ void tcp_init_metrics(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, - bool paws_check, bool timestamps) +bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst) { struct tcp_metrics_block *tm; bool ret; @@ -584,94 +537,10 @@ bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst, rcu_read_lock(); tm = __tcp_get_metrics_req(req, dst); - if (paws_check) { - if (tm && - (u32)get_seconds() - tm->tcpm_ts_stamp < TCP_PAWS_MSL && - ((s32)(tm->tcpm_ts - req->ts_recent) > TCP_PAWS_WINDOW || - !timestamps)) - ret = false; - else - ret = true; - } else { - if (tm && tcp_metric_get(tm, TCP_METRIC_RTT) && tm->tcpm_ts_stamp) - ret = true; - else - ret = false; - } - rcu_read_unlock(); - - return ret; -} - -void tcp_fetch_timewait_stamp(struct sock *sk, struct dst_entry *dst) -{ - struct tcp_metrics_block *tm; - - rcu_read_lock(); - tm = tcp_get_metrics(sk, dst, true); - if (tm) { - struct tcp_sock *tp = tcp_sk(sk); - - if ((u32)get_seconds() - tm->tcpm_ts_stamp <= TCP_PAWS_MSL) { - tp->rx_opt.ts_recent_stamp = tm->tcpm_ts_stamp; - tp->rx_opt.ts_recent = tm->tcpm_ts; - } - } - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(tcp_fetch_timewait_stamp); - -/* VJ's idea. Save last timestamp seen from this destination and hold - * it at least for normal timewait interval to use for duplicate - * segment detection in subsequent connections, before they enter - * synchronized state. 
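Both the tcp_lp fix above (testing the sign of the tsecr delta instead of comparing raw values) and the timestamp checks deleted from tcp_metrics rely on the same serial-number idiom: on a wrapping 32-bit clock, "is a later than b" must be asked via the signed difference, never via a plain comparison. In miniature:

#include <stdint.h>
#include <stdio.h>

/* True if a is later than b on a wrapping 32-bit clock.  Valid as
 * long as the two stamps are within 2^31 ticks of each other; a
 * plain "a > b" gives the wrong answer across the wrap.
 */
static int after32(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) > 0;
}

int main(void)
{
	uint32_t before_wrap = 0xFFFFFFF0u;
	uint32_t after_wrap = 0x10u;	/* 32 ticks later, post-wrap */

	printf("naive:  %d\n", after_wrap > before_wrap);	  /* 0, wrong */
	printf("serial: %d\n", after32(after_wrap, before_wrap)); /* 1, right */
	return 0;
}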
- */ -bool tcp_remember_stamp(struct sock *sk) -{ - struct dst_entry *dst = __sk_dst_get(sk); - bool ret = false; - - if (dst) { - struct tcp_metrics_block *tm; - - rcu_read_lock(); - tm = tcp_get_metrics(sk, dst, true); - if (tm) { - struct tcp_sock *tp = tcp_sk(sk); - - if ((s32)(tm->tcpm_ts - tp->rx_opt.ts_recent) <= 0 || - ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && - tm->tcpm_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) { - tm->tcpm_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp; - tm->tcpm_ts = tp->rx_opt.ts_recent; - } - ret = true; - } - rcu_read_unlock(); - } - return ret; -} - -bool tcp_tw_remember_stamp(struct inet_timewait_sock *tw) -{ - struct tcp_metrics_block *tm; - bool ret = false; - - rcu_read_lock(); - tm = __tcp_get_metrics_tw(tw); - if (tm) { - const struct tcp_timewait_sock *tcptw; - struct sock *sk = (struct sock *) tw; - - tcptw = tcp_twsk(sk); - if ((s32)(tm->tcpm_ts - tcptw->tw_ts_recent) <= 0 || - ((u32)get_seconds() - tm->tcpm_ts_stamp > TCP_PAWS_MSL && - tm->tcpm_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) { - tm->tcpm_ts_stamp = (u32)tcptw->tw_ts_recent_stamp; - tm->tcpm_ts = tcptw->tw_ts_recent; - } + if (tm && tcp_metric_get(tm, TCP_METRIC_RTT)) ret = true; - } + else + ret = false; rcu_read_unlock(); return ret; @@ -791,14 +660,6 @@ static int tcp_metrics_fill_info(struct sk_buff *msg, jiffies - tm->tcpm_stamp, TCP_METRICS_ATTR_PAD) < 0) goto nla_put_failure; - if (tm->tcpm_ts_stamp) { - if (nla_put_s32(msg, TCP_METRICS_ATTR_TW_TS_STAMP, - (s32) (get_seconds() - tm->tcpm_ts_stamp)) < 0) - goto nla_put_failure; - if (nla_put_u32(msg, TCP_METRICS_ATTR_TW_TSVAL, - tm->tcpm_ts) < 0) - goto nla_put_failure; - } { int n = 0; @@ -1150,10 +1011,7 @@ static int __net_init tcp_net_metrics_init(struct net *net) tcp_metrics_hash_log = order_base_2(slots); size = sizeof(struct tcpm_hash_bucket) << tcp_metrics_hash_log; - tcp_metrics_hash = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); - if (!tcp_metrics_hash) - tcp_metrics_hash = vzalloc(size); - + tcp_metrics_hash = kvzalloc(size, GFP_KERNEL); if (!tcp_metrics_hash) return -ENOMEM; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 65c0f3d13eca47c6394c09925decf54287d01b48..717be4de53248352c758b50557987d898340dd4f 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -26,6 +26,7 @@ #include #include #include +#include int sysctl_tcp_abort_on_overflow __read_mostly; @@ -94,7 +95,6 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, struct tcp_options_received tmp_opt; struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); bool paws_reject = false; - struct inet_timewait_death_row *tcp_death_row = &sock_net((struct sock*)tw)->ipv4.tcp_death_row; tmp_opt.saw_tstamp = 0; if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) { @@ -149,12 +149,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, tcptw->tw_ts_recent = tmp_opt.rcv_tsval; } - if (tcp_death_row->sysctl_tw_recycle && - tcptw->tw_ts_recent_stamp && - tcp_tw_remember_stamp(tw)) - inet_twsk_reschedule(tw, tw->tw_timeout); - else - inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); return TCP_TW_ACK; } @@ -259,12 +254,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); struct inet_timewait_sock *tw; - bool recycle_ok = false; struct inet_timewait_death_row *tcp_death_row = 
&sock_net(sk)->ipv4.tcp_death_row; - if (tcp_death_row->sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) - recycle_ok = tcp_remember_stamp(sk); - tw = inet_twsk_alloc(sk, tcp_death_row, state); if (tw) { @@ -317,13 +308,9 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) if (timeo < rto) timeo = rto; - if (recycle_ok) { - tw->tw_timeout = rto; - } else { - tw->tw_timeout = TCP_TIMEWAIT_LEN; - if (state == TCP_TIME_WAIT) - timeo = TCP_TIMEWAIT_LEN; - } + tw->tw_timeout = TCP_TIMEWAIT_LEN; + if (state == TCP_TIME_WAIT) + timeo = TCP_TIMEWAIT_LEN; inet_twsk_schedule(tw, timeo); /* Linkage updates. */ @@ -536,6 +523,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newtp, req); + newtp->fastopen_req = NULL; newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; newtp->rack.mstamp.v64 = 0; @@ -813,6 +801,9 @@ int tcp_child_process(struct sock *parent, struct sock *child, int ret = 0; int state = child->sk_state; + /* record NAPI ID of child */ + sk_mark_napi_id(child, skb); + tcp_segs_in(tcp_sk(child), skb); if (!sock_owned_by_user(child)) { ret = tcp_rcv_state_process(child, skb); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index c3c082ed38796f65948e7a1042e37813b71d5abf..4858e190f6ac130c9441f58cb8944cc82bf67270 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -212,12 +212,12 @@ void tcp_select_initial_window(int __space, __u32 mss, /* If no clamp set the clamp to the max possible scaled window */ if (*window_clamp == 0) - (*window_clamp) = (65535 << 14); + (*window_clamp) = (U16_MAX << TCP_MAX_WSCALE); space = min(*window_clamp, space); /* Quantize space offering to a multiple of mss if possible. */ if (space > mss) - space = (space / mss) * mss; + space = rounddown(space, mss); /* NOTE: offering an initial window larger than 32767 * will break some buggy TCP stacks. If the admin tells us @@ -234,13 +234,11 @@ void tcp_select_initial_window(int __space, __u32 mss, (*rcv_wscale) = 0; if (wscale_ok) { - /* Set window scaling on max possible window - * See RFC1323 for an explanation of the limit to 14 - */ + /* Set window scaling on max possible window */ space = max_t(u32, space, sysctl_tcp_rmem[2]); space = max_t(u32, space, sysctl_rmem_max); space = min_t(u32, space, *window_clamp); - while (space > 65535 && (*rcv_wscale) < 14) { + while (space > U16_MAX && (*rcv_wscale) < TCP_MAX_WSCALE) { space >>= 1; (*rcv_wscale)++; } @@ -253,7 +251,7 @@ void tcp_select_initial_window(int __space, __u32 mss, } /* Set the clamp no higher than max representable value */ - (*window_clamp) = min(65535U << (*rcv_wscale), *window_clamp); + (*window_clamp) = min_t(__u32, U16_MAX << (*rcv_wscale), *window_clamp); } EXPORT_SYMBOL(tcp_select_initial_window); @@ -1267,7 +1265,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, * eventually). The difference is that pulled data not copied, but * immediately discarded. 
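The tcp_select_initial_window() hunks above replace the hard-coded 65535/14 pair with U16_MAX and TCP_MAX_WSCALE, and the hand-rolled quantization with rounddown(). The underlying computation, find the smallest shift that lets the 16-bit window field cover the buffer and then offer whole segments, fits in a few lines (a sketch, not the kernel function):

#include <stdint.h>
#include <stdio.h>

#define U16_MAX		0xFFFFu
#define TCP_MAX_WSCALE	14	/* protocol limit for the wscale option */

/* Smallest shift so that (space >> wscale) fits the 16-bit window
 * field, capped at TCP_MAX_WSCALE as in tcp_select_initial_window().
 */
static unsigned int pick_wscale(uint32_t space)
{
	unsigned int wscale = 0;

	while (space > U16_MAX && wscale < TCP_MAX_WSCALE) {
		space >>= 1;
		wscale++;
	}
	return wscale;
}

/* rounddown(space, mss): offer a whole number of segments. */
static uint32_t quantize(uint32_t space, uint32_t mss)
{
	return space > mss ? space - space % mss : space;
}

int main(void)
{
	printf("wscale for 4 MiB: %u\n", pick_wscale(4u << 20));	  /* 7 */
	printf("quantized window: %u\n", quantize(100000, 1460)); /* 99280 */
	return 0;
}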
*/ -static void __pskb_trim_head(struct sk_buff *skb, int len) +static int __pskb_trim_head(struct sk_buff *skb, int len) { struct skb_shared_info *shinfo; int i, k, eat; @@ -1277,7 +1275,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) __skb_pull(skb, eat); len -= eat; if (!len) - return; + return 0; } eat = len; k = 0; @@ -1303,23 +1301,28 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) skb_reset_tail_pointer(skb); skb->data_len -= len; skb->len = skb->data_len; + return len; } /* Remove acked data from a packet in the transmit queue. */ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { + u32 delta_truesize; + if (skb_unclone(skb, GFP_ATOMIC)) return -ENOMEM; - __pskb_trim_head(skb, len); + delta_truesize = __pskb_trim_head(skb, len); TCP_SKB_CB(skb)->seq += len; skb->ip_summed = CHECKSUM_PARTIAL; - skb->truesize -= len; - sk->sk_wmem_queued -= len; - sk_mem_uncharge(sk, len); - sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + if (delta_truesize) { + skb->truesize -= delta_truesize; + sk->sk_wmem_queued -= delta_truesize; + sk_mem_uncharge(sk, delta_truesize); + sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + } /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) @@ -1511,6 +1514,7 @@ static void tcp_cwnd_application_limited(struct sock *sk) static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); /* Track the maximum number of outstanding packets in each @@ -1533,7 +1537,8 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) tp->snd_cwnd_used = tp->packets_out; if (sysctl_tcp_slow_start_after_idle && - (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto) + (s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto && + !ca_ops->cong_control) tcp_cwnd_application_limited(sk); /* The following conditions together indicate the starvation @@ -2561,7 +2566,6 @@ u32 __tcp_select_window(struct sock *sk) /* Don't do rounding if we are using window scaling, since the * scaled window will not line up with the MSS boundary anyway. */ - window = tp->rcv_wnd; if (tp->rx_opt.rcv_wscale) { window = free_space; @@ -2569,10 +2573,9 @@ u32 __tcp_select_window(struct sock *sk) * Import case: prevent zero window announcement if * 1<<rcv_wscale > mss. */ - if (((window >> tp->rx_opt.rcv_wscale) << tp->rx_opt.rcv_wscale) != window) - window = (((window >> tp->rx_opt.rcv_wscale) + 1) - << tp->rx_opt.rcv_wscale); + window = ALIGN(window, (1 << tp->rx_opt.rcv_wscale)); } else { + window = tp->rcv_wnd; /* Get the largest window that is a nice multiple of mss. * Window clamp already applied above. * If our current window offering is within 1 mss of the @@ -2582,7 +2585,7 @@ u32 __tcp_select_window(struct sock *sk) * is too small. */ if (window <= free_space - mss || window > free_space) - window = (free_space / mss) * mss; + window = rounddown(free_space, mss); else if (mss == full_space && free_space > window + (full_space >> 1)) window = free_space; diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c index 9be1581a5a08c36f4544fbdabedd9741fb266a1e..c6a9fa8946462100947ab62d86464ff8f99565c2 100644 --- a/net/ipv4/tcp_rate.c +++ b/net/ipv4/tcp_rate.c @@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, /* Update the connection delivery information and generate a rate sample.
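In __tcp_select_window() above, the open-coded shift-and-round of the scaled window becomes ALIGN(window, 1 << rcv_wscale). For a power-of-two granularity both forms round up to the next multiple; the kernel macro does it with a mask. A quick standalone check that the two agree:

#include <stdint.h>
#include <stdio.h>

/* ALIGN(x, a) for power-of-two a: round x up to a multiple of a. */
#define ALIGN_POW2(x, a)	(((x) + ((a) - 1)) & ~((uint32_t)(a) - 1))

int main(void)
{
	uint32_t window = 70000;
	unsigned int rcv_wscale = 7;	/* granularity: 128 bytes */
	uint32_t granule = 1u << rcv_wscale;

	/* the old open-coded form */
	uint32_t old = window;
	if (((old >> rcv_wscale) << rcv_wscale) != old)
		old = ((old >> rcv_wscale) + 1) << rcv_wscale;

	uint32_t aligned = ALIGN_POW2(window, granule);

	printf("old=%u aligned=%u\n", old, aligned);	/* both 70016 */
	return 0;
}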
*/ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, - struct skb_mstamp *now, struct rate_sample *rs) + struct rate_sample *rs) { struct tcp_sock *tp = tcp_sk(sk); u32 snd_us, ack_us; @@ -120,7 +120,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * to carry current time, flags, stats like "tcp_sacktag_state". */ if (delivered) - tp->delivered_mstamp = *now; + tp->delivered_mstamp = tp->tcp_mstamp; rs->acked_sacked = delivered; /* freshly ACKed or SACKed */ rs->losses = lost; /* freshly marked lost */ @@ -138,7 +138,8 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, * longer phase. */ snd_us = rs->interval_us; /* send phase */ - ack_us = skb_mstamp_us_delta(now, &rs->prior_mstamp); /* ack phase */ + ack_us = skb_mstamp_us_delta(&tp->tcp_mstamp, + &rs->prior_mstamp); /* ack phase */ rs->interval_us = max(snd_us, ack_us); /* Normally we expect interval_us >= min-rtt. diff --git a/net/ipv4/tcp_recovery.c b/net/ipv4/tcp_recovery.c index d8acbd9f477a2ac6b0f8eee1bf59f3ab43abff07..362b8c75bfab44cf87c2a01398a146a271bc1119 100644 --- a/net/ipv4/tcp_recovery.c +++ b/net/ipv4/tcp_recovery.c @@ -45,8 +45,7 @@ static bool tcp_rack_sent_after(const struct skb_mstamp *t1, * or tcp_time_to_recover()'s "Trick#1: the loss is proven" code path will * make us enter the CA_Recovery state. */ -static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, - u32 *reo_timeout) +static void tcp_rack_detect_loss(struct sock *sk, u32 *reo_timeout) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -79,7 +78,7 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, * A packet is lost if its elapsed time is beyond * the recent RTT plus the reordering window. */ - u32 elapsed = skb_mstamp_us_delta(now, + u32 elapsed = skb_mstamp_us_delta(&tp->tcp_mstamp, &skb->skb_mstamp); s32 remaining = tp->rack.rtt_us + reo_wnd - elapsed; @@ -105,7 +104,7 @@ static void tcp_rack_detect_loss(struct sock *sk, const struct skb_mstamp *now, } } -void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) +void tcp_rack_mark_lost(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 timeout; @@ -115,7 +114,7 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) /* Reset the advanced flag to avoid unnecessary queue scanning */ tp->rack.advanced = 0; - tcp_rack_detect_loss(sk, now, &timeout); + tcp_rack_detect_loss(sk, &timeout); if (timeout) { timeout = usecs_to_jiffies(timeout + TCP_REO_TIMEOUT_MIN); inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, @@ -128,8 +127,7 @@ void tcp_rack_mark_lost(struct sock *sk, const struct skb_mstamp *now) * draft-cheng-tcpm-rack-00.txt */ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, - const struct skb_mstamp *xmit_time, - const struct skb_mstamp *ack_time) + const struct skb_mstamp *xmit_time) { u32 rtt_us; @@ -138,7 +136,7 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, end_seq, tp->rack.end_seq)) return; - rtt_us = skb_mstamp_us_delta(ack_time, xmit_time); + rtt_us = skb_mstamp_us_delta(&tp->tcp_mstamp, xmit_time); if (sacked & TCPCB_RETRANS) { /* If the sacked packet was retransmitted, it's ambiguous * whether the retransmission or the original (or the prior @@ -165,12 +163,11 @@ void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, void tcp_rack_reo_timeout(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - struct skb_mstamp now; u32 timeout, prior_inflight; - skb_mstamp_get(&now); prior_inflight = 
tcp_packets_in_flight(tp); - tcp_rack_detect_loss(sk, &now, &timeout); + skb_mstamp_get(&tp->tcp_mstamp); + tcp_rack_detect_loss(sk, &timeout); if (prior_inflight != tcp_packets_in_flight(tp)) { if (inet_csk(sk)->icsk_ca_state != TCP_CA_Recovery) { tcp_enter_recovery(sk, false); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index b2ab411c6d3728fa7dbdebde045532a7317f5166..14672543cf0bd27bc59976d5cec38d2d3bbcdd2c 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -201,11 +201,10 @@ static int tcp_write_timeout(struct sock *sk) if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) { /* Some middle-boxes may black-hole Fast Open _after_ * the handshake. Therefore we conservatively disable - * Fast Open on this path on recurring timeouts with - * few or zero bytes acked after Fast Open. + * Fast Open on this path on recurring timeouts after + * successful Fast Open. */ - if (tp->syn_data_acked && - tp->bytes_acked <= tp->rx_opt.mss_clamp) { + if (tp->syn_data_acked) { tcp_fastopen_cache_set(sk, 0, NULL, true, 0); if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1) NET_INC_STATS(sock_net(sk), diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index fed66dc0e0f5f242cf0af25434fa9cfa89998958..9775453b8d174c848dc09df83d1fa185422cd8cc 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -265,8 +265,8 @@ static size_t tcp_westwood_info(struct sock *sk, u32 ext, int *attr, if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { info->vegas.tcpv_enabled = 1; info->vegas.tcpv_rttcnt = 0; - info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt), - info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min), + info->vegas.tcpv_rtt = jiffies_to_usecs(ca->rtt); + info->vegas.tcpv_minrtt = jiffies_to_usecs(ca->rtt_min); *attr = INET_DIAG_VEGASINFO; return sizeof(struct tcpvegas_info); diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index b2be1d9757efb8ce8b82dc0a0fe3a475d193ea5b..781250151d40ee4559f7b90d15dccad8ffaeafd0 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -29,6 +29,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, u16 mac_len = skb->mac_len; int udp_offset, outer_hlen; __wsum partial; + bool need_ipsec; if (unlikely(!pskb_may_pull(skb, tnl_hlen))) goto out; @@ -62,8 +63,10 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb, ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP); + need_ipsec = skb_dst(skb) && dst_xfrm(skb_dst(skb)); /* Try to offload checksum if possible */ offload_csum = !!(need_csum && + !need_ipsec && (skb->dev->features & (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) : (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)))); diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index 4acc0508c5ebc65dc392de50a207901b2ea8d305..3d36644890bb6d3b0a755c811c60e920ad5cd8b8 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -12,6 +12,7 @@ #include #include #include +#include /* Add encapsulation header. 
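tcp_rack_detect_loss() above now measures elapsed time against the cached tp->tcp_mstamp, but the decision is the usual RACK rule from draft-cheng-tcpm-rack: a packet outstanding longer than the recent RTT plus a reordering window is declared lost, and otherwise the residue feeds the reorder timer serviced by tcp_rack_reo_timeout(). A simplified standalone version of that predicate:

#include <stdint.h>
#include <stdio.h>

/* RACK-style check for one outstanding packet.  Returns 1 if it
 * should be marked lost; otherwise grows *timeout_us so the caller
 * can re-run detection when the reordering window would expire.
 */
static int rack_lost(uint32_t now_us, uint32_t xmit_us,
		     uint32_t rtt_us, uint32_t reo_wnd_us,
		     uint32_t *timeout_us)
{
	uint32_t elapsed = now_us - xmit_us;
	int32_t remaining = (int32_t)(rtt_us + reo_wnd_us - elapsed);

	if (remaining < 0)
		return 1;	/* beyond RTT + reordering window */
	if ((uint32_t)remaining > *timeout_us)
		*timeout_us = remaining;
	return 0;
}

int main(void)
{
	uint32_t timeout = 0;

	/* sent 51 ms ago, RTT 40 ms, reordering window 10 ms: lost */
	printf("lost=%d\n", rack_lost(51000, 0, 40000, 10000, &timeout));
	/* sent 45 ms ago: not yet, re-check in 5 ms */
	printf("lost=%d timeout=%u\n",
	       rack_lost(45000, 0, 40000, 10000, &timeout), timeout);
	return 0;
}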
* @@ -23,6 +24,8 @@ static int xfrm4_transport_output(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *iph = ip_hdr(skb); int ihl = iph->ihl * 4; + skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); @@ -56,9 +59,40 @@ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) return 0; } +static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + const struct net_offload *ops; + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct xfrm_offload *xo = xfrm_offload(skb); + + skb->transport_header += x->props.header_len; + ops = rcu_dereference(inet_offloads[xo->proto]); + if (likely(ops && ops->callbacks.gso_segment)) + segs = ops->callbacks.gso_segment(skb, features); + + return segs; +} + +static void xfrm4_transport_xmit(struct xfrm_state *x, struct sk_buff *skb) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + + skb_reset_mac_len(skb); + pskb_pull(skb, skb->mac_len + sizeof(struct iphdr) + x->props.header_len); + + if (xo->flags & XFRM_GSO_SEGMENT) { + skb_reset_transport_header(skb); + skb->transport_header -= x->props.header_len; + } +} + static struct xfrm_mode xfrm4_transport_mode = { .input = xfrm4_transport_input, .output = xfrm4_transport_output, + .gso_segment = xfrm4_transport_gso_segment, + .xmit = xfrm4_transport_xmit, .owner = THIS_MODULE, .encap = XFRM_MODE_TRANSPORT, }; diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 35feda67646494c92263cf30109432fb395fa1df..e6265e2c274e48a54cfc08afb9fee6c9fc2ad194 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -33,6 +33,9 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) struct iphdr *top_iph; int flags; + skb_set_inner_network_header(skb, skb_network_offset(skb)); + skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + offsetof(struct iphdr, protocol); @@ -96,11 +99,36 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) return err; } +static struct sk_buff *xfrm4_mode_tunnel_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + __skb_push(skb, skb->mac_len); + return skb_mac_gso_segment(skb, features); + +} + +static void xfrm4_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + + if (xo->flags & XFRM_GSO_SEGMENT) { + skb->network_header = skb->network_header - x->props.header_len; + skb->transport_header = skb->network_header + + sizeof(struct iphdr); + } + + skb_reset_mac_len(skb); + pskb_pull(skb, skb->mac_len + x->props.header_len); +} + static struct xfrm_mode xfrm4_tunnel_mode = { .input2 = xfrm4_mode_tunnel_input, .input = xfrm_prepare_input, .output2 = xfrm4_mode_tunnel_output, .output = xfrm4_prepare_output, + .gso_segment = xfrm4_mode_tunnel_gso_segment, + .xmit = xfrm4_mode_tunnel_xmit, .owner = THIS_MODULE, .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 7ee6518afa86ff785cacda0f115297ff6e5d0fa5..94b8702603bc54f36542977ed0d9c3b93d43b6b7 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -29,7 +29,8 @@ static int xfrm4_tunnel_check_size(struct sk_buff *skb) goto out; mtu = 
dst_mtu(skb_dst(skb)); - if (skb->len > mtu) { + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && skb_gso_network_seglen(skb) > ip_skb_dst_mtu(skb->sk, skb))) { skb->protocol = htons(ETH_P_IP); if (skb->sk) diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index e2afe677a9d944a2c6c27a2e7b2d06227712cf89..48c452959d2c2fe687472c3732fe40246a8c863a 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -307,6 +307,7 @@ config IPV6_SEG6_LWTUNNEL bool "IPv6: Segment Routing Header encapsulation support" depends on IPV6 select LWTUNNEL + select DST_CACHE ---help--- Support for encapsulation of packets within an outer IPv6 header and a Segment Routing Header using the lightweight diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 80ce478c4851318f6eef9320bbdb8548641f920c..8d297a79b5680761b290aaa1bc947d05bcb60f3b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -224,6 +224,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { .accept_ra_rtr_pref = 1, .rtr_probe_interval = 60 * HZ, #ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_min_plen = 0, .accept_ra_rt_info_max_plen = 0, #endif #endif @@ -245,6 +246,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = { #endif .enhanced_dad = 1, .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, + .disable_policy = 0, }; static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { @@ -276,6 +278,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { .accept_ra_rtr_pref = 1, .rtr_probe_interval = 60 * HZ, #ifdef CONFIG_IPV6_ROUTE_INFO + .accept_ra_rt_info_min_plen = 0, .accept_ra_rt_info_max_plen = 0, #endif #endif @@ -297,6 +300,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = { #endif .enhanced_dad = 1, .addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64, + .disable_policy = 0, }; /* Check if a valid qdisc is available */ @@ -545,6 +549,9 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0) goto nla_put_failure; + if (!devconf) + goto out; + if ((all || type == NETCONFA_FORWARDING) && nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0) goto nla_put_failure; @@ -563,6 +570,7 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, devconf->ignore_routes_with_linkdown) < 0) goto nla_put_failure; +out: nlmsg_end(skb, nlh); return 0; @@ -571,8 +579,8 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex, return -EMSGSIZE; } -void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, - struct ipv6_devconf *devconf) +void inet6_netconf_notify_devconf(struct net *net, int event, int type, + int ifindex, struct ipv6_devconf *devconf) { struct sk_buff *skb; int err = -ENOBUFS; @@ -582,7 +590,7 @@ void inet6_netconf_notify_devconf(struct net *net, int type, int ifindex, goto errout; err = inet6_netconf_fill_devconf(skb, ifindex, devconf, 0, 0, - RTM_NEWNETCONF, 0, type); + event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); @@ -603,7 +611,8 @@ static const struct nla_policy devconf_ipv6_policy[NETCONFA_MAX+1] = { }; static int inet6_netconf_get_devconf(struct sk_buff *in_skb, - struct nlmsghdr *nlh) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX+1]; @@ -616,7 +625,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb, int err; err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, - 
devconf_ipv6_policy); + devconf_ipv6_policy, extack); if (err < 0) goto errout; @@ -765,7 +774,8 @@ static void dev_forward_change(struct inet6_dev *idev) else addrconf_leave_anycast(ifa); } - inet6_netconf_notify_devconf(dev_net(dev), NETCONFA_FORWARDING, + inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, + NETCONFA_FORWARDING, dev->ifindex, &idev->cnf); } @@ -800,7 +810,8 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) if (p == &net->ipv6.devconf_dflt->forwarding) { if ((!newf) ^ (!old)) - inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); rtnl_unlock(); @@ -812,13 +823,15 @@ static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) net->ipv6.devconf_dflt->forwarding = newf; if ((!newf) ^ (!old_dflt)) - inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); addrconf_forward_change(net, newf); if ((!newf) ^ (!old)) - inet6_netconf_notify_devconf(net, NETCONFA_FORWARDING, + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); } else if ((!newf) ^ (!old)) @@ -843,6 +856,7 @@ static void addrconf_linkdown_change(struct net *net, __s32 newf) idev->cnf.ignore_routes_with_linkdown = newf; if (changed) inet6_netconf_notify_devconf(dev_net(dev), + RTM_NEWNETCONF, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, dev->ifindex, &idev->cnf); @@ -865,6 +879,7 @@ static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) if (p == &net->ipv6.devconf_dflt->ignore_routes_with_linkdown) { if ((!newf) ^ (!old)) inet6_netconf_notify_devconf(net, + RTM_NEWNETCONF, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); @@ -877,6 +892,7 @@ static int addrconf_fixup_linkdown(struct ctl_table *table, int *p, int newf) addrconf_linkdown_change(net, newf); if ((!newf) ^ (!old)) inet6_netconf_notify_devconf(net, + RTM_NEWNETCONF, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); @@ -944,6 +960,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, const struct in6_addr *peer_addr, int pfxlen, int scope, u32 flags, u32 valid_lft, u32 prefered_lft) { + struct net *net = dev_net(idev->dev); struct inet6_ifaddr *ifa = NULL; struct rt6_info *rt; unsigned int hash; @@ -990,6 +1007,10 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr, goto out; } + if (net->ipv6.devconf_all->disable_policy || + idev->cnf.disable_policy) + rt->dst.flags |= DST_NOPOLICY; + neigh_parms_data_state_setall(idev->nd_parms); ifa->addr = *addr; @@ -2053,12 +2074,23 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp) __ipv6_dev_ac_dec(ifp->idev, &addr); } -static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev) +static int addrconf_ifid_6lowpan(u8 *eui, struct net_device *dev) { - if (dev->addr_len != EUI64_ADDR_LEN) + switch (dev->addr_len) { + case ETH_ALEN: + memcpy(eui, dev->dev_addr, 3); + eui[3] = 0xFF; + eui[4] = 0xFE; + memcpy(eui + 5, dev->dev_addr + 3, 3); + break; + case EUI64_ADDR_LEN: + memcpy(eui, dev->dev_addr, EUI64_ADDR_LEN); + eui[0] ^= 2; + break; + default: return -1; - memcpy(eui, dev->dev_addr, EUI64_ADDR_LEN); - eui[0] ^= 2; + } + return 0; } @@ -2150,7 +2182,7 @@ static int ipv6_generate_eui64(u8 *eui, struct 
net_device *dev) case ARPHRD_TUNNEL: return addrconf_ifid_gre(eui, dev); case ARPHRD_6LOWPAN: - return addrconf_ifid_eui64(eui, dev); + return addrconf_ifid_6lowpan(eui, dev); case ARPHRD_IEEE1394: return addrconf_ifid_ieee1394(eui, dev); case ARPHRD_TUNNEL6: @@ -3271,14 +3303,24 @@ static void addrconf_gre_config(struct net_device *dev) static int fixup_permanent_addr(struct inet6_dev *idev, struct inet6_ifaddr *ifp) { - if (!ifp->rt) { - struct rt6_info *rt; + /* rt6i_ref == 0 means the host route was removed from the + * FIB, for example, if 'lo' device is taken down. In that + * case regenerate the host route. + */ + if (!ifp->rt || !atomic_read(&ifp->rt->rt6i_ref)) { + struct rt6_info *rt, *prev; rt = addrconf_dst_alloc(idev, &ifp->addr, false); if (unlikely(IS_ERR(rt))) return PTR_ERR(rt); + /* ifp->rt can be accessed outside of rtnl */ + spin_lock(&ifp->lock); + prev = ifp->rt; ifp->rt = rt; + spin_unlock(&ifp->lock); + + ip6_rt_put(prev); } if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) { @@ -3286,7 +3328,8 @@ static int fixup_permanent_addr(struct inet6_dev *idev, idev->dev, 0, 0); } - addrconf_dad_start(ifp); + if (ifp->state == INET6_IFADDR_STATE_PREDAD) + addrconf_dad_start(ifp); return 0; } @@ -3505,6 +3548,7 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, */ static struct notifier_block ipv6_dev_notf = { .notifier_call = addrconf_notify, + .priority = ADDRCONF_NOTIFY_PRIORITY, }; static void addrconf_type_change(struct net_device *dev, unsigned long event) @@ -3641,7 +3685,7 @@ static int addrconf_ifdown(struct net_device *dev, int how) if (keep) { /* set state to skip the notifier below */ state = INET6_IFADDR_STATE_DEAD; - ifa->state = 0; + ifa->state = INET6_IFADDR_STATE_PREDAD; if (!(ifa->flags & IFA_F_NODAD)) ifa->flags |= IFA_F_TENTATIVE; @@ -4382,7 +4426,8 @@ static const struct nla_policy ifa_ipv6_policy[IFA_MAX+1] = { }; static int -inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) +inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifaddrmsg *ifm; @@ -4391,7 +4436,8 @@ inet6_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh) u32 ifa_flags; int err; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, + extack); if (err < 0) return err; @@ -4491,7 +4537,8 @@ static int inet6_addr_modify(struct inet6_ifaddr *ifp, u32 ifa_flags, } static int -inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) +inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifaddrmsg *ifm; @@ -4503,7 +4550,8 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh) u32 ifa_flags; int err; - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, + extack); if (err < 0) return err; @@ -4853,7 +4901,8 @@ static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) return inet6_dump_addr(skb, cb, type); } -static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh) +static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct ifaddrmsg *ifm; @@ -4864,7 +4913,8 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh) struct sk_buff *skb; int err; - err = 
nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_ipv6_policy, + extack); if (err < 0) goto errout; @@ -4975,6 +5025,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, array[DEVCONF_RTR_PROBE_INTERVAL] = jiffies_to_msecs(cnf->rtr_probe_interval); #ifdef CONFIG_IPV6_ROUTE_INFO + array[DEVCONF_ACCEPT_RA_RT_INFO_MIN_PLEN] = cnf->accept_ra_rt_info_min_plen; array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen; #endif #endif @@ -5006,6 +5057,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf, #endif array[DEVCONF_ENHANCED_DAD] = cnf->enhanced_dad; array[DEVCONF_ADDR_GEN_MODE] = cnf->addr_gen_mode; + array[DEVCONF_DISABLE_POLICY] = cnf->disable_policy; } static inline size_t inet6_ifla6_size(void) @@ -5232,7 +5284,8 @@ static int inet6_validate_link_af(const struct net_device *dev, if (dev && !__in6_dev_get(dev)) return -EAFNOSUPPORT; - return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy); + return nla_parse_nested(tb, IFLA_INET6_MAX, nla, inet6_af_policy, + NULL); } static int check_addr_gen_mode(int mode) @@ -5264,7 +5317,7 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla) if (!idev) return -EAFNOSUPPORT; - if (nla_parse_nested(tb, IFLA_INET6_MAX, nla, NULL) < 0) + if (nla_parse_nested(tb, IFLA_INET6_MAX, nla, NULL, NULL) < 0) BUG(); if (tb[IFLA_INET6_TOKEN]) { @@ -5667,17 +5720,20 @@ int addrconf_sysctl_proxy_ndp(struct ctl_table *ctl, int write, return restart_syscall(); if (valp == &net->ipv6.devconf_dflt->proxy_ndp) - inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_PROXY_NEIGH, NETCONFA_IFINDEX_DEFAULT, net->ipv6.devconf_dflt); else if (valp == &net->ipv6.devconf_all->proxy_ndp) - inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_PROXY_NEIGH, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); else { struct inet6_dev *idev = ctl->extra1; - inet6_netconf_notify_devconf(net, NETCONFA_PROXY_NEIGH, + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_PROXY_NEIGH, idev->dev->ifindex, &idev->cnf); } @@ -5830,6 +5886,105 @@ int addrconf_sysctl_ignore_routes_with_linkdown(struct ctl_table *ctl, return ret; } +static +void addrconf_set_nopolicy(struct rt6_info *rt, int action) +{ + if (rt) { + if (action) + rt->dst.flags |= DST_NOPOLICY; + else + rt->dst.flags &= ~DST_NOPOLICY; + } +} + +static +void addrconf_disable_policy_idev(struct inet6_dev *idev, int val) +{ + struct inet6_ifaddr *ifa; + + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { + spin_lock(&ifa->lock); + if (ifa->rt) { + struct rt6_info *rt = ifa->rt; + struct fib6_table *table = rt->rt6i_table; + int cpu; + + read_lock(&table->tb6_lock); + addrconf_set_nopolicy(ifa->rt, val); + if (rt->rt6i_pcpu) { + for_each_possible_cpu(cpu) { + struct rt6_info **rtp; + + rtp = per_cpu_ptr(rt->rt6i_pcpu, cpu); + addrconf_set_nopolicy(*rtp, val); + } + } + read_unlock(&table->tb6_lock); + } + spin_unlock(&ifa->lock); + } + read_unlock_bh(&idev->lock); +} + +static +int addrconf_disable_policy(struct ctl_table *ctl, int *valp, int val) +{ + struct inet6_dev *idev; + struct net *net; + + if (!rtnl_trylock()) + return restart_syscall(); + + *valp = val; + + net = (struct net *)ctl->extra2; + if (valp == &net->ipv6.devconf_dflt->disable_policy) { + rtnl_unlock(); + return 0; + } + + if (valp == 
&net->ipv6.devconf_all->disable_policy) { + struct net_device *dev; + + for_each_netdev(net, dev) { + idev = __in6_dev_get(dev); + if (idev) + addrconf_disable_policy_idev(idev, val); + } + } else { + idev = (struct inet6_dev *)ctl->extra1; + addrconf_disable_policy_idev(idev, val); + } + + rtnl_unlock(); + return 0; +} + +static +int addrconf_sysctl_disable_policy(struct ctl_table *ctl, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int *valp = ctl->data; + int val = *valp; + loff_t pos = *ppos; + struct ctl_table lctl; + int ret; + + lctl = *ctl; + lctl.data = &val; + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); + + if (write && (*valp != val)) + ret = addrconf_disable_policy(ctl, valp, val); + + if (ret) + *ppos = pos; + + return ret; +} + static int minus_one = -1; static const int one = 1; static const int two_five_five = 255; @@ -6017,6 +6172,13 @@ static const struct ctl_table addrconf_sysctl[] = { .proc_handler = proc_dointvec_jiffies, }, #ifdef CONFIG_IPV6_ROUTE_INFO + { + .procname = "accept_ra_rt_info_min_plen", + .data = &ipv6_devconf.accept_ra_rt_info_min_plen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "accept_ra_rt_info_max_plen", .data = &ipv6_devconf.accept_ra_rt_info_max_plen, @@ -6187,6 +6349,13 @@ static const struct ctl_table addrconf_sysctl[] = { .mode = 0644, .proc_handler = addrconf_sysctl_addr_gen_mode, }, + { + .procname = "disable_policy", + .data = &ipv6_devconf.disable_policy, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = addrconf_sysctl_disable_policy, + }, { /* sentinel */ } @@ -6227,7 +6396,8 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, ifindex = NETCONFA_IFINDEX_DEFAULT; else ifindex = idev->dev->ifindex; - inet6_netconf_notify_devconf(net, NETCONFA_ALL, ifindex, p); + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, + ifindex, p); return 0; free: @@ -6236,7 +6406,8 @@ static int __addrconf_sysctl_register(struct net *net, char *dev_name, return -ENOBUFS; } -static void __addrconf_sysctl_unregister(struct ipv6_devconf *p) +static void __addrconf_sysctl_unregister(struct net *net, + struct ipv6_devconf *p, int ifindex) { struct ctl_table *table; @@ -6247,6 +6418,8 @@ static void __addrconf_sysctl_unregister(struct ipv6_devconf *p) unregister_net_sysctl_table(p->sysctl_header); p->sysctl_header = NULL; kfree(table); + + inet6_netconf_notify_devconf(net, RTM_DELNETCONF, 0, ifindex, NULL); } static int addrconf_sysctl_register(struct inet6_dev *idev) @@ -6270,7 +6443,8 @@ static int addrconf_sysctl_register(struct inet6_dev *idev) static void addrconf_sysctl_unregister(struct inet6_dev *idev) { - __addrconf_sysctl_unregister(&idev->cnf); + __addrconf_sysctl_unregister(dev_net(idev->dev), &idev->cnf, + idev->dev->ifindex); neigh_sysctl_unregister(idev->nd_parms); } @@ -6313,7 +6487,7 @@ static int __net_init addrconf_init_net(struct net *net) #ifdef CONFIG_SYSCTL err_reg_dflt: - __addrconf_sysctl_unregister(all); + __addrconf_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: kfree(dflt); #endif @@ -6326,8 +6500,10 @@ static int __net_init addrconf_init_net(struct net *net) static void __net_exit addrconf_exit_net(struct net *net) { #ifdef CONFIG_SYSCTL - __addrconf_sysctl_unregister(net->ipv6.devconf_dflt); - __addrconf_sysctl_unregister(net->ipv6.devconf_all); + __addrconf_sysctl_unregister(net, net->ipv6.devconf_dflt, + NETCONFA_IFINDEX_DEFAULT); + __addrconf_sysctl_unregister(net, net->ipv6.devconf_all, + 
NETCONFA_IFINDEX_ALL); #endif kfree(net->ipv6.devconf_dflt); kfree(net->ipv6.devconf_all); @@ -6398,6 +6574,8 @@ int __init addrconf_init(void) goto errlo; } + ip6_route_init_special_entries(); + for (i = 0; i < IN6_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&inet6_addr_lst[i]); diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index a8f6986dcbe5ea5a36a3cc6fcf51becebb50c33b..07cd7d248bb6822b787e4c74c8ef3d27d38e9d99 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -404,7 +404,8 @@ static const struct nla_policy ifal_policy[IFAL_MAX+1] = { [IFAL_LABEL] = { .len = sizeof(u32), }, }; -static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh) +static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct ifaddrlblmsg *ifal; @@ -413,7 +414,8 @@ static int ip6addrlbl_newdel(struct sk_buff *skb, struct nlmsghdr *nlh) u32 label; int err = 0; - err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy); + err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy, + extack); if (err < 0) return err; @@ -521,7 +523,8 @@ static inline int ip6addrlbl_msgsize(void) + nla_total_size(4); /* IFAL_LABEL */ } -static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh) +static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct ifaddrlblmsg *ifal; @@ -532,7 +535,8 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh) struct ip6addrlbl_entry *p; struct sk_buff *skb; - err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy); + err = nlmsg_parse(nlh, sizeof(*ifal), tb, IFAL_MAX, ifal_policy, + extack); if (err < 0) return err; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index a9a9553ee63df8eb6e16e00d5da8c29406435350..a88b5b5b79558948dcbad84cbdda99c1d574102e 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -933,8 +933,6 @@ static int __init inet6_init(void) if (err) goto igmp_fail; - ipv6_stub = &ipv6_stub_impl; - err = ipv6_netfilter_init(); if (err) goto netfilter_fail; @@ -1005,18 +1003,28 @@ static int __init inet6_init(void) if (err) goto seg6_fail; + err = igmp6_late_init(); + if (err) + goto igmp6_late_err; + #ifdef CONFIG_SYSCTL err = ipv6_sysctl_register(); if (err) goto sysctl_fail; #endif + + /* ensure that ipv6 stubs are visible only after ipv6 is ready */ + wmb(); + ipv6_stub = &ipv6_stub_impl; out: return err; #ifdef CONFIG_SYSCTL sysctl_fail: - seg6_exit(); + igmp6_late_cleanup(); #endif +igmp6_late_err: + seg6_exit(); seg6_fail: calipso_exit(); calipso_fail: diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index eec27f87efaca15133cf1d5225e37e6a2f6a6f8a..e011122ebd43c190aec3812099345ec852444284 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -405,9 +405,6 @@ static inline bool ipv6_datagram_support_addr(struct sock_exterr_skb *serr) * At one point, excluding local errors was a quick test to identify icmp/icmp6 * errors. This is no longer true, but the test remained, so the v6 stack, * unlike v4, also honors cmsg requests on all wifi and timestamp errors. - * - * Timestamp code paths do not initialize the fields expected by cmsg: - * the PKTINFO fields in skb->cb[]. Fill those in here. 
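The inet6_init() hunk above delays the assignment of ipv6_stub until everything it points to is initialized, with wmb() so no CPU can observe the published pointer before the state it guards. A portable C11 analogue of this publish-after-init pattern is a release store paired with acquire loads; the sketch below shows the idiom, not the kernel's barrier-based code:

#include <stdatomic.h>
#include <stdio.h>

struct stub_ops {
	int (*op)(void);
};

static int real_op(void) { return 42; }

static struct stub_ops impl;
static _Atomic(struct stub_ops *) stub;	/* NULL until fully ready */

static void subsystem_init(void)
{
	impl.op = real_op;	/* initialize everything first... */

	/* ...then publish: release ordering guarantees a reader that
	 * sees the pointer also sees the initialized contents.
	 */
	atomic_store_explicit(&stub, &impl, memory_order_release);
}

static int call_op(void)
{
	struct stub_ops *ops =
		atomic_load_explicit(&stub, memory_order_acquire);

	return ops ? ops->op() : -1;	/* -1: subsystem not ready */
}

int main(void)
{
	printf("before init: %d\n", call_op());
	subsystem_init();
	printf("after init:  %d\n", call_op());
	return 0;
}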
*/ static bool ip6_datagram_support_cmsg(struct sk_buff *skb, struct sock_exterr_skb *serr) @@ -419,14 +416,9 @@ static bool ip6_datagram_support_cmsg(struct sk_buff *skb, if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) return false; - if (!skb->dev) + if (!IP6CB(skb)->iif) return false; - if (skb->protocol == htons(ETH_P_IPV6)) - IP6CB(skb)->iif = skb->dev->ifindex; - else - PKTINFO_SKB_CB(skb)->ipi_ifindex = skb->dev->ifindex; - return true; } diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index ff54faa756317047af5af661d070ba9a5eb65429..1fe99ba8066c8de2a5717f0b611a152c94837734 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -170,19 +170,23 @@ static void esp_output_restore_header(struct sk_buff *skb) } static struct ip_esp_hdr *esp_output_set_esn(struct sk_buff *skb, + struct xfrm_state *x, struct ip_esp_hdr *esph, __be32 *seqhi) { - struct xfrm_state *x = skb_dst(skb)->xfrm; - /* For ESN we move the header forward by 4 bytes to * accomodate the high bits. We will move it back after * encryption. */ if ((x->props.flags & XFRM_STATE_ESN)) { + struct xfrm_offload *xo = xfrm_offload(skb); + esph = (void *)(skb_transport_header(skb) - sizeof(__be32)); *seqhi = esph->spi; - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); + if (xo) + esph->seq_no = htonl(xo->seq.hi); + else + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.hi); } esph->spi = x->id.spi; @@ -214,61 +218,16 @@ static void esp_output_fill_trailer(u8 *tail, int tfclen, int plen, __u8 proto) tail[plen - 1] = proto; } -static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) +int esp6_output_head(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) { - int err; - struct ip_esp_hdr *esph; - struct crypto_aead *aead; - struct aead_request *req; - struct scatterlist *sg, *dsg; - struct sk_buff *trailer; - struct page *page; - void *tmp; - int blksize; - int clen; - int alen; - int plen; - int ivlen; - int tfclen; - int nfrags; - int assoclen; - int seqhilen; - int tailen; - u8 *iv; u8 *tail; u8 *vaddr; - __be32 *seqhi; - __be64 seqno; - __u8 proto = *skb_mac_header(skb); - - /* skb is pure payload to encrypt */ - aead = x->data; - alen = crypto_aead_authsize(aead); - ivlen = crypto_aead_ivsize(aead); - - tfclen = 0; - if (x->tfcpad) { - struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); - u32 padto; - - padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached)); - if (skb->len < padto) - tfclen = padto - skb->len; - } - blksize = ALIGN(crypto_aead_blocksize(aead), 4); - clen = ALIGN(skb->len + 2 + tfclen, blksize); - plen = clen - skb->len - tfclen; - tailen = tfclen + plen + alen; - - assoclen = sizeof(*esph); - seqhilen = 0; - - if (x->props.flags & XFRM_STATE_ESN) { - seqhilen += sizeof(__be32); - assoclen += seqhilen; - } + int nfrags; + struct page *page; + struct ip_esp_hdr *esph; + struct sk_buff *trailer; + int tailen = esp->tailen; - *skb_mac_header(skb) = IPPROTO_ESP; esph = ip_esp_hdr(skb); if (!skb_cloned(skb)) { @@ -284,6 +243,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) struct sock *sk = skb->sk; struct page_frag *pfrag = &x->xfrag; + esp->inplace = false; + allocsize = ALIGN(tailen, L1_CACHE_BYTES); spin_lock_bh(&x->lock); @@ -300,10 +261,12 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) tail = vaddr + pfrag->offset; - esp_output_fill_trailer(tail, tfclen, plen, proto); + esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); kunmap_atomic(vaddr); + spin_unlock_bh(&x->lock); + nfrags = skb_shinfo(skb)->nr_frags; 
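esp_output_set_esn() above places the upper half of the 64-bit extended sequence number in front of the ESP header, taking it from the offload context when GSO has already assigned sequence numbers. The split mirrors the seqno assembly in esp6_output() below: only the low 32 bits travel in the header, while the high 32 bits are carried separately and still covered by the ICV. In isolation:

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>	/* htonl/ntohl */

/* Split a 64-bit ESN into its wire pieces: the ESP header carries
 * only the low half; with XFRM_STATE_ESN the high half is stashed
 * before the header so it is authenticated as well.
 */
static void esn_split(uint64_t seq, uint32_t *seq_no, uint32_t *seqhi)
{
	*seq_no = htonl((uint32_t)seq);
	*seqhi = htonl((uint32_t)(seq >> 32));
}

int main(void)
{
	uint32_t lo, hi;

	esn_split(0x100000002ull, &lo, &hi);
	printf("seq_no=0x%08x seqhi=0x%08x\n", ntohl(lo), ntohl(hi));
	return 0;
}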
__skb_fill_page_desc(skb, nfrags, page, pfrag->offset, @@ -319,108 +282,111 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) if (sk) atomic_add(tailen, &sk->sk_wmem_alloc); - skb_push(skb, -skb_network_offset(skb)); - - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - esph->spi = x->id.spi; - - tmp = esp_alloc_tmp(aead, nfrags + 2, seqhilen); - if (!tmp) { - spin_unlock_bh(&x->lock); - err = -ENOMEM; - goto error; - } - seqhi = esp_tmp_seqhi(tmp); - iv = esp_tmp_iv(aead, tmp, seqhilen); - req = esp_tmp_req(aead, iv); - sg = esp_req_sg(aead, req); - dsg = &sg[nfrags]; - - esph = esp_output_set_esn(skb, esph, seqhi); - - sg_init_table(sg, nfrags); - skb_to_sgvec(skb, sg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + clen + alen); - - allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES); - - if (unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { - spin_unlock_bh(&x->lock); - err = -ENOMEM; - goto error; - } - - skb_shinfo(skb)->nr_frags = 1; - - page = pfrag->page; - get_page(page); - /* replace page frags in skb with new page */ - __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len); - pfrag->offset = pfrag->offset + allocsize; - - sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1); - skb_to_sgvec(skb, dsg, - (unsigned char *)esph - skb->data, - assoclen + ivlen + clen + alen); - - spin_unlock_bh(&x->lock); - - goto skip_cow2; + goto out; } } cow: - err = skb_cow_data(skb, tailen, &trailer); - if (err < 0) - goto error; - nfrags = err; - + nfrags = skb_cow_data(skb, tailen, &trailer); + if (nfrags < 0) + goto out; tail = skb_tail_pointer(trailer); - esph = ip_esp_hdr(skb); skip_cow: - esp_output_fill_trailer(tail, tfclen, plen, proto); + esp_output_fill_trailer(tail, esp->tfclen, esp->plen, esp->proto); + pskb_put(skb, trailer, tailen); - pskb_put(skb, trailer, clen - skb->len + alen); - skb_push(skb, -skb_network_offset(skb)); +out: + return nfrags; +} +EXPORT_SYMBOL_GPL(esp6_output_head); - esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); - esph->spi = x->id.spi; +int esp6_output_tail(struct xfrm_state *x, struct sk_buff *skb, struct esp_info *esp) +{ + u8 *iv; + int alen; + void *tmp; + int ivlen; + int assoclen; + int seqhilen; + __be32 *seqhi; + struct page *page; + struct ip_esp_hdr *esph; + struct aead_request *req; + struct crypto_aead *aead; + struct scatterlist *sg, *dsg; + int err = -ENOMEM; - tmp = esp_alloc_tmp(aead, nfrags, seqhilen); - if (!tmp) { - err = -ENOMEM; - goto error; + assoclen = sizeof(struct ip_esp_hdr); + seqhilen = 0; + + if (x->props.flags & XFRM_STATE_ESN) { + seqhilen += sizeof(__be32); + assoclen += sizeof(__be32); } + aead = x->data; + alen = crypto_aead_authsize(aead); + ivlen = crypto_aead_ivsize(aead); + + tmp = esp_alloc_tmp(aead, esp->nfrags + 2, seqhilen); + if (!tmp) + goto error; + seqhi = esp_tmp_seqhi(tmp); iv = esp_tmp_iv(aead, tmp, seqhilen); req = esp_tmp_req(aead, iv); sg = esp_req_sg(aead, req); - dsg = sg; - esph = esp_output_set_esn(skb, esph, seqhi); + if (esp->inplace) + dsg = sg; + else + dsg = &sg[esp->nfrags]; - sg_init_table(sg, nfrags); + esph = esp_output_set_esn(skb, x, ip_esp_hdr(skb), seqhi); + + sg_init_table(sg, esp->nfrags); skb_to_sgvec(skb, sg, (unsigned char *)esph - skb->data, - assoclen + ivlen + clen + alen); + assoclen + ivlen + esp->clen + alen); + + if (!esp->inplace) { + int allocsize; + struct page_frag *pfrag = &x->xfrag; + + allocsize = ALIGN(skb->data_len, L1_CACHE_BYTES); + + spin_lock_bh(&x->lock); + if 
(unlikely(!skb_page_frag_refill(allocsize, pfrag, GFP_ATOMIC))) { + spin_unlock_bh(&x->lock); + goto error; + } + + skb_shinfo(skb)->nr_frags = 1; + + page = pfrag->page; + get_page(page); + /* replace page frags in skb with new page */ + __skb_fill_page_desc(skb, 0, page, pfrag->offset, skb->data_len); + pfrag->offset = pfrag->offset + allocsize; + spin_unlock_bh(&x->lock); + + sg_init_table(dsg, skb_shinfo(skb)->nr_frags + 1); + skb_to_sgvec(skb, dsg, + (unsigned char *)esph - skb->data, + assoclen + ivlen + esp->clen + alen); + } -skip_cow2: if ((x->props.flags & XFRM_STATE_ESN)) aead_request_set_callback(req, 0, esp_output_done_esn, skb); else aead_request_set_callback(req, 0, esp_output_done, skb); - aead_request_set_crypt(req, sg, dsg, ivlen + clen, iv); + aead_request_set_crypt(req, sg, dsg, ivlen + esp->clen, iv); aead_request_set_ad(req, assoclen); - seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + - ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); - memset(iv, 0, ivlen); - memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&seqno + 8 - min(ivlen, 8), + memcpy(iv + ivlen - min(ivlen, 8), (u8 *)&esp->seqno + 8 - min(ivlen, 8), min(ivlen, 8)); ESP_SKB_CB(skb)->tmp = tmp; @@ -446,10 +412,60 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) error: return err; } +EXPORT_SYMBOL_GPL(esp6_output_tail); -static int esp_input_done2(struct sk_buff *skb, int err) +static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) +{ + int alen; + int blksize; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct esp_info esp; + + esp.inplace = true; + + esp.proto = *skb_mac_header(skb); + *skb_mac_header(skb) = IPPROTO_ESP; + + /* skb is pure payload to encrypt */ + + aead = x->data; + alen = crypto_aead_authsize(aead); + + esp.tfclen = 0; + if (x->tfcpad) { + struct xfrm_dst *dst = (struct xfrm_dst *)skb_dst(skb); + u32 padto; + + padto = min(x->tfcpad, esp6_get_mtu(x, dst->child_mtu_cached)); + if (skb->len < padto) + esp.tfclen = padto - skb->len; + } + blksize = ALIGN(crypto_aead_blocksize(aead), 4); + esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize); + esp.plen = esp.clen - skb->len - esp.tfclen; + esp.tailen = esp.tfclen + esp.plen + alen; + + esp.nfrags = esp6_output_head(x, skb, &esp); + if (esp.nfrags < 0) + return esp.nfrags; + + esph = ip_esp_hdr(skb); + esph->spi = x->id.spi; + + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + esp.seqno = cpu_to_be64(XFRM_SKB_CB(skb)->seq.output.low + + ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32)); + + skb_push(skb, -skb_network_offset(skb)); + + return esp6_output_tail(x, skb, &esp); +} + +int esp6_input_done2(struct sk_buff *skb, int err) { struct xfrm_state *x = xfrm_input_state(skb); + struct xfrm_offload *xo = xfrm_offload(skb); struct crypto_aead *aead = x->data; int alen = crypto_aead_authsize(aead); int hlen = sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead); @@ -458,7 +474,8 @@ static int esp_input_done2(struct sk_buff *skb, int err) int padlen; u8 nexthdr[2]; - kfree(ESP_SKB_CB(skb)->tmp); + if (!xo || (xo && !(xo->flags & CRYPTO_DONE))) + kfree(ESP_SKB_CB(skb)->tmp); if (unlikely(err)) goto out; @@ -492,12 +509,13 @@ static int esp_input_done2(struct sk_buff *skb, int err) out: return err; } +EXPORT_SYMBOL_GPL(esp6_input_done2); static void esp_input_done(struct crypto_async_request *base, int err) { struct sk_buff *skb = base->data; - xfrm_input_resume(skb, esp_input_done2(skb, err)); + xfrm_input_resume(skb, esp6_input_done2(skb, err)); } static void esp_input_restore_header(struct sk_buff 
*skb) @@ -619,7 +637,7 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) if ((x->props.flags & XFRM_STATE_ESN)) esp_input_restore_header(skb); - ret = esp_input_done2(skb, ret); + ret = esp6_input_done2(skb, ret); out: return ret; @@ -682,13 +700,17 @@ static int esp_init_aead(struct xfrm_state *x) char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; + u32 mask = 0; err = -ENAMETOOLONG; if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) goto error; - aead = crypto_alloc_aead(aead_name, 0, 0); + if (x->xso.offload_handle) + mask |= CRYPTO_ALG_ASYNC; + + aead = crypto_alloc_aead(aead_name, 0, mask); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; @@ -718,6 +740,7 @@ static int esp_init_authenc(struct xfrm_state *x) char authenc_name[CRYPTO_MAX_ALG_NAME]; unsigned int keylen; int err; + u32 mask = 0; err = -EINVAL; if (!x->ealg) @@ -743,7 +766,10 @@ static int esp_init_authenc(struct xfrm_state *x) goto error; } - aead = crypto_alloc_aead(authenc_name, 0, 0); + if (x->xso.offload_handle) + mask |= CRYPTO_ALG_ASYNC; + + aead = crypto_alloc_aead(authenc_name, 0, mask); err = PTR_ERR(aead); if (IS_ERR(aead)) goto error; diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index d914eb93204adbe02dceca9254da2598b6c8d99b..d950d43ba255442cf3079546a46b95693029f10c 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -45,27 +45,31 @@ static struct sk_buff **esp6_gro_receive(struct sk_buff **head, if ((err = xfrm_parse_spi(skb, IPPROTO_ESP, &spi, &seq)) != 0) goto out; - err = secpath_set(skb); - if (err) - goto out; + xo = xfrm_offload(skb); + if (!xo || !(xo->flags & CRYPTO_DONE)) { + err = secpath_set(skb); + if (err) + goto out; - if (skb->sp->len == XFRM_MAX_DEPTH) - goto out; + if (skb->sp->len == XFRM_MAX_DEPTH) + goto out; - x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, - (xfrm_address_t *)&ipv6_hdr(skb)->daddr, - spi, IPPROTO_ESP, AF_INET6); - if (!x) - goto out; + x = xfrm_state_lookup(dev_net(skb->dev), skb->mark, + (xfrm_address_t *)&ipv6_hdr(skb)->daddr, + spi, IPPROTO_ESP, AF_INET6); + if (!x) + goto out; - skb->sp->xvec[skb->sp->len++] = x; - skb->sp->olen++; + skb->sp->xvec[skb->sp->len++] = x; + skb->sp->olen++; - xo = xfrm_offload(skb); - if (!xo) { - xfrm_state_put(x); - goto out; + xo = xfrm_offload(skb); + if (!xo) { + xfrm_state_put(x); + goto out; + } } + xo->flags |= XFRM_GRO; XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = NULL; @@ -86,19 +90,216 @@ static struct sk_buff **esp6_gro_receive(struct sk_buff **head, return NULL; } +static void esp6_gso_encap(struct xfrm_state *x, struct sk_buff *skb) +{ + struct ip_esp_hdr *esph; + struct ipv6hdr *iph = ipv6_hdr(skb); + struct xfrm_offload *xo = xfrm_offload(skb); + int proto = iph->nexthdr; + + skb_push(skb, -skb_network_offset(skb)); + esph = ip_esp_hdr(skb); + *skb_mac_header(skb) = IPPROTO_ESP; + + esph->spi = x->id.spi; + esph->seq_no = htonl(XFRM_SKB_CB(skb)->seq.output.low); + + xo->proto = proto; +} + +static struct sk_buff *esp6_gso_segment(struct sk_buff *skb, + netdev_features_t features) +{ + __u32 seq; + int err = 0; + struct sk_buff *skb2; + struct xfrm_state *x; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct sk_buff *segs = ERR_PTR(-EINVAL); + netdev_features_t esp_features = features; + struct xfrm_offload *xo = xfrm_offload(skb); + + if (!xo) + goto out; + + seq = xo->seq.low; + + x = skb->sp->xvec[skb->sp->len - 1]; + aead = x->data; + esph = ip_esp_hdr(skb); + + if (esph->spi != 
x->id.spi) + goto out; + + if (!pskb_may_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead))) + goto out; + + __skb_pull(skb, sizeof(*esph) + crypto_aead_ivsize(aead)); + + skb->encap_hdr_csum = 1; + + if (!(features & NETIF_F_HW_ESP)) + esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); + + segs = x->outer_mode->gso_segment(x, skb, esp_features); + if (IS_ERR_OR_NULL(segs)) + goto out; + + __skb_pull(skb, skb->data - skb_mac_header(skb)); + + skb2 = segs; + do { + struct sk_buff *nskb = skb2->next; + + xo = xfrm_offload(skb2); + xo->flags |= XFRM_GSO_SEGMENT; + xo->seq.low = seq; + xo->seq.hi = xfrm_replay_seqhi(x, seq); + + if(!(features & NETIF_F_HW_ESP)) + xo->flags |= CRYPTO_FALLBACK; + + x->outer_mode->xmit(x, skb2); + + err = x->type_offload->xmit(x, skb2, esp_features); + if (err) { + kfree_skb_list(segs); + return ERR_PTR(err); + } + + if (!skb_is_gso(skb2)) + seq++; + else + seq += skb_shinfo(skb2)->gso_segs; + + skb_push(skb2, skb2->mac_len); + skb2 = nskb; + } while (skb2); + +out: + return segs; +} + +static int esp6_input_tail(struct xfrm_state *x, struct sk_buff *skb) +{ + struct crypto_aead *aead = x->data; + + if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr) + crypto_aead_ivsize(aead))) + return -EINVAL; + + skb->ip_summed = CHECKSUM_NONE; + + return esp6_input_done2(skb, 0); +} + +static int esp6_xmit(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) +{ + int err; + int alen; + int blksize; + struct xfrm_offload *xo; + struct ip_esp_hdr *esph; + struct crypto_aead *aead; + struct esp_info esp; + bool hw_offload = true; + + esp.inplace = true; + + xo = xfrm_offload(skb); + + if (!xo) + return -EINVAL; + + if (!(features & NETIF_F_HW_ESP) || !x->xso.offload_handle || + (x->xso.dev != skb->dev)) { + xo->flags |= CRYPTO_FALLBACK; + hw_offload = false; + } + + esp.proto = xo->proto; + + /* skb is pure payload to encrypt */ + + aead = x->data; + alen = crypto_aead_authsize(aead); + + esp.tfclen = 0; + /* XXX: Add support for tfc padding here. 
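[Editor's aside — an illustrative restatement, not new behavior: the segment walk in esp6_gso_segment() above keeps the ESP sequence space consistent across software segments. Condensed, the per-segment bookkeeping it performs amounts to:]

    __u32 seq = xo->seq.low;                 /* sequence of the first segment */

    for (skb2 = segs; skb2; skb2 = skb2->next) {
            xo = xfrm_offload(skb2);
            xo->seq.low = seq;
            xo->seq.hi  = xfrm_replay_seqhi(x, seq); /* recover ESN high bits */
            seq += skb_is_gso(skb2) ? skb_shinfo(skb2)->gso_segs : 1;
    }

[That is, a segment that is still GSO consumes gso_segs sequence numbers while a linear one consumes exactly one, matching what the peer's replay window will expect.]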
*/ + + blksize = ALIGN(crypto_aead_blocksize(aead), 4); + esp.clen = ALIGN(skb->len + 2 + esp.tfclen, blksize); + esp.plen = esp.clen - skb->len - esp.tfclen; + esp.tailen = esp.tfclen + esp.plen + alen; + + if (!hw_offload || (hw_offload && !skb_is_gso(skb))) { + esp.nfrags = esp6_output_head(x, skb, &esp); + if (esp.nfrags < 0) + return esp.nfrags; + } + + esph = ip_esp_hdr(skb); + esph->spi = x->id.spi; + + skb_push(skb, -skb_network_offset(skb)); + + if (xo->flags & XFRM_GSO_SEGMENT) { + esph->seq_no = htonl(xo->seq.low); + } else { + int len; + + len = skb->len - sizeof(struct ipv6hdr); + if (len > IPV6_MAXPLEN) + len = 0; + + ipv6_hdr(skb)->payload_len = htons(len); + } + + if (hw_offload) + return 0; + + esp.seqno = cpu_to_be64(xo->seq.low + ((u64)xo->seq.hi << 32)); + + err = esp6_output_tail(x, skb, &esp); + if (err < 0) + return err; + + secpath_reset(skb); + + return 0; +} + static const struct net_offload esp6_offload = { .callbacks = { .gro_receive = esp6_gro_receive, + .gso_segment = esp6_gso_segment, }, }; +static const struct xfrm_type_offload esp6_type_offload = { + .description = "ESP6 OFFLOAD", + .owner = THIS_MODULE, + .proto = IPPROTO_ESP, + .input_tail = esp6_input_tail, + .xmit = esp6_xmit, + .encap = esp6_gso_encap, +}; + static int __init esp6_offload_init(void) { + if (xfrm_register_type_offload(&esp6_type_offload, AF_INET6) < 0) { + pr_info("%s: can't add xfrm type offload\n", __func__); + return -EAGAIN; + } + return inet6_add_offload(&esp6_offload, IPPROTO_ESP); } static void __exit esp6_offload_exit(void) { + if (xfrm_unregister_type_offload(&esp6_type_offload, AF_INET6) < 0) + pr_info("%s: can't remove xfrm type offload\n", __func__); + inet6_del_offload(&esp6_offload, IPPROTO_ESP); } diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index 275cac628a95066f0a27e93f5015ddeb0172c28c..b636f1da9aece33532ccab5482aa72696f12db94 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -388,7 +388,6 @@ static int ipv6_srh_rcv(struct sk_buff *skb) icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, ((&hdr->segments_left) - skb_network_header(skb))); - kfree_skb(skb); return -1; } @@ -910,6 +909,8 @@ static void ipv6_push_rthdr(struct sk_buff *skb, u8 *proto, { switch (opt->type) { case IPV6_SRCRT_TYPE_0: + case IPV6_SRCRT_STRICT: + case IPV6_SRCRT_TYPE_2: ipv6_push_rthdr0(skb, proto, opt, addr_p, saddr); break; case IPV6_SRCRT_TYPE_4: @@ -945,13 +946,13 @@ void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, if (opt->hopopt) ipv6_push_exthdr(skb, proto, NEXTHDR_HOP, opt->hopopt); } -EXPORT_SYMBOL(ipv6_push_nfrag_opts); void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto) { if (opt->dst1opt) ipv6_push_exthdr(skb, proto, NEXTHDR_DEST, opt->dst1opt); } +EXPORT_SYMBOL(ipv6_push_frag_opts); struct ipv6_txoptions * ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) @@ -1164,6 +1165,8 @@ struct in6_addr *fl6_update_dst(struct flowi6 *fl6, switch (opt->srcrt->type) { case IPV6_SRCRT_TYPE_0: + case IPV6_SRCRT_STRICT: + case IPV6_SRCRT_TYPE_2: fl6->daddr = *((struct rt0_hdr *)opt->srcrt)->addr; break; case IPV6_SRCRT_TYPE_4: diff --git a/net/ipv6/ila/ila_lwt.c b/net/ipv6/ila/ila_lwt.c index ce1aae4a7fc8fa9daf9f1502d0ac77d2be2aee31..b3df03e3faa07d96f170e5ff0b535ebb16216c05 100644 --- a/net/ipv6/ila/ila_lwt.c +++ b/net/ipv6/ila/ila_lwt.c @@ -146,8 +146,7 @@ static int ila_build_state(struct nlattr *nla, return -EINVAL; } - ret = nla_parse_nested(tb, ILA_ATTR_MAX, nla, - ila_nl_policy); + ret = nla_parse_nested(tb, 
ILA_ATTR_MAX, nla, ila_nl_policy, NULL); if (ret < 0) return ret; diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c index af8f52ee7180868294930629f4f312c012a7a35c..2fd5ca151dcfca6034b0b0b27a3fe9abc7899e75 100644 --- a/net/ipv6/ila/ila_xlat.c +++ b/net/ipv6/ila/ila_xlat.c @@ -41,13 +41,7 @@ static int alloc_ila_locks(struct ila_net *ilan) size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU); if (sizeof(spinlock_t) != 0) { -#ifdef CONFIG_NUMA - if (size * sizeof(spinlock_t) > PAGE_SIZE) - ilan->locks = vmalloc(size * sizeof(spinlock_t)); - else -#endif - ilan->locks = kmalloc_array(size, sizeof(spinlock_t), - GFP_KERNEL); + ilan->locks = kvmalloc(size * sizeof(spinlock_t), GFP_KERNEL); if (!ilan->locks) return -ENOMEM; for (i = 0; i < size; i++) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 6fcb7cb49bb20dcfe518177177e3e0ac1c7d1091..8d128ba79b66de52d4d60628fe8df6cc9bccbc3b 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -544,6 +544,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev) & IPV6_TCLASS_MASK; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; + else + fl6.flowi6_mark = t->parms.fwmark; fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); @@ -603,6 +605,8 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; + else + fl6.flowi6_mark = t->parms.fwmark; fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); @@ -780,6 +784,7 @@ static int ip6gre_tnl_change(struct ip6_tnl *t, t->parms.o_key = p->o_key; t->parms.i_flags = p->i_flags; t->parms.o_flags = p->o_flags; + t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6gre_tnl_link_config(t, set_mtu); return 0; @@ -1249,6 +1254,9 @@ static void ip6gre_netlink_parms(struct nlattr *data[], if (data[IFLA_GRE_FLAGS]) parms->flags = nla_get_u32(data[IFLA_GRE_FLAGS]); + + if (data[IFLA_GRE_FWMARK]) + parms->fwmark = nla_get_u32(data[IFLA_GRE_FWMARK]); } static int ip6gre_tap_init(struct net_device *dev) @@ -1470,6 +1478,8 @@ static size_t ip6gre_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_GRE_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_GRE_FWMARK */ + nla_total_size(4) + 0; } @@ -1490,7 +1500,8 @@ static int ip6gre_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_GRE_TTL, p->hop_limit) || nla_put_u8(skb, IFLA_GRE_ENCAP_LIMIT, p->encap_limit) || nla_put_be32(skb, IFLA_GRE_FLOWINFO, p->flowinfo) || - nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags)) + nla_put_u32(skb, IFLA_GRE_FLAGS, p->flags) || + nla_put_u32(skb, IFLA_GRE_FWMARK, p->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_GRE_ENCAP_TYPE, @@ -1525,6 +1536,7 @@ static const struct nla_policy ip6gre_policy[IFLA_GRE_MAX + 1] = { [IFLA_GRE_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_GRE_ENCAP_DPORT] = { .type = NLA_U16 }, + [IFLA_GRE_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6gre_link_ops __read_mostly = { diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index aacfb4bce1533b3f3b38e1173c18cb1bb6b33099..9ee208a348f559b27dcc56ee80bf9ad51630cb45 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -49,6 +49,8 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { + void (*edemux)(struct sk_buff *skb); + /* if ingress device is enslaved to an L3 master device pass the * skb to its handler 
for processing */ @@ -60,8 +62,8 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb) const struct inet6_protocol *ipprot; ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]); - if (ipprot && ipprot->early_demux) - ipprot->early_demux(skb); + if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) + edemux(skb); } if (!skb_valid_dst(skb)) ip6_route_input(skb); @@ -122,11 +124,14 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs)); /* * RFC4291 2.5.3 + * The loopback address must not be used as the source address in IPv6 + * packets that are sent outside of a single node. [..] * A packet received on an interface with a destination address * of loopback must be dropped. */ - if (!(dev->flags & IFF_LOOPBACK) && - ipv6_addr_loopback(&hdr->daddr)) + if ((ipv6_addr_loopback(&hdr->saddr) || + ipv6_addr_loopback(&hdr->daddr)) && + !(dev->flags & IFF_LOOPBACK)) goto err; /* RFC4291 Errata ID: 3480 diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 75fac933c209a0f430279dea10b5dd2426a7ed31..6eb2ae507500b30959134aa3ed35c53c7b79aad9 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -954,7 +954,7 @@ static void init_tel_txopt(struct ipv6_tel_txoption *opt, __u8 encap_limit) opt->dst_opt[5] = IPV6_TLV_PADN; opt->dst_opt[6] = 1; - opt->ops.dst0opt = (struct ipv6_opt_hdr *) opt->dst_opt; + opt->ops.dst1opt = (struct ipv6_opt_hdr *) opt->dst_opt; opt->ops.opt_nflen = 8; } @@ -1037,7 +1037,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, struct ip6_tnl *t = netdev_priv(dev); struct net *net = t->net; struct net_device_stats *stats = &t->dev->stats; - struct ipv6hdr *ipv6h = ipv6_hdr(skb); + struct ipv6hdr *ipv6h; struct ipv6_tel_txoption opt; struct dst_entry *dst = NULL, *ndst = NULL; struct net_device *tdev; @@ -1057,26 +1057,28 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, /* NBMA tunnel */ if (ipv6_addr_any(&t->parms.raddr)) { - struct in6_addr *addr6; - struct neighbour *neigh; - int addr_type; + if (skb->protocol == htons(ETH_P_IPV6)) { + struct in6_addr *addr6; + struct neighbour *neigh; + int addr_type; - if (!skb_dst(skb)) - goto tx_err_link_failure; + if (!skb_dst(skb)) + goto tx_err_link_failure; - neigh = dst_neigh_lookup(skb_dst(skb), - &ipv6_hdr(skb)->daddr); - if (!neigh) - goto tx_err_link_failure; + neigh = dst_neigh_lookup(skb_dst(skb), + &ipv6_hdr(skb)->daddr); + if (!neigh) + goto tx_err_link_failure; - addr6 = (struct in6_addr *)&neigh->primary_key; - addr_type = ipv6_addr_type(addr6); + addr6 = (struct in6_addr *)&neigh->primary_key; + addr_type = ipv6_addr_type(addr6); - if (addr_type == IPV6_ADDR_ANY) - addr6 = &ipv6_hdr(skb)->daddr; + if (addr_type == IPV6_ADDR_ANY) + addr6 = &ipv6_hdr(skb)->daddr; - memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); - neigh_release(neigh); + memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr)); + neigh_release(neigh); + } } else if (!(t->parms.flags & (IP6_TNL_F_USE_ORIG_TCLASS | IP6_TNL_F_USE_ORIG_FWMARK))) { /* enable the cache only only if the routing decision does @@ -1176,7 +1178,7 @@ int ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev, __u8 dsfield, if (encap_limit >= 0) { init_tel_txopt(&opt, encap_limit); - ipv6_push_nfrag_opts(skb, &opt.ops, &proto, NULL, NULL); + ipv6_push_frag_opts(skb, &opt.ops, &proto); } /* Calculate max headroom for all the headers and adjust @@ -1256,6 +1258,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, 
struct net_device *dev) & IPV6_TCLASS_MASK; if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; + else + fl6.flowi6_mark = t->parms.fwmark; } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); @@ -1338,6 +1342,8 @@ ip6ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev) fl6.flowlabel |= ip6_flowlabel(ipv6h); if (t->parms.flags & IP6_TNL_F_USE_ORIG_FWMARK) fl6.flowi6_mark = skb->mark; + else + fl6.flowi6_mark = t->parms.fwmark; } fl6.flowi6_uid = sock_net_uid(dev_net(dev), NULL); @@ -1467,6 +1473,7 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.flowinfo = p->flowinfo; t->parms.link = p->link; t->parms.proto = p->proto; + t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); ip6_tnl_link_config(t); return 0; @@ -1918,6 +1925,9 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], if (data[IFLA_IPTUN_COLLECT_METADATA]) parms->collect_md = true; + + if (data[IFLA_IPTUN_FWMARK]) + parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[], @@ -2054,6 +2064,8 @@ static size_t ip6_tnl_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_IPTUN_COLLECT_METADATA */ nla_total_size(0) + + /* IFLA_IPTUN_FWMARK */ + nla_total_size(4) + 0; } @@ -2069,7 +2081,8 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_ENCAP_LIMIT, parm->encap_limit) || nla_put_be32(skb, IFLA_IPTUN_FLOWINFO, parm->flowinfo) || nla_put_u32(skb, IFLA_IPTUN_FLAGS, parm->flags) || - nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto)) + nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->proto) || + nla_put_u32(skb, IFLA_IPTUN_FWMARK, parm->fwmark)) goto nla_put_failure; if (nla_put_u16(skb, IFLA_IPTUN_ENCAP_TYPE, tunnel->encap.type) || @@ -2081,6 +2094,7 @@ static int ip6_tnl_fill_info(struct sk_buff *skb, const struct net_device *dev) if (parm->collect_md) if (nla_put_flag(skb, IFLA_IPTUN_COLLECT_METADATA)) goto nla_put_failure; + return 0; nla_put_failure: @@ -2109,6 +2123,7 @@ static const struct nla_policy ip6_tnl_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_COLLECT_METADATA] = { .type = NLA_FLAG }, + [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops ip6_link_ops __read_mostly = { diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 3d8a3b63b4fdbec7d488194e21e0c9013f0ff6da..d67ef56454b25a088768e88fcd1f878a8c498f12 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -657,6 +657,7 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p) t->parms.i_key = p->i_key; t->parms.o_key = p->o_key; t->parms.proto = p->proto; + t->parms.fwmark = p->fwmark; dst_cache_reset(&t->dst_cache); vti6_link_config(t); return 0; @@ -933,6 +934,9 @@ static void vti6_netlink_parms(struct nlattr *data[], if (data[IFLA_VTI_OKEY]) parms->o_key = nla_get_be32(data[IFLA_VTI_OKEY]); + + if (data[IFLA_VTI_FWMARK]) + parms->fwmark = nla_get_u32(data[IFLA_VTI_FWMARK]); } static int vti6_newlink(struct net *src_net, struct net_device *dev, @@ -998,6 +1002,8 @@ static size_t vti6_get_size(const struct net_device *dev) nla_total_size(4) + /* IFLA_VTI_OKEY */ nla_total_size(4) + + /* IFLA_VTI_FWMARK */ + nla_total_size(4) + 0; } @@ -1010,7 +1016,8 @@ static int vti6_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_in6_addr(skb, IFLA_VTI_LOCAL, &parm->laddr) || nla_put_in6_addr(skb, IFLA_VTI_REMOTE, &parm->raddr) || 
nla_put_be32(skb, IFLA_VTI_IKEY, parm->i_key) || - nla_put_be32(skb, IFLA_VTI_OKEY, parm->o_key)) + nla_put_be32(skb, IFLA_VTI_OKEY, parm->o_key) || + nla_put_u32(skb, IFLA_VTI_FWMARK, parm->fwmark)) goto nla_put_failure; return 0; @@ -1024,6 +1031,7 @@ static const struct nla_policy vti6_policy[IFLA_VTI_MAX + 1] = { [IFLA_VTI_REMOTE] = { .len = sizeof(struct in6_addr) }, [IFLA_VTI_IKEY] = { .type = NLA_U32 }, [IFLA_VTI_OKEY] = { .type = NLA_U32 }, + [IFLA_VTI_FWMARK] = { .type = NLA_U32 }, }; static struct rtnl_link_ops vti6_link_ops __read_mostly = { diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 6ba6c900ebcf430cf313a2bef55ff69c114af218..374997d26488ea38db7ed42ff7e6a55ede249021 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -774,7 +774,8 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt) * Delete a VIF entry */ -static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head) +static int mif6_delete(struct mr6_table *mrt, int vifi, int notify, + struct list_head *head) { struct mif_device *v; struct net_device *dev; @@ -815,12 +816,12 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head) in6_dev = __in6_dev_get(dev); if (in6_dev) { in6_dev->cnf.mc_forwarding--; - inet6_netconf_notify_devconf(dev_net(dev), + inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } - if (v->flags & MIFF_REGISTER) + if ((v->flags & MIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); dev_put(dev); @@ -974,7 +975,7 @@ static int mif6_add(struct net *net, struct mr6_table *mrt, in6_dev = __in6_dev_get(dev); if (in6_dev) { in6_dev->cnf.mc_forwarding++; - inet6_netconf_notify_devconf(dev_net(dev), + inet6_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, dev->ifindex, &in6_dev->cnf); } @@ -1331,7 +1332,6 @@ static int ip6mr_device_event(struct notifier_block *this, struct mr6_table *mrt; struct mif_device *v; int ct; - LIST_HEAD(list); if (event != NETDEV_UNREGISTER) return NOTIFY_DONE; @@ -1340,10 +1340,9 @@ static int ip6mr_device_event(struct notifier_block *this, v = &mrt->vif6_table[0]; for (ct = 0; ct < mrt->maxvif; ct++, v++) { if (v->dev == dev) - mif6_delete(mrt, ct, &list); + mif6_delete(mrt, ct, 1, NULL); } } - unregister_netdevice_many(&list); return NOTIFY_DONE; } @@ -1552,7 +1551,7 @@ static void mroute_clean_tables(struct mr6_table *mrt, bool all) for (i = 0; i < mrt->maxvif; i++) { if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC)) continue; - mif6_delete(mrt, i, &list); + mif6_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); @@ -1599,7 +1598,8 @@ static int ip6mr_sk_init(struct mr6_table *mrt, struct sock *sk) write_unlock_bh(&mrt_lock); if (!err) - inet6_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); rtnl_unlock(); @@ -1620,7 +1620,7 @@ int ip6mr_sk_done(struct sock *sk) mrt->mroute6_sk = NULL; net->ipv6.devconf_all->mc_forwarding--; write_unlock_bh(&mrt_lock); - inet6_netconf_notify_devconf(net, + inet6_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING, NETCONFA_IFINDEX_ALL, net->ipv6.devconf_all); @@ -1707,7 +1707,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns if (copy_from_user(&mifi, optval, sizeof(mifi_t))) return -EFAULT; rtnl_lock(); - ret = mif6_delete(mrt, mifi, NULL); + ret = mif6_delete(mrt, mifi, 
0, NULL); rtnl_unlock(); return ret; diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 1bdc703cb9668bd77690c3d8f1ec0062d7b88c43..07403fa164e18aac704e3b77d1e2a094ad53c04c 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -2463,7 +2463,6 @@ static void mld_ifc_event(struct inet6_dev *idev) mld_ifc_start_timer(idev, 1); } - static void igmp6_timer_handler(unsigned long data) { struct ifmcaddr6 *ma = (struct ifmcaddr6 *) data; @@ -2599,6 +2598,44 @@ void ipv6_mc_destroy_dev(struct inet6_dev *idev) write_unlock_bh(&idev->lock); } +static void ipv6_mc_rejoin_groups(struct inet6_dev *idev) +{ + struct ifmcaddr6 *pmc; + + ASSERT_RTNL(); + + if (mld_in_v1_mode(idev)) { + read_lock_bh(&idev->lock); + for (pmc = idev->mc_list; pmc; pmc = pmc->next) + igmp6_join_group(pmc); + read_unlock_bh(&idev->lock); + } else + mld_send_report(idev, NULL); +} + +static int ipv6_mc_netdev_event(struct notifier_block *this, + unsigned long event, + void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct inet6_dev *idev = __in6_dev_get(dev); + + switch (event) { + case NETDEV_RESEND_IGMP: + if (idev) + ipv6_mc_rejoin_groups(idev); + break; + default: + break; + } + + return NOTIFY_DONE; +} + +static struct notifier_block igmp6_netdev_notifier = { + .notifier_call = ipv6_mc_netdev_event, +}; + #ifdef CONFIG_PROC_FS struct igmp6_mc_iter_state { struct seq_net_private p; @@ -2970,7 +3007,17 @@ int __init igmp6_init(void) return register_pernet_subsys(&igmp6_net_ops); } +int __init igmp6_late_init(void) +{ + return register_netdevice_notifier(&igmp6_netdev_notifier); +} + void igmp6_cleanup(void) { unregister_pernet_subsys(&igmp6_net_ops); } + +void igmp6_late_cleanup(void) +{ + unregister_netdevice_notifier(&igmp6_netdev_notifier); +} diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 7ebac630d3c603186be2fc0dcbaac7d7e74bfde6..d310dc41209a39647b0ee7e8da2d38e90ac4333e 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -732,7 +732,7 @@ void ndisc_update(const struct net_device *dev, struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u8 icmp6_type, struct ndisc_options *ndopts) { - neigh_update(neigh, lladdr, new, flags); + neigh_update(neigh, lladdr, new, flags, 0); /* report ndisc ops about neighbour update */ ndisc_ops_update(dev, neigh, flags, icmp6_type, ndopts); } @@ -1418,6 +1418,8 @@ static void ndisc_router_discovery(struct sk_buff *skb) if (ri->prefix_len == 0 && !in6_dev->cnf.accept_ra_defrtr) continue; + if (ri->prefix_len < in6_dev->cnf.accept_ra_rt_info_min_plen) + continue; if (ri->prefix_len > in6_dev->cnf.accept_ra_rt_info_max_plen) continue; rt6_route_rcv(skb->dev, (u8 *)p, (p->nd_opt_len) << 3, @@ -1746,10 +1748,13 @@ static int ndisc_netdev_event(struct notifier_block *this, unsigned long event, case NETDEV_CHANGEADDR: neigh_changeaddr(&nd_tbl, dev); fib6_run_gc(0, net, false); + /* fallthrough */ + case NETDEV_UP: idev = in6_dev_get(dev); if (!idev) break; - if (idev->cnf.ndisc_notify) + if (idev->cnf.ndisc_notify || + net->ipv6.devconf_all->ndisc_notify) ndisc_send_unsol_na(dev); in6_dev_put(idev); break; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 1e15c54fd5e27dbafaf4d4b0b3de9008a724559f..1f90644056ac784f1a455751796d5fa0d9441915 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -51,15 +51,6 @@ void *ip6t_alloc_initial_table(const struct xt_table *info) } EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table); -/* - We keep a set of rules for each CPU, so we can avoid 
write-locking - them in the softirq when updating the counters and therefore - only need to read-lock in the softirq; doing a write_lock_bh() in user - context stops packets coming through and allows user context to read - the counters or update the rules. - - Hence the start of any table is given by get_table() below. */ - /* Returns whether matches rule or not. */ /* Performance critical - called for every packet */ static inline bool @@ -411,7 +402,7 @@ mark_source_chains(const struct xt_table_info *newinfo, to 0 as we leave), and comefrom to save source hook bitmask */ for (hook = 0; hook < NF_INET_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; - struct ip6t_entry *e = (struct ip6t_entry *)(entry0 + pos); + struct ip6t_entry *e = entry0 + pos; if (!(valid_hooks & (1 << hook))) continue; @@ -453,14 +444,12 @@ mark_source_chains(const struct xt_table_info *newinfo, if (pos == oldpos) goto next; - e = (struct ip6t_entry *) - (entry0 + pos); + e = entry0 + pos; } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; - e = (struct ip6t_entry *) - (entry0 + pos + size); + e = entry0 + pos + size; if (pos + size >= newinfo->size) return 0; e->counters.pcnt = pos; @@ -475,16 +464,14 @@ mark_source_chains(const struct xt_table_info *newinfo, if (!xt_find_jump_offset(offsets, newpos, newinfo->number)) return 0; - e = (struct ip6t_entry *) - (entry0 + newpos); + e = entry0 + newpos; } else { /* ... this is a fallthru */ newpos = pos + e->next_offset; if (newpos >= newinfo->size) return 0; } - e = (struct ip6t_entry *) - (entry0 + newpos); + e = entry0 + newpos; e->counters.pcnt = pos; pos = newpos; } @@ -863,7 +850,7 @@ copy_entries_to_user(unsigned int total_size, const struct xt_entry_match *m; const struct xt_entry_target *t; - e = (struct ip6t_entry *)(loc_cpu_entry + off); + e = loc_cpu_entry + off; if (copy_to_user(userptr + off, e, sizeof(*e))) { ret = -EFAULT; goto free_counters; @@ -1258,7 +1245,7 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr, int ret = 0; origsize = *size; - ce = (struct compat_ip6t_entry __user *)*dstptr; + ce = *dstptr; if (copy_to_user(ce, e, sizeof(struct ip6t_entry)) != 0 || copy_to_user(&ce->counters, &counters[i], sizeof(counters[i])) != 0) @@ -1394,7 +1381,7 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, struct xt_entry_match *ematch; origsize = *size; - de = (struct ip6t_entry *)*dstptr; + de = *dstptr; memcpy(de, e, sizeof(struct ip6t_entry)); memcpy(&de->counters, &e->counters, sizeof(e->counters)); diff --git a/net/ipv6/netfilter/ip6t_SYNPROXY.c b/net/ipv6/netfilter/ip6t_SYNPROXY.c index 4ef1ddd4bbbd813ff8e6ed46275ecdf48d5ff9a8..d3c4daa708b9014378f5cdec4b0cc811f9999d92 100644 --- a/net/ipv6/netfilter/ip6t_SYNPROXY.c +++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c @@ -307,12 +307,17 @@ synproxy_tg6(struct sk_buff *skb, const struct xt_action_param *par) XT_SYNPROXY_OPT_ECN); synproxy_send_client_synack(net, skb, th, &opts); - return NF_DROP; + consume_skb(skb); + return NF_STOLEN; } else if (th->ack && !(th->fin || th->rst || th->syn)) { /* ACK from client */ - synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq)); - return NF_DROP; + if (synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq))) { + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } } return XT_CONTINUE; @@ -388,10 +393,13 @@ static unsigned int ipv6_synproxy_hook(void *priv, * number match the one of first SYN. 
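[Editor's aside on the SYNPROXY return-code conversion in this file (illustrative; handled_by_synproxy() is a hypothetical stand-in for the cookie checks): packets the target fully answered on its own are no longer reported as drops:]

    if (handled_by_synproxy(skb)) {
            consume_skb(skb); /* freed as "consumed", not as an error drop  */
            return NF_STOLEN; /* netfilter must not touch the skb again     */
    }
    return NF_DROP;           /* unverified/malformed: keep drop accounting */

[NF_DROP stays the right answer only for packets that failed validation; SYNs that got a synthesised SYNACK and valid cookie retransmissions are now stolen instead.]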
*/ if (synproxy_recv_client_ack(net, skb, th, &opts, - ntohl(th->seq) + 1)) + ntohl(th->seq) + 1)) { this_cpu_inc(snet->stats->cookie_retrans); - - return NF_DROP; + consume_skb(skb); + return NF_STOLEN; + } else { + return NF_DROP; + } } synproxy->isn = ntohl(th->ack_seq); @@ -430,20 +438,57 @@ static unsigned int ipv6_synproxy_hook(void *priv, return NF_ACCEPT; } +static struct nf_hook_ops ipv6_synproxy_ops[] __read_mostly = { + { + .hook = ipv6_synproxy_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_IN, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, + { + .hook = ipv6_synproxy_hook, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, + }, +}; + static int synproxy_tg6_check(const struct xt_tgchk_param *par) { + struct synproxy_net *snet = synproxy_pernet(par->net); const struct ip6t_entry *e = par->entryinfo; + int err; if (!(e->ipv6.flags & IP6T_F_PROTO) || e->ipv6.proto != IPPROTO_TCP || e->ipv6.invflags & XT_INV_PROTO) return -EINVAL; - return nf_ct_netns_get(par->net, par->family); + err = nf_ct_netns_get(par->net, par->family); + if (err) + return err; + + if (snet->hook_ref6 == 0) { + err = nf_register_net_hooks(par->net, ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); + if (err) { + nf_ct_netns_put(par->net, par->family); + return err; + } + } + + snet->hook_ref6++; + return err; } static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par) { + struct synproxy_net *snet = synproxy_pernet(par->net); + + snet->hook_ref6--; + if (snet->hook_ref6 == 0) + nf_unregister_net_hooks(par->net, ipv6_synproxy_ops, + ARRAY_SIZE(ipv6_synproxy_ops)); nf_ct_netns_put(par->net, par->family); } @@ -458,46 +503,14 @@ static struct xt_target synproxy_tg6_reg __read_mostly = { .me = THIS_MODULE, }; -static struct nf_hook_ops ipv6_synproxy_ops[] __read_mostly = { - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_IN, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, - { - .hook = ipv6_synproxy_hook, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_CONNTRACK_CONFIRM - 1, - }, -}; - static int __init synproxy_tg6_init(void) { - int err; - - err = nf_register_hooks(ipv6_synproxy_ops, - ARRAY_SIZE(ipv6_synproxy_ops)); - if (err < 0) - goto err1; - - err = xt_register_target(&synproxy_tg6_reg); - if (err < 0) - goto err2; - - return 0; - -err2: - nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); -err1: - return err; + return xt_register_target(&synproxy_tg6_reg); } static void __exit synproxy_tg6_exit(void) { xt_unregister_target(&synproxy_tg6_reg); - nf_unregister_hooks(ipv6_synproxy_ops, ARRAY_SIZE(ipv6_synproxy_ops)); } module_init(synproxy_tg6_init); diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index d2c2ccbfbe728f8ff2d7bc42c5e7feb2ae4ad97b..d5f028e33f658bd62deb5b9b0737c33366a22a83 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -221,8 +221,7 @@ icmpv6_error(struct net *net, struct nf_conn *tmpl, type = icmp6h->icmp6_type - 130; if (type >= 0 && type < sizeof(noct_valid_new) && noct_valid_new[type]) { - nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW); - nf_conntrack_get(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); return NF_ACCEPT; } diff --git a/net/ipv6/netfilter/nf_dup_ipv6.c b/net/ipv6/netfilter/nf_dup_ipv6.c index 
888ecd106e5f37ba6c9e93bf1d1f161ddbaa280c..4a7ddeddbaabf1866a65cfda5d27e8e9c1e75b57 100644 --- a/net/ipv6/netfilter/nf_dup_ipv6.c +++ b/net/ipv6/netfilter/nf_dup_ipv6.c @@ -58,8 +58,7 @@ void nf_dup_ipv6(struct net *net, struct sk_buff *skb, unsigned int hooknum, #if IS_ENABLED(CONFIG_NF_CONNTRACK) nf_reset(skb); - nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW); - nf_conntrack_get(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); #endif if (hooknum == NF_INET_PRE_ROUTING || hooknum == NF_INET_LOCAL_IN) { diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index e0be97e636a48f54c1488ca70ae97a9a13e8be61..b2b4f031b3a16b1f9f374221396ad02ccc79744e 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -235,7 +235,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, inside->icmp6.icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len - hdrlen, IPPROTO_ICMPV6, - csum_partial(&inside->icmp6, + skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } @@ -273,13 +273,7 @@ nf_nat_ipv6_fn(void *priv, struct sk_buff *skb, if (!ct) return NF_ACCEPT; - /* Don't try to NAT if this packet is not conntracked */ - if (nf_ct_is_untracked(ct)) - return NF_ACCEPT; - - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; + nat = nfct_nat(ct); switch (ctinfo) { case IP_CT_RELATED: diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c index 051b6a6bfff6ca7c0dd42ceb43a417260a11bd23..2297c9f073bac63c90ed0578b474fd53b556a6e2 100644 --- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c @@ -30,6 +30,7 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, const struct net_device *out) { enum ip_conntrack_info ctinfo; + struct nf_conn_nat *nat; struct in6_addr src; struct nf_conn *ct; struct nf_nat_range newrange; @@ -42,7 +43,9 @@ nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range, &ipv6_hdr(skb)->daddr, 0, &src) < 0) return NF_DROP; - nfct_nat(ct)->masq_index = out->ifindex; + nat = nf_ct_nat_ext_add(ct); + if (nat) + nat->masq_index = out->ifindex; newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; newrange.min_addr.in6 = src; diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 765facf03d45c47b9913b1adcdaf59b6fe09383c..43f91d9b086c7d5c66860ae2eef4ace040acbcff 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -159,7 +159,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { - nft_fib_store_result(dest, priv->result, pkt, + nft_fib_store_result(dest, priv, pkt, nft_in(pkt)->ifindex); return; } @@ -246,7 +246,7 @@ nft_fib6_select_ops(const struct nft_ctx *ctx, static struct nft_expr_type nft_fib6_type __read_mostly = { .name = "fib", - .select_ops = &nft_fib6_select_ops, + .select_ops = nft_fib6_select_ops, .policy = nft_fib_policy, .maxattr = NFTA_FIB_MAX, .family = NFPROTO_IPV6, diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c index e3770abe688a3a9059456fe9195adbfcdfb73157..b5d54d4f995c0f4bade2e3f1c4def9616252ca55 100644 --- a/net/ipv6/protocol.c +++ b/net/ipv6/protocol.c @@ -26,7 +26,7 @@ #include #if IS_ENABLED(CONFIG_IPV6) -const struct inet6_protocol __rcu *inet6_protos[MAX_INET_PROTOS] __read_mostly; +struct inet6_protocol __rcu 
*inet6_protos[MAX_INET_PROTOS] __read_mostly; EXPORT_SYMBOL(inet6_protos); int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index f174e76e6505d4045e940c9fceef765d2aaa937d..1f992d9e261d8b75226659a4cead95f8dc04dc4f 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -632,6 +632,8 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu); return -EMSGSIZE; } + if (length < sizeof(struct ipv6hdr)) + return -EINVAL; if (flags&MSG_PROBE) goto out; @@ -1178,8 +1180,7 @@ static int rawv6_ioctl(struct sock *sk, int cmd, unsigned long arg) spin_lock_bh(&sk->sk_receive_queue.lock); skb = skb_peek(&sk->sk_receive_queue); if (skb) - amount = skb_tail_pointer(skb) - - skb_transport_header(skb); + amount = skb->len; spin_unlock_bh(&sk->sk_receive_queue.lock); return put_user(amount, (int __user *)arg); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 9db1418993f2b8a5b4194895f243441033d4729a..dc61b0b5e64edf7bd69ab905573e38415abf2346 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1854,6 +1854,10 @@ static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg) int addr_type; int err = -EINVAL; + /* RTF_PCPU is an internal flag; can not be set by userspace */ + if (cfg->fc_flags & RTF_PCPU) + goto out; + if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128) goto out; #ifndef CONFIG_IPV6_SUBTREES @@ -2906,7 +2910,8 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, unsigned int pref; int err; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, + NULL); if (err < 0) goto errout; @@ -3259,7 +3264,8 @@ static int ip6_route_multipath_del(struct fib6_config *cfg) return last_err; } -static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) +static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct fib6_config cfg; int err; @@ -3276,7 +3282,8 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) } } -static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) +static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct fib6_config cfg; int err; @@ -3564,7 +3571,8 @@ int rt6_dump_route(struct rt6_info *rt, void *p_arg) NLM_F_MULTI); } -static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) +static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[RTA_MAX+1]; @@ -3574,7 +3582,8 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh) struct flowi6 fl6; int err, iif = 0, oif = 0; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy); + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy, + extack); if (err < 0) goto errout; @@ -3700,7 +3709,10 @@ static int ip6_route_dev_notify(struct notifier_block *this, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct net *net = dev_net(dev); - if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) { + if (!(dev->flags & IFF_LOOPBACK)) + return NOTIFY_OK; + + if (event == NETDEV_REGISTER) { net->ipv6.ip6_null_entry->dst.dev = dev; net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev); #ifdef CONFIG_IPV6_MULTIPLE_TABLES @@ -3708,6 
+3720,12 @@ static int ip6_route_dev_notify(struct notifier_block *this, net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev); net->ipv6.ip6_blk_hole_entry->dst.dev = dev; net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev); +#endif + } else if (event == NETDEV_UNREGISTER) { + in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev); +#ifdef CONFIG_IPV6_MULTIPLE_TABLES + in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev); + in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev); #endif } @@ -4015,9 +4033,24 @@ static struct pernet_operations ip6_route_net_late_ops = { static struct notifier_block ip6_route_dev_notifier = { .notifier_call = ip6_route_dev_notify, - .priority = 0, + .priority = ADDRCONF_NOTIFY_PRIORITY - 10, }; +void __init ip6_route_init_special_entries(void) +{ + /* Registering of the loopback is done before this portion of code, + * the loopback reference in rt6_info will not be taken, do it + * manually for init_net */ + init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES + init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #endif +} + int __init ip6_route_init(void) { int ret; @@ -4044,17 +4077,6 @@ int __init ip6_route_init(void) ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; - /* Registering of the loopback is done before this portion of code, - * the loopback reference in rt6_info will not be taken, do it - * manually for init_net */ - init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - #ifdef CONFIG_IPV6_MULTIPLE_TABLES - init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - #endif ret = fib6_init(); if (ret) goto out_register_subsys; diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index a855eb325b030a666fe92c56a2d432c77d9dfe7a..5f44ffed25768d83c31b31295474c5ecf623e986 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -53,6 +53,9 @@ bool seg6_validate_srh(struct ipv6_sr_hdr *srh, int len) struct sr6_tlv *tlv; unsigned int tlv_len; + if (trailing < sizeof(*tlv)) + return false; + tlv = (struct sr6_tlv *)((unsigned char *)srh + tlv_offset); tlv_len = sizeof(*tlv) + tlv->len; diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 85582257d3af88146d435ef6c2e98f0bbef94a41..6a495490d43e1018206e628b917840d7ae8f9085 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -26,17 +26,13 @@ #include #include #include -#ifdef CONFIG_DST_CACHE #include -#endif #ifdef CONFIG_IPV6_SEG6_HMAC #include #endif struct seg6_lwt { -#ifdef CONFIG_DST_CACHE struct dst_cache cache; -#endif struct seg6_iptunnel_encap tuninfo[0]; }; @@ -105,7 +101,7 @@ static int seg6_do_srh_encap(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) hdrlen = (osrh->hdrlen + 1) << 3; tot_len = hdrlen + sizeof(*hdr); - err = pskb_expand_head(skb, tot_len, 0, GFP_ATOMIC); + err = skb_cow_head(skb, tot_len); if (unlikely(err)) return err; @@ 
-156,7 +152,7 @@ static int seg6_do_srh_inline(struct sk_buff *skb, struct ipv6_sr_hdr *osrh) hdrlen = (osrh->hdrlen + 1) << 3; - err = pskb_expand_head(skb, hdrlen, 0, GFP_ATOMIC); + err = skb_cow_head(skb, hdrlen); if (unlikely(err)) return err; @@ -237,6 +233,9 @@ static int seg6_do_srh(struct sk_buff *skb) static int seg6_input(struct sk_buff *skb) { + struct dst_entry *orig_dst = skb_dst(skb); + struct dst_entry *dst = NULL; + struct seg6_lwt *slwt; int err; err = seg6_do_srh(skb); @@ -245,8 +244,30 @@ static int seg6_input(struct sk_buff *skb) return err; } + slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); + + preempt_disable(); + dst = dst_cache_get(&slwt->cache); + preempt_enable(); + skb_dst_drop(skb); - ip6_route_input(skb); + + if (!dst) { + ip6_route_input(skb); + dst = skb_dst(skb); + if (!dst->error) { + preempt_disable(); + dst_cache_set_ip6(&slwt->cache, dst, + &ipv6_hdr(skb)->saddr); + preempt_enable(); + } + } else { + skb_dst_set(skb, dst); + } + + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + return err; return dst_input(skb); } @@ -264,11 +285,9 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) slwt = seg6_lwt_lwtunnel(orig_dst->lwtstate); -#ifdef CONFIG_DST_CACHE preempt_disable(); dst = dst_cache_get(&slwt->cache); preempt_enable(); -#endif if (unlikely(!dst)) { struct ipv6hdr *hdr = ipv6_hdr(skb); @@ -287,16 +306,18 @@ static int seg6_output(struct net *net, struct sock *sk, struct sk_buff *skb) goto drop; } -#ifdef CONFIG_DST_CACHE preempt_disable(); dst_cache_set_ip6(&slwt->cache, dst, &fl6.saddr); preempt_enable(); -#endif } skb_dst_drop(skb); skb_dst_set(skb, dst); + err = skb_cow_head(skb, LL_RESERVED_SPACE(dst->dev)); + if (unlikely(err)) + goto drop; + return dst_output(net, sk, skb); drop: kfree_skb(skb); @@ -315,7 +336,7 @@ static int seg6_build_state(struct nlattr *nla, int err; err = nla_parse_nested(tb, SEG6_IPTUNNEL_MAX, nla, - seg6_iptunnel_policy); + seg6_iptunnel_policy, NULL); if (err < 0) return err; @@ -355,13 +376,11 @@ static int seg6_build_state(struct nlattr *nla, slwt = seg6_lwt_lwtunnel(newts); -#ifdef CONFIG_DST_CACHE err = dst_cache_init(&slwt->cache, GFP_KERNEL); if (err) { kfree(newts); return err; } -#endif memcpy(&slwt->tuninfo, tuninfo, tuninfo_len); @@ -375,12 +394,10 @@ static int seg6_build_state(struct nlattr *nla, return 0; } -#ifdef CONFIG_DST_CACHE static void seg6_destroy_state(struct lwtunnel_state *lwt) { dst_cache_destroy(&seg6_lwt_lwtunnel(lwt)->cache); } -#endif static int seg6_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwtstate) @@ -414,9 +431,7 @@ static int seg6_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) static const struct lwtunnel_encap_ops seg6_iptun_ops = { .build_state = seg6_build_state, -#ifdef CONFIG_DST_CACHE .destroy_state = seg6_destroy_state, -#endif .output = seg6_output, .input = seg6_input, .fill_encap = seg6_fill_encap_info, diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 99853c6e33a8c3def99ecb56e288cce4a38a997b..61e5902f068732b10f734c7937c7539d418820d7 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -881,11 +881,12 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, goto tx_error; } - rt = ip_route_output_ports(tunnel->net, &fl4, NULL, - dst, tiph->saddr, - 0, 0, - IPPROTO_IPV6, RT_TOS(tos), - tunnel->parms.link); + flowi4_init_output(&fl4, tunnel->parms.link, tunnel->fwmark, + RT_TOS(tos), RT_SCOPE_UNIVERSE, IPPROTO_IPV6, + 0, dst, tiph->saddr, 0, 0, + sock_net_uid(tunnel->net, NULL)); + rt 
= ip_route_output_flow(tunnel->net, &fl4, NULL); + if (IS_ERR(rt)) { dev->stats.tx_carrier_errors++; goto tx_error_icmp; @@ -1071,7 +1072,8 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) } } -static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) +static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p, + __u32 fwmark) { struct net *net = t->net; struct sit_net *sitn = net_generic(net, sit_net_id); @@ -1085,8 +1087,9 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p) ipip6_tunnel_link(sitn, t); t->parms.iph.ttl = p->iph.ttl; t->parms.iph.tos = p->iph.tos; - if (t->parms.link != p->link) { + if (t->parms.link != p->link || t->fwmark != fwmark) { t->parms.link = p->link; + t->fwmark = fwmark; ipip6_tunnel_bind_dev(t->dev); } dst_cache_reset(&t->dst_cache); @@ -1220,7 +1223,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) t = netdev_priv(dev); } - ipip6_tunnel_update(t, &p); + ipip6_tunnel_update(t, &p, t->fwmark); } if (t) { @@ -1418,7 +1421,8 @@ static int ipip6_validate(struct nlattr *tb[], struct nlattr *data[]) } static void ipip6_netlink_parms(struct nlattr *data[], - struct ip_tunnel_parm *parms) + struct ip_tunnel_parm *parms, + __u32 *fwmark) { memset(parms, 0, sizeof(*parms)); @@ -1457,6 +1461,8 @@ static void ipip6_netlink_parms(struct nlattr *data[], if (data[IFLA_IPTUN_PROTO]) parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); + if (data[IFLA_IPTUN_FWMARK]) + *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } /* This function returns true when ENCAP attributes are present in the nl msg */ @@ -1549,7 +1555,7 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev, return err; } - ipip6_netlink_parms(data, &nt->parms); + ipip6_netlink_parms(data, &nt->parms, &nt->fwmark); if (ipip6_tunnel_locate(net, &nt->parms, 0)) return -EEXIST; @@ -1577,6 +1583,7 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], #ifdef CONFIG_IPV6_SIT_6RD struct ip_tunnel_6rd ip6rd; #endif + __u32 fwmark = t->fwmark; int err; if (dev == sitn->fb_tunnel_dev) @@ -1588,7 +1595,7 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], return err; } - ipip6_netlink_parms(data, &p); + ipip6_netlink_parms(data, &p, &fwmark); if (((dev->flags & IFF_POINTOPOINT) && !p.iph.daddr) || (!(dev->flags & IFF_POINTOPOINT) && p.iph.daddr)) @@ -1602,7 +1609,7 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], } else t = netdev_priv(dev); - ipip6_tunnel_update(t, &p); + ipip6_tunnel_update(t, &p, fwmark); #ifdef CONFIG_IPV6_SIT_6RD if (ipip6_netlink_6rd_parms(data, &ip6rd)) @@ -1649,6 +1656,8 @@ static size_t ipip6_get_size(const struct net_device *dev) nla_total_size(2) + /* IFLA_IPTUN_ENCAP_DPORT */ nla_total_size(2) + + /* IFLA_IPTUN_FWMARK */ + nla_total_size(4) + 0; } @@ -1665,7 +1674,8 @@ static int ipip6_fill_info(struct sk_buff *skb, const struct net_device *dev) nla_put_u8(skb, IFLA_IPTUN_PMTUDISC, !!(parm->iph.frag_off & htons(IP_DF))) || nla_put_u8(skb, IFLA_IPTUN_PROTO, parm->iph.protocol) || - nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags)) + nla_put_be16(skb, IFLA_IPTUN_FLAGS, parm->i_flags) || + nla_put_u32(skb, IFLA_IPTUN_FWMARK, tunnel->fwmark)) goto nla_put_failure; #ifdef CONFIG_IPV6_SIT_6RD @@ -1715,6 +1725,7 @@ static const struct nla_policy ipip6_policy[IFLA_IPTUN_MAX + 1] = { [IFLA_IPTUN_ENCAP_FLAGS] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_SPORT] = { .type = NLA_U16 }, [IFLA_IPTUN_ENCAP_DPORT] = 
{ .type = NLA_U16 }, + [IFLA_IPTUN_FWMARK] = { .type = NLA_U32 }, }; static void ipip6_dellink(struct net_device *dev, struct list_head *head) diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index 895ff650db43017ef39344679771d94ad6eaaf00..5abc3692b9011b140816dc4ce6223e79e5defddb 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -143,6 +144,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) int mss; struct dst_entry *dst; __u8 rcv_wscale; + u32 tsoff = 0; if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst) goto out; @@ -162,6 +164,12 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) memset(&tcp_opt, 0, sizeof(tcp_opt)); tcp_parse_options(skb, &tcp_opt, 0, NULL); + if (tcp_opt.saw_tstamp && tcp_opt.rcv_tsecr) { + tsoff = secure_tcpv6_ts_off(ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32); + tcp_opt.rcv_tsecr -= tsoff; + } + if (!cookie_timestamp_decode(&tcp_opt)) goto out; @@ -242,7 +250,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ireq->rcv_wscale = rcv_wscale; ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst); - ret = tcp_get_cookie_sock(sk, skb, req, dst); + ret = tcp_get_cookie_sock(sk, skb, req, dst, tsoff); out: return ret; out_free: diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 49fa2e8c3fa9212eef1198a1077a6726f0f1b6fc..7a8237acd210bf58cdc98f085fcf7fed433c3f24 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -101,12 +101,18 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) } } -static u32 tcp_v6_init_sequence(const struct sk_buff *skb, u32 *tsoff) +static u32 tcp_v6_init_seq(const struct sk_buff *skb) { - return secure_tcpv6_sequence_number(ipv6_hdr(skb)->daddr.s6_addr32, - ipv6_hdr(skb)->saddr.s6_addr32, - tcp_hdr(skb)->dest, - tcp_hdr(skb)->source, tsoff); + return secure_tcpv6_seq(ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32, + tcp_hdr(skb)->dest, + tcp_hdr(skb)->source); +} + +static u32 tcp_v6_init_ts_off(const struct sk_buff *skb) +{ + return secure_tcpv6_ts_off(ipv6_hdr(skb)->daddr.s6_addr32, + ipv6_hdr(skb)->saddr.s6_addr32); } static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, @@ -122,7 +128,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, struct flowi6 fl6; struct dst_entry *dst; int addr_type; - u32 seq; int err; struct inet_timewait_death_row *tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; @@ -265,11 +270,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_gso_type = SKB_GSO_TCPV6; ip6_dst_store(sk, dst, NULL, NULL); - if (tcp_death_row->sysctl_tw_recycle && - !tp->rx_opt.ts_recent_stamp && - ipv6_addr_equal(&fl6.daddr, &sk->sk_v6_daddr)) - tcp_fetch_timewait_stamp(sk, dst); - icsk->icsk_ext_hdr_len = 0; if (opt) icsk->icsk_ext_hdr_len = opt->opt_flen + @@ -287,13 +287,13 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk_set_txhash(sk); if (likely(!tp->repair)) { - seq = secure_tcpv6_sequence_number(np->saddr.s6_addr32, - sk->sk_v6_daddr.s6_addr32, - inet->inet_sport, - inet->inet_dport, - &tp->tsoffset); if (!tp->write_seq) - tp->write_seq = seq; + tp->write_seq = secure_tcpv6_seq(np->saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32, + inet->inet_sport, + inet->inet_dport); + tp->tsoffset = secure_tcpv6_ts_off(np->saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32); } if (tcp_fastopen_defer_connect(sk, 
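The syncookies and tcp_ipv6 hunks split what used to be a single helper returning both an initial sequence number and a timestamp offset: secure_tcpv6_seq() now yields only the ISN, while secure_tcpv6_ts_off() derives a per-address-pair offset that is added to outgoing TSval values and subtracted from the echoed TSecr (as the cookie path above now does before cookie_timestamp_decode()). Below is a userspace model of the offset idea only; the FNV-style mixing is for brevity, whereas the kernel derives the offset with siphash keyed by a boot-time secret.

	#include <stdint.h>

	/* Model only: a stable pseudo-random TS offset per address pair,
	 * so TSval leaks no machine-wide clock yet stays monotonic within
	 * one flow. The kernel uses siphash, not this FNV-style mix. */
	static uint32_t ts_off_model(const uint32_t saddr[4],
				     const uint32_t daddr[4], uint64_t secret)
	{
		uint64_t h = secret ^ 0xcbf29ce484222325ULL;
		int i;

		for (i = 0; i < 4; i++)
			h = (h ^ saddr[i]) * 0x100000001b3ULL;
		for (i = 0; i < 4; i++)
			h = (h ^ daddr[i]) * 0x100000001b3ULL;

		return (uint32_t)h;	/* added to TSval on send,
					 * subtracted from TSecr on receive */
	}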
&err)) @@ -727,11 +727,8 @@ static void tcp_v6_init_req(struct request_sock *req, static struct dst_entry *tcp_v6_route_req(const struct sock *sk, struct flowi *fl, - const struct request_sock *req, - bool *strict) + const struct request_sock *req) { - if (strict) - *strict = true; return inet6_csk_route_req(sk, &fl->u.ip6, req, IPPROTO_TCP); } @@ -757,7 +754,8 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .cookie_init_seq = cookie_v6_init_sequence, #endif .route_req = tcp_v6_route_req, - .init_seq = tcp_v6_init_sequence, + .init_seq = tcp_v6_init_seq, + .init_ts_off = tcp_v6_init_ts_off, .send_synack = tcp_v6_send_synack, }; @@ -1301,8 +1299,6 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) goto discard; if (nsk != sk) { - sock_rps_save_rxhash(nsk, skb); - sk_mark_napi_id(nsk, skb); if (tcp_child_process(sk, nsk, skb)) goto reset; if (opt_skb) @@ -1921,7 +1917,7 @@ struct proto tcpv6_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, .h.hashinfo = &tcp_hashinfo, @@ -1933,8 +1929,9 @@ struct proto tcpv6_prot = { .diag_destroy = tcp_abort, }; -static const struct inet6_protocol tcpv6_protocol = { +static struct inet6_protocol tcpv6_protocol = { .early_demux = tcp_v6_early_demux, + .early_demux_handler = tcp_v6_early_demux, .handler = tcp_v6_rcv, .err_handler = tcp_v6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index e28082f0a307eb68ac13987580d8d9f65358212f..04862abfe4ec27d978adadd27735a9d19e3c4365 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -864,6 +865,69 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable, return 0; } + +static struct sock *__udp6_lib_demux_lookup(struct net *net, + __be16 loc_port, const struct in6_addr *loc_addr, + __be16 rmt_port, const struct in6_addr *rmt_addr, + int dif) +{ + unsigned short hnum = ntohs(loc_port); + unsigned int hash2 = udp6_portaddr_hash(net, loc_addr, hnum); + unsigned int slot2 = hash2 & udp_table.mask; + struct udp_hslot *hslot2 = &udp_table.hash2[slot2]; + const __portpair ports = INET_COMBINED_PORTS(rmt_port, hnum); + struct sock *sk; + + udp_portaddr_for_each_entry_rcu(sk, &hslot2->head) { + if (INET6_MATCH(sk, net, rmt_addr, loc_addr, ports, dif)) + return sk; + /* Only check first socket in chain */ + break; + } + return NULL; +} + +static void udp_v6_early_demux(struct sk_buff *skb) +{ + struct net *net = dev_net(skb->dev); + const struct udphdr *uh; + struct sock *sk; + struct dst_entry *dst; + int dif = skb->dev->ifindex; + + if (!pskb_may_pull(skb, skb_transport_offset(skb) + + sizeof(struct udphdr))) + return; + + uh = udp_hdr(skb); + + if (skb->pkt_type == PACKET_HOST) + sk = __udp6_lib_demux_lookup(net, uh->dest, + &ipv6_hdr(skb)->daddr, + uh->source, &ipv6_hdr(skb)->saddr, + dif); + else + return; + + if (!sk || !atomic_inc_not_zero_hint(&sk->sk_refcnt, 2)) + return; + + skb->sk = sk; + skb->destructor = sock_efree; + dst = READ_ONCE(sk->sk_rx_dst); + + if (dst) + dst = dst_check(dst, inet6_sk(sk)->rx_dst_cookie); + if (dst) { + if (dst->flags & DST_NOCACHE) { + if (likely(atomic_inc_not_zero(&dst->__refcnt))) + skb_dst_set(skb, dst); + } else { + skb_dst_set_noref(skb, dst); + } + } +} + static __inline__ int 
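udp_v6_early_demux() gives connected IPv6 UDP sockets the early-demux fast path TCP already had: the four-tuple is hashed, only the first socket in the slot's chain is compared, and on a match the socket reference and its cached receive route are attached to the skb before the normal receive path runs. Checking only the chain head is safe because a miss simply falls back to the regular, slower lookup. A compilable toy of that first-entry-or-nothing rule, with invented types:

	#include <stddef.h>
	#include <string.h>

	struct flow6 {				/* toy stand-in for the 4-tuple */
		unsigned char laddr[16], raddr[16];
		unsigned short lport, rport;
	};

	static int flow_match(const struct flow6 *a, const struct flow6 *b)
	{
		return a->lport == b->lport && a->rport == b->rport &&
		       !memcmp(a->laddr, b->laddr, 16) &&
		       !memcmp(a->raddr, b->raddr, 16);
	}

	static const struct flow6 *early_demux(const struct flow6 *slot_head,
					       const struct flow6 *pkt)
	{
		/* mirror of "only check first socket in chain" above */
		return (slot_head && flow_match(slot_head, pkt)) ? slot_head : NULL;
	}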
udpv6_rcv(struct sk_buff *skb) { return __udp6_lib_rcv(skb, &udp_table, IPPROTO_UDP); @@ -1378,7 +1442,9 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname, } #endif -static const struct inet6_protocol udpv6_protocol = { +static struct inet6_protocol udpv6_protocol = { + .early_demux = udp_v6_early_demux, + .early_demux_handler = udp_v6_early_demux, .handler = udpv6_rcv, .err_handler = udpv6_err, .flags = INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL, diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c index 4439ee44c8b05461b8a66190c7800379ca5f105c..7a92c0f3191250118ce3572ca91ee9116460bce8 100644 --- a/net/ipv6/xfrm6_mode_transport.c +++ b/net/ipv6/xfrm6_mode_transport.c @@ -13,6 +13,7 @@ #include #include #include +#include /* Add encapsulation header. * @@ -26,6 +27,7 @@ static int xfrm6_transport_output(struct xfrm_state *x, struct sk_buff *skb) int hdr_len; iph = ipv6_hdr(skb); + skb_set_inner_transport_header(skb, skb_transport_offset(skb)); hdr_len = x->type->hdr_offset(x, skb, &prevhdr); skb_set_mac_header(skb, (prevhdr - x->props.header_len) - skb->data); @@ -61,9 +63,41 @@ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) return 0; } +static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + const struct net_offload *ops; + struct sk_buff *segs = ERR_PTR(-EINVAL); + struct xfrm_offload *xo = xfrm_offload(skb); + + skb->transport_header += x->props.header_len; + ops = rcu_dereference(inet6_offloads[xo->proto]); + if (likely(ops && ops->callbacks.gso_segment)) + segs = ops->callbacks.gso_segment(skb, features); + + return segs; +} + +static void xfrm6_transport_xmit(struct xfrm_state *x, struct sk_buff *skb) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + + skb_reset_mac_len(skb); + pskb_pull(skb, skb->mac_len + sizeof(struct ipv6hdr) + x->props.header_len); + + if (xo->flags & XFRM_GSO_SEGMENT) { + skb_reset_transport_header(skb); + skb->transport_header -= x->props.header_len; + } +} + + static struct xfrm_mode xfrm6_transport_mode = { .input = xfrm6_transport_input, .output = xfrm6_transport_output, + .gso_segment = xfrm6_transport_gso_segment, + .xmit = xfrm6_transport_xmit, .owner = THIS_MODULE, .encap = XFRM_MODE_TRANSPORT, }; diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 372855eeaf42551208adeeaff16b1cd06f08d3f2..02556e356f87e94830e76bb6f4f519e4cf1eb4d7 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -36,6 +36,9 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) struct ipv6hdr *top_iph; int dsfield; + skb_set_inner_network_header(skb, skb_network_offset(skb)); + skb_set_inner_transport_header(skb, skb_transport_offset(skb)); + skb_set_network_header(skb, -x->props.header_len); skb->mac_header = skb->network_header + offsetof(struct ipv6hdr, nexthdr); @@ -96,11 +99,35 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) return err; } +static struct sk_buff *xfrm6_mode_tunnel_gso_segment(struct xfrm_state *x, + struct sk_buff *skb, + netdev_features_t features) +{ + __skb_push(skb, skb->mac_len); + return skb_mac_gso_segment(skb, features); + +} + +static void xfrm6_mode_tunnel_xmit(struct xfrm_state *x, struct sk_buff *skb) +{ + struct xfrm_offload *xo = xfrm_offload(skb); + + if (xo->flags & XFRM_GSO_SEGMENT) { + skb->network_header = skb->network_header - x->props.header_len; + skb->transport_header =
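Both xfrm6 mode hunks record the inner network/transport header offsets before the outer headers are pushed; that bookkeeping is what the new gso_segment/xmit callbacks rely on for IPsec GSO, where segmentation runs on the still-unencrypted inner packet and each resulting segment is then pulled back into position for the transform. A condensed, hypothetical consumer of the two callbacks (the member signatures match what this patch adds to struct xfrm_mode, but the real esp6 offload path carries more state):

	static struct sk_buff *esp6_gso_sketch(struct xfrm_state *x,
					       struct sk_buff *skb,
					       netdev_features_t features)
	{
		struct sk_buff *segs, *seg;

		/* 1: GSO runs on the still-plaintext inner packet */
		segs = x->outer_mode->gso_segment(x, skb, features);
		if (IS_ERR_OR_NULL(segs))
			return segs;

		/* 2: each plaintext segment is re-pointed at the ESP payload */
		for (seg = segs; seg; seg = seg->next)
			x->outer_mode->xmit(x, seg);

		return segs;
	}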
skb->network_header + sizeof(struct ipv6hdr); + } + + skb_reset_mac_len(skb); + pskb_pull(skb, skb->mac_len + x->props.header_len); +} + static struct xfrm_mode xfrm6_tunnel_mode = { .input2 = xfrm6_mode_tunnel_input, .input = xfrm_prepare_input, .output2 = xfrm6_mode_tunnel_output, .output = xfrm6_prepare_output, + .gso_segment = xfrm6_mode_tunnel_gso_segment, + .xmit = xfrm6_mode_tunnel_xmit, .owner = THIS_MODULE, .encap = XFRM_MODE_TUNNEL, .flags = XFRM_MODE_FLAG_TUNNEL, diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 4d09ce6fa90e666bfdeda09cbdc8c7b0cb8b5824..8ae87d4ec5ff607d431513bb2ef42d5c2c93450a 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -73,11 +73,16 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) int mtu, ret = 0; struct dst_entry *dst = skb_dst(skb); + if (skb->ignore_df) + goto out; + mtu = dst_mtu(dst); if (mtu < IPV6_MIN_MTU) mtu = IPV6_MIN_MTU; - if (!skb->ignore_df && skb->len > mtu) { + if ((!skb_is_gso(skb) && skb->len > mtu) || + (skb_is_gso(skb) && + skb_gso_network_seglen(skb) > ip6_skb_dst_mtu(skb))) { skb->dev = dst->dev; skb->protocol = htons(ETH_P_IPV6); @@ -89,7 +94,7 @@ static int xfrm6_tunnel_check_size(struct sk_buff *skb) icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); ret = -EMSGSIZE; } - +out: return ret; } diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 8a9219ff2e77e0205de652974a44c5e4ba33b2c0..fa31ef29e3fa0bf3973e12e43c57b48eb7d1be45 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1168,11 +1168,10 @@ static int ipxitf_ioctl(unsigned int cmd, void __user *arg) sipx->sipx_network = ipxif->if_netnum; memcpy(sipx->sipx_node, ipxif->if_node, sizeof(sipx->sipx_node)); - rc = -EFAULT; + rc = 0; if (copy_to_user(arg, &ifr, sizeof(ifr))) - break; + rc = -EFAULT; ipxitf_put(ipxif); - rc = 0; break; } case SIOCAIPXITFCRT: diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 31762f76cdb5f2a3ec322135068402be532218ed..deca20fb2ce2e5e50063e58ec2c0d9d6239d3bd4 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -1707,11 +1707,7 @@ static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) struct kcm_clone info; struct socket *newsock = NULL; - if (copy_from_user(&info, (void __user *)arg, sizeof(info))) - return -EFAULT; - err = kcm_clone(sock, &info, &newsock); - if (!err) { if (copy_to_user((void __user *)arg, &info, sizeof(info))) { diff --git a/net/key/af_key.c b/net/key/af_key.c index c6252ed42c1de65dee149d7d869b62b96616e22a..c1950bb14735bc914b53c0476342f2679ad50b87 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -63,8 +63,13 @@ struct pfkey_sock { } u; struct sk_buff *skb; } dump; + struct mutex dump_lock; }; +static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, + xfrm_address_t *saddr, xfrm_address_t *daddr, + u16 *family); + static inline struct pfkey_sock *pfkey_sk(struct sock *sk) { return (struct pfkey_sock *)sk; @@ -139,6 +144,7 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol, { struct netns_pfkey *net_pfkey = net_generic(net, pfkey_net_id); struct sock *sk; + struct pfkey_sock *pfk; int err; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) @@ -153,6 +159,9 @@ static int pfkey_create(struct net *net, struct socket *sock, int protocol, if (sk == NULL) goto out; + pfk = pfkey_sk(sk); + mutex_init(&pfk->dump_lock); + sock->ops = &pfkey_ops; sock_init_data(sock, sk); @@ -281,13 +290,23 @@ static int pfkey_do_dump(struct pfkey_sock *pfk) struct sadb_msg *hdr; int rc; + mutex_lock(&pfk->dump_lock); + if (!pfk->dump.dump) 
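The dump_lock added to struct pfkey_sock serializes pfkey_do_dump() against the SADB_DUMP and SADB_X_SPDDUMP entry points, closing a race in which a second dump request could run while the first walker was mid-teardown. The shape is the classic one-operation-in-flight guard; here is a self-contained userspace model, with a pthread mutex standing in for the kernel mutex:

	#include <pthread.h>
	#include <stddef.h>

	struct dumper {
		pthread_mutex_t lock;
		int (*dump)(struct dumper *);	/* NULL: no dump in progress */
	};

	static int dump_start(struct dumper *d, int (*fn)(struct dumper *))
	{
		int busy;

		pthread_mutex_lock(&d->lock);
		busy = (d->dump != NULL);	/* -EBUSY in the pfkey code */
		if (!busy)
			d->dump = fn;
		pthread_mutex_unlock(&d->lock);

		return busy ? -1 : 0;
	}

	static int dump_continue(struct dumper *d)
	{
		int rc = 0;

		pthread_mutex_lock(&d->lock);
		if (d->dump)			/* may have finished meanwhile */
			rc = d->dump(d);
		pthread_mutex_unlock(&d->lock);

		return rc;
	}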
{ + rc = 0; + goto out; + } + rc = pfk->dump.dump(pfk); - if (rc == -ENOBUFS) - return 0; + if (rc == -ENOBUFS) { + rc = 0; + goto out; + } if (pfk->dump.skb) { - if (!pfkey_can_dump(&pfk->sk)) - return 0; + if (!pfkey_can_dump(&pfk->sk)) { + rc = 0; + goto out; + } hdr = (struct sadb_msg *) pfk->dump.skb->data; hdr->sadb_msg_seq = 0; @@ -298,6 +317,9 @@ static int pfkey_do_dump(struct pfkey_sock *pfk) } pfkey_terminate_dump(pfk); + +out: + mutex_unlock(&pfk->dump_lock); return rc; } @@ -1793,19 +1815,26 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms struct xfrm_address_filter *filter = NULL; struct pfkey_sock *pfk = pfkey_sk(sk); - if (pfk->dump.dump != NULL) + mutex_lock(&pfk->dump_lock); + if (pfk->dump.dump != NULL) { + mutex_unlock(&pfk->dump_lock); return -EBUSY; + } proto = pfkey_satype2proto(hdr->sadb_msg_satype); - if (proto == 0) + if (proto == 0) { + mutex_unlock(&pfk->dump_lock); return -EINVAL; + } if (ext_hdrs[SADB_X_EXT_FILTER - 1]) { struct sadb_x_filter *xfilter = ext_hdrs[SADB_X_EXT_FILTER - 1]; filter = kmalloc(sizeof(*filter), GFP_KERNEL); - if (filter == NULL) + if (filter == NULL) { + mutex_unlock(&pfk->dump_lock); return -ENOMEM; + } memcpy(&filter->saddr, &xfilter->sadb_x_filter_saddr, sizeof(xfrm_address_t)); @@ -1821,6 +1850,7 @@ static int pfkey_dump(struct sock *sk, struct sk_buff *skb, const struct sadb_ms pfk->dump.dump = pfkey_dump_sa; pfk->dump.done = pfkey_dump_sa_done; xfrm_state_walk_init(&pfk->dump.u.state, proto, filter); + mutex_unlock(&pfk->dump_lock); return pfkey_do_dump(pfk); } @@ -1913,19 +1943,14 @@ parse_ipsecrequest(struct xfrm_policy *xp, struct sadb_x_ipsecrequest *rq) /* addresses present only in tunnel mode */ if (t->mode == XFRM_MODE_TUNNEL) { - u8 *sa = (u8 *) (rq + 1); - int family, socklen; + int err; - family = pfkey_sockaddr_extract((struct sockaddr *)sa, - &t->saddr); - if (!family) - return -EINVAL; - - socklen = pfkey_sockaddr_len(family); - if (pfkey_sockaddr_extract((struct sockaddr *)(sa + socklen), - &t->id.daddr) != family) - return -EINVAL; - t->encap_family = family; + err = parse_sockaddr_pair( + (struct sockaddr *)(rq + 1), + rq->sadb_x_ipsecrequest_len - sizeof(*rq), + &t->saddr, &t->id.daddr, &t->encap_family); + if (err) + return err; } else t->encap_family = xp->family; @@ -1945,7 +1970,11 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) if (pol->sadb_x_policy_len * 8 < sizeof(struct sadb_x_policy)) return -EINVAL; - while (len >= sizeof(struct sadb_x_ipsecrequest)) { + while (len >= sizeof(*rq)) { + if (len < rq->sadb_x_ipsecrequest_len || + rq->sadb_x_ipsecrequest_len < sizeof(*rq)) + return -EINVAL; + if ((err = parse_ipsecrequest(xp, rq)) < 0) return err; len -= rq->sadb_x_ipsecrequest_len; @@ -2408,7 +2437,6 @@ static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struc return err; } -#ifdef CONFIG_NET_KEY_MIGRATE static int pfkey_sockaddr_pair_size(sa_family_t family) { return PFKEY_ALIGN8(pfkey_sockaddr_len(family) * 2); @@ -2420,7 +2448,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, { int af, socklen; - if (ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) + if (ext_len < 2 || ext_len < pfkey_sockaddr_pair_size(sa->sa_family)) return -EINVAL; af = pfkey_sockaddr_extract(sa, saddr); @@ -2436,6 +2464,7 @@ static int parse_sockaddr_pair(struct sockaddr *sa, int ext_len, return 0; } +#ifdef CONFIG_NET_KEY_MIGRATE static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, struct xfrm_migrate 
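The remaining af_key hunks converge on one parsing rule for the variable-length sadb_x_ipsecrequest records: never trust sadb_x_ipsecrequest_len until it has been checked against both the fixed header size and the bytes actually remaining, and hand callees only the payload (ext_len minus the header) they may touch. The same discipline, distilled into a generic record walk with illustrative types:

	#include <stddef.h>

	struct rec {
		unsigned short len;	/* total record length, header included */
	};

	static int walk_records(const unsigned char *buf, size_t avail)
	{
		while (avail >= sizeof(struct rec)) {
			const struct rec *r = (const struct rec *)buf;

			if (r->len < sizeof(*r) || r->len > avail)
				return -1;	/* malformed: reject, don't clamp */
			/* process r->len - sizeof(*r) payload bytes here */
			buf += r->len;
			avail -= r->len;
		}
		return 0;
	}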
*m) { @@ -2443,13 +2472,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, struct sadb_x_ipsecrequest *rq2; int mode; - if (len <= sizeof(struct sadb_x_ipsecrequest) || - len < rq1->sadb_x_ipsecrequest_len) + if (len < sizeof(*rq1) || + len < rq1->sadb_x_ipsecrequest_len || + rq1->sadb_x_ipsecrequest_len < sizeof(*rq1)) return -EINVAL; /* old endpoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq1 + 1), - rq1->sadb_x_ipsecrequest_len, + rq1->sadb_x_ipsecrequest_len - sizeof(*rq1), &m->old_saddr, &m->old_daddr, &m->old_family); if (err) @@ -2458,13 +2488,14 @@ static int ipsecrequests_to_migrate(struct sadb_x_ipsecrequest *rq1, int len, rq2 = (struct sadb_x_ipsecrequest *)((u8 *)rq1 + rq1->sadb_x_ipsecrequest_len); len -= rq1->sadb_x_ipsecrequest_len; - if (len <= sizeof(struct sadb_x_ipsecrequest) || - len < rq2->sadb_x_ipsecrequest_len) + if (len <= sizeof(*rq2) || + len < rq2->sadb_x_ipsecrequest_len || + rq2->sadb_x_ipsecrequest_len < sizeof(*rq2)) return -EINVAL; /* new endpoints */ err = parse_sockaddr_pair((struct sockaddr *)(rq2 + 1), - rq2->sadb_x_ipsecrequest_len, + rq2->sadb_x_ipsecrequest_len - sizeof(*rq2), &m->new_saddr, &m->new_daddr, &m->new_family); if (err) @@ -2679,14 +2710,18 @@ static int pfkey_spddump(struct sock *sk, struct sk_buff *skb, const struct sadb { struct pfkey_sock *pfk = pfkey_sk(sk); - if (pfk->dump.dump != NULL) + mutex_lock(&pfk->dump_lock); + if (pfk->dump.dump != NULL) { + mutex_unlock(&pfk->dump_lock); return -EBUSY; + } pfk->dump.msg_version = hdr->sadb_msg_version; pfk->dump.msg_portid = hdr->sadb_msg_pid; pfk->dump.dump = pfkey_dump_sp; pfk->dump.done = pfkey_dump_sp_done; xfrm_policy_walk_init(&pfk->dump.u.policy, XFRM_POLICY_TYPE_MAIN); + mutex_unlock(&pfk->dump_lock); return pfkey_do_dump(pfk); } @@ -3792,7 +3827,6 @@ static inline void pfkey_exit_proc(struct net *net) static struct xfrm_mgr pfkeyv2_mgr = { - .id = "pfkeyv2", .notify = pfkey_send_notify, .acquire = pfkey_send_acquire, .compile_policy = pfkey_compile_policy, diff --git a/net/l2tp/l2tp_core.c b/net/l2tp/l2tp_core.c index e37d9554da7b47df0571c41478e6e758b8a8e6f9..fa0342574b8986ad98f458febe8e3e6314171eae 100644 --- a/net/l2tp/l2tp_core.c +++ b/net/l2tp/l2tp_core.c @@ -120,7 +120,7 @@ static inline struct l2tp_tunnel *l2tp_tunnel(struct sock *sk) return sk->sk_user_data; } -static inline struct l2tp_net *l2tp_pernet(struct net *net) +static inline struct l2tp_net *l2tp_pernet(const struct net *net) { BUG_ON(!net); @@ -217,27 +217,6 @@ static void l2tp_tunnel_sock_put(struct sock *sk) sock_put(sk); } -/* Lookup a session by id in the global session list - */ -static struct l2tp_session *l2tp_session_find_2(struct net *net, u32 session_id) - { - struct l2tp_net *pn = l2tp_pernet(net); - struct hlist_head *session_list = - l2tp_session_id_hash_2(pn, session_id); - struct l2tp_session *session; - - rcu_read_lock_bh(); - hlist_for_each_entry_rcu(session, session_list, global_hlist) { - if (session->session_id == session_id) { - rcu_read_unlock_bh(); - return session; - } - } - rcu_read_unlock_bh(); - - return NULL; -} - /* Session hash list.
* The session_id SHOULD be random according to RFC2661, but several * L2TP implementations (Cisco and Microsoft) use incrementing @@ -250,38 +229,10 @@ l2tp_session_id_hash(struct l2tp_tunnel *tunnel, u32 session_id) return &tunnel->session_hlist[hash_32(session_id, L2TP_HASH_BITS)]; } -/* Lookup a session by id - */ -struct l2tp_session *l2tp_session_find(struct net *net, struct l2tp_tunnel *tunnel, u32 session_id) -{ - struct hlist_head *session_list; - struct l2tp_session *session; - - /* In L2TPv3, session_ids are unique over all tunnels and we - * sometimes need to look them up before we know the - * tunnel. - */ - if (tunnel == NULL) - return l2tp_session_find_2(net, session_id); - - session_list = l2tp_session_id_hash(tunnel, session_id); - read_lock_bh(&tunnel->hlist_lock); - hlist_for_each_entry(session, session_list, hlist) { - if (session->session_id == session_id) { - read_unlock_bh(&tunnel->hlist_lock); - return session; - } - } - read_unlock_bh(&tunnel->hlist_lock); - - return NULL; -} -EXPORT_SYMBOL_GPL(l2tp_session_find); - -/* Like l2tp_session_find() but takes a reference on the returned session. +/* Lookup a session. A new reference is held on the returned session. * Optionally calls session->ref() too if do_ref is true. */ -struct l2tp_session *l2tp_session_get(struct net *net, +struct l2tp_session *l2tp_session_get(const struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, bool do_ref) { @@ -356,7 +307,8 @@ EXPORT_SYMBOL_GPL(l2tp_session_get_nth); /* Lookup a session by interface name. * This is very inefficient but is only used by management interfaces. */ -struct l2tp_session *l2tp_session_get_by_ifname(struct net *net, char *ifname, +struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, + const char *ifname, bool do_ref) { struct l2tp_net *pn = l2tp_pernet(net); @@ -427,7 +379,7 @@ static int l2tp_session_add_to_tunnel(struct l2tp_tunnel *tunnel, /* Lookup a tunnel by id */ -struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id) +struct l2tp_tunnel *l2tp_tunnel_find(const struct net *net, u32 tunnel_id) { struct l2tp_tunnel *tunnel; struct l2tp_net *pn = l2tp_pernet(net); @@ -445,7 +397,7 @@ struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id) } EXPORT_SYMBOL_GPL(l2tp_tunnel_find); -struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth) +struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth) { struct l2tp_net *pn = l2tp_pernet(net); struct l2tp_tunnel *tunnel; diff --git a/net/l2tp/l2tp_core.h b/net/l2tp/l2tp_core.h index 8ce7818c7a9d0578b79e70eff1916d3248e7604a..eec5ad2ebb93c32bca7a57b377f80df8e10b2591 100644 --- a/net/l2tp/l2tp_core.h +++ b/net/l2tp/l2tp_core.h @@ -230,18 +230,16 @@ static inline struct l2tp_tunnel *l2tp_sock_to_tunnel(struct sock *sk) return tunnel; } -struct l2tp_session *l2tp_session_get(struct net *net, +struct l2tp_session *l2tp_session_get(const struct net *net, struct l2tp_tunnel *tunnel, u32 session_id, bool do_ref); -struct l2tp_session *l2tp_session_find(struct net *net, - struct l2tp_tunnel *tunnel, - u32 session_id); struct l2tp_session *l2tp_session_get_nth(struct l2tp_tunnel *tunnel, int nth, bool do_ref); -struct l2tp_session *l2tp_session_get_by_ifname(struct net *net, char *ifname, +struct l2tp_session *l2tp_session_get_by_ifname(const struct net *net, + const char *ifname, bool do_ref); -struct l2tp_tunnel *l2tp_tunnel_find(struct net *net, u32 tunnel_id); -struct l2tp_tunnel *l2tp_tunnel_find_nth(struct net *net, int nth); +struct l2tp_tunnel 
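On the l2tp side, the substantive change is retiring l2tp_session_find() and its global-list variant: both returned a bare pointer out of a lock- or RCU-protected walk, which the caller could dereference after the session had already been freed. The surviving l2tp_session_get*() helpers take the reference while still inside the critical section, so callers follow a get/use/put pattern, roughly as below (use_session() is a placeholder, and l2tp_session_dec_refcount() is assumed to be the era's put-side helper):

	session = l2tp_session_get(net, tunnel, session_id, false);
	if (session) {
		/* safe window: we hold our own reference */
		use_session(session);		/* placeholder */
		l2tp_session_dec_refcount(session);
	}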
*l2tp_tunnel_find(const struct net *net, u32 tunnel_id); +struct l2tp_tunnel *l2tp_tunnel_find_nth(const struct net *net, int nth); int l2tp_tunnel_create(struct net *net, int fd, int version, u32 tunnel_id, u32 peer_tunnel_id, struct l2tp_tunnel_cfg *cfg, diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 6fd41d7afe1ef27592970de333c75928264dc275..8b21af7321b928b4dcc5d7af3a6667380e9a949a 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -30,6 +30,9 @@ #include #include #include +#include +#include +#include #include "l2tp_core.h" @@ -127,8 +130,13 @@ static const struct net_device_ops l2tp_eth_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, }; +static struct device_type l2tpeth_type = { + .name = "l2tpeth", +}; + static void l2tp_eth_dev_setup(struct net_device *dev) { + SET_NETDEV_DEVTYPE(dev, &l2tpeth_type); ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; dev->features |= NETIF_F_LLTX; @@ -204,8 +212,58 @@ static void l2tp_eth_show(struct seq_file *m, void *arg) } #endif +static void l2tp_eth_adjust_mtu(struct l2tp_tunnel *tunnel, + struct l2tp_session *session, + struct net_device *dev) +{ + unsigned int overhead = 0; + struct dst_entry *dst; + u32 l3_overhead = 0; + + /* if the encap is UDP, account for UDP header size */ + if (tunnel->encap == L2TP_ENCAPTYPE_UDP) { + overhead += sizeof(struct udphdr); + dev->needed_headroom += sizeof(struct udphdr); + } + if (session->mtu != 0) { + dev->mtu = session->mtu; + dev->needed_headroom += session->hdr_len; + return; + } + lock_sock(tunnel->sock); + l3_overhead = kernel_sock_ip_overhead(tunnel->sock); + release_sock(tunnel->sock); + if (l3_overhead == 0) { + /* L3 Overhead couldn't be identified, this could be + * because tunnel->sock was NULL or the socket's + * address family was not IPv4 or IPv6, + * dev mtu stays at 1500. + */ + return; + } + /* Adjust MTU, factor overhead - underlay L3, overlay L2 hdr + * UDP overhead, if any, was already factored in above. 
+ */ + overhead += session->hdr_len + ETH_HLEN + l3_overhead; + + /* If PMTU discovery was enabled, use discovered MTU on L2TP device */ + dst = sk_dst_get(tunnel->sock); + if (dst) { + /* dst_mtu will use PMTU if found, else fallback to intf MTU */ + u32 pmtu = dst_mtu(dst); + + if (pmtu != 0) + dev->mtu = pmtu; + dst_release(dst); + } + session->mtu = dev->mtu - overhead; + dev->mtu = session->mtu; + dev->needed_headroom += session->hdr_len; +} + static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 peer_session_id, struct l2tp_session_cfg *cfg) { + unsigned char name_assign_type; struct net_device *dev; char name[IFNAMSIZ]; struct l2tp_tunnel *tunnel; @@ -222,15 +280,12 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p } if (cfg->ifname) { - dev = dev_get_by_name(net, cfg->ifname); - if (dev) { - dev_put(dev); - rc = -EEXIST; - goto out; - } strlcpy(name, cfg->ifname, IFNAMSIZ); - } else + name_assign_type = NET_NAME_USER; + } else { strcpy(name, L2TP_ETH_DEV_NAME); + name_assign_type = NET_NAME_ENUM; + } session = l2tp_session_create(sizeof(*spriv), tunnel, session_id, peer_session_id, cfg); @@ -239,7 +294,7 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p goto out; } - dev = alloc_netdev(sizeof(*priv), name, NET_NAME_UNKNOWN, + dev = alloc_netdev(sizeof(*priv), name, name_assign_type, l2tp_eth_dev_setup); if (!dev) { rc = -ENOMEM; @@ -247,12 +302,9 @@ static int l2tp_eth_create(struct net *net, u32 tunnel_id, u32 session_id, u32 p } dev_net_set(dev, net); - if (session->mtu == 0) - session->mtu = dev->mtu - session->hdr_len; - dev->mtu = session->mtu; - dev->needed_headroom += session->hdr_len; dev->min_mtu = 0; dev->max_mtu = ETH_MAX_MTU; + l2tp_eth_adjust_mtu(tunnel, session, dev); priv = netdev_priv(dev); priv->dev = dev; diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c index 7e3e669baac42df9d0be2864b927ba4f390258e9..12cfcd0ca807396d18e061e9bcf4c29e760b2563 100644 --- a/net/l2tp/l2tp_netlink.c +++ b/net/l2tp/l2tp_netlink.c @@ -521,11 +521,6 @@ static int l2tp_nl_cmd_session_create(struct sk_buff *skb, struct genl_info *inf goto out; } session_id = nla_get_u32(info->attrs[L2TP_ATTR_SESSION_ID]); - session = l2tp_session_find(net, tunnel, session_id); - if (session) { - ret = -EEXIST; - goto out; - } if (!info->attrs[L2TP_ATTR_PEER_SESSION_ID]) { ret = -EINVAL; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index cb4fff785cbf5aaad520442dc243ae62dc5750ea..8364fe5b59e4ca01ef8d05b1038dbe96fedf657b 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -142,7 +142,7 @@ static struct proto llc_proto = { .name = "LLC", .owner = THIS_MODULE, .obj_size = sizeof(struct llc_sock), - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, }; /** diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 8bc5a1bd2d453542df31506f543feb64b64cdd96..9b02c13d258b005bb10029b3baf8ec4db71f18b4 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -506,7 +506,7 @@ static struct sock *__llc_lookup_established(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_estab_match(sap, daddr, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || @@ -565,7 +565,7 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if 
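l2tp_eth_adjust_mtu() replaces the old "dev->mtu minus session header" guess with a computation that charges everything between the wire and the pseudowire payload: the underlay L3 header (via the new kernel_sock_ip_overhead()), the UDP header when UDP encapsulation is in use, the L2TP session header, and the inner Ethernet header, and it prefers a discovered PMTU when the tunnel socket has one cached. A worked example for a plain IPv4/UDP underlay; the 8-byte session header is an assumed value where the driver uses session->hdr_len:

	#include <stdio.h>

	int main(void)
	{
		unsigned int underlay_mtu = 1500;	/* or dst_mtu() with PMTU */
		unsigned int ipv4_hdr = 20, udp_hdr = 8;
		unsigned int l2tp_hdr = 8;		/* assumed session->hdr_len */
		unsigned int eth_hdr = 14;		/* ETH_HLEN */

		printf("l2tpeth mtu = %u\n",
		       underlay_mtu - (ipv4_hdr + udp_hdr + l2tp_hdr + eth_hdr));
		return 0;				/* prints 1450 */
	}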
(llc_listener_match(sap, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 5404d0d195cc581613e356b75bd70321e617673e..63b6ab0563705f4b15c6bc8e3d9c7ad85a2af381 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -328,7 +328,7 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap, again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_dgram_match(sap, laddr, rc)) { - /* Extra checks required by SLAB_DESTROY_BY_RCU */ + /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 4456559cb056d1e32a621351120327cf27541bf7..1b7a4daf283c5191c19eaa4f3571d316c866906a 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -357,14 +357,14 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta, spin_lock_init(&tid_agg_rx->reorder_lock); /* rx timer */ - tid_agg_rx->session_timer.function = sta_rx_agg_session_timer_expired; - tid_agg_rx->session_timer.data = (unsigned long)&sta->timer_to_tid[tid]; - init_timer_deferrable(&tid_agg_rx->session_timer); + setup_deferrable_timer(&tid_agg_rx->session_timer, + sta_rx_agg_session_timer_expired, + (unsigned long)&sta->timer_to_tid[tid]); /* rx reorder timer */ - tid_agg_rx->reorder_timer.function = sta_rx_agg_reorder_timer_expired; - tid_agg_rx->reorder_timer.data = (unsigned long)&sta->timer_to_tid[tid]; - init_timer(&tid_agg_rx->reorder_timer); + setup_timer(&tid_agg_rx->reorder_timer, + sta_rx_agg_reorder_timer_expired, + (unsigned long)&sta->timer_to_tid[tid]); /* prepare reordering buffer */ tid_agg_rx->reorder_buf = diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 45319cc01121a9eb17d49185a07a9675e7b2995d..60e2a62f7bef2fbb014f3a40403cf498a75d429c 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -670,14 +670,14 @@ int ieee80211_start_tx_ba_session(struct ieee80211_sta *pubsta, u16 tid, tid_tx->timeout = timeout; /* response timer */ - tid_tx->addba_resp_timer.function = sta_addba_resp_timer_expired; - tid_tx->addba_resp_timer.data = (unsigned long)&sta->timer_to_tid[tid]; - init_timer(&tid_tx->addba_resp_timer); + setup_timer(&tid_tx->addba_resp_timer, + sta_addba_resp_timer_expired, + (unsigned long)&sta->timer_to_tid[tid]); /* tx timer */ - tid_tx->session_timer.function = sta_tx_agg_session_timer_expired; - tid_tx->session_timer.data = (unsigned long)&sta->timer_to_tid[tid]; - init_timer_deferrable(&tid_tx->session_timer); + setup_deferrable_timer(&tid_tx->session_timer, + sta_tx_agg_session_timer_expired, + (unsigned long)&sta->timer_to_tid[tid]); /* assign a dialog token */ sta->ampdu_mlme.dialog_token_allocator++; diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ac879bb17870d4602fd151504f68d26c21eeac9b..6c2e6060cd549e3c4c39e78cc20ca9302472c4ac 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -3,7 +3,7 @@ * * Copyright 2006-2010 Johannes Berg * Copyright 2013-2015 Intel Mobile Communications GmbH - * Copyright (C) 2015-2016 Intel Deutschland GmbH + * Copyright (C) 2015-2017 Intel Deutschland GmbH * * This file is GPLv2 as found in COPYING. 
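The mac80211 timer hunks above are mechanical conversions to setup_timer()/setup_deferrable_timer(), which fold the old three-statement initialization into one call. Approximately (the real macros also wire up lockdep class keys):

	/*
	 *	setup_timer(t, fn, d);
	 * is roughly:
	 *	init_timer(t); t->function = fn; t->data = d;
	 *
	 * and setup_deferrable_timer() likewise wraps
	 * init_timer_deferrable().
	 */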
*/ @@ -22,11 +22,98 @@ #include "mesh.h" #include "wme.h" +static void ieee80211_set_mu_mimo_follow(struct ieee80211_sub_if_data *sdata, + struct vif_params *params) +{ + bool mu_mimo_groups = false; + bool mu_mimo_follow = false; + + if (params->vht_mumimo_groups) { + u64 membership; + + BUILD_BUG_ON(sizeof(membership) != WLAN_MEMBERSHIP_LEN); + + memcpy(sdata->vif.bss_conf.mu_group.membership, + params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN); + memcpy(sdata->vif.bss_conf.mu_group.position, + params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN, + WLAN_USER_POSITION_LEN); + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MU_GROUPS); + /* don't care about endianness - just check for 0 */ + memcpy(&membership, params->vht_mumimo_groups, + WLAN_MEMBERSHIP_LEN); + mu_mimo_groups = membership != 0; + } + + if (params->vht_mumimo_follow_addr) { + mu_mimo_follow = + is_valid_ether_addr(params->vht_mumimo_follow_addr); + ether_addr_copy(sdata->u.mntr.mu_follow_addr, + params->vht_mumimo_follow_addr); + } + + sdata->vif.mu_mimo_owner = mu_mimo_groups || mu_mimo_follow; +} + +static int ieee80211_set_mon_options(struct ieee80211_sub_if_data *sdata, + struct vif_params *params) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_sub_if_data *monitor_sdata; + + /* check flags first */ + if (params->flags && ieee80211_sdata_running(sdata)) { + u32 mask = MONITOR_FLAG_COOK_FRAMES | MONITOR_FLAG_ACTIVE; + + /* + * Prohibit MONITOR_FLAG_COOK_FRAMES and + * MONITOR_FLAG_ACTIVE to be changed while the + * interface is up. + * Else we would need to add a lot of cruft + * to update everything: + * cooked_mntrs, monitor and all fif_* counters + * reconfigure hardware + */ + if ((params->flags & mask) != (sdata->u.mntr.flags & mask)) + return -EBUSY; + } + + /* also validate MU-MIMO change */ + monitor_sdata = rtnl_dereference(local->monitor_sdata); + + if (!monitor_sdata && + (params->vht_mumimo_groups || params->vht_mumimo_follow_addr)) + return -EOPNOTSUPP; + + /* apply all changes now - no failures allowed */ + + if (monitor_sdata) + ieee80211_set_mu_mimo_follow(monitor_sdata, params); + + if (params->flags) { + if (ieee80211_sdata_running(sdata)) { + ieee80211_adjust_monitor_flags(sdata, -1); + sdata->u.mntr.flags = params->flags; + ieee80211_adjust_monitor_flags(sdata, 1); + + ieee80211_configure_filter(local); + } else { + /* + * Because the interface is down, ieee80211_do_stop + * and ieee80211_do_open take care of "everything" + * mentioned in the comment above. 
+ */ + sdata->u.mntr.flags = params->flags; + } + } + + return 0; +} + static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy, const char *name, unsigned char name_assign_type, enum nl80211_iftype type, - u32 *flags, struct vif_params *params) { struct ieee80211_local *local = wiphy_priv(wiphy); @@ -38,9 +125,14 @@ static struct wireless_dev *ieee80211_add_iface(struct wiphy *wiphy, if (err) return ERR_PTR(err); - if (type == NL80211_IFTYPE_MONITOR && flags) { - sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); - sdata->u.mntr.flags = *flags; + sdata = IEEE80211_WDEV_TO_SUB_IF(wdev); + + if (type == NL80211_IFTYPE_MONITOR) { + err = ieee80211_set_mon_options(sdata, params); + if (err) { + ieee80211_if_remove(sdata); + return NULL; + } } return wdev; @@ -55,7 +147,7 @@ static int ieee80211_del_iface(struct wiphy *wiphy, struct wireless_dev *wdev) static int ieee80211_change_iface(struct wiphy *wiphy, struct net_device *dev, - enum nl80211_iftype type, u32 *flags, + enum nl80211_iftype type, struct vif_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -75,58 +167,9 @@ static int ieee80211_change_iface(struct wiphy *wiphy, } if (sdata->vif.type == NL80211_IFTYPE_MONITOR) { - struct ieee80211_local *local = sdata->local; - struct ieee80211_sub_if_data *monitor_sdata; - u32 mu_mntr_cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER; - - monitor_sdata = rtnl_dereference(local->monitor_sdata); - if (monitor_sdata && - wiphy_ext_feature_isset(wiphy, mu_mntr_cap_flag)) { - memcpy(monitor_sdata->vif.bss_conf.mu_group.membership, - params->vht_mumimo_groups, WLAN_MEMBERSHIP_LEN); - memcpy(monitor_sdata->vif.bss_conf.mu_group.position, - params->vht_mumimo_groups + WLAN_MEMBERSHIP_LEN, - WLAN_USER_POSITION_LEN); - monitor_sdata->vif.mu_mimo_owner = true; - ieee80211_bss_info_change_notify(monitor_sdata, - BSS_CHANGED_MU_GROUPS); - - ether_addr_copy(monitor_sdata->u.mntr.mu_follow_addr, - params->macaddr); - } - - if (!flags) - return 0; - - if (ieee80211_sdata_running(sdata)) { - u32 mask = MONITOR_FLAG_COOK_FRAMES | - MONITOR_FLAG_ACTIVE; - - /* - * Prohibit MONITOR_FLAG_COOK_FRAMES and - * MONITOR_FLAG_ACTIVE to be changed while the - * interface is up. - * Else we would need to add a lot of cruft - * to update everything: - * cooked_mntrs, monitor and all fif_* counters - * reconfigure hardware - */ - if ((*flags & mask) != (sdata->u.mntr.flags & mask)) - return -EBUSY; - - ieee80211_adjust_monitor_flags(sdata, -1); - sdata->u.mntr.flags = *flags; - ieee80211_adjust_monitor_flags(sdata, 1); - - ieee80211_configure_filter(local); - } else { - /* - * Because the interface is down, ieee80211_do_stop - * and ieee80211_do_open take care of "everything" - * mentioned in the comment above. 
- */ - sdata->u.mntr.flags = *flags; - } + ret = ieee80211_set_mon_options(sdata, params); + if (ret) + return ret; } return 0; @@ -617,10 +660,11 @@ void sta_set_rate_info_tx(struct sta_info *sta, int shift = ieee80211_vif_get_shift(&sta->sdata->vif); u16 brate; - sband = sta->local->hw.wiphy->bands[ - ieee80211_get_sdata_band(sta->sdata)]; - brate = sband->bitrates[rate->idx].bitrate; - rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); + sband = ieee80211_get_sband(sta->sdata); + if (sband) { + brate = sband->bitrates[rate->idx].bitrate; + rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); + } } if (rate->flags & IEEE80211_TX_RC_40_MHZ_WIDTH) rinfo->bw = RATE_INFO_BW_40; @@ -696,11 +740,8 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, return 0; mutex_lock(&local->mtx); - mutex_lock(&local->iflist_mtx); if (local->use_chanctx) { - sdata = rcu_dereference_protected( - local->monitor_sdata, - lockdep_is_held(&local->iflist_mtx)); + sdata = rtnl_dereference(local->monitor_sdata); if (sdata) { ieee80211_vif_release_channel(sdata); ret = ieee80211_vif_use_channel(sdata, chandef, @@ -713,7 +754,6 @@ static int ieee80211_set_monitor_channel(struct wiphy *wiphy, if (ret == 0) local->monitor_chandef = *chandef; - mutex_unlock(&local->iflist_mtx); mutex_unlock(&local->mtx); return ret; @@ -1214,10 +1254,11 @@ static int sta_apply_parameters(struct ieee80211_local *local, int ret = 0; struct ieee80211_supported_band *sband; struct ieee80211_sub_if_data *sdata = sta->sdata; - enum nl80211_band band = ieee80211_get_sdata_band(sdata); u32 mask, set; - sband = local->hw.wiphy->bands[band]; + sband = ieee80211_get_sband(sdata); + if (!sband) + return -EINVAL; mask = params->sta_flags_mask; set = params->sta_flags_set; @@ -1350,7 +1391,7 @@ static int sta_apply_parameters(struct ieee80211_local *local, ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef, sband, params->supported_rates, params->supported_rates_len, - &sta->sta.supp_rates[band]); + &sta->sta.supp_rates[sband->band]); } if (params->ht_capa) @@ -1366,8 +1407,8 @@ static int sta_apply_parameters(struct ieee80211_local *local, /* returned value is only needed for rc update, but the * rc isn't initialized here yet, so ignore it */ - __ieee80211_vht_handle_opmode(sdata, sta, - params->opmode_notif, band); + __ieee80211_vht_handle_opmode(sdata, sta, params->opmode_notif, + sband->band); } if (params->support_p2p_ps >= 0) @@ -2005,13 +2046,15 @@ static int ieee80211_change_bss(struct wiphy *wiphy, struct bss_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - enum nl80211_band band; + struct ieee80211_supported_band *sband; u32 changed = 0; if (!sdata_dereference(sdata->u.ap.beacon, sdata)) return -ENOENT; - band = ieee80211_get_sdata_band(sdata); + sband = ieee80211_get_sband(sdata); + if (!sband) + return -EINVAL; if (params->use_cts_prot >= 0) { sdata->vif.bss_conf.use_cts_prot = params->use_cts_prot; @@ -2024,7 +2067,7 @@ static int ieee80211_change_bss(struct wiphy *wiphy, } if (!sdata->vif.bss_conf.use_short_slot && - band == NL80211_BAND_5GHZ) { + sband->band == NL80211_BAND_5GHZ) { sdata->vif.bss_conf.use_short_slot = true; changed |= BSS_CHANGED_ERP_SLOT; } @@ -2037,11 +2080,12 @@ static int ieee80211_change_bss(struct wiphy *wiphy, if (params->basic_rates) { ieee80211_parse_bitrates(&sdata->vif.bss_conf.chandef, - wiphy->bands[band], + wiphy->bands[sband->band], params->basic_rates, params->basic_rates_len, &sdata->vif.bss_conf.basic_rates); changed |= BSS_CHANGED_BASIC_RATES; + 
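ieee80211_set_mon_options() deduplicates the monitor-flag and MU-MIMO handling that ieee80211_add_iface() and ieee80211_change_iface() used to carry separately, and it preserves the two-phase discipline the old comment insisted on: every rejectable condition (flag changes on a running interface, MU-MIMO parameters without a monitor sdata) is evaluated before any state is written, after which the apply phase is not allowed to fail. The skeleton of that pattern, as a self-contained sketch with stub types:

	struct obj;
	struct params;

	static int validate(const struct obj *o, const struct params *p)
	{
		(void)o; (void)p;
		return 0;			/* every rejectable case lives here */
	}

	static void apply(struct obj *o, const struct params *p)
	{
		(void)o; (void)p;		/* past this point, no failures */
	}

	static int set_options(struct obj *o, const struct params *p)
	{
		int err = validate(o, p);	/* phase 1: object untouched */

		if (err)
			return err;

		apply(o, p);			/* phase 2: commit everything */
		return 0;
	}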
ieee80211_check_rate_mask(sdata); } if (params->ap_isolate >= 0) { @@ -2198,7 +2242,8 @@ ieee80211_sched_scan_start(struct wiphy *wiphy, } static int -ieee80211_sched_scan_stop(struct wiphy *wiphy, struct net_device *dev) +ieee80211_sched_scan_stop(struct wiphy *wiphy, struct net_device *dev, + u64 reqid) { struct ieee80211_local *local = wiphy_priv(wiphy); @@ -2630,6 +2675,33 @@ static int ieee80211_set_cqm_rssi_config(struct wiphy *wiphy, bss_conf->cqm_rssi_thold = rssi_thold; bss_conf->cqm_rssi_hyst = rssi_hyst; + bss_conf->cqm_rssi_low = 0; + bss_conf->cqm_rssi_high = 0; + sdata->u.mgd.last_cqm_event_signal = 0; + + /* tell the driver upon association, unless already associated */ + if (sdata->u.mgd.associated && + sdata->vif.driver_flags & IEEE80211_VIF_SUPPORTS_CQM_RSSI) + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_CQM); + + return 0; +} + +static int ieee80211_set_cqm_rssi_range_config(struct wiphy *wiphy, + struct net_device *dev, + s32 rssi_low, s32 rssi_high) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_vif *vif = &sdata->vif; + struct ieee80211_bss_conf *bss_conf = &vif->bss_conf; + + if (sdata->vif.driver_flags & IEEE80211_VIF_BEACON_FILTER) + return -EOPNOTSUPP; + + bss_conf->cqm_rssi_low = rssi_low; + bss_conf->cqm_rssi_high = rssi_high; + bss_conf->cqm_rssi_thold = 0; + bss_conf->cqm_rssi_hyst = 0; sdata->u.mgd.last_cqm_event_signal = 0; /* tell the driver upon association, unless already associated */ @@ -2658,6 +2730,21 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy, return ret; } + /* + * If active validate the setting and reject it if it doesn't leave + * at least one basic rate usable, since we really have to be able + * to send something, and if we're an AP we have to be able to do + * so at a basic rate so that all clients can receive it. 
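ieee80211_set_cqm_rssi_range_config() adds a second style of RSSI monitoring next to the existing threshold-plus-hysteresis one; the two are mutually exclusive, which is why each setter zeroes the other's fields before notifying the driver, and drivers doing beacon filtering cannot honor the range style here (hence the -EOPNOTSUPP). The range variant reports only transitions out of the [rssi_low, rssi_high] window; its decision logic amounts to the following illustration (not kernel code):

	static int cqm_range_event(int rssi, int low, int high)
	{
		if (rssi < low)
			return -1;	/* left the window downward: report */
		if (rssi > high)
			return 1;	/* left the window upward: report */
		return 0;		/* inside [low, high]: stay quiet */
	}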
+ */ + if (rcu_access_pointer(sdata->vif.chanctx_conf) && + sdata->vif.bss_conf.chandef.chan) { + u32 basic_rates = sdata->vif.bss_conf.basic_rates; + enum nl80211_band band = sdata->vif.bss_conf.chandef.chan->band; + + if (!(mask->control[band].legacy & basic_rates)) + return -EINVAL; + } + for (i = 0; i < NUM_NL80211_BANDS; i++) { struct ieee80211_supported_band *sband = wiphy->bands[i]; int j; @@ -3639,6 +3726,7 @@ const struct cfg80211_ops mac80211_config_ops = { .mgmt_tx = ieee80211_mgmt_tx, .mgmt_tx_cancel_wait = ieee80211_mgmt_tx_cancel_wait, .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, + .set_cqm_rssi_range_config = ieee80211_set_cqm_rssi_range_config, .mgmt_frame_register = ieee80211_mgmt_frame_register, .set_antenna = ieee80211_set_antenna, .get_antenna = ieee80211_get_antenna, diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 98999d3d5262743cf32ef92480772f034e70baf1..364d4e13764942db0b41eee7a9561b3c00e946ae 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -66,6 +66,8 @@ ieee80211_ibss_build_presp(struct ieee80211_sub_if_data *sdata, 2 + (IEEE80211_MAX_SUPP_RATES - 8) + 2 + sizeof(struct ieee80211_ht_cap) + 2 + sizeof(struct ieee80211_ht_operation) + + 2 + sizeof(struct ieee80211_vht_cap) + + 2 + sizeof(struct ieee80211_vht_operation) + ifibss->ie_len; presp = kzalloc(sizeof(*presp) + frame_len, GFP_KERNEL); if (!presp) @@ -425,7 +427,7 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, case NL80211_CHAN_WIDTH_5: case NL80211_CHAN_WIDTH_10: cfg80211_chandef_create(&chandef, cbss->channel, - NL80211_CHAN_WIDTH_20_NOHT); + NL80211_CHAN_NO_HT); chandef.width = sdata->u.ibss.chandef.width; break; case NL80211_CHAN_WIDTH_80: @@ -437,7 +439,7 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, default: /* fall back to 20 MHz for unsupported modes */ cfg80211_chandef_create(&chandef, cbss->channel, - NL80211_CHAN_WIDTH_20_NOHT); + NL80211_CHAN_NO_HT); break; } @@ -992,7 +994,7 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, enum nl80211_band band = rx_status->band; enum nl80211_bss_scan_width scan_width; struct ieee80211_local *local = sdata->local; - struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; + struct ieee80211_supported_band *sband; bool rates_updated = false; u32 supp_rates = 0; @@ -1002,6 +1004,10 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, if (!ether_addr_equal(mgmt->bssid, sdata->u.ibss.bssid)) return; + sband = local->hw.wiphy->bands[band]; + if (WARN_ON(!sband)) + return; + rcu_read_lock(); sta = sta_info_get(sdata, mgmt->sa); @@ -1014,9 +1020,9 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata, prev_rates = sta->sta.supp_rates[band]; /* make sure mandatory rates are always added */ scan_width = NL80211_BSS_CHAN_WIDTH_20; - if (rx_status->flag & RX_FLAG_5MHZ) + if (rx_status->bw == RATE_INFO_BW_5) scan_width = NL80211_BSS_CHAN_WIDTH_5; - if (rx_status->flag & RX_FLAG_10MHZ) + else if (rx_status->bw == RATE_INFO_BW_10) scan_width = NL80211_BSS_CHAN_WIDTH_10; sta->sta.supp_rates[band] = supp_rates | diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 0e718437d080e7258efe75bcba26ce65990671ce..f8f6c148f5545feeb78c16ca6f33ebf5e02d0ab5 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -839,6 +839,8 @@ struct txq_info { struct ieee80211_if_mntr { u32 flags; u8 mu_follow_addr[ETH_ALEN] __aligned(2); + + struct list_head list; }; /** @@ 
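The added check in ieee80211_set_bitrate_mask() enforces exactly what its comment says: when a channel context is active, the legacy mask for the operating band must retain at least one of the BSS's basic rates, since those are the rates every peer must be able to receive. The test itself is a single intersection:

	#include <stdint.h>

	/* nonzero iff the mask still allows some basic rate to be sent */
	static int mask_keeps_a_basic_rate(uint32_t legacy_mask,
					   uint32_t basic_rates)
	{
		return (legacy_mask & basic_rates) != 0;
	}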
-999,21 +1001,6 @@ sdata_assert_lock(struct ieee80211_sub_if_data *sdata) lockdep_assert_held(&sdata->wdev.mtx); } -static inline enum nl80211_band -ieee80211_get_sdata_band(struct ieee80211_sub_if_data *sdata) -{ - enum nl80211_band band = NL80211_BAND_2GHZ; - struct ieee80211_chanctx_conf *chanctx_conf; - - rcu_read_lock(); - chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); - if (!WARN_ON(!chanctx_conf)) - band = chanctx_conf->def.chan->band; - rcu_read_unlock(); - - return band; -} - static inline int ieee80211_chandef_get_shift(struct cfg80211_chan_def *chandef) { @@ -1259,6 +1246,7 @@ struct ieee80211_local { /* see iface.c */ struct list_head interfaces; + struct list_head mon_list; /* only that are IFF_UP && !cooked */ struct mutex iflist_mtx; /* @@ -1418,6 +1406,27 @@ IEEE80211_WDEV_TO_SUB_IF(struct wireless_dev *wdev) return container_of(wdev, struct ieee80211_sub_if_data, wdev); } +static inline struct ieee80211_supported_band * +ieee80211_get_sband(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_chanctx_conf *chanctx_conf; + enum nl80211_band band; + + rcu_read_lock(); + chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf); + + if (WARN_ON(!chanctx_conf)) { + rcu_read_unlock(); + return NULL; + } + + band = chanctx_conf->def.chan->band; + rcu_read_unlock(); + + return local->hw.wiphy->bands[band]; +} + /* this struct represents 802.11n's RA/TID combination */ struct ieee80211_ra_tid { u8 ra[ETH_ALEN]; @@ -1474,6 +1483,7 @@ struct ieee802_11_elems { const u8 *opmode_notif; const struct ieee80211_sec_chan_offs_ie *sec_chan_offs; const struct ieee80211_mesh_chansw_params_ie *mesh_chansw_params_ie; + const struct ieee80211_bss_max_idle_period_ie *max_idle_period_ie; /* length of them, respectively */ u8 ext_capab_len; @@ -1527,9 +1537,9 @@ ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status) status->flag & RX_FLAG_MACTIME_END); if (status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END)) return true; - /* can't handle HT/VHT preamble yet */ + /* can't handle non-legacy preamble yet */ if (status->flag & RX_FLAG_MACTIME_PLCP_START && - !(status->flag & (RX_FLAG_HT | RX_FLAG_VHT))) + status->encoding != RX_ENC_LEGACY) return true; return false; } diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 5bb0c501281954dfe656c5e886c9032b958061be..3bd5b81f5d81ec7d73686043c2683630e24ecde4 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -676,7 +676,8 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) set_bit(SDATA_STATE_RUNNING, &sdata->state); - if (sdata->vif.type == NL80211_IFTYPE_WDS) { + switch (sdata->vif.type) { + case NL80211_IFTYPE_WDS: /* Create STA entry for the WDS peer */ sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr, GFP_KERNEL); @@ -697,8 +698,17 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) rate_control_rate_init(sta); netif_carrier_on(dev); - } else if (sdata->vif.type == NL80211_IFTYPE_P2P_DEVICE) { + break; + case NL80211_IFTYPE_P2P_DEVICE: rcu_assign_pointer(local->p2p_sdata, sdata); + break; + case NL80211_IFTYPE_MONITOR: + if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) + break; + list_add_tail_rcu(&sdata->u.mntr.list, &local->mon_list); + break; + default: + break; } /* @@ -817,6 +827,11 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_AP: cancel_work_sync(&sdata->u.ap.request_smps_work); break; + case NL80211_IFTYPE_MONITOR: + if (sdata->u.mntr.flags & 
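ieee80211_get_sband(), added above, is the NULL-returning replacement for the removed ieee80211_get_sdata_band(), which would warn and then silently assume 2 GHz when the vif had no channel context. The follow-on hunks in cfg, mesh and mlme all convert callers to the same defensive pattern, taken from the patch itself:

	sband = ieee80211_get_sband(sdata);
	if (!sband)
		return -EINVAL;	/* no channel context: fail, don't guess */
	band = sband->band;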
MONITOR_FLAG_COOK_FRAMES) + break; + list_del_rcu(&sdata->u.mntr.list); + break; default: break; } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 56fb47953b72420b3ceb4e3f5d27dc9c4d23928b..8aa1f5b6a05145b0838b494abdf2b2c5b0aa17ab 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -253,6 +253,7 @@ static void ieee80211_restart_work(struct work_struct *work) WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), "%s called with hardware scan in progress\n", __func__); + flush_work(&local->radar_detected_work); rtnl_lock(); list_for_each_entry(sdata, &local->interfaces, list) flush_delayed_work(&sdata->dec_tailroom_needed_wk); @@ -603,6 +604,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, ARRAY_SIZE(local->ext_capa); INIT_LIST_HEAD(&local->interfaces); + INIT_LIST_HEAD(&local->mon_list); __hw_addr_init(&local->mc_list); @@ -1186,6 +1188,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) cancel_work_sync(&local->reconfig_filter); cancel_work_sync(&local->tdls_chsw_work); flush_work(&local->sched_scan_stopped_work); + flush_work(&local->radar_detected_work); ieee80211_clear_tx_pending(local); rate_control_deinitialize(local); diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c index 6e7b6a07b7d536a64f7e9ec296adf2bbba8d961c..737e1f082b0ddd55e03b30bf215a5fbb68943143 100644 --- a/net/mac80211/mesh.c +++ b/net/mac80211/mesh.c @@ -63,6 +63,7 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; u32 basic_rates = 0; struct cfg80211_chan_def sta_chan_def; + struct ieee80211_supported_band *sband; /* * As support for each feature is added, check for matching @@ -83,7 +84,11 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata, (ifmsh->mesh_auth_id == ie->mesh_config->meshconf_auth))) return false; - ieee80211_sta_get_rates(sdata, ie, ieee80211_get_sdata_band(sdata), + sband = ieee80211_get_sband(sdata); + if (!sband) + return false; + + ieee80211_sta_get_rates(sdata, ie, sband->band, &basic_rates); if (sdata->vif.bss_conf.basic_rates != basic_rates) @@ -399,12 +404,13 @@ static int mesh_add_ds_params_ie(struct ieee80211_sub_if_data *sdata, int mesh_add_ht_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { - struct ieee80211_local *local = sdata->local; - enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband; u8 *pos; - sband = local->hw.wiphy->bands[band]; + sband = ieee80211_get_sband(sdata); + if (!sband) + return -EINVAL; + if (!sband->ht_cap.ht_supported || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || @@ -462,12 +468,13 @@ int mesh_add_ht_oper_ie(struct ieee80211_sub_if_data *sdata, int mesh_add_vht_cap_ie(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) { - struct ieee80211_local *local = sdata->local; - enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband; u8 *pos; - sband = local->hw.wiphy->bands[band]; + sband = ieee80211_get_sband(sdata); + if (!sband) + return -EINVAL; + if (!sband->vht_cap.vht_supported || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_20_NOHT || sdata->vif.bss_conf.chandef.width == NL80211_CHAN_WIDTH_5 || @@ -916,12 +923,16 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, struct cfg80211_csa_settings params; struct ieee80211_csa_ie csa_ie; struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh; - enum nl80211_band band = 
ieee80211_get_sdata_band(sdata); + struct ieee80211_supported_band *sband; int err; u32 sta_flags; sdata_assert_lock(sdata); + sband = ieee80211_get_sband(sdata); + if (!sband) + return false; + sta_flags = IEEE80211_STA_DISABLE_VHT; switch (sdata->vif.bss_conf.chandef.width) { case NL80211_CHAN_WIDTH_20_NOHT: @@ -935,7 +946,7 @@ ieee80211_mesh_process_chnswitch(struct ieee80211_sub_if_data *sdata, memset(¶ms, 0, sizeof(params)); memset(&csa_ie, 0, sizeof(csa_ie)); - err = ieee80211_parse_ch_switch_ie(sdata, elems, band, + err = ieee80211_parse_ch_switch_ie(sdata, elems, sband->band, sta_flags, sdata->vif.addr, &csa_ie); if (err < 0) @@ -1100,8 +1111,14 @@ static void ieee80211_mesh_rx_bcn_presp(struct ieee80211_sub_if_data *sdata, if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; - if (mesh_matches_local(sdata, &elems)) - mesh_neighbour_update(sdata, mgmt->sa, &elems); + if (mesh_matches_local(sdata, &elems)) { + mpl_dbg(sdata, "rssi_threshold=%d,rx_status->signal=%d\n", + sdata->u.mesh.mshcfg.rssi_threshold, rx_status->signal); + if (!sdata->u.mesh.user_mpm || + sdata->u.mesh.mshcfg.rssi_threshold == 0 || + sdata->u.mesh.mshcfg.rssi_threshold < rx_status->signal) + mesh_neighbour_update(sdata, mgmt->sa, &elems); + } if (ifmsh->sync_ops) ifmsh->sync_ops->rx_bcn_presp(sdata, diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index b747c9645e432bffe5b9d266a51e7ffca3b75ec3..4005edd71fe86db5415bf0bb9803dac7248ae77a 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -16,6 +16,7 @@ #define TEST_FRAME_LEN 8192 #define MAX_METRIC 0xffffffff #define ARITH_SHIFT 8 +#define LINK_FAIL_THRESH 95 #define MAX_PREQ_QUEUE_LEN 64 @@ -307,10 +308,12 @@ void ieee80211s_update_metric(struct ieee80211_local *local, failed = !(txinfo->flags & IEEE80211_TX_STAT_ACK); - /* moving average, scaled to 100 */ - sta->mesh->fail_avg = - ((80 * sta->mesh->fail_avg + 5) / 100 + 20 * failed); - if (sta->mesh->fail_avg > 95) + /* moving average, scaled to 100. + * feed failure as 100 and success as 0 + */ + ewma_mesh_fail_avg_add(&sta->mesh->fail_avg, failed * 100); + if (ewma_mesh_fail_avg_read(&sta->mesh->fail_avg) > + LINK_FAIL_THRESH) mesh_plink_broken(sta); } @@ -325,6 +328,8 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local, int rate, err; u32 tx_time, estimated_retx; u64 result; + unsigned long fail_avg = + ewma_mesh_fail_avg_read(&sta->mesh->fail_avg); /* Try to get rate based on HW/SW RC algorithm. * Rate is returned in units of Kbps, correct this @@ -336,7 +341,7 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local, if (rate) { err = 0; } else { - if (sta->mesh->fail_avg >= 100) + if (fail_avg > LINK_FAIL_THRESH) return MAX_METRIC; sta_set_rate_info_tx(sta, &sta->tx_stats.last_rate, &rinfo); @@ -344,7 +349,7 @@ static u32 airtime_link_metric_get(struct ieee80211_local *local, if (WARN_ON(!rate)) return MAX_METRIC; - err = (sta->mesh->fail_avg << ARITH_SHIFT) / 100; + err = (fail_avg << ARITH_SHIFT) / 100; } /* bitrate is in units of 100 Kbps, while we need rate in units of @@ -484,6 +489,9 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, ? 
mpath->exp_time : exp_time; mesh_path_activate(mpath); spin_unlock_bh(&mpath->state_lock); + ewma_mesh_fail_avg_init(&sta->mesh->fail_avg); + /* init it at a low value - 0 start is tricky */ + ewma_mesh_fail_avg_add(&sta->mesh->fail_avg, 1); mesh_path_tx_pending(mpath); /* draft says preq_id should be saved too, but there does * not seem to be any use for it, skipping for now @@ -522,6 +530,9 @@ static u32 hwmp_route_info_get(struct ieee80211_sub_if_data *sdata, ? mpath->exp_time : exp_time; mesh_path_activate(mpath); spin_unlock_bh(&mpath->state_lock); + ewma_mesh_fail_avg_init(&sta->mesh->fail_avg); + /* init it at a low value - 0 start is tricky */ + ewma_mesh_fail_avg_add(&sta->mesh->fail_avg, 1); mesh_path_tx_pending(mpath); } else spin_unlock_bh(&mpath->state_lock); diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c index f0e6175a9821f01d7aac2dfbda02c1ee5eeb31ec..97269caafecd7b52e644e9bb645d305fdfb67196 100644 --- a/net/mac80211/mesh_pathtbl.c +++ b/net/mac80211/mesh_pathtbl.c @@ -397,11 +397,10 @@ struct mesh_path *mesh_path_new(struct ieee80211_sub_if_data *sdata, new_mpath->sdata = sdata; new_mpath->flags = 0; skb_queue_head_init(&new_mpath->frame_queue); - new_mpath->timer.data = (unsigned long) new_mpath; - new_mpath->timer.function = mesh_path_timer; new_mpath->exp_time = jiffies; spin_lock_init(&new_mpath->state_lock); - init_timer(&new_mpath->timer); + setup_timer(&new_mpath->timer, mesh_path_timer, + (unsigned long) new_mpath); return new_mpath; } @@ -829,6 +828,9 @@ void mesh_path_fix_nexthop(struct mesh_path *mpath, struct sta_info *next_hop) mpath->flags = MESH_PATH_FIXED | MESH_PATH_SN_VALID; mesh_path_activate(mpath); spin_unlock_bh(&mpath->state_lock); + ewma_mesh_fail_avg_init(&next_hop->mesh->fail_avg); + /* init it at a low value - 0 start is tricky */ + ewma_mesh_fail_avg_add(&next_hop->mesh->fail_avg, 1); mesh_path_tx_pending(mpath); } diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 953d71e784a9ab71cb8494478e39eaf3b9464211..1131cd504a15ef5fa507c37c8a5464aac173321f 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -95,19 +95,23 @@ static inline void mesh_plink_fsm_restart(struct sta_info *sta) static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; - enum nl80211_band band = ieee80211_get_sdata_band(sdata); - struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; + struct ieee80211_supported_band *sband; struct sta_info *sta; u32 erp_rates = 0, changed = 0; int i; bool short_slot = false; - if (band == NL80211_BAND_5GHZ) { + sband = ieee80211_get_sband(sdata); + if (!sband) + return changed; + + if (sband->band == NL80211_BAND_5GHZ) { /* (IEEE 802.11-2012 19.4.5) */ short_slot = true; goto out; - } else if (band != NL80211_BAND_2GHZ) + } else if (sband->band != NL80211_BAND_2GHZ) { goto out; + } for (i = 0; i < sband->n_bitrates; i++) if (sband->bitrates[i].flags & IEEE80211_RATE_ERP_G) @@ -123,7 +127,7 @@ static u32 mesh_set_short_slot_time(struct ieee80211_sub_if_data *sdata) continue; short_slot = false; - if (erp_rates & sta->sta.supp_rates[band]) + if (erp_rates & sta->sta.supp_rates[sband->band]) short_slot = true; else break; @@ -249,7 +253,15 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, mgmt->u.action.u.self_prot.action_code = action; if (action != WLAN_SP_MESH_PEERING_CLOSE) { - enum nl80211_band band = ieee80211_get_sdata_band(sdata); + struct ieee80211_supported_band *sband; + enum 
nl80211_band band; + + sband = ieee80211_get_sband(sdata); + if (!sband) { + err = -EINVAL; + goto free; + } + band = sband->band; /* capability info */ pos = skb_put(skb, 2); @@ -395,13 +407,16 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, struct ieee802_11_elems *elems, bool insert) { struct ieee80211_local *local = sdata->local; - enum nl80211_band band = ieee80211_get_sdata_band(sdata); struct ieee80211_supported_band *sband; u32 rates, basic_rates = 0, changed = 0; enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth; - sband = local->hw.wiphy->bands[band]; - rates = ieee80211_sta_get_rates(sdata, elems, band, &basic_rates); + sband = ieee80211_get_sband(sdata); + if (!sband) + return; + + rates = ieee80211_sta_get_rates(sdata, elems, sband->band, + &basic_rates); spin_lock_bh(&sta->mesh->plink_lock); sta->rx_stats.last_rx = jiffies; @@ -412,9 +427,9 @@ static void mesh_sta_info_init(struct ieee80211_sub_if_data *sdata, goto out; sta->mesh->processed_beacon = true; - if (sta->sta.supp_rates[band] != rates) + if (sta->sta.supp_rates[sband->band] != rates) changed |= IEEE80211_RC_SUPP_RATES_CHANGED; - sta->sta.supp_rates[band] = rates; + sta->sta.supp_rates[sband->band] = rates; if (ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband, elems->ht_cap_elem, sta)) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 6e90301154d5a6ba4a7e9bb612ac8243cf09a23c..0ea9712bd99ea698f40b068d5ee31a69133224a3 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -6,7 +6,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007, Michael Wu * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2015 - 2016 Intel Deutschland GmbH + * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -1855,11 +1855,16 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, u16 capab, bool erp_valid, u8 erp) { struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf; + struct ieee80211_supported_band *sband; u32 changed = 0; bool use_protection; bool use_short_preamble; bool use_short_slot; + sband = ieee80211_get_sband(sdata); + if (!sband) + return changed; + if (erp_valid) { use_protection = (erp & WLAN_ERP_USE_PROTECTION) != 0; use_short_preamble = (erp & WLAN_ERP_BARKER_PREAMBLE) == 0; @@ -1869,7 +1874,7 @@ static u32 ieee80211_handle_bss_capability(struct ieee80211_sub_if_data *sdata, } use_short_slot = !!(capab & WLAN_CAPABILITY_SHORT_SLOT_TIME); - if (ieee80211_get_sdata_band(sdata) == NL80211_BAND_5GHZ) + if (sband->band == NL80211_BAND_5GHZ) use_short_slot = true; if (use_protection != bss_conf->use_cts_prot) { @@ -1908,6 +1913,8 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, sdata->u.mgd.associated = cbss; memcpy(sdata->u.mgd.bssid, cbss->bssid, ETH_ALEN); + ieee80211_check_rate_mask(sdata); + sdata->u.mgd.flags |= IEEE80211_STA_RESET_SIGNAL_AVE; if (sdata->vif.p2p || @@ -2797,8 +2804,9 @@ static void ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, reason_code = le16_to_cpu(mgmt->u.disassoc.reason_code); - sdata_info(sdata, "disassociated from %pM (Reason: %u)\n", - mgmt->sa, reason_code); + sdata_info(sdata, "disassociated from %pM (Reason: %u=%s)\n", + mgmt->sa, reason_code, + ieee80211_get_reason_code_string(reason_code)); ieee80211_set_disassoc(sdata, 0, 0, false, NULL); @@ -2822,15 +2830,15 @@ static void ieee80211_get_rates(struct 
ieee80211_supported_band *sband, *have_higher_than_11mbit = true; /* - * BSS_MEMBERSHIP_SELECTOR_HT_PHY is defined in 802.11n-2009 - * 7.3.2.2 as a magic value instead of a rate. Hence, skip it. + * Skip HT and VHT BSS membership selectors since they're not + * rates. * - * Note: Even through the membership selector and the basic + * Note: Even though the membership selector and the basic * rate flag share the same bit, they are not exactly * the same. */ - if (!!(supp_rates[i] & 0x80) && - (supp_rates[i] & 0x7f) == BSS_MEMBERSHIP_SELECTOR_HT_PHY) + if (supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_HT_PHY) || + supp_rates[i] == (0x80 | BSS_MEMBERSHIP_SELECTOR_VHT_PHY)) continue; for (j = 0; j < sband->n_bitrates; j++) { @@ -3001,7 +3009,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out; } - sband = local->hw.wiphy->bands[ieee80211_get_sdata_band(sdata)]; + sband = ieee80211_get_sband(sdata); + if (!sband) { + mutex_unlock(&sdata->local->sta_mtx); + ret = false; + goto out; + } /* Set up internal HT/VHT capabilities */ if (elems.ht_cap_elem && !(ifmgd->flags & IEEE80211_STA_DISABLE_HT)) @@ -3085,6 +3098,18 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } changed |= BSS_CHANGED_QOS; + if (elems.max_idle_period_ie) { + bss_conf->max_idle_period = + le16_to_cpu(elems.max_idle_period_ie->max_idle_period); + bss_conf->protected_keep_alive = + !!(elems.max_idle_period_ie->idle_options & + WLAN_IDLE_OPTIONS_PROTECTED_KEEP_ALIVE); + changed |= BSS_CHANGED_KEEP_ALIVE; + } else { + bss_conf->max_idle_period = 0; + bss_conf->protected_keep_alive = false; + } + /* set AID and assoc capability, * ieee80211_set_associated() will tell the driver */ bss_conf->aid = aid; @@ -3430,6 +3455,30 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, } } + if (bss_conf->cqm_rssi_low && + ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT) { + int sig = -ewma_beacon_signal_read(&ifmgd->ave_beacon_signal); + int last_event = ifmgd->last_cqm_event_signal; + int low = bss_conf->cqm_rssi_low; + int high = bss_conf->cqm_rssi_high; + + if (sig < low && + (last_event == 0 || last_event >= low)) { + ifmgd->last_cqm_event_signal = sig; + ieee80211_cqm_rssi_notify( + &sdata->vif, + NL80211_CQM_RSSI_THRESHOLD_EVENT_LOW, + sig, GFP_KERNEL); + } else if (sig > high && + (last_event == 0 || last_event <= high)) { + ifmgd->last_cqm_event_signal = sig; + ieee80211_cqm_rssi_notify( + &sdata->vif, + NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH, + sig, GFP_KERNEL); + } + } + if (ifmgd->flags & IEEE80211_STA_CONNECTION_POLL) { mlme_dbg_ratelimited(sdata, "cancelling AP probe due to a received beacon\n"); @@ -4333,6 +4382,10 @@ static int ieee80211_prep_connection(struct ieee80211_sub_if_data *sdata, if (WARN_ON(!ifmgd->auth_data && !ifmgd->assoc_data)) return -EINVAL; + /* If a reconfig is happening, bail out */ + if (local->in_reconfig) + return -EBUSY; + if (assoc) { rcu_read_lock(); have_sta = sta_info_get(sdata, cbss->bssid); diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index 76a8bcd8ef11237cf8715bf6a10c9085a2152d4d..a87d195c4a615761ed6041296b417ec3c9a8f996 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -10,7 +10,7 @@ static void ieee80211_sched_scan_cancel(struct ieee80211_local *local) { if (ieee80211_request_sched_scan_stop(local)) return; - cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy); + cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy, 0); } int __ieee80211_suspend(struct ieee80211_hw *hw, struct 
cfg80211_wowlan *wowlan) diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 206698bc93f406939bb5d883b6ab2f04bc1a3bed..ea1f4315c521be976c822918d7d821cffd3cc0e9 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -2,6 +2,7 @@ * Copyright 2002-2005, Instant802 Networks, Inc. * Copyright 2005-2006, Devicescape Software, Inc. * Copyright (c) 2006 Jiri Benc + * Copyright 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -61,6 +62,28 @@ void rate_control_rate_init(struct sta_info *sta) set_sta_flag(sta, WLAN_STA_RATE_CONTROL); } +void rate_control_tx_status(struct ieee80211_local *local, + struct ieee80211_supported_band *sband, + struct ieee80211_tx_status *st) +{ + struct rate_control_ref *ref = local->rate_ctrl; + struct sta_info *sta = container_of(st->sta, struct sta_info, sta); + void *priv_sta = sta->rate_ctrl_priv; + + if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) + return; + + spin_lock_bh(&sta->rate_ctrl_lock); + if (ref->ops->tx_status_ext) + ref->ops->tx_status_ext(ref->priv, sband, priv_sta, st); + else if (st->skb) + ref->ops->tx_status(ref->priv, sband, st->sta, priv_sta, st->skb); + else + WARN_ON_ONCE(1); + + spin_unlock_bh(&sta->rate_ctrl_lock); +} + void rate_control_rate_update(struct ieee80211_local *local, struct ieee80211_supported_band *sband, struct sta_info *sta, u32 changed) @@ -173,9 +196,11 @@ ieee80211_rate_control_ops_get(const char *name) /* try default if specific alg requested but not found */ ops = ieee80211_try_rate_control_ops_get(ieee80211_default_rc_algo); - /* try built-in one if specific alg requested but not found */ - if (!ops && strlen(CONFIG_MAC80211_RC_DEFAULT)) + /* Note: check for > 0 is intentional to avoid clang warning */ + if (!ops && (strlen(CONFIG_MAC80211_RC_DEFAULT) > 0)) + /* try built-in one if specific alg requested but not found */ ops = ieee80211_try_rate_control_ops_get(CONFIG_MAC80211_RC_DEFAULT); + kernel_param_unlock(THIS_MODULE); return ops; @@ -208,7 +233,6 @@ static struct rate_control_ref *rate_control_alloc(const char *name, ref = kmalloc(sizeof(struct rate_control_ref), GFP_KERNEL); if (!ref) return NULL; - ref->local = local; ref->ops = ieee80211_rate_control_ops_get(name); if (!ref->ops) goto free; @@ -229,18 +253,45 @@ static struct rate_control_ref *rate_control_alloc(const char *name, return NULL; } -static void rate_control_free(struct rate_control_ref *ctrl_ref) +static void rate_control_free(struct ieee80211_local *local, + struct rate_control_ref *ctrl_ref) { ctrl_ref->ops->free(ctrl_ref->priv); #ifdef CONFIG_MAC80211_DEBUGFS - debugfs_remove_recursive(ctrl_ref->local->debugfs.rcdir); - ctrl_ref->local->debugfs.rcdir = NULL; + debugfs_remove_recursive(local->debugfs.rcdir); + local->debugfs.rcdir = NULL; #endif kfree(ctrl_ref); } +void ieee80211_check_rate_mask(struct ieee80211_sub_if_data *sdata) +{ + struct ieee80211_local *local = sdata->local; + struct ieee80211_supported_band *sband; + u32 user_mask, basic_rates = sdata->vif.bss_conf.basic_rates; + enum nl80211_band band; + + if (WARN_ON(!sdata->vif.bss_conf.chandef.chan)) + return; + + if (WARN_ON_ONCE(!basic_rates)) + return; + + band = sdata->vif.bss_conf.chandef.chan->band; + user_mask = sdata->rc_rateidx_mask[band]; + sband = local->hw.wiphy->bands[band]; + + if (user_mask & basic_rates) + return; + + sdata_dbg(sdata, + "no overlap between basic rates (0x%x) and user mask (0x%x on band %d) - clearing the 
latter", + basic_rates, user_mask, band); + sdata->rc_rateidx_mask[band] = (1 << sband->n_bitrates) - 1; +} + static bool rc_no_data_or_no_ack_use_min(struct ieee80211_tx_rate_control *txrc) { struct sk_buff *skb = txrc->skb; @@ -875,7 +926,9 @@ int rate_control_set_rates(struct ieee80211_hw *hw, struct ieee80211_sta_rates *old; struct ieee80211_supported_band *sband; - sband = hw->wiphy->bands[ieee80211_get_sdata_band(sta->sdata)]; + sband = ieee80211_get_sband(sta->sdata); + if (!sband) + return -EINVAL; rate_control_apply_mask_ratetbl(sta, sband, rates); /* * mac80211 guarantees that this function will not be called @@ -936,6 +989,6 @@ void rate_control_deinitialize(struct ieee80211_local *local) return; local->rate_ctrl = NULL; - rate_control_free(ref); + rate_control_free(local, ref); } diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 8d3260785b940d3ea87679f86b98015ce1b4fbbd..8212bfeb71d6b382fda7e2efe243304c7475db2a 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -20,7 +20,6 @@ #include "driver-ops.h" struct rate_control_ref { - struct ieee80211_local *local; const struct rate_control_ops *ops; void *priv; }; @@ -29,47 +28,9 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_tx_rate_control *txrc); -static inline void rate_control_tx_status(struct ieee80211_local *local, - struct ieee80211_supported_band *sband, - struct sta_info *sta, - struct sk_buff *skb) -{ - struct rate_control_ref *ref = local->rate_ctrl; - struct ieee80211_sta *ista = &sta->sta; - void *priv_sta = sta->rate_ctrl_priv; - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - - if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) - return; - - spin_lock_bh(&sta->rate_ctrl_lock); - if (ref->ops->tx_status) - ref->ops->tx_status(ref->priv, sband, ista, priv_sta, skb); - else - ref->ops->tx_status_noskb(ref->priv, sband, ista, priv_sta, info); - spin_unlock_bh(&sta->rate_ctrl_lock); -} - -static inline void -rate_control_tx_status_noskb(struct ieee80211_local *local, - struct ieee80211_supported_band *sband, - struct sta_info *sta, - struct ieee80211_tx_info *info) -{ - struct rate_control_ref *ref = local->rate_ctrl; - struct ieee80211_sta *ista = &sta->sta; - void *priv_sta = sta->rate_ctrl_priv; - - if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) - return; - - if (WARN_ON_ONCE(!ref->ops->tx_status_noskb)) - return; - - spin_lock_bh(&sta->rate_ctrl_lock); - ref->ops->tx_status_noskb(ref->priv, sband, ista, priv_sta, info); - spin_unlock_bh(&sta->rate_ctrl_lock); -} +void rate_control_tx_status(struct ieee80211_local *local, + struct ieee80211_supported_band *sband, + struct ieee80211_tx_status *st); void rate_control_rate_init(struct sta_info *sta); void rate_control_rate_update(struct ieee80211_local *local, @@ -111,6 +72,8 @@ static inline void rate_control_remove_sta_debugfs(struct sta_info *sta) #endif } +void ieee80211_check_rate_mask(struct ieee80211_sub_if_data *sdata); + /* Get a reference to the rate control algorithm. If `name' is NULL, get the * first available algorithm. 
*/ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c index 3ebe4405a2d43d66f76bdef3fd4f10cd3fa82b8f..9766c1cc4b0a5d59ae4c71aea92921391a1576b3 100644 --- a/net/mac80211/rc80211_minstrel.c +++ b/net/mac80211/rc80211_minstrel.c @@ -264,9 +264,9 @@ minstrel_update_stats(struct minstrel_priv *mp, struct minstrel_sta_info *mi) static void minstrel_tx_status(void *priv, struct ieee80211_supported_band *sband, - struct ieee80211_sta *sta, void *priv_sta, - struct ieee80211_tx_info *info) + void *priv_sta, struct ieee80211_tx_status *st) { + struct ieee80211_tx_info *info = st->info; struct minstrel_priv *mp = priv; struct minstrel_sta_info *mi = priv_sta; struct ieee80211_tx_rate *ar = info->status.rates; @@ -726,7 +726,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta) const struct rate_control_ops mac80211_minstrel = { .name = "minstrel", - .tx_status_noskb = minstrel_tx_status, + .tx_status_ext = minstrel_tx_status, .get_rate = minstrel_get_rate, .rate_init = minstrel_rate_init, .alloc = minstrel_alloc, diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 8e783e197e935cd8bcd0b7b7d5e88b8d2c9210cc..4a5bdad9f303034f0aee90ed3ac00cb044458a8d 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -678,9 +678,9 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb) static void minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, - struct ieee80211_sta *sta, void *priv_sta, - struct ieee80211_tx_info *info) + void *priv_sta, struct ieee80211_tx_status *st) { + struct ieee80211_tx_info *info = st->info; struct minstrel_ht_sta_priv *msp = priv_sta; struct minstrel_ht_sta *mi = &msp->ht; struct ieee80211_tx_rate *ar = info->status.rates; @@ -690,8 +690,8 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, int i; if (!msp->is_ht) - return mac80211_minstrel.tx_status_noskb(priv, sband, sta, - &msp->legacy, info); + return mac80211_minstrel.tx_status_ext(priv, sband, + &msp->legacy, st); /* This packet was aggregated but doesn't carry status info */ if ((info->flags & IEEE80211_TX_CTL_AMPDU) && @@ -1374,7 +1374,7 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta) static const struct rate_control_ops mac80211_minstrel_ht = { .name = "minstrel_ht", - .tx_status_noskb = minstrel_ht_tx_status, + .tx_status_ext = minstrel_ht_tx_status, .get_rate = minstrel_ht_get_rate, .rate_init = minstrel_ht_rate_init, .rate_update = minstrel_ht_rate_update, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index e48724a6725e3266c1d5559d268339a7d2cd7f10..35f4c7d7a50089e3998ff9860e86f6d26e1da600 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -95,24 +95,13 @@ static u8 *ieee80211_get_bssid(struct ieee80211_hdr *hdr, size_t len, * This function cleans up the SKB, i.e. it removes all the stuff * only useful for monitoring. 
*/ -static struct sk_buff *remove_monitor_info(struct ieee80211_local *local, - struct sk_buff *skb, - unsigned int rtap_vendor_space) +static void remove_monitor_info(struct sk_buff *skb, + unsigned int present_fcs_len, + unsigned int rtap_vendor_space) { - if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) { - if (likely(skb->len > FCS_LEN)) - __pskb_trim(skb, skb->len - FCS_LEN); - else { - /* driver bug */ - WARN_ON(1); - dev_kfree_skb(skb); - return NULL; - } - } - + if (present_fcs_len) + __pskb_trim(skb, skb->len - present_fcs_len); __pskb_pull(skb, rtap_vendor_space); - - return skb; } static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len, @@ -167,7 +156,7 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, /* padding for RX_FLAGS if necessary */ len = ALIGN(len, 2); - if (status->flag & RX_FLAG_HT) /* HT info */ + if (status->encoding == RX_ENC_HT) /* HT info */ len += 3; if (status->flag & RX_FLAG_AMPDU_DETAILS) { @@ -175,7 +164,7 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, len += 8; } - if (status->flag & RX_FLAG_VHT) { + if (status->encoding == RX_ENC_VHT) { len = ALIGN(len, 2); len += 12; } @@ -208,6 +197,51 @@ ieee80211_rx_radiotap_hdrlen(struct ieee80211_local *local, return len; } +static void ieee80211_handle_mu_mimo_mon(struct ieee80211_sub_if_data *sdata, + struct sk_buff *skb, + int rtap_vendor_space) +{ + struct { + struct ieee80211_hdr_3addr hdr; + u8 category; + u8 action_code; + } __packed action; + + if (!sdata) + return; + + BUILD_BUG_ON(sizeof(action) != IEEE80211_MIN_ACTION_SIZE + 1); + + if (skb->len < rtap_vendor_space + sizeof(action) + + VHT_MUMIMO_GROUPS_DATA_LEN) + return; + + if (!is_valid_ether_addr(sdata->u.mntr.mu_follow_addr)) + return; + + skb_copy_bits(skb, rtap_vendor_space, &action, sizeof(action)); + + if (!ieee80211_is_action(action.hdr.frame_control)) + return; + + if (action.category != WLAN_CATEGORY_VHT) + return; + + if (action.action_code != WLAN_VHT_ACTION_GROUPID_MGMT) + return; + + if (!ether_addr_equal(action.hdr.addr1, sdata->u.mntr.mu_follow_addr)) + return; + + skb = skb_copy(skb, GFP_ATOMIC); + if (!skb) + return; + + skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME; + skb_queue_tail(&sdata->skb_queue, skb); + ieee80211_queue_work(&sdata->local->hw, &sdata->work); +} + /* * ieee80211_add_rx_radiotap_header - add radiotap header * @@ -295,12 +329,12 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, *pos |= IEEE80211_RADIOTAP_F_FCS; if (status->flag & (RX_FLAG_FAILED_FCS_CRC | RX_FLAG_FAILED_PLCP_CRC)) *pos |= IEEE80211_RADIOTAP_F_BADFCS; - if (status->flag & RX_FLAG_SHORTPRE) + if (status->enc_flags & RX_ENC_FLAG_SHORTPRE) *pos |= IEEE80211_RADIOTAP_F_SHORTPRE; pos++; /* IEEE80211_RADIOTAP_RATE */ - if (!rate || status->flag & (RX_FLAG_HT | RX_FLAG_VHT)) { + if (!rate || status->encoding != RX_ENC_LEGACY) { /* * Without rate information don't add it. 
If we have it, * MCS information is a separate field in radiotap, @@ -311,9 +345,9 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, } else { int shift = 0; rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_RATE); - if (status->flag & RX_FLAG_10MHZ) + if (status->bw == RATE_INFO_BW_10) shift = 1; - else if (status->flag & RX_FLAG_5MHZ) + else if (status->bw == RATE_INFO_BW_5) shift = 2; *pos = DIV_ROUND_UP(rate->bitrate, 5 * (1 << shift)); } @@ -322,14 +356,14 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, /* IEEE80211_RADIOTAP_CHANNEL */ put_unaligned_le16(status->freq, pos); pos += 2; - if (status->flag & RX_FLAG_10MHZ) + if (status->bw == RATE_INFO_BW_10) channel_flags |= IEEE80211_CHAN_HALF; - else if (status->flag & RX_FLAG_5MHZ) + else if (status->bw == RATE_INFO_BW_5) channel_flags |= IEEE80211_CHAN_QUARTER; if (status->band == NL80211_BAND_5GHZ) channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_5GHZ; - else if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT)) + else if (status->encoding != RX_ENC_LEGACY) channel_flags |= IEEE80211_CHAN_DYN | IEEE80211_CHAN_2GHZ; else if (rate && rate->flags & IEEE80211_RATE_ERP_G) channel_flags |= IEEE80211_CHAN_OFDM | IEEE80211_CHAN_2GHZ; @@ -368,21 +402,21 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, put_unaligned_le16(rx_flags, pos); pos += 2; - if (status->flag & RX_FLAG_HT) { + if (status->encoding == RX_ENC_HT) { unsigned int stbc; rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_MCS); *pos++ = local->hw.radiotap_mcs_details; *pos = 0; - if (status->flag & RX_FLAG_SHORT_GI) + if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) *pos |= IEEE80211_RADIOTAP_MCS_SGI; - if (status->flag & RX_FLAG_40MHZ) + if (status->bw == RATE_INFO_BW_40) *pos |= IEEE80211_RADIOTAP_MCS_BW_40; - if (status->flag & RX_FLAG_HT_GF) + if (status->enc_flags & RX_ENC_FLAG_HT_GF) *pos |= IEEE80211_RADIOTAP_MCS_FMT_GF; - if (status->flag & RX_FLAG_LDPC) + if (status->enc_flags & RX_ENC_FLAG_LDPC) *pos |= IEEE80211_RADIOTAP_MCS_FEC_LDPC; - stbc = (status->flag & RX_FLAG_STBC_MASK) >> RX_FLAG_STBC_SHIFT; + stbc = (status->enc_flags & RX_ENC_FLAG_STBC_MASK) >> RX_ENC_FLAG_STBC_SHIFT; *pos |= stbc << IEEE80211_RADIOTAP_MCS_STBC_SHIFT; pos++; *pos++ = status->rate_idx; @@ -415,35 +449,40 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, *pos++ = 0; } - if (status->flag & RX_FLAG_VHT) { + if (status->encoding == RX_ENC_VHT) { u16 known = local->hw.radiotap_vht_details; rthdr->it_present |= cpu_to_le32(1 << IEEE80211_RADIOTAP_VHT); put_unaligned_le16(known, pos); pos += 2; /* flags */ - if (status->flag & RX_FLAG_SHORT_GI) + if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_SGI; /* in VHT, STBC is binary */ - if (status->flag & RX_FLAG_STBC_MASK) + if (status->enc_flags & RX_ENC_FLAG_STBC_MASK) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_STBC; - if (status->vht_flag & RX_VHT_FLAG_BF) + if (status->enc_flags & RX_ENC_FLAG_BF) *pos |= IEEE80211_RADIOTAP_VHT_FLAG_BEAMFORMED; pos++; /* bandwidth */ - if (status->vht_flag & RX_VHT_FLAG_80MHZ) + switch (status->bw) { + case RATE_INFO_BW_80: *pos++ = 4; - else if (status->vht_flag & RX_VHT_FLAG_160MHZ) + break; + case RATE_INFO_BW_160: *pos++ = 11; - else if (status->flag & RX_FLAG_40MHZ) + break; + case RATE_INFO_BW_40: *pos++ = 1; - else /* 20 MHz */ + break; + default: + *pos++ = 0; + } /* MCS/NSS */ - *pos = (status->rate_idx << 4) | status->vht_nss; + *pos = (status->rate_idx << 4) | status->nss; pos += 4; /* coding field */ - if 
(status->flag & RX_FLAG_LDPC) + if (status->enc_flags & RX_ENC_FLAG_LDPC) *pos |= IEEE80211_RADIOTAP_CODING_LDPC_USER0; pos++; /* group ID */ @@ -499,6 +538,59 @@ ieee80211_add_rx_radiotap_header(struct ieee80211_local *local, } } +static struct sk_buff * +ieee80211_make_monitor_skb(struct ieee80211_local *local, + struct sk_buff **origskb, + struct ieee80211_rate *rate, + int rtap_vendor_space, bool use_origskb) +{ + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(*origskb); + int rt_hdrlen, needed_headroom; + struct sk_buff *skb; + + /* room for the radiotap header based on driver features */ + rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, *origskb); + needed_headroom = rt_hdrlen - rtap_vendor_space; + + if (use_origskb) { + /* only need to expand headroom if necessary */ + skb = *origskb; + *origskb = NULL; + + /* + * This shouldn't trigger often because most devices have an + * RX header they pull before we get here, and that should + * be big enough for our radiotap information. We should + * probably export the length to drivers so that we can have + * them allocate enough headroom to start with. + */ + if (skb_headroom(skb) < needed_headroom && + pskb_expand_head(skb, needed_headroom, 0, GFP_ATOMIC)) { + dev_kfree_skb(skb); + return NULL; + } + } else { + /* + * Need to make a copy and possibly remove radiotap header + * and FCS from the original. + */ + skb = skb_copy_expand(*origskb, needed_headroom, 0, GFP_ATOMIC); + + if (!skb) + return NULL; + } + + /* prepend radiotap information */ + ieee80211_add_rx_radiotap_header(local, skb, rate, rt_hdrlen, true); + + skb_reset_mac_header(skb); + skb->ip_summed = CHECKSUM_UNNECESSARY; + skb->pkt_type = PACKET_OTHERHOST; + skb->protocol = htons(ETH_P_802_2); + + return skb; +} + /* * This function copies a received frame to all monitor interfaces and * returns a cleaned-up SKB that no longer includes the FCS nor the @@ -510,14 +602,12 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, { struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(origskb); struct ieee80211_sub_if_data *sdata; - int rt_hdrlen, needed_headroom; - struct sk_buff *skb, *skb2; - struct net_device *prev_dev = NULL; + struct sk_buff *monskb = NULL; int present_fcs_len = 0; unsigned int rtap_vendor_space = 0; - struct ieee80211_mgmt *mgmt; struct ieee80211_sub_if_data *monitor_sdata = rcu_dereference(local->monitor_sdata); + bool only_monitor = false; if (unlikely(status->flag & RX_FLAG_RADIOTAP_VENDOR_DATA)) { struct ieee80211_vendor_radiotap *rtap = (void *)origskb->data; @@ -534,8 +624,15 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, * the SKB because it has a bad FCS/PLCP checksum. 
*/ - if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) + if (ieee80211_hw_check(&local->hw, RX_INCLUDES_FCS)) { + if (unlikely(origskb->len <= FCS_LEN)) { + /* driver bug */ + WARN_ON(1); + dev_kfree_skb(origskb); + return NULL; + } present_fcs_len = FCS_LEN; + } /* ensure hdr->frame_control and vendor radiotap data are in skb head */ if (!pskb_may_pull(origskb, 2 + rtap_vendor_space)) { @@ -543,104 +640,62 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, return NULL; } + only_monitor = should_drop_frame(origskb, present_fcs_len, + rtap_vendor_space); + if (!local->monitors || (status->flag & RX_FLAG_SKIP_MONITOR)) { - if (should_drop_frame(origskb, present_fcs_len, - rtap_vendor_space)) { + if (only_monitor) { dev_kfree_skb(origskb); return NULL; } - return remove_monitor_info(local, origskb, rtap_vendor_space); - } - - /* room for the radiotap header based on driver features */ - rt_hdrlen = ieee80211_rx_radiotap_hdrlen(local, status, origskb); - needed_headroom = rt_hdrlen - rtap_vendor_space; - - if (should_drop_frame(origskb, present_fcs_len, rtap_vendor_space)) { - /* only need to expand headroom if necessary */ - skb = origskb; - origskb = NULL; - - /* - * This shouldn't trigger often because most devices have an - * RX header they pull before we get here, and that should - * be big enough for our radiotap information. We should - * probably export the length to drivers so that we can have - * them allocate enough headroom to start with. - */ - if (skb_headroom(skb) < needed_headroom && - pskb_expand_head(skb, needed_headroom, 0, GFP_ATOMIC)) { - dev_kfree_skb(skb); - return NULL; - } - } else { - /* - * Need to make a copy and possibly remove radiotap header - * and FCS from the original. - */ - skb = skb_copy_expand(origskb, needed_headroom, 0, GFP_ATOMIC); - - origskb = remove_monitor_info(local, origskb, - rtap_vendor_space); - - if (!skb) - return origskb; + remove_monitor_info(origskb, present_fcs_len, + rtap_vendor_space); + return origskb; } - /* prepend radiotap information */ - ieee80211_add_rx_radiotap_header(local, skb, rate, rt_hdrlen, true); + ieee80211_handle_mu_mimo_mon(monitor_sdata, origskb, rtap_vendor_space); - skb_reset_mac_header(skb); - skb->ip_summed = CHECKSUM_UNNECESSARY; - skb->pkt_type = PACKET_OTHERHOST; - skb->protocol = htons(ETH_P_802_2); + list_for_each_entry_rcu(sdata, &local->mon_list, u.mntr.list) { + bool last_monitor = list_is_last(&sdata->u.mntr.list, + &local->mon_list); - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (sdata->vif.type != NL80211_IFTYPE_MONITOR) - continue; + if (!monskb) + monskb = ieee80211_make_monitor_skb(local, &origskb, + rate, + rtap_vendor_space, + only_monitor && + last_monitor); - if (sdata->u.mntr.flags & MONITOR_FLAG_COOK_FRAMES) - continue; + if (monskb) { + struct sk_buff *skb; - if (!ieee80211_sdata_running(sdata)) - continue; + if (last_monitor) { + skb = monskb; + monskb = NULL; + } else { + skb = skb_clone(monskb, GFP_ATOMIC); + } - if (prev_dev) { - skb2 = skb_clone(skb, GFP_ATOMIC); - if (skb2) { - skb2->dev = prev_dev; - netif_receive_skb(skb2); + if (skb) { + skb->dev = sdata->dev; + ieee80211_rx_stats(skb->dev, skb->len); + netif_receive_skb(skb); } } - prev_dev = sdata->dev; - ieee80211_rx_stats(sdata->dev, skb->len); + if (last_monitor) + break; } - mgmt = (void *)skb->data; - if (monitor_sdata && - skb->len >= IEEE80211_MIN_ACTION_SIZE + 1 + VHT_MUMIMO_GROUPS_DATA_LEN && - ieee80211_is_action(mgmt->frame_control) && - mgmt->u.action.category == 
WLAN_CATEGORY_VHT && - mgmt->u.action.u.vht_group_notif.action_code == WLAN_VHT_ACTION_GROUPID_MGMT && - is_valid_ether_addr(monitor_sdata->u.mntr.mu_follow_addr) && - ether_addr_equal(mgmt->da, monitor_sdata->u.mntr.mu_follow_addr)) { - struct sk_buff *mu_skb = skb_copy(skb, GFP_ATOMIC); - - if (mu_skb) { - mu_skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME; - skb_queue_tail(&monitor_sdata->skb_queue, mu_skb); - ieee80211_queue_work(&local->hw, &monitor_sdata->work); - } - } + /* this happens if last_monitor was erroneously false */ + dev_kfree_skb(monskb); - if (prev_dev) { - skb->dev = prev_dev; - netif_receive_skb(skb); - } else - dev_kfree_skb(skb); + /* ditto */ + if (!origskb) + return NULL; + remove_monitor_info(origskb, present_fcs_len, rtap_vendor_space); return origskb; } @@ -3286,8 +3341,8 @@ static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, status = IEEE80211_SKB_RXCB((rx->skb)); sband = rx->local->hw.wiphy->bands[status->band]; - if (!(status->flag & RX_FLAG_HT) && - !(status->flag & RX_FLAG_VHT)) + if (!(status->encoding == RX_ENC_HT) && + !(status->encoding == RX_ENC_VHT)) rate = &sband->bitrates[status->rate_idx]; ieee80211_rx_cooked_monitor(rx, rate); @@ -3524,7 +3579,7 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type); - int multicast = is_multicast_ether_addr(hdr->addr1); + bool multicast = is_multicast_ether_addr(hdr->addr1); switch (sdata->vif.type) { case NL80211_IFTYPE_STATION: @@ -3548,7 +3603,7 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) return false; if (!rx->sta) { int rate_idx; - if (status->flag & (RX_FLAG_HT | RX_FLAG_VHT)) + if (status->encoding != RX_ENC_LEGACY) rate_idx = 0; /* TODO: HT/VHT rates */ else rate_idx = status->rate_idx; @@ -3568,7 +3623,7 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) return false; if (!rx->sta) { int rate_idx; - if (status->flag & RX_FLAG_HT) + if (status->encoding != RX_ENC_LEGACY) rate_idx = 0; /* TODO: HT rates */ else rate_idx = status->rate_idx; @@ -3610,6 +3665,27 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx) !ether_addr_equal(bssid, hdr->addr1)) return false; } + + /* + * 802.11-2016 Table 9-26 says that for data frames, A1 must be + * the BSSID - we've checked that already but may have accepted + * the wildcard (ff:ff:ff:ff:ff:ff). + * + * It also says: + * The BSSID of the Data frame is determined as follows: + * a) If the STA is contained within an AP or is associated + * with an AP, the BSSID is the address currently in use + * by the STA contained in the AP. + * + * So we should not accept data frames with an address that's + * multicast. + * + * Accepting it also opens a security problem because stations + * could encrypt it with the GTK and inject traffic that way. + */ + if (ieee80211_is_data(hdr->frame_control) && multicast) + return false; + return true; case NL80211_IFTYPE_WDS: if (bssid || !ieee80211_is_data(hdr->frame_control)) @@ -4210,7 +4286,8 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, * we probably can't have a valid rate here anyway. 
*/ - if (status->flag & RX_FLAG_HT) { + switch (status->encoding) { + case RX_ENC_HT: /* * rate_idx is MCS index, which can be [0-76] * as documented on: @@ -4228,14 +4305,19 @@ void ieee80211_rx_napi(struct ieee80211_hw *hw, struct ieee80211_sta *pubsta, status->rate_idx, status->rate_idx)) goto drop; - } else if (status->flag & RX_FLAG_VHT) { + break; + case RX_ENC_VHT: if (WARN_ONCE(status->rate_idx > 9 || - !status->vht_nss || - status->vht_nss > 8, + !status->nss || + status->nss > 8, "Rate marked as a VHT rate but data is invalid: MCS: %d, NSS: %d\n", - status->rate_idx, status->vht_nss)) + status->rate_idx, status->nss)) goto drop; - } else { + break; + default: + WARN_ON_ONCE(1); + /* fall through */ + case RX_ENC_LEGACY: if (WARN_ON(status->rate_idx >= sband->n_bitrates)) goto drop; rate = &sband->bitrates[status->rate_idx]; diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index faab3c490d2b755f32ee09638eaf0d45067780cd..47d2ed5704700f110aa6a0f57ecb5af29e0c5db9 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -79,9 +79,9 @@ ieee80211_bss_info_update(struct ieee80211_local *local, bss_meta.signal = (rx_status->signal * 100) / local->hw.max_signal; bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_20; - if (rx_status->flag & RX_FLAG_5MHZ) + if (rx_status->bw == RATE_INFO_BW_5) bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_5; - if (rx_status->flag & RX_FLAG_10MHZ) + else if (rx_status->bw == RATE_INFO_BW_10) bss_meta.scan_width = NL80211_BSS_CHAN_WIDTH_10; bss_meta.chan = channel; @@ -174,8 +174,8 @@ ieee80211_bss_info_update(struct ieee80211_local *local, if (beacon) { struct ieee80211_supported_band *sband = local->hw.wiphy->bands[rx_status->band]; - if (!(rx_status->flag & RX_FLAG_HT) && - !(rx_status->flag & RX_FLAG_VHT)) + if (!(rx_status->encoding == RX_ENC_HT) && + !(rx_status->encoding == RX_ENC_VHT)) bss->beacon_rate = &sband->bitrates[rx_status->rate_idx]; } @@ -1219,7 +1219,7 @@ void ieee80211_sched_scan_results(struct ieee80211_hw *hw) trace_api_sched_scan_results(local); - cfg80211_sched_scan_results(hw->wiphy); + cfg80211_sched_scan_results(hw->wiphy, 0); } EXPORT_SYMBOL(ieee80211_sched_scan_results); @@ -1239,7 +1239,7 @@ void ieee80211_sched_scan_end(struct ieee80211_local *local) mutex_unlock(&local->mtx); - cfg80211_sched_scan_stopped(local->hw.wiphy); + cfg80211_sched_scan_stopped(local->hw.wiphy, 0); } void ieee80211_sched_scan_stopped_work(struct work_struct *work) diff --git a/net/mac80211/spectmgmt.c b/net/mac80211/spectmgmt.c index 97f4c9d6b54ced5bf9f337c55d334cd67646abe9..0782e486fe89306c3cd73d36f1f285dee130ddba 100644 --- a/net/mac80211/spectmgmt.c +++ b/net/mac80211/spectmgmt.c @@ -132,9 +132,9 @@ int ieee80211_parse_ch_switch_ie(struct ieee80211_sub_if_data *sdata, struct ieee80211_vht_operation vht_oper = { .chan_width = wide_bw_chansw_ie->new_channel_width, - .center_freq_seg1_idx = + .center_freq_seg0_idx = wide_bw_chansw_ie->new_center_freq_seg0, - .center_freq_seg2_idx = + .center_freq_seg1_idx = wide_bw_chansw_ie->new_center_freq_seg1, /* .basic_mcs_set doesn't matter */ }; diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 3323a2fb289bd035a1280043bd3d64c2509cb9e2..7cdf7a835bb01e8fade3b9d9bb6efaa19158f72b 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -2,7 +2,7 @@ * Copyright 2002-2005, Instant802 Networks, Inc. 
* Copyright 2006-2007 Jiri Benc * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2015 - 2016 Intel Deutschland GmbH + * Copyright (C) 2015 - 2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -395,10 +395,15 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->sta.smps_mode = IEEE80211_SMPS_OFF; if (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) { - struct ieee80211_supported_band *sband = - hw->wiphy->bands[ieee80211_get_sdata_band(sdata)]; - u8 smps = (sband->ht_cap.cap & IEEE80211_HT_CAP_SM_PS) >> - IEEE80211_HT_CAP_SM_PS_SHIFT; + struct ieee80211_supported_band *sband; + u8 smps; + + sband = ieee80211_get_sband(sdata); + if (!sband) + goto free_txq; + + smps = (sband->ht_cap.cap & IEEE80211_HT_CAP_SM_PS) >> + IEEE80211_HT_CAP_SM_PS_SHIFT; /* * Assume that hostapd advertises our caps in the beacon and * this is the known_smps_mode for a station that just associated @@ -1957,24 +1962,32 @@ sta_get_last_rx_stats(struct sta_info *sta) static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, struct rate_info *rinfo) { - rinfo->bw = (rate & STA_STATS_RATE_BW_MASK) >> - STA_STATS_RATE_BW_SHIFT; + rinfo->bw = STA_STATS_GET(BW, rate); - if (rate & STA_STATS_RATE_VHT) { + switch (STA_STATS_GET(TYPE, rate)) { + case STA_STATS_RATE_TYPE_VHT: rinfo->flags = RATE_INFO_FLAGS_VHT_MCS; - rinfo->mcs = rate & 0xf; - rinfo->nss = (rate & 0xf0) >> 4; - } else if (rate & STA_STATS_RATE_HT) { + rinfo->mcs = STA_STATS_GET(VHT_MCS, rate); + rinfo->nss = STA_STATS_GET(VHT_NSS, rate); + if (STA_STATS_GET(SGI, rate)) + rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; + break; + case STA_STATS_RATE_TYPE_HT: rinfo->flags = RATE_INFO_FLAGS_MCS; - rinfo->mcs = rate & 0xff; - } else if (rate & STA_STATS_RATE_LEGACY) { + rinfo->mcs = STA_STATS_GET(HT_MCS, rate); + if (STA_STATS_GET(SGI, rate)) + rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; + break; + case STA_STATS_RATE_TYPE_LEGACY: { struct ieee80211_supported_band *sband; u16 brate; unsigned int shift; + int band = STA_STATS_GET(LEGACY_BAND, rate); + int rate_idx = STA_STATS_GET(LEGACY_IDX, rate); rinfo->flags = 0; - sband = local->hw.wiphy->bands[(rate >> 4) & 0xf]; - brate = sband->bitrates[rate & 0xf].bitrate; + sband = local->hw.wiphy->bands[band]; + brate = sband->bitrates[rate_idx].bitrate; if (rinfo->bw == RATE_INFO_BW_5) shift = 2; else if (rinfo->bw == RATE_INFO_BW_10) @@ -1982,10 +1995,9 @@ static void sta_stats_decode_rate(struct ieee80211_local *local, u16 rate, else shift = 0; rinfo->legacy = DIV_ROUND_UP(brate, 1 << shift); + break; + } } - - if (rate & STA_STATS_RATE_SGI) - rinfo->flags |= RATE_INFO_FLAGS_SHORT_GI; } static int sta_set_rate_info_rx(struct sta_info *sta, struct rate_info *rinfo) diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index e65cda34d2bc000fb7e3738a6235ba5d53b8fde6..5609cacb20d5f31e02ba877f88a6f0290e18ce8b 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -1,7 +1,7 @@ /* * Copyright 2002-2005, Devicescape Software, Inc. 
* Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright(c) 2015-2016 Intel Deutschland GmbH + * Copyright(c) 2015-2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -16,6 +16,7 @@ #include <linux/if_ether.h> #include <linux/workqueue.h> #include <linux/average.h> +#include <linux/bitfield.h> #include <linux/etherdevice.h> #include <linux/rhashtable.h> #include <linux/u64_stats_sync.h> @@ -324,6 +325,9 @@ struct ieee80211_fast_rx { struct rcu_head rcu_head; }; +/* we use only values in the range 0-100, so pick a large precision */ +DECLARE_EWMA(mesh_fail_avg, 20, 8) + /** * struct mesh_sta - mesh STA information * @plink_lock: serialize access to plink fields @@ -369,7 +373,7 @@ struct mesh_sta { enum nl80211_mesh_power_mode nonpeer_pm; /* moving percentage of failed MSDUs */ - unsigned int fail_avg; + struct ewma_mesh_fail_avg fail_avg; }; DECLARE_EWMA(signal, 10, 8) @@ -724,40 +728,55 @@ void ieee80211_sta_ps_deliver_uapsd(struct sta_info *sta); unsigned long ieee80211_sta_last_active(struct sta_info *sta); +enum sta_stats_type { + STA_STATS_RATE_TYPE_INVALID = 0, + STA_STATS_RATE_TYPE_LEGACY, + STA_STATS_RATE_TYPE_HT, + STA_STATS_RATE_TYPE_VHT, +}; + +#define STA_STATS_FIELD_HT_MCS GENMASK( 7, 0) +#define STA_STATS_FIELD_LEGACY_IDX GENMASK( 3, 0) +#define STA_STATS_FIELD_LEGACY_BAND GENMASK( 7, 4) +#define STA_STATS_FIELD_VHT_MCS GENMASK( 3, 0) +#define STA_STATS_FIELD_VHT_NSS GENMASK( 7, 4) +#define STA_STATS_FIELD_BW GENMASK(11, 8) +#define STA_STATS_FIELD_SGI GENMASK(12, 12) +#define STA_STATS_FIELD_TYPE GENMASK(15, 13) + +#define STA_STATS_FIELD(_n, _v) FIELD_PREP(STA_STATS_FIELD_ ## _n, _v) +#define STA_STATS_GET(_n, _v) FIELD_GET(STA_STATS_FIELD_ ## _n, _v) + #define STA_STATS_RATE_INVALID 0 -#define STA_STATS_RATE_VHT 0x8000 -#define STA_STATS_RATE_HT 0x4000 -#define STA_STATS_RATE_LEGACY 0x2000 -#define STA_STATS_RATE_SGI 0x1000 -#define STA_STATS_RATE_BW_SHIFT 9 -#define STA_STATS_RATE_BW_MASK (0x7 << STA_STATS_RATE_BW_SHIFT) - -static inline u16 sta_stats_encode_rate(struct ieee80211_rx_status *s) + +static inline u32 sta_stats_encode_rate(struct ieee80211_rx_status *s) { - u16 r = s->rate_idx; - - if (s->vht_flag & RX_VHT_FLAG_80MHZ) - r |= RATE_INFO_BW_80 << STA_STATS_RATE_BW_SHIFT; - else if (s->vht_flag & RX_VHT_FLAG_160MHZ) - r |= RATE_INFO_BW_160 << STA_STATS_RATE_BW_SHIFT; - else if (s->flag & RX_FLAG_40MHZ) - r |= RATE_INFO_BW_40 << STA_STATS_RATE_BW_SHIFT; - else if (s->flag & RX_FLAG_10MHZ) - r |= RATE_INFO_BW_10 << STA_STATS_RATE_BW_SHIFT; - else if (s->flag & RX_FLAG_5MHZ) - r |= RATE_INFO_BW_5 << STA_STATS_RATE_BW_SHIFT; - else - r |= RATE_INFO_BW_20 << STA_STATS_RATE_BW_SHIFT; - - if (s->flag & RX_FLAG_SHORT_GI) - r |= STA_STATS_RATE_SGI; - - if (s->flag & RX_FLAG_VHT) - r |= STA_STATS_RATE_VHT | (s->vht_nss << 4); - else if (s->flag & RX_FLAG_HT) - r |= STA_STATS_RATE_HT; - else - r |= STA_STATS_RATE_LEGACY | (s->band << 4); + u16 r; + + r = STA_STATS_FIELD(BW, s->bw); + + if (s->enc_flags & RX_ENC_FLAG_SHORT_GI) + r |= STA_STATS_FIELD(SGI, 1); + + switch (s->encoding) { + case RX_ENC_VHT: + r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_VHT); + r |= STA_STATS_FIELD(VHT_NSS, s->nss); + r |= STA_STATS_FIELD(VHT_MCS, s->rate_idx); + break; + case RX_ENC_HT: + r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_HT); + r |= STA_STATS_FIELD(HT_MCS, s->rate_idx); + break; + case RX_ENC_LEGACY: + r |= STA_STATS_FIELD(TYPE, STA_STATS_RATE_TYPE_LEGACY); + r |= STA_STATS_FIELD(LEGACY_BAND, s->band); + r |= STA_STATS_FIELD(LEGACY_IDX, s->rate_idx); + break; + default: + WARN_ON(1); + 
return STA_STATS_RATE_INVALID; + } return r; } diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 83b8b11f24ea1dadc0501bd8a4c15598195524a2..be47ac5cd8c8dd44a23247979f0d1742cf089026 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -200,6 +200,7 @@ static void ieee80211_frame_acked(struct sta_info *sta, struct sk_buff *skb) } if (ieee80211_is_action(mgmt->frame_control) && + !ieee80211_has_protected(mgmt->frame_control) && mgmt->u.action.category == WLAN_CATEGORY_HT && mgmt->u.action.u.ht_smps.action == WLAN_HT_ACTION_SMPS && ieee80211_sdata_running(sdata)) { @@ -630,61 +631,6 @@ static int ieee80211_tx_get_rates(struct ieee80211_hw *hw, return rates_idx; } -void ieee80211_tx_status_noskb(struct ieee80211_hw *hw, - struct ieee80211_sta *pubsta, - struct ieee80211_tx_info *info) -{ - struct ieee80211_local *local = hw_to_local(hw); - struct ieee80211_supported_band *sband; - int retry_count; - bool acked, noack_success; - - ieee80211_tx_get_rates(hw, info, &retry_count); - - sband = hw->wiphy->bands[info->band]; - - acked = !!(info->flags & IEEE80211_TX_STAT_ACK); - noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); - - if (pubsta) { - struct sta_info *sta; - - sta = container_of(pubsta, struct sta_info, sta); - - if (!acked) - sta->status_stats.retry_failed++; - sta->status_stats.retry_count += retry_count; - - if (acked) { - sta->status_stats.last_ack = jiffies; - - if (sta->status_stats.lost_packets) - sta->status_stats.lost_packets = 0; - - /* Track when last TDLS packet was ACKed */ - if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) - sta->status_stats.last_tdls_pkt_time = jiffies; - } else { - ieee80211_lost_packet(sta, info); - } - - rate_control_tx_status_noskb(local, sband, sta, info); - } - - if (acked || noack_success) { - I802_DEBUG_INC(local->dot11TransmittedFrameCount); - if (!pubsta) - I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); - if (retry_count > 0) - I802_DEBUG_INC(local->dot11RetryCount); - if (retry_count > 1) - I802_DEBUG_INC(local->dot11MultipleRetryCount); - } else { - I802_DEBUG_INC(local->dot11FailedCount); - } -} -EXPORT_SYMBOL(ieee80211_tx_status_noskb); - void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, struct ieee80211_supported_band *sband, int retry_count, int shift, bool send_to_cooked) @@ -742,15 +688,16 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb, dev_kfree_skb(skb); } -void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) +static void __ieee80211_tx_status(struct ieee80211_hw *hw, + struct ieee80211_tx_status *status) { + struct sk_buff *skb = status->skb; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; struct ieee80211_local *local = hw_to_local(hw); - struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); + struct ieee80211_tx_info *info = status->info; + struct sta_info *sta; __le16 fc; struct ieee80211_supported_band *sband; - struct rhlist_head *tmp; - struct sta_info *sta; int retry_count; int rates_idx; bool send_to_cooked; @@ -761,16 +708,11 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) rates_idx = ieee80211_tx_get_rates(hw, info, &retry_count); - rcu_read_lock(); - sband = local->hw.wiphy->bands[info->band]; fc = hdr->frame_control; - for_each_sta_info(local, hdr->addr1, sta, tmp) { - /* skip wrong virtual interface */ - if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr)) - continue; - + if (status->sta) { + sta = container_of(status->sta, struct 
sta_info, sta); shift = ieee80211_vif_get_shift(&sta->sdata->vif); if (info->flags & IEEE80211_TX_STATUS_EOSP) @@ -790,7 +732,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) * that this TX packet failed because of that. */ ieee80211_handle_filtered_frame(local, sta, skb); - rcu_read_unlock(); return; } @@ -840,7 +781,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) if (info->flags & IEEE80211_TX_STAT_TX_FILTERED) { ieee80211_handle_filtered_frame(local, sta, skb); - rcu_read_unlock(); return; } else { if (!acked) @@ -856,7 +796,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) } } - rate_control_tx_status(local, sband, sta, skb); + rate_control_tx_status(local, sband, status); if (ieee80211_vif_is_mesh(&sta->sdata->vif)) ieee80211s_update_metric(local, sta, skb); @@ -883,8 +823,6 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) } } - rcu_read_unlock(); - ieee80211_led_tx(local); /* SNMP counters @@ -949,8 +887,96 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) /* send to monitor interfaces */ ieee80211_tx_monitor(local, skb, sband, retry_count, shift, send_to_cooked); } + +void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) +{ + struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data; + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_tx_status status = { + .skb = skb, + .info = IEEE80211_SKB_CB(skb), + }; + struct rhlist_head *tmp; + struct sta_info *sta; + + rcu_read_lock(); + + for_each_sta_info(local, hdr->addr1, sta, tmp) { + /* skip wrong virtual interface */ + if (!ether_addr_equal(hdr->addr2, sta->sdata->vif.addr)) + continue; + + status.sta = &sta->sta; + break; + } + + __ieee80211_tx_status(hw, &status); + rcu_read_unlock(); +} EXPORT_SYMBOL(ieee80211_tx_status); +void ieee80211_tx_status_ext(struct ieee80211_hw *hw, + struct ieee80211_tx_status *status) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_tx_info *info = status->info; + struct ieee80211_sta *pubsta = status->sta; + struct ieee80211_supported_band *sband; + int retry_count; + bool acked, noack_success; + + if (status->skb) + return __ieee80211_tx_status(hw, status); + + if (!status->sta) + return; + + ieee80211_tx_get_rates(hw, info, &retry_count); + + sband = hw->wiphy->bands[info->band]; + + acked = !!(info->flags & IEEE80211_TX_STAT_ACK); + noack_success = !!(info->flags & IEEE80211_TX_STAT_NOACK_TRANSMITTED); + + if (pubsta) { + struct sta_info *sta; + + sta = container_of(pubsta, struct sta_info, sta); + + if (!acked) + sta->status_stats.retry_failed++; + sta->status_stats.retry_count += retry_count; + + if (acked) { + sta->status_stats.last_ack = jiffies; + + if (sta->status_stats.lost_packets) + sta->status_stats.lost_packets = 0; + + /* Track when last TDLS packet was ACKed */ + if (test_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH)) + sta->status_stats.last_tdls_pkt_time = jiffies; + } else { + ieee80211_lost_packet(sta, info); + } + + rate_control_tx_status(local, sband, status); + } + + if (acked || noack_success) { + I802_DEBUG_INC(local->dot11TransmittedFrameCount); + if (!pubsta) + I802_DEBUG_INC(local->dot11MulticastTransmittedFrameCount); + if (retry_count > 0) + I802_DEBUG_INC(local->dot11RetryCount); + if (retry_count > 1) + I802_DEBUG_INC(local->dot11MultipleRetryCount); + } else { + I802_DEBUG_INC(local->dot11FailedCount); + } +} +EXPORT_SYMBOL(ieee80211_tx_status_ext); + void 
ieee80211_report_low_ack(struct ieee80211_sta *pubsta, u32 num_packets) { struct sta_info *sta = container_of(pubsta, struct sta_info, sta); diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index afca7d103684bbc8c953ae13bb0345934d8f5f35..f20dcf1b1830a2412ed96ebcc9ce3be65ef5d7e8 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -47,8 +47,7 @@ static void ieee80211_tdls_add_ext_capab(struct ieee80211_sub_if_data *sdata, NL80211_FEATURE_TDLS_CHANNEL_SWITCH; bool wider_band = ieee80211_hw_check(&local->hw, TDLS_WIDER_BW) && !ifmgd->tdls_wider_bw_prohibited; - enum nl80211_band band = ieee80211_get_sdata_band(sdata); - struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band]; + struct ieee80211_supported_band *sband = ieee80211_get_sband(sdata); bool vht = sband && sband->vht_cap.vht_supported; u8 *pos = (void *)skb_put(skb, 10); @@ -180,11 +179,14 @@ static void ieee80211_tdls_add_bss_coex_ie(struct sk_buff *skb) static u16 ieee80211_get_tdls_sta_capab(struct ieee80211_sub_if_data *sdata, u16 status_code) { + struct ieee80211_supported_band *sband; + /* The capability will be 0 when sending a failure code */ if (status_code != 0) return 0; - if (ieee80211_get_sdata_band(sdata) == NL80211_BAND_2GHZ) { + sband = ieee80211_get_sband(sdata); + if (sband && sband->band == NL80211_BAND_2GHZ) { return WLAN_CAPABILITY_SHORT_SLOT_TIME | WLAN_CAPABILITY_SHORT_PREAMBLE; } @@ -358,17 +360,20 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, u8 action_code, bool initiator, const u8 *extra_ies, size_t extra_ies_len) { - enum nl80211_band band = ieee80211_get_sdata_band(sdata); - struct ieee80211_local *local = sdata->local; struct ieee80211_supported_band *sband; + struct ieee80211_local *local = sdata->local; struct ieee80211_sta_ht_cap ht_cap; struct ieee80211_sta_vht_cap vht_cap; struct sta_info *sta = NULL; size_t offset = 0, noffset; u8 *pos; - ieee80211_add_srates_ie(sdata, skb, false, band); - ieee80211_add_ext_srates_ie(sdata, skb, false, band); + sband = ieee80211_get_sband(sdata); + if (!sband) + return; + + ieee80211_add_srates_ie(sdata, skb, false, sband->band); + ieee80211_add_ext_srates_ie(sdata, skb, false, sband->band); ieee80211_tdls_add_supp_channels(sdata, skb); /* add any custom IEs that go before Extended Capabilities */ @@ -439,7 +444,6 @@ ieee80211_tdls_add_setup_start_ies(struct ieee80211_sub_if_data *sdata, * the same on all bands. The specification limits the setup to a * single HT-cap, so use the current band for now. 
*/ - sband = local->hw.wiphy->bands[band]; memcpy(&ht_cap, &sband->ht_cap, sizeof(ht_cap)); if ((action_code == WLAN_TDLS_SETUP_REQUEST || @@ -545,9 +549,13 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; size_t offset = 0, noffset; struct sta_info *sta, *ap_sta; - enum nl80211_band band = ieee80211_get_sdata_band(sdata); + struct ieee80211_supported_band *sband; u8 *pos; + sband = ieee80211_get_sband(sdata); + if (!sband) + return; + mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, peer); @@ -612,7 +620,8 @@ ieee80211_tdls_add_setup_cfm_ies(struct ieee80211_sub_if_data *sdata, ieee80211_tdls_add_link_ie(sdata, skb, peer, initiator); /* only include VHT-operation if not on the 2.4GHz band */ - if (band != NL80211_BAND_2GHZ && sta->sta.vht_cap.vht_supported) { + if (sband->band != NL80211_BAND_2GHZ && + sta->sta.vht_cap.vht_supported) { /* * if both peers support WIDER_BW, we can expand the chandef to * a wider compatible one, up to 80MHz diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index ba8d7db0a07165e7b33f71caa260f352771d434a..04b22f8982fe42b41364d8d7feee0d5bd036c64a 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -682,10 +682,6 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx) txrc.skb = tx->skb; txrc.reported_rate.idx = -1; txrc.rate_idx_mask = tx->sdata->rc_rateidx_mask[info->band]; - if (txrc.rate_idx_mask == (1 << sband->n_bitrates) - 1) - txrc.max_rate_idx = -1; - else - txrc.max_rate_idx = fls(txrc.rate_idx_mask) - 1; if (tx->sdata->rc_has_mcs_mask[info->band]) txrc.rate_idx_mcs_mask = @@ -4249,10 +4245,6 @@ __ieee80211_beacon_get(struct ieee80211_hw *hw, txrc.skb = skb; txrc.reported_rate.idx = -1; txrc.rate_idx_mask = sdata->rc_rateidx_mask[band]; - if (txrc.rate_idx_mask == (1 << txrc.sband->n_bitrates) - 1) - txrc.max_rate_idx = -1; - else - txrc.max_rate_idx = fls(txrc.rate_idx_mask) - 1; txrc.bss = true; rate_control_get_rate(sdata, NULL, &txrc); @@ -4305,7 +4297,10 @@ struct sk_buff *ieee80211_beacon_get_tim(struct ieee80211_hw *hw, return bcn; shift = ieee80211_vif_get_shift(vif); - sband = hw->wiphy->bands[ieee80211_get_sdata_band(vif_to_sdata(vif))]; + sband = ieee80211_get_sband(vif_to_sdata(vif)); + if (!sband) + return bcn; + ieee80211_tx_monitor(hw_to_local(hw), copy, sband, 1, shift, false); return bcn; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index ac59fbd280dff8f712fb9dbc841147e4642ec607..ac9ac6c35594a4c67d570bb40cc4ad44f3d7bfa0 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -4,7 +4,7 @@ * Copyright 2006-2007 Jiri Benc * Copyright 2007 Johannes Berg * Copyright 2013-2014 Intel Mobile Communications GmbH - * Copyright (C) 2015-2016 Intel Deutschland GmbH + * Copyright (C) 2015-2017 Intel Deutschland GmbH * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -828,6 +828,7 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, case WLAN_EID_EXT_CAPABILITY: case WLAN_EID_CHAN_SWITCH_TIMING: case WLAN_EID_LINK_ID: + case WLAN_EID_BSS_MAX_IDLE_PERIOD: /* * not listing WLAN_EID_CHANNEL_SWITCH_WRAPPER -- it seems possible * that if the content gets bigger it might be needed more than once @@ -1089,6 +1090,10 @@ u32 ieee802_11_parse_elems_crc(const u8 *start, size_t len, bool action, else elem_parse_failed = true; break; + case WLAN_EID_BSS_MAX_IDLE_PERIOD: + if (elen >= sizeof(*elems->max_idle_period_ie)) + 
elems->max_idle_period_ie = (void *)pos; + break; default: break; } @@ -1590,14 +1595,14 @@ u32 ieee80211_sta_get_rates(struct ieee80211_sub_if_data *sdata, size_t num_rates; u32 supp_rates, rate_flags; int i, j, shift; + sband = sdata->local->hw.wiphy->bands[band]; + if (WARN_ON(!sband)) + return 1; rate_flags = ieee80211_chandef_rate_flags(&sdata->vif.bss_conf.chandef); shift = ieee80211_vif_get_shift(&sdata->vif); - if (WARN_ON(!sband)) - return 1; - num_rates = sband->n_bitrates; supp_rates = 0; for (i = 0; i < elems->supp_rates_len + @@ -1983,6 +1988,10 @@ int ieee80211_reconfig(struct ieee80211_local *local) if (sdata->u.mgd.have_beacon) changed |= BSS_CHANGED_BEACON_INFO; + if (sdata->vif.bss_conf.max_idle_period || + sdata->vif.bss_conf.protected_keep_alive) + changed |= BSS_CHANGED_KEEP_ALIVE; + sdata_lock(sdata); ieee80211_bss_info_change_notify(sdata, changed); sdata_unlock(sdata); @@ -2103,7 +2112,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) mutex_unlock(&local->mtx); if (sched_scan_stopped) - cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy); + cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy, 0); wake_up: if (local->in_reconfig) { @@ -2413,13 +2422,13 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, *pos++ = WLAN_EID_VHT_OPERATION; *pos++ = sizeof(struct ieee80211_vht_operation); vht_oper = (struct ieee80211_vht_operation *)pos; - vht_oper->center_freq_seg1_idx = ieee80211_frequency_to_channel( + vht_oper->center_freq_seg0_idx = ieee80211_frequency_to_channel( chandef->center_freq1); if (chandef->center_freq2) - vht_oper->center_freq_seg2_idx = + vht_oper->center_freq_seg1_idx = ieee80211_frequency_to_channel(chandef->center_freq2); else - vht_oper->center_freq_seg2_idx = 0x00; + vht_oper->center_freq_seg1_idx = 0x00; switch (chandef->width) { case NL80211_CHAN_WIDTH_160: @@ -2428,11 +2437,11 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap, * workaround. */ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ; - vht_oper->center_freq_seg2_idx = vht_oper->center_freq_seg1_idx; + vht_oper->center_freq_seg1_idx = vht_oper->center_freq_seg0_idx; if (chandef->chan->center_freq < chandef->center_freq1) - vht_oper->center_freq_seg1_idx -= 8; + vht_oper->center_freq_seg0_idx -= 8; else - vht_oper->center_freq_seg1_idx += 8; + vht_oper->center_freq_seg0_idx += 8; break; case NL80211_CHAN_WIDTH_80P80: /* @@ -2491,9 +2500,9 @@ bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper, if (!oper) return false; - cf1 = ieee80211_channel_to_frequency(oper->center_freq_seg1_idx, + cf1 = ieee80211_channel_to_frequency(oper->center_freq_seg0_idx, chandef->chan->band); - cf2 = ieee80211_channel_to_frequency(oper->center_freq_seg2_idx, + cf2 = ieee80211_channel_to_frequency(oper->center_freq_seg1_idx, chandef->chan->band); switch (oper->chan_width) { @@ -2503,11 +2512,11 @@ bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper, new.width = NL80211_CHAN_WIDTH_80; new.center_freq1 = cf1; /* If needed, adjust based on the newer interop workaround. 
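The seg1/seg2 to seg0/seg1 rename in the VHT operation hunks above aligns the field names with the 802.11 numbering: segment 0 carries the center of the (primary) 80 MHz segment, segment 1 carries the second center used by the backward-compatible 160 MHz encoding and by 80+80 MHz. Here is a simplified sketch of the width decode that ieee80211_chandef_vht_oper() performs; the names are illustrative, and treating offsets larger than 8 as 80+80 follows the branch of that function not shown in the hunk below.

#include <stdlib.h>

enum width { W_80, W_160, W_80P80 };

/* seg0: center of the 80 MHz segment; seg1: 0, or a second center */
static enum width vht_width_from_oper(unsigned int seg0, unsigned int seg1)
{
	unsigned int diff;

	if (!seg1)
		return W_80;

	diff = abs((int)seg1 - (int)seg0);
	if (diff == 8)
		return W_160;	/* seg1 is the 160 MHz center */
	if (diff > 8)
		return W_80P80;	/* two non-contiguous 80 MHz segments */
	return W_80;		/* implausible seg1: keep plain 80 MHz */
}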
*/ - if (oper->center_freq_seg2_idx) { + if (oper->center_freq_seg1_idx) { unsigned int diff; - diff = abs(oper->center_freq_seg2_idx - - oper->center_freq_seg1_idx); + diff = abs(oper->center_freq_seg1_idx - + oper->center_freq_seg0_idx); if (diff == 8) { new.width = NL80211_CHAN_WIDTH_160; new.center_freq1 = cf2; @@ -2715,42 +2724,39 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, memset(&ri, 0, sizeof(ri)); /* Fill cfg80211 rate info */ - if (status->flag & RX_FLAG_HT) { + switch (status->encoding) { + case RX_ENC_HT: ri.mcs = status->rate_idx; ri.flags |= RATE_INFO_FLAGS_MCS; - if (status->flag & RX_FLAG_40MHZ) - ri.bw = RATE_INFO_BW_40; - else - ri.bw = RATE_INFO_BW_20; - if (status->flag & RX_FLAG_SHORT_GI) + ri.bw = status->bw; + if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; - } else if (status->flag & RX_FLAG_VHT) { + break; + case RX_ENC_VHT: ri.flags |= RATE_INFO_FLAGS_VHT_MCS; ri.mcs = status->rate_idx; - ri.nss = status->vht_nss; - if (status->flag & RX_FLAG_40MHZ) - ri.bw = RATE_INFO_BW_40; - else if (status->vht_flag & RX_VHT_FLAG_80MHZ) - ri.bw = RATE_INFO_BW_80; - else if (status->vht_flag & RX_VHT_FLAG_160MHZ) - ri.bw = RATE_INFO_BW_160; - else - ri.bw = RATE_INFO_BW_20; - if (status->flag & RX_FLAG_SHORT_GI) + ri.nss = status->nss; + ri.bw = status->bw; + if (status->enc_flags & RX_ENC_FLAG_SHORT_GI) ri.flags |= RATE_INFO_FLAGS_SHORT_GI; - } else { + break; + default: + WARN_ON(1); + /* fall through */ + case RX_ENC_LEGACY: { struct ieee80211_supported_band *sband; int shift = 0; int bitrate; - if (status->flag & RX_FLAG_10MHZ) { + ri.bw = status->bw; + + switch (status->bw) { + case RATE_INFO_BW_10: shift = 1; - ri.bw = RATE_INFO_BW_10; - } else if (status->flag & RX_FLAG_5MHZ) { + break; + case RATE_INFO_BW_5: shift = 2; - ri.bw = RATE_INFO_BW_5; - } else { - ri.bw = RATE_INFO_BW_20; + break; } sband = local->hw.wiphy->bands[status->band]; @@ -2762,19 +2768,21 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local, if (status->band == NL80211_BAND_5GHZ) { ts += 20 << shift; mpdu_offset += 2; - } else if (status->flag & RX_FLAG_SHORTPRE) { + } else if (status->enc_flags & RX_ENC_FLAG_SHORTPRE) { ts += 96; } else { ts += 192; } } + break; + } } rate = cfg80211_calculate_bitrate(&ri); if (WARN_ONCE(!rate, "Invalid bitrate: flags=0x%llx, idx=%d, vht_nss=%d\n", (unsigned long long)status->flag, status->rate_idx, - status->vht_nss)) + status->nss)) return 0; /* rewind from end of MPDU */ @@ -2791,8 +2799,10 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local) struct ieee80211_sub_if_data *sdata; struct cfg80211_chan_def chandef; + /* for interface list, to avoid linking iflist_mtx and chanctx_mtx */ + ASSERT_RTNL(); + mutex_lock(&local->mtx); - mutex_lock(&local->iflist_mtx); list_for_each_entry(sdata, &local->interfaces, list) { /* it might be waiting for the local->mtx, but then * by the time it gets it, sdata->wdev.cac_started @@ -2809,7 +2819,6 @@ void ieee80211_dfs_cac_cancel(struct ieee80211_local *local) GFP_KERNEL); } } - mutex_unlock(&local->iflist_mtx); mutex_unlock(&local->mtx); } @@ -2831,7 +2840,9 @@ void ieee80211_dfs_radar_detected_work(struct work_struct *work) } mutex_unlock(&local->chanctx_mtx); + rtnl_lock(); ieee80211_dfs_cac_cancel(local); + rtnl_unlock(); if (num_chanctx > 1) /* XXX: multi-channel is not supported yet */ @@ -2846,7 +2857,7 @@ void ieee80211_radar_detected(struct ieee80211_hw *hw) trace_api_radar_detected(local); - ieee80211_queue_work(hw, 
&local->radar_detected_work); + schedule_work(&local->radar_detected_work); } EXPORT_SYMBOL(ieee80211_radar_detected); diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 56ccffa3f2bfc7731adfaabb1026ef7e8af68d32..62141dcec2d66020fb4eb7bb698e880809878028 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -19,6 +19,7 @@ #ifndef __IEEE802154_I_H #define __IEEE802154_I_H +#include #include #include #include diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index 6414079aa7297eee8fbd6320fb4392c6d8865e34..257ec66009da2dd7010d063c0ad29dbb9a32564e 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -24,6 +24,9 @@ #include #include "internal.h" +/* max memory we will use for mpls_route */ +#define MAX_MPLS_ROUTE_MEM 4096 + /* Maximum number of labels to look ahead at when selecting a path of * a multipath route */ @@ -32,7 +35,9 @@ #define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1) static int zero = 0; +static int one = 1; static int label_limit = (1 << 20) - 1; +static int ttl_max = 255; static void rtmsg_lfib(int event, u32 label, struct mpls_route *rt, struct nlmsghdr *nlh, struct net *net, u32 portid, @@ -58,10 +63,7 @@ EXPORT_SYMBOL_GPL(mpls_output_possible); static u8 *__mpls_nh_via(struct mpls_route *rt, struct mpls_nh *nh) { - u8 *nh0_via = PTR_ALIGN((u8 *)&rt->rt_nh[rt->rt_nhn], VIA_ALEN_ALIGN); - int nh_index = nh - rt->rt_nh; - - return nh0_via + rt->rt_max_alen * nh_index; + return (u8 *)nh + rt->rt_via_offset; } static const u8 *mpls_nh_via(const struct mpls_route *rt, @@ -187,21 +189,32 @@ static u32 mpls_multipath_hash(struct mpls_route *rt, struct sk_buff *skb) return hash; } +static struct mpls_nh *mpls_get_nexthop(struct mpls_route *rt, u8 index) +{ + return (struct mpls_nh *)((u8 *)rt->rt_nh + index * rt->rt_nh_size); +} + +/* number of alive nexthops (rt->rt_nhn_alive) and the flags for + * a next hop (nh->nh_flags) are modified by netdev event handlers. + * Since those fields can change at any moment, use READ_ONCE to + * access both. 
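The comment above states the concurrency contract for the MPLS rework that follows: nh_flags and rt_nhn_alive are written by netdev event handlers under the rtnl lock while the packet path reads them locklessly under RCU, so every access must be a single non-torn load or store. In portable C11 terms, READ_ONCE/WRITE_ONCE used this way behave like relaxed atomic accesses; here is a userspace analogue with illustrative names and flag values.

#include <stdatomic.h>

#define NH_DEAD		0x1
#define NH_LINKDOWN	0x2

struct nexthop {
	/* written only under the writer's lock, read locklessly */
	_Atomic unsigned int nh_flags;
};

/* packet path: take one snapshot and decide on it */
static int nh_is_usable(struct nexthop *nh)
{
	unsigned int flags =
		atomic_load_explicit(&nh->nh_flags, memory_order_relaxed);

	return !(flags & (NH_DEAD | NH_LINKDOWN));
}

/* event handler: compute on a local copy, publish with one store */
static void nh_mark_linkdown(struct nexthop *nh)
{
	unsigned int flags =
		atomic_load_explicit(&nh->nh_flags, memory_order_relaxed);

	flags |= NH_LINKDOWN;
	atomic_store_explicit(&nh->nh_flags, flags, memory_order_relaxed);
}

Reading the flags into a local first is what the reworked mpls_select_multipath() below does: the dead/linkdown test and any later use are guaranteed to see the same value.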
+ */ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, struct sk_buff *skb) { - int alive = ACCESS_ONCE(rt->rt_nhn_alive); u32 hash = 0; int nh_index = 0; int n = 0; + u8 alive; /* No need to look further into packet if there's only * one path */ if (rt->rt_nhn == 1) - goto out; + return rt->rt_nh; - if (alive <= 0) + alive = READ_ONCE(rt->rt_nhn_alive); + if (alive == 0) return NULL; hash = mpls_multipath_hash(rt, skb); @@ -209,7 +222,9 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, if (alive == rt->rt_nhn) goto out; for_nexthops(rt) { - if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) + unsigned int nh_flags = READ_ONCE(nh->nh_flags); + + if (nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN)) continue; if (n == nh_index) return nh; @@ -217,11 +232,11 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt, } endfor_nexthops(rt); out: - return &rt->rt_nh[nh_index]; + return mpls_get_nexthop(rt, nh_index); } -static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, - struct mpls_entry_decoded dec) +static bool mpls_egress(struct net *net, struct mpls_route *rt, + struct sk_buff *skb, struct mpls_entry_decoded dec) { enum mpls_payload_type payload_type; bool success = false; @@ -246,22 +261,46 @@ static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb, switch (payload_type) { case MPT_IPV4: { struct iphdr *hdr4 = ip_hdr(skb); + u8 new_ttl; skb->protocol = htons(ETH_P_IP); + + /* If propagating TTL, take the decremented TTL from + * the incoming MPLS header, otherwise decrement the + * TTL, but only if not 0 to avoid underflow. + */ + if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED || + (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT && + net->mpls.ip_ttl_propagate)) + new_ttl = dec.ttl; + else + new_ttl = hdr4->ttl ? hdr4->ttl - 1 : 0; + csum_replace2(&hdr4->check, htons(hdr4->ttl << 8), - htons(dec.ttl << 8)); - hdr4->ttl = dec.ttl; + htons(new_ttl << 8)); + hdr4->ttl = new_ttl; success = true; break; } case MPT_IPV6: { struct ipv6hdr *hdr6 = ipv6_hdr(skb); skb->protocol = htons(ETH_P_IPV6); - hdr6->hop_limit = dec.ttl; + + /* If propagating TTL, take the decremented TTL from + * the incoming MPLS header, otherwise decrement the + * hop limit, but only if not 0 to avoid underflow. 
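The IPv4 hunk above and the IPv6 hunk that follows apply the same three-way rule, so it is worth stating once: an explicit per-route setting wins; a route left at the default defers to the per-namespace ip_ttl_propagate sysctl; and when propagation is off, the inner IP TTL (or hop limit) is decremented in place rather than overwritten with the MPLS TTL. A compact sketch with illustrative names:

#include <stdbool.h>

enum ttl_prop { TTL_PROP_DEFAULT, TTL_PROP_ENABLED, TTL_PROP_DISABLED };

/* returns the TTL to write into the exposed IP header on egress */
static unsigned char egress_ttl(enum ttl_prop rt_prop, bool net_prop,
				unsigned char mpls_ttl, unsigned char ip_ttl)
{
	if (rt_prop == TTL_PROP_ENABLED ||
	    (rt_prop == TTL_PROP_DEFAULT && net_prop))
		return mpls_ttl;	/* take the decremented MPLS TTL */

	return ip_ttl ? ip_ttl - 1 : 0;	/* decrement, avoid underflow */
}

For example, egress_ttl(TTL_PROP_DEFAULT, true, 42, 64) yields 42, while the same call with net_prop == false yields 63.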
+ */ + if (rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED || + (rt->rt_ttl_propagate == MPLS_TTL_PROP_DEFAULT && + net->mpls.ip_ttl_propagate)) + hdr6->hop_limit = dec.ttl; + else if (hdr6->hop_limit) + hdr6->hop_limit = hdr6->hop_limit - 1; success = true; break; } case MPT_UNSPEC: + /* Should have decided which protocol it is by now */ break; } @@ -361,7 +400,7 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev, if (unlikely(!new_header_size && dec.bos)) { /* Penultimate hop popping */ - if (!mpls_egress(rt, skb, dec)) + if (!mpls_egress(dev_net(out_dev), rt, skb, dec)) goto err; } else { bool bos; @@ -412,6 +451,7 @@ static struct packet_type mpls_packet_type __read_mostly = { static const struct nla_policy rtm_mpls_policy[RTA_MAX+1] = { [RTA_DST] = { .type = NLA_U32 }, [RTA_OIF] = { .type = NLA_U32 }, + [RTA_TTL_PROPAGATE] = { .type = NLA_U8 }, }; struct mpls_route_config { @@ -421,6 +461,7 @@ struct mpls_route_config { u8 rc_via_alen; u8 rc_via[MAX_VIA_ALEN]; u32 rc_label; + u8 rc_ttl_propagate; u8 rc_output_labels; u32 rc_output_label[MAX_NEW_LABELS]; u32 rc_nlflags; @@ -430,20 +471,27 @@ struct mpls_route_config { int rc_mp_len; }; -static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen) +/* all nexthops within a route have the same size based on max + * number of labels and max via length for a hop + */ +static struct mpls_route *mpls_rt_alloc(u8 num_nh, u8 max_alen, u8 max_labels) { - u8 max_alen_aligned = ALIGN(max_alen, VIA_ALEN_ALIGN); + u8 nh_size = MPLS_NH_SIZE(max_labels, max_alen); struct mpls_route *rt; + size_t size; - rt = kzalloc(ALIGN(sizeof(*rt) + num_nh * sizeof(*rt->rt_nh), - VIA_ALEN_ALIGN) + - num_nh * max_alen_aligned, - GFP_KERNEL); - if (rt) { - rt->rt_nhn = num_nh; - rt->rt_nhn_alive = num_nh; - rt->rt_max_alen = max_alen_aligned; - } + size = sizeof(*rt) + num_nh * nh_size; + if (size > MAX_MPLS_ROUTE_MEM) + return ERR_PTR(-EINVAL); + + rt = kzalloc(size, GFP_KERNEL); + if (!rt) + return ERR_PTR(-ENOMEM); + + rt->rt_nhn = num_nh; + rt->rt_nhn_alive = num_nh; + rt->rt_nh_size = nh_size; + rt->rt_via_offset = MPLS_NH_VIA_OFF(max_labels); return rt; } @@ -648,9 +696,6 @@ static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg, return -ENOMEM; err = -EINVAL; - /* Ensure only a supported number of labels are present */ - if (cfg->rc_output_labels > MAX_NEW_LABELS) - goto errout; nh->nh_labels = cfg->rc_output_labels; for (i = 0; i < nh->nh_labels; i++) @@ -675,7 +720,7 @@ static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg, static int mpls_nh_build(struct net *net, struct mpls_route *rt, struct mpls_nh *nh, int oif, struct nlattr *via, - struct nlattr *newdst) + struct nlattr *newdst, u8 max_labels) { int err = -ENOMEM; @@ -683,7 +728,7 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt, goto errout; if (newdst) { - err = nla_get_labels(newdst, MAX_NEW_LABELS, + err = nla_get_labels(newdst, max_labels, &nh->nh_labels, nh->nh_label); if (err) goto errout; @@ -708,22 +753,20 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt, return err; } -static int mpls_count_nexthops(struct rtnexthop *rtnh, int len, - u8 cfg_via_alen, u8 *max_via_alen) +static u8 mpls_count_nexthops(struct rtnexthop *rtnh, int len, + u8 cfg_via_alen, u8 *max_via_alen, + u8 *max_labels) { - int nhs = 0; int remaining = len; - - if (!rtnh) { - *max_via_alen = cfg_via_alen; - return 1; - } + u8 nhs = 0; *max_via_alen = 0; + *max_labels = 0; while (rtnh_ok(rtnh, remaining)) { struct nlattr *nla, *attrs = 
rtnh_attrs(rtnh); int attrlen; + u8 n_labels = 0; attrlen = rtnh_attrlen(rtnh); nla = nla_find(attrs, attrlen, RTA_VIA); @@ -737,7 +780,20 @@ static int mpls_count_nexthops(struct rtnexthop *rtnh, int len, via_alen); } + nla = nla_find(attrs, attrlen, RTA_NEWDST); + if (nla && + nla_get_labels(nla, MAX_NEW_LABELS, &n_labels, NULL) != 0) + return 0; + + *max_labels = max_t(u8, *max_labels, n_labels); + + /* number of nexthops is tracked by a u8. + * Check for overflow. + */ + if (nhs == 255) + return 0; nhs++; + rtnh = rtnh_next(rtnh, &remaining); } @@ -746,13 +802,13 @@ static int mpls_count_nexthops(struct rtnexthop *rtnh, int len, } static int mpls_nh_build_multi(struct mpls_route_config *cfg, - struct mpls_route *rt) + struct mpls_route *rt, u8 max_labels) { struct rtnexthop *rtnh = cfg->rc_mp; struct nlattr *nla_via, *nla_newdst; int remaining = cfg->rc_mp_len; - int nhs = 0; int err = 0; + u8 nhs = 0; change_nexthops(rt) { int attrlen; @@ -779,7 +835,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg, } err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh, - rtnh->rtnh_ifindex, nla_via, nla_newdst); + rtnh->rtnh_ifindex, nla_via, nla_newdst, + max_labels); if (err) goto errout; @@ -806,7 +863,8 @@ static int mpls_route_add(struct mpls_route_config *cfg) int err = -EINVAL; u8 max_via_alen; unsigned index; - int nhs; + u8 max_labels; + u8 nhs; index = cfg->rc_label; @@ -844,21 +902,32 @@ static int mpls_route_add(struct mpls_route_config *cfg) goto errout; err = -EINVAL; - nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len, - cfg->rc_via_alen, &max_via_alen); + if (cfg->rc_mp) { + nhs = mpls_count_nexthops(cfg->rc_mp, cfg->rc_mp_len, + cfg->rc_via_alen, &max_via_alen, + &max_labels); + } else { + max_via_alen = cfg->rc_via_alen; + max_labels = cfg->rc_output_labels; + nhs = 1; + } + if (nhs == 0) goto errout; err = -ENOMEM; - rt = mpls_rt_alloc(nhs, max_via_alen); - if (!rt) + rt = mpls_rt_alloc(nhs, max_via_alen, max_labels); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); goto errout; + } rt->rt_protocol = cfg->rc_protocol; rt->rt_payload_type = cfg->rc_payload_type; + rt->rt_ttl_propagate = cfg->rc_ttl_propagate; if (cfg->rc_mp) - err = mpls_nh_build_multi(cfg, rt); + err = mpls_nh_build_multi(cfg, rt, max_labels); else err = mpls_nh_build_from_cfg(cfg, rt); if (err) @@ -1011,8 +1080,8 @@ static int mpls_netconf_msgsize_devconf(int type) return size; } -static void mpls_netconf_notify_devconf(struct net *net, int type, - struct mpls_dev *mdev) +static void mpls_netconf_notify_devconf(struct net *net, int event, + int type, struct mpls_dev *mdev) { struct sk_buff *skb; int err = -ENOBUFS; @@ -1021,8 +1090,7 @@ static void mpls_netconf_notify_devconf(struct net *net, int type, if (!skb) goto errout; - err = mpls_netconf_fill_devconf(skb, mdev, 0, 0, RTM_NEWNETCONF, - 0, type); + err = mpls_netconf_fill_devconf(skb, mdev, 0, 0, event, 0, type); if (err < 0) { /* -EMSGSIZE implies BUG in mpls_netconf_msgsize_devconf() */ WARN_ON(err == -EMSGSIZE); @@ -1042,7 +1110,8 @@ static const struct nla_policy devconf_mpls_policy[NETCONFA_MAX + 1] = { }; static int mpls_netconf_get_devconf(struct sk_buff *in_skb, - struct nlmsghdr *nlh) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(in_skb->sk); struct nlattr *tb[NETCONFA_MAX + 1]; @@ -1054,7 +1123,7 @@ static int mpls_netconf_get_devconf(struct sk_buff *in_skb, int err; err = nlmsg_parse(nlh, sizeof(*ncm), tb, NETCONFA_MAX, - devconf_mpls_policy); + devconf_mpls_policy, NULL); if (err < 0) goto 
errout; @@ -1155,9 +1224,8 @@ static int mpls_conf_proc(struct ctl_table *ctl, int write, if (i == offsetof(struct mpls_dev, input_enabled) && val != oval) { - mpls_netconf_notify_devconf(net, - NETCONFA_INPUT, - mdev); + mpls_netconf_notify_devconf(net, RTM_NEWNETCONF, + NETCONFA_INPUT, mdev); } } @@ -1198,10 +1266,11 @@ static int mpls_dev_sysctl_register(struct net_device *dev, snprintf(path, sizeof(path), "net/mpls/conf/%s", dev->name); - mdev->sysctl = register_net_sysctl(dev_net(dev), path, table); + mdev->sysctl = register_net_sysctl(net, path, table); if (!mdev->sysctl) goto free; + mpls_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_ALL, mdev); return 0; free: @@ -1210,13 +1279,17 @@ static int mpls_dev_sysctl_register(struct net_device *dev, return -ENOBUFS; } -static void mpls_dev_sysctl_unregister(struct mpls_dev *mdev) +static void mpls_dev_sysctl_unregister(struct net_device *dev, + struct mpls_dev *mdev) { + struct net *net = dev_net(dev); struct ctl_table *table; table = mdev->sysctl->ctl_table_arg; unregister_net_sysctl_table(mdev->sysctl); kfree(table); + + mpls_netconf_notify_devconf(net, RTM_DELNETCONF, 0, mdev); } static struct mpls_dev *mpls_add_dev(struct net_device *dev) @@ -1242,11 +1315,12 @@ static struct mpls_dev *mpls_add_dev(struct net_device *dev) u64_stats_init(&mpls_stats->syncp); } + mdev->dev = dev; + err = mpls_dev_sysctl_register(dev, mdev); if (err) goto free; - mdev->dev = dev; rcu_assign_pointer(dev->mpls_ptr, mdev); return mdev; @@ -1269,8 +1343,7 @@ static void mpls_ifdown(struct net_device *dev, int event) { struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); - unsigned int nh_flags = RTNH_F_DEAD | RTNH_F_LINKDOWN; - unsigned int alive; + u8 alive, deleted; unsigned index; platform_label = rtnl_dereference(net->mpls.platform_label); @@ -1281,36 +1354,48 @@ static void mpls_ifdown(struct net_device *dev, int event) continue; alive = 0; + deleted = 0; change_nexthops(rt) { + unsigned int nh_flags = nh->nh_flags; + if (rtnl_dereference(nh->nh_dev) != dev) goto next; switch (event) { case NETDEV_DOWN: case NETDEV_UNREGISTER: - nh->nh_flags |= RTNH_F_DEAD; + nh_flags |= RTNH_F_DEAD; /* fall through */ case NETDEV_CHANGE: - nh->nh_flags |= RTNH_F_LINKDOWN; + nh_flags |= RTNH_F_LINKDOWN; break; } if (event == NETDEV_UNREGISTER) RCU_INIT_POINTER(nh->nh_dev, NULL); + + if (nh->nh_flags != nh_flags) + WRITE_ONCE(nh->nh_flags, nh_flags); next: - if (!(nh->nh_flags & nh_flags)) + if (!(nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))) alive++; + if (!rtnl_dereference(nh->nh_dev)) + deleted++; } endfor_nexthops(rt); WRITE_ONCE(rt->rt_nhn_alive, alive); + + /* if there are no more nexthops, delete the route */ + if (event == NETDEV_UNREGISTER && deleted == rt->rt_nhn) + mpls_route_update(net, index, NULL, NULL); } } -static void mpls_ifup(struct net_device *dev, unsigned int nh_flags) +static void mpls_ifup(struct net_device *dev, unsigned int flags) { struct mpls_route __rcu **platform_label; struct net *net = dev_net(dev); unsigned index; - int alive; + u8 alive; platform_label = rtnl_dereference(net->mpls.platform_label); for (index = 0; index < net->mpls.platform_labels; index++) { @@ -1321,20 +1406,22 @@ static void mpls_ifup(struct net_device *dev, unsigned int nh_flags) alive = 0; change_nexthops(rt) { + unsigned int nh_flags = nh->nh_flags; struct net_device *nh_dev = rtnl_dereference(nh->nh_dev); - if (!(nh->nh_flags & nh_flags)) { + if (!(nh_flags & flags)) { alive++; continue; } if (nh_dev != dev) continue; alive++; - 
nh->nh_flags &= ~nh_flags; + nh_flags &= ~flags; + WRITE_ONCE(nh->nh_flags, nh_flags); } endfor_nexthops(rt); - ACCESS_ONCE(rt->rt_nhn_alive) = alive; + WRITE_ONCE(rt->rt_nhn_alive, alive); } } @@ -1385,7 +1472,7 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, mpls_ifdown(dev, event); mdev = mpls_dev_get(dev); if (mdev) { - mpls_dev_sysctl_unregister(mdev); + mpls_dev_sysctl_unregister(dev, mdev); RCU_INIT_POINTER(dev->mpls_ptr, NULL); call_rcu(&mdev->rcu, mpls_dev_destroy_rcu); } @@ -1395,7 +1482,7 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event, if (mdev) { int err; - mpls_dev_sysctl_unregister(mdev); + mpls_dev_sysctl_unregister(dev, mdev); err = mpls_dev_sysctl_register(dev, mdev); if (err) return notifier_from_errno(err); @@ -1455,16 +1542,18 @@ int nla_put_labels(struct sk_buff *skb, int attrtype, EXPORT_SYMBOL_GPL(nla_put_labels); int nla_get_labels(const struct nlattr *nla, - u32 max_labels, u8 *labels, u32 label[]) + u8 max_labels, u8 *labels, u32 label[]) { unsigned len = nla_len(nla); - unsigned nla_labels; struct mpls_shim_hdr *nla_label; + u8 nla_labels; bool bos; int i; - /* len needs to be an even multiple of 4 (the label size) */ - if (len & 3) + /* len needs to be an even multiple of 4 (the label size). Number + * of labels is a u8 so check for overflow. + */ + if (len & 3 || len / 4 > 255) return -EINVAL; /* Limit the number of new labels allowed */ @@ -1472,6 +1561,10 @@ int nla_get_labels(const struct nlattr *nla, if (nla_labels > max_labels) return -EINVAL; + /* when label == NULL, caller wants number of labels */ + if (!label) + goto out; + nla_label = nla_data(nla); bos = true; for (i = nla_labels - 1; i >= 0; i--, bos = false) { @@ -1495,6 +1588,7 @@ int nla_get_labels(const struct nlattr *nla, label[i] = dec.label; } +out: *labels = nla_labels; return 0; } @@ -1550,13 +1644,13 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, int index; int err; - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy); + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_mpls_policy, + NULL); if (err < 0) goto errout; err = -EINVAL; rtm = nlmsg_data(nlh); - memset(cfg, 0, sizeof(*cfg)); if (rtm->rtm_family != AF_MPLS) goto errout; @@ -1584,6 +1678,7 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->rc_label = LABEL_NOT_SPECIFIED; cfg->rc_protocol = rtm->rtm_protocol; cfg->rc_via_table = MPLS_NEIGH_TABLE_UNSPEC; + cfg->rc_ttl_propagate = MPLS_TTL_PROP_DEFAULT; cfg->rc_nlflags = nlh->nlmsg_flags; cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid; cfg->rc_nlinfo.nlh = nlh; @@ -1630,6 +1725,17 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, cfg->rc_mp_len = nla_len(nla); break; } + case RTA_TTL_PROPAGATE: + { + u8 ttl_propagate = nla_get_u8(nla); + + if (ttl_propagate > 1) + goto errout; + cfg->rc_ttl_propagate = ttl_propagate ?
+ MPLS_TTL_PROP_ENABLED : + MPLS_TTL_PROP_DISABLED; + break; + } default: /* Unsupported attribute */ goto errout; @@ -1641,29 +1747,47 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh, return err; } -static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh) +static int mpls_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - struct mpls_route_config cfg; + struct mpls_route_config *cfg; int err; - err = rtm_to_route_config(skb, nlh, &cfg); + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return -ENOMEM; + + err = rtm_to_route_config(skb, nlh, cfg); if (err < 0) - return err; + goto out; + + err = mpls_route_del(cfg); +out: + kfree(cfg); - return mpls_route_del(&cfg); + return err; } -static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh) +static int mpls_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { - struct mpls_route_config cfg; + struct mpls_route_config *cfg; int err; - err = rtm_to_route_config(skb, nlh, &cfg); + cfg = kzalloc(sizeof(*cfg), GFP_KERNEL); + if (!cfg) + return -ENOMEM; + + err = rtm_to_route_config(skb, nlh, cfg); if (err < 0) - return err; + goto out; + + err = mpls_route_add(cfg); +out: + kfree(cfg); - return mpls_route_add(&cfg); + return err; } static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, @@ -1690,6 +1814,15 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, if (nla_put_labels(skb, RTA_DST, 1, &label)) goto nla_put_failure; + + if (rt->rt_ttl_propagate != MPLS_TTL_PROP_DEFAULT) { + bool ttl_propagate = + rt->rt_ttl_propagate == MPLS_TTL_PROP_ENABLED; + + if (nla_put_u8(skb, RTA_TTL_PROPAGATE, + ttl_propagate)) + goto nla_put_failure; + } if (rt->rt_nhn == 1) { const struct mpls_nh *nh = rt->rt_nh; @@ -1711,21 +1844,23 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event, } else { struct rtnexthop *rtnh; struct nlattr *mp; - int dead = 0; - int linkdown = 0; + u8 linkdown = 0; + u8 dead = 0; mp = nla_nest_start(skb, RTA_MULTIPATH); if (!mp) goto nla_put_failure; for_nexthops(rt) { + dev = rtnl_dereference(nh->nh_dev); + if (!dev) + continue; + rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh)); if (!rtnh) goto nla_put_failure; - dev = rtnl_dereference(nh->nh_dev); - if (dev) - rtnh->rtnh_ifindex = dev->ifindex; + rtnh->rtnh_ifindex = dev->ifindex; if (nh->nh_flags & RTNH_F_LINKDOWN) { rtnh->rtnh_flags |= RTNH_F_LINKDOWN; linkdown++; @@ -1800,7 +1935,8 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt) { size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg)) - + nla_total_size(4); /* RTA_DST */ + + nla_total_size(4) /* RTA_DST */ + + nla_total_size(1); /* RTA_TTL_PROPAGATE */ if (rt->rt_nhn == 1) { struct mpls_nh *nh = rt->rt_nh; @@ -1816,6 +1952,8 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt) size_t nhsize = 0; for_nexthops(rt) { + if (!rtnl_dereference(nh->nh_dev)) + continue; nhsize += nla_total_size(sizeof(struct rtnexthop)); /* RTA_VIA */ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) @@ -1867,10 +2005,7 @@ static int resize_platform_label_table(struct net *net, size_t limit) unsigned index; if (size) { - labels = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); - if (!labels) - labels = vzalloc(size); - + labels = kvzalloc(size, GFP_KERNEL); if (!labels) goto nolabels; } @@ -1878,12 +2013,13 @@ static int resize_platform_label_table(struct net *net, size_t limit) /* In case the 
predefined labels need to be populated */ if (limit > MPLS_LABEL_IPV4NULL) { struct net_device *lo = net->loopback_dev; - rt0 = mpls_rt_alloc(1, lo->addr_len); - if (!rt0) + rt0 = mpls_rt_alloc(1, lo->addr_len, 0); + if (IS_ERR(rt0)) goto nort0; RCU_INIT_POINTER(rt0->rt_nh->nh_dev, lo); rt0->rt_protocol = RTPROT_KERNEL; rt0->rt_payload_type = MPT_IPV4; + rt0->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; rt0->rt_nh->nh_via_table = NEIGH_LINK_TABLE; rt0->rt_nh->nh_via_alen = lo->addr_len; memcpy(__mpls_nh_via(rt0, rt0->rt_nh), lo->dev_addr, @@ -1891,12 +2027,13 @@ static int resize_platform_label_table(struct net *net, size_t limit) } if (limit > MPLS_LABEL_IPV6NULL) { struct net_device *lo = net->loopback_dev; - rt2 = mpls_rt_alloc(1, lo->addr_len); - if (!rt2) + rt2 = mpls_rt_alloc(1, lo->addr_len, 0); + if (IS_ERR(rt2)) goto nort2; RCU_INIT_POINTER(rt2->rt_nh->nh_dev, lo); rt2->rt_protocol = RTPROT_KERNEL; rt2->rt_payload_type = MPT_IPV6; + rt2->rt_ttl_propagate = MPLS_TTL_PROP_DEFAULT; rt2->rt_nh->nh_via_table = NEIGH_LINK_TABLE; rt2->rt_nh->nh_via_alen = lo->addr_len; memcpy(__mpls_nh_via(rt2, rt2->rt_nh), lo->dev_addr, @@ -1978,6 +2115,9 @@ static int mpls_platform_labels(struct ctl_table *table, int write, return ret; } +#define MPLS_NS_SYSCTL_OFFSET(field) \ + (&((struct net *)0)->field) + static const struct ctl_table mpls_table[] = { { .procname = "platform_labels", @@ -1986,21 +2126,47 @@ static const struct ctl_table mpls_table[] = { .mode = 0644, .proc_handler = mpls_platform_labels, }, + { + .procname = "ip_ttl_propagate", + .data = MPLS_NS_SYSCTL_OFFSET(mpls.ip_ttl_propagate), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + .extra2 = &one, + }, + { + .procname = "default_ttl", + .data = MPLS_NS_SYSCTL_OFFSET(mpls.default_ttl), + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &ttl_max, + }, { } }; static int mpls_net_init(struct net *net) { struct ctl_table *table; + int i; net->mpls.platform_labels = 0; net->mpls.platform_label = NULL; + net->mpls.ip_ttl_propagate = 1; + net->mpls.default_ttl = 255; table = kmemdup(mpls_table, sizeof(mpls_table), GFP_KERNEL); if (table == NULL) return -ENOMEM; - table[0].data = net; + /* Table data contains only offsets relative to the base of + * the mdev at this point, so make them absolute. 
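The loop following this comment is the entire mechanism: mpls_table is a shared template whose .data fields hold offsets into struct net (encoded by MPLS_NS_SYSCTL_OFFSET() via null-pointer arithmetic), and each namespace's kmemdup'ed copy rewrites them into real addresses. A self-contained userspace sketch of the same idea; the struct and field names are invented for illustration.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct netns_like {
	int ip_ttl_propagate;
	int default_ttl;
};

struct ctl_entry {
	const char *name;
	void *data;	/* offset in the template, pointer in a copy */
};

#define NS_OFFSET(field) ((void *)offsetof(struct netns_like, field))

static const struct ctl_entry template[2] = {
	{ "ip_ttl_propagate", NS_OFFSET(ip_ttl_propagate) },
	{ "default_ttl",      NS_OFFSET(default_ttl) },
};

/* per-instance copy: turn the stored offsets into absolute pointers */
static void rebase(struct ctl_entry *copy, struct netns_like *ns)
{
	for (size_t i = 0; i < 2; i++)
		copy[i].data = (char *)ns + (uintptr_t)copy[i].data;
}

int main(void)
{
	struct netns_like ns = { .ip_ttl_propagate = 1, .default_ttl = 255 };
	struct ctl_entry copy[2];

	memcpy(copy, template, sizeof(copy));
	rebase(copy, &ns);
	printf("%s = %d\n", copy[1].name, *(int *)copy[1].data);
	return 0;
}

One template can then serve any number of namespaces without the proc handler needing to know which namespace it runs in.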
+ */ + for (i = 0; i < ARRAY_SIZE(mpls_table) - 1; i++) + table[i].data = (char *)net + (uintptr_t)table[i].data; + net->mpls.ctl = register_net_sysctl(net, "net/mpls", table); if (net->mpls.ctl == NULL) { kfree(table); diff --git a/net/mpls/internal.h b/net/mpls/internal.h index 76360d8b95798e148c5d6a48ce3375eeadcad5a5..4db6a59713220dcefac8be970ab06b1ae25bcd5c 100644 --- a/net/mpls/internal.h +++ b/net/mpls/internal.h @@ -2,6 +2,11 @@ #define MPLS_INTERNAL_H #include +/* put a reasonable limit on the number of labels + * we will accept from userspace + */ +#define MAX_NEW_LABELS 30 + struct mpls_entry_decoded { u32 label; u8 ttl; @@ -64,7 +69,6 @@ struct mpls_dev { struct sk_buff; #define LABEL_NOT_SPECIFIED (1 << 20) -#define MAX_NEW_LABELS 2 /* This maximum ha length copied from the definition of struct neighbour */ #define VIA_ALEN_ALIGN sizeof(unsigned long) @@ -83,11 +87,35 @@ enum mpls_payload_type { struct mpls_nh { /* next hop label forwarding entry */ struct net_device __rcu *nh_dev; + + /* nh_flags is accessed under RCU in the packet path; it is + * modified handling netdev events with rtnl lock held + */ unsigned int nh_flags; - u32 nh_label[MAX_NEW_LABELS]; u8 nh_labels; u8 nh_via_alen; u8 nh_via_table; + u8 nh_reserved1; + + u32 nh_label[0]; +}; + +/* offset of via from beginning of mpls_nh */ +#define MPLS_NH_VIA_OFF(num_labels) \ + ALIGN(sizeof(struct mpls_nh) + (num_labels) * sizeof(u32), \ + VIA_ALEN_ALIGN) + +/* all nexthops within a route have the same size based on the + * max number of labels and max via length across all nexthops + */ +#define MPLS_NH_SIZE(num_labels, max_via_alen) \ + (MPLS_NH_VIA_OFF((num_labels)) + \ + ALIGN((max_via_alen), VIA_ALEN_ALIGN)) + +enum mpls_ttl_propagation { + MPLS_TTL_PROP_DEFAULT, + MPLS_TTL_PROP_ENABLED, + MPLS_TTL_PROP_DISABLED, }; /* The route, nexthops and vias are stored together in the same memory @@ -98,16 +126,16 @@ struct mpls_nh { /* next hop label forwarding entry */ * +----------------------+ * | mpls_nh 0 | * +----------------------+ - * | ... | - * +----------------------+ - * | mpls_nh n-1 | - * +----------------------+ - * | alignment padding | + * | alignment padding | 4 bytes for odd number of labels * +----------------------+ * | via[rt_max_alen] 0 | * +----------------------+ + * | alignment padding | via's aligned on sizeof(unsigned long) + * +----------------------+ * | ... 
| * +----------------------+ + * | mpls_nh n-1 | + * +----------------------+ * | via[rt_max_alen] n-1 | * +----------------------+ */ @@ -116,22 +144,30 @@ struct mpls_route { /* next hop label forwarding entry */ u8 rt_protocol; u8 rt_payload_type; u8 rt_max_alen; - unsigned int rt_nhn; - unsigned int rt_nhn_alive; + u8 rt_ttl_propagate; + u8 rt_nhn; + /* rt_nhn_alive is accessed under RCU in the packet path; it + * is modified handling netdev events with rtnl lock held + */ + u8 rt_nhn_alive; + u8 rt_nh_size; + u8 rt_via_offset; + u8 rt_reserved1; struct mpls_nh rt_nh[0]; }; #define for_nexthops(rt) { \ - int nhsel; struct mpls_nh *nh; \ - for (nhsel = 0, nh = (rt)->rt_nh; \ + int nhsel; struct mpls_nh *nh; u8 *__nh; \ + for (nhsel = 0, nh = (rt)->rt_nh, __nh = (u8 *)((rt)->rt_nh); \ nhsel < (rt)->rt_nhn; \ - nh++, nhsel++) + __nh += rt->rt_nh_size, nh = (struct mpls_nh *)__nh, nhsel++) #define change_nexthops(rt) { \ - int nhsel; struct mpls_nh *nh; \ - for (nhsel = 0, nh = (struct mpls_nh *)((rt)->rt_nh); \ + int nhsel; struct mpls_nh *nh; u8 *__nh; \ + for (nhsel = 0, nh = (struct mpls_nh *)((rt)->rt_nh), \ + __nh = (u8 *)((rt)->rt_nh); \ nhsel < (rt)->rt_nhn; \ - nh++, nhsel++) + __nh += rt->rt_nh_size, nh = (struct mpls_nh *)__nh, nhsel++) #define endfor_nexthops(rt) } @@ -166,7 +202,7 @@ static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev) int nla_put_labels(struct sk_buff *skb, int attrtype, u8 labels, const u32 label[]); -int nla_get_labels(const struct nlattr *nla, u32 max_labels, u8 *labels, +int nla_get_labels(const struct nlattr *nla, u8 max_labels, u8 *labels, u32 label[]); int nla_get_via(const struct nlattr *nla, u8 *via_alen, u8 *via_table, u8 via[]); diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c index e4e4424f9eb1f5531d22463687d74c2e2ca971a6..369c7a23c86c1407702917488c90e0f3e7f8ee1a 100644 --- a/net/mpls/mpls_iptunnel.c +++ b/net/mpls/mpls_iptunnel.c @@ -29,6 +29,7 @@ static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = { [MPLS_IPTUNNEL_DST] = { .type = NLA_U32 }, + [MPLS_IPTUNNEL_TTL] = { .type = NLA_U8 }, }; static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en) @@ -49,6 +50,7 @@ static int mpls_xmit(struct sk_buff *skb) struct rtable *rt = NULL; struct rt6_info *rt6 = NULL; struct mpls_dev *out_mdev; + struct net *net; int err = 0; bool bos; int i; @@ -56,17 +58,7 @@ static int mpls_xmit(struct sk_buff *skb) /* Find the output device */ out_dev = dst->dev; - - /* Obtain the ttl */ - if (dst->ops->family == AF_INET) { - ttl = ip_hdr(skb)->ttl; - rt = (struct rtable *)dst; - } else if (dst->ops->family == AF_INET6) { - ttl = ipv6_hdr(skb)->hop_limit; - rt6 = (struct rt6_info *)dst; - } else { - goto drop; - } + net = dev_net(out_dev); skb_orphan(skb); @@ -78,6 +70,38 @@ static int mpls_xmit(struct sk_buff *skb) tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate); + /* Obtain the ttl using the following set of rules. 
+ * + * LWT ttl propagation setting: + * - disabled => use default TTL value from LWT + * - enabled => use TTL value from IPv4/IPv6 header + * - default => + * Global ttl propagation setting: + * - disabled => use default TTL value from global setting + * - enabled => use TTL value from IPv4/IPv6 header + */ + if (dst->ops->family == AF_INET) { + if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) + ttl = tun_encap_info->default_ttl; + else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT && + !net->mpls.ip_ttl_propagate) + ttl = net->mpls.default_ttl; + else + ttl = ip_hdr(skb)->ttl; + rt = (struct rtable *)dst; + } else if (dst->ops->family == AF_INET6) { + if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DISABLED) + ttl = tun_encap_info->default_ttl; + else if (tun_encap_info->ttl_propagate == MPLS_TTL_PROP_DEFAULT && + !net->mpls.ip_ttl_propagate) + ttl = net->mpls.default_ttl; + else + ttl = ipv6_hdr(skb)->hop_limit; + rt6 = (struct rt6_info *)dst; + } else { + goto drop; + } + /* Verify the destination can hold the packet */ new_header_size = mpls_encap_size(tun_encap_info); mtu = mpls_dev_mtu(out_dev); @@ -140,10 +164,11 @@ static int mpls_build_state(struct nlattr *nla, struct mpls_iptunnel_encap *tun_encap_info; struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1]; struct lwtunnel_state *newts; + u8 n_labels; int ret; ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla, - mpls_iptunnel_policy); + mpls_iptunnel_policy, NULL); if (ret < 0) return ret; @@ -151,15 +176,32 @@ static int mpls_build_state(struct nlattr *nla, return -EINVAL; - newts = lwtunnel_state_alloc(sizeof(*tun_encap_info)); + /* determine number of labels */ + if (nla_get_labels(tb[MPLS_IPTUNNEL_DST], + MAX_NEW_LABELS, &n_labels, NULL)) + return -EINVAL; + + newts = lwtunnel_state_alloc(sizeof(*tun_encap_info) + + n_labels * sizeof(u32)); if (!newts) return -ENOMEM; tun_encap_info = mpls_lwtunnel_encap(newts); - ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS, + ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], n_labels, &tun_encap_info->labels, tun_encap_info->label); if (ret) goto errout; + + tun_encap_info->ttl_propagate = MPLS_TTL_PROP_DEFAULT; + + if (tb[MPLS_IPTUNNEL_TTL]) { + tun_encap_info->default_ttl = nla_get_u8(tb[MPLS_IPTUNNEL_TTL]); + /* TTL 0 implies propagate from IP header */ + tun_encap_info->ttl_propagate = tun_encap_info->default_ttl ? 
+ MPLS_TTL_PROP_DISABLED : + MPLS_TTL_PROP_ENABLED; + } + newts->type = LWTUNNEL_ENCAP_MPLS; newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; newts->headroom = mpls_encap_size(tun_encap_info); @@ -186,6 +228,10 @@ static int mpls_fill_encap_info(struct sk_buff *skb, tun_encap_info->label)) goto nla_put_failure; + if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT && + nla_put_u8(skb, MPLS_IPTUNNEL_TTL, tun_encap_info->default_ttl)) + goto nla_put_failure; + return 0; nla_put_failure: @@ -195,10 +241,16 @@ static int mpls_fill_encap_info(struct sk_buff *skb, static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate) { struct mpls_iptunnel_encap *tun_encap_info; + int nlsize; tun_encap_info = mpls_lwtunnel_encap(lwtstate); - return nla_total_size(tun_encap_info->labels * 4); + nlsize = nla_total_size(tun_encap_info->labels * 4); + + if (tun_encap_info->ttl_propagate != MPLS_TTL_PROP_DEFAULT) + nlsize += nla_total_size(1); + + return nlsize; } static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) @@ -207,10 +259,12 @@ static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b); int l; - if (a_hdr->labels != b_hdr->labels) + if (a_hdr->labels != b_hdr->labels || + a_hdr->ttl_propagate != b_hdr->ttl_propagate || + a_hdr->default_ttl != b_hdr->default_ttl) return 1; - for (l = 0; l < MAX_NEW_LABELS; l++) + for (l = 0; l < a_hdr->labels; l++) if (a_hdr->label[l] != b_hdr->label[l]) return 1; return 0; diff --git a/net/netfilter/core.c b/net/netfilter/core.c index a87a6f8a74d8c90b2e6801a9d391ced7084dd15f..552d606e57ca4aac6bfe5ad26c42c385a431ab68 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -126,14 +126,15 @@ int nf_register_net_hook(struct net *net, const struct nf_hook_ops *reg) } EXPORT_SYMBOL(nf_register_net_hook); -void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) +static struct nf_hook_entry * +__nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) { struct nf_hook_entry __rcu **pp; struct nf_hook_entry *p; pp = nf_hook_entry_head(net, reg); if (WARN_ON_ONCE(!pp)) - return; + return NULL; mutex_lock(&nf_hook_mutex); for (; (p = nf_entry_dereference(*pp)) != NULL; pp = &p->next) { @@ -145,7 +146,7 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) mutex_unlock(&nf_hook_mutex); if (!p) { WARN(1, "nf_unregister_net_hook: hook not found!\n"); - return; + return NULL; } #ifdef CONFIG_NETFILTER_INGRESS if (reg->pf == NFPROTO_NETDEV && reg->hooknum == NF_NETDEV_INGRESS) @@ -154,10 +155,24 @@ void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) #ifdef HAVE_JUMP_LABEL static_key_slow_dec(&nf_hooks_needed[reg->pf][reg->hooknum]); #endif + + return p; +} + +void nf_unregister_net_hook(struct net *net, const struct nf_hook_ops *reg) +{ + struct nf_hook_entry *p = __nf_unregister_net_hook(net, reg); + unsigned int nfq; + + if (!p) + return; + synchronize_net(); - nf_queue_nf_hook_drop(net, p); + /* other cpu might still process nfqueue verdict that used reg */ - synchronize_net(); + nfq = nf_queue_nf_hook_drop(net); + if (nfq) + synchronize_net(); kfree(p); } EXPORT_SYMBOL(nf_unregister_net_hook); @@ -183,10 +198,32 @@ int nf_register_net_hooks(struct net *net, const struct nf_hook_ops *reg, EXPORT_SYMBOL(nf_register_net_hooks); void nf_unregister_net_hooks(struct net *net, const struct nf_hook_ops *reg, - unsigned int n) + unsigned int hookcount) { - while (n-- > 0) - 
nf_unregister_net_hook(net, &reg[n]); + struct nf_hook_entry *to_free[16]; + unsigned int i, n, nfq; + + do { + n = min_t(unsigned int, hookcount, ARRAY_SIZE(to_free)); + + for (i = 0; i < n; i++) + to_free[i] = __nf_unregister_net_hook(net, &reg[i]); + + synchronize_net(); + + /* need 2nd synchronize_net() if nfqueue is used, skb + * can get reinjected right before nf_queue_hook_drop() + */ + nfq = nf_queue_nf_hook_drop(net); + if (nfq) + synchronize_net(); + + for (i = 0; i < n; i++) + kfree(to_free[i]); + + reg += n; + hookcount -= n; + } while (hookcount > 0); } EXPORT_SYMBOL(nf_unregister_net_hooks); diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index 6f09a99298cdf6aaf18c1ff947c76db2d4985865..8ad2b52a0b328249688656c1fa80a7135b932c02 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -232,7 +232,7 @@ mtype_list(const struct ip_set *set, if (!test_bit(id, map->members) || (SET_WITH_TIMEOUT(set) && #ifdef IP_SET_BITMAP_STORED_TIMEOUT - mtype_is_filled((const struct mtype_elem *)x) && + mtype_is_filled(x) && #endif ip_set_timeout_expired(ext_timeout(x, set)))) continue; @@ -248,8 +248,7 @@ mtype_list(const struct ip_set *set, } if (mtype_do_list(skb, map, id, set->dsize)) goto nla_put_failure; - if (ip_set_put_extensions(skb, set, x, - mtype_is_filled((const struct mtype_elem *)x))) + if (ip_set_put_extensions(skb, set, x, mtype_is_filled(x))) goto nla_put_failure; ipset_nest_end(skb, nested); } diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index c296f9b606d495d59c50ee46a04e7e17b21c1530..ba6a5516dc7c724b172ee13c0456b461c47f7281 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -295,7 +295,8 @@ ip_set_get_ipaddr4(struct nlattr *nla, __be32 *ipaddr) if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; - if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy)) + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, + ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV4))) return -IPSET_ERR_PROTOCOL; @@ -313,7 +314,8 @@ ip_set_get_ipaddr6(struct nlattr *nla, union nf_inet_addr *ipaddr) if (unlikely(!flag_nested(nla))) return -IPSET_ERR_PROTOCOL; - if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, ipaddr_policy)) + if (nla_parse_nested(tb, IPSET_ATTR_IPADDR_MAX, nla, + ipaddr_policy, NULL)) return -IPSET_ERR_PROTOCOL; if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_IPADDR_IPV6))) return -IPSET_ERR_PROTOCOL; @@ -500,14 +502,6 @@ __ip_set_put(struct ip_set *set) /* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need * a separate reference counter */ -static inline void -__ip_set_get_netlink(struct ip_set *set) -{ - write_lock_bh(&ip_set_ref_lock); - set->ref_netlink++; - write_unlock_bh(&ip_set_ref_lock); -} - static inline void __ip_set_put_netlink(struct ip_set *set) { @@ -769,7 +763,7 @@ start_msg(struct sk_buff *skb, u32 portid, u32 seq, unsigned int flags, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - nlh = nlmsg_put(skb, portid, seq, cmd | (NFNL_SUBSYS_IPSET << 8), + nlh = nlmsg_put(skb, portid, seq, nfnl_msg_type(NFNL_SUBSYS_IPSET, cmd), sizeof(*nfmsg), flags); if (!nlh) return NULL; @@ -906,7 +900,7 @@ static int ip_set_create(struct net *net, struct sock *ctnl, /* Without holding any locks, create private part.
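The rewritten nf_unregister_net_hooks() visible above amortizes RCU grace periods: rather than paying one or two synchronize_net() calls per hook, it unlinks up to 16 hooks into an on-stack array and waits once per batch. A sketch of the control flow follows; the extern helpers are assumptions standing in for __nf_unregister_net_hook(), synchronize_net(), nf_queue_nf_hook_drop() and kfree().

#define BATCH 16

struct hook_entry;

extern struct hook_entry *unlink_hook(int idx);
extern void wait_rcu_grace_period(void);
extern unsigned int drop_queued_packets(void);	/* returns count dropped */
extern void free_entry(struct hook_entry *e);

static void unregister_hooks(int first, unsigned int count)
{
	struct hook_entry *to_free[BATCH];
	unsigned int i, n;

	do {
		n = count < BATCH ? count : BATCH;

		for (i = 0; i < n; i++)
			to_free[i] = unlink_hook(first + i);

		/* readers that might still see the entries finish here */
		wait_rcu_grace_period();

		/* packets parked in nfqueue may still reference them */
		if (drop_queued_packets())
			wait_rcu_grace_period();

		for (i = 0; i < n; i++)
			free_entry(to_free[i]);

		first += n;
		count -= n;
	} while (count > 0);
}

The second grace period is paid only when nfqueue actually held verdicts, which is exactly the nfq check in the hunk above.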
*/ if (attr[IPSET_ATTR_DATA] && nla_parse_nested(tb, IPSET_ATTR_CREATE_MAX, attr[IPSET_ATTR_DATA], - set->type->create_policy)) { + set->type->create_policy, NULL)) { ret = -IPSET_ERR_PROTOCOL; goto put_out; } @@ -1257,8 +1251,8 @@ dump_init(struct netlink_callback *cb, struct ip_set_net *inst) ip_set_id_t index; /* Second pass, so parser can't fail */ - nla_parse(cda, IPSET_ATTR_CMD_MAX, - attr, nlh->nlmsg_len - min_len, ip_set_setname_policy); + nla_parse(cda, IPSET_ATTR_CMD_MAX, attr, nlh->nlmsg_len - min_len, + ip_set_setname_policy, NULL); if (cda[IPSET_ATTR_SETNAME]) { struct ip_set *set; @@ -1305,7 +1299,7 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb) * manually :-( */ if (nlh->nlmsg_flags & NLM_F_ACK) - netlink_ack(cb->skb, nlh, ret); + netlink_ack(cb->skb, nlh, ret, NULL); return ret; } } @@ -1501,9 +1495,8 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set, memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); cmdattr = (void *)&errmsg->msg + min_len; - nla_parse(cda, IPSET_ATTR_CMD_MAX, - cmdattr, nlh->nlmsg_len - min_len, - ip_set_adt_policy); + nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr, + nlh->nlmsg_len - min_len, ip_set_adt_policy, NULL); errline = nla_data(cda[IPSET_ATTR_LINENO]); @@ -1549,7 +1542,7 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (attr[IPSET_ATTR_DATA]) { if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], - set->type->adt_policy)) + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags, use_lineno); @@ -1561,7 +1554,7 @@ static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, - set->type->adt_policy)) + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, IPSET_ADD, flags, use_lineno); @@ -1603,7 +1596,7 @@ static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (attr[IPSET_ATTR_DATA]) { if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], - set->type->adt_policy)) + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags, use_lineno); @@ -1615,7 +1608,7 @@ static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb, if (nla_type(nla) != IPSET_ATTR_DATA || !flag_nested(nla) || nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, nla, - set->type->adt_policy)) + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; ret = call_ad(ctnl, skb, set, tb, IPSET_DEL, flags, use_lineno); @@ -1646,7 +1639,7 @@ static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb, return -ENOENT; if (nla_parse_nested(tb, IPSET_ATTR_ADT_MAX, attr[IPSET_ATTR_DATA], - set->type->adt_policy)) + set->type->adt_policy, NULL)) return -IPSET_ERR_PROTOCOL; rcu_read_lock_bh(); @@ -1915,7 +1908,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len) ret = -EFAULT; goto done; } - op = (unsigned int *)data; + op = data; if (*op < IP_SET_OP_VERSION) { /* Check the version at the beginning of operations */ @@ -2013,7 +2006,7 @@ static struct nf_sockopt_ops so_set __read_mostly = { .pf = PF_INET, .get_optmin = SO_IP_SET, .get_optmax = SO_IP_SET + 1, - .get = &ip_set_sockfn_get, + .get = ip_set_sockfn_get, .owner = THIS_MODULE, }; diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 
e6a2753dff9e91dac406e657ad0d49875f052503..3d2ac71a83ec411294361037e7b1d77b6d4bb7e2 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -181,7 +181,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) if (!(cp->flags & IP_VS_CONN_F_HASHED)) { cp->flags |= IP_VS_CONN_F_HASHED; - atomic_inc(&cp->refcnt); + refcount_inc(&cp->refcnt); hlist_add_head_rcu(&cp->c_list, &ip_vs_conn_tab[hash]); ret = 1; } else { @@ -215,7 +215,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) if (cp->flags & IP_VS_CONN_F_HASHED) { hlist_del_rcu(&cp->c_list); cp->flags &= ~IP_VS_CONN_F_HASHED; - atomic_dec(&cp->refcnt); + refcount_dec(&cp->refcnt); ret = 1; } else ret = 0; @@ -242,13 +242,13 @@ static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp) if (cp->flags & IP_VS_CONN_F_HASHED) { ret = false; /* Decrease refcnt and unlink conn only if we are last user */ - if (atomic_cmpxchg(&cp->refcnt, 1, 0) == 1) { + if (refcount_dec_if_one(&cp->refcnt)) { hlist_del_rcu(&cp->c_list); cp->flags &= ~IP_VS_CONN_F_HASHED; ret = true; } } else - ret = atomic_read(&cp->refcnt) ? false : true; + ret = refcount_read(&cp->refcnt) ? false : true; spin_unlock(&cp->lock); ct_write_unlock_bh(hash); @@ -475,7 +475,7 @@ static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp) void ip_vs_conn_put(struct ip_vs_conn *cp) { if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && - (atomic_read(&cp->refcnt) == 1) && + (refcount_read(&cp->refcnt) == 1) && !timer_pending(&cp->timer)) /* expire connection immediately */ __ip_vs_conn_put_notimer(cp); @@ -617,8 +617,8 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); + cp->flags, refcount_read(&cp->refcnt), + refcount_read(&dest->refcnt)); /* Update the connection counters */ if (!(flags & IP_VS_CONN_F_TEMPLATE)) { @@ -714,8 +714,8 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), ip_vs_fwd_tag(cp), cp->state, - cp->flags, atomic_read(&cp->refcnt), - atomic_read(&dest->refcnt)); + cp->flags, refcount_read(&cp->refcnt), + refcount_read(&dest->refcnt)); /* Update the connection counters */ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) { @@ -863,10 +863,10 @@ static void ip_vs_conn_expire(unsigned long data) expire_later: IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n", - atomic_read(&cp->refcnt), + refcount_read(&cp->refcnt), atomic_read(&cp->n_control)); - atomic_inc(&cp->refcnt); + refcount_inc(&cp->refcnt); cp->timeout = 60*HZ; if (ipvs->sync_state & IP_VS_STATE_MASTER) @@ -941,7 +941,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af, * it in the table, so that other thread run ip_vs_random_dropentry * but cannot drop this entry. 
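The ip_vs_conn.c hunks in this range convert cp->refcnt from atomic_t to refcount_t; the one semantic subtlety is in ip_vs_conn_unlink(), where the old atomic_cmpxchg(&cp->refcnt, 1, 0) == 1 becomes refcount_dec_if_one(): drop the reference and unhash only when we are provably the last holder. A userspace analogue built on C11 atomics, simplified and ignoring refcount_t's saturation behavior:

#include <stdatomic.h>
#include <stdbool.h>

struct conn {
	_Atomic unsigned int refcnt;
	bool hashed;
};

/* 1 -> 0 transition only; fails if anyone else holds a reference */
static bool refcount_dec_if_one(_Atomic unsigned int *r)
{
	unsigned int one = 1;

	return atomic_compare_exchange_strong(r, &one, 0u);
}

static bool conn_unlink(struct conn *cp)
{
	if (!cp->hashed)
		return atomic_load(&cp->refcnt) == 0;

	if (refcount_dec_if_one(&cp->refcnt)) {
		cp->hashed = false;	/* sole owner: safe to unhash */
		return true;
	}
	return false;
}

Because the decrement and the "am I the last holder?" test are a single atomic step, a concurrent lookup can never obtain a reference to a half-torn-down connection.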
*/ - atomic_set(&cp->refcnt, 1); + refcount_set(&cp->refcnt, 1); cp->control = NULL; atomic_set(&cp->n_control, 0); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index db40050f8785eb9205a7bf493a71c9b956b93ab8..d2d7bdf1d5104b6e68284bbbb533c30f159844e2 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -542,7 +542,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport), - cp->flags, atomic_read(&cp->refcnt)); + cp->flags, refcount_read(&cp->refcnt)); ip_vs_conn_stats(cp, svc); return cp; @@ -1193,7 +1193,7 @@ struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc, IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport), IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport), IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport), - cp->flags, atomic_read(&cp->refcnt)); + cp->flags, refcount_read(&cp->refcnt)); LeaveFunction(12); return cp; } @@ -2200,6 +2200,7 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { static int __net_init __ip_vs_init(struct net *net) { struct netns_ipvs *ipvs; + int ret; ipvs = net_generic(net, ip_vs_net_id); if (ipvs == NULL) @@ -2231,13 +2232,17 @@ static int __net_init __ip_vs_init(struct net *net) if (ip_vs_sync_net_init(ipvs) < 0) goto sync_fail; - printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", - sizeof(struct netns_ipvs), ipvs->gen); + ret = nf_register_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + if (ret < 0) + goto hook_fail; + return 0; /* * Error handling */ +hook_fail: + ip_vs_sync_net_cleanup(ipvs); sync_fail: ip_vs_conn_net_cleanup(ipvs); conn_fail: @@ -2257,6 +2262,7 @@ static void __net_exit __ip_vs_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); + nf_unregister_net_hooks(net, ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); ip_vs_service_net_cleanup(ipvs); /* ip_vs_flush() with locks */ ip_vs_conn_net_cleanup(ipvs); ip_vs_app_net_cleanup(ipvs); @@ -2317,24 +2323,16 @@ static int __init ip_vs_init(void) if (ret < 0) goto cleanup_sub; - ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); - if (ret < 0) { - pr_err("can't register hooks.\n"); - goto cleanup_dev; - } - ret = ip_vs_register_nl_ioctl(); if (ret < 0) { pr_err("can't register netlink/ioctl.\n"); - goto cleanup_hooks; + goto cleanup_dev; } pr_info("ipvs loaded.\n"); return ret; -cleanup_hooks: - nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); cleanup_dev: unregister_pernet_device(&ipvs_core_dev_ops); cleanup_sub: @@ -2351,7 +2349,6 @@ static int __init ip_vs_init(void) static void __exit ip_vs_cleanup(void) { ip_vs_unregister_nl_ioctl(); - nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); unregister_pernet_device(&ipvs_core_dev_ops); unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ ip_vs_conn_cleanup(); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5aeb0dde6ccc5e525e740ca5fde3ba2c58c50070..1fa3c2307b6ea0173bbcf13c3d0b5cca8c68cba9 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -699,7 +699,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af, dest->vfwmark, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), - atomic_read(&dest->refcnt)); + refcount_read(&dest->refcnt)); if (dest->af == dest_af && ip_vs_addr_equal(dest_af, &dest->addr, daddr) && dest->port == dport && @@ -934,7 +934,7 @@ 
ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atomic_set(&dest->activeconns, 0); atomic_set(&dest->inactconns, 0); atomic_set(&dest->persistconns, 0); - atomic_set(&dest->refcnt, 1); + refcount_set(&dest->refcnt, 1); INIT_HLIST_NODE(&dest->d_list); spin_lock_init(&dest->dst_lock); @@ -998,7 +998,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, " "dest->refcnt=%d, service %u/%s:%u\n", IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport), - atomic_read(&dest->refcnt), + refcount_read(&dest->refcnt), dest->vfwmark, IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); @@ -1074,7 +1074,7 @@ static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest, spin_lock_bh(&ipvs->dest_trash_lock); IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), - atomic_read(&dest->refcnt)); + refcount_read(&dest->refcnt)); if (list_empty(&ipvs->dest_trash) && !cleanup) mod_timer(&ipvs->dest_trash_timer, jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1)); @@ -1157,7 +1157,7 @@ static void ip_vs_dest_trash_expire(unsigned long data) spin_lock(&ipvs->dest_trash_lock); list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) { - if (atomic_read(&dest->refcnt) > 1) + if (refcount_read(&dest->refcnt) > 1) continue; if (dest->idle_start) { if (time_before(now, dest->idle_start + @@ -1545,7 +1545,7 @@ ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev) dev->name, IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), - atomic_read(&dest->refcnt)); + refcount_read(&dest->refcnt)); __ip_vs_dst_cache_reset(dest); } spin_unlock_bh(&dest->dst_lock); @@ -1774,13 +1774,13 @@ static struct ctl_table vs_vars[] = { .procname = "sync_version", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_do_sync_mode, + .proc_handler = proc_do_sync_mode, }, { .procname = "sync_ports", .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_do_sync_ports, + .proc_handler = proc_do_sync_ports, }, { .procname = "sync_persist_mode", @@ -2130,8 +2130,8 @@ static int ip_vs_stats_show(struct seq_file *seq, void *v) /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Total Incoming Outgoing Incoming Outgoing\n"); - seq_printf(seq, - " Conns Packets Packets Bytes Bytes\n"); + seq_puts(seq, + " Conns Packets Packets Bytes Bytes\n"); ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats); seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n", @@ -2178,8 +2178,8 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */ seq_puts(seq, " Total Incoming Outgoing Incoming Outgoing\n"); - seq_printf(seq, - "CPU Conns Packets Packets Bytes Bytes\n"); + seq_puts(seq, + "CPU Conns Packets Packets Bytes Bytes\n"); for_each_possible_cpu(i) { struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i); @@ -3078,6 +3078,17 @@ static int ip_vs_genl_dump_services(struct sk_buff *skb, return skb->len; } +static bool ip_vs_is_af_valid(int af) +{ + if (af == AF_INET) + return true; +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6 && ipv6_mod_enabled()) + return true; +#endif + return false; +} + static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *usvc, struct nlattr *nla, int full_entry, @@ -3089,7 +3100,8 @@ static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, /* Parse mandatory identifying service 
fields first */ if (nla == NULL || - nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy)) + nla_parse_nested(attrs, IPVS_SVC_ATTR_MAX, nla, + ip_vs_svc_policy, NULL)) return -EINVAL; nla_af = attrs[IPVS_SVC_ATTR_AF]; @@ -3104,11 +3116,7 @@ static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, memset(usvc, 0, sizeof(*usvc)); usvc->af = nla_get_u16(nla_af); -#ifdef CONFIG_IP_VS_IPV6 - if (usvc->af != AF_INET && usvc->af != AF_INET6) -#else - if (usvc->af != AF_INET) -#endif + if (!ip_vs_is_af_valid(usvc->af)) return -EAFNOSUPPORT; if (nla_fwmark) { @@ -3251,8 +3259,8 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, mutex_lock(&__ip_vs_mutex); /* Try to find the service for which to dump destinations */ - if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, - IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy)) + if (nlmsg_parse(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, + ip_vs_cmd_policy, NULL)) goto out_err; @@ -3288,7 +3296,8 @@ static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest, /* Parse mandatory identifying destination fields first */ if (nla == NULL || - nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy)) + nla_parse_nested(attrs, IPVS_DEST_ATTR_MAX, nla, + ip_vs_dest_policy, NULL)) return -EINVAL; nla_addr = attrs[IPVS_DEST_ATTR_ADDR]; @@ -3530,7 +3539,7 @@ static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[IPVS_CMD_ATTR_DAEMON] || nla_parse_nested(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], - ip_vs_daemon_policy)) + ip_vs_daemon_policy, info->extack)) goto out; if (cmd == IPVS_CMD_NEW_DAEMON) @@ -3610,6 +3619,11 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (udest.af == 0) udest.af = svc->af; + if (!ip_vs_is_af_valid(udest.af)) { + ret = -EAFNOSUPPORT; + goto out; + } + if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { /* The synchronization protocol is incompatible * with mixed family services diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index d30c327bb578981c0868086cfddb6c304419b2e8..fb780be76d15a05c0d66e8a7f5ea58ee32dacdb9 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -260,7 +260,9 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, buf_len = strlen(buf); ct = nf_ct_get(skb, &ctinfo); - if (ct && !nf_ct_is_untracked(ct) && nfct_nat(ct)) { + if (ct && (ct->status & IPS_NAT_MASK)) { + bool mangled; + /* If mangling fails this function will return 0 * which will cause the packet to be dropped. * Mangling can only fail under memory pressure, @@ -268,12 +270,13 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, * packet. 
*/ rcu_read_lock(); - ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, - iph->ihl * 4, - start-data, end-start, - buf, buf_len); + mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, + iph->ihl * 4, + start - data, + end - start, + buf, buf_len); rcu_read_unlock(); - if (ret) { + if (mangled) { ip_vs_nfct_expect_related(skb, ct, n_cp, IPPROTO_TCP, 0, 0); if (skb->ip_summed == CHECKSUM_COMPLETE) @@ -482,11 +485,8 @@ static struct pernet_operations ip_vs_ftp_ops = { static int __init ip_vs_ftp_init(void) { - int rv; - - rv = register_pernet_subsys(&ip_vs_ftp_ops); /* rcu_barrier() is called by netns on error */ - return rv; + return register_pernet_subsys(&ip_vs_ftp_ops); } /* diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 5824927cf8e02b7fa02f319177d96219c9427033..b6aa4a970c6e97678e8c88e8fe0e0e0b4e4d476d 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -448,7 +448,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), - atomic_read(&least->refcnt), + refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index 703f11877beece84cb56ec62d4bd13e87c0d67c3..c13ff575f9f73ab9fb53837ff1b01cd279156c9f 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -204,7 +204,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), - atomic_read(&least->refcnt), + refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; } @@ -249,7 +249,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) __func__, IP_VS_DBG_ADDR(most->af, &most->addr), ntohs(most->port), atomic_read(&most->activeconns), - atomic_read(&most->refcnt), + refcount_read(&most->refcnt), atomic_read(&most->weight), moh); return most; } @@ -612,7 +612,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), - atomic_read(&least->refcnt), + refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index fc230d99aa3b0143f198d7a9ec798344886058ad..6cf3fd81a5eca887ccefe628c2b9d627b40cf055 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -85,7 +85,7 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) struct nf_conn *ct = nf_ct_get(skb, &ctinfo); struct nf_conntrack_tuple new_tuple; - if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) || + if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) return; @@ -232,7 +232,7 @@ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, { struct nf_conntrack_expect *exp; - if (ct == NULL || nf_ct_is_untracked(ct)) + if (ct == NULL) return; exp = nf_ct_expect_alloc(ct); diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index a8b63401e7731e6c8fef37b62c43425e2f96b43c..7d9d4ac596ca5809a9322aa3c1276f6dc08f146c 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -110,7 +110,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_ADDR(least->af, &least->addr), 
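The atomic_t to refcount_t conversions in these IPVS hunks are not a spelling change: refcount_t refuses to increment from zero, saturates instead of wrapping on overflow, and warns once, so a leaked or doubled put becomes a diagnosable splat rather than a use-after-free. A minimal sketch of the API as these call sites now use it (the struct below is illustrative, not the real ip_vs_dest):

    #include <linux/refcount.h>

    struct dest_like {
            refcount_t refcnt;
    };

    static void dest_init(struct dest_like *d)
    {
            refcount_set(&d->refcnt, 1);    /* creator holds the first reference */
    }

    static bool dest_get(struct dest_like *d)
    {
            /* fails on a zero count instead of reviving a dying object */
            return refcount_inc_not_zero(&d->refcnt);
    }

    static bool dest_put(struct dest_like *d)
    {
            /* true only for the final put; overflow saturates with a warning */
            return refcount_dec_and_test(&d->refcnt);
    }

refcount_read() remains available for diagnostics, which is why the IP_VS_DBG_BUF() format strings above keep printing the counter value.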
ntohs(least->port), atomic_read(&least->activeconns), - atomic_read(&least->refcnt), + refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 8ae480715ceae79ae07fd4dbfc9bacf424b429b3..ca880a3ad033aec3f55641c3cd485098983a8722 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -193,28 +193,6 @@ ip_vs_create_timeout_table(int *table, int size) } -/* - * Set timeout value for state specified by name - */ -int -ip_vs_set_state_timeout(int *table, int num, const char *const *names, - const char *name, int to) -{ - int i; - - if (!table || !name || !to) - return -EINVAL; - - for (i = 0; i < num; i++) { - if (strcmp(names[i], name)) - continue; - table[i] = to * HZ; - return 0; - } - return -ENOENT; -} - - const char * ip_vs_state_name(__u16 proto, int state) { struct ip_vs_protocol *pp = ip_vs_proto_get(proto); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index d952d67f904d1124ed0c5adfa20a51f82207181c..56f8e4b204ffcc4840a1042097a0f7d0f004df18 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -447,7 +447,7 @@ set_sctp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, ntohs(cp->cport), sctp_state_name(cp->state), sctp_state_name(next_state), - atomic_read(&cp->refcnt)); + refcount_read(&cp->refcnt)); if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && (next_state != IP_VS_SCTP_S_ESTABLISHED)) { diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 5117bcb7d2f00604d5ab73f0610246a7d4fef755..12dc8d5bc37d7ea03ba8448514a6d3caf03a62b0 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -557,7 +557,7 @@ set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp, ntohs(cp->cport), tcp_state_name(cp->state), tcp_state_name(new_state), - atomic_read(&cp->refcnt)); + refcount_read(&cp->refcnt)); if (dest) { if (!(cp->flags & IP_VS_CONN_F_INACTIVE) && diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index 58bacfc461ee6a1d6df4e6e024032fb52a044e35..ee0530d14c5f9d468e199bcb7cd53aad022b748b 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -97,7 +97,7 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), atomic_read(&dest->weight)); + refcount_read(&dest->refcnt), atomic_read(&dest->weight)); return dest; } diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index f8e2d00f528b945e774564854fc66f53dbc61970..ab23cf203437772407f800861ea68687d78f8ff6 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -111,7 +111,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), - atomic_read(&least->refcnt), + refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index b03c28084f814a9f0b805357e5164ef7b00b0985..0e5b64a75da0521be40831f791eb8f3d7303f70e 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -520,7 +520,7 @@ static int 
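In the ip_vs_sync.c hunks just below, the open-coded kzalloc(count * size, ...) allocations become kcalloc(count, size, ...). The difference is overflow safety: kcalloc() checks the multiplication and returns NULL when count * size would wrap, rather than silently allocating a short buffer that is later indexed out of bounds. A short sketch (sync_count is a hypothetical variable):

    /* risky if sync_count can be large: the multiply may wrap around */
    ms = kzalloc(sync_count * sizeof(*ms), GFP_KERNEL);

    /* kcalloc() fails cleanly on multiplication overflow */
    ms = kcalloc(sync_count, sizeof(*ms), GFP_KERNEL);
    if (!ms)
            return -ENOMEM;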
ip_vs_sync_conn_needed(struct netns_ipvs *ipvs, if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) && pkts % sync_period != sysctl_sync_threshold(ipvs)) return 0; - } else if (sync_refresh_period <= 0 && + } else if (!sync_refresh_period && pkts != sysctl_sync_threshold(ipvs)) return 0; @@ -1849,7 +1849,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, if (state == IP_VS_STATE_MASTER) { struct ipvs_master_sync_state *ms; - ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL); + ipvs->ms = kcalloc(count, sizeof(ipvs->ms[0]), GFP_KERNEL); if (!ipvs->ms) goto out; ms = ipvs->ms; @@ -1862,7 +1862,7 @@ int start_sync_thread(struct netns_ipvs *ipvs, struct ipvs_sync_daemon_cfg *c, ms->ipvs = ipvs; } } else { - array = kzalloc(count * sizeof(struct task_struct *), + array = kcalloc(count, sizeof(struct task_struct *), GFP_KERNEL); if (!array) goto out; diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index 6b366fd905542ff086a36da4111bd11646d274d8..6add39e0ec20d61d21883082e5079d16cab6942c 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -83,7 +83,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, IP_VS_DBG_ADDR(least->af, &least->addr), ntohs(least->port), atomic_read(&least->activeconns), - atomic_read(&least->refcnt), + refcount_read(&least->refcnt), atomic_read(&least->weight), loh); return least; diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 17e6d4406ca7c32657eff5e103d0aa1b9317e813..62258dd457ac9825aa14f26bfebb08bb2f2f1755 100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -218,7 +218,7 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb, "activeconns %d refcnt %d weight %d\n", IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port), atomic_read(&dest->activeconns), - atomic_read(&dest->refcnt), + refcount_read(&dest->refcnt), atomic_read(&dest->weight)); mark->cl = dest; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 4e1a98fcc8c3faa3fd037cc0516fd8a3108ab2a9..2eab1e0400f48a5816239cbb95a6b192169137bf 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -775,7 +775,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - if (ct && !nf_ct_is_untracked(ct)) { + if (ct) { IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, ipvsh->off, "ip_vs_nat_xmit(): " "stopping DNAT to local address"); @@ -866,7 +866,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - if (ct && !nf_ct_is_untracked(ct)) { + if (ct) { IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, ipvsh->off, "ip_vs_nat_xmit_v6(): " "stopping DNAT to local address"); @@ -1338,7 +1338,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - if (ct && !nf_ct_is_untracked(ct)) { + if (ct) { IP_VS_DBG(10, "%s(): " "stopping DNAT to local address %pI4\n", __func__, &cp->daddr.ip); @@ -1429,7 +1429,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - if (ct && !nf_ct_is_untracked(ct)) { + if (ct) { IP_VS_DBG(10, "%s(): " "stopping DNAT to local address %pI6\n", __func__, &cp->daddr.in6); diff --git a/net/netfilter/nf_conntrack_acct.c 
b/net/netfilter/nf_conntrack_acct.c index 45da11afa785b2779857b3786881158c821263b3..86691671290519809bf8b39b150c7b040a3c0fe1 100644 --- a/net/netfilter/nf_conntrack_acct.c +++ b/net/netfilter/nf_conntrack_acct.c @@ -55,7 +55,7 @@ seq_print_acct(struct seq_file *s, const struct nf_conn *ct, int dir) }; EXPORT_SYMBOL_GPL(seq_print_acct); -static struct nf_ct_ext_type acct_extend __read_mostly = { +static const struct nf_ct_ext_type acct_extend = { .len = sizeof(struct nf_conn_acct), .align = __alignof__(struct nf_conn_acct), .id = NF_CT_EXT_ACCT, diff --git a/net/netfilter/nf_conntrack_amanda.c b/net/netfilter/nf_conntrack_amanda.c index 57a26cc90c9fada2be251d6718f8b814e059beea..03d2ccffa9fa3c1eecfbc07dcadc420a0ab5e315 100644 --- a/net/netfilter/nf_conntrack_amanda.c +++ b/net/netfilter/nf_conntrack_amanda.c @@ -207,6 +207,8 @@ static int __init nf_conntrack_amanda_init(void) { int ret, i; + NF_CT_HELPER_BUILD_BUG_ON(0); + for (i = 0; i < ARRAY_SIZE(search); i++) { search[i].ts = textsearch_prepare(ts_algo, search[i].string, search[i].len, diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index ffb78e5f7b70912a2bba608a7f32f0a2bc486adc..e847dbaa0c6b3aefc3d417421a2b529a10735e38 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -76,6 +76,7 @@ struct conntrack_gc_work { struct delayed_work dwork; u32 last_bucket; bool exiting; + bool early_drop; long next_gc_run; }; @@ -180,14 +181,6 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; seqcount_t nf_conntrack_generation __read_mostly; - -/* nf_conn must be 8 bytes aligned, as the 3 LSB bits are used - * for the nfctinfo. We cheat by (ab)using the PER CPU cache line - * alignment to enforce this. - */ -DEFINE_PER_CPU_ALIGNED(struct nf_conn, nf_conntrack_untracked); -EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); - static unsigned int nf_conntrack_hash_rnd __read_mostly; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, @@ -706,7 +699,7 @@ static int nf_ct_resolve_clash(struct net *net, struct sk_buff *skb, l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); if (l4proto->allow_clash && - !nfct_nat(ct) && + ((ct->status & IPS_NAT_DONE_MASK) == 0) && !nf_ct_is_dying(ct) && atomic_inc_not_zero(&ct->ct_general.use)) { enum ip_conntrack_info oldinfo; @@ -918,7 +911,7 @@ static unsigned int early_drop_list(struct net *net, continue; /* kill only if still in same netns -- might have moved due to - * SLAB_DESTROY_BY_RCU rules. + * SLAB_TYPESAFE_BY_RCU rules. * * We steal the timer reference. If that fails timer has * already fired or someone else deleted it. 
Just drop ref @@ -959,10 +952,30 @@ static noinline int early_drop(struct net *net, unsigned int _hash) return false; } +static bool gc_worker_skip_ct(const struct nf_conn *ct) +{ + return !nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct); +} + +static bool gc_worker_can_early_drop(const struct nf_conn *ct) +{ + const struct nf_conntrack_l4proto *l4proto; + + if (!test_bit(IPS_ASSURED_BIT, &ct->status)) + return true; + + l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct)); + if (l4proto->can_early_drop && l4proto->can_early_drop(ct)) + return true; + + return false; +} + static void gc_worker(struct work_struct *work) { unsigned int min_interval = max(HZ / GC_MAX_BUCKETS_DIV, 1u); unsigned int i, goal, buckets = 0, expired_count = 0; + unsigned int nf_conntrack_max95 = 0; struct conntrack_gc_work *gc_work; unsigned int ratio, scanned = 0; unsigned long next_run; @@ -971,6 +984,8 @@ static void gc_worker(struct work_struct *work) goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV; i = gc_work->last_bucket; + if (gc_work->early_drop) + nf_conntrack_max95 = nf_conntrack_max / 100u * 95u; do { struct nf_conntrack_tuple_hash *h; @@ -987,6 +1002,8 @@ static void gc_worker(struct work_struct *work) i = 0; hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { + struct net *net; + tmp = nf_ct_tuplehash_to_ctrack(h); scanned++; @@ -995,6 +1012,27 @@ static void gc_worker(struct work_struct *work) expired_count++; continue; } + + if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp)) + continue; + + net = nf_ct_net(tmp); + if (atomic_read(&net->ct.count) < nf_conntrack_max95) + continue; + + /* need to take reference to avoid possible races */ + if (!atomic_inc_not_zero(&tmp->ct_general.use)) + continue; + + if (gc_worker_skip_ct(tmp)) { + nf_ct_put(tmp); + continue; + } + + if (gc_worker_can_early_drop(tmp)) + nf_ct_kill(tmp); + + nf_ct_put(tmp); } /* could check get_nulls_value() here and restart if ct @@ -1040,6 +1078,7 @@ static void gc_worker(struct work_struct *work) next_run = gc_work->next_gc_run; gc_work->last_bucket = i; + gc_work->early_drop = false; queue_delayed_work(system_long_wq, &gc_work->dwork, next_run); } @@ -1065,6 +1104,8 @@ __nf_conntrack_alloc(struct net *net, if (nf_conntrack_max && unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { if (!early_drop(net, hash)) { + if (!conntrack_gc_work.early_drop) + conntrack_gc_work.early_drop = true; atomic_dec(&net->ct.count); net_warn_ratelimited("nf_conntrack: table full, dropping packet\n"); return ERR_PTR(-ENOMEM); @@ -1073,7 +1114,7 @@ __nf_conntrack_alloc(struct net *net, /* * Do not use kmem_cache_zalloc(), as this cache uses - * SLAB_DESTROY_BY_RCU. + * SLAB_TYPESAFE_BY_RCU. */ ct = kmem_cache_alloc(nf_conntrack_cachep, gfp); if (ct == NULL) @@ -1118,7 +1159,7 @@ void nf_conntrack_free(struct nf_conn *ct) struct net *net = nf_ct_net(ct); /* A freed object has refcnt == 0, that's - * the golden rule for SLAB_DESTROY_BY_RCU + * the golden rule for SLAB_TYPESAFE_BY_RCU */ NF_CT_ASSERT(atomic_read(&ct->ct_general.use) == 0); @@ -1133,7 +1174,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_free); /* Allocate a new conntrack: we return -ENOMEM if classification failed due to stress. Otherwise it really is unclassifiable. 
*/ -static struct nf_conntrack_tuple_hash * +static noinline struct nf_conntrack_tuple_hash * init_conntrack(struct net *net, struct nf_conn *tmpl, const struct nf_conntrack_tuple *tuple, struct nf_conntrack_l3proto *l3proto, @@ -1241,21 +1282,20 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, return &ct->tuplehash[IP_CT_DIR_ORIGINAL]; } -/* On success, returns conntrack ptr, sets skb->_nfct | ctinfo */ -static inline struct nf_conn * +/* On success, returns 0, sets skb->_nfct | ctinfo */ +static int resolve_normal_ct(struct net *net, struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff, u_int16_t l3num, u_int8_t protonum, struct nf_conntrack_l3proto *l3proto, - struct nf_conntrack_l4proto *l4proto, - int *set_reply, - enum ip_conntrack_info *ctinfo) + struct nf_conntrack_l4proto *l4proto) { const struct nf_conntrack_zone *zone; struct nf_conntrack_tuple tuple; struct nf_conntrack_tuple_hash *h; + enum ip_conntrack_info ctinfo; struct nf_conntrack_zone tmp; struct nf_conn *ct; u32 hash; @@ -1264,7 +1304,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, dataoff, l3num, protonum, net, &tuple, l3proto, l4proto)) { pr_debug("Can't get tuple\n"); - return NULL; + return 0; } /* look for tuple match */ @@ -1275,33 +1315,30 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, skb, dataoff, hash); if (!h) - return NULL; + return 0; if (IS_ERR(h)) - return (void *)h; + return PTR_ERR(h); } ct = nf_ct_tuplehash_to_ctrack(h); /* It exists; we have (non-exclusive) reference. */ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) { - *ctinfo = IP_CT_ESTABLISHED_REPLY; - /* Please set reply bit if this packet OK */ - *set_reply = 1; + ctinfo = IP_CT_ESTABLISHED_REPLY; } else { /* Once we've had two way comms, always ESTABLISHED. */ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { pr_debug("normal packet for %p\n", ct); - *ctinfo = IP_CT_ESTABLISHED; + ctinfo = IP_CT_ESTABLISHED; } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) { pr_debug("related packet for %p\n", ct); - *ctinfo = IP_CT_RELATED; + ctinfo = IP_CT_RELATED; } else { pr_debug("new packet for %p\n", ct); - *ctinfo = IP_CT_NEW; + ctinfo = IP_CT_NEW; } - *set_reply = 0; } - nf_ct_set(skb, ct, *ctinfo); - return ct; + nf_ct_set(skb, ct, ctinfo); + return 0; } unsigned int @@ -1315,13 +1352,13 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, unsigned int *timeouts; unsigned int dataoff; u_int8_t protonum; - int set_reply = 0; int ret; tmpl = nf_ct_get(skb, &ctinfo); - if (tmpl) { + if (tmpl || ctinfo == IP_CT_UNTRACKED) { /* Previously seen (loopback or untracked)? Ignore. */ - if (!nf_ct_is_template(tmpl)) { + if ((tmpl && !nf_ct_is_template(tmpl)) || + ctinfo == IP_CT_UNTRACKED) { NF_CT_STAT_INC_ATOMIC(net, ignore); return NF_ACCEPT; } @@ -1358,23 +1395,22 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, goto out; } repeat: - ct = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, - l3proto, l4proto, &set_reply, &ctinfo); - if (!ct) { - /* Not valid part of a connection */ - NF_CT_STAT_INC_ATOMIC(net, invalid); - ret = NF_ACCEPT; - goto out; - } - - if (IS_ERR(ct)) { + ret = resolve_normal_ct(net, tmpl, skb, dataoff, pf, protonum, + l3proto, l4proto); + if (ret < 0) { /* Too stressed to deal. 
*/ NF_CT_STAT_INC_ATOMIC(net, drop); ret = NF_DROP; goto out; } - NF_CT_ASSERT(skb_nfct(skb)); + ct = nf_ct_get(skb, &ctinfo); + if (!ct) { + /* Not valid part of a connection */ + NF_CT_STAT_INC_ATOMIC(net, invalid); + ret = NF_ACCEPT; + goto out; + } /* Decide what timeout policy we want to apply to this flow. */ timeouts = nf_ct_timeout_lookup(net, ct, l4proto); @@ -1399,7 +1435,8 @@ nf_conntrack_in(struct net *net, u_int8_t pf, unsigned int hooknum, goto out; } - if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) + if (ctinfo == IP_CT_ESTABLISHED_REPLY && + !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status)) nf_conntrack_event_cache(IPCT_REPLY, ct); out: if (tmpl) @@ -1634,18 +1671,6 @@ void nf_ct_free_hashtable(void *hash, unsigned int size) } EXPORT_SYMBOL_GPL(nf_ct_free_hashtable); -static int untrack_refs(void) -{ - int cnt = 0, cpu; - - for_each_possible_cpu(cpu) { - struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); - - cnt += atomic_read(&ct->ct_general.use) - 1; - } - return cnt; -} - void nf_conntrack_cleanup_start(void) { conntrack_gc_work.exiting = true; @@ -1655,8 +1680,6 @@ void nf_conntrack_cleanup_start(void) void nf_conntrack_cleanup_end(void) { RCU_INIT_POINTER(nf_ct_destroy, NULL); - while (untrack_refs() > 0) - schedule(); cancel_delayed_work_sync(&conntrack_gc_work.dwork); nf_ct_free_hashtable(nf_conntrack_hash, nf_conntrack_htable_size); @@ -1830,20 +1853,44 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); -void nf_ct_untracked_status_or(unsigned long bits) +static __always_inline unsigned int total_extension_size(void) { - int cpu; + /* remember to add new extensions below */ + BUILD_BUG_ON(NF_CT_EXT_NUM > 9); - for_each_possible_cpu(cpu) - per_cpu(nf_conntrack_untracked, cpu).status |= bits; -} -EXPORT_SYMBOL_GPL(nf_ct_untracked_status_or); + return sizeof(struct nf_ct_ext) + + sizeof(struct nf_conn_help) +#if IS_ENABLED(CONFIG_NF_NAT) + + sizeof(struct nf_conn_nat) +#endif + + sizeof(struct nf_conn_seqadj) + + sizeof(struct nf_conn_acct) +#ifdef CONFIG_NF_CONNTRACK_EVENTS + + sizeof(struct nf_conntrack_ecache) +#endif +#ifdef CONFIG_NF_CONNTRACK_TIMESTAMP + + sizeof(struct nf_conn_tstamp) +#endif +#ifdef CONFIG_NF_CONNTRACK_TIMEOUT + + sizeof(struct nf_conn_timeout) +#endif +#ifdef CONFIG_NF_CONNTRACK_LABELS + + sizeof(struct nf_conn_labels) +#endif +#if IS_ENABLED(CONFIG_NETFILTER_SYNPROXY) + + sizeof(struct nf_conn_synproxy) +#endif + ; +}; int nf_conntrack_init_start(void) { int max_factor = 8; int ret = -ENOMEM; - int i, cpu; + int i; + + /* struct nf_ct_ext uses u8 to store offsets/size */ + BUILD_BUG_ON(total_extension_size() > 255u); seqcount_init(&nf_conntrack_generation); @@ -1882,7 +1929,7 @@ int nf_conntrack_init_start(void) nf_conntrack_cachep = kmem_cache_create("nf_conntrack", sizeof(struct nf_conn), NFCT_INFOMASK + 1, - SLAB_DESTROY_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); + SLAB_TYPESAFE_BY_RCU | SLAB_HWCACHE_ALIGN, NULL); if (!nf_conntrack_cachep) goto err_cachep; @@ -1926,15 +1973,6 @@ int nf_conntrack_init_start(void) if (ret < 0) goto err_proto; - /* Set up fake conntrack: to never be deleted, not in any hashes */ - for_each_possible_cpu(cpu) { - struct nf_conn *ct = &per_cpu(nf_conntrack_untracked, cpu); - write_pnet(&ct->ct_net, &init_net); - atomic_set(&ct->ct_general.use, 1); - } - /* - and look it like as a confirmed connection */ - nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); - 
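With the per-cpu fake nf_conntrack_untracked objects removed above, "untracked" stops being a special conntrack entry that had to be allocated, pinned and reference-counted on every CPU and becomes a plain packet state: rules that opt a flow out of tracking attach no nf_conn at all and set the new IP_CT_UNTRACKED ctinfo value in skb->_nfct, which is what the reworked nf_conntrack_in() test above checks for. A hedged sketch of both sides of the convention (this mirrors the NOTRACK-style target logic rather than quoting it):

    /* marking side, e.g. a NOTRACK-style target */
    static unsigned int notrack_like(struct sk_buff *skb)
    {
            if (!skb_nfct(skb))
                    nf_ct_set(skb, NULL, IP_CT_UNTRACKED);  /* NULL ct + marker ctinfo */
            return NF_ACCEPT;
    }

    /* consuming side: untracked means "no ct pointer, marker ctinfo" */
    static bool skb_is_untracked(const struct sk_buff *skb)
    {
            enum ip_conntrack_info ctinfo;

            return !nf_ct_get(skb, &ctinfo) && ctinfo == IP_CT_UNTRACKED;
    }

The BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER) added to nf_conntrack_init_net() guards this encoding against a later addition to the ctinfo enum colliding with the marker value.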
conntrack_gc_work_init(&conntrack_gc_work); queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, HZ); @@ -1982,6 +2020,7 @@ int nf_conntrack_init_net(struct net *net) int ret = -ENOMEM; int cpu; + BUILD_BUG_ON(IP_CT_UNTRACKED == IP_CT_NUMBER); atomic_set(&net->ct.count, 0); net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu); diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 22fc32143e9c4ae17bc48edc68eb0decf11d931c..caac41ad9483ed36333884f8e584521778e76937 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -195,7 +195,7 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct) events = xchg(&e->cache, 0); - if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct) || !events) + if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct)) goto out_unlock; /* We make a copy of the missed event cache without taking @@ -212,7 +212,7 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct) ret = notify->fcn(events | missed, &item); - if (likely(ret >= 0 && !missed)) + if (likely(ret == 0 && !missed)) goto out_unlock; spin_lock_bh(&ct->lock); @@ -347,7 +347,7 @@ static struct ctl_table event_sysctl_table[] = { }; #endif /* CONFIG_SYSCTL */ -static struct nf_ct_ext_type event_extend __read_mostly = { +static const struct nf_ct_ext_type event_extend = { .len = sizeof(struct nf_conntrack_ecache), .align = __alignof__(struct nf_conntrack_ecache), .id = NF_CT_EXT_ECACHE, @@ -420,6 +420,9 @@ int nf_conntrack_ecache_init(void) int ret = nf_ct_extend_register(&event_extend); if (ret < 0) pr_err("nf_ct_event: Unable to register event extension.\n"); + + BUILD_BUG_ON(__IPCT_MAX >= 16); /* ctmask, missed use u16 */ + return ret; } diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index d80073037856db9081e57b7c88e482a61b94a45f..e03d16ed550d6bf07a537f4b150148a3cd967f59 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -103,6 +103,17 @@ nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple, nf_ct_zone_equal_any(i->master, zone); } +bool nf_ct_remove_expect(struct nf_conntrack_expect *exp) +{ + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(nf_ct_remove_expect); + struct nf_conntrack_expect * __nf_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, @@ -133,7 +144,7 @@ nf_ct_expect_find_get(struct net *net, rcu_read_lock(); i = __nf_ct_expect_find(net, zone, tuple); - if (i && !atomic_inc_not_zero(&i->use)) + if (i && !refcount_inc_not_zero(&i->use)) i = NULL; rcu_read_unlock(); @@ -186,7 +197,7 @@ nf_ct_find_expectation(struct net *net, return NULL; if (exp->flags & NF_CT_EXPECT_PERMANENT) { - atomic_inc(&exp->use); + refcount_inc(&exp->use); return exp; } else if (del_timer(&exp->timeout)) { nf_ct_unlink_expect(exp); @@ -211,10 +222,7 @@ void nf_ct_remove_expectations(struct nf_conn *ct) spin_lock_bh(&nf_conntrack_expect_lock); hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { - if (del_timer(&exp->timeout)) { - nf_ct_unlink_expect(exp); - nf_ct_expect_put(exp); - } + nf_ct_remove_expect(exp); } spin_unlock_bh(&nf_conntrack_expect_lock); } @@ -255,10 +263,7 @@ static inline int expect_matches(const struct nf_conntrack_expect *a, void nf_ct_unexpect_related(struct nf_conntrack_expect *exp) { spin_lock_bh(&nf_conntrack_expect_lock); - if (del_timer(&exp->timeout)) { - nf_ct_unlink_expect(exp); - nf_ct_expect_put(exp); - } + 
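The expectation lookup above (nf_ct_expect_find_get()) runs under rcu_read_lock(), so an object it finds may concurrently be on its way to being freed; converting the acquire to refcount_inc_not_zero() and treating failure as "not found" is what keeps the lock-free lookup safe. The generic shape of the pattern, sketched with hypothetical names:

    #include <linux/rculist.h>
    #include <linux/refcount.h>

    struct exp_like {
            struct hlist_node hnode;
            refcount_t use;
    };

    static struct exp_like *exp_find_get(struct hlist_head *head)
    {
            struct exp_like *e, *found = NULL;

            rcu_read_lock();
            hlist_for_each_entry_rcu(e, head, hnode) {
                    /* a zero count means a concurrent free won the race */
                    if (refcount_inc_not_zero(&e->use)) {
                            found = e;
                            break;
                    }
            }
            rcu_read_unlock();
            return found;   /* caller drops the reference when done */
    }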
nf_ct_remove_expect(exp); spin_unlock_bh(&nf_conntrack_expect_lock); } EXPORT_SYMBOL_GPL(nf_ct_unexpect_related); @@ -275,7 +280,7 @@ struct nf_conntrack_expect *nf_ct_expect_alloc(struct nf_conn *me) return NULL; new->master = me; - atomic_set(&new->use, 1); + refcount_set(&new->use, 1); return new; } EXPORT_SYMBOL_GPL(nf_ct_expect_alloc); @@ -348,7 +353,7 @@ static void nf_ct_expect_free_rcu(struct rcu_head *head) void nf_ct_expect_put(struct nf_conntrack_expect *exp) { - if (atomic_dec_and_test(&exp->use)) + if (refcount_dec_and_test(&exp->use)) call_rcu(&exp->rcu, nf_ct_expect_free_rcu); } EXPORT_SYMBOL_GPL(nf_ct_expect_put); @@ -361,7 +366,7 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple); /* two references : one for hash insert, one for the timer */ - atomic_add(2, &exp->use); + refcount_add(2, &exp->use); hlist_add_head_rcu(&exp->lnode, &master_help->expectations); master_help->expecting[exp->class]++; @@ -394,10 +399,8 @@ static void evict_oldest_expect(struct nf_conn *master, last = exp; } - if (last && del_timer(&last->timeout)) { - nf_ct_unlink_expect(last); - nf_ct_expect_put(last); - } + if (last) + nf_ct_remove_expect(last); } static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) @@ -419,11 +422,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) h = nf_ct_expect_dst_hash(net, &expect->tuple); hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) { if (expect_matches(i, expect)) { - if (del_timer(&i->timeout)) { - nf_ct_unlink_expect(i); - nf_ct_expect_put(i); + if (nf_ct_remove_expect(i)) break; - } } else if (expect_clash(i, expect)) { ret = -EBUSY; goto out; @@ -549,7 +549,7 @@ static int exp_seq_show(struct seq_file *s, void *v) seq_printf(s, "%ld ", timer_pending(&expect->timeout) ?
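The refcount_add(2, ...) in nf_ct_expect_insert() above is the other half of the nf_ct_remove_expect() contract: an inserted expectation is owned twice, once by the hash tables and once by the armed expiry timer, and removal is a race against the timer with del_timer()'s return value as the tie-breaker. A hedged sketch of the accounting (assuming, as the surrounding code suggests, that the unlink path drops the tables' reference internally):

    /* at insert time: tables + timer each own the object */
    refcount_add(2, &exp->use);

    /* at removal: whoever wins del_timer() inherits the timer's
     * reference and is the only caller allowed to unlink
     */
    if (del_timer(&exp->timeout)) {
            nf_ct_unlink_expect(exp);       /* drops the tables' reference */
            nf_ct_expect_put(exp);          /* drops the timer's reference */
    }

This is also why __nf_ct_expect_check() hands the matched entry i to nf_ct_remove_expect(): the not-yet-inserted candidate has no armed timer to win.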
(long)(expect->timeout.expires - jiffies)/HZ : 0); else - seq_printf(s, "- "); + seq_puts(s, "- "); seq_printf(s, "l3proto = %u proto=%u ", expect->tuple.src.l3num, expect->tuple.dst.protonum); @@ -559,7 +559,7 @@ static int exp_seq_show(struct seq_file *s, void *v) expect->tuple.dst.protonum)); if (expect->flags & NF_CT_EXPECT_PERMANENT) { - seq_printf(s, "PERMANENT"); + seq_puts(s, "PERMANENT"); delim = ","; } if (expect->flags & NF_CT_EXPECT_INACTIVE) { diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 008299b7f78fe3754946cf0a58029090234ad905..6c605e88ebae238ce618b9e808a4da0dc697d95c 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -18,17 +18,14 @@ static struct nf_ct_ext_type __rcu *nf_ct_ext_types[NF_CT_EXT_NUM]; static DEFINE_MUTEX(nf_ct_ext_type_mutex); +#define NF_CT_EXT_PREALLOC 128u /* conntrack events are on by default */ -void __nf_ct_ext_destroy(struct nf_conn *ct) +void nf_ct_ext_destroy(struct nf_conn *ct) { unsigned int i; struct nf_ct_ext_type *t; - struct nf_ct_ext *ext = ct->ext; for (i = 0; i < NF_CT_EXT_NUM; i++) { - if (!__nf_ct_ext_exist(ext, i)) - continue; - rcu_read_lock(); t = rcu_dereference(nf_ct_ext_types[i]); @@ -41,54 +38,26 @@ void __nf_ct_ext_destroy(struct nf_conn *ct) rcu_read_unlock(); } } -EXPORT_SYMBOL(__nf_ct_ext_destroy); - -static void * -nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, - size_t var_alloc_len, gfp_t gfp) -{ - unsigned int off, len; - struct nf_ct_ext_type *t; - size_t alloc_size; - - rcu_read_lock(); - t = rcu_dereference(nf_ct_ext_types[id]); - if (!t) { - rcu_read_unlock(); - return NULL; - } - - off = ALIGN(sizeof(struct nf_ct_ext), t->align); - len = off + t->len + var_alloc_len; - alloc_size = t->alloc_size + var_alloc_len; - rcu_read_unlock(); - - *ext = kzalloc(alloc_size, gfp); - if (!*ext) - return NULL; - - (*ext)->offset[id] = off; - (*ext)->len = len; - - return (void *)(*ext) + off; -} +EXPORT_SYMBOL(nf_ct_ext_destroy); -void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, - size_t var_alloc_len, gfp_t gfp) +void *nf_ct_ext_add(struct nf_conn *ct, enum nf_ct_ext_id id, gfp_t gfp) { + unsigned int newlen, newoff, oldlen, alloc; struct nf_ct_ext *old, *new; - int newlen, newoff; struct nf_ct_ext_type *t; /* Conntrack must not be confirmed to avoid races on reallocation. 
*/ NF_CT_ASSERT(!nf_ct_is_confirmed(ct)); old = ct->ext; - if (!old) - return nf_ct_ext_create(&ct->ext, id, var_alloc_len, gfp); - if (__nf_ct_ext_exist(old, id)) - return NULL; + if (old) { + if (__nf_ct_ext_exist(old, id)) + return NULL; + oldlen = old->len; + } else { + oldlen = sizeof(*new); + } rcu_read_lock(); t = rcu_dereference(nf_ct_ext_types[id]); @@ -97,15 +66,19 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, return NULL; } - newoff = ALIGN(old->len, t->align); - newlen = newoff + t->len + var_alloc_len; + newoff = ALIGN(oldlen, t->align); + newlen = newoff + t->len; rcu_read_unlock(); - new = __krealloc(old, newlen, gfp); + alloc = max(newlen, NF_CT_EXT_PREALLOC); + new = __krealloc(old, alloc, gfp); if (!new) return NULL; - if (new != old) { + if (!old) { + memset(new->offset, 0, sizeof(new->offset)); + ct->ext = new; + } else if (new != old) { kfree_rcu(old, rcu); rcu_assign_pointer(ct->ext, new); } @@ -115,45 +88,10 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, memset((void *)new + newoff, 0, newlen - newoff); return (void *)new + newoff; } -EXPORT_SYMBOL(__nf_ct_ext_add_length); - -static void update_alloc_size(struct nf_ct_ext_type *type) -{ - int i, j; - struct nf_ct_ext_type *t1, *t2; - enum nf_ct_ext_id min = 0, max = NF_CT_EXT_NUM - 1; - - /* unnecessary to update all types */ - if ((type->flags & NF_CT_EXT_F_PREALLOC) == 0) { - min = type->id; - max = type->id; - } - - /* This assumes that extended areas in conntrack for the types - whose NF_CT_EXT_F_PREALLOC bit set are allocated in order */ - for (i = min; i <= max; i++) { - t1 = rcu_dereference_protected(nf_ct_ext_types[i], - lockdep_is_held(&nf_ct_ext_type_mutex)); - if (!t1) - continue; - - t1->alloc_size = ALIGN(sizeof(struct nf_ct_ext), t1->align) + - t1->len; - for (j = 0; j < NF_CT_EXT_NUM; j++) { - t2 = rcu_dereference_protected(nf_ct_ext_types[j], - lockdep_is_held(&nf_ct_ext_type_mutex)); - if (t2 == NULL || t2 == t1 || - (t2->flags & NF_CT_EXT_F_PREALLOC) == 0) - continue; - - t1->alloc_size = ALIGN(t1->alloc_size, t2->align) - + t2->len; - } - } -} +EXPORT_SYMBOL(nf_ct_ext_add); /* This MUST be called in process context. */ -int nf_ct_extend_register(struct nf_ct_ext_type *type) +int nf_ct_extend_register(const struct nf_ct_ext_type *type) { int ret = 0; @@ -163,12 +101,7 @@ int nf_ct_extend_register(struct nf_ct_ext_type *type) goto out; } - /* This ensures that nf_ct_ext_create() can allocate enough area - before updating alloc_size */ - type->alloc_size = ALIGN(sizeof(struct nf_ct_ext), type->align) - + type->len; rcu_assign_pointer(nf_ct_ext_types[type->id], type); - update_alloc_size(type); out: mutex_unlock(&nf_ct_ext_type_mutex); return ret; @@ -176,11 +109,10 @@ int nf_ct_extend_register(struct nf_ct_ext_type *type) EXPORT_SYMBOL_GPL(nf_ct_extend_register); /* This MUST be called in process context. 
*/ -void nf_ct_extend_unregister(struct nf_ct_ext_type *type) +void nf_ct_extend_unregister(const struct nf_ct_ext_type *type) { mutex_lock(&nf_ct_ext_type_mutex); RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL); - update_alloc_size(type); mutex_unlock(&nf_ct_ext_type_mutex); synchronize_rcu(); } diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 4aecef4a89fb135e5b50ac382f19645aafaaff94..f0e9a7511e1ac4915ed9bc43492d214417b3c1bd 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -577,6 +577,8 @@ static int __init nf_conntrack_ftp_init(void) { int i, ret = 0; + NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_ftp_master)); + ftp_buffer = kmalloc(65536, GFP_KERNEL); if (!ftp_buffer) return -ENOMEM; @@ -589,12 +591,10 @@ static int __init nf_conntrack_ftp_init(void) for (i = 0; i < ports_c; i++) { nf_ct_helper_init(&ftp[2 * i], AF_INET, IPPROTO_TCP, "ftp", FTP_PORT, ports[i], ports[i], &ftp_exp_policy, - 0, sizeof(struct nf_ct_ftp_master), help, - nf_ct_ftp_from_nlattr, THIS_MODULE); + 0, help, nf_ct_ftp_from_nlattr, THIS_MODULE); nf_ct_helper_init(&ftp[2 * i + 1], AF_INET6, IPPROTO_TCP, "ftp", FTP_PORT, ports[i], ports[i], &ftp_exp_policy, - 0, sizeof(struct nf_ct_ftp_master), help, - nf_ct_ftp_from_nlattr, THIS_MODULE); + 0, help, nf_ct_ftp_from_nlattr, THIS_MODULE); } ret = nf_conntrack_helpers_register(ftp, ports_c * 2); diff --git a/net/netfilter/nf_conntrack_h323_main.c b/net/netfilter/nf_conntrack_h323_main.c index f65d93639d12595884081726808f4c1e17caec03..3bcdc718484e4c5352ab2de17c5a8aaf227af2f3 100644 --- a/net/netfilter/nf_conntrack_h323_main.c +++ b/net/netfilter/nf_conntrack_h323_main.c @@ -637,7 +637,6 @@ static const struct nf_conntrack_expect_policy h245_exp_policy = { static struct nf_conntrack_helper nf_conntrack_helper_h245 __read_mostly = { .name = "H.245", .me = THIS_MODULE, - .data_len = sizeof(struct nf_ct_h323_master), .tuple.src.l3num = AF_UNSPEC, .tuple.dst.protonum = IPPROTO_UDP, .help = h245_help, @@ -1215,7 +1214,6 @@ static struct nf_conntrack_helper nf_conntrack_helper_q931[] __read_mostly = { { .name = "Q.931", .me = THIS_MODULE, - .data_len = sizeof(struct nf_ct_h323_master), .tuple.src.l3num = AF_INET, .tuple.src.u.tcp.port = cpu_to_be16(Q931_PORT), .tuple.dst.protonum = IPPROTO_TCP, @@ -1800,7 +1798,6 @@ static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = { { .name = "RAS", .me = THIS_MODULE, - .data_len = sizeof(struct nf_ct_h323_master), .tuple.src.l3num = AF_INET, .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT), .tuple.dst.protonum = IPPROTO_UDP, @@ -1810,7 +1807,6 @@ static struct nf_conntrack_helper nf_conntrack_helper_ras[] __read_mostly = { { .name = "RAS", .me = THIS_MODULE, - .data_len = sizeof(struct nf_ct_h323_master), .tuple.src.l3num = AF_INET6, .tuple.src.u.udp.port = cpu_to_be16(RAS_PORT), .tuple.dst.protonum = IPPROTO_UDP, @@ -1836,6 +1832,8 @@ static int __init nf_conntrack_h323_init(void) { int ret; + NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_h323_master)); + h323_buffer = kmalloc(65536, GFP_KERNEL); if (!h323_buffer) return -ENOMEM; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 4eeb3418366ad5473945395346b6e4e5fc213fbc..3a60efa7799b2e4569af35ce943c67fc354dc68a 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -187,8 +187,7 @@ nf_ct_helper_ext_add(struct nf_conn *ct, { struct nf_conn_help *help; - help = nf_ct_ext_add_length(ct, NF_CT_EXT_HELPER, - 
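All conntrack extensions now live in a single blob that nf_ct_ext_add() grows with __krealloc(), recording where each extension starts in a small per-id offset table; NF_CT_EXT_PREALLOC rounds the first allocation up so the common extensions rarely force a second one, and the BUILD_BUG_ON(total_extension_size() > 255u) added earlier keeps the u8 offset/length fields honest. The variable-length helper area is gone with it, which is why nf_ct_helper_ext_add() at this point stops passing a length. A simplified sketch of the bookkeeping (illustrative; RCU republication and the kernel's exact types are omitted):

    #include <linux/kernel.h>
    #include <linux/slab.h>

    struct ext_blob {
            u8 offset[NF_CT_EXT_NUM];       /* where each extension starts */
            u8 len;                         /* bytes used so far */
            char data[];
    };

    static void *ext_add(struct ext_blob **pext, unsigned int id,
                         unsigned int elen, unsigned int ealign, gfp_t gfp)
    {
            unsigned int oldlen = *pext ? (*pext)->len : sizeof(struct ext_blob);
            unsigned int newoff = ALIGN(oldlen, ealign);
            bool first = (*pext == NULL);
            struct ext_blob *new;

            /* preallocate generously so later extensions reuse the slack */
            new = krealloc(*pext, max(newoff + elen, 128u), gfp);
            if (!new)
                    return NULL;
            if (first)
                    memset(new->offset, 0, sizeof(new->offset));
            new->offset[id] = newoff;
            new->len = newoff + elen;
            memset((void *)new + newoff, 0, elen);
            *pext = new;
            return (void *)new + newoff;
    }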
helper->data_len, gfp); + help = nf_ct_ext_add(ct, NF_CT_EXT_HELPER, gfp); if (help) INIT_HLIST_HEAD(&help->expectations); else @@ -386,17 +385,36 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; unsigned int h = helper_hash(&me->tuple); struct nf_conntrack_helper *cur; - int ret = 0; + int ret = 0, i; BUG_ON(me->expect_policy == NULL); BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); BUG_ON(strlen(me->name) > NF_CT_HELPER_NAME_LEN - 1); + if (me->expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) + return -EINVAL; + mutex_lock(&nf_ct_helper_mutex); - hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { - if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { - ret = -EEXIST; - goto out; + for (i = 0; i < nf_ct_helper_hsize; i++) { + hlist_for_each_entry(cur, &nf_ct_helper_hash[i], hnode) { + if (!strcmp(cur->name, me->name) && + (cur->tuple.src.l3num == NFPROTO_UNSPEC || + cur->tuple.src.l3num == me->tuple.src.l3num) && + cur->tuple.dst.protonum == me->tuple.dst.protonum) { + ret = -EEXIST; + goto out; + } + } + } + + /* avoid unpredictable behaviour for auto_assign_helper */ + if (!(me->flags & NF_CT_HELPER_F_USERSPACE)) { + hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { + if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, + &mask)) { + ret = -EEXIST; + goto out; + } } } hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); @@ -455,11 +473,8 @@ void nf_conntrack_helper_unregister(struct nf_conntrack_helper *me) if ((rcu_dereference_protected( help->helper, lockdep_is_held(&nf_conntrack_expect_lock) - ) == me || exp->helper == me) && - del_timer(&exp->timeout)) { - nf_ct_unlink_expect(exp); - nf_ct_expect_put(exp); - } + ) == me || exp->helper == me)) + nf_ct_remove_expect(exp); } } spin_unlock_bh(&nf_conntrack_expect_lock); @@ -491,7 +506,7 @@ void nf_ct_helper_init(struct nf_conntrack_helper *helper, u16 l3num, u16 protonum, const char *name, u16 default_port, u16 spec_port, u32 id, const struct nf_conntrack_expect_policy *exp_pol, - u32 expect_class_max, u32 data_len, + u32 expect_class_max, int (*help)(struct sk_buff *skb, unsigned int protoff, struct nf_conn *ct, enum ip_conntrack_info ctinfo), @@ -504,7 +519,6 @@ void nf_ct_helper_init(struct nf_conntrack_helper *helper, helper->tuple.src.u.all = htons(spec_port); helper->expect_policy = exp_pol; helper->expect_class_max = expect_class_max; - helper->data_len = data_len; helper->help = help; helper->from_nlattr = from_nlattr; helper->me = module; @@ -544,7 +558,7 @@ void nf_conntrack_helpers_unregister(struct nf_conntrack_helper *helper, } EXPORT_SYMBOL_GPL(nf_conntrack_helpers_unregister); -static struct nf_ct_ext_type helper_extend __read_mostly = { +static const struct nf_ct_ext_type helper_extend = { .len = sizeof(struct nf_conn_help), .align = __alignof__(struct nf_conn_help), .id = NF_CT_EXT_HELPER, diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 1972a149f958350c6f6dd5f466d36fed152b76fc..5523acce9d6993dc71e467bbdf7b718ab2b25bf7 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -243,6 +243,12 @@ static int __init nf_conntrack_irc_init(void) return -EINVAL; } + if (max_dcc_channels > NF_CT_EXPECT_MAX_CNT) { + pr_err("max_dcc_channels must not be more than %u\n", + NF_CT_EXPECT_MAX_CNT); + return -EINVAL; + } + irc_exp_policy.max_expected = max_dcc_channels; irc_exp_policy.timeout = dcc_timeout; @@ -257,7 +263,7 @@ static int 
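With data_len removed from nf_ct_helper_init() and the helper structs, a helper's per-connection state must fit into the fixed scratch area embedded in struct nf_conn_help, and the NF_CT_HELPER_BUILD_BUG_ON() calls being added to the module init functions turn an oversized master struct into a build failure instead of silent corruption. The mechanism, sketched with an assumed buffer size (the macro body below is an illustration, not the kernel header):

    #define HELPER_DATA_MAX 32      /* assumed size of the nf_conn_help buffer */
    #define HELPER_BUILD_BUG_ON(structsize) \
            BUILD_BUG_ON((structsize) > HELPER_DATA_MAX)

    struct my_helper_master {       /* hypothetical helper state */
            u32 seq_aft_nl[2];
            u16 flags;
    };

    static int __init my_helper_init(void)
    {
            HELPER_BUILD_BUG_ON(sizeof(struct my_helper_master));
            return 0;
    }

Helpers with no private state, such as amanda and netbios_ns above, pass 0, keeping the annotation while asserting nothing about a private struct.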
__init nf_conntrack_irc_init(void) for (i = 0; i < ports_c; i++) { nf_ct_helper_init(&irc[i], AF_INET, IPPROTO_TCP, "irc", IRC_PORT, ports[i], i, &irc_exp_policy, - 0, 0, help, NULL, THIS_MODULE); + 0, help, NULL, THIS_MODULE); } ret = nf_conntrack_helpers_register(&irc[0], ports_c); diff --git a/net/netfilter/nf_conntrack_labels.c b/net/netfilter/nf_conntrack_labels.c index bcab8bde73128f9f9f8b4dba31a1c283b07f8eb1..adf2198599018ca6ec4377ffa4b92bb0a2dcd169 100644 --- a/net/netfilter/nf_conntrack_labels.c +++ b/net/netfilter/nf_conntrack_labels.c @@ -82,7 +82,7 @@ void nf_connlabels_put(struct net *net) } EXPORT_SYMBOL_GPL(nf_connlabels_put); -static struct nf_ct_ext_type labels_extend __read_mostly = { +static const struct nf_ct_ext_type labels_extend = { .len = sizeof(struct nf_conn_labels), .align = __alignof__(struct nf_conn_labels), .id = NF_CT_EXT_LABELS, diff --git a/net/netfilter/nf_conntrack_netbios_ns.c b/net/netfilter/nf_conntrack_netbios_ns.c index 4c8f30a3d6d2762e69604c4b1f6cd6821663bc64..496ce173f0c1937c73fb360e402c9c3e3936e1e9 100644 --- a/net/netfilter/nf_conntrack_netbios_ns.c +++ b/net/netfilter/nf_conntrack_netbios_ns.c @@ -58,6 +58,8 @@ static struct nf_conntrack_helper helper __read_mostly = { static int __init nf_conntrack_netbios_ns_init(void) { + NF_CT_HELPER_BUILD_BUG_ON(0); + exp_policy.timeout = timeout; return nf_conntrack_helper_register(&helper); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index dc7dfd68fafe5d8db341488ac7d9ad5bf8f80b46..dcf561b5c97a47e627ee00649d756635db0e6fb3 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -417,8 +417,7 @@ dump_ct_seq_adj(struct sk_buff *skb, const struct nf_ct_seqadj *seq, int type) return -1; } -static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, - const struct nf_conn *ct) +static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *seq; @@ -426,15 +425,20 @@ static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj) return 0; + spin_lock_bh(&ct->lock); seq = &seqadj->seq[IP_CT_DIR_ORIGINAL]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1) - return -1; + goto err; seq = &seqadj->seq[IP_CT_DIR_REPLY]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1) - return -1; + goto err; + spin_unlock_bh(&ct->lock); return 0; +err: + spin_unlock_bh(&ct->lock); + return -1; } static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) @@ -467,7 +471,7 @@ ctnetlink_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nlattr *nest_parms; unsigned int flags = portid ? 
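From here on, every nla_parse_nested()/nla_validate_nested()/nlmsg_parse() call grows a trailing argument: the new struct netlink_ext_ack pointer, through which parsers can tell user space which attribute was rejected and why. Call sites with no request context pass NULL, while paths that have one thread it through, as in the ip_vs_genl_set_daemon() hunk earlier that passes info->extack. A sketch of a caller making use of it (attribute and policy names hypothetical):

    static int parse_foo(const struct nlattr *attr,
                         struct netlink_ext_ack *extack)
    {
            struct nlattr *tb[FOO_ATTR_MAX + 1];
            int err;

            /* on failure the extack carries the bad nlattr and a message
             * back to user space instead of a bare -EINVAL
             */
            err = nla_parse_nested(tb, FOO_ATTR_MAX, attr, foo_policy, extack);
            if (err < 0)
                    return err;

            if (!tb[FOO_ATTR_PORT]) {
                    NL_SET_ERR_MSG(extack, "missing port attribute");
                    return -EINVAL;
            }
            return 0;
    }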
NLM_F_MULTI : 0, event; - event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_NEW); + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_NEW); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -627,10 +631,6 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) unsigned int flags = 0, group; int err; - /* ignore our fake conntrack entry */ - if (nf_ct_is_untracked(ct)) - return 0; - if (events & (1 << IPCT_DESTROY)) { type = IPCTNL_MSG_CT_DELETE; group = NFNLGRP_CONNTRACK_DESTROY; @@ -652,7 +652,7 @@ ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) if (skb == NULL) goto errout; - type |= NFNL_SUBSYS_CTNETLINK << 8; + type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, type); nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -908,7 +908,7 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr, struct nf_conntrack_l3proto *l3proto; int ret = 0; - ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL); + ret = nla_parse_nested(tb, CTA_IP_MAX, attr, NULL, NULL); if (ret < 0) return ret; @@ -917,7 +917,7 @@ static int ctnetlink_parse_tuple_ip(struct nlattr *attr, if (likely(l3proto->nlattr_to_tuple)) { ret = nla_validate_nested(attr, CTA_IP_MAX, - l3proto->nla_policy); + l3proto->nla_policy, NULL); if (ret == 0) ret = l3proto->nlattr_to_tuple(tb, tuple); } @@ -938,7 +938,8 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, struct nf_conntrack_l4proto *l4proto; int ret = 0; - ret = nla_parse_nested(tb, CTA_PROTO_MAX, attr, proto_nla_policy); + ret = nla_parse_nested(tb, CTA_PROTO_MAX, attr, proto_nla_policy, + NULL); if (ret < 0) return ret; @@ -951,7 +952,7 @@ static int ctnetlink_parse_tuple_proto(struct nlattr *attr, if (likely(l4proto->nlattr_to_tuple)) { ret = nla_validate_nested(attr, CTA_PROTO_MAX, - l4proto->nla_policy); + l4proto->nla_policy, NULL); if (ret == 0) ret = l4proto->nlattr_to_tuple(tb, tuple); } @@ -1015,7 +1016,8 @@ ctnetlink_parse_tuple(const struct nlattr * const cda[], memset(tuple, 0, sizeof(*tuple)); - err = nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy); + err = nla_parse_nested(tb, CTA_TUPLE_MAX, cda[type], tuple_nla_policy, + NULL); if (err < 0) return err; @@ -1065,7 +1067,7 @@ static int ctnetlink_parse_help(const struct nlattr *attr, char **helper_name, int err; struct nlattr *tb[CTA_HELP_MAX+1]; - err = nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy); + err = nla_parse_nested(tb, CTA_HELP_MAX, attr, help_nla_policy, NULL); if (err < 0) return err; @@ -1419,6 +1421,24 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, } #endif +static void +__ctnetlink_change_status(struct nf_conn *ct, unsigned long on, + unsigned long off) +{ + unsigned int bit; + + /* Ignore these unchangeable bits */ + on &= ~IPS_UNCHANGEABLE_MASK; + off &= ~IPS_UNCHANGEABLE_MASK; + + for (bit = 0; bit < __IPS_MAX_BIT; bit++) { + if (on & (1 << bit)) + set_bit(bit, &ct->status); + else if (off & (1 << bit)) + clear_bit(bit, &ct->status); + } +} + static int ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) { @@ -1438,10 +1458,7 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) /* ASSURED bit can only be set */ return -EBUSY; - /* Be careful here, modifying NAT bits can screw up things, * so don't let users modify them directly if they don't pass * nf_nat_range.
*/ - ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); + __ctnetlink_change_status(ct, status, 0); return 0; } @@ -1510,23 +1527,11 @@ static int ctnetlink_change_helper(struct nf_conn *ct, return 0; } + rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper == NULL) { -#ifdef CONFIG_MODULES - spin_unlock_bh(&nf_conntrack_expect_lock); - - if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_expect_lock); - return -EOPNOTSUPP; - } - - spin_lock_bh(&nf_conntrack_expect_lock); - helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), - nf_ct_protonum(ct)); - if (helper) - return -EAGAIN; -#endif + rcu_read_unlock(); return -EOPNOTSUPP; } @@ -1535,13 +1540,16 @@ static int ctnetlink_change_helper(struct nf_conn *ct, /* update private helper data if allowed. */ if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); - return 0; + err = 0; } else - return -EBUSY; + err = -EBUSY; + } else { + /* we cannot set a helper for an existing conntrack */ + err = -EOPNOTSUPP; } - /* we cannot set a helper for an existing conntrack */ - return -EOPNOTSUPP; + rcu_read_unlock(); + return err; } static int ctnetlink_change_timeout(struct nf_conn *ct, @@ -1571,7 +1579,8 @@ static int ctnetlink_change_protoinfo(struct nf_conn *ct, struct nf_conntrack_l4proto *l4proto; int err = 0; - err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy); + err = nla_parse_nested(tb, CTA_PROTOINFO_MAX, attr, protoinfo_policy, + NULL); if (err < 0) return err; @@ -1596,7 +1605,7 @@ static int change_seq_adj(struct nf_ct_seqadj *seq, int err; struct nlattr *cda[CTA_SEQADJ_MAX+1]; - err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy); + err = nla_parse_nested(cda, CTA_SEQADJ_MAX, attr, seqadj_policy, NULL); if (err < 0) return err; @@ -1631,25 +1640,30 @@ ctnetlink_change_seq_adj(struct nf_conn *ct, if (!seqadj) return 0; + spin_lock_bh(&ct->lock); if (cda[CTA_SEQ_ADJ_ORIG]) { ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL], cda[CTA_SEQ_ADJ_ORIG]); if (ret < 0) - return ret; + goto err; - ct->status |= IPS_SEQ_ADJUST; + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } if (cda[CTA_SEQ_ADJ_REPLY]) { ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY], cda[CTA_SEQ_ADJ_REPLY]); if (ret < 0) - return ret; + goto err; - ct->status |= IPS_SEQ_ADJUST; + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } + spin_unlock_bh(&ct->lock); return 0; +err: + spin_unlock_bh(&ct->lock); + return ret; } static int @@ -1960,9 +1974,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, err = -EEXIST; ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - spin_lock_bh(&nf_conntrack_expect_lock); err = ctnetlink_change_conntrack(ct, cda); - spin_unlock_bh(&nf_conntrack_expect_lock); if (err == 0) { nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | @@ -1988,7 +2000,8 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, struct nfgenmsg *nfmsg; unsigned int flags = portid ? NLM_F_MULTI : 0, event; - event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS_CPU); + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, + IPCTNL_MSG_CT_GET_STATS_CPU); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -2071,7 +2084,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, unsigned int flags = portid ? 
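ct->status is also flipped from packet processing with set_bit()-style atomics, so the old read-modify-write in ctnetlink could lose a bit set concurrently by another CPU. __ctnetlink_change_status() above avoids that by touching each requested bit individually, and the unchangeable-mask filtering keeps user space away from kernel-managed bits such as the NAT state. The failure mode being closed, and the two calling conventions, in short:

    /* lost-update race with the old plain OR:
     *   CPU0: tmp = ct->status;       CPU1: set_bit(IPS_ASSURED_BIT, &ct->status);
     *   CPU0: ct->status = tmp | x;   and CPU1's bit is wiped out
     */

    /* set-only semantics, as in ctnetlink_change_status() above */
    __ctnetlink_change_status(ct, status, 0);

    /* set-and-clear semantics, as in ctnetlink_update_status() below:
     * changeable bits absent from the request are cleared
     */
    __ctnetlink_change_status(ct, status, ~status);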
NLM_F_MULTI : 0, event; unsigned int nr_conntracks = atomic_read(&net->ct.count); - event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_CT_GET_STATS); + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, IPCTNL_MSG_CT_GET_STATS); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -2177,13 +2190,7 @@ ctnetlink_glue_build_size(const struct nf_conn *ct) static struct nf_conn *ctnetlink_glue_get_ct(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo) { - struct nf_conn *ct; - - ct = nf_ct_get(skb, ctinfo); - if (ct && nf_ct_is_untracked(ct)) - ct = NULL; - - return ct; + return nf_ct_get(skb, ctinfo); } static int __ctnetlink_glue_build(struct sk_buff *skb, struct nf_conn *ct) @@ -2300,10 +2307,10 @@ ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[]) /* This check is less strict than ctnetlink_change_status() * because callers often flip IPS_EXPECTED bits when sending * an NFQA_CT attribute to the kernel. So ignore the - * unchangeable bits but do not error out. + * unchangeable bits but do not error out. Also user programs + * are allowed to clear the bits that they are allowed to change. */ - ct->status = (status & ~IPS_UNCHANGEABLE_MASK) | - (ct->status & IPS_UNCHANGEABLE_MASK); + __ctnetlink_change_status(ct, status, ~status); return 0; } @@ -2353,15 +2360,11 @@ ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct) struct nlattr *cda[CTA_MAX+1]; int ret; - ret = nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy); + ret = nla_parse_nested(cda, CTA_MAX, attr, ct_nla_policy, NULL); if (ret < 0) return ret; - spin_lock_bh(&nf_conntrack_expect_lock); - ret = ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); - spin_unlock_bh(&nf_conntrack_expect_lock); - - return ret; + return ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); } static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda, @@ -2390,7 +2393,8 @@ ctnetlink_glue_attach_expect(const struct nlattr *attr, struct nf_conn *ct, struct nf_conntrack_expect *exp; int err; - err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy); + err = nla_parse_nested(cda, CTA_EXPECT_MAX, attr, exp_nla_policy, + NULL); if (err < 0) return err; @@ -2581,7 +2585,7 @@ ctnetlink_exp_fill_info(struct sk_buff *skb, u32 portid, u32 seq, struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
NLM_F_MULTI : 0; - event |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -2632,7 +2636,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) if (skb == NULL) goto errout; - type |= NFNL_SUBSYS_CTNETLINK_EXP << 8; + type = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_EXP, type); nlh = nlmsg_put(skb, item->portid, 0, type, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -2698,7 +2702,7 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { - if (!atomic_inc_not_zero(&exp->use)) + if (!refcount_inc_not_zero(&exp->use)) continue; cb->args[1] = (unsigned long)exp; goto out; @@ -2744,7 +2748,7 @@ ctnetlink_exp_ct_dump_table(struct sk_buff *skb, struct netlink_callback *cb) cb->nlh->nlmsg_seq, IPCTNL_MSG_EXP_NEW, exp) < 0) { - if (!atomic_inc_not_zero(&exp->use)) + if (!refcount_inc_not_zero(&exp->use)) continue; cb->args[1] = (unsigned long)exp; goto out; @@ -3015,7 +3019,8 @@ ctnetlink_parse_expect_nat(const struct nlattr *attr, struct nf_conntrack_tuple nat_tuple = {}; int err; - err = nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr, exp_nat_nla_policy); + err = nla_parse_nested(tb, CTA_EXPECT_NAT_MAX, attr, + exp_nat_nla_policy, NULL); if (err < 0) return err; @@ -3049,6 +3054,10 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, struct nf_conn_help *help; int err; + help = nfct_help(ct); + if (!help) + return ERR_PTR(-EOPNOTSUPP); + if (cda[CTA_EXPECT_CLASS] && helper) { class = ntohl(nla_get_be32(cda[CTA_EXPECT_CLASS])); if (class > helper->expect_class_max) @@ -3058,26 +3067,11 @@ ctnetlink_alloc_expect(const struct nlattr * const cda[], struct nf_conn *ct, if (!exp) return ERR_PTR(-ENOMEM); - help = nfct_help(ct); - if (!help) { - if (!cda[CTA_EXPECT_TIMEOUT]) { - err = -EINVAL; - goto err_out; - } - exp->timeout.expires = - jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; - - exp->flags = NF_CT_EXPECT_USERSPACE; - if (cda[CTA_EXPECT_FLAGS]) { - exp->flags |= - ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); - } + if (cda[CTA_EXPECT_FLAGS]) { + exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); + exp->flags &= ~NF_CT_EXPECT_USERSPACE; } else { - if (cda[CTA_EXPECT_FLAGS]) { - exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); - exp->flags &= ~NF_CT_EXPECT_USERSPACE; - } else - exp->flags = 0; + exp->flags = 0; } if (cda[CTA_EXPECT_FN]) { const char *name = nla_data(cda[CTA_EXPECT_FN]); @@ -3240,7 +3234,8 @@ ctnetlink_exp_stat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, int cpu, struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
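The hunks that follow add a can_early_drop() callback to the DCCP, SCTP and TCP trackers. It pairs with the gc_worker() changes earlier in this patch: once the table is above 95% of nf_conntrack_max, the garbage collector may reclaim even ASSURED entries, but only after asking the L4 tracker whether the flow is in a state that is safe to drop; connections that are closing or in time-wait qualify, established ones do not. A sketch of a tracker supplying the hook (the state selection here is illustrative; the real handlers below make their own choices):

    static bool my_proto_can_early_drop(const struct nf_conn *ct)
    {
            switch (ct->proto.tcp.state) {
            case TCP_CONNTRACK_FIN_WAIT:
            case TCP_CONNTRACK_CLOSE_WAIT:
            case TCP_CONNTRACK_TIME_WAIT:
                    return true;    /* already winding down, safe to reclaim */
            default:
                    return false;   /* never drop a live established flow */
            }
    }

    /* wired up through the protocol descriptor, as in the dccp/sctp hunks:
     *      .can_early_drop = my_proto_can_early_drop,
     */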
NLM_F_MULTI : 0, event; - event = (NFNL_SUBSYS_CTNETLINK << 8 | IPCTNL_MSG_EXP_GET_STATS_CPU); + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK, + IPCTNL_MSG_EXP_GET_STATS_CPU); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c index f60a4755d71e2b0763be35e07976e02169e1b01e..6959e93063d4c957017b97e08dfca27b775461c2 100644 --- a/net/netfilter/nf_conntrack_pptp.c +++ b/net/netfilter/nf_conntrack_pptp.c @@ -263,7 +263,7 @@ static int exp_gre(struct nf_conn *ct, __be16 callid, __be16 peer_callid) goto out_put_both; } -static inline int +static int pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, struct PptpControlHeader *ctlh, union pptp_ctrl_union *pptpReq, @@ -391,7 +391,7 @@ pptp_inbound_pkt(struct sk_buff *skb, unsigned int protoff, return NF_ACCEPT; } -static inline int +static int pptp_outbound_pkt(struct sk_buff *skb, unsigned int protoff, struct PptpControlHeader *ctlh, union pptp_ctrl_union *pptpReq, @@ -523,6 +523,14 @@ conntrack_pptp_help(struct sk_buff *skb, unsigned int protoff, int ret; u_int16_t msg; +#if IS_ENABLED(CONFIG_NF_NAT) + if (!nf_ct_is_confirmed(ct) && (ct->status & IPS_NAT_MASK)) { + struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); + + if (!nat && !nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC)) + return NF_DROP; + } +#endif /* don't do any tracking before tcp handshake complete */ if (ctinfo != IP_CT_ESTABLISHED && ctinfo != IP_CT_ESTABLISHED_REPLY) return NF_ACCEPT; @@ -596,7 +604,6 @@ static const struct nf_conntrack_expect_policy pptp_exp_policy = { static struct nf_conntrack_helper pptp __read_mostly = { .name = "pptp", .me = THIS_MODULE, - .data_len = sizeof(struct nf_ct_pptp_master), .tuple.src.l3num = AF_INET, .tuple.src.u.tcp.port = cpu_to_be16(PPTP_CONTROL_PORT), .tuple.dst.protonum = IPPROTO_TCP, @@ -607,6 +614,8 @@ static struct nf_conntrack_helper pptp __read_mostly = { static int __init nf_conntrack_pptp_init(void) { + NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_pptp_master)); + return nf_conntrack_helper_register(&pptp); } diff --git a/net/netfilter/nf_conntrack_proto.c b/net/netfilter/nf_conntrack_proto.c index 2d6ee18034159d12603e50c694541a9bdda147fc..2de6c1fe326149c5ab46f06b9ca7a3db992c4e7f 100644 --- a/net/netfilter/nf_conntrack_proto.c +++ b/net/netfilter/nf_conntrack_proto.c @@ -202,7 +202,7 @@ static int kill_l3proto(struct nf_conn *i, void *data) static int kill_l4proto(struct nf_conn *i, void *data) { struct nf_conntrack_l4proto *l4proto; - l4proto = (struct nf_conntrack_l4proto *)data; + l4proto = data; return nf_ct_protonum(i) == l4proto->l4proto && nf_ct_l3num(i) == l4proto->l3proto; } @@ -441,9 +441,8 @@ EXPORT_SYMBOL_GPL(nf_ct_l4proto_unregister_one); void nf_ct_l4proto_pernet_unregister_one(struct net *net, struct nf_conntrack_l4proto *l4proto) { - struct nf_proto_net *pn = NULL; + struct nf_proto_net *pn = nf_ct_l4proto_net(net, l4proto); - pn = nf_ct_l4proto_net(net, l4proto); if (pn == NULL) return; diff --git a/net/netfilter/nf_conntrack_proto_dccp.c b/net/netfilter/nf_conntrack_proto_dccp.c index 93dd1c5b7bff9e5285530a446bba6811ec26ead4..b553fdd68816b98a0f6f58cba3bd7363959479cf 100644 --- a/net/netfilter/nf_conntrack_proto_dccp.c +++ b/net/netfilter/nf_conntrack_proto_dccp.c @@ -609,6 +609,20 @@ static int dccp_error(struct net *net, struct nf_conn *tmpl, return -NF_ACCEPT; } +static bool dccp_can_early_drop(const struct nf_conn *ct) +{ + switch (ct->proto.dccp.state) { + case 
CT_DCCP_CLOSEREQ: + case CT_DCCP_CLOSING: + case CT_DCCP_TIMEWAIT: + return true; + default: + break; + } + + return false; +} + static void dccp_print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple) { @@ -665,7 +679,7 @@ static int nlattr_to_dccp(struct nlattr *cda[], struct nf_conn *ct) return 0; err = nla_parse_nested(tb, CTA_PROTOINFO_DCCP_MAX, attr, - dccp_nla_policy); + dccp_nla_policy, NULL); if (err < 0) return err; @@ -868,6 +882,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp4 __read_mostly = { .packet = dccp_packet, .get_timeouts = dccp_get_timeouts, .error = dccp_error, + .can_early_drop = dccp_can_early_drop, .print_tuple = dccp_print_tuple, .print_conntrack = dccp_print_conntrack, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -902,6 +917,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_dccp6 __read_mostly = { .packet = dccp_packet, .get_timeouts = dccp_get_timeouts, .error = dccp_error, + .can_early_drop = dccp_can_early_drop, .print_tuple = dccp_print_tuple, .print_conntrack = dccp_print_conntrack, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) diff --git a/net/netfilter/nf_conntrack_proto_sctp.c b/net/netfilter/nf_conntrack_proto_sctp.c index 33279aab583d5eac3016b4a58f6bf2ea8b457395..13875d599a85713bbfeb2fee7b8978fa498a10ed 100644 --- a/net/netfilter/nf_conntrack_proto_sctp.c +++ b/net/netfilter/nf_conntrack_proto_sctp.c @@ -535,6 +535,20 @@ static int sctp_error(struct net *net, struct nf_conn *tpl, struct sk_buff *skb, return -NF_ACCEPT; } +static bool sctp_can_early_drop(const struct nf_conn *ct) +{ + switch (ct->proto.sctp.state) { + case SCTP_CONNTRACK_SHUTDOWN_SENT: + case SCTP_CONNTRACK_SHUTDOWN_RECD: + case SCTP_CONNTRACK_SHUTDOWN_ACK_SENT: + return true; + default: + break; + } + + return false; +} + #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include @@ -584,10 +598,8 @@ static int nlattr_to_sctp(struct nlattr *cda[], struct nf_conn *ct) if (!attr) return 0; - err = nla_parse_nested(tb, - CTA_PROTOINFO_SCTP_MAX, - attr, - sctp_nla_policy); + err = nla_parse_nested(tb, CTA_PROTOINFO_SCTP_MAX, attr, + sctp_nla_policy, NULL); if (err < 0) return err; @@ -783,6 +795,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp4 __read_mostly = { .get_timeouts = sctp_get_timeouts, .new = sctp_new, .error = sctp_error, + .can_early_drop = sctp_can_early_drop, .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = sctp_to_nlattr, @@ -818,6 +831,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_sctp6 __read_mostly = { .get_timeouts = sctp_get_timeouts, .new = sctp_new, .error = sctp_error, + .can_early_drop = sctp_can_early_drop, .me = THIS_MODULE, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = sctp_to_nlattr, diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b122e9dacfed06e27aecab3fbc71203772d7b427..9758a7dfd83ef95c0bcab1257d63cb38d9faed88 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -419,10 +419,9 @@ static void tcp_options(const struct sk_buff *skb, && opsize == TCPOLEN_WINDOW) { state->td_scale = *(u_int8_t *)ptr; - if (state->td_scale > 14) { - /* See RFC1323 */ - state->td_scale = 14; - } + if (state->td_scale > TCP_MAX_WSCALE) + state->td_scale = TCP_MAX_WSCALE; + state->flags |= IP_CT_TCP_FLAG_WINDOW_SCALE; } @@ -1172,6 +1171,22 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, return true; } +static bool tcp_can_early_drop(const struct nf_conn *ct) +{ + switch (ct->proto.tcp.state) { + case 
TCP_CONNTRACK_FIN_WAIT: + case TCP_CONNTRACK_LAST_ACK: + case TCP_CONNTRACK_TIME_WAIT: + case TCP_CONNTRACK_CLOSE: + case TCP_CONNTRACK_CLOSE_WAIT: + return true; + default: + break; + } + + return false; +} + #if IS_ENABLED(CONFIG_NF_CT_NETLINK) #include @@ -1234,7 +1249,8 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) if (!pattr) return 0; - err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy); + err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, + tcp_nla_policy, NULL); if (err < 0) return err; @@ -1549,6 +1565,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = .get_timeouts = tcp_get_timeouts, .new = tcp_new, .error = tcp_error, + .can_early_drop = tcp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = tcp_to_nlattr, .nlattr_size = tcp_nlattr_size, @@ -1586,6 +1603,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = .get_timeouts = tcp_get_timeouts, .new = tcp_new, .error = tcp_error, + .can_early_drop = tcp_can_early_drop, #if IS_ENABLED(CONFIG_NF_CT_NETLINK) .to_nlattr = tcp_to_nlattr, .nlattr_size = tcp_nlattr_size, diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c index 9dcb9ee9b97d53361c73df05aa5fe9923e933bda..ae457f39d5ceb804aa60120c7d897dd441e49d2a 100644 --- a/net/netfilter/nf_conntrack_sane.c +++ b/net/netfilter/nf_conntrack_sane.c @@ -184,6 +184,8 @@ static int __init nf_conntrack_sane_init(void) { int i, ret = 0; + NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_sane_master)); + sane_buffer = kmalloc(65536, GFP_KERNEL); if (!sane_buffer) return -ENOMEM; @@ -196,13 +198,11 @@ static int __init nf_conntrack_sane_init(void) for (i = 0; i < ports_c; i++) { nf_ct_helper_init(&sane[2 * i], AF_INET, IPPROTO_TCP, "sane", SANE_PORT, ports[i], ports[i], - &sane_exp_policy, 0, - sizeof(struct nf_ct_sane_master), help, NULL, + &sane_exp_policy, 0, help, NULL, THIS_MODULE); nf_ct_helper_init(&sane[2 * i + 1], AF_INET6, IPPROTO_TCP, "sane", SANE_PORT, ports[i], ports[i], - &sane_exp_policy, 0, - sizeof(struct nf_ct_sane_master), help, NULL, + &sane_exp_policy, 0, help, NULL, THIS_MODULE); } diff --git a/net/netfilter/nf_conntrack_seqadj.c b/net/netfilter/nf_conntrack_seqadj.c index ef7063eced7c490dd6813d3e54ee72fdad9a8565..a975efd6b8c3e3baac2a80334237be9201ace130 100644 --- a/net/netfilter/nf_conntrack_seqadj.c +++ b/net/netfilter/nf_conntrack_seqadj.c @@ -231,7 +231,7 @@ s32 nf_ct_seq_offset(const struct nf_conn *ct, } EXPORT_SYMBOL_GPL(nf_ct_seq_offset); -static struct nf_ct_ext_type nf_ct_seqadj_extend __read_mostly = { +static const struct nf_ct_ext_type nf_ct_seqadj_extend = { .len = sizeof(struct nf_conn_seqadj), .align = __alignof__(struct nf_conn_seqadj), .id = NF_CT_EXT_SEQADJ, diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 0d17894798b5caea7540b3bf7d341a0a2f623fd6..d38af4274335b9bed6de1edb4b0c4ae6fdf96656 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -829,10 +829,8 @@ static void flush_expectations(struct nf_conn *ct, bool media) hlist_for_each_entry_safe(exp, next, &help->expectations, lnode) { if ((exp->class != SIP_EXPECT_SIGNALLING) ^ media) continue; - if (!del_timer(&exp->timeout)) + if (!nf_ct_remove_expect(exp)) continue; - nf_ct_unlink_expect(exp); - nf_ct_expect_put(exp); if (!media) break; } @@ -1624,29 +1622,27 @@ static int __init nf_conntrack_sip_init(void) { int i, ret; + NF_CT_HELPER_BUILD_BUG_ON(sizeof(struct nf_ct_sip_master)); + if (ports_c 
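
The can_early_drop callbacks added for DCCP, SCTP and TCP above all answer the same question: has this flow reached a half-closed or closing state whose entry may be reclaimed when the conntrack table is full? Until now only unassured flows could be evicted early. A plausible consumer in the core (the function name here is assumed; the real eviction logic lives in nf_conntrack_core.c):

static bool nf_ct_evictable(const struct nf_conn *ct)
{
        const struct nf_conntrack_l4proto *l4proto;

        /* Unassured flows were always fair game for early drop. */
        if (!test_bit(IPS_ASSURED_BIT, &ct->status))
                return true;

        /* Assured flows may now be reclaimed too, but only once the
         * protocol state machine says they are winding down. */
        l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
        if (l4proto->can_early_drop)
                return l4proto->can_early_drop(ct);

        return false;
}
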
== 0) ports[ports_c++] = SIP_PORT; for (i = 0; i < ports_c; i++) { nf_ct_helper_init(&sip[4 * i], AF_INET, IPPROTO_UDP, "sip", SIP_PORT, ports[i], i, sip_exp_policy, - SIP_EXPECT_MAX, - sizeof(struct nf_ct_sip_master), sip_help_udp, + SIP_EXPECT_MAX, sip_help_udp, NULL, THIS_MODULE); nf_ct_helper_init(&sip[4 * i + 1], AF_INET, IPPROTO_TCP, "sip", SIP_PORT, ports[i], i, sip_exp_policy, - SIP_EXPECT_MAX, - sizeof(struct nf_ct_sip_master), sip_help_tcp, + SIP_EXPECT_MAX, sip_help_tcp, NULL, THIS_MODULE); nf_ct_helper_init(&sip[4 * i + 2], AF_INET6, IPPROTO_UDP, "sip", SIP_PORT, ports[i], i, sip_exp_policy, - SIP_EXPECT_MAX, - sizeof(struct nf_ct_sip_master), sip_help_udp, + SIP_EXPECT_MAX, sip_help_udp, NULL, THIS_MODULE); nf_ct_helper_init(&sip[4 * i + 3], AF_INET6, IPPROTO_TCP, "sip", SIP_PORT, ports[i], i, sip_exp_policy, - SIP_EXPECT_MAX, - sizeof(struct nf_ct_sip_master), sip_help_tcp, + SIP_EXPECT_MAX, sip_help_tcp, NULL, THIS_MODULE); } diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 2256147dcaad80ea982868607b2e66b233c27c6f..ccb5cb9043e0e769ba463759a69f4d9f36e122e0 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -250,7 +250,7 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; if (!(test_bit(IPS_SEEN_REPLY_BIT, &ct->status))) - seq_printf(s, "[UNREPLIED] "); + seq_puts(s, "[UNREPLIED] "); print_tuple(s, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, l3proto, l4proto); @@ -261,7 +261,7 @@ static int ct_seq_show(struct seq_file *s, void *v) goto release; if (test_bit(IPS_ASSURED_BIT, &ct->status)) - seq_printf(s, "[ASSURED] "); + seq_puts(s, "[ASSURED] "); if (seq_has_overflowed(s)) goto release; @@ -350,7 +350,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) const struct ip_conntrack_stat *st = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); + seq_puts(seq, "entries searched found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c index b1227dc6f75e115c8cb2e88e488c9349686891a0..0ec6779fd5d944406eb67ff6e3256698ec40d20c 100644 --- a/net/netfilter/nf_conntrack_tftp.c +++ b/net/netfilter/nf_conntrack_tftp.c @@ -113,16 +113,18 @@ static int __init nf_conntrack_tftp_init(void) { int i, ret; + NF_CT_HELPER_BUILD_BUG_ON(0); + if (ports_c == 0) ports[ports_c++] = TFTP_PORT; for (i = 0; i < ports_c; i++) { nf_ct_helper_init(&tftp[2 * i], AF_INET, IPPROTO_UDP, "tftp", TFTP_PORT, ports[i], i, &tftp_exp_policy, - 0, 0, tftp_help, NULL, THIS_MODULE); + 0, tftp_help, NULL, THIS_MODULE); nf_ct_helper_init(&tftp[2 * i + 1], AF_INET6, IPPROTO_UDP, "tftp", TFTP_PORT, ports[i], i, &tftp_exp_policy, - 0, 0, tftp_help, NULL, THIS_MODULE); + 0, tftp_help, NULL, THIS_MODULE); } ret = nf_conntrack_helpers_register(tftp, ports_c * 2); diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c index 26e742006c48c944ba16aea8d8a2ed541ad06ca1..46aee65f339bea3e39e6fcdd94d82d4646b991b4 100644 --- a/net/netfilter/nf_conntrack_timeout.c +++ b/net/netfilter/nf_conntrack_timeout.c @@ -31,7 +31,7 @@ EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook); void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) 
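
Note how every nf_ct_helper_init() call in this series loses its sizeof(struct nf_ct_..._master) argument while each module gains an NF_CT_HELPER_BUILD_BUG_ON() at init time. Helper private data evidently moves from a per-helper variable-length allocation to the fixed scratch area of struct nf_conn_help, so the macro is presumably a compile-time guard along these lines (exact definition assumed):

#define NF_CT_HELPER_BUILD_BUG_ON(structsize) \
        BUILD_BUG_ON((structsize) > FIELD_SIZEOF(struct nf_conn_help, data))

The tftp hunk above passes 0 because that helper keeps no per-connection state at all.
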
__read_mostly; EXPORT_SYMBOL_GPL(nf_ct_timeout_put_hook); -static struct nf_ct_ext_type timeout_extend __read_mostly = { +static const struct nf_ct_ext_type timeout_extend = { .len = sizeof(struct nf_conn_timeout), .align = __alignof__(struct nf_conn_timeout), .id = NF_CT_EXT_TIMEOUT, diff --git a/net/netfilter/nf_conntrack_timestamp.c b/net/netfilter/nf_conntrack_timestamp.c index 7a394df0deb7686fa7fe4da162ff77d3c03435c5..4c4734b78318962360bd07265dc8179f95fc5135 100644 --- a/net/netfilter/nf_conntrack_timestamp.c +++ b/net/netfilter/nf_conntrack_timestamp.c @@ -33,7 +33,7 @@ static struct ctl_table tstamp_sysctl_table[] = { }; #endif /* CONFIG_SYSCTL */ -static struct nf_ct_ext_type tstamp_extend __read_mostly = { +static const struct nf_ct_ext_type tstamp_extend = { .len = sizeof(struct nf_conn_tstamp), .align = __alignof__(struct nf_conn_tstamp), .id = NF_CT_EXT_TSTAMP, diff --git a/net/netfilter/nf_internals.h b/net/netfilter/nf_internals.h index c46d214d532355b7294a543d7c68843a82fb489d..bfa742da83aff37dfc8b426e1d193f0de056fb6f 100644 --- a/net/netfilter/nf_internals.h +++ b/net/netfilter/nf_internals.h @@ -14,7 +14,7 @@ /* nf_queue.c */ int nf_queue(struct sk_buff *skb, struct nf_hook_state *state, struct nf_hook_entry **entryp, unsigned int verdict); -void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry); +unsigned int nf_queue_nf_hook_drop(struct net *net); int __init netfilter_queue_init(void); /* nf_log.c */ diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index 8d85a0598b603c5f7d1f2f6f0c0ba68bb96b4d58..8bb152a7cca499d8e9b349dcd9fe0693d6599fbf 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -71,7 +71,6 @@ void nf_log_unset(struct net *net, const struct nf_logger *logger) RCU_INIT_POINTER(net->nf.nf_loggers[i], NULL); } mutex_unlock(&nf_log_mutex); - synchronize_rcu(); } EXPORT_SYMBOL(nf_log_unset); @@ -376,13 +375,13 @@ static int seq_show(struct seq_file *s, void *v) logger = nft_log_dereference(loggers[*pos][i]); seq_printf(s, "%s", logger->name); if (i == 0 && loggers[*pos][i + 1] != NULL) - seq_printf(s, ","); + seq_puts(s, ","); if (seq_has_overflowed(s)) return -ENOSPC; } - seq_printf(s, ")\n"); + seq_puts(s, ")\n"); if (seq_has_overflowed(s)) return -ENOSPC; diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c index eb772380a202f0037745a4357cf2c00658181bcd..e4d61a7a52589d85d23185bba1919dfd188e62c2 100644 --- a/net/netfilter/nf_nat_amanda.c +++ b/net/netfilter/nf_nat_amanda.c @@ -33,7 +33,6 @@ static unsigned int help(struct sk_buff *skb, { char buffer[sizeof("65535")]; u_int16_t port; - unsigned int ret; /* Connection comes from client. 
*/ exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; @@ -63,14 +62,14 @@ static unsigned int help(struct sk_buff *skb, } sprintf(buffer, "%u", port); - ret = nf_nat_mangle_udp_packet(skb, exp->master, ctinfo, - protoff, matchoff, matchlen, - buffer, strlen(buffer)); - if (ret != NF_ACCEPT) { + if (!nf_nat_mangle_udp_packet(skb, exp->master, ctinfo, + protoff, matchoff, matchlen, + buffer, strlen(buffer))) { nf_ct_helper_log(skb, exp->master, "cannot mangle packet"); nf_ct_unexpect_related(exp); + return NF_DROP; } - return ret; + return NF_ACCEPT; } static void __exit nf_nat_amanda_fini(void) diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 82802e4a6640817e64eb3f3a6ffcb875ad14a747..b48d6b5aae8a87d4ea69cae0e025739ebe3f1658 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -71,11 +71,10 @@ static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) if (ct == NULL) return; - family = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.l3num; - rcu_read_lock(); + family = nf_ct_l3num(ct); l3proto = __nf_nat_l3proto_find(family); if (l3proto == NULL) - goto out; + return; dir = CTINFO2DIR(ctinfo); if (dir == IP_CT_DIR_ORIGINAL) @@ -84,8 +83,6 @@ static void __nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl) statusbit = IPS_SRC_NAT; l3proto->decode_session(skb, ct, dir, statusbit, fl); -out: - rcu_read_unlock(); } int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family) @@ -411,12 +408,6 @@ nf_nat_setup_info(struct nf_conn *ct, enum nf_nat_manip_type maniptype) { struct nf_conntrack_tuple curr_tuple, new_tuple; - struct nf_conn_nat *nat; - - /* nat helper or nfctnetlink also setup binding */ - nat = nf_ct_nat_ext_add(ct); - if (nat == NULL) - return NF_ACCEPT; NF_CT_ASSERT(maniptype == NF_NAT_MANIP_SRC || maniptype == NF_NAT_MANIP_DST); @@ -549,10 +540,6 @@ struct nf_nat_proto_clean { static int nf_nat_proto_remove(struct nf_conn *i, void *data) { const struct nf_nat_proto_clean *clean = data; - struct nf_conn_nat *nat = nfct_nat(i); - - if (!nat) - return 0; if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) || (clean->l4proto && nf_ct_protonum(i) != clean->l4proto)) @@ -563,12 +550,10 @@ static int nf_nat_proto_remove(struct nf_conn *i, void *data) static int nf_nat_proto_clean(struct nf_conn *ct, void *data) { - struct nf_conn_nat *nat = nfct_nat(ct); - if (nf_nat_proto_remove(ct, data)) return 1; - if (!nat) + if ((ct->status & IPS_SRC_NAT_DONE) == 0) return 0; /* This netns is being destroyed, and conntrack has nat null binding. @@ -716,13 +701,9 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregister); /* No one using conntrack by the time this called. 
*/ static void nf_nat_cleanup_conntrack(struct nf_conn *ct) { - struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT); - - if (!nat) - return; - - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, - nf_nat_bysource_params); + if (ct->status & IPS_SRC_NAT_DONE) + rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource, + nf_nat_bysource_params); } static struct nf_ct_ext_type nat_extend __read_mostly = { @@ -730,7 +711,6 @@ static struct nf_ct_ext_type nat_extend __read_mostly = { .align = __alignof__(struct nf_conn_nat), .destroy = nf_nat_cleanup_conntrack, .id = NF_CT_EXT_NAT, - .flags = NF_CT_EXT_F_PREALLOC, }; #if IS_ENABLED(CONFIG_NF_CT_NETLINK) @@ -751,7 +731,8 @@ static int nfnetlink_parse_nat_proto(struct nlattr *attr, const struct nf_nat_l4proto *l4proto; int err; - err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, protonat_nla_policy); + err = nla_parse_nested(tb, CTA_PROTONAT_MAX, attr, + protonat_nla_policy, NULL); if (err < 0) return err; @@ -780,7 +761,7 @@ nfnetlink_parse_nat(const struct nlattr *nat, memset(range, 0, sizeof(*range)); - err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy); + err = nla_parse_nested(tb, CTA_NAT_MAX, nat, nat_nla_policy, NULL); if (err < 0) return err; @@ -819,7 +800,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct, /* No NAT information has been passed, allocate the null-binding */ if (attr == NULL) - return __nf_nat_alloc_null_binding(ct, manip); + return __nf_nat_alloc_null_binding(ct, manip) == NF_DROP ? -ENOMEM : 0; err = nfnetlink_parse_nat(attr, ct, &range, l3proto); if (err < 0) @@ -874,9 +855,6 @@ static int __init nf_nat_init(void) nf_ct_helper_expectfn_register(&follow_master_nat); - /* Initialize fake conntrack so that NAT will skip it */ - nf_ct_untracked_status_or(IPS_NAT_DONE_MASK); - BUG_ON(nfnetlink_parse_nat_setup_hook != NULL); RCU_INIT_POINTER(nfnetlink_parse_nat_setup_hook, nfnetlink_parse_nat_setup); diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index 211661cb2c90a5d25a57d5ce41c9c5f9898c5302..607a373379b40cf5cef2d4bba1bfefa58a8dfa33 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -70,15 +70,15 @@ static void mangle_contents(struct sk_buff *skb, } /* Unusual, but possible case. */ -static int enlarge_skb(struct sk_buff *skb, unsigned int extra) +static bool enlarge_skb(struct sk_buff *skb, unsigned int extra) { if (skb->len + extra > 65535) - return 0; + return false; if (pskb_expand_head(skb, 0, extra - skb_tailroom(skb), GFP_ATOMIC)) - return 0; + return false; - return 1; + return true; } /* Generic function for mangling variable-length address changes inside @@ -89,26 +89,26 @@ static int enlarge_skb(struct sk_buff *skb, unsigned int extra) * skb enlargement, ... 
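
Several hunks here are facets of one change: the NAT extension stops being preallocated for every conntrack. nf_nat_setup_info() no longer adds the extension itself, nat_extend drops NF_CT_EXT_F_PREALLOC, and the cleanup paths key off the IPS_SRC_NAT_DONE status bit rather than the extension pointer. NAT-aware helpers instead attach the extension on demand, as the pptp hunk earlier in this patch shows:

#if IS_ENABLED(CONFIG_NF_NAT)
        /* Attach the NAT extension lazily, and only for connections
         * that are actually being NATed; extensions can still be added
         * while the conntrack is unconfirmed. */
        if (!nf_ct_is_confirmed(ct) && (ct->status & IPS_NAT_MASK)) {
                struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);

                if (!nat && !nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC))
                        return NF_DROP;
        }
#endif
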
* * */ -int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, - struct nf_conn *ct, - enum ip_conntrack_info ctinfo, - unsigned int protoff, - unsigned int match_offset, - unsigned int match_len, - const char *rep_buffer, - unsigned int rep_len, bool adjust) +bool __nf_nat_mangle_tcp_packet(struct sk_buff *skb, + struct nf_conn *ct, + enum ip_conntrack_info ctinfo, + unsigned int protoff, + unsigned int match_offset, + unsigned int match_len, + const char *rep_buffer, + unsigned int rep_len, bool adjust) { const struct nf_nat_l3proto *l3proto; struct tcphdr *tcph; int oldlen, datalen; if (!skb_make_writable(skb, skb->len)) - return 0; + return false; if (rep_len > match_len && rep_len - match_len > skb_tailroom(skb) && !enlarge_skb(skb, rep_len - match_len)) - return 0; + return false; SKB_LINEAR_ASSERT(skb); @@ -128,7 +128,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, nf_ct_seqadj_set(ct, ctinfo, tcph->seq, (int)rep_len - (int)match_len); - return 1; + return true; } EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet); @@ -142,7 +142,7 @@ EXPORT_SYMBOL(__nf_nat_mangle_tcp_packet); * XXX - This function could be merged with nf_nat_mangle_tcp_packet which * should be fairly easy to do. */ -int +bool nf_nat_mangle_udp_packet(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info ctinfo, @@ -157,12 +157,12 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, int datalen, oldlen; if (!skb_make_writable(skb, skb->len)) - return 0; + return false; if (rep_len > match_len && rep_len - match_len > skb_tailroom(skb) && !enlarge_skb(skb, rep_len - match_len)) - return 0; + return false; udph = (void *)skb->data + protoff; @@ -176,13 +176,13 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, /* fix udp checksum if udp checksum was previously calculated */ if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) - return 1; + return true; l3proto = __nf_nat_l3proto_find(nf_ct_l3num(ct)); l3proto->csum_recalc(skb, IPPROTO_UDP, udph, &udph->check, datalen, oldlen); - return 1; + return true; } EXPORT_SYMBOL(nf_nat_mangle_udp_packet); diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c index 1fb2258c35357c8c6f5072ae6593de2d53ab9f03..0648cb096bd87b2df3b84db5da8ff8fdbffc7018 100644 --- a/net/netfilter/nf_nat_irc.c +++ b/net/netfilter/nf_nat_irc.c @@ -37,7 +37,6 @@ static unsigned int help(struct sk_buff *skb, struct nf_conn *ct = exp->master; union nf_inet_addr newaddr; u_int16_t port; - unsigned int ret; /* Reply comes from server. 
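
Turning the mangle helpers into bool-returning predicates removes a long-standing trap: they used to return an int 0/1 for failure/success, which callers like the amanda helper above then compared against NF_* verdicts. That only worked because NF_DROP happens to be 0 and NF_ACCEPT happens to be 1. The new calling convention, as the amanda hunk above and the irc hunk below show, is explicit:

        if (!nf_nat_mangle_udp_packet(skb, exp->master, ctinfo, protoff,
                                      matchoff, matchlen,
                                      buffer, strlen(buffer))) {
                nf_ct_helper_log(skb, exp->master, "cannot mangle packet");
                nf_ct_unexpect_related(exp);
                return NF_DROP;
        }
        return NF_ACCEPT;
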
*/ newaddr = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3; @@ -83,14 +82,14 @@ static unsigned int help(struct sk_buff *skb, pr_debug("nf_nat_irc: inserting '%s' == %pI4, port %u\n", buffer, &newaddr.ip, port); - ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff, - matchlen, buffer, strlen(buffer)); - if (ret != NF_ACCEPT) { + if (!nf_nat_mangle_tcp_packet(skb, ct, ctinfo, protoff, matchoff, + matchlen, buffer, strlen(buffer))) { nf_ct_helper_log(skb, ct, "cannot mangle packet"); nf_ct_unexpect_related(exp); + return NF_DROP; } - return ret; + return NF_ACCEPT; } static void __exit nf_nat_irc_fini(void) diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index 4a7662486f44bae527ed83e7e952c4d7195f3871..043850c9d154dcf128903d7dba6b17ea5ff20370 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -96,15 +96,18 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry) } EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs); -void nf_queue_nf_hook_drop(struct net *net, const struct nf_hook_entry *entry) +unsigned int nf_queue_nf_hook_drop(struct net *net) { const struct nf_queue_handler *qh; + unsigned int count = 0; rcu_read_lock(); qh = rcu_dereference(net->nf.queue_handler); if (qh) - qh->nf_hook_drop(net, entry); + count = qh->nf_hook_drop(net); rcu_read_unlock(); + + return count; } static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state, diff --git a/net/netfilter/nf_synproxy_core.c b/net/netfilter/nf_synproxy_core.c index 7c6d1fbe38b9aafc668ca39ee6e096d64f79bc7f..a504e87c6ddff1b1266a901549256f29dc1973d1 100644 --- a/net/netfilter/nf_synproxy_core.c +++ b/net/netfilter/nf_synproxy_core.c @@ -66,8 +66,8 @@ synproxy_parse_options(const struct sk_buff *skb, unsigned int doff, case TCPOPT_WINDOW: if (opsize == TCPOLEN_WINDOW) { opts->wscale = *ptr; - if (opts->wscale > 14) - opts->wscale = 14; + if (opts->wscale > TCP_MAX_WSCALE) + opts->wscale = TCP_MAX_WSCALE; opts->options |= XT_SYNPROXY_OPT_WSCALE; } break; @@ -287,9 +287,9 @@ static int synproxy_cpu_seq_show(struct seq_file *seq, void *v) struct synproxy_stats *stats = v; if (v == SEQ_START_TOKEN) { - seq_printf(seq, "entries\t\tsyn_received\t" - "cookie_invalid\tcookie_valid\t" - "cookie_retrans\tconn_reopened\n"); + seq_puts(seq, "entries\t\tsyn_received\t" + "cookie_invalid\tcookie_valid\t" + "cookie_retrans\tconn_reopened\n"); return 0; } diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 434c739dfecaa8727dc193f8ad86e18133932660..5592250297402fe6e272f3213efa7e02ab230485 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -144,7 +144,7 @@ static int nf_tables_register_hooks(struct net *net, unsigned int hook_nops) { if (table->flags & NFT_TABLE_F_DORMANT || - !(chain->flags & NFT_BASE_CHAIN)) + !nft_is_base_chain(chain)) return 0; return nf_register_net_hooks(net, nft_base_chain(chain)->ops, @@ -157,7 +157,7 @@ static void nf_tables_unregister_hooks(struct net *net, unsigned int hook_nops) { if (table->flags & NFT_TABLE_F_DORMANT || - !(chain->flags & NFT_BASE_CHAIN)) + !nft_is_base_chain(chain)) return; nf_unregister_net_hooks(net, nft_base_chain(chain)->ops, hook_nops); @@ -438,7 +438,7 @@ static int nf_tables_fill_table_info(struct sk_buff *skb, struct net *net, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - event |= NFNL_SUBSYS_NFTABLES << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); if (nlh == NULL) goto nla_put_failure; @@ 
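
The literal 14 replaced in the synproxy hunk above, and in the tcp_options() hunk earlier, is the ceiling that RFC 7323 (and RFC 1323 before it, which the old comment cited) places on the TCP window-scale option. Both conntrack and SYNPROXY now clamp against the constant shared with the TCP stack rather than each carrying its own magic number:

#include <net/tcp.h>        /* TCP_MAX_WSCALE == 14, per RFC 7323 */

        if (opts->wscale > TCP_MAX_WSCALE)
                opts->wscale = TCP_MAX_WSCALE;
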
-587,7 +587,7 @@ static void _nf_tables_table_disable(struct net *net, list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; - if (!(chain->flags & NFT_BASE_CHAIN)) + if (!nft_is_base_chain(chain)) continue; if (cnt && i++ == cnt) @@ -608,7 +608,7 @@ static int nf_tables_table_enable(struct net *net, list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; - if (!(chain->flags & NFT_BASE_CHAIN)) + if (!nft_is_base_chain(chain)) continue; err = nf_register_net_hooks(net, nft_base_chain(chain)->ops, @@ -989,7 +989,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - event |= NFNL_SUBSYS_NFTABLES << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); if (nlh == NULL) goto nla_put_failure; @@ -1007,7 +1007,7 @@ static int nf_tables_fill_chain_info(struct sk_buff *skb, struct net *net, if (nla_put_string(skb, NFTA_CHAIN_NAME, chain->name)) goto nla_put_failure; - if (chain->flags & NFT_BASE_CHAIN) { + if (nft_is_base_chain(chain)) { const struct nft_base_chain *basechain = nft_base_chain(chain); const struct nf_hook_ops *ops = &basechain->ops[0]; struct nlattr *nest; @@ -1182,7 +1182,8 @@ static struct nft_stats __percpu *nft_stats_alloc(const struct nlattr *attr) struct nft_stats *stats; int err; - err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy); + err = nla_parse_nested(tb, NFTA_COUNTER_MAX, attr, nft_counter_policy, + NULL); if (err < 0) return ERR_PTR(err); @@ -1226,7 +1227,7 @@ static void nf_tables_chain_destroy(struct nft_chain *chain) { BUG_ON(chain->use > 0); - if (chain->flags & NFT_BASE_CHAIN) { + if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); module_put(basechain->type->owner); @@ -1257,7 +1258,7 @@ static int nft_chain_parse_hook(struct net *net, int err; err = nla_parse_nested(ha, NFTA_HOOK_MAX, nla[NFTA_CHAIN_HOOK], - nft_hook_policy); + nft_hook_policy, NULL); if (err < 0) return err; @@ -1364,8 +1365,8 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } if (nla[NFTA_CHAIN_POLICY]) { - if ((chain != NULL && - !(chain->flags & NFT_BASE_CHAIN))) + if (chain != NULL && + !nft_is_base_chain(chain)) return -EOPNOTSUPP; if (chain == NULL && @@ -1396,7 +1397,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, struct nft_chain_hook hook; struct nf_hook_ops *ops; - if (!(chain->flags & NFT_BASE_CHAIN)) + if (!nft_is_base_chain(chain)) return -EBUSY; err = nft_chain_parse_hook(net, nla, afi, &hook, @@ -1433,7 +1434,7 @@ static int nf_tables_newchain(struct net *net, struct sock *nlsk, } if (nla[NFTA_CHAIN_COUNTERS]) { - if (!(chain->flags & NFT_BASE_CHAIN)) + if (!nft_is_base_chain(chain)) return -EOPNOTSUPP; stats = nft_stats_alloc(nla[NFTA_CHAIN_COUNTERS]); @@ -1724,7 +1725,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, struct nlattr *tb[NFTA_EXPR_MAX + 1]; int err; - err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy); + err = nla_parse_nested(tb, NFTA_EXPR_MAX, nla, nft_expr_policy, NULL); if (err < 0) return err; @@ -1734,7 +1735,7 @@ static int nf_tables_expr_parse(const struct nft_ctx *ctx, if (tb[NFTA_EXPR_DATA]) { err = nla_parse_nested(info->tb, type->maxattr, - tb[NFTA_EXPR_DATA], type->policy); + tb[NFTA_EXPR_DATA], type->policy, NULL); if (err < 0) goto err1; } else @@ -1772,8 +1773,19 @@ static int 
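
All of these NFT_BASE_CHAIN flag tests, here and in nf_tables_netdev.c and nft_compat.c further down, are rewritten to go through a new predicate. Judging by the mechanical one-for-one conversions it is a trivial wrapper, roughly:

static inline bool nft_is_base_chain(const struct nft_chain *chain)
{
        return chain->flags & NFT_BASE_CHAIN;
}
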
nf_tables_newexpr(const struct nft_ctx *ctx, goto err1; } + if (ops->validate) { + const struct nft_data *data = NULL; + + err = ops->validate(ctx, expr, &data); + if (err < 0) + goto err2; + } + return 0; +err2: + if (ops->destroy) + ops->destroy(ctx, expr); err1: expr->ops = NULL; return err; @@ -1874,10 +1886,9 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, const struct nft_expr *expr, *next; struct nlattr *list; const struct nft_rule *prule; - int type = event | NFNL_SUBSYS_NFTABLES << 8; + u16 type = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); - nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), - flags); + nlh = nlmsg_put(skb, portid, seq, type, sizeof(struct nfgenmsg), flags); if (nlh == NULL) goto nla_put_failure; @@ -1895,7 +1906,7 @@ static int nf_tables_fill_rule_info(struct sk_buff *skb, struct net *net, goto nla_put_failure; if ((event != NFT_MSG_DELRULE) && (rule->list.prev != &chain->rules)) { - prule = list_entry(rule->list.prev, struct nft_rule, list); + prule = list_prev_entry(rule, list); if (nla_put_be64(skb, NFTA_RULE_POSITION, cpu_to_be64(prule->handle), NFTA_RULE_PAD)) @@ -2523,8 +2534,8 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net, return 0; } -struct nft_set *nf_tables_set_lookup(const struct nft_table *table, - const struct nlattr *nla, u8 genmask) +static struct nft_set *nf_tables_set_lookup(const struct nft_table *table, + const struct nlattr *nla, u8 genmask) { struct nft_set *set; @@ -2538,11 +2549,10 @@ struct nft_set *nf_tables_set_lookup(const struct nft_table *table, } return ERR_PTR(-ENOENT); } -EXPORT_SYMBOL_GPL(nf_tables_set_lookup); -struct nft_set *nf_tables_set_lookup_byid(const struct net *net, - const struct nlattr *nla, - u8 genmask) +static struct nft_set *nf_tables_set_lookup_byid(const struct net *net, + const struct nlattr *nla, + u8 genmask) { struct nft_trans *trans; u32 id = ntohl(nla_get_be32(nla)); @@ -2557,7 +2567,25 @@ struct nft_set *nf_tables_set_lookup_byid(const struct net *net, } return ERR_PTR(-ENOENT); } -EXPORT_SYMBOL_GPL(nf_tables_set_lookup_byid); + +struct nft_set *nft_set_lookup(const struct net *net, + const struct nft_table *table, + const struct nlattr *nla_set_name, + const struct nlattr *nla_set_id, + u8 genmask) +{ + struct nft_set *set; + + set = nf_tables_set_lookup(table, nla_set_name, genmask); + if (IS_ERR(set)) { + if (!nla_set_id) + return set; + + set = nf_tables_set_lookup_byid(net, nla_set_id, genmask); + } + return set; +} +EXPORT_SYMBOL_GPL(nft_set_lookup); static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, const char *name) @@ -2617,7 +2645,7 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx, u32 portid = ctx->portid; u32 seq = ctx->seq; - event |= NFNL_SUBSYS_NFTABLES << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); if (nlh == NULL) @@ -2851,7 +2879,8 @@ static int nf_tables_set_desc_parse(const struct nft_ctx *ctx, struct nlattr *da[NFTA_SET_DESC_MAX + 1]; int err; - err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla, nft_set_desc_policy); + err = nla_parse_nested(da, NFTA_SET_DESC_MAX, nla, + nft_set_desc_policy, NULL); if (err < 0) return err; @@ -3353,7 +3382,8 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) int event, err; err = nlmsg_parse(cb->nlh, sizeof(struct nfgenmsg), nla, - NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy); + 
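
With nf_tables_set_lookup() and nf_tables_set_lookup_byid() made static, the single exported entry point is the new nft_set_lookup(), which folds in the usual fallback from set name to transaction id. A caller such as an expression init path (the NFTA_LOOKUP_* attribute names follow nft_lookup and are illustrative) now needs just one call:

        struct nft_set *set;

        set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_LOOKUP_SET],
                             tb[NFTA_LOOKUP_SET_ID], genmask);
        if (IS_ERR(set))
                return PTR_ERR(set);
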
NFTA_SET_ELEM_LIST_MAX, nft_set_elem_list_policy, + NULL); if (err < 0) return err; @@ -3367,8 +3397,7 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb) if (IS_ERR(set)) return PTR_ERR(set); - event = NFT_MSG_NEWSETELEM; - event |= NFNL_SUBSYS_NFTABLES << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWSETELEM); portid = NETLINK_CB(cb->skb).portid; seq = cb->nlh->nlmsg_seq; @@ -3453,7 +3482,7 @@ static int nf_tables_fill_setelem_info(struct sk_buff *skb, struct nlattr *nest; int err; - event |= NFNL_SUBSYS_NFTABLES << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); if (nlh == NULL) @@ -3612,7 +3641,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, int err; err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, - nft_set_elem_policy); + nft_set_elem_policy, NULL); if (err < 0) return err; @@ -3749,6 +3778,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = set->ops->insert(ctx->net, set, &elem, &ext2); if (err) { if (err == -EEXIST) { + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ + nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) || + nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ + nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) + return -EBUSY; if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && memcmp(nft_set_ext_data(ext), @@ -3842,7 +3876,7 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set, int err; err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr, - nft_set_elem_policy); + nft_set_elem_policy, NULL); if (err < 0) goto err1; @@ -4064,7 +4098,8 @@ static const struct nla_policy nft_obj_policy[NFTA_OBJ_MAX + 1] = { [NFTA_OBJ_DATA] = { .type = NLA_NESTED }, }; -static struct nft_object *nft_obj_init(const struct nft_object_type *type, +static struct nft_object *nft_obj_init(const struct nft_ctx *ctx, + const struct nft_object_type *type, const struct nlattr *attr) { struct nlattr *tb[type->maxattr + 1]; @@ -4072,7 +4107,8 @@ static struct nft_object *nft_obj_init(const struct nft_object_type *type, int err; if (attr) { - err = nla_parse_nested(tb, type->maxattr, attr, type->policy); + err = nla_parse_nested(tb, type->maxattr, attr, type->policy, + NULL); if (err < 0) goto err1; } else { @@ -4084,7 +4120,7 @@ static struct nft_object *nft_obj_init(const struct nft_object_type *type, if (obj == NULL) goto err1; - err = type->init((const struct nlattr * const *)tb, obj); + err = type->init(ctx, (const struct nlattr * const *)tb, obj); if (err < 0) goto err2; @@ -4192,7 +4228,7 @@ static int nf_tables_newobj(struct net *net, struct sock *nlsk, if (IS_ERR(type)) return PTR_ERR(type); - obj = nft_obj_init(type, nla[NFTA_OBJ_DATA]); + obj = nft_obj_init(&ctx, type, nla[NFTA_OBJ_DATA]); if (IS_ERR(obj)) { err = PTR_ERR(obj); goto err1; @@ -4224,7 +4260,7 @@ static int nf_tables_fill_obj_info(struct sk_buff *skb, struct net *net, struct nfgenmsg *nfmsg; struct nlmsghdr *nlh; - event |= NFNL_SUBSYS_NFTABLES << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), flags); if (nlh == NULL) goto nla_put_failure; @@ -4406,8 +4442,6 @@ static int nf_tables_getobj(struct net *net, struct sock *nlsk, err: kfree_skb(skb2); return err; - - return 0; } static void nft_obj_destroy(struct nft_object *obj) @@ -4497,7 +4531,7 @@ static int nf_tables_fill_gen_info(struct sk_buff *skb, struct net *net, { 
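
The new checks in the -EEXIST path of nft_add_set_elem() use ^ on the pair of nft_set_ext_exists() results as a mismatch test: if exactly one of the colliding elements carries data (or an object reference), the two cannot meaningfully be compared byte-for-byte, so the insert is refused with -EBUSY instead of falling through to the memcmp():

        /* one element is a plain key, the other is a mapping: refuse */
        if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^
            nft_set_ext_exists(ext2, NFT_SET_EXT_DATA))
                return -EBUSY;
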
struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; - int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_NEWGEN; + int event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_NEWGEN); nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct nfgenmsg), 0); if (nlh == NULL) @@ -4679,7 +4713,7 @@ static void nft_chain_commit_update(struct nft_trans *trans) if (nft_trans_chain_name(trans)[0]) strcpy(trans->ctx.chain->name, nft_trans_chain_name(trans)); - if (!(trans->ctx.chain->flags & NFT_BASE_CHAIN)) + if (!nft_is_base_chain(trans->ctx.chain)) return; basechain = nft_base_chain(trans->ctx.chain); @@ -4993,7 +5027,7 @@ int nft_chain_validate_dependency(const struct nft_chain *chain, { const struct nft_base_chain *basechain; - if (chain->flags & NFT_BASE_CHAIN) { + if (nft_is_base_chain(chain)) { basechain = nft_base_chain(chain); if (basechain->type->type != type) return -EOPNOTSUPP; @@ -5007,7 +5041,7 @@ int nft_chain_validate_hooks(const struct nft_chain *chain, { struct nft_base_chain *basechain; - if (chain->flags & NFT_BASE_CHAIN) { + if (nft_is_base_chain(chain)) { basechain = nft_base_chain(chain); if ((1 << basechain->ops[0].hooknum) & hook_flags) @@ -5285,7 +5319,8 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, struct nft_chain *chain; int err; - err = nla_parse_nested(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy); + err = nla_parse_nested(tb, NFTA_VERDICT_MAX, nla, nft_verdict_policy, + NULL); if (err < 0) return err; @@ -5316,7 +5351,7 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, tb[NFTA_VERDICT_CHAIN], genmask); if (IS_ERR(chain)) return PTR_ERR(chain); - if (chain->flags & NFT_BASE_CHAIN) + if (nft_is_base_chain(chain)) return -EOPNOTSUPP; chain->use++; @@ -5415,7 +5450,7 @@ int nft_data_init(const struct nft_ctx *ctx, struct nlattr *tb[NFTA_DATA_MAX + 1]; int err; - err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy); + err = nla_parse_nested(tb, NFTA_DATA_MAX, nla, nft_data_policy, NULL); if (err < 0) return err; @@ -5489,7 +5524,7 @@ int __nft_release_basechain(struct nft_ctx *ctx) { struct nft_rule *rule, *nr; - BUG_ON(!(ctx->chain->flags & NFT_BASE_CHAIN)); + BUG_ON(!nft_is_base_chain(ctx->chain)); nf_tables_unregister_hooks(ctx->net, ctx->chain->table, ctx->chain, ctx->afi->nops); diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c index 9e2ae424b64005555d5a7132ea1aa469dc4d0fc1..403432988313567358871d5f85a5d71fc0385051 100644 --- a/net/netfilter/nf_tables_netdev.c +++ b/net/netfilter/nf_tables_netdev.c @@ -128,7 +128,7 @@ static int nf_tables_netdev_event(struct notifier_block *this, list_for_each_entry(table, &afi->tables, list) { ctx.table = table; list_for_each_entry_safe(chain, nr, &table->chains, list) { - if (!(chain->flags & NFT_BASE_CHAIN)) + if (!nft_is_base_chain(chain)) continue; ctx.chain = chain; diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c index 12eb9041dca284a398a2c1ac64924ba773a18489..e1b15e7a5793f6b877f63a95dc20ce2626caf7f5 100644 --- a/net/netfilter/nf_tables_trace.c +++ b/net/netfilter/nf_tables_trace.c @@ -169,7 +169,7 @@ void nft_trace_notify(struct nft_traceinfo *info) struct nlmsghdr *nlh; struct sk_buff *skb; unsigned int size; - int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_TRACE; + u16 event; if (!nfnetlink_has_listeners(nft_net(pkt), NFNLGRP_NFTRACE)) return; @@ -198,6 +198,7 @@ void nft_trace_notify(struct nft_traceinfo *info) if (!skb) return; + event = nfnl_msg_type(NFNL_SUBSYS_NFTABLES, NFT_MSG_TRACE); nlh = 
nlmsg_put(skb, 0, 0, event, sizeof(struct nfgenmsg), 0); if (!nlh) goto nla_put_failure; diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index 68eda920160e12f3cb904dad08b7f6cc9422571e..80f5ecf2c3d7096f579361663da5d2e651eebd3e 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -148,7 +148,8 @@ int nfnetlink_unicast(struct sk_buff *skb, struct net *net, u32 portid, EXPORT_SYMBOL_GPL(nfnetlink_unicast); /* Process one complete nfnetlink message. */ -static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); const struct nfnl_callback *nc; @@ -191,8 +192,8 @@ static int nfnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) int attrlen = nlh->nlmsg_len - min_len; __u8 subsys_id = NFNL_SUBSYS_ID(type); - err = nla_parse(cda, ss->cb[cb_id].attr_count, - attr, attrlen, ss->cb[cb_id].policy); + err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, attrlen, + ss->cb[cb_id].policy, extack); if (err < 0) { rcu_read_unlock(); return err; @@ -261,7 +262,7 @@ static void nfnl_err_deliver(struct list_head *err_list, struct sk_buff *skb) struct nfnl_err *nfnl_err, *next; list_for_each_entry_safe(nfnl_err, next, err_list, head) { - netlink_ack(skb, nfnl_err->nlh, nfnl_err->err); + netlink_ack(skb, nfnl_err->nlh, nfnl_err->err, NULL); nfnl_err_del(nfnl_err); } } @@ -284,13 +285,13 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, int err; if (subsys_id >= NFNL_SUBSYS_COUNT) - return netlink_ack(skb, nlh, -EINVAL); + return netlink_ack(skb, nlh, -EINVAL, NULL); replay: status = 0; skb = netlink_skb_clone(oskb, GFP_KERNEL); if (!skb) - return netlink_ack(oskb, nlh, -ENOMEM); + return netlink_ack(oskb, nlh, -ENOMEM, NULL); nfnl_lock(subsys_id); ss = nfnl_dereference_protected(subsys_id); @@ -304,20 +305,20 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, #endif { nfnl_unlock(subsys_id); - netlink_ack(oskb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return kfree_skb(skb); } } if (!ss->commit || !ss->abort) { nfnl_unlock(subsys_id); - netlink_ack(oskb, nlh, -EOPNOTSUPP); + netlink_ack(oskb, nlh, -EOPNOTSUPP, NULL); return kfree_skb(skb); } if (genid && ss->valid_genid && !ss->valid_genid(net, genid)) { nfnl_unlock(subsys_id); - netlink_ack(oskb, nlh, -ERESTART); + netlink_ack(oskb, nlh, -ERESTART, NULL); return kfree_skb(skb); } @@ -376,8 +377,8 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *attr = (void *)nlh + min_len; int attrlen = nlh->nlmsg_len - min_len; - err = nla_parse(cda, ss->cb[cb_id].attr_count, - attr, attrlen, ss->cb[cb_id].policy); + err = nla_parse(cda, ss->cb[cb_id].attr_count, attr, + attrlen, ss->cb[cb_id].policy, NULL); if (err < 0) goto ack; @@ -407,7 +408,8 @@ static void nfnetlink_rcv_batch(struct sk_buff *skb, struct nlmsghdr *nlh, * pointing to the batch header. 
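
The NULL argument gained by every nla_parse{,_nested}() call in this patch, and the struct netlink_ext_ack parameter added to nfnetlink_rcv_msg() and netlink_ack(), come from the extended-ACK support netlink grew this cycle: parse failures can now tell user space which attribute was bad and why. nfnetlink forwards the real extack only on the non-batch path so far and passes NULL elsewhere. A subsystem callback wanting richer errors could use it like this (the DEMO_* names and policy are purely illustrative):

static int demo_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh,
                        struct netlink_ext_ack *extack)
{
        struct nlattr *tb[DEMO_ATTR_MAX + 1];
        int err;

        err = nlmsg_parse(nlh, sizeof(struct nfgenmsg), tb, DEMO_ATTR_MAX,
                          demo_policy, extack);
        if (err < 0)
                return err;     /* offending attribute already recorded */

        if (!tb[DEMO_ATTR_NAME]) {
                NL_SET_ERR_MSG(extack, "missing name attribute");
                return -EINVAL;
        }
        return 0;
}
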
*/ nfnl_err_reset(&err_list); - netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM); + netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM, + NULL); status |= NFNL_BATCH_FAILURE; goto done; } @@ -465,9 +467,10 @@ static void nfnetlink_rcv_skb_batch(struct sk_buff *skb, struct nlmsghdr *nlh) skb->len < NLMSG_HDRLEN + sizeof(struct nfgenmsg)) return; - err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy); + err = nla_parse(cda, NFNL_BATCH_MAX, attr, attrlen, nfnl_batch_policy, + NULL); if (err < 0) { - netlink_ack(skb, nlh, err); + netlink_ack(skb, nlh, err, NULL); return; } if (cda[NFNL_BATCH_GENID]) @@ -493,14 +496,14 @@ static void nfnetlink_rcv(struct sk_buff *skb) return; if (!netlink_net_capable(skb, CAP_NET_ADMIN)) { - netlink_ack(skb, nlh, -EPERM); + netlink_ack(skb, nlh, -EPERM, NULL); return; } if (nlh->nlmsg_type == NFNL_MSG_BATCH_BEGIN) nfnetlink_rcv_skb_batch(skb, nlh); else - netlink_rcv_skb(skb, &nfnetlink_rcv_msg); + netlink_rcv_skb(skb, nfnetlink_rcv_msg); } #ifdef CONFIG_MODULES diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c index d44d89b561275e25bb31fe2b0ed198d13995533c..9898fb4d0512ce80d0bf970340e5da43db25ce15 100644 --- a/net/netfilter/nfnetlink_acct.c +++ b/net/netfilter/nfnetlink_acct.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -32,7 +33,7 @@ struct nf_acct { atomic64_t bytes; unsigned long flags; struct list_head head; - atomic_t refcnt; + refcount_t refcnt; char name[NFACCT_NAME_MAX]; struct rcu_head rcu_head; char data[0]; @@ -123,7 +124,7 @@ static int nfnl_acct_new(struct net *net, struct sock *nfnl, atomic64_set(&nfacct->pkts, be64_to_cpu(nla_get_be64(tb[NFACCT_PKTS]))); } - atomic_set(&nfacct->refcnt, 1); + refcount_set(&nfacct->refcnt, 1); list_add_tail_rcu(&nfacct->head, &net->nfnl_acct_list); return 0; } @@ -138,7 +139,7 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, u64 pkts, bytes; u32 old_flags; - event |= NFNL_SUBSYS_ACCT << 8; + event = nfnl_msg_type(NFNL_SUBSYS_ACCT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -166,7 +167,7 @@ nfnl_acct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, NFACCT_PAD) || nla_put_be64(skb, NFACCT_BYTES, cpu_to_be64(bytes), NFACCT_PAD) || - nla_put_be32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt)))) + nla_put_be32(skb, NFACCT_USE, htonl(refcount_read(&acct->refcnt)))) goto nla_put_failure; if (acct->flags & NFACCT_F_QUOTA) { u64 *quota = (u64 *)acct->data; @@ -243,7 +244,8 @@ nfacct_filter_alloc(const struct nlattr * const attr) struct nlattr *tb[NFACCT_FILTER_MAX + 1]; int err; - err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy); + err = nla_parse_nested(tb, NFACCT_FILTER_MAX, attr, filter_policy, + NULL); if (err < 0) return ERR_PTR(err); @@ -329,7 +331,7 @@ static int nfnl_acct_try_del(struct nf_acct *cur) /* We want to avoid races with nfnl_acct_put. So only when the current * refcnt is 1, we decrease it to 0. */ - if (atomic_cmpxchg(&cur->refcnt, 1, 0) == 1) { + if (refcount_dec_if_one(&cur->refcnt)) { /* We are protected by nfnl mutex. 
*/ list_del_rcu(&cur->head); kfree_rcu(cur, rcu_head); @@ -413,7 +415,7 @@ struct nf_acct *nfnl_acct_find_get(struct net *net, const char *acct_name) if (!try_module_get(THIS_MODULE)) goto err; - if (!atomic_inc_not_zero(&cur->refcnt)) { + if (!refcount_inc_not_zero(&cur->refcnt)) { module_put(THIS_MODULE); goto err; } @@ -429,7 +431,7 @@ EXPORT_SYMBOL_GPL(nfnl_acct_find_get); void nfnl_acct_put(struct nf_acct *acct) { - if (atomic_dec_and_test(&acct->refcnt)) + if (refcount_dec_and_test(&acct->refcnt)) kfree_rcu(acct, rcu_head); module_put(THIS_MODULE); @@ -502,7 +504,7 @@ static void __net_exit nfnl_acct_net_exit(struct net *net) list_for_each_entry_safe(cur, tmp, &net->nfnl_acct_list, head) { list_del_rcu(&cur->head); - if (atomic_dec_and_test(&cur->refcnt)) + if (refcount_dec_and_test(&cur->refcnt)) kfree_rcu(cur, rcu_head); } } diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index d45558178da5b62a8ad7c896e096c1862c512091..950bf6eadc6578516ac92b50427fe682cba3976d 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -77,7 +77,8 @@ nfnl_cthelper_parse_tuple(struct nf_conntrack_tuple *tuple, int err; struct nlattr *tb[NFCTH_TUPLE_MAX+1]; - err = nla_parse_nested(tb, NFCTH_TUPLE_MAX, attr, nfnl_cthelper_tuple_pol); + err = nla_parse_nested(tb, NFCTH_TUPLE_MAX, attr, + nfnl_cthelper_tuple_pol, NULL); if (err < 0) return err; @@ -104,7 +105,7 @@ nfnl_cthelper_from_nlattr(struct nlattr *attr, struct nf_conn *ct) if (help->helper->data_len == 0) return -EINVAL; - memcpy(help->data, nla_data(attr), help->helper->data_len); + nla_memcpy(help->data, nla_data(attr), sizeof(help->data)); return 0; } @@ -137,7 +138,8 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, int err; struct nlattr *tb[NFCTH_POLICY_MAX+1]; - err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr, nfnl_cthelper_expect_pol); + err = nla_parse_nested(tb, NFCTH_POLICY_MAX, attr, + nfnl_cthelper_expect_pol, NULL); if (err < 0) return err; @@ -150,6 +152,9 @@ nfnl_cthelper_expect_policy(struct nf_conntrack_expect_policy *expect_policy, nla_data(tb[NFCTH_POLICY_NAME]), NF_CT_HELPER_NAME_LEN); expect_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); + if (expect_policy->max_expected > NF_CT_EXPECT_MAX_CNT) + return -EINVAL; + expect_policy->timeout = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT])); @@ -171,7 +176,7 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, unsigned int class_max; ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr, - nfnl_cthelper_expect_policy_set); + nfnl_cthelper_expect_policy_set, NULL); if (ret < 0) return ret; @@ -213,6 +218,7 @@ nfnl_cthelper_create(const struct nlattr * const tb[], { struct nf_conntrack_helper *helper; struct nfnl_cthelper *nfcth; + unsigned int size; int ret; if (!tb[NFCTH_TUPLE] || !tb[NFCTH_POLICY] || !tb[NFCTH_PRIV_DATA_LEN]) @@ -228,7 +234,12 @@ nfnl_cthelper_create(const struct nlattr * const tb[], goto err1; strncpy(helper->name, nla_data(tb[NFCTH_NAME]), NF_CT_HELPER_NAME_LEN); - helper->data_len = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); + size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN])); + if (size > FIELD_SIZEOF(struct nf_conn_help, data)) { + ret = -ENOMEM; + goto err2; + } + helper->flags |= NF_CT_HELPER_F_USERSPACE; memcpy(&helper->tuple, tuple, sizeof(struct nf_conntrack_tuple)); @@ -276,7 +287,7 @@ nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy, int err; err = 
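
The nf_acct conversion just above (and the ctnl_timeout and nfulnl_instance ones that follow) replaces atomic_t reference counts with refcount_t, which saturates and warns on underflow or overflow instead of silently wrapping. The mapping is one-to-one; notably, refcount_dec_if_one() expresses the "delete only if I am the last user" idiom that was previously spelled atomic_cmpxchg(&cnt, 1, 0) == 1:

struct demo_obj {                       /* hypothetical refcounted object */
        struct list_head head;
        struct rcu_head rcu_head;
        refcount_t refcnt;              /* was atomic_t */
};

static bool demo_obj_try_del(struct demo_obj *obj)
{
        /* Drop to zero only if no one else holds a reference. */
        if (refcount_dec_if_one(&obj->refcnt)) {
                list_del_rcu(&obj->head);
                kfree_rcu(obj, rcu_head);
                return true;
        }
        return false;                   /* busy, caller reports -EBUSY */
}
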
nla_parse_nested(tb, NFCTH_POLICY_MAX, attr, - nfnl_cthelper_expect_pol); + nfnl_cthelper_expect_pol, NULL); if (err < 0) return err; @@ -290,6 +301,9 @@ nfnl_cthelper_update_policy_one(const struct nf_conntrack_expect_policy *policy, new_policy->max_expected = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_MAX])); + if (new_policy->max_expected > NF_CT_EXPECT_MAX_CNT) + return -EINVAL; + new_policy->timeout = ntohl(nla_get_be32(tb[NFCTH_POLICY_EXPECT_TIMEOUT])); @@ -336,7 +350,7 @@ static int nfnl_cthelper_update_policy(struct nf_conntrack_helper *helper, int err; err = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr, - nfnl_cthelper_expect_policy_set); + nfnl_cthelper_expect_policy_set, NULL); if (err < 0) return err; @@ -501,7 +515,7 @@ nfnl_cthelper_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, unsigned int flags = portid ? NLM_F_MULTI : 0; int status; - event |= NFNL_SUBSYS_CTHELPER << 8; + event = nfnl_msg_type(NFNL_SUBSYS_CTHELPER, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index 47d6656c9119fd5c75fb1fb5f0bfd4bb49f1c38b..a3e7bb54d96acf2e7ac570077d57502d5b23625b 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -56,7 +56,8 @@ ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto, struct nlattr *tb[l4proto->ctnl_timeout.nlattr_max+1]; ret = nla_parse_nested(tb, l4proto->ctnl_timeout.nlattr_max, - attr, l4proto->ctnl_timeout.nla_policy); + attr, l4proto->ctnl_timeout.nla_policy, + NULL); if (ret < 0) return ret; @@ -138,7 +139,7 @@ static int cttimeout_new_timeout(struct net *net, struct sock *ctnl, strcpy(timeout->name, nla_data(cda[CTA_TIMEOUT_NAME])); timeout->l3num = l3num; timeout->l4proto = l4proto; - atomic_set(&timeout->refcnt, 1); + refcount_set(&timeout->refcnt, 1); list_add_tail_rcu(&timeout->head, &net->nfct_timeout_list); return 0; @@ -158,7 +159,7 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, unsigned int flags = portid ? NLM_F_MULTI : 0; struct nf_conntrack_l4proto *l4proto = timeout->l4proto; - event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8; + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -172,7 +173,7 @@ ctnl_timeout_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, nla_put_be16(skb, CTA_TIMEOUT_L3PROTO, htons(timeout->l3num)) || nla_put_u8(skb, CTA_TIMEOUT_L4PROTO, timeout->l4proto->l4proto) || nla_put_be32(skb, CTA_TIMEOUT_USE, - htonl(atomic_read(&timeout->refcnt)))) + htonl(refcount_read(&timeout->refcnt)))) goto nla_put_failure; if (likely(l4proto->ctnl_timeout.obj_to_nlattr)) { @@ -339,7 +340,7 @@ static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout) /* We want to avoid races with ctnl_timeout_put. So only when the * current refcnt is 1, we decrease it to 0. */ - if (atomic_cmpxchg(&timeout->refcnt, 1, 0) == 1) { + if (refcount_dec_if_one(&timeout->refcnt)) { /* We are protected by nfnl mutex. */ list_del_rcu(&timeout->head); nf_ct_l4proto_put(timeout->l4proto); @@ -431,7 +432,7 @@ cttimeout_default_fill_info(struct net *net, struct sk_buff *skb, u32 portid, struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
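
Beyond the mechanical parse changes, nfnetlink_cthelper picks up input validation it was missing: expectation policies from user space are now capped at NF_CT_EXPECT_MAX_CNT, and the declared private-data length is checked against the fixed area in struct nf_conn_help before use. The old code memcpy()ed help->helper->data_len bytes, a userspace-controlled length, into that fixed buffer; the new bound plus the switch to nla_memcpy() closes the overflow:

        size = ntohl(nla_get_be32(tb[NFCTH_PRIV_DATA_LEN]));
        if (size > FIELD_SIZEOF(struct nf_conn_help, data)) {
                ret = -ENOMEM;
                goto err2;
        }
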
NLM_F_MULTI : 0; - event |= NFNL_SUBSYS_CTNETLINK_TIMEOUT << 8; + event = nfnl_msg_type(NFNL_SUBSYS_CTNETLINK_TIMEOUT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; @@ -536,7 +537,7 @@ ctnl_timeout_find_get(struct net *net, const char *name) if (!try_module_get(THIS_MODULE)) goto err; - if (!atomic_inc_not_zero(&timeout->refcnt)) { + if (!refcount_inc_not_zero(&timeout->refcnt)) { module_put(THIS_MODULE); goto err; } @@ -550,7 +551,7 @@ ctnl_timeout_find_get(struct net *net, const char *name) static void ctnl_timeout_put(struct ctnl_timeout *timeout) { - if (atomic_dec_and_test(&timeout->refcnt)) + if (refcount_dec_and_test(&timeout->refcnt)) kfree_rcu(timeout, rcu_head); module_put(THIS_MODULE); @@ -601,7 +602,7 @@ static void __net_exit cttimeout_net_exit(struct net *net) list_del_rcu(&cur->head); nf_ct_l4proto_put(cur->l4proto); - if (atomic_dec_and_test(&cur->refcnt)) + if (refcount_dec_and_test(&cur->refcnt)) kfree_rcu(cur, rcu_head); } } diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 08247bf7d7b836828c8151ed07f438e05973c2c0..da9704971a83cd2f1f2808bd99061c880d9349f7 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -40,6 +40,8 @@ #include #include +#include + #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) #include "../bridge/br_private.h" @@ -57,7 +59,7 @@ struct nfulnl_instance { struct hlist_node hlist; /* global list of instances */ spinlock_t lock; - atomic_t use; /* use count */ + refcount_t use; /* use count */ unsigned int qlen; /* number of nlmsgs in skb */ struct sk_buff *skb; /* pre-allocatd skb */ @@ -115,7 +117,7 @@ __instance_lookup(struct nfnl_log_net *log, u_int16_t group_num) static inline void instance_get(struct nfulnl_instance *inst) { - atomic_inc(&inst->use); + refcount_inc(&inst->use); } static struct nfulnl_instance * @@ -125,7 +127,7 @@ instance_lookup_get(struct nfnl_log_net *log, u_int16_t group_num) rcu_read_lock_bh(); inst = __instance_lookup(log, group_num); - if (inst && !atomic_inc_not_zero(&inst->use)) + if (inst && !refcount_inc_not_zero(&inst->use)) inst = NULL; rcu_read_unlock_bh(); @@ -145,7 +147,7 @@ static void nfulnl_instance_free_rcu(struct rcu_head *head) static void instance_put(struct nfulnl_instance *inst) { - if (inst && atomic_dec_and_test(&inst->use)) + if (inst && refcount_dec_and_test(&inst->use)) call_rcu_bh(&inst->rcu, nfulnl_instance_free_rcu); } @@ -180,7 +182,7 @@ instance_create(struct net *net, u_int16_t group_num, INIT_HLIST_NODE(&inst->hlist); spin_lock_init(&inst->lock); /* needs to be two, since we _put() after creation */ - atomic_set(&inst->use, 2); + refcount_set(&inst->use, 2); setup_timer(&inst->timer, nfulnl_timer, (unsigned long)inst); @@ -409,7 +411,7 @@ __build_packet_message(struct nfnl_log_net *log, const unsigned char *hwhdrp; nlh = nlmsg_put(inst->skb, 0, 0, - NFNL_SUBSYS_ULOG << 8 | NFULNL_MSG_PACKET, + nfnl_msg_type(NFNL_SUBSYS_ULOG, NFULNL_MSG_PACKET), sizeof(struct nfgenmsg), 0); if (!nlh) return -1; @@ -801,7 +803,7 @@ static int nfulnl_recv_unsupp(struct net *net, struct sock *ctnl, static struct nf_logger nfulnl_logger __read_mostly = { .name = "nfnetlink_log", .type = NF_LOG_TYPE_ULOG, - .logfn = &nfulnl_log_packet, + .logfn = nfulnl_log_packet, .me = THIS_MODULE, }; @@ -1031,7 +1033,7 @@ static int seq_show(struct seq_file *s, void *v) inst->group_num, inst->peer_portid, inst->qlen, inst->copy_mode, inst->copy_range, - inst->flushtimeout, atomic_read(&inst->use)); + 
inst->flushtimeout, refcount_read(&inst->use)); return 0; } @@ -1138,10 +1140,10 @@ static int __init nfnetlink_log_init(void) static void __exit nfnetlink_log_fini(void) { - nf_log_unregister(&nfulnl_logger); nfnetlink_subsys_unregister(&nfulnl_subsys); netlink_unregister_notifier(&nfulnl_rtnl_notifier); unregister_pernet_subsys(&nfnl_log_net_ops); + nf_log_unregister(&nfulnl_logger); } MODULE_DESCRIPTION("netfilter userspace logging"); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index 933509ebf3d3e2e84aecd55fd7f19f21f69e46d4..8a0f218b79380bb66176b54a7bcd54e43e909a0a 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -447,7 +447,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, } nlh = nlmsg_put(skb, 0, 0, - NFNL_SUBSYS_QUEUE << 8 | NFQNL_MSG_PACKET, + nfnl_msg_type(NFNL_SUBSYS_QUEUE, NFQNL_MSG_PACKET), sizeof(struct nfgenmsg), 0); if (!nlh) { skb_tx_error(entskb); @@ -922,16 +922,10 @@ static struct notifier_block nfqnl_dev_notifier = { .notifier_call = nfqnl_rcv_dev_event, }; -static int nf_hook_cmp(struct nf_queue_entry *entry, unsigned long entry_ptr) -{ - return rcu_access_pointer(entry->hook) == - (struct nf_hook_entry *)entry_ptr; -} - -static void nfqnl_nf_hook_drop(struct net *net, - const struct nf_hook_entry *hook) +static unsigned int nfqnl_nf_hook_drop(struct net *net) { struct nfnl_queue_net *q = nfnl_queue_pernet(net); + unsigned int instances = 0; int i; rcu_read_lock(); @@ -939,10 +933,14 @@ static void nfqnl_nf_hook_drop(struct net *net, struct nfqnl_instance *inst; struct hlist_head *head = &q->instance_table[i]; - hlist_for_each_entry_rcu(inst, head, hlist) - nfqnl_flush(inst, nf_hook_cmp, (unsigned long)hook); + hlist_for_each_entry_rcu(inst, head, hlist) { + nfqnl_flush(inst, NULL, 0); + instances++; + } } rcu_read_unlock(); + + return instances; } static int @@ -1109,7 +1107,7 @@ static int nfqa_parse_bridge(struct nf_queue_entry *entry, int err; err = nla_parse_nested(tb, NFQA_VLAN_MAX, nfqa[NFQA_VLAN], - nfqa_vlan_policy); + nfqa_vlan_policy, NULL); if (err < 0) return err; @@ -1213,8 +1211,8 @@ static const struct nla_policy nfqa_cfg_policy[NFQA_CFG_MAX+1] = { }; static const struct nf_queue_handler nfqh = { - .outfn = &nfqnl_enqueue_packet, - .nf_hook_drop = &nfqnl_nf_hook_drop, + .outfn = nfqnl_enqueue_packet, + .nf_hook_drop = nfqnl_nf_hook_drop, }; static int nfqnl_recv_config(struct net *net, struct sock *ctnl, diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c index c21e7eb8dce02a6b73c5a466300ae793a2787b26..f753ec69f7902b80c9f448f764ecc1e48f6a2680 100644 --- a/net/netfilter/nft_compat.c +++ b/net/netfilter/nft_compat.c @@ -42,7 +42,8 @@ static int nft_compat_chain_validate_dependency(const char *tablename, { const struct nft_base_chain *basechain; - if (!tablename || !(chain->flags & NFT_BASE_CHAIN)) + if (!tablename || + !nft_is_base_chain(chain)) return 0; basechain = nft_base_chain(chain); @@ -165,7 +166,7 @@ nft_target_set_tgchk_param(struct xt_tgchk_param *par, par->entryinfo = entry; par->target = target; par->targinfo = info; - if (ctx->chain->flags & NFT_BASE_CHAIN) { + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops[0]; @@ -200,7 +201,7 @@ static int nft_parse_compat(const struct nlattr *attr, u16 *proto, bool *inv) int err; err = nla_parse_nested(tb, NFTA_RULE_COMPAT_MAX, attr, - nft_rule_compat_policy); + 
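[The repeated "event |= NFNL_SUBSYS_* << 8" open-coding that these hunks remove is centralized in a helper whose definition is essentially:

    /* nfnetlink message types carry the subsystem ID in the high
     * byte and the per-subsystem message in the low byte */
    static inline u16 nfnl_msg_type(u8 subsys, u8 msg_type)
    {
            return subsys << 8 | msg_type;
    }
]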
nft_rule_compat_policy, NULL); if (err < 0) return err; @@ -230,10 +231,6 @@ nft_target_init(const struct nft_ctx *ctx, const struct nft_expr *expr, union nft_entry e = {}; int ret; - ret = nft_compat_chain_validate_dependency(target->table, ctx->chain); - if (ret < 0) - goto err; - target_compat_from_user(target, nla_data(tb[NFTA_TARGET_INFO]), info); if (ctx->nla[NFTA_RULE_COMPAT]) { @@ -302,7 +299,7 @@ static int nft_target_validate(const struct nft_ctx *ctx, unsigned int hook_mask = 0; int ret; - if (ctx->chain->flags & NFT_BASE_CHAIN) { + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops[0]; @@ -383,7 +380,7 @@ nft_match_set_mtchk_param(struct xt_mtchk_param *par, const struct nft_ctx *ctx, par->entryinfo = entry; par->match = match; par->matchinfo = info; - if (ctx->chain->flags & NFT_BASE_CHAIN) { + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops[0]; @@ -419,10 +416,6 @@ nft_match_init(const struct nft_ctx *ctx, const struct nft_expr *expr, union nft_entry e = {}; int ret; - ret = nft_compat_chain_validate_dependency(match->table, ctx->chain); - if (ret < 0) - goto err; - match_compat_from_user(match, nla_data(tb[NFTA_MATCH_INFO]), info); if (ctx->nla[NFTA_RULE_COMPAT]) { @@ -485,7 +478,7 @@ static int nft_match_validate(const struct nft_ctx *ctx, unsigned int hook_mask = 0; int ret; - if (ctx->chain->flags & NFT_BASE_CHAIN) { + if (nft_is_base_chain(ctx->chain)) { const struct nft_base_chain *basechain = nft_base_chain(ctx->chain); const struct nf_hook_ops *ops = &basechain->ops[0]; @@ -511,7 +504,7 @@ nfnl_compat_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type, struct nfgenmsg *nfmsg; unsigned int flags = portid ? 
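[The recurring "chain->flags & NFT_BASE_CHAIN" tests in nft_compat give way to a named predicate; the helper amounts to:

    static inline bool nft_is_base_chain(const struct nft_chain *chain)
    {
            return chain->flags & NFT_BASE_CHAIN;
    }
]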
NLM_F_MULTI : 0; - event |= NFNL_SUBSYS_NFT_COMPAT << 8; + event = nfnl_msg_type(NFNL_SUBSYS_NFT_COMPAT, event); nlh = nlmsg_put(skb, portid, seq, event, sizeof(*nfmsg), flags); if (nlh == NULL) goto nlmsg_failure; diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c index 7f84222133414d4a65ed96508c83dea711295c9f..67a710ebde09da21464da4bb22a78a36c791274d 100644 --- a/net/netfilter/nft_counter.c +++ b/net/netfilter/nft_counter.c @@ -82,7 +82,8 @@ static int nft_counter_do_init(const struct nlattr * const tb[], return 0; } -static int nft_counter_obj_init(const struct nlattr * const tb[], +static int nft_counter_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], struct nft_object *obj) { struct nft_counter_percpu_priv *priv = nft_obj_data(obj); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 0264258c46feb5071a8eebcf9299b4b717fd0a32..a34ceb38fc55681962daad2c323f0011a5fe683d 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -32,6 +32,12 @@ struct nft_ct { }; }; +struct nft_ct_helper_obj { + struct nf_conntrack_helper *helper4; + struct nf_conntrack_helper *helper6; + u8 l4proto; +}; + #ifdef CONFIG_NF_CONNTRACK_ZONES static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; @@ -66,12 +72,12 @@ static void nft_ct_get_eval(const struct nft_expr *expr, switch (priv->key) { case NFT_CT_STATE: - if (ct == NULL) - state = NF_CT_STATE_INVALID_BIT; - else if (nf_ct_is_untracked(ct)) + if (ct) + state = NF_CT_STATE_BIT(ctinfo); + else if (ctinfo == IP_CT_UNTRACKED) state = NF_CT_STATE_UNTRACKED_BIT; else - state = NF_CT_STATE_BIT(ctinfo); + state = NF_CT_STATE_INVALID_BIT; *dest = state; return; default: @@ -258,7 +264,7 @@ static void nft_ct_set_eval(const struct nft_expr *expr, struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) + if (ct == NULL || nf_ct_is_template(ct)) return; switch (priv->key) { @@ -277,6 +283,22 @@ static void nft_ct_set_eval(const struct nft_expr *expr, ®s->data[priv->sreg], NF_CT_LABELS_MAX_SIZE / sizeof(u32)); break; +#endif +#ifdef CONFIG_NF_CONNTRACK_EVENTS + case NFT_CT_EVENTMASK: { + struct nf_conntrack_ecache *e = nf_ct_ecache_find(ct); + u32 ctmask = regs->data[priv->sreg]; + + if (e) { + if (e->ctmask != ctmask) + e->ctmask = ctmask; + break; + } + + if (ctmask && !nf_ct_is_confirmed(ct)) + nf_ct_ecache_ext_add(ct, ctmask, 0, GFP_ATOMIC); + break; + } #endif default: break; @@ -532,6 +554,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, nft_ct_pcpu_template_refcnt++; len = sizeof(u16); break; +#endif +#ifdef CONFIG_NF_CONNTRACK_EVENTS + case NFT_CT_EVENTMASK: + if (tb[NFTA_CT_DIRECTION]) + return -EINVAL; + len = sizeof(u32); + break; #endif default: return -EOPNOTSUPP; @@ -696,7 +725,7 @@ nft_ct_select_ops(const struct nft_ctx *ctx, static struct nft_expr_type nft_ct_type __read_mostly = { .name = "ct", - .select_ops = &nft_ct_select_ops, + .select_ops = nft_ct_select_ops, .policy = nft_ct_policy, .maxattr = NFTA_CT_MAX, .owner = THIS_MODULE, @@ -712,12 +741,10 @@ static void nft_notrack_eval(const struct nft_expr *expr, ct = nf_ct_get(pkt->skb, &ctinfo); /* Previously seen (loopback or untracked)? Ignore. 
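[The nft_ct hunks above, and the xt_CT/xt_state ones further down, retire the refcounted "untracked" template conntrack: untracked is now encoded purely in the ctinfo bits of skb->_nfct alongside a NULL conntrack pointer. The packing helper this relies on is, in essence:

    static inline void nf_ct_set(struct sk_buff *skb, struct nf_conn *ct,
                                 enum ip_conntrack_info info)
    {
            /* pointer and ctinfo share one word; ct may be NULL */
            skb->_nfct = (unsigned long)ct | info;
    }
]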
*/ - if (ct) + if (ct || ctinfo == IP_CT_UNTRACKED) return; - ct = nf_ct_untracked_get(); - atomic_inc(&ct->ct_general.use); - nf_ct_set(skb, ct, IP_CT_NEW); + nf_ct_set(skb, ct, IP_CT_UNTRACKED); } static struct nft_expr_type nft_notrack_type; @@ -733,6 +760,162 @@ static struct nft_expr_type nft_notrack_type __read_mostly = { .owner = THIS_MODULE, }; +static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], + struct nft_object *obj) +{ + struct nft_ct_helper_obj *priv = nft_obj_data(obj); + struct nf_conntrack_helper *help4, *help6; + char name[NF_CT_HELPER_NAME_LEN]; + int family = ctx->afi->family; + + if (!tb[NFTA_CT_HELPER_NAME] || !tb[NFTA_CT_HELPER_L4PROTO]) + return -EINVAL; + + priv->l4proto = nla_get_u8(tb[NFTA_CT_HELPER_L4PROTO]); + if (!priv->l4proto) + return -ENOENT; + + nla_strlcpy(name, tb[NFTA_CT_HELPER_NAME], sizeof(name)); + + if (tb[NFTA_CT_HELPER_L3PROTO]) + family = ntohs(nla_get_be16(tb[NFTA_CT_HELPER_L3PROTO])); + + help4 = NULL; + help6 = NULL; + + switch (family) { + case NFPROTO_IPV4: + if (ctx->afi->family == NFPROTO_IPV6) + return -EINVAL; + + help4 = nf_conntrack_helper_try_module_get(name, family, + priv->l4proto); + break; + case NFPROTO_IPV6: + if (ctx->afi->family == NFPROTO_IPV4) + return -EINVAL; + + help6 = nf_conntrack_helper_try_module_get(name, family, + priv->l4proto); + break; + case NFPROTO_NETDEV: /* fallthrough */ + case NFPROTO_BRIDGE: /* same */ + case NFPROTO_INET: + help4 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV4, + priv->l4proto); + help6 = nf_conntrack_helper_try_module_get(name, NFPROTO_IPV6, + priv->l4proto); + break; + default: + return -EAFNOSUPPORT; + } + + /* && is intentional; only error if INET found neither ipv4 or ipv6 */ + if (!help4 && !help6) + return -ENOENT; + + priv->helper4 = help4; + priv->helper6 = help6; + + return 0; +} + +static void nft_ct_helper_obj_destroy(struct nft_object *obj) +{ + struct nft_ct_helper_obj *priv = nft_obj_data(obj); + + if (priv->helper4) + module_put(priv->helper4->me); + if (priv->helper6) + module_put(priv->helper6->me); +} + +static void nft_ct_helper_obj_eval(struct nft_object *obj, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + const struct nft_ct_helper_obj *priv = nft_obj_data(obj); + struct nf_conn *ct = (struct nf_conn *)skb_nfct(pkt->skb); + struct nf_conntrack_helper *to_assign = NULL; + struct nf_conn_help *help; + + if (!ct || + nf_ct_is_confirmed(ct) || + nf_ct_is_template(ct) || + priv->l4proto != nf_ct_protonum(ct)) + return; + + switch (nf_ct_l3num(ct)) { + case NFPROTO_IPV4: + to_assign = priv->helper4; + break; + case NFPROTO_IPV6: + to_assign = priv->helper6; + break; + default: + WARN_ON_ONCE(1); + return; + } + + if (!to_assign) + return; + + if (test_bit(IPS_HELPER_BIT, &ct->status)) + return; + + help = nf_ct_helper_ext_add(ct, to_assign, GFP_ATOMIC); + if (help) { + rcu_assign_pointer(help->helper, to_assign); + set_bit(IPS_HELPER_BIT, &ct->status); + } +} + +static int nft_ct_helper_obj_dump(struct sk_buff *skb, + struct nft_object *obj, bool reset) +{ + const struct nft_ct_helper_obj *priv = nft_obj_data(obj); + const struct nf_conntrack_helper *helper = priv->helper4; + u16 family; + + if (nla_put_string(skb, NFTA_CT_HELPER_NAME, helper->name)) + return -1; + + if (nla_put_u8(skb, NFTA_CT_HELPER_L4PROTO, priv->l4proto)) + return -1; + + if (priv->helper4 && priv->helper6) + family = NFPROTO_INET; + else if (priv->helper6) + family = NFPROTO_IPV6; + else + family = NFPROTO_IPV4; + + if 
(nla_put_be16(skb, NFTA_CT_HELPER_L3PROTO, htons(family))) + return -1; + + return 0; +} + +static const struct nla_policy nft_ct_helper_policy[NFTA_CT_HELPER_MAX + 1] = { + [NFTA_CT_HELPER_NAME] = { .type = NLA_STRING, + .len = NF_CT_HELPER_NAME_LEN - 1 }, + [NFTA_CT_HELPER_L3PROTO] = { .type = NLA_U16 }, + [NFTA_CT_HELPER_L4PROTO] = { .type = NLA_U8 }, +}; + +static struct nft_object_type nft_ct_helper_obj __read_mostly = { + .type = NFT_OBJECT_CT_HELPER, + .size = sizeof(struct nft_ct_helper_obj), + .maxattr = NFTA_CT_HELPER_MAX, + .policy = nft_ct_helper_policy, + .eval = nft_ct_helper_obj_eval, + .init = nft_ct_helper_obj_init, + .destroy = nft_ct_helper_obj_destroy, + .dump = nft_ct_helper_obj_dump, + .owner = THIS_MODULE, +}; + static int __init nft_ct_module_init(void) { int err; @@ -747,7 +930,14 @@ static int __init nft_ct_module_init(void) if (err < 0) goto err1; + err = nft_register_obj(&nft_ct_helper_obj); + if (err < 0) + goto err2; + return 0; + +err2: + nft_unregister_expr(&nft_notrack_type); err1: nft_unregister_expr(&nft_ct_type); return err; @@ -755,6 +945,7 @@ static int __init nft_ct_module_init(void) static void __exit nft_ct_module_exit(void) { + nft_unregister_obj(&nft_ct_helper_obj); nft_unregister_expr(&nft_notrack_type); nft_unregister_expr(&nft_ct_type); } @@ -766,3 +957,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_ALIAS_NFT_EXPR("ct"); MODULE_ALIAS_NFT_EXPR("notrack"); +MODULE_ALIAS_NFT_OBJ(NFT_OBJECT_CT_HELPER); diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 049ad2d9ee66959367a051903563dca6ba654edb..66221ad891a9f281c1cf8e723c6a8c302e1f3741 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -82,8 +82,7 @@ static void nft_dynset_eval(const struct nft_expr *expr, nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { timeout = priv->timeout ? 
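[The new conntrack-helper object follows the standard stateful-object template: a struct nft_object_type ties parse, per-packet and teardown callbacks to a netlink policy. Reduced to a skeleton (all example_* names hypothetical):

    static struct nft_object_type example_obj_type __read_mostly = {
            .type    = NFT_OBJECT_CT_HELPER,
            .size    = sizeof(struct example_priv),
            .maxattr = NFTA_CT_HELPER_MAX,
            .policy  = example_policy,
            .eval    = example_eval,    /* runs per matching packet */
            .init    = example_init,    /* parses netlink attributes */
            .destroy = example_destroy, /* drops init()'s references */
            .dump    = example_dump,    /* serializes back to netlink */
            .owner   = THIS_MODULE,
    };
]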
: set->timeout; *nft_set_ext_expiration(ext) = jiffies + timeout; - } else if (sexpr == NULL) - goto out; + } if (sexpr != NULL) sexpr->ops->eval(sexpr, regs, pkt); @@ -92,7 +91,7 @@ static void nft_dynset_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; return; } -out: + if (!priv->invert) regs->verdict.code = NFT_BREAK; } @@ -133,16 +132,10 @@ static int nft_dynset_init(const struct nft_ctx *ctx, priv->invert = true; } - set = nf_tables_set_lookup(ctx->table, tb[NFTA_DYNSET_SET_NAME], - genmask); - if (IS_ERR(set)) { - if (tb[NFTA_DYNSET_SET_ID]) - set = nf_tables_set_lookup_byid(ctx->net, - tb[NFTA_DYNSET_SET_ID], - genmask); - if (IS_ERR(set)) - return PTR_ERR(set); - } + set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_DYNSET_SET_NAME], + tb[NFTA_DYNSET_SET_ID], genmask); + if (IS_ERR(set)) + return PTR_ERR(set); if (set->ops->update == NULL) return -EOPNOTSUPP; diff --git a/net/netfilter/nft_exthdr.c b/net/netfilter/nft_exthdr.c index c308920b194cdbe5e3a2e9a09cfb8aab7267f588..1ec49fe5845f1ec8f289f411d456a5765303e244 100644 --- a/net/netfilter/nft_exthdr.c +++ b/net/netfilter/nft_exthdr.c @@ -98,14 +98,21 @@ static void nft_exthdr_tcp_eval(const struct nft_expr *expr, goto err; offset = i + priv->offset; - dest[priv->len / NFT_REG32_SIZE] = 0; - memcpy(dest, opt + offset, priv->len); + if (priv->flags & NFT_EXTHDR_F_PRESENT) { + *dest = 1; + } else { + dest[priv->len / NFT_REG32_SIZE] = 0; + memcpy(dest, opt + offset, priv->len); + } return; } err: - regs->verdict.code = NFT_BREAK; + if (priv->flags & NFT_EXTHDR_F_PRESENT) + *dest = 0; + else + regs->verdict.code = NFT_BREAK; } static const struct nla_policy nft_exthdr_policy[NFTA_EXTHDR_MAX + 1] = { @@ -225,7 +232,7 @@ nft_exthdr_select_ops(const struct nft_ctx *ctx, static struct nft_expr_type nft_exthdr_type __read_mostly = { .name = "exthdr", - .select_ops = &nft_exthdr_select_ops, + .select_ops = nft_exthdr_select_ops, .policy = nft_exthdr_policy, .maxattr = NFTA_EXTHDR_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_fib.c b/net/netfilter/nft_fib.c index 29a4906adc277cd3a2cf55242224e4c99fd070e7..21df8cccea6582e56d7bfbb6fba21f821b7c56d9 100644 --- a/net/netfilter/nft_fib.c +++ b/net/netfilter/nft_fib.c @@ -24,7 +24,8 @@ const struct nla_policy nft_fib_policy[NFTA_FIB_MAX + 1] = { EXPORT_SYMBOL(nft_fib_policy); #define NFTA_FIB_F_ALL (NFTA_FIB_F_SADDR | NFTA_FIB_F_DADDR | \ - NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF) + NFTA_FIB_F_MARK | NFTA_FIB_F_IIF | NFTA_FIB_F_OIF | \ + NFTA_FIB_F_PRESENT) int nft_fib_validate(const struct nft_ctx *ctx, const struct nft_expr *expr, const struct nft_data **data) @@ -112,7 +113,7 @@ int nft_fib_init(const struct nft_ctx *ctx, const struct nft_expr *expr, if (err < 0) return err; - return nft_fib_validate(ctx, expr, NULL); + return 0; } EXPORT_SYMBOL_GPL(nft_fib_init); @@ -133,19 +134,22 @@ int nft_fib_dump(struct sk_buff *skb, const struct nft_expr *expr) } EXPORT_SYMBOL_GPL(nft_fib_dump); -void nft_fib_store_result(void *reg, enum nft_fib_result r, +void nft_fib_store_result(void *reg, const struct nft_fib *priv, const struct nft_pktinfo *pkt, int index) { struct net_device *dev; u32 *dreg = reg; - switch (r) { + switch (priv->result) { case NFT_FIB_RESULT_OIF: - *dreg = index; + *dreg = (priv->flags & NFTA_FIB_F_PRESENT) ? !!index : index; break; case NFT_FIB_RESULT_OIFNAME: dev = dev_get_by_index_rcu(nft_net(pkt), index); - strncpy(reg, dev ? dev->name : "", IFNAMSIZ); + if (priv->flags & NFTA_FIB_F_PRESENT) + *dreg = !!dev; + else + strncpy(reg, dev ? 
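[nft_dynset here, and nft_lookup/nft_objref below, collapse the same open-coded fallback (lookup by set name, then by the batch-local set ID) into one core helper; judging from the call sites, its prototype is roughly:

    /* resolve a set reference: first by name, then by the
     * NFTA_*_SET_ID assigned earlier in the same transaction */
    struct nft_set *nft_set_lookup(const struct net *net,
                                   const struct nft_table *table,
                                   const struct nlattr *nla_set_name,
                                   const struct nlattr *nla_set_id,
                                   u8 genmask);
]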
dev->name : "", IFNAMSIZ); break; default: WARN_ON_ONCE(1); diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c index c4dad1254ead01818fb5b2c0474b26f74762fee2..24f2f7567ddb779af44ee5050df36292f413d4d6 100644 --- a/net/netfilter/nft_hash.c +++ b/net/netfilter/nft_hash.c @@ -17,7 +17,7 @@ #include #include -struct nft_hash { +struct nft_jhash { enum nft_registers sreg:8; enum nft_registers dreg:8; u8 len; @@ -27,11 +27,11 @@ struct nft_hash { u32 offset; }; -static void nft_hash_eval(const struct nft_expr *expr, - struct nft_regs *regs, - const struct nft_pktinfo *pkt) +static void nft_jhash_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) { - struct nft_hash *priv = nft_expr_priv(expr); + struct nft_jhash *priv = nft_expr_priv(expr); const void *data = ®s->data[priv->sreg]; u32 h; @@ -39,6 +39,25 @@ static void nft_hash_eval(const struct nft_expr *expr, regs->data[priv->dreg] = h + priv->offset; } +struct nft_symhash { + enum nft_registers dreg:8; + u32 modulus; + u32 offset; +}; + +static void nft_symhash_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_symhash *priv = nft_expr_priv(expr); + struct sk_buff *skb = pkt->skb; + u32 h; + + h = reciprocal_scale(__skb_get_hash_symmetric(skb), priv->modulus); + + regs->data[priv->dreg] = h + priv->offset; +} + static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_SREG] = { .type = NLA_U32 }, [NFTA_HASH_DREG] = { .type = NLA_U32 }, @@ -46,13 +65,14 @@ static const struct nla_policy nft_hash_policy[NFTA_HASH_MAX + 1] = { [NFTA_HASH_MODULUS] = { .type = NLA_U32 }, [NFTA_HASH_SEED] = { .type = NLA_U32 }, [NFTA_HASH_OFFSET] = { .type = NLA_U32 }, + [NFTA_HASH_TYPE] = { .type = NLA_U32 }, }; -static int nft_hash_init(const struct nft_ctx *ctx, - const struct nft_expr *expr, - const struct nlattr * const tb[]) +static int nft_jhash_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) { - struct nft_hash *priv = nft_expr_priv(expr); + struct nft_jhash *priv = nft_expr_priv(expr); u32 len; int err; @@ -95,10 +115,36 @@ static int nft_hash_init(const struct nft_ctx *ctx, NFT_DATA_VALUE, sizeof(u32)); } -static int nft_hash_dump(struct sk_buff *skb, - const struct nft_expr *expr) +static int nft_symhash_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_symhash *priv = nft_expr_priv(expr); + + if (!tb[NFTA_HASH_DREG] || + !tb[NFTA_HASH_MODULUS]) + return -EINVAL; + + if (tb[NFTA_HASH_OFFSET]) + priv->offset = ntohl(nla_get_be32(tb[NFTA_HASH_OFFSET])); + + priv->dreg = nft_parse_register(tb[NFTA_HASH_DREG]); + + priv->modulus = ntohl(nla_get_be32(tb[NFTA_HASH_MODULUS])); + if (priv->modulus <= 1) + return -ERANGE; + + if (priv->offset + priv->modulus - 1 < priv->offset) + return -EOVERFLOW; + + return nft_validate_register_store(ctx, priv->dreg, NULL, + NFT_DATA_VALUE, sizeof(u32)); +} + +static int nft_jhash_dump(struct sk_buff *skb, + const struct nft_expr *expr) { - const struct nft_hash *priv = nft_expr_priv(expr); + const struct nft_jhash *priv = nft_expr_priv(expr); if (nft_dump_register(skb, NFTA_HASH_SREG, priv->sreg)) goto nla_put_failure; @@ -114,6 +160,28 @@ static int nft_hash_dump(struct sk_buff *skb, if (priv->offset != 0) if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset))) goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HASH_TYPE, htonl(NFT_HASH_JENKINS))) + goto nla_put_failure; + 
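[Both hash variants map the raw 32-bit value onto [0, modulus) with reciprocal_scale(), trading a division for a multiply and shift; the helper is essentially:

    static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
    {
            /* (val * ep_ro) / 2^32 lands uniformly in [0, ep_ro) */
            return (u32)(((u64)val * ep_ro) >> 32);
    }
]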
return 0; + +nla_put_failure: + return -1; +} + +static int nft_symhash_dump(struct sk_buff *skb, + const struct nft_expr *expr) +{ + const struct nft_symhash *priv = nft_expr_priv(expr); + + if (nft_dump_register(skb, NFTA_HASH_DREG, priv->dreg)) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HASH_MODULUS, htonl(priv->modulus))) + goto nla_put_failure; + if (priv->offset != 0) + if (nla_put_be32(skb, NFTA_HASH_OFFSET, htonl(priv->offset))) + goto nla_put_failure; + if (nla_put_be32(skb, NFTA_HASH_TYPE, htonl(NFT_HASH_SYM))) + goto nla_put_failure; return 0; nla_put_failure: @@ -121,17 +189,46 @@ static int nft_hash_dump(struct sk_buff *skb, } static struct nft_expr_type nft_hash_type; -static const struct nft_expr_ops nft_hash_ops = { +static const struct nft_expr_ops nft_jhash_ops = { .type = &nft_hash_type, - .size = NFT_EXPR_SIZE(sizeof(struct nft_hash)), - .eval = nft_hash_eval, - .init = nft_hash_init, - .dump = nft_hash_dump, + .size = NFT_EXPR_SIZE(sizeof(struct nft_jhash)), + .eval = nft_jhash_eval, + .init = nft_jhash_init, + .dump = nft_jhash_dump, }; +static const struct nft_expr_ops nft_symhash_ops = { + .type = &nft_hash_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_symhash)), + .eval = nft_symhash_eval, + .init = nft_symhash_init, + .dump = nft_symhash_dump, +}; + +static const struct nft_expr_ops * +nft_hash_select_ops(const struct nft_ctx *ctx, + const struct nlattr * const tb[]) +{ + u32 type; + + if (!tb[NFTA_HASH_TYPE]) + return &nft_jhash_ops; + + type = ntohl(nla_get_be32(tb[NFTA_HASH_TYPE])); + switch (type) { + case NFT_HASH_SYM: + return &nft_symhash_ops; + case NFT_HASH_JENKINS: + return &nft_jhash_ops; + default: + break; + } + return ERR_PTR(-EOPNOTSUPP); +} + static struct nft_expr_type nft_hash_type __read_mostly = { .name = "hash", - .ops = &nft_hash_ops, + .select_ops = nft_hash_select_ops, .policy = nft_hash_policy, .maxattr = NFTA_HASH_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c index c6baf412236d662b0d165fec3b4ff26579c9c6d8..18dd57a526513bd726944fa7ad7a9e41fcfb0251 100644 --- a/net/netfilter/nft_limit.c +++ b/net/netfilter/nft_limit.c @@ -17,9 +17,8 @@ #include #include -static DEFINE_SPINLOCK(limit_lock); - struct nft_limit { + spinlock_t lock; u64 last; u64 tokens; u64 tokens_max; @@ -34,7 +33,7 @@ static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) u64 now, tokens; s64 delta; - spin_lock_bh(&limit_lock); + spin_lock_bh(&limit->lock); now = ktime_get_ns(); tokens = limit->tokens + now - limit->last; if (tokens > limit->tokens_max) @@ -44,11 +43,11 @@ static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost) delta = tokens - cost; if (delta >= 0) { limit->tokens = delta; - spin_unlock_bh(&limit_lock); + spin_unlock_bh(&limit->lock); return limit->invert; } limit->tokens = tokens; - spin_unlock_bh(&limit_lock); + spin_unlock_bh(&limit->lock); return !limit->invert; } @@ -86,6 +85,7 @@ static int nft_limit_init(struct nft_limit *limit, limit->invert = true; } limit->last = ktime_get_ns(); + spin_lock_init(&limit->lock); return 0; } diff --git a/net/netfilter/nft_lookup.c b/net/netfilter/nft_lookup.c index e21aea7e5ec8f141ea3155d1da3c491484c00a73..475570e89ede710b323868792bb16fd5f09d1b09 100644 --- a/net/netfilter/nft_lookup.c +++ b/net/netfilter/nft_lookup.c @@ -71,16 +71,10 @@ static int nft_lookup_init(const struct nft_ctx *ctx, tb[NFTA_LOOKUP_SREG] == NULL) return -EINVAL; - set = nf_tables_set_lookup(ctx->table, tb[NFTA_LOOKUP_SET], genmask); - if (IS_ERR(set)) { - if 
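[nft_limit, like xt_limit later in this series, trades a single module-global spinlock for one embedded in each rule's private data, so independent limiters stop serializing each other. A reduced, hypothetical shape of the change:

    struct example_limit {
            spinlock_t lock;  /* protects this rule's bucket only */
            u64 last;
            u64 tokens;
    };

    static void example_limit_setup(struct example_limit *limit)
    {
            limit->last = ktime_get_ns();
            spin_lock_init(&limit->lock);  /* new per-instance init */
    }
]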
(tb[NFTA_LOOKUP_SET_ID]) { - set = nf_tables_set_lookup_byid(ctx->net, - tb[NFTA_LOOKUP_SET_ID], - genmask); - } - if (IS_ERR(set)) - return PTR_ERR(set); - } + set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_LOOKUP_SET], + tb[NFTA_LOOKUP_SET_ID], genmask); + if (IS_ERR(set)) + return PTR_ERR(set); if (set->flags & NFT_SET_EVAL) return -EOPNOTSUPP; diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c index 11ce016cd47948f3a2902402e0cbbda5e22d9438..6ac03d4266c9038cb4ffd3de412a216b769d07a1 100644 --- a/net/netfilter/nft_masq.c +++ b/net/netfilter/nft_masq.c @@ -46,10 +46,6 @@ int nft_masq_init(const struct nft_ctx *ctx, struct nft_masq *priv = nft_expr_priv(expr); int err; - err = nft_masq_validate(ctx, expr, NULL); - if (err) - return err; - if (tb[NFTA_MASQ_FLAGS]) { priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS])); if (priv->flags & ~NF_NAT_RANGE_MASK) diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 7b60e01f38ff9f2f9fa7d28f6f99b4f889d190d7..5a60eb23a7ed8cc82cc2500c7f8c116ee6df2f30 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -372,10 +372,6 @@ int nft_meta_set_init(const struct nft_ctx *ctx, return -EOPNOTSUPP; } - err = nft_meta_set_validate(ctx, expr, NULL); - if (err < 0) - return err; - priv->sreg = nft_parse_register(tb[NFTA_META_SREG]); err = nft_validate_register_load(priv->sreg, len); if (err < 0) @@ -471,7 +467,7 @@ nft_meta_select_ops(const struct nft_ctx *ctx, static struct nft_expr_type nft_meta_type __read_mostly = { .name = "meta", - .select_ops = &nft_meta_select_ops, + .select_ops = nft_meta_select_ops, .policy = nft_meta_policy, .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c index 439e0bd152a004c98664a19ae6c920458fa6160a..ed548d06b6dda9a98888bb83f2baa6b45c965c15 100644 --- a/net/netfilter/nft_nat.c +++ b/net/netfilter/nft_nat.c @@ -138,10 +138,6 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr, return -EINVAL; } - err = nft_nat_validate(ctx, expr, NULL); - if (err < 0) - return err; - if (tb[NFTA_NAT_FAMILY] == NULL) return -EINVAL; diff --git a/net/netfilter/nft_numgen.c b/net/netfilter/nft_numgen.c index a66b36097b8f4f3cc1d230d9943be852866fcfbd..5a3a52c71545ac15ae5c906dbf53d6d425c3c557 100644 --- a/net/netfilter/nft_numgen.c +++ b/net/netfilter/nft_numgen.c @@ -188,7 +188,7 @@ nft_ng_select_ops(const struct nft_ctx *ctx, const struct nlattr * const tb[]) static struct nft_expr_type nft_ng_type __read_mostly = { .name = "numgen", - .select_ops = &nft_ng_select_ops, + .select_ops = nft_ng_select_ops, .policy = nft_ng_policy, .maxattr = NFTA_NG_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 1ae8c49ca4a1fac06f69c41f68a36b7e85593adb..1dd428fbaaa3f836e7bcecb50355f60c14d8faad 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -116,16 +116,10 @@ static int nft_objref_map_init(const struct nft_ctx *ctx, struct nft_set *set; int err; - set = nf_tables_set_lookup(ctx->table, tb[NFTA_OBJREF_SET_NAME], genmask); - if (IS_ERR(set)) { - if (tb[NFTA_OBJREF_SET_ID]) { - set = nf_tables_set_lookup_byid(ctx->net, - tb[NFTA_OBJREF_SET_ID], - genmask); - } - if (IS_ERR(set)) - return PTR_ERR(set); - } + set = nft_set_lookup(ctx->net, ctx->table, tb[NFTA_OBJREF_SET_NAME], + tb[NFTA_OBJREF_SET_ID], genmask); + if (IS_ERR(set)) + return PTR_ERR(set); if (!(set->flags & NFT_SET_OBJECT)) return -EINVAL; diff --git a/net/netfilter/nft_queue.c 
b/net/netfilter/nft_queue.c index dbb6aaff67ec5c151f8c8c6333721421f8e85076..98613658d4ac5ce359e946efca4c93d9df216817 100644 --- a/net/netfilter/nft_queue.c +++ b/net/netfilter/nft_queue.c @@ -197,7 +197,7 @@ nft_queue_select_ops(const struct nft_ctx *ctx, static struct nft_expr_type nft_queue_type __read_mostly = { .name = "queue", - .select_ops = &nft_queue_select_ops, + .select_ops = nft_queue_select_ops, .policy = nft_queue_policy, .maxattr = NFTA_QUEUE_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_quota.c b/net/netfilter/nft_quota.c index 2d6fe3559912674385e7679557fc31ddeb901b38..25e33159be57882fcf9875725079188dfaa7d113 100644 --- a/net/netfilter/nft_quota.c +++ b/net/netfilter/nft_quota.c @@ -99,7 +99,8 @@ static int nft_quota_do_init(const struct nlattr * const tb[], return 0; } -static int nft_quota_obj_init(const struct nlattr * const tb[], +static int nft_quota_obj_init(const struct nft_ctx *ctx, + const struct nlattr * const tb[], struct nft_object *obj) { struct nft_quota *priv = nft_obj_data(obj); diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c index 40dcd05146d5fb0f346e6e170d16de9813e92804..1e66538bf0ff24e3286ec6312e4d593c6197bd9b 100644 --- a/net/netfilter/nft_redir.c +++ b/net/netfilter/nft_redir.c @@ -47,10 +47,6 @@ int nft_redir_init(const struct nft_ctx *ctx, unsigned int plen; int err; - err = nft_redir_validate(ctx, expr, NULL); - if (err < 0) - return err; - plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all); if (tb[NFTA_REDIR_REG_PROTO_MIN]) { priv->sreg_proto_min = diff --git a/net/netfilter/nft_reject.c b/net/netfilter/nft_reject.c index c64de3f7379df551fa413a4af186f3c16886f112..29f5bd2377b0deaf7ede8ec0573bf71cfeef7478 100644 --- a/net/netfilter/nft_reject.c +++ b/net/netfilter/nft_reject.c @@ -42,11 +42,6 @@ int nft_reject_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); - int err; - - err = nft_reject_validate(ctx, expr, NULL); - if (err < 0) - return err; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; diff --git a/net/netfilter/nft_reject_inet.c b/net/netfilter/nft_reject_inet.c index 9e90a02cb104dad81daf84208902e90390a8f504..5a7fb5ff867d382f04633a2fab53997d0ca1f2b2 100644 --- a/net/netfilter/nft_reject_inet.c +++ b/net/netfilter/nft_reject_inet.c @@ -66,11 +66,7 @@ static int nft_reject_inet_init(const struct nft_ctx *ctx, const struct nlattr * const tb[]) { struct nft_reject *priv = nft_expr_priv(expr); - int icmp_code, err; - - err = nft_reject_validate(ctx, expr, NULL); - if (err < 0) - return err; + int icmp_code; if (tb[NFTA_REJECT_TYPE] == NULL) return -EINVAL; diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 8ebbc2940f4c593d393c65bd5674d90feb585d98..b988162b5b15b9442b496abf2571a9cf7dbc66f3 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -257,6 +257,11 @@ static int nft_bitmap_init(const struct nft_set *set, static void nft_bitmap_destroy(const struct nft_set *set) { + struct nft_bitmap *priv = nft_set_priv(set); + struct nft_bitmap_elem *be, *n; + + list_for_each_entry_safe(be, n, &priv->list, head) + nft_set_elem_destroy(set, be, true); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 5f652720fc78e6de437c100d31524588e729001c..8ec086b6b56b742485e34511b38a77af848d9f99 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -352,7 +352,7 @@ static int 
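[The deleted nft_*_validate(ctx, expr, NULL) calls in nft_masq, nft_meta, nft_nat, nft_redir and nft_reject reflect validation moving out of ->init(), which can run before the expression is bound to a hook, and into the ->validate() pass over the finished ruleset. A skeleton of such a hook-dependency check (names hypothetical):

    static int example_validate(const struct nft_ctx *ctx,
                                const struct nft_expr *expr,
                                const struct nft_data **data)
    {
            /* e.g. restrict the expression to postrouting */
            return nft_chain_validate_hooks(ctx->chain,
                                            1 << NF_INET_POST_ROUTING);
    }
]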
nft_hash_init(const struct nft_set *set, static void nft_hash_elem_destroy(void *ptr, void *arg) { - nft_set_elem_destroy((const struct nft_set *)arg, ptr, true); + nft_set_elem_destroy(arg, ptr, true); } static void nft_hash_destroy(const struct nft_set *set) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 78dfbf9588b368107bdc385c7ef208a9abd3d297..e97e2fb53f0a107b0361322be10f16b4ab4b5d32 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -18,9 +18,8 @@ #include #include -static DEFINE_SPINLOCK(nft_rbtree_lock); - struct nft_rbtree { + rwlock_t lock; struct rb_root root; }; @@ -44,14 +43,14 @@ static bool nft_rbtree_equal(const struct nft_set *set, const void *this, static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { - const struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree *priv = nft_set_priv(set); const struct nft_rbtree_elem *rbe, *interval = NULL; u8 genmask = nft_genmask_cur(net); const struct rb_node *parent; const void *this; int d; - spin_lock_bh(&nft_rbtree_lock); + read_lock_bh(&priv->lock); parent = priv->root.rb_node; while (parent != NULL) { rbe = rb_entry(parent, struct nft_rbtree_elem, node); @@ -75,7 +74,7 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, } if (nft_rbtree_interval_end(rbe)) goto out; - spin_unlock_bh(&nft_rbtree_lock); + read_unlock_bh(&priv->lock); *ext = &rbe->ext; return true; @@ -85,12 +84,12 @@ static bool nft_rbtree_lookup(const struct net *net, const struct nft_set *set, if (set->flags & NFT_SET_INTERVAL && interval != NULL && nft_set_elem_active(&interval->ext, genmask) && !nft_rbtree_interval_end(interval)) { - spin_unlock_bh(&nft_rbtree_lock); + read_unlock_bh(&priv->lock); *ext = &interval->ext; return true; } out: - spin_unlock_bh(&nft_rbtree_lock); + read_unlock_bh(&priv->lock); return false; } @@ -140,12 +139,13 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set, const struct nft_set_elem *elem, struct nft_set_ext **ext) { + struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe = elem->priv; int err; - spin_lock_bh(&nft_rbtree_lock); + write_lock_bh(&priv->lock); err = __nft_rbtree_insert(net, set, rbe, ext); - spin_unlock_bh(&nft_rbtree_lock); + write_unlock_bh(&priv->lock); return err; } @@ -157,9 +157,9 @@ static void nft_rbtree_remove(const struct net *net, struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe = elem->priv; - spin_lock_bh(&nft_rbtree_lock); + write_lock_bh(&priv->lock); rb_erase(&rbe->node, &priv->root); - spin_unlock_bh(&nft_rbtree_lock); + write_unlock_bh(&priv->lock); } static void nft_rbtree_activate(const struct net *net, @@ -224,12 +224,12 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_iter *iter) { - const struct nft_rbtree *priv = nft_set_priv(set); + struct nft_rbtree *priv = nft_set_priv(set); struct nft_rbtree_elem *rbe; struct nft_set_elem elem; struct rb_node *node; - spin_lock_bh(&nft_rbtree_lock); + read_lock_bh(&priv->lock); for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { rbe = rb_entry(node, struct nft_rbtree_elem, node); @@ -242,13 +242,13 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, iter->err = iter->fn(ctx, set, iter, &elem); if (iter->err < 0) { - spin_unlock_bh(&nft_rbtree_lock); + read_unlock_bh(&priv->lock); return; } cont: iter->count++; } - 
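[nft_set_rbtree drops its global spinlock for a per-set rwlock_t: packet-path lookups and control-plane walks are read-mostly and may now run concurrently, while insert and remove take the write side. Schematically:

    /* readers: lookup and walk */
    read_lock_bh(&priv->lock);
    /* ... descend priv->root ... */
    read_unlock_bh(&priv->lock);

    /* writers: element insertion and removal */
    write_lock_bh(&priv->lock);
    rb_erase(&rbe->node, &priv->root);
    write_unlock_bh(&priv->lock);
]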
spin_unlock_bh(&nft_rbtree_lock); + read_unlock_bh(&priv->lock); } static unsigned int nft_rbtree_privsize(const struct nlattr * const nla[]) @@ -262,6 +262,7 @@ static int nft_rbtree_init(const struct nft_set *set, { struct nft_rbtree *priv = nft_set_priv(set); + rwlock_init(&priv->lock); priv->root = RB_ROOT; return 0; } diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 14857afc9937d30cae604fe839029593c1006944..8876b7da6884c210393d1988032cbb5bd7018507 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -763,17 +763,8 @@ EXPORT_SYMBOL(xt_check_entry_offsets); */ unsigned int *xt_alloc_entry_offsets(unsigned int size) { - unsigned int *off; + return kvmalloc_array(size, sizeof(unsigned int), GFP_KERNEL | __GFP_ZERO); - off = kcalloc(size, sizeof(unsigned int), GFP_KERNEL | __GFP_NOWARN); - - if (off) - return off; - - if (size < (SIZE_MAX / sizeof(unsigned int))) - off = vmalloc(size * sizeof(unsigned int)); - - return off; } EXPORT_SYMBOL(xt_alloc_entry_offsets); @@ -1007,8 +998,7 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size) if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY); if (!info) { - info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN | - __GFP_NORETRY | __GFP_HIGHMEM, + info = __vmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, PAGE_KERNEL); if (!info) return NULL; @@ -1051,8 +1041,10 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, list_for_each_entry(t, &init_net.xt.tables[af], list) { if (strcmp(t->name, name)) continue; - if (!try_module_get(t->me)) + if (!try_module_get(t->me)) { + mutex_unlock(&xt[af].mutex); return NULL; + } mutex_unlock(&xt[af].mutex); if (t->table_init(net) != 0) { @@ -1114,7 +1106,7 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) size = sizeof(void **) * nr_cpu_ids; if (size > PAGE_SIZE) - i->jumpstack = vzalloc(size); + i->jumpstack = kvzalloc(size, GFP_KERNEL); else i->jumpstack = kzalloc(size, GFP_KERNEL); if (i->jumpstack == NULL) @@ -1136,12 +1128,8 @@ static int xt_jumpstack_alloc(struct xt_table_info *i) */ size = sizeof(void *) * i->stacksize * 2u; for_each_possible_cpu(cpu) { - if (size > PAGE_SIZE) - i->jumpstack[cpu] = vmalloc_node(size, - cpu_to_node(cpu)); - else - i->jumpstack[cpu] = kmalloc_node(size, - GFP_KERNEL, cpu_to_node(cpu)); + i->jumpstack[cpu] = kvmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); if (i->jumpstack[cpu] == NULL) /* * Freeing will be done later on by the callers. 
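[The x_tables allocations switch to the kvmalloc helpers, which fold the open-coded "try kmalloc, fall back to vmalloc" dance, overflow check included, into one call; the result is freed with kvfree() regardless of which allocator satisfied it. Mirroring xt_alloc_entry_offsets():

    static unsigned int *example_alloc_offsets(unsigned int size)
    {
            /* zeroed array; transparently falls back to vmalloc for
             * large sizes and checks size * elem for overflow */
            return kvmalloc_array(size, sizeof(unsigned int),
                                  GFP_KERNEL | __GFP_ZERO);
    }
]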
The diff --git a/net/netfilter/xt_AUDIT.c b/net/netfilter/xt_AUDIT.c index 19247a17e5114f1e278e30508085463d7396959e..c502419d63061c7aa5d3205187e7429294b3cc97 100644 --- a/net/netfilter/xt_AUDIT.c +++ b/net/netfilter/xt_AUDIT.c @@ -31,146 +31,76 @@ MODULE_ALIAS("ip6t_AUDIT"); MODULE_ALIAS("ebt_AUDIT"); MODULE_ALIAS("arpt_AUDIT"); -static void audit_proto(struct audit_buffer *ab, struct sk_buff *skb, - unsigned int proto, unsigned int offset) -{ - switch (proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_UDPLITE: { - const __be16 *pptr; - __be16 _ports[2]; - - pptr = skb_header_pointer(skb, offset, sizeof(_ports), _ports); - if (pptr == NULL) { - audit_log_format(ab, " truncated=1"); - return; - } - - audit_log_format(ab, " sport=%hu dport=%hu", - ntohs(pptr[0]), ntohs(pptr[1])); - } - break; - - case IPPROTO_ICMP: - case IPPROTO_ICMPV6: { - const u8 *iptr; - u8 _ih[2]; - - iptr = skb_header_pointer(skb, offset, sizeof(_ih), &_ih); - if (iptr == NULL) { - audit_log_format(ab, " truncated=1"); - return; - } - - audit_log_format(ab, " icmptype=%hhu icmpcode=%hhu", - iptr[0], iptr[1]); - - } - break; - } -} - -static void audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) +static bool audit_ip4(struct audit_buffer *ab, struct sk_buff *skb) { struct iphdr _iph; const struct iphdr *ih; - ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph); - if (!ih) { - audit_log_format(ab, " truncated=1"); - return; - } + ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_iph), &_iph); + if (!ih) + return false; - audit_log_format(ab, " saddr=%pI4 daddr=%pI4 ipid=%hu proto=%hhu", - &ih->saddr, &ih->daddr, ntohs(ih->id), ih->protocol); + audit_log_format(ab, " saddr=%pI4 daddr=%pI4 proto=%hhu", + &ih->saddr, &ih->daddr, ih->protocol); - if (ntohs(ih->frag_off) & IP_OFFSET) { - audit_log_format(ab, " frag=1"); - return; - } - - audit_proto(ab, skb, ih->protocol, ih->ihl * 4); + return true; } -static void audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) +static bool audit_ip6(struct audit_buffer *ab, struct sk_buff *skb) { struct ipv6hdr _ip6h; const struct ipv6hdr *ih; u8 nexthdr; __be16 frag_off; - int offset; ih = skb_header_pointer(skb, skb_network_offset(skb), sizeof(_ip6h), &_ip6h); - if (!ih) { - audit_log_format(ab, " truncated=1"); - return; - } + if (!ih) + return false; nexthdr = ih->nexthdr; - offset = ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), - &nexthdr, &frag_off); + ipv6_skip_exthdr(skb, skb_network_offset(skb) + sizeof(_ip6h), &nexthdr, &frag_off); audit_log_format(ab, " saddr=%pI6c daddr=%pI6c proto=%hhu", &ih->saddr, &ih->daddr, nexthdr); - if (offset) - audit_proto(ab, skb, nexthdr, offset); + return true; } static unsigned int audit_tg(struct sk_buff *skb, const struct xt_action_param *par) { - const struct xt_audit_info *info = par->targinfo; struct audit_buffer *ab; + int fam = -1; if (audit_enabled == 0) goto errout; - ab = audit_log_start(NULL, GFP_ATOMIC, AUDIT_NETFILTER_PKT); if (ab == NULL) goto errout; - audit_log_format(ab, "action=%hhu hook=%u len=%u inif=%s outif=%s", - info->type, xt_hooknum(par), skb->len, - xt_in(par) ? xt_inname(par) : "?", - xt_out(par) ? 
xt_outname(par) : "?"); - - if (skb->mark) - audit_log_format(ab, " mark=%#x", skb->mark); - - if (skb->dev && skb->dev->type == ARPHRD_ETHER) { - audit_log_format(ab, " smac=%pM dmac=%pM macproto=0x%04x", - eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, - ntohs(eth_hdr(skb)->h_proto)); - - if (xt_family(par) == NFPROTO_BRIDGE) { - switch (eth_hdr(skb)->h_proto) { - case htons(ETH_P_IP): - audit_ip4(ab, skb); - break; - - case htons(ETH_P_IPV6): - audit_ip6(ab, skb); - break; - } - } - } + audit_log_format(ab, "mark=%#x", skb->mark); switch (xt_family(par)) { + case NFPROTO_BRIDGE: + switch (eth_hdr(skb)->h_proto) { + case htons(ETH_P_IP): + fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; + break; + case htons(ETH_P_IPV6): + fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; + break; + } + break; case NFPROTO_IPV4: - audit_ip4(ab, skb); + fam = audit_ip4(ab, skb) ? NFPROTO_IPV4 : -1; break; - case NFPROTO_IPV6: - audit_ip6(ab, skb); + fam = audit_ip6(ab, skb) ? NFPROTO_IPV6 : -1; break; } -#ifdef CONFIG_NETWORK_SECMARK - if (skb->secmark) - audit_log_secctx(ab, skb->secmark); -#endif + if (fam == -1) + audit_log_format(ab, " saddr=? daddr=? proto=-1"); audit_log_end(ab); diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index b008db0184b8a2a1679d5d4737427a672c36b560..bb7ad82dcd5603e810db8fba35f81d3f2c03a2b7 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -26,11 +26,12 @@ static inline int xt_ct_target(struct sk_buff *skb, struct nf_conn *ct) if (skb->_nfct != 0) return XT_CONTINUE; - /* special case the untracked ct : we want the percpu object */ - if (!ct) - ct = nf_ct_untracked_get(); - atomic_inc(&ct->ct_general.use); - nf_ct_set(skb, ct, IP_CT_NEW); + if (ct) { + atomic_inc(&ct->ct_general.use); + nf_ct_set(skb, ct, IP_CT_NEW); + } else { + nf_ct_set(skb, ct, IP_CT_UNTRACKED); + } return XT_CONTINUE; } @@ -167,8 +168,10 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, goto err_put_timeout; } timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC); - if (timeout_ext == NULL) + if (!timeout_ext) { ret = -ENOMEM; + goto err_put_timeout; + } rcu_read_unlock(); return ret; @@ -200,6 +203,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct xt_ct_target_info_v1 *info) { struct nf_conntrack_zone zone; + struct nf_conn_help *help; struct nf_conn *ct; int ret = -EOPNOTSUPP; @@ -248,7 +252,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, if (info->timeout[0]) { ret = xt_ct_set_timeout(ct, par, info->timeout); if (ret < 0) - goto err3; + goto err4; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); nf_conntrack_get(&ct->ct_general); @@ -256,6 +260,10 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, info->ct = ct; return 0; +err4: + help = nfct_help(ct); + if (help) + module_put(help->helper->me); err3: nf_ct_tmpl_free(ct); err2: @@ -335,7 +343,7 @@ static void xt_ct_tg_destroy(const struct xt_tgdtor_param *par, struct nf_conn *ct = info->ct; struct nf_conn_help *help; - if (ct && !nf_ct_is_untracked(ct)) { + if (ct) { help = nfct_help(ct); if (help) module_put(help->helper->me); @@ -412,8 +420,7 @@ notrack_tg(struct sk_buff *skb, const struct xt_action_param *par) if (skb->_nfct != 0) return XT_CONTINUE; - nf_ct_set(skb, nf_ct_untracked_get(), IP_CT_NEW); - nf_conntrack_get(skb_nfct(skb)); + nf_ct_set(skb, NULL, IP_CT_UNTRACKED); return XT_CONTINUE; } diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c index 02afaf48a7290b15e6f2d56b2ca2c5cb07455b4d..60e6dbe124605569f249f0e15c7304b231fe1500 
100644 --- a/net/netfilter/xt_HMARK.c +++ b/net/netfilter/xt_HMARK.c @@ -84,7 +84,7 @@ hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t, struct nf_conntrack_tuple *otuple; struct nf_conntrack_tuple *rtuple; - if (ct == NULL || nf_ct_is_untracked(ct)) + if (ct == NULL) return -1; otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; diff --git a/net/netfilter/xt_cluster.c b/net/netfilter/xt_cluster.c index 9a9884a39c0e9cc806a3a6b0b16bbd387a26d220..57ef175dfbfaa86db8368c10fbbd51ba3b2a691c 100644 --- a/net/netfilter/xt_cluster.c +++ b/net/netfilter/xt_cluster.c @@ -121,9 +121,6 @@ xt_cluster_mt(const struct sk_buff *skb, struct xt_action_param *par) if (ct == NULL) return false; - if (nf_ct_is_untracked(ct)) - return false; - if (ct->master) hash = xt_cluster_hash(ct->master, info); else diff --git a/net/netfilter/xt_connlabel.c b/net/netfilter/xt_connlabel.c index 7827128d5a95f5faf699d49c5dac796d03744a51..23372879e6e3004398454f6c96944422db8e1cc7 100644 --- a/net/netfilter/xt_connlabel.c +++ b/net/netfilter/xt_connlabel.c @@ -29,7 +29,7 @@ connlabel_mt(const struct sk_buff *skb, struct xt_action_param *par) bool invert = info->options & XT_CONNLABEL_OP_INVERT; ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL || nf_ct_is_untracked(ct)) + if (ct == NULL) return invert; labels = nf_ct_labels_find(ct); diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 9935d5029b0e52735b45bca017e52dd72f593c1e..ec377cc6a369c71025136fc4b8b3cf3fce177eea 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -44,7 +44,7 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) u_int32_t newmark; ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL || nf_ct_is_untracked(ct)) + if (ct == NULL) return XT_CONTINUE; switch (info->mode) { @@ -97,7 +97,7 @@ connmark_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL || nf_ct_is_untracked(ct)) + if (ct == NULL) return false; return ((ct->mark & info->mask) == info->mark) ^ info->invert; diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index c0fb217bc64969bd0da2c656d4418f8c80b29d14..39cf1d019240e61f504bc8ced96a63c41c9bd839 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -172,12 +172,11 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par, ct = nf_ct_get(skb, &ctinfo); - if (ct) { - if (nf_ct_is_untracked(ct)) - statebit = XT_CONNTRACK_STATE_UNTRACKED; - else - statebit = XT_CONNTRACK_STATE_BIT(ctinfo); - } else + if (ct) + statebit = XT_CONNTRACK_STATE_BIT(ctinfo); + else if (ctinfo == IP_CT_UNTRACKED) + statebit = XT_CONNTRACK_STATE_UNTRACKED; + else statebit = XT_CONNTRACK_STATE_INVALID; if (info->match_flags & XT_CONNTRACK_STATE) { diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 2a6dfe8b74d37f7d3d8ed0569650f6417b76d58a..762e1874f28b7b1faba5b48e94f743c0414689f3 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -119,7 +119,7 @@ static int cfg_copy(struct hashlimit_cfg2 *to, void *from, int revision) { if (revision == 1) { - struct hashlimit_cfg1 *cfg = (struct hashlimit_cfg1 *)from; + struct hashlimit_cfg1 *cfg = from; to->mode = cfg->mode; to->avg = cfg->avg; @@ -895,7 +895,7 @@ static void *dl_seq_start(struct seq_file *s, loff_t *pos) static void *dl_seq_next(struct seq_file *s, void *v, loff_t *pos) { struct xt_hashlimit_htable *htable = s->private; - unsigned int *bucket = (unsigned int 
*)v; + unsigned int *bucket = v; *pos = ++(*bucket); if (*pos >= htable->cfg.size) { @@ -909,7 +909,7 @@ static void dl_seq_stop(struct seq_file *s, void *v) __releases(htable->lock) { struct xt_hashlimit_htable *htable = s->private; - unsigned int *bucket = (unsigned int *)v; + unsigned int *bucket = v; if (!IS_ERR(bucket)) kfree(bucket); @@ -980,7 +980,7 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family, static int dl_seq_show_v1(struct seq_file *s, void *v) { struct xt_hashlimit_htable *htable = s->private; - unsigned int *bucket = (unsigned int *)v; + unsigned int *bucket = v; struct dsthash_ent *ent; if (!hlist_empty(&htable->hash[*bucket])) { @@ -994,7 +994,7 @@ static int dl_seq_show_v1(struct seq_file *s, void *v) static int dl_seq_show(struct seq_file *s, void *v) { struct xt_hashlimit_htable *htable = s->private; - unsigned int *bucket = (unsigned int *)v; + unsigned int *bucket = v; struct dsthash_ent *ent; if (!hlist_empty(&htable->hash[*bucket])) { diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c index 0fdc89064488050bf4e667b1c09a26169a326872..42540d26c2b8ec34ab62422fe5894588df2306b6 100644 --- a/net/netfilter/xt_ipvs.c +++ b/net/netfilter/xt_ipvs.c @@ -116,7 +116,7 @@ ipvs_mt(const struct sk_buff *skb, struct xt_action_param *par) enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL || nf_ct_is_untracked(ct)) { + if (ct == NULL) { match = false; goto out_put_cp; } diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c index dab962df178795612580a1c8e22257213bdab07d..d27b5f1ea619f9696912b58bd5012358206725d7 100644 --- a/net/netfilter/xt_limit.c +++ b/net/netfilter/xt_limit.c @@ -18,6 +18,7 @@ #include struct xt_limit_priv { + spinlock_t lock; unsigned long prev; uint32_t credit; }; @@ -32,8 +33,6 @@ MODULE_ALIAS("ip6t_limit"); * see net/sched/sch_tbf.c in the linux source tree */ -static DEFINE_SPINLOCK(limit_lock); - /* Rusty: This is my (non-mathematically-inclined) understanding of this algorithm. The `average rate' in jiffies becomes your initial amount of credit `credit' and the most credit you can ever have @@ -72,7 +71,7 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par) struct xt_limit_priv *priv = r->master; unsigned long now = jiffies; - spin_lock_bh(&limit_lock); + spin_lock_bh(&priv->lock); priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY; if (priv->credit > r->credit_cap) priv->credit = r->credit_cap; @@ -80,11 +79,11 @@ limit_mt(const struct sk_buff *skb, struct xt_action_param *par) if (priv->credit >= r->cost) { /* We're not limited. */ priv->credit -= r->cost; - spin_unlock_bh(&limit_lock); + spin_unlock_bh(&priv->lock); return true; } - spin_unlock_bh(&limit_lock); + spin_unlock_bh(&priv->lock); return false; } @@ -126,6 +125,8 @@ static int limit_mt_check(const struct xt_mtchk_param *par) r->credit_cap = priv->credit; /* Credits full. 
*/ r->cost = user2credits(r->avg); } + spin_lock_init(&priv->lock); + return 0; } diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c index 1d89a4eaf841a33e95247badfba064ce7dad0f63..3f6c4fa78bdb717c067b57b53a0900529dcc4000 100644 --- a/net/netfilter/xt_recent.c +++ b/net/netfilter/xt_recent.c @@ -388,10 +388,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par, } sz = sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size; - if (sz <= PAGE_SIZE) - t = kzalloc(sz, GFP_KERNEL); - else - t = vzalloc(sz); + t = kvzalloc(sz, GFP_KERNEL); if (t == NULL) { ret = -ENOMEM; goto out; @@ -532,7 +529,7 @@ static int recent_seq_show(struct seq_file *seq, void *v) &e->addr.in6, e->ttl, e->stamps[i], e->index); for (i = 0; i < e->nstamps; i++) seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); - seq_printf(seq, "\n"); + seq_putc(seq, '\n'); return 0; } diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 770bbec878f149f5688584982a552440ff5351fd..e75ef39669c5a9a5b72c9a1cec8b72020600eae1 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -152,7 +152,7 @@ static int socket_mt_enable_defrag(struct net *net, int family) switch (family) { case NFPROTO_IPV4: return nf_defrag_ipv4_enable(net); -#ifdef XT_SOCKET_HAVE_IPV6 +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) case NFPROTO_IPV6: return nf_defrag_ipv6_enable(net); #endif diff --git a/net/netfilter/xt_state.c b/net/netfilter/xt_state.c index 5746a33789a50ced02746d57403970872f2cf29f..5fbd79194d21ee1f26fa669162b8de92d965f8c8 100644 --- a/net/netfilter/xt_state.c +++ b/net/netfilter/xt_state.c @@ -28,14 +28,13 @@ state_mt(const struct sk_buff *skb, struct xt_action_param *par) unsigned int statebit; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); - if (!ct) + if (ct) + statebit = XT_STATE_BIT(ctinfo); + else if (ctinfo == IP_CT_UNTRACKED) + statebit = XT_STATE_UNTRACKED; + else statebit = XT_STATE_INVALID; - else { - if (nf_ct_is_untracked(ct)) - statebit = XT_STATE_UNTRACKED; - else - statebit = XT_STATE_BIT(ctinfo); - } + return (sinfo->statemask & statebit); } diff --git a/net/netlabel/netlabel_cipso_v4.c b/net/netlabel/netlabel_cipso_v4.c index 4149d3e6358976f093dbcb25dee79d10f6275231..9aacf2da3d98feaf94319f7315b255323f4416f2 100644 --- a/net/netlabel/netlabel_cipso_v4.c +++ b/net/netlabel/netlabel_cipso_v4.c @@ -101,7 +101,7 @@ static int netlbl_cipsov4_add_common(struct genl_info *info, if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_TAGLST], NLBL_CIPSOV4_A_MAX, - netlbl_cipsov4_genl_policy) != 0) + netlbl_cipsov4_genl_policy, NULL) != 0) return -EINVAL; nla_for_each_nested(nla, info->attrs[NLBL_CIPSOV4_A_TAGLST], nla_rem) @@ -148,7 +148,7 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], NLBL_CIPSOV4_A_MAX, - netlbl_cipsov4_genl_policy) != 0) + netlbl_cipsov4_genl_policy, NULL) != 0) return -EINVAL; doi_def = kmalloc(sizeof(*doi_def), GFP_KERNEL); @@ -170,10 +170,10 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, info->attrs[NLBL_CIPSOV4_A_MLSLVLLST], nla_a_rem) if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSLVL) { - if (nla_validate_nested(nla_a, - NLBL_CIPSOV4_A_MAX, - netlbl_cipsov4_genl_policy) != 0) - goto add_std_failure; + if (nla_validate_nested(nla_a, NLBL_CIPSOV4_A_MAX, + netlbl_cipsov4_genl_policy, + NULL) != 0) + goto add_std_failure; nla_for_each_nested(nla_b, nla_a, nla_b_rem) switch (nla_type(nla_b)) { case NLBL_CIPSOV4_A_MLSLVLLOC: @@ -236,7 +236,7 @@ static int netlbl_cipsov4_add_std(struct 
genl_info *info, if (info->attrs[NLBL_CIPSOV4_A_MLSCATLST]) { if (nla_validate_nested(info->attrs[NLBL_CIPSOV4_A_MLSCATLST], NLBL_CIPSOV4_A_MAX, - netlbl_cipsov4_genl_policy) != 0) + netlbl_cipsov4_genl_policy, NULL) != 0) goto add_std_failure; nla_for_each_nested(nla_a, @@ -244,8 +244,9 @@ static int netlbl_cipsov4_add_std(struct genl_info *info, nla_a_rem) if (nla_type(nla_a) == NLBL_CIPSOV4_A_MLSCAT) { if (nla_validate_nested(nla_a, - NLBL_CIPSOV4_A_MAX, - netlbl_cipsov4_genl_policy) != 0) + NLBL_CIPSOV4_A_MAX, + netlbl_cipsov4_genl_policy, + NULL) != 0) goto add_std_failure; nla_for_each_nested(nla_b, nla_a, nla_b_rem) switch (nla_type(nla_b)) { diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 596eaff66649e5955d6c0f349f062b6d8360dc2d..ee841f00a6ec715914fb14bb31dc8405f8dd4c4e 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -78,14 +78,6 @@ struct listeners { /* state bits */ #define NETLINK_S_CONGESTED 0x0 -/* flags */ -#define NETLINK_F_KERNEL_SOCKET 0x1 -#define NETLINK_F_RECV_PKTINFO 0x2 -#define NETLINK_F_BROADCAST_SEND_ERROR 0x4 -#define NETLINK_F_RECV_NO_ENOBUFS 0x8 -#define NETLINK_F_LISTEN_ALL_NSID 0x10 -#define NETLINK_F_CAP_ACK 0x20 - static inline int netlink_is_kernel(struct sock *sk) { return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET; @@ -1660,6 +1652,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname, nlk->flags &= ~NETLINK_F_CAP_ACK; err = 0; break; + case NETLINK_EXT_ACK: + if (val) + nlk->flags |= NETLINK_F_EXT_ACK; + else + nlk->flags &= ~NETLINK_F_EXT_ACK; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -1744,6 +1743,15 @@ static int netlink_getsockopt(struct socket *sock, int level, int optname, return -EFAULT; err = 0; break; + case NETLINK_EXT_ACK: + if (len < sizeof(int)) + return -EINVAL; + len = sizeof(int); + val = nlk->flags & NETLINK_F_EXT_ACK ? 1 : 0; + if (put_user(len, optlen) || put_user(val, optval)) + return -EFAULT; + err = 0; + break; default: err = -ENOPROTOOPT; } @@ -2275,21 +2283,44 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, } EXPORT_SYMBOL(__netlink_dump_start); -void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) +void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, + const struct netlink_ext_ack *extack) { struct sk_buff *skb; struct nlmsghdr *rep; struct nlmsgerr *errmsg; size_t payload = sizeof(*errmsg); + size_t tlvlen = 0; struct netlink_sock *nlk = nlk_sk(NETLINK_CB(in_skb).sk); + unsigned int flags = 0; /* Error messages get the original request appened, unless the user - * requests to cap the error message. + * requests to cap the error message, and get extra error data if + * requested. 
*/ - if (!(nlk->flags & NETLINK_F_CAP_ACK) && err) - payload += nlmsg_len(nlh); + if (err) { + if (!(nlk->flags & NETLINK_F_CAP_ACK)) + payload += nlmsg_len(nlh); + else + flags |= NLM_F_CAPPED; + if (nlk->flags & NETLINK_F_EXT_ACK && extack) { + if (extack->_msg) + tlvlen += nla_total_size(strlen(extack->_msg) + 1); + if (extack->bad_attr) + tlvlen += nla_total_size(sizeof(u32)); + } + } else { + flags |= NLM_F_CAPPED; + + if (nlk->flags & NETLINK_F_EXT_ACK && + extack && extack->cookie_len) + tlvlen += nla_total_size(extack->cookie_len); + } + + if (tlvlen) + flags |= NLM_F_ACK_TLVS; - skb = nlmsg_new(payload, GFP_KERNEL); + skb = nlmsg_new(payload + tlvlen, GFP_KERNEL); if (!skb) { struct sock *sk; @@ -2305,17 +2336,42 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err) } rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - NLMSG_ERROR, payload, 0); + NLMSG_ERROR, payload, flags); errmsg = nlmsg_data(rep); errmsg->error = err; memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); + + if (nlk->flags & NETLINK_F_EXT_ACK && extack) { + if (err) { + if (extack->_msg) + WARN_ON(nla_put_string(skb, NLMSGERR_ATTR_MSG, + extack->_msg)); + if (extack->bad_attr && + !WARN_ON((u8 *)extack->bad_attr < in_skb->data || + (u8 *)extack->bad_attr >= in_skb->data + + in_skb->len)) + WARN_ON(nla_put_u32(skb, NLMSGERR_ATTR_OFFS, + (u8 *)extack->bad_attr - + in_skb->data)); + } else { + if (extack->cookie_len) + WARN_ON(nla_put(skb, NLMSGERR_ATTR_COOKIE, + extack->cookie_len, + extack->cookie)); + } + } + + nlmsg_end(skb, rep); + netlink_unicast(in_skb->sk, skb, NETLINK_CB(in_skb).portid, MSG_DONTWAIT); } EXPORT_SYMBOL(netlink_ack); int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, - struct nlmsghdr *)) + struct nlmsghdr *, + struct netlink_ext_ack *)) { + struct netlink_ext_ack extack = {}; struct nlmsghdr *nlh; int err; @@ -2336,13 +2392,13 @@ int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, if (nlh->nlmsg_type < NLMSG_MIN_TYPE) goto ack; - err = cb(skb, nlh); + err = cb(skb, nlh, &extack); if (err == -EINTR) goto skip; ack: if (nlh->nlmsg_flags & NLM_F_ACK || err) - netlink_ack(skb, nlh, err); + netlink_ack(skb, nlh, err, &extack); skip: msglen = NLMSG_ALIGN(nlh->nlmsg_len); diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index 4fdb3831897775547f77c069a8018c0d2a253c8c..3490f2430532cf753118e14883b52c4af151b18c 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -6,6 +6,15 @@ #include #include +/* flags */ +#define NETLINK_F_KERNEL_SOCKET 0x1 +#define NETLINK_F_RECV_PKTINFO 0x2 +#define NETLINK_F_BROADCAST_SEND_ERROR 0x4 +#define NETLINK_F_RECV_NO_ENOBUFS 0x8 +#define NETLINK_F_LISTEN_ALL_NSID 0x10 +#define NETLINK_F_CAP_ACK 0x20 +#define NETLINK_F_EXT_ACK 0x40 + #define NLGRPSZ(x) (ALIGN(x, sizeof(unsigned long) * 8) / 8) #define NLGRPLONGS(x) (NLGRPSZ(x)/sizeof(unsigned long)) diff --git a/net/netlink/diag.c b/net/netlink/diag.c index a5546249fb1022b52144a40717b8a4268755b972..8faa20b4d4573f3e2b510f03f3d4a85a70352559 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -19,6 +19,27 @@ static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) nlk->groups); } +static int sk_diag_put_flags(struct sock *sk, struct sk_buff *skb) +{ + struct netlink_sock *nlk = nlk_sk(sk); + u32 flags = 0; + + if (nlk->cb_running) + flags |= NDIAG_FLAG_CB_RUNNING; + if (nlk->flags & NETLINK_F_RECV_PKTINFO) + flags |= NDIAG_FLAG_PKTINFO; + if (nlk->flags & 
NETLINK_F_BROADCAST_SEND_ERROR) + flags |= NDIAG_FLAG_BROADCAST_ERROR; + if (nlk->flags & NETLINK_F_RECV_NO_ENOBUFS) + flags |= NDIAG_FLAG_NO_ENOBUFS; + if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID) + flags |= NDIAG_FLAG_LISTEN_ALL_NSID; + if (nlk->flags & NETLINK_F_CAP_ACK) + flags |= NDIAG_FLAG_CAP_ACK; + + return nla_put_u32(skb, NETLINK_DIAG_FLAGS, flags); +} + static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, struct netlink_diag_req *req, u32 portid, u32 seq, u32 flags, int sk_ino) @@ -52,6 +73,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb, sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) goto out_nlmsg_trim; + if ((req->ndiag_show & NDIAG_SHOW_FLAGS) && + sk_diag_put_flags(sk, skb)) + goto out_nlmsg_trim; + nlmsg_end(skb, nlh); return 0; diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 92e0981f74040d7029b65863167b459322612024..10f8b4cff40accd99898ea4d0696a2ba5d90080c 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -497,7 +497,8 @@ static int genl_lock_done(struct netlink_callback *cb) static int genl_family_rcv_msg(const struct genl_family *family, struct sk_buff *skb, - struct nlmsghdr *nlh) + struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { const struct genl_ops *ops; struct net *net = sock_net(skb->sk); @@ -573,7 +574,7 @@ static int genl_family_rcv_msg(const struct genl_family *family, if (attrbuf) { err = nlmsg_parse(nlh, hdrlen, attrbuf, family->maxattr, - ops->policy); + ops->policy, extack); if (err < 0) goto out; } @@ -584,6 +585,7 @@ static int genl_family_rcv_msg(const struct genl_family *family, info.genlhdr = nlmsg_data(nlh); info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = attrbuf; + info.extack = extack; genl_info_net_set(&info, net); memset(&info.user_ptr, 0, sizeof(info.user_ptr)); @@ -605,7 +607,8 @@ static int genl_family_rcv_msg(const struct genl_family *family, return err; } -static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) +static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { const struct genl_family *family; int err; @@ -617,7 +620,7 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (!family->parallel_ops) genl_lock(); - err = genl_family_rcv_msg(family, skb, nlh); + err = genl_family_rcv_msg(family, skb, nlh, extack); if (!family->parallel_ops) genl_unlock(); diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c index 03f3d5c7beb8d173fae25456b278ee9cf3e04c5e..6b0850e63e09cf747a498f913f7eabe11e59c380 100644 --- a/net/nfc/netlink.c +++ b/net/nfc/netlink.c @@ -119,7 +119,8 @@ static struct nfc_dev *__get_device_from_cb(struct netlink_callback *cb) u32 idx; rc = nlmsg_parse(cb->nlh, GENL_HDRLEN + nfc_genl_family.hdrsize, - attrbuf, nfc_genl_family.maxattr, nfc_genl_policy); + attrbuf, nfc_genl_family.maxattr, nfc_genl_policy, + NULL); if (rc < 0) return ERR_PTR(rc); @@ -303,6 +304,17 @@ int nfc_genl_tm_deactivated(struct nfc_dev *dev) return -EMSGSIZE; } +static int nfc_genl_setup_device_added(struct nfc_dev *dev, struct sk_buff *msg) +{ + if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || + nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || + nla_put_u32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols) || + nla_put_u8(msg, NFC_ATTR_DEVICE_POWERED, dev->dev_up) || + nla_put_u8(msg, NFC_ATTR_RF_MODE, dev->rf_mode)) + return -1; + return 0; +} + int nfc_genl_device_added(struct nfc_dev *dev) { struct sk_buff *msg; @@ -317,10 +329,7 @@ int 
nfc_genl_device_added(struct nfc_dev *dev) if (!hdr) goto free_msg; - if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || - nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || - nla_put_u32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols) || - nla_put_u8(msg, NFC_ATTR_DEVICE_POWERED, dev->dev_up)) + if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -596,11 +605,7 @@ static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev, if (cb) genl_dump_check_consistent(cb, hdr, &nfc_genl_family); - if (nla_put_string(msg, NFC_ATTR_DEVICE_NAME, nfc_device_name(dev)) || - nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) || - nla_put_u32(msg, NFC_ATTR_PROTOCOLS, dev->supported_protocols) || - nla_put_u8(msg, NFC_ATTR_DEVICE_POWERED, dev->dev_up) || - nla_put_u8(msg, NFC_ATTR_RF_MODE, dev->rf_mode)) + if (nfc_genl_setup_device_added(dev, msg)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -918,7 +923,7 @@ static int nfc_genl_activate_target(struct sk_buff *skb, struct genl_info *info) rc = nfc_activate_target(dev, target_idx, protocol); nfc_put_device(dev); - return 0; + return rc; } static int nfc_genl_dep_link_up(struct sk_buff *skb, struct genl_info *info) @@ -1161,7 +1166,7 @@ static int nfc_genl_llc_sdreq(struct sk_buff *skb, struct genl_info *info) nla_for_each_nested(attr, info->attrs[NFC_ATTR_LLC_SDP], rem) { rc = nla_parse_nested(sdp_attrs, NFC_SDP_ATTR_MAX, attr, - nfc_sdp_genl_policy); + nfc_sdp_genl_policy, info->extack); if (rc != 0) { rc = -EINVAL; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index c82301ce3fffb6caeb41a9882a53289ec7b63c8d..e4610676299bcdac626db1a30cd4da44ccc62c0b 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2014 Nicira, Inc. + * Copyright (c) 2007-2017 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -44,13 +44,10 @@ #include "conntrack.h" #include "vport.h" -static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, - struct sw_flow_key *key, - const struct nlattr *attr, int len); - struct deferred_action { struct sk_buff *skb; const struct nlattr *actions; + int actions_len; /* Store pkt_key clone when creating deferred action. */ struct sw_flow_key pkt_key; @@ -82,14 +79,31 @@ struct action_fifo { struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE]; }; -struct recirc_keys { +struct action_flow_keys { struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD]; }; static struct action_fifo __percpu *action_fifos; -static struct recirc_keys __percpu *recirc_keys; +static struct action_flow_keys __percpu *flow_keys; static DEFINE_PER_CPU(int, exec_actions_level); +/* Make a clone of the 'key', using the pre-allocated percpu 'flow_keys' + * space. Return NULL if out of key spaces. 
+ */ +static struct sw_flow_key *clone_key(const struct sw_flow_key *key_) +{ + struct action_flow_keys *keys = this_cpu_ptr(flow_keys); + int level = this_cpu_read(exec_actions_level); + struct sw_flow_key *key = NULL; + + if (level <= OVS_DEFERRED_ACTION_THRESHOLD) { + key = &keys->key[level - 1]; + *key = *key_; + } + + return key; +} + static void action_fifo_init(struct action_fifo *fifo) { fifo->head = 0; @@ -119,8 +133,9 @@ static struct deferred_action *action_fifo_put(struct action_fifo *fifo) /* Return true if fifo is not full */ static struct deferred_action *add_deferred_actions(struct sk_buff *skb, - const struct sw_flow_key *key, - const struct nlattr *attr) + const struct sw_flow_key *key, + const struct nlattr *actions, + const int actions_len) { struct action_fifo *fifo; struct deferred_action *da; @@ -129,7 +144,8 @@ static struct deferred_action *add_deferred_actions(struct sk_buff *skb, da = action_fifo_put(fifo); if (da) { da->skb = skb; - da->actions = attr; + da->actions = actions; + da->actions_len = actions_len; da->pkt_key = *key; } @@ -146,6 +162,12 @@ static bool is_flow_key_valid(const struct sw_flow_key *key) return !(key->mac_proto & SW_FLOW_KEY_INVALID); } +static int clone_execute(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, + u32 recirc_id, + const struct nlattr *actions, int len, + bool last, bool clone_flow_key); + static void update_ethertype(struct sk_buff *skb, struct ethhdr *hdr, __be16 ethertype) { @@ -908,72 +930,35 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, return ovs_dp_upcall(dp, skb, key, &upcall, cutlen); } +/* When 'last' is true, sample() should always consume the 'skb'. + * Otherwise, sample() should keep 'skb' intact regardless of what + * actions are executed within sample(). + */ static int sample(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, const struct nlattr *attr, - const struct nlattr *actions, int actions_len) + bool last) { - const struct nlattr *acts_list = NULL; - const struct nlattr *a; - int rem; - u32 cutlen = 0; - - for (a = nla_data(attr), rem = nla_len(attr); rem > 0; - a = nla_next(a, &rem)) { - u32 probability; - - switch (nla_type(a)) { - case OVS_SAMPLE_ATTR_PROBABILITY: - probability = nla_get_u32(a); - if (!probability || prandom_u32() > probability) - return 0; - break; - - case OVS_SAMPLE_ATTR_ACTIONS: - acts_list = a; - break; - } - } - - rem = nla_len(acts_list); - a = nla_data(acts_list); - - /* Actions list is empty, do nothing */ - if (unlikely(!rem)) + struct nlattr *actions; + struct nlattr *sample_arg; + int rem = nla_len(attr); + const struct sample_arg *arg; + bool clone_flow_key; + + /* The first action is always 'OVS_SAMPLE_ATTR_ARG'. */ + sample_arg = nla_data(attr); + arg = nla_data(sample_arg); + actions = nla_next(sample_arg, &rem); + + if ((arg->probability != U32_MAX) && + (!arg->probability || prandom_u32() > arg->probability)) { + if (last) + consume_skb(skb); return 0; - - /* The only known usage of sample action is having a single user-space - * action, or having a truncate action followed by a single user-space - * action. Treat this usage as a special case. - * The output_userspace() should clone the skb to be sent to the - * user space. This skb will be consumed by its caller.
- */ - if (unlikely(nla_type(a) == OVS_ACTION_ATTR_TRUNC)) { - struct ovs_action_trunc *trunc = nla_data(a); - - if (skb->len > trunc->max_len) - cutlen = skb->len - trunc->max_len; - - a = nla_next(a, &rem); } - if (likely(nla_type(a) == OVS_ACTION_ATTR_USERSPACE && - nla_is_last(a, rem))) - return output_userspace(dp, skb, key, a, actions, - actions_len, cutlen); - - skb = skb_clone(skb, GFP_ATOMIC); - if (!skb) - /* Skip the sample action when out of memory. */ - return 0; - - if (!add_deferred_actions(skb, key, a)) { - if (net_ratelimit()) - pr_warn("%s: deferred actions limit reached, dropping sample action\n", - ovs_dp_name(dp)); - - kfree_skb(skb); - } - return 0; + clone_flow_key = !arg->exec; + return clone_execute(dp, skb, key, 0, actions, rem, last, + clone_flow_key); } static void execute_hash(struct sk_buff *skb, struct sw_flow_key *key, @@ -1084,10 +1069,9 @@ static int execute_masked_set_action(struct sk_buff *skb, static int execute_recirc(struct datapath *dp, struct sk_buff *skb, struct sw_flow_key *key, - const struct nlattr *a, int rem) + const struct nlattr *a, bool last) { - struct deferred_action *da; - int level; + u32 recirc_id; if (!is_flow_key_valid(key)) { int err; @@ -1098,43 +1082,8 @@ static int execute_recirc(struct datapath *dp, struct sk_buff *skb, } BUG_ON(!is_flow_key_valid(key)); - if (!nla_is_last(a, rem)) { - /* Recirc action is the not the last action - * of the action list, need to clone the skb. - */ - skb = skb_clone(skb, GFP_ATOMIC); - - /* Skip the recirc action when out of memory, but - * continue on with the rest of the action list. - */ - if (!skb) - return 0; - } - - level = this_cpu_read(exec_actions_level); - if (level <= OVS_DEFERRED_ACTION_THRESHOLD) { - struct recirc_keys *rks = this_cpu_ptr(recirc_keys); - struct sw_flow_key *recirc_key = &rks->key[level - 1]; - - *recirc_key = *key; - recirc_key->recirc_id = nla_get_u32(a); - ovs_dp_process_packet(skb, recirc_key); - - return 0; - } - - da = add_deferred_actions(skb, key, NULL); - if (da) { - da->pkt_key.recirc_id = nla_get_u32(a); - } else { - kfree_skb(skb); - - if (net_ratelimit()) - pr_warn("%s: deferred action limit reached, drop recirc action\n", - ovs_dp_name(dp)); - } - - return 0; + recirc_id = nla_get_u32(a); + return clone_execute(dp, skb, key, recirc_id, NULL, 0, last, true); } /* Execute a list of actions against 'skb'. */ @@ -1206,9 +1155,11 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, err = pop_vlan(skb, key); break; - case OVS_ACTION_ATTR_RECIRC: - err = execute_recirc(dp, skb, key, a, rem); - if (nla_is_last(a, rem)) { + case OVS_ACTION_ATTR_RECIRC: { + bool last = nla_is_last(a, rem); + + err = execute_recirc(dp, skb, key, a, last); + if (last) { /* If this is the last action, the skb has * been consumed or freed. * Return immediately. 
@@ -1216,6 +1167,7 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, return err; } break; + } case OVS_ACTION_ATTR_SET: err = execute_set_action(skb, key, nla_data(a)); @@ -1226,9 +1178,15 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, err = execute_masked_set_action(skb, key, nla_data(a)); break; - case OVS_ACTION_ATTR_SAMPLE: - err = sample(dp, skb, key, a, attr, len); + case OVS_ACTION_ATTR_SAMPLE: { + bool last = nla_is_last(a, rem); + + err = sample(dp, skb, key, a, last); + if (last) + return err; + + break; + } case OVS_ACTION_ATTR_CT: if (!is_flow_key_valid(key)) { @@ -1264,6 +1222,79 @@ static int do_execute_actions(struct datapath *dp, struct sk_buff *skb, return 0; } +/* Execute the actions on the clone of the packet. The effect of the + * execution does not affect the original 'skb' nor the original 'key'. + * + * The execution may be deferred in case the actions cannot be executed + * immediately. + */ +static int clone_execute(struct datapath *dp, struct sk_buff *skb, + struct sw_flow_key *key, u32 recirc_id, + const struct nlattr *actions, int len, + bool last, bool clone_flow_key) +{ + struct deferred_action *da; + struct sw_flow_key *clone; + + skb = last ? skb : skb_clone(skb, GFP_ATOMIC); + if (!skb) { + /* Out of memory, skip this action. + */ + return 0; + } + + /* When clone_flow_key is false, the 'key' will not be changed + * by the actions, so the 'key' can be used directly. + * Otherwise, try to clone key from the next recursion level of + * 'flow_keys'. If clone is successful, execute the actions + * without deferring. + */ + clone = clone_flow_key ? clone_key(key) : key; + if (clone) { + int err = 0; + + if (actions) { /* Sample action */ + if (clone_flow_key) + __this_cpu_inc(exec_actions_level); + + err = do_execute_actions(dp, skb, clone, + actions, len); + + if (clone_flow_key) + __this_cpu_dec(exec_actions_level); + } else { /* Recirc action */ + clone->recirc_id = recirc_id; + ovs_dp_process_packet(skb, clone); + } + return err; + } + + /* Out of 'flow_keys' space. Defer actions */ + da = add_deferred_actions(skb, key, actions, len); + if (da) { + if (!actions) { /* Recirc action */ + key = &da->pkt_key; + key->recirc_id = recirc_id; + } + } else { + /* Out of per CPU action FIFO space. Drop the 'skb' and + * log an error.
+ */ + kfree_skb(skb); + + if (net_ratelimit()) { + if (actions) { /* Sample action */ + pr_warn("%s: deferred action limit reached, drop sample action\n", + ovs_dp_name(dp)); + } else { /* Recirc action */ + pr_warn("%s: deferred action limit reached, drop recirc action\n", + ovs_dp_name(dp)); + } + } + } + return 0; +} + static void process_deferred_actions(struct datapath *dp) { struct action_fifo *fifo = this_cpu_ptr(action_fifos); @@ -1278,10 +1309,10 @@ static void process_deferred_actions(struct datapath *dp) struct sk_buff *skb = da->skb; struct sw_flow_key *key = &da->pkt_key; const struct nlattr *actions = da->actions; + int actions_len = da->actions_len; if (actions) - do_execute_actions(dp, skb, key, actions, - nla_len(actions)); + do_execute_actions(dp, skb, key, actions, actions_len); else ovs_dp_process_packet(skb, key); } while (!action_fifo_is_empty(fifo)); @@ -1323,8 +1354,8 @@ int action_fifos_init(void) if (!action_fifos) return -ENOMEM; - recirc_keys = alloc_percpu(struct recirc_keys); - if (!recirc_keys) { + flow_keys = alloc_percpu(struct action_flow_keys); + if (!flow_keys) { free_percpu(action_fifos); return -ENOMEM; } @@ -1335,5 +1366,5 @@ int action_fifos_init(void) void action_fifos_exit(void) { free_percpu(action_fifos); - free_percpu(recirc_keys); + free_percpu(flow_keys); } diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 7b2c2fce408a02d4251f03a2e3f0b4d9e7fccb80..bf602e33c40af4896240c9cc0566fa10126cf662 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -66,7 +66,9 @@ struct ovs_conntrack_info { u8 commit : 1; u8 nat : 3; /* enum ovs_ct_nat */ u8 force : 1; + u8 have_eventmask : 1; u16 family; + u32 eventmask; /* Mask of 1 << IPCT_*. */ struct md_mark mark; struct md_labels labels; #ifdef CONFIG_NF_NAT_NEEDED @@ -373,7 +375,7 @@ static int ovs_ct_init_labels(struct nf_conn *ct, struct sw_flow_key *key, } /* Labels are included in the IPCTNL_MSG_CT_NEW event only if the - * IPCT_LABEL bit it set in the event cache. + * IPCT_LABEL bit is set in the event cache. */ nf_conntrack_event_cache(IPCT_LABEL, ct); @@ -514,10 +516,38 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, u16 proto, const struct sk_buff *skb) { struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) return NULL; - return __nf_ct_expect_find(net, zone, &tuple); + + exp = __nf_ct_expect_find(net, zone, &tuple); + if (exp) { + struct nf_conntrack_tuple_hash *h; + + /* Delete existing conntrack entry, if it clashes with the + * expectation. This can happen since conntrack ALGs do not + * check for clashes between (new) expectations and existing + * conntrack entries. nf_conntrack_in() will check the + * expectations only if a conntrack entry can not be found, + * which can lead to OVS finding the expectation (here) in the + * init direction, but which will not be removed by the + * nf_conntrack_in() call, if a matching conntrack entry is + * found instead. In this case all init direction packets + * would be reported as new related packets, while reply + * direction packets would be reported as un-related + * established packets. + */ + h = nf_conntrack_find_get(net, zone, &tuple); + if (h) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + nf_ct_delete(ct, 0, 0); + nf_conntrack_put(&ct->ct_general); + } + } + + return exp; } /* This replicates logic from nf_conntrack_core.c that is not exported. 
*/ @@ -795,11 +825,6 @@ static int ovs_ct_nat(struct net *net, struct sw_flow_key *key, enum nf_nat_manip_type maniptype; int err; - if (nf_ct_is_untracked(ct)) { - /* A NAT action may only be performed on tracked packets. */ - return NF_ACCEPT; - } - /* Add NAT extension if not confirmed yet. */ if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct)) return NF_ACCEPT; /* Can't NAT. */ @@ -1007,6 +1032,20 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key, if (!ct) return 0; + /* Set the conntrack event mask if given. NEW and DELETE events have + * their own groups, but the NFNLGRP_CONNTRACK_UPDATE group listener + * typically would receive many kinds of updates. Setting the event + * mask allows those events to be filtered. The set event mask will + * remain in effect for the lifetime of the connection unless changed + * by a further CT action with both the commit flag and the eventmask + * option. */ + if (info->have_eventmask) { + struct nf_conntrack_ecache *cache = nf_ct_ecache_find(ct); + + if (cache) + cache->ctmask = info->eventmask; + } + /* Apply changes before confirming the connection so that the initial * conntrack NEW netlink event carries the values given in the CT * action. @@ -1238,6 +1277,8 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = { /* NAT length is checked when parsing the nested attributes. */ [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX }, #endif + [OVS_CT_ATTR_EVENTMASK] = { .minlen = sizeof(u32), + .maxlen = sizeof(u32) }, }; static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, @@ -1316,6 +1357,11 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info, break; } #endif + case OVS_CT_ATTR_EVENTMASK: + info->have_eventmask = true; + info->eventmask = nla_get_u32(a); + break; + default: OVS_NLERR(log, "Unknown conntrack attr (%d)", type); @@ -1515,6 +1561,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info, ct_info->helper->name)) return -EMSGSIZE; } + if (ct_info->have_eventmask && + nla_put_u32(skb, OVS_CT_ATTR_EVENTMASK, ct_info->eventmask)) + return -EMSGSIZE; + #ifdef CONFIG_NF_NAT_NEEDED if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb)) return -EMSGSIZE; diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index 9c62b6325f7adc92420b605cf4cdc3b26da437fc..7b17da9a94a0c15ab8ec28205e025d9554c798fb 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1353,7 +1353,7 @@ static int ovs_flow_cmd_dump(struct sk_buff *skb, struct netlink_callback *cb) int err; err = genlmsg_parse(cb->nlh, &dp_flow_genl_family, a, - OVS_FLOW_ATTR_MAX, flow_policy); + OVS_FLOW_ATTR_MAX, flow_policy, NULL); if (err) return err; ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]); diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h index 1c6e9377436df1e93081c825c142b712a811277b..da931bdef8a7b5f25c189a5fe76e5fc2c4c1efdf 100644 --- a/net/openvswitch/datapath.h +++ b/net/openvswitch/datapath.h @@ -34,8 +34,6 @@ #define DP_MAX_PORTS USHRT_MAX #define DP_VPORT_HASH_BUCKETS 1024 -#define SAMPLE_ACTION_DEPTH 3 - /** * struct dp_stats_percpu - per-cpu packet processing statistics for a given * datapath. 
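The actions.c changes above, together with the removal of SAMPLE_ACTION_DEPTH from datapath.h, replace a hard recursion cap with a per-CPU stack of flow keys (clone_key(), indexed by exec_actions_level) that falls back to the deferred-action FIFO when the stack is exhausted. The standalone sketch below models that pattern on a single CPU; the names (key_stack, fifo, MAX_DEPTH) and the demo workload are illustrative, not the kernel's, and the percpu machinery is collapsed into plain globals.

#include <stdio.h>

#define KEY_STACK_DEPTH	4	/* stands in for OVS_DEFERRED_ACTION_THRESHOLD */
#define FIFO_SIZE	10	/* stands in for DEFERRED_ACTION_FIFO_SIZE */
#define MAX_DEPTH	6	/* how deep the demo workload recirculates */

struct flow_key { unsigned int recirc_id; };
struct deferred { struct flow_key key; int depth; };

static struct flow_key key_stack[KEY_STACK_DEPTH];	/* percpu in the kernel */
static int level = 1;					/* exec_actions_level */
static struct deferred fifo[FIFO_SIZE];			/* deferred-action FIFO */
static int fifo_head, fifo_tail;

static void do_actions(struct flow_key *key, int depth);

/* Mirrors clone_key(): hand out the key slot for the current nesting
 * level, or NULL once the stack is exhausted. */
static struct flow_key *clone_key(const struct flow_key *key)
{
	if (level > KEY_STACK_DEPTH)
		return NULL;
	key_stack[level - 1] = *key;
	return &key_stack[level - 1];
}

/* Mirrors clone_execute(): run nested actions now if a key slot is
 * free, defer them if not, and drop with a log message when the FIFO
 * is full as well. */
static void clone_execute(const struct flow_key *key, int depth)
{
	struct flow_key *clone = clone_key(key);

	if (clone) {
		level++;
		do_actions(clone, depth);
		level--;
	} else if (fifo_tail < FIFO_SIZE) {
		fifo[fifo_tail].key = *key;
		fifo[fifo_tail].depth = depth;
		fifo_tail++;
	} else {
		printf("deferred action limit reached, dropping\n");
	}
}

/* Demo workload: each "action list" recirculates once until MAX_DEPTH. */
static void do_actions(struct flow_key *key, int depth)
{
	printf("depth %d, level %d, recirc_id %u\n", depth, level, key->recirc_id);
	if (depth < MAX_DEPTH) {
		key->recirc_id = depth + 1;
		clone_execute(key, depth + 1);
	}
}

int main(void)
{
	struct flow_key key = { 0 };

	do_actions(&key, 0);
	/* Equivalent of process_deferred_actions(). */
	while (fifo_head < fifo_tail) {
		struct deferred *d = &fifo[fifo_head++];

		do_actions(&d->key, d->depth);
	}
	return 0;
}

Running this shows depths 1 through KEY_STACK_DEPTH executing inline while the next level is parked in the FIFO and drained afterwards, which is exactly why the explicit depth check could be dropped from __ovs_nla_copy_actions().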
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 1105a838bab83f9fe647423060d08e4a415bab61..7e1d8a2afa63687aeb87121dbae614bad2ec34a3 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2014 Nicira, Inc. + * Copyright (c) 2007-2017 Nicira, Inc. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -59,6 +59,39 @@ struct ovs_len_tbl { #define OVS_ATTR_NESTED -1 #define OVS_ATTR_VARIABLE -2 +static bool actions_may_change_flow(const struct nlattr *actions) +{ + struct nlattr *nla; + int rem; + + nla_for_each_nested(nla, actions, rem) { + u16 action = nla_type(nla); + + switch (action) { + case OVS_ACTION_ATTR_OUTPUT: + case OVS_ACTION_ATTR_RECIRC: + case OVS_ACTION_ATTR_TRUNC: + case OVS_ACTION_ATTR_USERSPACE: + break; + + case OVS_ACTION_ATTR_CT: + case OVS_ACTION_ATTR_HASH: + case OVS_ACTION_ATTR_POP_ETH: + case OVS_ACTION_ATTR_POP_MPLS: + case OVS_ACTION_ATTR_POP_VLAN: + case OVS_ACTION_ATTR_PUSH_ETH: + case OVS_ACTION_ATTR_PUSH_MPLS: + case OVS_ACTION_ATTR_PUSH_VLAN: + case OVS_ACTION_ATTR_SAMPLE: + case OVS_ACTION_ATTR_SET: + case OVS_ACTION_ATTR_SET_MASKED: + default: + return true; + } + } + return false; +} + static void update_range(struct sw_flow_match *match, size_t offset, size_t size, bool is_mask) { @@ -2023,18 +2056,20 @@ static inline void add_nested_action_end(struct sw_flow_actions *sfa, static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, - int depth, struct sw_flow_actions **sfa, + struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log); static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, - const struct sw_flow_key *key, int depth, + const struct sw_flow_key *key, struct sw_flow_actions **sfa, - __be16 eth_type, __be16 vlan_tci, bool log) + __be16 eth_type, __be16 vlan_tci, + bool log, bool last) { const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1]; const struct nlattr *probability, *actions; const struct nlattr *a; - int rem, start, err, st_acts; + int rem, start, err; + struct sample_arg arg; memset(attrs, 0, sizeof(attrs)); nla_for_each_nested(a, attr, rem) { @@ -2058,20 +2093,32 @@ static int validate_and_copy_sample(struct net *net, const struct nlattr *attr, start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE, log); if (start < 0) return start; - err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, - nla_data(probability), sizeof(u32), log); + + /* When both skb and flow may be changed, put the sample + * into a deferred fifo. On the other hand, if only skb + * may be modified, the actions can be executed in place. + * + * Do this analysis at the flow installation time. + * Set 'clone_action->exec' to true if the actions can be + * executed without being deferred. + * + * If the sample is the last action, it can always be executed + * rather than deferred.
+ */ + arg.exec = last || !actions_may_change_flow(actions); + arg.probability = nla_get_u32(probability); + + err = ovs_nla_add_action(sfa, OVS_SAMPLE_ATTR_ARG, &arg, sizeof(arg), + log); if (err) return err; - st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS, log); - if (st_acts < 0) - return st_acts; - err = __ovs_nla_copy_actions(net, actions, key, depth + 1, sfa, + err = __ovs_nla_copy_actions(net, actions, key, sfa, eth_type, vlan_tci, log); + if (err) return err; - add_nested_action_end(*sfa, st_acts); add_nested_action_end(*sfa, start); return 0; @@ -2380,8 +2427,8 @@ static int validate_userspace(const struct nlattr *attr) struct nlattr *a[OVS_USERSPACE_ATTR_MAX + 1]; int error; - error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, - attr, userspace_policy); + error = nla_parse_nested(a, OVS_USERSPACE_ATTR_MAX, attr, + userspace_policy, NULL); if (error) return error; @@ -2408,16 +2455,13 @@ static int copy_action(const struct nlattr *from, static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, const struct sw_flow_key *key, - int depth, struct sw_flow_actions **sfa, + struct sw_flow_actions **sfa, __be16 eth_type, __be16 vlan_tci, bool log) { u8 mac_proto = ovs_key_mac_proto(key); const struct nlattr *a; int rem, err; - if (depth >= SAMPLE_ACTION_DEPTH) - return -EOVERFLOW; - nla_for_each_nested(a, attr, rem) { /* Expected argument lengths, (u32)-1 for variable length. */ static const u32 action_lens[OVS_ACTION_ATTR_MAX + 1] = { @@ -2555,13 +2599,17 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return err; break; - case OVS_ACTION_ATTR_SAMPLE: - err = validate_and_copy_sample(net, a, key, depth, sfa, - eth_type, vlan_tci, log); + case OVS_ACTION_ATTR_SAMPLE: { + bool last = nla_is_last(a, rem); + + err = validate_and_copy_sample(net, a, key, sfa, + eth_type, vlan_tci, + log, last); if (err) return err; skip_copy = true; break; + } case OVS_ACTION_ATTR_CT: err = ovs_ct_copy_action(net, a, key, sfa, log); @@ -2615,7 +2663,7 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return PTR_ERR(*sfa); (*sfa)->orig_len = nla_len(attr); - err = __ovs_nla_copy_actions(net, attr, key, 0, sfa, key->eth.type, + err = __ovs_nla_copy_actions(net, attr, key, sfa, key->eth.type, key->eth.vlan.tci, log); if (err) ovs_nla_free_flow_actions(*sfa); @@ -2623,39 +2671,44 @@ int ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, return err; } -static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb) +static int sample_action_to_attr(const struct nlattr *attr, + struct sk_buff *skb) { - const struct nlattr *a; - struct nlattr *start; - int err = 0, rem; + struct nlattr *start, *ac_start = NULL, *sample_arg; + int err = 0, rem = nla_len(attr); + const struct sample_arg *arg; + struct nlattr *actions; start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE); if (!start) return -EMSGSIZE; - nla_for_each_nested(a, attr, rem) { - int type = nla_type(a); - struct nlattr *st_sample; + sample_arg = nla_data(attr); + arg = nla_data(sample_arg); + actions = nla_next(sample_arg, &rem); - switch (type) { - case OVS_SAMPLE_ATTR_PROBABILITY: - if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, - sizeof(u32), nla_data(a))) - return -EMSGSIZE; - break; - case OVS_SAMPLE_ATTR_ACTIONS: - st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); - if (!st_sample) - return -EMSGSIZE; - err = ovs_nla_put_actions(nla_data(a), nla_len(a), skb); - if (err) - return err; - nla_nest_end(skb, st_sample); - break; 
- } + if (nla_put_u32(skb, OVS_SAMPLE_ATTR_PROBABILITY, arg->probability)) { + err = -EMSGSIZE; + goto out; + } + + ac_start = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS); + if (!ac_start) { + err = -EMSGSIZE; + goto out; + } + + err = ovs_nla_put_actions(actions, rem, skb); + +out: + if (err) { + nla_nest_cancel(skb, ac_start); + nla_nest_cancel(skb, start); + } else { + nla_nest_end(skb, ac_start); + nla_nest_end(skb, start); } - nla_nest_end(skb, start); return err; } diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c index 7eb955e453e6d657d13d0aa7b35ce7c8b7de2f15..869acb3b3d3f25a75ee4912359ee0fea31fa7c2d 100644 --- a/net/openvswitch/vport-vxlan.c +++ b/net/openvswitch/vport-vxlan.c @@ -70,7 +70,8 @@ static int vxlan_configure_exts(struct vport *vport, struct nlattr *attr, if (nla_len(attr) < sizeof(struct nlattr)) return -EINVAL; - err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy); + err = nla_parse_nested(exts, OVS_VXLAN_EXT_MAX, attr, exts_policy, + NULL); if (err < 0) return err; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 8489beff5c25c971067f38833ed4a790a98dd86e..f4001763134da35ed5f0f04bc2836015bb15a4af 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1496,6 +1496,7 @@ static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev, DEFINE_MUTEX(fanout_mutex); EXPORT_SYMBOL_GPL(fanout_mutex); static LIST_HEAD(fanout_list); +static u16 fanout_next_id; static void __fanout_link(struct sock *sk, struct packet_sock *po) { @@ -1629,6 +1630,36 @@ static void fanout_release_data(struct packet_fanout *f) }; } +static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id) +{ + struct packet_fanout *f; + + list_for_each_entry(f, &fanout_list, list) { + if (f->id == candidate_id && + read_pnet(&f->net) == sock_net(sk)) { + return false; + } + } + return true; +} + +static bool fanout_find_new_id(struct sock *sk, u16 *new_id) +{ + u16 id = fanout_next_id; + + do { + if (__fanout_id_is_free(sk, id)) { + *new_id = id; + fanout_next_id = id + 1; + return true; + } + + id++; + } while (id != fanout_next_id); + + return false; +} + static int fanout_add(struct sock *sk, u16 id, u16 type_flags) { struct packet_rollover *rollover = NULL; @@ -1676,6 +1707,19 @@ static int fanout_add(struct sock *sk, u16 id, u16 type_flags) po->rollover = rollover; } + if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) { + if (id != 0) { + err = -EINVAL; + goto out; + } + if (!fanout_find_new_id(sk, &id)) { + err = -ENOMEM; + goto out; + } + /* ephemeral flag for the first socket in the group: drop it */ + flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8); + } + match = NULL; list_for_each_entry(f, &fanout_list, list) { if (f->id == id && @@ -3836,6 +3880,8 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, case PACKET_HDRLEN: if (len > sizeof(int)) len = sizeof(int); + if (len < sizeof(int)) + return -EINVAL; if (copy_from_user(&val, optval, len)) return -EFAULT; switch (val) { diff --git a/net/phonet/pn_netlink.c b/net/phonet/pn_netlink.c index bc5ee5fbe6ae3845b6ec67eaea2beaec2d12ba08..45b3af3080d8349f9e9c6e7f424f0548a79a42b0 100644 --- a/net/phonet/pn_netlink.c +++ b/net/phonet/pn_netlink.c @@ -61,7 +61,8 @@ static const struct nla_policy ifa_phonet_policy[IFA_MAX+1] = { [IFA_LOCAL] = { .type = NLA_U8 }, }; -static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh) +static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = 
sock_net(skb->sk); struct nlattr *tb[IFA_MAX+1]; @@ -78,7 +79,8 @@ static int addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh) ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_phonet_policy); + err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, ifa_phonet_policy, + extack); if (err < 0) return err; @@ -226,7 +228,8 @@ static const struct nla_policy rtm_phonet_policy[RTA_MAX+1] = { [RTA_OIF] = { .type = NLA_U32 }, }; -static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh) +static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tb[RTA_MAX+1]; @@ -243,7 +246,8 @@ static int route_doit(struct sk_buff *skb, struct nlmsghdr *nlh) ASSERT_RTNL(); - err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_phonet_policy); + err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_phonet_policy, + extack); if (err < 0) return err; diff --git a/net/qrtr/Kconfig b/net/qrtr/Kconfig index b83c6807a5ae5cedc63b073c7f928b1776f42524..326fd97444f5bed5c82af7d632d8a4424f9908d3 100644 --- a/net/qrtr/Kconfig +++ b/net/qrtr/Kconfig @@ -16,7 +16,7 @@ if QRTR config QRTR_SMD tristate "SMD IPC Router channels" - depends on QCOM_SMD || (COMPILE_TEST && QCOM_SMD=n) + depends on RPMSG || (COMPILE_TEST && RPMSG=n) ---help--- Say Y here to support SMD based ipcrouter channels. SMD is the most common transport for IPC Router. diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index ae5ac175b2bef96ffa614bc799db5cd90a7bdc08..a9a8c7d5a4a983b9be12fe85a7f71f3e6a825f19 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -658,7 +658,9 @@ static int qrtr_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) } if (plen != len) { - skb_pad(skb, plen - len); + rc = skb_pad(skb, plen - len); + if (rc) + goto out_node; skb_put(skb, plen - len); } @@ -943,7 +945,8 @@ static const struct nla_policy qrtr_policy[IFA_MAX + 1] = { [IFA_LOCAL] = { .type = NLA_U32 }, }; -static int qrtr_addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh) +static int qrtr_addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct nlattr *tb[IFA_MAX + 1]; struct ifaddrmsg *ifm; @@ -957,7 +960,7 @@ static int qrtr_addr_doit(struct sk_buff *skb, struct nlmsghdr *nlh) ASSERT_RTNL(); - rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, qrtr_policy); + rc = nlmsg_parse(nlh, sizeof(*ifm), tb, IFA_MAX, qrtr_policy, extack); if (rc < 0) return rc; diff --git a/net/qrtr/smd.c b/net/qrtr/smd.c index 0d11132b3370a4024be644dede74a77d188a4f64..50615d5efac1529a0fd617c2692b1e9419da7137 100644 --- a/net/qrtr/smd.c +++ b/net/qrtr/smd.c @@ -14,21 +14,21 @@ #include #include -#include +#include #include "qrtr.h" struct qrtr_smd_dev { struct qrtr_endpoint ep; - struct qcom_smd_channel *channel; + struct rpmsg_endpoint *channel; struct device *dev; }; /* from smd to qrtr */ -static int qcom_smd_qrtr_callback(struct qcom_smd_channel *channel, - const void *data, size_t len) +static int qcom_smd_qrtr_callback(struct rpmsg_device *rpdev, + void *data, int len, void *priv, u32 addr) { - struct qrtr_smd_dev *qdev = qcom_smd_get_drvdata(channel); + struct qrtr_smd_dev *qdev = dev_get_drvdata(&rpdev->dev); int rc; if (!qdev) @@ -54,7 +54,7 @@ static int qcom_smd_qrtr_send(struct qrtr_endpoint *ep, struct sk_buff *skb) if (rc) goto out; - rc = qcom_smd_send(qdev->channel, skb->data, skb->len); + rc = rpmsg_send(qdev->channel, skb->data, skb->len); out: if (rc) @@ -64,57 +64,55 @@ static int qcom_smd_qrtr_send(struct 
qrtr_endpoint *ep, struct sk_buff *skb) return rc; } -static int qcom_smd_qrtr_probe(struct qcom_smd_device *sdev) +static int qcom_smd_qrtr_probe(struct rpmsg_device *rpdev) { struct qrtr_smd_dev *qdev; int rc; - qdev = devm_kzalloc(&sdev->dev, sizeof(*qdev), GFP_KERNEL); + qdev = devm_kzalloc(&rpdev->dev, sizeof(*qdev), GFP_KERNEL); if (!qdev) return -ENOMEM; - qdev->channel = sdev->channel; - qdev->dev = &sdev->dev; + qdev->channel = rpdev->ept; + qdev->dev = &rpdev->dev; qdev->ep.xmit = qcom_smd_qrtr_send; rc = qrtr_endpoint_register(&qdev->ep, QRTR_EP_NID_AUTO); if (rc) return rc; - qcom_smd_set_drvdata(sdev->channel, qdev); - dev_set_drvdata(&sdev->dev, qdev); + dev_set_drvdata(&rpdev->dev, qdev); - dev_dbg(&sdev->dev, "Qualcomm SMD QRTR driver probed\n"); + dev_dbg(&rpdev->dev, "Qualcomm SMD QRTR driver probed\n"); return 0; } -static void qcom_smd_qrtr_remove(struct qcom_smd_device *sdev) +static void qcom_smd_qrtr_remove(struct rpmsg_device *rpdev) { - struct qrtr_smd_dev *qdev = dev_get_drvdata(&sdev->dev); + struct qrtr_smd_dev *qdev = dev_get_drvdata(&rpdev->dev); qrtr_endpoint_unregister(&qdev->ep); - dev_set_drvdata(&sdev->dev, NULL); + dev_set_drvdata(&rpdev->dev, NULL); } -static const struct qcom_smd_id qcom_smd_qrtr_smd_match[] = { +static const struct rpmsg_device_id qcom_smd_qrtr_smd_match[] = { { "IPCRTR" }, {} }; -static struct qcom_smd_driver qcom_smd_qrtr_driver = { +static struct rpmsg_driver qcom_smd_qrtr_driver = { .probe = qcom_smd_qrtr_probe, .remove = qcom_smd_qrtr_remove, .callback = qcom_smd_qrtr_callback, - .smd_match_table = qcom_smd_qrtr_smd_match, - .driver = { + .id_table = qcom_smd_qrtr_smd_match, + .drv = { .name = "qcom_smd_qrtr", - .owner = THIS_MODULE, }, }; -module_qcom_smd_driver(qcom_smd_qrtr_driver); +module_rpmsg_driver(qcom_smd_qrtr_driver); MODULE_DESCRIPTION("Qualcomm IPC-Router SMD interface driver"); MODULE_LICENSE("GPL v2"); diff --git a/net/rds/connection.c b/net/rds/connection.c index 1fa75ab7b733230585666abcb7279ba691365256..6a5ebdea7d2e9eb3b624a01a8a87ef601e3b2f13 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -333,11 +333,19 @@ void rds_conn_shutdown(struct rds_conn_path *cp) rds_conn_path_reset(cp); if (!rds_conn_path_transition(cp, RDS_CONN_DISCONNECTING, + RDS_CONN_DOWN) && + !rds_conn_path_transition(cp, RDS_CONN_ERROR, RDS_CONN_DOWN)) { /* This can happen - eg when we're in the middle of tearing * down the connection, and someone unloads the rds module. - * Quite reproduceable with loopback connections. + * Quite reproducible with loopback connections. * Mostly harmless. + * + * Note that this also happens with rds-tcp because + * we could have triggered rds_conn_path_drop in irq + * mode from rds_tcp_state change on the receipt of + * a FIN, thus we need to recheck for RDS_CONN_ERROR + * here. 
*/ rds_conn_path_error(cp, "%s: failed to transition " "to state DOWN, current state " diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 1c38d2c7caa8e955585b45f0c9218a0775013b4d..80fb6f63e768d3461c47533615c875526bb8bab9 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -702,9 +702,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, event->param.conn.initiator_depth); /* rdma_accept() calls rdma_reject() internally if it fails */ - err = rdma_accept(cm_id, &conn_param); - if (err) - rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); + if (rdma_accept(cm_id, &conn_param)) + rds_ib_conn_error(conn, "rdma_accept failed\n"); out: if (conn) diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c index 4fe8f4fec4eee66c826b5beb5b02ae61d19b483a..86ef907067bb084e01ac4f8d5f00d0c17f40ac55 100644 --- a/net/rds/ib_fmr.c +++ b/net/rds/ib_fmr.c @@ -78,17 +78,15 @@ struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages) return ibmr; out_no_cigar: - if (ibmr) { - if (fmr->fmr) - ib_dealloc_fmr(fmr->fmr); - kfree(ibmr); - } + kfree(ibmr); atomic_dec(&pool->item_count); + return ERR_PTR(err); } -int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, - struct scatterlist *sg, unsigned int nents) +static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, + struct rds_ib_mr *ibmr, struct scatterlist *sg, + unsigned int nents) { struct ib_device *dev = rds_ibdev->dev; struct rds_ib_fmr *fmr = &ibmr->u.fmr; @@ -114,29 +112,39 @@ int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, u64 dma_addr = ib_sg_dma_address(dev, &scat[i]); if (dma_addr & ~PAGE_MASK) { - if (i > 0) + if (i > 0) { + ib_dma_unmap_sg(dev, sg, nents, + DMA_BIDIRECTIONAL); return -EINVAL; - else + } else { ++page_cnt; + } } if ((dma_addr + dma_len) & ~PAGE_MASK) { - if (i < sg_dma_len - 1) + if (i < sg_dma_len - 1) { + ib_dma_unmap_sg(dev, sg, nents, + DMA_BIDIRECTIONAL); return -EINVAL; - else + } else { ++page_cnt; + } } len += dma_len; } page_cnt += len >> PAGE_SHIFT; - if (page_cnt > ibmr->pool->fmr_attr.max_pages) + if (page_cnt > ibmr->pool->fmr_attr.max_pages) { + ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); return -EINVAL; + } dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, rdsibdev_to_node(rds_ibdev)); - if (!dma_pages) + if (!dma_pages) { + ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); return -ENOMEM; + } page_cnt = 0; for (i = 0; i < sg_dma_len; ++i) { @@ -149,8 +157,10 @@ int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr, } ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr); - if (ret) + if (ret) { + ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); goto out; + } /* Success - we successfully remapped the MR, so we can * safely tear down the old mapping. 
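The ib_fmr.c hunks above plug a DMA mapping leak: rds_ib_map_fmr() used to bail out of its validation loop without undoing the ib_dma_map_sg() it had already performed, so the patch adds an ib_dma_unmap_sg() call on every early return. The sketch below expresses the same contract with a single unwind label, the usual kernel idiom when several exit points share one cleanup step; it is not the actual rds code, and validate_pages() and program_fmr() are hypothetical stand-ins for the function's validation and FMR-programming phases.

#include <linux/dma-mapping.h>
#include <linux/scatterlist.h>
#include <rdma/ib_verbs.h>

/* Hypothetical stand-ins for the two phases of rds_ib_map_fmr();
 * only the cleanup structure matters here. */
static int validate_pages(struct scatterlist *sg, int sg_dma_len);
static int program_fmr(struct scatterlist *sg, int sg_dma_len);

/* Once ib_dma_map_sg() has succeeded, every failure exit must undo
 * the mapping -- the invariant the ib_fmr.c hunks above enforce,
 * written with one unwind label instead of per-site unmap calls. */
static int map_fmr_sketch(struct ib_device *dev, struct scatterlist *sg,
			  unsigned int nents)
{
	int sg_dma_len, ret;

	sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
	if (!sg_dma_len)
		return -EBUSY;	/* nothing mapped, nothing to undo */

	ret = validate_pages(sg, sg_dma_len);
	if (ret)
		goto out_unmap;

	ret = program_fmr(sg, sg_dma_len);
	if (ret)
		goto out_unmap;

	return 0;

out_unmap:
	ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
	return ret;
}

The patch instead unmaps at each return site, which keeps the diff minimal against the existing structure; the single-label form trades a slightly larger refactor for making it harder to miss the unmap when a new error path is added later.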
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h index 5d6e98a79a5e4b3de1f472c5fc513fce545bf6f9..0ea4ab017a8cc3f807931e1194cddb5048a82956 100644 --- a/net/rds/ib_mr.h +++ b/net/rds/ib_mr.h @@ -125,8 +125,6 @@ void rds_ib_mr_exit(void); void __rds_ib_teardown_mr(struct rds_ib_mr *); void rds_ib_teardown_mr(struct rds_ib_mr *); struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *, int); -int rds_ib_map_fmr(struct rds_ib_device *, struct rds_ib_mr *, - struct scatterlist *, unsigned int); struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *); int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **); struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *, struct scatterlist *, diff --git a/net/rds/recv.c b/net/rds/recv.c index 8b7e7b7f2c2dbebee31dd7626540b797f1fa353f..c70c32cb05f5ff3a0f91dd0627b9525413c37318 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -594,7 +594,6 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, goto out; while (1) { - struct iov_iter save; /* If there are pending notifications, do those - and nothing else */ if (!list_empty(&rs->rs_notify_queue)) { ret = rds_notify_queue_get(rs, msg); @@ -630,7 +629,6 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, rdsdebug("copying inc %p from %pI4:%u to user\n", inc, &inc->i_conn->c_faddr, ntohs(inc->i_hdr.h_sport)); - save = msg->msg_iter; ret = inc->i_conn->c_trans->inc_copy_to_user(inc, &msg->msg_iter); if (ret < 0) break; @@ -644,7 +642,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, rds_inc_put(inc); inc = NULL; rds_stats_inc(s_recv_deliver_raced); - msg->msg_iter = save; + iov_iter_revert(&msg->msg_iter, ret); continue; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 22569007677357ba40347ee46e6584c683de2597..431404dbdad1cebe5d80cdb25de8b60db75fa87b 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -84,13 +84,10 @@ static struct ctl_table rds_tcp_sysctl_table[] = { /* doing it this way avoids calling tcp_sk() */ void rds_tcp_nonagle(struct socket *sock) { - mm_segment_t oldfs = get_fs(); int val = 1; - set_fs(KERNEL_DS); - sock->ops->setsockopt(sock, SOL_TCP, TCP_NODELAY, (char __user *)&val, + kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (void *)&val, sizeof(val)); - set_fs(oldfs); } u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc) diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index dcf4742083eab360ded7fcaddad8608b2526aa29..52d11d7725c839820f76410c9837b762f2b2205e 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -40,13 +40,7 @@ static void rds_tcp_cork(struct socket *sock, int val) { - mm_segment_t oldfs; - - oldfs = get_fs(); - set_fs(KERNEL_DS); - sock->ops->setsockopt(sock, SOL_TCP, TCP_CORK, (char __user *)&val, - sizeof(val)); - set_fs(oldfs); + kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (void *)&val, sizeof(val)); } void rds_tcp_xmit_path_prepare(struct rds_conn_path *cp) diff --git a/net/rds/threads.c b/net/rds/threads.c index e36e333a0aa0d7430c852ce419ac7ed85094bbec..3e447d056d092a405311265a06a8596b2ce8dc87 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -156,7 +156,7 @@ void rds_connect_worker(struct work_struct *work) struct rds_connection *conn = cp->cp_conn; int ret; - if (cp->cp_index > 1 && cp->cp_conn->c_laddr > cp->cp_conn->c_faddr) + if (cp->cp_index > 0 && cp->cp_conn->c_laddr > cp->cp_conn->c_faddr) return; clear_bit(RDS_RECONNECT_PENDING, &cp->cp_flags); ret = rds_conn_path_transition(cp, RDS_CONN_DOWN, RDS_CONN_CONNECTING); diff --git a/net/rxrpc/ar-internal.h 
b/net/rxrpc/ar-internal.h index 26a7b1db1361e554733b0ff40a54d8e68e59af09..7486926e60a88a56a364b727800f3cd1a671d808 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -739,6 +739,25 @@ static inline bool rxrpc_abort_call(const char *why, struct rxrpc_call *call, return ret; } +/* + * Abort a call due to a protocol error. + */ +static inline bool __rxrpc_abort_eproto(struct rxrpc_call *call, + struct sk_buff *skb, + const char *eproto_why, + const char *why, + u32 abort_code) +{ + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + trace_rxrpc_rx_eproto(call, sp->hdr.serial, eproto_why); + return rxrpc_abort_call(why, call, sp->hdr.seq, abort_code, -EPROTO); +} + +#define rxrpc_abort_eproto(call, skb, eproto_why, abort_why, abort_code) \ + __rxrpc_abort_eproto((call), (skb), tracepoint_string(eproto_why), \ + (abort_why), (abort_code)) + /* * conn_client.c */ diff --git a/net/rxrpc/call_accept.c b/net/rxrpc/call_accept.c index 0ed181f53f32a0145c03b0006b92de5c7a0101aa..1752fcf8e8f1dd85866004a2ee38c8bbaa040138 100644 --- a/net/rxrpc/call_accept.c +++ b/net/rxrpc/call_accept.c @@ -413,11 +413,11 @@ struct rxrpc_call *rxrpc_new_incoming_call(struct rxrpc_local *local, case RXRPC_CONN_REMOTELY_ABORTED: rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, - conn->remote_abort, ECONNABORTED); + conn->remote_abort, -ECONNABORTED); break; case RXRPC_CONN_LOCALLY_ABORTED: rxrpc_abort_call("CON", call, sp->hdr.seq, - conn->local_abort, ECONNABORTED); + conn->local_abort, -ECONNABORTED); break; default: BUG(); @@ -600,7 +600,7 @@ int rxrpc_reject_call(struct rxrpc_sock *rx) write_lock_bh(&call->state_lock); switch (call->state) { case RXRPC_CALL_SERVER_ACCEPTING: - __rxrpc_abort_call("REJ", call, 1, RX_USER_ABORT, ECONNABORTED); + __rxrpc_abort_call("REJ", call, 1, RX_USER_ABORT, -ECONNABORTED); abort = true; /* fall through */ case RXRPC_CALL_COMPLETE: diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index 97a17ada4431d58b7a0f9c07be3b13b0230a6390..7a77844aab16be5f32a16b7edd3a7f440006c221 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -386,7 +386,7 @@ void rxrpc_process_call(struct work_struct *work) now = ktime_get_real(); if (ktime_before(call->expire_at, now)) { - rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, ETIME); + rxrpc_abort_call("EXP", call, 0, RX_CALL_TIMEOUT, -ETIME); set_bit(RXRPC_CALL_EV_ABORT, &call->events); goto recheck_state; } diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c index d79cd36987a95b86f2af9fac4688ab86e20f41d5..47f7f4205653aa0643c77f7381688df79b51cbbe 100644 --- a/net/rxrpc/call_object.c +++ b/net/rxrpc/call_object.c @@ -486,7 +486,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) call = list_entry(rx->to_be_accepted.next, struct rxrpc_call, accept_link); list_del(&call->accept_link); - rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, ECONNRESET); + rxrpc_abort_call("SKR", call, 0, RX_CALL_DEAD, -ECONNRESET); rxrpc_put_call(call, rxrpc_call_put); } @@ -494,7 +494,7 @@ void rxrpc_release_calls_on_socket(struct rxrpc_sock *rx) call = list_entry(rx->sock_calls.next, struct rxrpc_call, sock_link); rxrpc_get_call(call, rxrpc_call_got); - rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, ECONNRESET); + rxrpc_abort_call("SKT", call, 0, RX_CALL_DEAD, -ECONNRESET); rxrpc_send_abort_packet(call); rxrpc_release_call(rx, call); rxrpc_put_call(call, rxrpc_call_put); diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 
c3be03e8d098213e4956bb644827865206470e19..e8dea0d49e7fedf2951616c1aed8e93ec693a1dd 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -550,6 +550,7 @@ static void rxrpc_activate_one_channel(struct rxrpc_connection *conn, call->cid = conn->proto.cid | channel; call->call_id = call_id; + trace_rxrpc_connect_call(call); _net("CONNECT call %08x:%08x as call %d on conn %d", call->cid, call->call_id, call->debug_id, conn->debug_id); diff --git a/net/rxrpc/conn_event.c b/net/rxrpc/conn_event.c index b099b64366f356c27dea0a4dd215cc1034e61b55..46babcf82ce8648190e018ee69ad16701e2e969a 100644 --- a/net/rxrpc/conn_event.c +++ b/net/rxrpc/conn_event.c @@ -168,7 +168,7 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, * generate a connection-level abort */ static int rxrpc_abort_connection(struct rxrpc_connection *conn, - u32 error, u32 abort_code) + int error, u32 abort_code) { struct rxrpc_wire_header whdr; struct msghdr msg; @@ -281,14 +281,17 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, case RXRPC_PACKET_TYPE_ABORT: if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), - &wtmp, sizeof(wtmp)) < 0) + &wtmp, sizeof(wtmp)) < 0) { + trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, + tracepoint_string("bad_abort")); return -EPROTO; + } abort_code = ntohl(wtmp); _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code); conn->state = RXRPC_CONN_REMOTELY_ABORTED; rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED, - abort_code, ECONNABORTED); + abort_code, -ECONNABORTED); return -ECONNABORTED; case RXRPC_PACKET_TYPE_CHALLENGE: @@ -327,7 +330,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn, return 0; default: - _leave(" = -EPROTO [%u]", sp->hdr.type); + trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, + tracepoint_string("bad_conn_pkt")); return -EPROTO; } } @@ -370,7 +374,7 @@ static void rxrpc_secure_connection(struct rxrpc_connection *conn) abort: _debug("abort %d, %d", ret, abort_code); - rxrpc_abort_connection(conn, -ret, abort_code); + rxrpc_abort_connection(conn, ret, abort_code); _leave(" [aborted]"); } @@ -419,9 +423,8 @@ void rxrpc_process_connection(struct work_struct *work) goto out; protocol_error: - if (rxrpc_abort_connection(conn, -ret, abort_code) < 0) + if (rxrpc_abort_connection(conn, ret, abort_code) < 0) goto requeue_and_leave; rxrpc_free_skb(skb, rxrpc_skb_rx_freed); - _leave(" [EPROTO]"); goto out; } diff --git a/net/rxrpc/input.c b/net/rxrpc/input.c index 18b2ad8be8e2b57dd57ef846287add68b027b08e..45dba732a3b4743ba37c46d05b02125f16eac23d 100644 --- a/net/rxrpc/input.c +++ b/net/rxrpc/input.c @@ -30,7 +30,7 @@ static void rxrpc_proto_abort(const char *why, struct rxrpc_call *call, rxrpc_seq_t seq) { - if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, EBADMSG)) { + if (rxrpc_abort_call(why, call, seq, RX_PROTOCOL_ERROR, -EBADMSG)) { set_bit(RXRPC_CALL_EV_ABORT, &call->events); rxrpc_queue_call(call); } @@ -665,6 +665,8 @@ static void rxrpc_input_ackinfo(struct rxrpc_call *call, struct sk_buff *skb, rwind = RXRPC_RXTX_BUFF_SIZE - 1; if (rwind > call->tx_winsize) wake = true; + trace_rxrpc_rx_rwind_change(call, sp->hdr.serial, + ntohl(ackinfo->rwind), wake); call->tx_winsize = rwind; } @@ -877,7 +879,7 @@ static void rxrpc_input_ackall(struct rxrpc_call *call, struct sk_buff *skb) } /* - * Process an ABORT packet. + * Process an ABORT packet directed at a call. 
*/ static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) { @@ -892,10 +894,12 @@ static void rxrpc_input_abort(struct rxrpc_call *call, struct sk_buff *skb) &wtmp, sizeof(wtmp)) >= 0) abort_code = ntohl(wtmp); + trace_rxrpc_rx_abort(call, sp->hdr.serial, abort_code); + _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code); if (rxrpc_set_call_completion(call, RXRPC_CALL_REMOTELY_ABORTED, - abort_code, ECONNABORTED)) + abort_code, -ECONNABORTED)) rxrpc_notify_socket(call); } @@ -958,7 +962,7 @@ static void rxrpc_input_implicit_end_call(struct rxrpc_connection *conn, case RXRPC_CALL_COMPLETE: break; default: - if (rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, ESHUTDOWN)) { + if (rxrpc_abort_call("IMP", call, 0, RX_CALL_DEAD, -ESHUTDOWN)) { set_bit(RXRPC_CALL_EV_ABORT, &call->events); rxrpc_queue_call(call); } @@ -1017,8 +1021,11 @@ int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb) struct rxrpc_wire_header whdr; /* dig out the RxRPC connection details */ - if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) + if (skb_copy_bits(skb, 0, &whdr, sizeof(whdr)) < 0) { + trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, + tracepoint_string("bad_hdr")); return -EBADMSG; + } memset(sp, 0, sizeof(*sp)); sp->hdr.epoch = ntohl(whdr.epoch); diff --git a/net/rxrpc/insecure.c b/net/rxrpc/insecure.c index 7d4375e557e6ee4a50961ad25d9e6f01a34290db..af276f173b10ebb26a8b68d2560ad958b8f3d09a 100644 --- a/net/rxrpc/insecure.c +++ b/net/rxrpc/insecure.c @@ -46,7 +46,10 @@ static int none_respond_to_challenge(struct rxrpc_connection *conn, struct sk_buff *skb, u32 *_abort_code) { - *_abort_code = RX_PROTOCOL_ERROR; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, + tracepoint_string("chall_none")); return -EPROTO; } @@ -54,7 +57,10 @@ static int none_verify_response(struct rxrpc_connection *conn, struct sk_buff *skb, u32 *_abort_code) { - *_abort_code = RX_PROTOCOL_ERROR; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + + trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, + tracepoint_string("resp_none")); return -EPROTO; } diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index bf13b8470c9ad51783325d7f47c32414b07f0c40..1ed9c0c2e94f1c70ab0ca61fa16e87f76530f78a 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -296,7 +296,7 @@ void rxrpc_peer_error_distributor(struct work_struct *work) hlist_del_init(&call->error_link); rxrpc_see_call(call); - if (rxrpc_set_call_completion(call, compl, 0, error)) + if (rxrpc_set_call_completion(call, compl, 0, -error)) rxrpc_notify_socket(call); } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 3e2f1a8e9c5b51bf90ce8679c06b6ec8a8958ea9..f9caf3b775097f62e8cb27fe12f3e925a3f8c323 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -83,11 +83,11 @@ static int rxrpc_recvmsg_term(struct rxrpc_call *call, struct msghdr *msg) ret = put_cmsg(msg, SOL_RXRPC, RXRPC_ABORT, 4, &tmp); break; case RXRPC_CALL_NETWORK_ERROR: - tmp = call->error; + tmp = -call->error; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_NET_ERROR, 4, &tmp); break; case RXRPC_CALL_LOCAL_ERROR: - tmp = call->error; + tmp = -call->error; ret = put_cmsg(msg, SOL_RXRPC, RXRPC_LOCAL_ERROR, 4, &tmp); break; default: @@ -682,14 +682,16 @@ int rxrpc_kernel_recv_data(struct socket *sock, struct rxrpc_call *call, return ret; short_data: + trace_rxrpc_rx_eproto(call, 0, tracepoint_string("short_data")); ret = -EBADMSG; goto out; excess_data: + trace_rxrpc_rx_eproto(call, 0, tracepoint_string("excess_data")); ret = 
-EMSGSIZE; goto out; call_complete: *_abort = call->abort_code; - ret = -call->error; + ret = call->error; if (call->completion == RXRPC_CALL_SUCCEEDED) { ret = 1; if (size > 0) diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 4374e7b9c7bff9fdd1d36a5a9fc8f135414ed360..1bb9b2ccc2673714367d2ec4356d39b5a3839895 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -148,15 +148,13 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call, u32 data_size, void *sechdr) { - struct rxrpc_skb_priv *sp; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxkad_level1_hdr hdr; struct rxrpc_crypt iv; struct scatterlist sg; u16 check; - sp = rxrpc_skb(skb); - _enter(""); check = sp->hdr.seq ^ call->call_id; @@ -323,6 +321,7 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_crypt iv; struct scatterlist sg[16]; struct sk_buff *trailer; + bool aborted; u32 data_size, buf; u16 check; int nsg; @@ -330,7 +329,8 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, _enter(""); if (len < 8) { - rxrpc_abort_call("V1H", call, seq, RXKADSEALEDINCON, EPROTO); + aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_hdr", "V1H", + RXKADSEALEDINCON); goto protocol_error; } @@ -355,7 +355,8 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, /* Extract the decrypted packet length */ if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) { - rxrpc_abort_call("XV1", call, seq, RXKADDATALEN, EPROTO); + aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_len", "XV1", + RXKADDATALEN); goto protocol_error; } offset += sizeof(sechdr); @@ -368,12 +369,14 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, check ^= seq ^ call->call_id; check &= 0xffff; if (check != 0) { - rxrpc_abort_call("V1C", call, seq, RXKADSEALEDINCON, EPROTO); + aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_check", "V1C", + RXKADSEALEDINCON); goto protocol_error; } if (data_size > len) { - rxrpc_abort_call("V1L", call, seq, RXKADDATALEN, EPROTO); + aborted = rxrpc_abort_eproto(call, skb, "rxkad_1_datalen", "V1L", + RXKADDATALEN); goto protocol_error; } @@ -381,8 +384,8 @@ static int rxkad_verify_packet_1(struct rxrpc_call *call, struct sk_buff *skb, return 0; protocol_error: - rxrpc_send_abort_packet(call); - _leave(" = -EPROTO"); + if (aborted) + rxrpc_send_abort_packet(call); return -EPROTO; nomem: @@ -403,6 +406,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, struct rxrpc_crypt iv; struct scatterlist _sg[4], *sg; struct sk_buff *trailer; + bool aborted; u32 data_size, buf; u16 check; int nsg; @@ -410,7 +414,8 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, _enter(",{%d}", skb->len); if (len < 8) { - rxrpc_abort_call("V2H", call, seq, RXKADSEALEDINCON, EPROTO); + aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_hdr", "V2H", + RXKADSEALEDINCON); goto protocol_error; } @@ -445,7 +450,8 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, /* Extract the decrypted packet length */ if (skb_copy_bits(skb, offset, &sechdr, sizeof(sechdr)) < 0) { - rxrpc_abort_call("XV2", call, seq, RXKADDATALEN, EPROTO); + aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_len", "XV2", + RXKADDATALEN); goto protocol_error; } offset += sizeof(sechdr); @@ -458,12 +464,14 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, check ^= seq ^ 
call->call_id; check &= 0xffff; if (check != 0) { - rxrpc_abort_call("V2C", call, seq, RXKADSEALEDINCON, EPROTO); + aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_check", "V2C", + RXKADSEALEDINCON); goto protocol_error; } if (data_size > len) { - rxrpc_abort_call("V2L", call, seq, RXKADDATALEN, EPROTO); + aborted = rxrpc_abort_eproto(call, skb, "rxkad_2_datalen", "V2L", + RXKADDATALEN); goto protocol_error; } @@ -471,8 +479,8 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, return 0; protocol_error: - rxrpc_send_abort_packet(call); - _leave(" = -EPROTO"); + if (aborted) + rxrpc_send_abort_packet(call); return -EPROTO; nomem: @@ -491,6 +499,7 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher); struct rxrpc_crypt iv; struct scatterlist sg; + bool aborted; u16 cksum; u32 x, y; @@ -522,10 +531,9 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, cksum = 1; /* zero checksums are not permitted */ if (cksum != expected_cksum) { - rxrpc_abort_call("VCK", call, seq, RXKADSEALEDINCON, EPROTO); - rxrpc_send_abort_packet(call); - _leave(" = -EPROTO [csum failed]"); - return -EPROTO; + aborted = rxrpc_abort_eproto(call, skb, "rxkad_csum", "VCK", + RXKADSEALEDINCON); + goto protocol_error; } switch (call->conn->params.security_level) { @@ -538,6 +546,11 @@ static int rxkad_verify_packet(struct rxrpc_call *call, struct sk_buff *skb, default: return -ENOANO; } + +protocol_error: + if (aborted) + rxrpc_send_abort_packet(call); + return -EPROTO; } /* @@ -754,22 +767,23 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, struct rxkad_response resp __attribute__((aligned(8))); /* must be aligned for crypto */ struct rxrpc_skb_priv *sp = rxrpc_skb(skb); + const char *eproto; u32 version, nonce, min_level, abort_code; int ret; _enter("{%d,%x}", conn->debug_id, key_serial(conn->params.key)); - if (!conn->params.key) { - _leave(" = -EPROTO [no key]"); - return -EPROTO; - } + eproto = tracepoint_string("chall_no_key"); + abort_code = RX_PROTOCOL_ERROR; + if (!conn->params.key) + goto protocol_error; + abort_code = RXKADEXPIRED; ret = key_validate(conn->params.key); - if (ret < 0) { - *_abort_code = RXKADEXPIRED; - return ret; - } + if (ret < 0) + goto other_error; + eproto = tracepoint_string("chall_short"); abort_code = RXKADPACKETSHORT; if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), &challenge, sizeof(challenge)) < 0) @@ -782,13 +796,15 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, _proto("Rx CHALLENGE %%%u { v=%u n=%u ml=%u }", sp->hdr.serial, version, nonce, min_level); + eproto = tracepoint_string("chall_ver"); abort_code = RXKADINCONSISTENCY; if (version != RXKAD_VERSION) goto protocol_error; abort_code = RXKADLEVELFAIL; + ret = -EACCES; if (conn->params.security_level < min_level) - goto protocol_error; + goto other_error; token = conn->params.key->payload.data[0]; @@ -815,28 +831,34 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn, return rxkad_send_response(conn, &sp->hdr, &resp, token->kad); protocol_error: + trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); + ret = -EPROTO; +other_error: *_abort_code = abort_code; - _leave(" = -EPROTO [%d]", abort_code); - return -EPROTO; + return ret; } /* * decrypt the kerberos IV ticket in the response */ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, + struct sk_buff *skb, void *ticket, size_t ticket_len, struct rxrpc_crypt 
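For the RXKADSEALEDINCON aborts in the rxkad_verify_packet_1/2 hunks above: the sealing folds seq ^ call_id into a 16-bit check word, and the receiver XORs the same values back in and must be left with zero. A standalone sketch of just that consistency test (names are mine; the real sealed header also carries the data length):

#include <assert.h>
#include <stdint.h>

static uint16_t seal_check(uint32_t seq, uint32_t call_id)
{
    return (uint16_t)(seq ^ call_id);
}

static int check_ok(uint16_t check, uint32_t seq, uint32_t call_id)
{
    check ^= seq ^ call_id;    /* truncated back to 16 bits */
    return check == 0;         /* zero means consistent */
}

int main(void)
{
    assert(check_ok(seal_check(42, 7), 42, 7));
    assert(!check_ok(seal_check(42, 7), 43, 7));    /* tampered seq */
    return 0;
}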
*_session_key, time_t *_expiry, u32 *_abort_code) { struct skcipher_request *req; + struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt iv, key; struct scatterlist sg[1]; struct in_addr addr; unsigned int life; + const char *eproto; time_t issue, now; bool little_endian; int ret; + u32 abort_code; u8 *p, *q, *name, *end; _enter("{%d},{%x}", conn->debug_id, key_serial(conn->server_key)); @@ -847,11 +869,11 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, if (ret < 0) { switch (ret) { case -EKEYEXPIRED: - *_abort_code = RXKADEXPIRED; - goto error; + abort_code = RXKADEXPIRED; + goto other_error; default: - *_abort_code = RXKADNOAUTH; - goto error; + abort_code = RXKADNOAUTH; + goto other_error; } } @@ -860,13 +882,11 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, memcpy(&iv, &conn->server_key->payload.data[2], sizeof(iv)); + ret = -ENOMEM; req = skcipher_request_alloc(conn->server_key->payload.data[0], GFP_NOFS); - if (!req) { - *_abort_code = RXKADNOAUTH; - ret = -ENOMEM; - goto error; - } + if (!req) + goto temporary_error; sg_init_one(&sg[0], ticket, ticket_len); skcipher_request_set_callback(req, 0, NULL, NULL); @@ -877,11 +897,12 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, p = ticket; end = p + ticket_len; -#define Z(size) \ +#define Z(field) \ ({ \ u8 *__str = p; \ + eproto = tracepoint_string("rxkad_bad_"#field); \ q = memchr(p, 0, end - p); \ - if (!q || q - p > (size)) \ + if (!q || q - p > (field##_SZ)) \ goto bad_ticket; \ for (; p < q; p++) \ if (!isprint(*p)) \ @@ -896,17 +917,18 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, p++; /* extract the authentication name */ - name = Z(ANAME_SZ); + name = Z(ANAME); _debug("KIV ANAME: %s", name); /* extract the principal's instance */ - name = Z(INST_SZ); + name = Z(INST); _debug("KIV INST : %s", name); /* extract the principal's authentication domain */ - name = Z(REALM_SZ); + name = Z(REALM); _debug("KIV REALM: %s", name); + eproto = tracepoint_string("rxkad_bad_len"); if (end - p < 4 + 8 + 4 + 2) goto bad_ticket; @@ -941,36 +963,37 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn, /* check the ticket is in date */ if (issue > now) { - *_abort_code = RXKADNOAUTH; + abort_code = RXKADNOAUTH; ret = -EKEYREJECTED; - goto error; + goto other_error; } if (issue < now - life) { - *_abort_code = RXKADEXPIRED; + abort_code = RXKADEXPIRED; ret = -EKEYEXPIRED; - goto error; + goto other_error; } *_expiry = issue + life; /* get the service name */ - name = Z(SNAME_SZ); + name = Z(SNAME); _debug("KIV SNAME: %s", name); /* get the service instance name */ - name = Z(INST_SZ); + name = Z(INST); _debug("KIV SINST: %s", name); - - ret = 0; -error: - _leave(" = %d", ret); - return ret; + return 0; bad_ticket: - *_abort_code = RXKADBADTICKET; - ret = -EBADMSG; - goto error; + trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); + abort_code = RXKADBADTICKET; + ret = -EPROTO; +other_error: + *_abort_code = abort_code; + return ret; +temporary_error: + return ret; } /* @@ -1020,6 +1043,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, __attribute__((aligned(8))); /* must be aligned for crypto */ struct rxrpc_skb_priv *sp = rxrpc_skb(skb); struct rxrpc_crypt session_key; + const char *eproto; time_t expiry; void *ticket; u32 abort_code, version, kvno, ticket_len, level; @@ -1028,6 +1052,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, _enter("{%d,%x}", conn->debug_id, key_serial(conn->server_key)); + eproto = 
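The reworked Z() macro above is the ticket parser's workhorse: it pulls a NUL-terminated, printable, length-bounded string out of the decrypted ticket, and now also labels any failure for the eproto tracepoint. Its core logic, unrolled into a plain function with invented names:

#include <ctype.h>
#include <string.h>

/* Return the string at *pp and advance past its NUL, or NULL if it is
 * unterminated, too long, or contains unprintable bytes. */
static char *pull_ticket_string(unsigned char **pp, unsigned char *end,
                                size_t max_len)
{
    unsigned char *str = *pp, *p = *pp, *q;

    q = memchr(p, 0, end - p);
    if (!q || (size_t)(q - p) > max_len)
        return NULL;            /* bad ticket */
    for (; p < q; p++)
        if (!isprint(*p))
            return NULL;        /* bad ticket */
    *pp = q + 1;                /* step past the NUL */
    return (char *)str;
}

int main(void)
{
    unsigned char buf[] = "principal\0instance\0EXAMPLE.ORG\0";
    unsigned char *p = buf;

    return pull_ticket_string(&p, buf + sizeof(buf), 63) ? 0 : 1;
}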
tracepoint_string("rxkad_rsp_short"); abort_code = RXKADPACKETSHORT; if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), &response, sizeof(response)) < 0) @@ -1041,40 +1066,43 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, _proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }", sp->hdr.serial, version, kvno, ticket_len); + eproto = tracepoint_string("rxkad_rsp_ver"); abort_code = RXKADINCONSISTENCY; if (version != RXKAD_VERSION) goto protocol_error; + eproto = tracepoint_string("rxkad_rsp_tktlen"); abort_code = RXKADTICKETLEN; if (ticket_len < 4 || ticket_len > MAXKRB5TICKETLEN) goto protocol_error; + eproto = tracepoint_string("rxkad_rsp_unkkey"); abort_code = RXKADUNKNOWNKEY; if (kvno >= RXKAD_TKT_TYPE_KERBEROS_V5) goto protocol_error; /* extract the kerberos ticket and decrypt and decode it */ + ret = -ENOMEM; ticket = kmalloc(ticket_len, GFP_NOFS); if (!ticket) - return -ENOMEM; + goto temporary_error; + eproto = tracepoint_string("rxkad_tkt_short"); abort_code = RXKADPACKETSHORT; if (skb_copy_bits(skb, sizeof(struct rxrpc_wire_header), ticket, ticket_len) < 0) goto protocol_error_free; - ret = rxkad_decrypt_ticket(conn, ticket, ticket_len, &session_key, - &expiry, &abort_code); - if (ret < 0) { - *_abort_code = abort_code; - kfree(ticket); - return ret; - } + ret = rxkad_decrypt_ticket(conn, skb, ticket, ticket_len, &session_key, + &expiry, _abort_code); + if (ret < 0) + goto temporary_error_free; /* use the session key from inside the ticket to decrypt the * response */ rxkad_decrypt_response(conn, &response, &session_key); + eproto = tracepoint_string("rxkad_rsp_param"); abort_code = RXKADSEALEDINCON; if (ntohl(response.encrypted.epoch) != conn->proto.epoch) goto protocol_error_free; @@ -1085,6 +1113,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, csum = response.encrypted.checksum; response.encrypted.checksum = 0; rxkad_calc_response_checksum(&response); + eproto = tracepoint_string("rxkad_rsp_csum"); if (response.encrypted.checksum != csum) goto protocol_error_free; @@ -1093,11 +1122,15 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, struct rxrpc_call *call; u32 call_id = ntohl(response.encrypted.call_id[i]); + eproto = tracepoint_string("rxkad_rsp_callid"); if (call_id > INT_MAX) goto protocol_error_unlock; + eproto = tracepoint_string("rxkad_rsp_callctr"); if (call_id < conn->channels[i].call_counter) goto protocol_error_unlock; + + eproto = tracepoint_string("rxkad_rsp_callst"); if (call_id > conn->channels[i].call_counter) { call = rcu_dereference_protected( conn->channels[i].call, @@ -1109,10 +1142,12 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, } spin_unlock(&conn->channel_lock); + eproto = tracepoint_string("rxkad_rsp_seq"); abort_code = RXKADOUTOFSEQUENCE; if (ntohl(response.encrypted.inc_nonce) != conn->security_nonce + 1) goto protocol_error_free; + eproto = tracepoint_string("rxkad_rsp_level"); abort_code = RXKADLEVELFAIL; level = ntohl(response.encrypted.level); if (level > RXRPC_SECURITY_ENCRYPT) @@ -1123,10 +1158,8 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, * this the connection security can be handled in exactly the same way * as for a client connection */ ret = rxrpc_get_server_data_key(conn, &session_key, expiry, kvno); - if (ret < 0) { - kfree(ticket); - return ret; - } + if (ret < 0) + goto temporary_error_free; kfree(ticket); _leave(" = 0"); @@ -1137,9 +1170,18 @@ static int rxkad_verify_response(struct rxrpc_connection *conn, protocol_error_free: 
kfree(ticket); protocol_error: + trace_rxrpc_rx_eproto(NULL, sp->hdr.serial, eproto); *_abort_code = abort_code; - _leave(" = -EPROTO [%d]", abort_code); return -EPROTO; + +temporary_error_free: + kfree(ticket); +temporary_error: + /* Ignore the response packet if we got a temporary error such as + * ENOMEM. We just want to send the challenge again. Note that we + * also come out this way if the ticket decryption fails. + */ + return ret; } /* diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c index 97ab214ca4118d7a451a4e56a916bd5809ae81f3..96ffa5d5733bd73249e32e2c10a420af9946e9b2 100644 --- a/net/rxrpc/sendmsg.c +++ b/net/rxrpc/sendmsg.c @@ -556,7 +556,7 @@ int rxrpc_do_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, size_t len) ret = -ESHUTDOWN; } else if (cmd == RXRPC_CMD_SEND_ABORT) { ret = 0; - if (rxrpc_abort_call("CMD", call, 0, abort_code, ECONNABORTED)) + if (rxrpc_abort_call("CMD", call, 0, abort_code, -ECONNABORTED)) ret = rxrpc_send_abort_packet(call); } else if (cmd != RXRPC_CMD_SEND_DATA) { ret = -EINVAL; @@ -623,7 +623,8 @@ int rxrpc_kernel_send_data(struct socket *sock, struct rxrpc_call *call, read_unlock_bh(&call->state_lock); break; default: - /* Request phase complete for this client call */ + /* Request phase complete for this client call */ + trace_rxrpc_rx_eproto(call, 0, tracepoint_string("late_send")); ret = -EPROTO; break; } @@ -642,20 +643,24 @@ EXPORT_SYMBOL(rxrpc_kernel_send_data); * @error: Local error value * @why: 3-char string indicating why. * - * Allow a kernel service to abort a call, if it's still in an abortable state. + * Allow a kernel service to abort a call, if it's still in an abortable state + * and return true if the call was aborted, false if it was already complete. */ -void rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, +bool rxrpc_kernel_abort_call(struct socket *sock, struct rxrpc_call *call, u32 abort_code, int error, const char *why) { + bool aborted; + _enter("{%d},%d,%d,%s", call->debug_id, abort_code, error, why); mutex_lock(&call->user_mutex); - if (rxrpc_abort_call(why, call, 0, abort_code, error)) + aborted = rxrpc_abort_call(why, call, 0, abort_code, error); + if (aborted) rxrpc_send_abort_packet(call); mutex_unlock(&call->user_mutex); - _leave(""); + return aborted; } EXPORT_SYMBOL(rxrpc_kernel_abort_call); diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 403790cce7d2324054aa48b49c70f7d4deff8a2a..9fb84f0de6af0c3a75f190ed66cab0dc771db5e4 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -352,6 +352,51 @@ config NET_SCH_PLUG To compile this code as a module, choose M here: the module will be called sch_plug. +menuconfig NET_SCH_DEFAULT + bool "Allow override default queue discipline" + ---help--- + Support for selection of default queuing discipline. + + Nearly all users can safely say no here, and the default + of pfifo_fast will be used. Many distributions already set + the default value via /proc/sys/net/core/default_qdisc. + + If unsure, say N. + +if NET_SCH_DEFAULT + +choice + prompt "Default queuing discipline" + default DEFAULT_PFIFO_FAST + help + Select the queueing discipline that will be used by default + for all network devices. 
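rxrpc_kernel_abort_call() above now tells its caller whether the abort actually changed the call's state, letting a kernel service distinguish "I aborted it" from "it had already completed". That contract in miniature, with toy types:

#include <stdbool.h>
#include <stdio.h>

enum call_state { CALL_ACTIVE, CALL_COMPLETE };

static bool abort_call(enum call_state *state)
{
    if (*state == CALL_COMPLETE)
        return false;    /* already finished, nothing aborted */
    *state = CALL_COMPLETE;
    return true;         /* we performed the abort */
}

int main(void)
{
    enum call_state s = CALL_ACTIVE;
    bool first = abort_call(&s);
    bool second = abort_call(&s);

    printf("%d %d\n", first, second);    /* prints: 1 0 */
    return 0;
}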
+ + config DEFAULT_FQ + bool "Fair Queue" if NET_SCH_FQ + + config DEFAULT_CODEL + bool "Controlled Delay" if NET_SCH_CODEL + + config DEFAULT_FQ_CODEL + bool "Fair Queue Controlled Delay" if NET_SCH_FQ_CODEL + + config DEFAULT_SFQ + bool "Stochastic Fair Queue" if NET_SCH_SFQ + + config DEFAULT_PFIFO_FAST + bool "Priority FIFO Fast" +endchoice + +config DEFAULT_NET_SCH + string + default "pfifo_fast" if DEFAULT_PFIFO_FAST + default "fq" if DEFAULT_FQ + default "codel" if DEFAULT_CODEL + default "fq_codel" if DEFAULT_FQ_CODEL + default "sfq" if DEFAULT_SFQ + default "pfifo_fast" +endif + comment "Classification" config NET_CLS diff --git a/net/sched/act_api.c b/net/sched/act_api.c index b70aa57319ea3233395dc7ae349b8b7aab5dfd03..a90e8f355c00e9a39b4a17403b6c7459b6ea9c2f 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -428,24 +428,49 @@ static struct tc_action_ops *tc_lookup_action(struct nlattr *kind) return res; } +/* TCA_ACT_MAX_PRIO is 32; a jump opcode can therefore index at most 32 actions */ +#define TCA_ACT_MAX_PRIO_MASK 0x1FF int tcf_action_exec(struct sk_buff *skb, struct tc_action **actions, int nr_actions, struct tcf_result *res) { int ret = -1, i; + u32 jmp_prgcnt = 0; + u32 jmp_ttl = TCA_ACT_MAX_PRIO; /* matches actions per filter */ if (skb_skip_tc_classify(skb)) return TC_ACT_OK; +restart_act_graph: for (i = 0; i < nr_actions; i++) { const struct tc_action *a = actions[i]; + if (jmp_prgcnt > 0) { + jmp_prgcnt -= 1; + continue; + } repeat: ret = a->ops->act(skb, a, res); if (ret == TC_ACT_REPEAT) goto repeat; /* we need a ttl - JHS */ + + if (TC_ACT_EXT_CMP(ret, TC_ACT_JUMP)) { + jmp_prgcnt = ret & TCA_ACT_MAX_PRIO_MASK; + if (!jmp_prgcnt || (jmp_prgcnt > nr_actions)) { + /* faulty opcode, stop pipeline */ + return TC_ACT_OK; + } else { + jmp_ttl -= 1; + if (jmp_ttl > 0) + goto restart_act_graph; + else /* faulty graph, stop pipeline */ + return TC_ACT_OK; + } + } + if (ret != TC_ACT_PIPE) break; } + return ret; } EXPORT_SYMBOL(tcf_action_exec); @@ -529,20 +554,20 @@ int tcf_action_dump(struct sk_buff *skb, struct list_head *actions, return err; } -static int nla_memdup_cookie(struct tc_action *a, struct nlattr **tb) +static struct tc_cookie *nla_memdup_cookie(struct nlattr **tb) { - a->act_cookie = kzalloc(sizeof(*a->act_cookie), GFP_KERNEL); - if (!a->act_cookie) - return -ENOMEM; + struct tc_cookie *c = kzalloc(sizeof(*c), GFP_KERNEL); + if (!c) + return NULL; - a->act_cookie->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL); - if (!a->act_cookie->data) { - kfree(a->act_cookie); - return -ENOMEM; + c->data = nla_memdup(tb[TCA_ACT_COOKIE], GFP_KERNEL); + if (!c->data) { + kfree(c); + return NULL; } - a->act_cookie->len = nla_len(tb[TCA_ACT_COOKIE]); + c->len = nla_len(tb[TCA_ACT_COOKIE]); - return 0; + return c; } struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, @@ -551,13 +576,14 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, { struct tc_action *a; struct tc_action_ops *a_o; + struct tc_cookie *cookie = NULL; char act_name[IFNAMSIZ]; struct nlattr *tb[TCA_ACT_MAX + 1]; struct nlattr *kind; int err; if (name == NULL) { - err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, NULL); if (err < 0) goto err_out; err = -EINVAL; @@ -566,6 +592,18 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, goto err_out; if (nla_strlcpy(act_name, kind, IFNAMSIZ) >= IFNAMSIZ) goto err_out; + if (tb[TCA_ACT_COOKIE]) { + int cklen = nla_len(tb[TCA_ACT_COOKIE]); + + if (cklen > TC_COOKIE_MAX_SIZE) + goto err_out; + + cookie
= nla_memdup_cookie(tb); + if (!cookie) { + err = -ENOMEM; + goto err_out; + } + } } else { err = -EINVAL; if (strlcpy(act_name, name, IFNAMSIZ) >= IFNAMSIZ) @@ -604,20 +642,12 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, if (err < 0) goto err_mod; - if (tb[TCA_ACT_COOKIE]) { - int cklen = nla_len(tb[TCA_ACT_COOKIE]); - - if (cklen > TC_COOKIE_MAX_SIZE) { - err = -EINVAL; - tcf_hash_release(a, bind); - goto err_mod; - } - - if (nla_memdup_cookie(a, tb) < 0) { - err = -ENOMEM; - tcf_hash_release(a, bind); - goto err_mod; + if (name == NULL && tb[TCA_ACT_COOKIE]) { + if (a->act_cookie) { + kfree(a->act_cookie->data); + kfree(a->act_cookie); } + a->act_cookie = cookie; } /* module count goes up only when brand new policy is created @@ -632,6 +662,10 @@ struct tc_action *tcf_action_init_1(struct net *net, struct nlattr *nla, err_mod: module_put(a_o->owner); err_out: + if (cookie) { + kfree(cookie->data); + kfree(cookie); + } return ERR_PTR(err); } @@ -654,7 +688,7 @@ int tcf_action_init(struct net *net, struct nlattr *nla, struct nlattr *est, int err; int i; - err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); + err = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, NULL); if (err < 0) return err; @@ -786,7 +820,7 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, int index; int err; - err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, NULL); if (err < 0) goto err_out; @@ -835,7 +869,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, b = skb_tail_pointer(skb); - err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL); + err = nla_parse_nested(tb, TCA_ACT_MAX, nla, NULL, NULL); if (err < 0) goto err_out; @@ -921,7 +955,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n, struct tc_action *act; LIST_HEAD(actions); - ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL); + ret = nla_parse_nested(tb, TCA_ACT_MAX_PRIO, nla, NULL, NULL); if (ret < 0) return ret; @@ -993,7 +1027,8 @@ static int tcf_action_add(struct net *net, struct nlattr *nla, return tcf_add_notify(net, n, &actions, portid); } -static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n) +static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_ACT_MAX + 1]; @@ -1004,7 +1039,8 @@ static int tc_ctl_action(struct sk_buff *skb, struct nlmsghdr *n) !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; - ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL); + ret = nlmsg_parse(n, sizeof(struct tcamsg), tca, TCA_ACT_MAX, NULL, + extack); if (ret < 0) return ret; @@ -1051,19 +1087,20 @@ static struct nlattr *find_dump_kind(const struct nlmsghdr *n) struct nlattr *nla[TCAA_MAX + 1]; struct nlattr *kind; - if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX, NULL) < 0) + if (nlmsg_parse(n, sizeof(struct tcamsg), nla, TCAA_MAX, + NULL, NULL) < 0) return NULL; tb1 = nla[TCA_ACT_TAB]; if (tb1 == NULL) return NULL; if (nla_parse(tb, TCA_ACT_MAX_PRIO, nla_data(tb1), - NLMSG_ALIGN(nla_len(tb1)), NULL) < 0) + NLMSG_ALIGN(nla_len(tb1)), NULL, NULL) < 0) return NULL; if (tb[1] == NULL) return NULL; - if (nla_parse_nested(tb2, TCA_ACT_MAX, tb[1], NULL) < 0) + if (nla_parse_nested(tb2, TCA_ACT_MAX, tb[1], NULL, NULL) < 0) return NULL; kind = tb2[TCA_ACT_KIND]; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index 
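The TC_ACT_JUMP handling added to tcf_action_exec() above treats the low bits of the verdict as a count of actions to skip, restarts the walk from the top, and burns one unit of TTL per jump so a faulty action graph cannot loop forever. A self-contained model of that control flow (constants mirror the patch; the verdict table is invented):

#include <stdio.h>

#define MAX_PRIO    32          /* TCA_ACT_MAX_PRIO */
#define PRIO_MASK   0x1FF       /* TCA_ACT_MAX_PRIO_MASK */
#define ACT_OK      0
#define ACT_PIPE    3
#define ACT_JUMP    0x10000000  /* extended-opcode bit, as in TC_ACT_JUMP */

static int exec_actions(const int *verdict, int n)
{
    unsigned int jmp_prgcnt = 0, jmp_ttl = MAX_PRIO;
    int ret = ACT_OK, i;

restart:
    for (i = 0; i < n; i++) {
        if (jmp_prgcnt > 0) {
            jmp_prgcnt--;       /* still skipping jumped-over actions */
            continue;
        }
        ret = verdict[i];
        if (ret & ACT_JUMP) {
            jmp_prgcnt = ret & PRIO_MASK;
            if (!jmp_prgcnt || jmp_prgcnt > (unsigned int)n)
                return ACT_OK;  /* faulty opcode, stop pipeline */
            if (--jmp_ttl > 0)
                goto restart;
            return ACT_OK;      /* faulty graph, stop pipeline */
        }
        if (ret != ACT_PIPE)
            break;
    }
    return ret;
}

int main(void)
{
    /* action 0 jumps over the next two slots to index 2 */
    int verdict[] = { ACT_JUMP | 2, ACT_PIPE, ACT_OK };

    printf("verdict %d\n", exec_actions(verdict, 3));    /* 0 == OK */
    return 0;
}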
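The cookie rework in the same file reorders the lifecycle: the user-supplied blob is duplicated before the action is created and only swapped in, freeing any previous cookie, once everything else has succeeded, so no error path can leave a half-built cookie on the action. The copy-then-swap shape, reduced to plain C:

#include <stdlib.h>
#include <string.h>

struct cookie {
    void *data;
    size_t len;
};

static struct cookie *cookie_dup(const void *src, size_t len)
{
    struct cookie *c = calloc(1, sizeof(*c));

    if (!c)
        return NULL;
    c->data = malloc(len);
    if (!c->data) {
        free(c);
        return NULL;    /* nothing half-built escapes */
    }
    memcpy(c->data, src, len);
    c->len = len;
    return c;
}

static void cookie_replace(struct cookie **slot, struct cookie *fresh)
{
    if (*slot) {
        free((*slot)->data);
        free(*slot);
    }
    *slot = fresh;      /* old copy gone, new one live */
}

int main(void)
{
    struct cookie *live = NULL;

    cookie_replace(&live, cookie_dup("blob", 4));
    cookie_replace(&live, cookie_dup("blob2", 5));  /* frees the old one */
    cookie_replace(&live, NULL);
    return 0;
}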
520baa41cba3a8d24a659b67c5a5ed8a44129f37..d33947d6e9d017a05b704d233ec97f73abb2ab74 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -283,7 +283,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, if (!nla) return -EINVAL; - ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy); + ret = nla_parse_nested(tb, TCA_ACT_BPF_MAX, nla, act_bpf_policy, NULL); if (ret < 0) return ret; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index f9bb43c25697e70d18fe9bbba90f6e98dfe05759..2155bc6c6a1e79049de9dc9c8e611b592e42a77f 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -109,7 +109,8 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, if (!nla) return -EINVAL; - ret = nla_parse_nested(tb, TCA_CONNMARK_MAX, nla, connmark_policy); + ret = nla_parse_nested(tb, TCA_CONNMARK_MAX, nla, connmark_policy, + NULL); if (ret < 0) return ret; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index e978ccd4402cbc68ba1c46e20909a047978df1c2..ab6fdbd34db774bdb9d8a958ead2c808a3405d13 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -59,7 +59,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy); + err = nla_parse_nested(tb, TCA_CSUM_MAX, nla, csum_policy, NULL); if (err < 0) return err; @@ -181,6 +181,9 @@ static int tcf_csum_ipv4_tcp(struct sk_buff *skb, unsigned int ihl, struct tcphdr *tcph; const struct iphdr *iph; + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) + return 1; + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); if (tcph == NULL) return 0; @@ -202,6 +205,9 @@ static int tcf_csum_ipv6_tcp(struct sk_buff *skb, unsigned int ihl, struct tcphdr *tcph; const struct ipv6hdr *ip6h; + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_TCPV6) + return 1; + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); if (tcph == NULL) return 0; @@ -225,6 +231,9 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, unsigned int ihl, const struct iphdr *iph; u16 ul; + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + return 1; + /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, @@ -278,6 +287,9 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, unsigned int ihl, const struct ipv6hdr *ip6h; u16 ul; + if (skb_is_gso(skb) && skb_shinfo(skb)->gso_type & SKB_GSO_UDP) + return 1; + /* * Support both UDP and UDPLITE checksum algorithms, Don't use * udph->len to get the real length without any protocol check, diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index e6c874a2b283f6e265b61a39f1fcd053389b9051..99afe8b1f1fb015584f303e7352da8b6171774eb 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -73,7 +73,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy); + err = nla_parse_nested(tb, TCA_GACT_MAX, nla, gact_policy, NULL); if (err < 0) return err; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 71e7ff22f7c92a86cacad9a1b8d18d3d726f52fb..c5dec308b8b1eb29505fed1b9a71b3ef70f5e91a 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -443,7 +443,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, int ret = 0; int err; - err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy); + err = nla_parse_nested(tb, 
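The act_csum guards added above make the TCP/UDP fixups a no-op for GSO packets, whose checksums are finalised when the super-packet is segmented, so recomputing them here would be wasted work at best. The guard in isolation (the flag values are stand-ins for the real SKB_GSO_* bits):

#include <stdbool.h>

#define GSO_TCPV4   (1u << 0)
#define GSO_UDP     (1u << 1)

/* Return true ("success, nothing to do") when the packet will be
 * checksummed at segmentation time anyway. */
static bool csum_done_by_gso(bool is_gso, unsigned int gso_type,
                             unsigned int wanted)
{
    return is_gso && (gso_type & wanted);
}

int main(void)
{
    return csum_done_by_gso(true, GSO_TCPV4, GSO_TCPV4) ? 0 : 1;
}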
TCA_IFE_MAX, nla, ife_policy, NULL); if (err < 0) return err; @@ -514,7 +514,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, if (tb[TCA_IFE_METALST]) { err = nla_parse_nested(tb2, IFE_META_MAX, tb[TCA_IFE_METALST], - NULL); + NULL, NULL); if (err) { metadata_parse_err: if (exists) @@ -603,8 +603,8 @@ static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind, return -1; } -int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife, - u16 metaid, u16 mlen, void *mdata) +static int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife, + u16 metaid, u16 mlen, void *mdata) { struct tcf_meta_info *e; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 992ef8d624f11819e032411e72ea60aa6aa93ba1..36f0ced9e60c03297e195135ca5a8a53d1a3a27b 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -107,7 +107,7 @@ static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy); + err = nla_parse_nested(tb, TCA_IPT_MAX, nla, ipt_policy, NULL); if (err < 0) return err; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index af49c7dca8608cebbaa0224976ae7df104e5526f..1b5549ababd4690617b5ff0c3842c6cbb4806a41 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -87,7 +87,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, if (nla == NULL) return -EINVAL; - ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy); + ret = nla_parse_nested(tb, TCA_MIRRED_MAX, nla, mirred_policy, NULL); if (ret < 0) return ret; if (tb[TCA_MIRRED_PARMS] == NULL) diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 9b6aec665495992fd6d63773890b5e897ba51819..9016ab8a0649780a02d8b29f76f98f19c0283e11 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -50,7 +50,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy); + err = nla_parse_nested(tb, TCA_NAT_MAX, nla, nat_policy, NULL); if (err < 0) return err; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index c1310472f620fd44b9ae8a6cfeefef5419cad54e..164b5ac094be6d8bbd7a04aa8aa962a3c693ab44 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -72,7 +72,7 @@ static struct tcf_pedit_key_ex *tcf_pedit_keys_ex_parse(struct nlattr *nla, } err = nla_parse_nested(tb, TCA_PEDIT_KEY_EX_MAX, ka, - pedit_key_ex_policy); + pedit_key_ex_policy, NULL); if (err) goto err_out; @@ -147,7 +147,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy); + err = nla_parse_nested(tb, TCA_PEDIT_MAX, nla, pedit_policy, NULL); if (err < 0) return err; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index 0ba91d1ce99480c4817684dbe0b4fde61811e03e..f42008b293112d1a6da2bcf6ef8af564b382ba39 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -90,7 +90,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy); + err = nla_parse_nested(tb, TCA_POLICE_MAX, nla, police_policy, NULL); if (err < 0) return err; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 0b8217b4763f59d2d4a71684080536b080ab27e1..59d6645a4007818656376b02a9c5a458e028420b 100644 --- a/net/sched/act_sample.c +++ 
b/net/sched/act_sample.c @@ -50,7 +50,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, if (!nla) return -EINVAL; - ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy); + ret = nla_parse_nested(tb, TCA_SAMPLE_MAX, nla, sample_policy, NULL); if (ret < 0) return ret; if (!tb[TCA_SAMPLE_PARMS] || !tb[TCA_SAMPLE_RATE] || diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 823a73ad0c602b8e079be04934f8a57d53480da1..43605e7ce05107adb79bd1dbc562f4961eb20e51 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -94,7 +94,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy); + err = nla_parse_nested(tb, TCA_DEF_MAX, nla, simple_policy, NULL); if (err < 0) return err; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index 06ccae3c12eecf85383105489320537d52c4d948..6b3e65d7de0c2e81240618e4500ee894819df833 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -82,7 +82,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, if (nla == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy); + err = nla_parse_nested(tb, TCA_SKBEDIT_MAX, nla, skbedit_policy, NULL); if (err < 0) return err; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index c736627f8f4a0e0ff86db535ec95459a417e4ada..a73c4bbcada293b2923a41572104d6f9362d37d9 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -103,7 +103,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, if (!nla) return -EINVAL; - err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy); + err = nla_parse_nested(tb, TCA_SKBMOD_MAX, nla, skbmod_policy, NULL); if (err < 0) return err; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index e3a58e02119877d237f9aa18c5f9d9fdfb44d0c9..b9a2f241a5b3adca04a5d162569f7908c01c2912 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -89,7 +89,8 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, if (!nla) return -EINVAL; - err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy); + err = nla_parse_nested(tb, TCA_TUNNEL_KEY_MAX, nla, tunnel_key_policy, + NULL); if (err < 0) return err; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 19e0dba305ce8101d9db33ff0a1eaab849a730d0..13ba3a89f675d7d5ddaeab893a9d548be735e39e 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -121,7 +121,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, if (!nla) return -EINVAL; - err = nla_parse_nested(tb, TCA_VLAN_MAX, nla, vlan_policy); + err = nla_parse_nested(tb, TCA_VLAN_MAX, nla, vlan_policy, NULL); if (err < 0) return err; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 732f7cae459d4656aa8d69020f44a66abb501f34..22f88b35a5463e1d896271f1e1b05675bcaa1955 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -178,14 +178,11 @@ static struct tcf_proto *tcf_proto_create(const char *kind, u32 protocol, return ERR_PTR(err); } -static bool tcf_proto_destroy(struct tcf_proto *tp, bool force) +static void tcf_proto_destroy(struct tcf_proto *tp) { - if (tp->ops->destroy(tp, force)) { - module_put(tp->ops->owner); - kfree_rcu(tp, rcu); - return true; - } - return false; + tp->ops->destroy(tp); + module_put(tp->ops->owner); + kfree_rcu(tp, rcu); } void tcf_destroy_chain(struct tcf_proto __rcu **fl) @@ -194,14 +191,15 @@ void 
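Most of the churn across these act_* files is mechanical: the nla_parse()/nla_parse_nested()/nlmsg_parse() family grew a trailing struct netlink_ext_ack * argument so validation failures can carry a human-readable reason back to userspace, and these callers all pass NULL for now. The idea behind extended acks, modelled in plain C (the kernel struct carries more than a message):

#include <stdio.h>

struct ext_ack {
    const char *msg;    /* filled in on failure, if non-NULL */
};

static int parse_attrs(int malformed, struct ext_ack *extack)
{
    if (malformed) {
        if (extack)
            extack->msg = "attribute failed policy validation";
        return -1;
    }
    return 0;
}

int main(void)
{
    struct ext_ack ea = { 0 };

    if (parse_attrs(1, &ea) && ea.msg)
        fprintf(stderr, "netlink error: %s\n", ea.msg);
    return 0;
}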
tcf_destroy_chain(struct tcf_proto __rcu **fl) while ((tp = rtnl_dereference(*fl)) != NULL) { RCU_INIT_POINTER(*fl, tp->next); - tcf_proto_destroy(tp, true); + tcf_proto_destroy(tp); } } EXPORT_SYMBOL(tcf_destroy_chain); /* Add/change/delete/get a filter node */ -static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) +static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *tca[TCA_MAX + 1]; @@ -229,7 +227,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) replay: tp_created = 0; - err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL); + err = nlmsg_parse(n, sizeof(*t), tca, TCA_MAX, NULL, extack); if (err < 0) return err; @@ -360,7 +358,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) RCU_INIT_POINTER(*back, next); tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER, false); - tcf_proto_destroy(tp, true); + tcf_proto_destroy(tp); err = 0; goto errout; } @@ -371,24 +369,28 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) goto errout; } } else { + bool last; + switch (n->nlmsg_type) { case RTM_NEWTFILTER: if (n->nlmsg_flags & NLM_F_EXCL) { if (tp_created) - tcf_proto_destroy(tp, true); + tcf_proto_destroy(tp); err = -EEXIST; goto errout; } break; case RTM_DELTFILTER: - err = tp->ops->delete(tp, fh); + err = tp->ops->delete(tp, fh, &last); if (err) goto errout; next = rtnl_dereference(tp->next); tfilter_notify(net, skb, n, tp, t->tcm_handle, RTM_DELTFILTER, false); - if (tcf_proto_destroy(tp, false)) + if (last) { RCU_INIT_POINTER(*back, next); + tcf_proto_destroy(tp); + } goto errout; case RTM_GETTFILTER: err = tfilter_notify(net, skb, n, tp, fh, @@ -410,7 +412,7 @@ static int tc_ctl_tfilter(struct sk_buff *skb, struct nlmsghdr *n) tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false); } else { if (tp_created) - tcf_proto_destroy(tp, true); + tcf_proto_destroy(tp); } errout: diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 5877f6061b57589ce9ba9226281e85f47eafb523..c4fd63a068f98b1288bacebd5f352af6037362b9 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -93,30 +93,28 @@ static void basic_delete_filter(struct rcu_head *head) kfree(f); } -static bool basic_destroy(struct tcf_proto *tp, bool force) +static void basic_destroy(struct tcf_proto *tp) { struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f, *n; - if (!force && !list_empty(&head->flist)) - return false; - list_for_each_entry_safe(f, n, &head->flist, link) { list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, basic_delete_filter); } kfree_rcu(head, rcu); - return true; } -static int basic_delete(struct tcf_proto *tp, unsigned long arg) +static int basic_delete(struct tcf_proto *tp, unsigned long arg, bool *last) { + struct basic_head *head = rtnl_dereference(tp->root); struct basic_filter *f = (struct basic_filter *) arg; list_del_rcu(&f->link); tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, basic_delete_filter); + *last = list_empty(&head->flist); return 0; } @@ -174,7 +172,7 @@ static int basic_change(struct net *net, struct sk_buff *in_skb, return -EINVAL; err = nla_parse_nested(tb, TCA_BASIC_MAX, tca[TCA_OPTIONS], - basic_policy); + basic_policy, NULL); if (err < 0) return err; diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 80f688436dd70bae84f0c2fffd3d16c4b1d2c4da..5ebeae996e6327023bfce0307cd6b6a07c0860c2 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c 
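The cls_api change above sets the new contract that the rest of the classifiers in this series implement: each ->delete() now reports through *last whether it removed the final filter, and only then does the caller unlink and destroy the tcf_proto, replacing the old ->destroy(force) guessing game. The shape of that contract in outline (the list handling is illustrative):

#include <stdbool.h>
#include <stddef.h>

struct filter { struct filter *next; };
struct proto  { struct filter *head; };

static int proto_delete(struct proto *p, struct filter *f, bool *last)
{
    struct filter **fp;

    for (fp = &p->head; *fp; fp = &(*fp)->next) {
        if (*fp == f) {
            *fp = f->next;      /* unlink the filter */
            break;
        }
    }
    *last = (p->head == NULL);  /* caller destroys proto if true */
    return 0;
}

int main(void)
{
    struct filter f = { NULL };
    struct proto p = { &f };
    bool last;

    proto_delete(&p, &f, &last);
    return last ? 0 : 1;        /* last == true here */
}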
@@ -274,25 +274,24 @@ static void __cls_bpf_delete(struct tcf_proto *tp, struct cls_bpf_prog *prog) call_rcu(&prog->rcu, cls_bpf_delete_prog_rcu); } -static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg) +static int cls_bpf_delete(struct tcf_proto *tp, unsigned long arg, bool *last) { + struct cls_bpf_head *head = rtnl_dereference(tp->root); + __cls_bpf_delete(tp, (struct cls_bpf_prog *) arg); + *last = list_empty(&head->plist); return 0; } -static bool cls_bpf_destroy(struct tcf_proto *tp, bool force) +static void cls_bpf_destroy(struct tcf_proto *tp) { struct cls_bpf_head *head = rtnl_dereference(tp->root); struct cls_bpf_prog *prog, *tmp; - if (!force && !list_empty(&head->plist)) - return false; - list_for_each_entry_safe(prog, tmp, &head->plist, link) __cls_bpf_delete(tp, prog); kfree_rcu(head, rcu); - return true; } static unsigned long cls_bpf_get(struct tcf_proto *tp, u32 handle) @@ -478,7 +477,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, if (tca[TCA_OPTIONS] == NULL) return -EINVAL; - ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy); + ret = nla_parse_nested(tb, TCA_BPF_MAX, tca[TCA_OPTIONS], bpf_policy, + NULL); if (ret < 0) return ret; diff --git a/net/sched/cls_cgroup.c b/net/sched/cls_cgroup.c index c1f20077837f06bb1998f5621c797e575e10ca21..12ce547eea04dfb57d3f47540b57e8150a8ebca4 100644 --- a/net/sched/cls_cgroup.c +++ b/net/sched/cls_cgroup.c @@ -99,7 +99,7 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, new->handle = handle; new->tp = tp; err = nla_parse_nested(tb, TCA_CGROUP_MAX, tca[TCA_OPTIONS], - cgroup_policy); + cgroup_policy, NULL); if (err < 0) goto errout; @@ -131,20 +131,16 @@ static int cls_cgroup_change(struct net *net, struct sk_buff *in_skb, return err; } -static bool cls_cgroup_destroy(struct tcf_proto *tp, bool force) +static void cls_cgroup_destroy(struct tcf_proto *tp) { struct cls_cgroup_head *head = rtnl_dereference(tp->root); - if (!force) - return false; /* Head can still be NULL due to cls_cgroup_init(). 
*/ if (head) call_rcu(&head->rcu, cls_cgroup_destroy_rcu); - - return true; } -static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg) +static int cls_cgroup_delete(struct tcf_proto *tp, unsigned long arg, bool *last) { return -EOPNOTSUPP; } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 3d6b9286c203f298b14b5254e5c12cb4781eb4b1..3065752b9cda59e4cb2555acfe2ae26722152ffc 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -400,7 +400,7 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy); + err = nla_parse_nested(tb, TCA_FLOW_MAX, opt, flow_policy, NULL); if (err < 0) return err; @@ -508,9 +508,8 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, get_random_bytes(&fnew->hashrnd, 4); } - fnew->perturb_timer.function = flow_perturbation; - fnew->perturb_timer.data = (unsigned long)fnew; - init_timer_deferrable(&fnew->perturb_timer); + setup_deferrable_timer(&fnew->perturb_timer, flow_perturbation, + (unsigned long)fnew); tcf_exts_change(tp, &fnew->exts, &e); tcf_em_tree_change(tp, &fnew->ematches, &t); @@ -563,12 +562,14 @@ static int flow_change(struct net *net, struct sk_buff *in_skb, return err; } -static int flow_delete(struct tcf_proto *tp, unsigned long arg) +static int flow_delete(struct tcf_proto *tp, unsigned long arg, bool *last) { + struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f = (struct flow_filter *)arg; list_del_rcu(&f->list); call_rcu(&f->rcu, flow_destroy_filter); + *last = list_empty(&head->filters); return 0; } @@ -584,20 +585,16 @@ static int flow_init(struct tcf_proto *tp) return 0; } -static bool flow_destroy(struct tcf_proto *tp, bool force) +static void flow_destroy(struct tcf_proto *tp) { struct flow_head *head = rtnl_dereference(tp->root); struct flow_filter *f, *next; - if (!force && !list_empty(&head->filters)) - return false; - list_for_each_entry_safe(f, next, &head->filters, list) { list_del_rcu(&f->list); call_rcu(&f->rcu, flow_destroy_filter); } kfree_rcu(head, rcu); - return true; } static unsigned long flow_get(struct tcf_proto *tp, u32 handle) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 9d0c99d2e9fbcc35aee5a51274cf9fe7f542a14f..ca526c0881bd6b4bcff19835befb537f45c6d5ff 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -47,6 +48,7 @@ struct fl_flow_key { struct flow_dissector_key_ipv6_addrs enc_ipv6; }; struct flow_dissector_key_ports enc_tp; + struct flow_dissector_key_mpls mpls; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. 
*/ struct fl_flow_mask_range { @@ -328,21 +330,16 @@ static void fl_destroy_rcu(struct rcu_head *rcu) schedule_work(&head->work); } -static bool fl_destroy(struct tcf_proto *tp, bool force) +static void fl_destroy(struct tcf_proto *tp) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f, *next; - if (!force && !list_empty(&head->filters)) - return false; - list_for_each_entry_safe(f, next, &head->filters, list) __fl_delete(tp, f); __module_get(THIS_MODULE); call_rcu(&head->rcu, fl_destroy_rcu); - - return true; } static unsigned long fl_get(struct tcf_proto *tp, u32 handle) @@ -423,6 +420,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_ARP_SHA_MASK] = { .len = ETH_ALEN }, [TCA_FLOWER_KEY_ARP_THA] = { .len = ETH_ALEN }, [TCA_FLOWER_KEY_ARP_THA_MASK] = { .len = ETH_ALEN }, + [TCA_FLOWER_KEY_MPLS_TTL] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_BOS] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_TC] = { .type = NLA_U8 }, + [TCA_FLOWER_KEY_MPLS_LABEL] = { .type = NLA_U32 }, }; static void fl_set_key_val(struct nlattr **tb, @@ -438,6 +439,41 @@ static void fl_set_key_val(struct nlattr **tb, memcpy(mask, nla_data(tb[mask_type]), len); } +static int fl_set_key_mpls(struct nlattr **tb, + struct flow_dissector_key_mpls *key_val, + struct flow_dissector_key_mpls *key_mask) +{ + if (tb[TCA_FLOWER_KEY_MPLS_TTL]) { + key_val->mpls_ttl = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TTL]); + key_mask->mpls_ttl = MPLS_TTL_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_BOS]) { + u8 bos = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_BOS]); + + if (bos & ~MPLS_BOS_MASK) + return -EINVAL; + key_val->mpls_bos = bos; + key_mask->mpls_bos = MPLS_BOS_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_TC]) { + u8 tc = nla_get_u8(tb[TCA_FLOWER_KEY_MPLS_TC]); + + if (tc & ~MPLS_TC_MASK) + return -EINVAL; + key_val->mpls_tc = tc; + key_mask->mpls_tc = MPLS_TC_MASK; + } + if (tb[TCA_FLOWER_KEY_MPLS_LABEL]) { + u32 label = nla_get_u32(tb[TCA_FLOWER_KEY_MPLS_LABEL]); + + if (label & ~MPLS_LABEL_MASK) + return -EINVAL; + key_val->mpls_label = label; + key_mask->mpls_label = MPLS_LABEL_MASK; + } + return 0; +} + static void fl_set_key_vlan(struct nlattr **tb, struct flow_dissector_key_vlan *key_val, struct flow_dissector_key_vlan *key_mask) @@ -594,6 +630,11 @@ static int fl_set_key(struct net *net, struct nlattr **tb, &mask->icmp.code, TCA_FLOWER_KEY_ICMPV6_CODE_MASK, sizeof(key->icmp.code)); + } else if (key->basic.n_proto == htons(ETH_P_MPLS_UC) || + key->basic.n_proto == htons(ETH_P_MPLS_MC)) { + ret = fl_set_key_mpls(tb, &key->mpls, &mask->mpls); + if (ret) + return ret; } else if (key->basic.n_proto == htons(ETH_P_ARP) || key->basic.n_proto == htons(ETH_P_RARP)) { fl_set_key_val(tb, &key->arp.sip, TCA_FLOWER_KEY_ARP_SIP, @@ -729,6 +770,8 @@ static void fl_init_dissector(struct cls_fl_head *head, FLOW_DISSECTOR_KEY_ICMP, icmp); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_ARP, arp); + FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, + FLOW_DISSECTOR_KEY_MPLS, mpls); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, FLOW_DISSECTOR_KEY_VLAN, vlan); FL_KEY_SET_IF_MASKED(&mask->key, keys, cnt, @@ -848,7 +891,8 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (!tb) return -ENOBUFS; - err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], fl_policy); + err = nla_parse_nested(tb, TCA_FLOWER_MAX, tca[TCA_OPTIONS], + fl_policy, NULL); if (err < 0) goto errout_tb; @@ -946,7 +990,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, return err; } -static int 
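fl_set_key_mpls() in the flower hunks above accepts an MPLS match field only if it fits its on-wire width: 20 bits of label, 3 of TC, 1 of BOS and 8 of TTL. The same validation standalone (the mask values follow those widths):

#include <stdint.h>

#define MPLS_TTL_MASK   0xffu
#define MPLS_BOS_MASK   0x01u
#define MPLS_TC_MASK    0x07u
#define MPLS_LABEL_MASK 0xfffffu

static int mpls_key_valid(uint32_t label, uint8_t tc, uint8_t bos)
{
    if (label & ~MPLS_LABEL_MASK)
        return 0;   /* label wider than 20 bits */
    if (tc & ~MPLS_TC_MASK)
        return 0;   /* TC wider than 3 bits */
    if (bos & ~MPLS_BOS_MASK)
        return 0;   /* BOS is a single bit */
    return 1;
}

int main(void)
{
    return mpls_key_valid(0xfffff, 7, 1) &&
           !mpls_key_valid(0x100000, 0, 0) ? 0 : 1;
}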
fl_delete(struct tcf_proto *tp, unsigned long arg) +static int fl_delete(struct tcf_proto *tp, unsigned long arg, bool *last) { struct cls_fl_head *head = rtnl_dereference(tp->root); struct cls_fl_filter *f = (struct cls_fl_filter *) arg; @@ -955,6 +999,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg) rhashtable_remove_fast(&head->ht, &f->ht_node, head->ht_params); __fl_delete(tp, f); + *last = list_empty(&head->filters); return 0; } @@ -994,6 +1039,41 @@ static int fl_dump_key_val(struct sk_buff *skb, return 0; } +static int fl_dump_key_mpls(struct sk_buff *skb, + struct flow_dissector_key_mpls *mpls_key, + struct flow_dissector_key_mpls *mpls_mask) +{ + int err; + + if (!memchr_inv(mpls_mask, 0, sizeof(*mpls_mask))) + return 0; + if (mpls_mask->mpls_ttl) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TTL, + mpls_key->mpls_ttl); + if (err) + return err; + } + if (mpls_mask->mpls_tc) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_TC, + mpls_key->mpls_tc); + if (err) + return err; + } + if (mpls_mask->mpls_label) { + err = nla_put_u32(skb, TCA_FLOWER_KEY_MPLS_LABEL, + mpls_key->mpls_label); + if (err) + return err; + } + if (mpls_mask->mpls_bos) { + err = nla_put_u8(skb, TCA_FLOWER_KEY_MPLS_BOS, + mpls_key->mpls_bos); + if (err) + return err; + } + return 0; +} + static int fl_dump_key_vlan(struct sk_buff *skb, struct flow_dissector_key_vlan *vlan_key, struct flow_dissector_key_vlan *vlan_mask) @@ -1099,6 +1179,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, unsigned long fh, sizeof(key->basic.n_proto))) goto nla_put_failure; + if (fl_dump_key_mpls(skb, &key->mpls, &mask->mpls)) + goto nla_put_failure; + if (fl_dump_key_vlan(skb, &key->vlan, &mask->vlan)) goto nla_put_failure; diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 9dc63d54e1672e82cfae5f677a1ba811c72df337..d3885362e017e1b477c6c2e4d8abea4b1e021c53 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -127,20 +127,14 @@ static void fw_delete_filter(struct rcu_head *head) kfree(f); } -static bool fw_destroy(struct tcf_proto *tp, bool force) +static void fw_destroy(struct tcf_proto *tp) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f; int h; if (head == NULL) - return true; - - if (!force) { - for (h = 0; h < HTSIZE; h++) - if (rcu_access_pointer(head->ht[h])) - return false; - } + return; for (h = 0; h < HTSIZE; h++) { while ((f = rtnl_dereference(head->ht[h])) != NULL) { @@ -150,17 +144,17 @@ static bool fw_destroy(struct tcf_proto *tp, bool force) call_rcu(&f->rcu, fw_delete_filter); } } - RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); - return true; } -static int fw_delete(struct tcf_proto *tp, unsigned long arg) +static int fw_delete(struct tcf_proto *tp, unsigned long arg, bool *last) { struct fw_head *head = rtnl_dereference(tp->root); struct fw_filter *f = (struct fw_filter *)arg; struct fw_filter __rcu **fp; struct fw_filter *pfp; + int ret = -EINVAL; + int h; if (head == NULL || f == NULL) goto out; @@ -173,11 +167,21 @@ static int fw_delete(struct tcf_proto *tp, unsigned long arg) RCU_INIT_POINTER(*fp, rtnl_dereference(f->next)); tcf_unbind_filter(tp, &f->res); call_rcu(&f->rcu, fw_delete_filter); - return 0; + ret = 0; + break; } } + + *last = true; + for (h = 0; h < HTSIZE; h++) { + if (rcu_access_pointer(head->ht[h])) { + *last = false; + break; + } + } + out: - return -EINVAL; + return ret; } static const struct nla_policy fw_policy[TCA_FW_MAX + 1] = { @@ -250,7 +254,7 @@ static int fw_change(struct net *net, struct sk_buff *in_skb, if 
(!opt) return handle ? -EINVAL : 0; /* Succeed if it is old method. */ - err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy); + err = nla_parse_nested(tb, TCA_FW_MAX, opt, fw_policy, NULL); if (err < 0) return err; diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 224eb2c143462a39f3a05e78203344e44e6eb578..dee469fed9671d518dbeddd6bd48d96cc9158675 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -90,19 +90,18 @@ static void mall_destroy_hw_filter(struct tcf_proto *tp, &offload); } -static bool mall_destroy(struct tcf_proto *tp, bool force) +static void mall_destroy(struct tcf_proto *tp) { struct cls_mall_head *head = rtnl_dereference(tp->root); struct net_device *dev = tp->q->dev_queue->dev; if (!head) - return true; + return; if (tc_should_offload(dev, tp, head->flags)) mall_destroy_hw_filter(tp, head, (unsigned long) head); call_rcu(&head->rcu, mall_destroy_rcu); - return true; } static unsigned long mall_get(struct tcf_proto *tp, u32 handle) @@ -161,8 +160,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, if (head) return -EEXIST; - err = nla_parse_nested(tb, TCA_MATCHALL_MAX, - tca[TCA_OPTIONS], mall_policy); + err = nla_parse_nested(tb, TCA_MATCHALL_MAX, tca[TCA_OPTIONS], + mall_policy, NULL); if (err < 0) return err; @@ -204,8 +203,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, *arg = (unsigned long) head; rcu_assign_pointer(tp->root, new); - if (head) - call_rcu(&head->rcu, mall_destroy_rcu); + call_rcu(&head->rcu, mall_destroy_rcu); return 0; err_replace_hw_filter: @@ -216,7 +214,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, return err; } -static int mall_delete(struct tcf_proto *tp, unsigned long arg) +static int mall_delete(struct tcf_proto *tp, unsigned long arg, bool *last) { return -EOPNOTSUPP; } diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 455fc8f83d0ae0ab48ba97fc2a11ce99e8e00784..d63d5502ee020350e72b818c5607000a21df915a 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -140,8 +140,6 @@ static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, goto failure; id = dst->tclassid; - if (head == NULL) - goto old_method; iif = inet_iif(skb); @@ -194,15 +192,6 @@ static int route4_classify(struct sk_buff *skb, const struct tcf_proto *tp, route4_set_fastmap(head, id, iif, ROUTE4_FAILURE); failure: return -1; - -old_method: - if (id && (TC_H_MAJ(id) == 0 || - !(TC_H_MAJ(id^tp->q->handle)))) { - res->classid = id; - res->class = 0; - return 0; - } - return -1; } static inline u32 to_hash(u32 id) @@ -234,9 +223,6 @@ static unsigned long route4_get(struct tcf_proto *tp, u32 handle) struct route4_filter *f; unsigned int h1, h2; - if (!head) - return 0; - h1 = to_hash(handle); if (h1 > 256) return 0; @@ -276,20 +262,13 @@ static void route4_delete_filter(struct rcu_head *head) kfree(f); } -static bool route4_destroy(struct tcf_proto *tp, bool force) +static void route4_destroy(struct tcf_proto *tp) { struct route4_head *head = rtnl_dereference(tp->root); int h1, h2; if (head == NULL) - return true; - - if (!force) { - for (h1 = 0; h1 <= 256; h1++) { - if (rcu_access_pointer(head->table[h1])) - return false; - } - } + return; for (h1 = 0; h1 <= 256; h1++) { struct route4_bucket *b; @@ -312,12 +291,10 @@ static bool route4_destroy(struct tcf_proto *tp, bool force) kfree_rcu(b, rcu); } } - RCU_INIT_POINTER(tp->root, NULL); kfree_rcu(head, rcu); - return true; } -static int route4_delete(struct tcf_proto *tp, unsigned long arg) 
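fw_delete() above, and route4_delete() just after it, answer the new *last question by rescanning every hash bucket once the filter is unlinked; with at most a few hundred head pointers the scan stays cheap. Reduced to its essentials:

#include <stdbool.h>
#include <stddef.h>

#define HTSIZE 256

static bool table_is_empty(void *ht[HTSIZE])
{
    int h;

    for (h = 0; h < HTSIZE; h++)
        if (ht[h])
            return false;   /* a bucket still holds filters */
    return true;
}

int main(void)
{
    void *ht[HTSIZE] = { NULL };

    return table_is_empty(ht) ? 0 : 1;
}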
+static int route4_delete(struct tcf_proto *tp, unsigned long arg, bool *last) { struct route4_head *head = rtnl_dereference(tp->root); struct route4_filter *f = (struct route4_filter *)arg; @@ -325,7 +302,7 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg) struct route4_filter *nf; struct route4_bucket *b; unsigned int h = 0; - int i; + int i, h1; if (!head || !f) return -EINVAL; @@ -356,16 +333,25 @@ static int route4_delete(struct tcf_proto *tp, unsigned long arg) rt = rtnl_dereference(b->ht[i]); if (rt) - return 0; + goto out; } /* OK, session has no flows */ RCU_INIT_POINTER(head->table[to_hash(h)], NULL); kfree_rcu(b, rcu); + break; + } + } - return 0; +out: + *last = true; + for (h1 = 0; h1 <= 256; h1++) { + if (rcu_access_pointer(head->table[h1])) { + *last = false; + break; } } + return 0; } @@ -489,7 +475,7 @@ static int route4_change(struct net *net, struct sk_buff *in_skb, if (opt == NULL) return handle ? -EINVAL : 0; - err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy); + err = nla_parse_nested(tb, TCA_ROUTE4_MAX, opt, route4_policy, NULL); if (err < 0) return err; diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 322438fb3ffcb426194be6c1dd05893b27cdf51c..0d9d077986992926a79af6236b8840181517679d 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -152,8 +152,6 @@ static int rsvp_classify(struct sk_buff *skb, const struct tcf_proto *tp, return -1; nhptr = ip_hdr(skb); #endif - if (unlikely(!head)) - return -1; restart: #if RSVP_DST_LEN == 4 @@ -302,22 +300,13 @@ static void rsvp_delete_filter(struct tcf_proto *tp, struct rsvp_filter *f) call_rcu(&f->rcu, rsvp_delete_filter_rcu); } -static bool rsvp_destroy(struct tcf_proto *tp, bool force) +static void rsvp_destroy(struct tcf_proto *tp) { struct rsvp_head *data = rtnl_dereference(tp->root); int h1, h2; if (data == NULL) - return true; - - if (!force) { - for (h1 = 0; h1 < 256; h1++) { - if (rcu_access_pointer(data->ht[h1])) - return false; - } - } - - RCU_INIT_POINTER(tp->root, NULL); + return; for (h1 = 0; h1 < 256; h1++) { struct rsvp_session *s; @@ -337,10 +326,9 @@ static bool rsvp_destroy(struct tcf_proto *tp, bool force) } } kfree_rcu(data, rcu); - return true; } -static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) +static int rsvp_delete(struct tcf_proto *tp, unsigned long arg, bool *last) { struct rsvp_head *head = rtnl_dereference(tp->root); struct rsvp_filter *nfp, *f = (struct rsvp_filter *)arg; @@ -348,7 +336,7 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) unsigned int h = f->handle; struct rsvp_session __rcu **sp; struct rsvp_session *nsp, *s = f->sess; - int i; + int i, h1; fp = &s->ht[(h >> 8) & 0xFF]; for (nfp = rtnl_dereference(*fp); nfp; @@ -361,7 +349,7 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) for (i = 0; i <= 16; i++) if (s->ht[i]) - return 0; + goto out; /* OK, session has no flows */ sp = &head->ht[h & 0xFF]; @@ -370,13 +358,23 @@ static int rsvp_delete(struct tcf_proto *tp, unsigned long arg) if (nsp == s) { RCU_INIT_POINTER(*sp, s->next); kfree_rcu(s, rcu); - return 0; + goto out; } } - return 0; + break; } } + +out: + *last = true; + for (h1 = 0; h1 < 256; h1++) { + if (rcu_access_pointer(head->ht[h1])) { + *last = false; + break; + } + } + return 0; } @@ -484,7 +482,7 @@ static int rsvp_change(struct net *net, struct sk_buff *in_skb, if (opt == NULL) return handle ? 
-EINVAL : 0; - err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy); + err = nla_parse_nested(tb, TCA_RSVP_MAX, opt, rsvp_policy, NULL); if (err < 0) return err; diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 0751245a6aced60163601b94ab1386e0c5e30565..8a8a58357c39b8e38c2d3740aaa93ed67866fe54 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -150,7 +150,7 @@ static void tcindex_destroy_fexts(struct rcu_head *head) kfree(f); } -static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) +static int tcindex_delete(struct tcf_proto *tp, unsigned long arg, bool *last) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcindex_filter_result *r = (struct tcindex_filter_result *) arg; @@ -186,6 +186,8 @@ static int tcindex_delete(struct tcf_proto *tp, unsigned long arg) call_rcu(&f->rcu, tcindex_destroy_fexts); else call_rcu(&r->rcu, tcindex_destroy_rexts); + + *last = false; return 0; } @@ -193,7 +195,9 @@ static int tcindex_destroy_element(struct tcf_proto *tp, unsigned long arg, struct tcf_walker *walker) { - return tcindex_delete(tp, arg); + bool last; + + return tcindex_delete(tp, arg, &last); } static void __tcindex_destroy(struct rcu_head *head) @@ -482,7 +486,7 @@ tcindex_change(struct net *net, struct sk_buff *in_skb, if (!opt) return 0; - err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy); + err = nla_parse_nested(tb, TCA_TCINDEX_MAX, opt, tcindex_policy, NULL); if (err < 0) return err; @@ -529,14 +533,11 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker) } } -static bool tcindex_destroy(struct tcf_proto *tp, bool force) +static void tcindex_destroy(struct tcf_proto *tp) { struct tcindex_data *p = rtnl_dereference(tp->root); struct tcf_walker walker; - if (!force) - return false; - pr_debug("tcindex_destroy(tp %p),p %p\n", tp, p); walker.count = 0; walker.skip = 0; @@ -544,7 +545,6 @@ static bool tcindex_destroy(struct tcf_proto *tp, bool force) tcindex_walk(tp, &walker); call_rcu(&p->rcu, __tcindex_destroy); - return true; } diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 4dbe0c680fe6363a88ca47cb167dcae9327920d1..d20e72a095d578e65eda0dafc62217146aeea1c1 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -585,37 +585,13 @@ static bool ht_empty(struct tc_u_hnode *ht) return true; } -static bool u32_destroy(struct tcf_proto *tp, bool force) +static void u32_destroy(struct tcf_proto *tp) { struct tc_u_common *tp_c = tp->data; struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); WARN_ON(root_ht == NULL); - if (!force) { - if (root_ht) { - if (root_ht->refcnt > 1) - return false; - if (root_ht->refcnt == 1) { - if (!ht_empty(root_ht)) - return false; - } - } - - if (tp_c->refcnt > 1) - return false; - - if (tp_c->refcnt == 1) { - struct tc_u_hnode *ht; - - for (ht = rtnl_dereference(tp_c->hlist); - ht; - ht = rtnl_dereference(ht->next)) - if (!ht_empty(ht)) - return false; - } - } - if (root_ht && --root_ht->refcnt == 0) u32_destroy_hnode(tp, root_ht); @@ -640,20 +616,22 @@ static bool u32_destroy(struct tcf_proto *tp, bool force) } tp->data = NULL; - return true; } -static int u32_delete(struct tcf_proto *tp, unsigned long arg) +static int u32_delete(struct tcf_proto *tp, unsigned long arg, bool *last) { struct tc_u_hnode *ht = (struct tc_u_hnode *)arg; struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); + struct tc_u_common *tp_c = tp->data; + int ret = 0; if (ht == NULL) - return 0; + goto out; if (TC_U32_KEY(ht->handle)) { u32_remove_hw_knode(tp, 
ht->handle); - return u32_delete_key(tp, (struct tc_u_knode *)ht); + ret = u32_delete_key(tp, (struct tc_u_knode *)ht); + goto out; } if (root_ht == ht) @@ -666,7 +644,40 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg) return -EBUSY; } - return 0; +out: + *last = true; + if (root_ht) { + if (root_ht->refcnt > 1) { + *last = false; + goto ret; + } + if (root_ht->refcnt == 1) { + if (!ht_empty(root_ht)) { + *last = false; + goto ret; + } + } + } + + if (tp_c->refcnt > 1) { + *last = false; + goto ret; + } + + if (tp_c->refcnt == 1) { + struct tc_u_hnode *ht; + + for (ht = rtnl_dereference(tp_c->hlist); + ht; + ht = rtnl_dereference(ht->next)) + if (!ht_empty(ht)) { + *last = false; + break; + } + } + +ret: + return ret; } #define NR_U32_NODE (1<<12) @@ -860,7 +871,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (opt == NULL) return handle ? -EINVAL : 0; - err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy); + err = nla_parse_nested(tb, TCA_U32_MAX, opt, u32_policy, NULL); if (err < 0) return err; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index ae7e4f5b348b86ad352e0011c8e0d0227a44f1e3..eb0e9bab54c175d706b6f89f354f2dd5b8793a50 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -912,7 +912,7 @@ static int em_meta_change(struct net *net, void *data, int len, struct tcf_meta_hdr *hdr; struct meta_match *meta = NULL; - err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy); + err = nla_parse(tb, TCA_EM_META_MAX, data, len, meta_policy, NULL); if (err < 0) goto errout; diff --git a/net/sched/ematch.c b/net/sched/ematch.c index fbb7ebfc58c6761f6afb58e62646908b10e2bf09..03b677bc07005c7f864077edad6f22ca9dadbfdd 100644 --- a/net/sched/ematch.c +++ b/net/sched/ematch.c @@ -314,7 +314,7 @@ int tcf_em_tree_validate(struct tcf_proto *tp, struct nlattr *nla, if (!nla) return 0; - err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy); + err = nla_parse_nested(tb, TCA_EMATCH_TREE_MAX, nla, em_policy, NULL); if (err < 0) goto errout; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index bcf49cd2278670197f2a7e9d4e9a62ae8d117468..bbe57d57b67fd498692bd41db49147511f1bb091 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -251,6 +251,15 @@ int qdisc_set_default(const char *name) return ops ? 0 : -ENOENT; } +#ifdef CONFIG_NET_SCH_DEFAULT +/* Set default value from kernel config */ +static int __init sch_default_qdisc(void) +{ + return qdisc_set_default(CONFIG_DEFAULT_NET_SCH); +} +late_initcall(sch_default_qdisc); +#endif + /* We know handle. Find qdisc among all qdisc's attached to device * (root qdisc, all its children, children of children etc.) 
* Note: caller either uses rtnl or rcu_read_lock() @@ -274,7 +283,7 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle) return NULL; } -void qdisc_hash_add(struct Qdisc *q) +void qdisc_hash_add(struct Qdisc *q, bool invisible) { if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) { struct Qdisc *root = qdisc_dev(q)->qdisc; @@ -282,6 +291,8 @@ void qdisc_hash_add(struct Qdisc *q) WARN_ON_ONCE(root == &noop_qdisc); ASSERT_RTNL(); hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle); + if (invisible) + q->flags |= TCQ_F_INVISIBLE; } } EXPORT_SYMBOL(qdisc_hash_add); @@ -455,7 +466,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt) u16 *tab = NULL; int err; - err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy); + err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, NULL); if (err < 0) return ERR_PTR(err); if (!tb[TCA_STAB_BASE]) @@ -1003,7 +1014,7 @@ static struct Qdisc *qdisc_create(struct net_device *dev, goto err_out4; } - qdisc_hash_add(sch); + qdisc_hash_add(sch, false); return sch; } @@ -1114,7 +1125,8 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) * Delete/get qdisc. */ -static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n) +static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct tcmsg *tcm = nlmsg_data(n); @@ -1129,7 +1141,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n) !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); if (err < 0) return err; @@ -1183,7 +1195,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n) * Create/change qdisc. */ -static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n) +static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct tcmsg *tcm; @@ -1198,7 +1211,7 @@ static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n) replay: /* Reinit, just in case something touches this. */ - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); if (err < 0) return err; @@ -1401,9 +1414,14 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid, return -1; } -static bool tc_qdisc_dump_ignore(struct Qdisc *q) +static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible) { - return (q->flags & TCQ_F_BUILTIN) ? true : false; + if (q->flags & TCQ_F_BUILTIN) + return true; + if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible) + return true; + + return false; } static int qdisc_notify(struct net *net, struct sk_buff *oskb, @@ -1417,12 +1435,12 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb, if (!skb) return -ENOBUFS; - if (old && !tc_qdisc_dump_ignore(old)) { + if (old && !tc_qdisc_dump_ignore(old, false)) { if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0) goto err_out; } - if (new && !tc_qdisc_dump_ignore(new)) { + if (new && !tc_qdisc_dump_ignore(new, false)) { if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq, old ? 
NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0) goto err_out; @@ -1439,7 +1457,8 @@ static int qdisc_notify(struct net *net, struct sk_buff *oskb, static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, struct netlink_callback *cb, - int *q_idx_p, int s_q_idx, bool recur) + int *q_idx_p, int s_q_idx, bool recur, + bool dump_invisible) { int ret = 0, q_idx = *q_idx_p; struct Qdisc *q; @@ -1452,7 +1471,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, if (q_idx < s_q_idx) { q_idx++; } else { - if (!tc_qdisc_dump_ignore(q) && + if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) @@ -1474,7 +1493,7 @@ static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb, q_idx++; continue; } - if (!tc_qdisc_dump_ignore(q) && + if (!tc_qdisc_dump_ignore(q, dump_invisible) && tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) @@ -1496,12 +1515,21 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) int idx, q_idx; int s_idx, s_q_idx; struct net_device *dev; + const struct nlmsghdr *nlh = cb->nlh; + struct tcmsg *tcm = nlmsg_data(nlh); + struct nlattr *tca[TCA_MAX + 1]; + int err; s_idx = cb->args[0]; s_q_idx = q_idx = cb->args[1]; idx = 0; ASSERT_RTNL(); + + err = nlmsg_parse(nlh, sizeof(*tcm), tca, TCA_MAX, NULL, NULL); + if (err < 0) + return err; + for_each_netdev(net, dev) { struct netdev_queue *dev_queue; @@ -1512,13 +1540,14 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) q_idx = 0; if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx, - true) < 0) + true, tca[TCA_DUMP_INVISIBLE]) < 0) goto done; dev_queue = dev_ingress_queue(dev); if (dev_queue && tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, - &q_idx, s_q_idx, false) < 0) + &q_idx, s_q_idx, false, + tca[TCA_DUMP_INVISIBLE]) < 0) goto done; cont: @@ -1540,7 +1569,8 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) -static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n) +static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct tcmsg *tcm = nlmsg_data(n); @@ -1559,7 +1589,7 @@ static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n) !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) return -EPERM; - err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL); + err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL, extack); if (err < 0) return err; @@ -1762,7 +1792,7 @@ static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb, { struct qdisc_dump_args arg; - if (tc_qdisc_dump_ignore(q) || + if (tc_qdisc_dump_ignore(q, false) || *t_p < s_t || !q->ops->cl_ops || (tcm->tcm_parent && TC_H_MAJ(tcm->tcm_parent) != q->handle)) { diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 2209c2ddacbfb0b1f222cb593477891d7124d9e3..40cbceed4de82aa58d70c2c67782fc65088b25c5 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -214,7 +214,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, if (opt == NULL) return -EINVAL; - error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy); + error = nla_parse_nested(tb, TCA_ATM_MAX, opt, atm_policy, NULL); if (error < 0) return error; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 
d6ca18dc04c3e9e72efedd44088e95118a06b711..7415859fd4c3f65e6ff801b08d683b796d1eaa03 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1137,7 +1137,7 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) struct tc_ratespec *r; int err; - err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL); if (err < 0) return err; @@ -1161,6 +1161,8 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) sch->handle); if (!q->link.q) q->link.q = &noop_qdisc; + else + qdisc_hash_add(q->link.q, true); q->link.priority = TC_CBQ_MAXPRIO - 1; q->link.priority2 = TC_CBQ_MAXPRIO - 1; @@ -1472,7 +1474,7 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy); + err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL); if (err < 0) return err; @@ -1600,6 +1602,9 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); if (!cl->q) cl->q = &noop_qdisc; + else + qdisc_hash_add(cl->q, true); + cl->common.classid = classid; cl->tparent = parent; cl->qdisc = sch; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index 3b86a97bc67c3e953cb181eddcb5c0c16bf3b27f..b30a2c70bd489b36a2295a11707b6a36d4eb9ac0 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -58,7 +58,6 @@ struct choke_sched_data { /* Variables */ struct red_vars vars; - struct tcf_proto __rcu *filter_list; struct { u32 prob_drop; /* Early probability drops */ u32 prob_mark; /* Early probability marks */ @@ -152,11 +151,6 @@ static inline void choke_set_classid(struct sk_buff *skb, u16 classid) choke_skb_cb(skb)->classid = classid; } -static u16 choke_get_classid(const struct sk_buff *skb) -{ - return choke_skb_cb(skb)->classid; -} - /* * Compare flow of two packets * Returns true only if source and destination address and port match. @@ -187,40 +181,6 @@ static bool choke_match_flow(struct sk_buff *skb1, sizeof(choke_skb_cb(skb1)->keys)); } -/* - * Classify flow using either: - * 1. pre-existing classification result in skb - * 2. fast internal classification - * 3. 
use TC filter based classification - */ -static bool choke_classify(struct sk_buff *skb, - struct Qdisc *sch, int *qerr) - -{ - struct choke_sched_data *q = qdisc_priv(sch); - struct tcf_result res; - struct tcf_proto *fl; - int result; - - fl = rcu_dereference_bh(q->filter_list); - result = tc_classify(skb, fl, &res, false); - if (result >= 0) { -#ifdef CONFIG_NET_CLS_ACT - switch (result) { - case TC_ACT_STOLEN: - case TC_ACT_QUEUED: - *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN; - case TC_ACT_SHOT: - return false; - } -#endif - choke_set_classid(skb, TC_H_MIN(res.classid)); - return true; - } - - return false; -} - /* * Select a packet at random from queue * HACK: since queue can have holes from previous deletion; retry several @@ -257,25 +217,15 @@ static bool choke_match_random(const struct choke_sched_data *q, return false; oskb = choke_peek_random(q, pidx); - if (rcu_access_pointer(q->filter_list)) - return choke_get_classid(nskb) == choke_get_classid(oskb); - return choke_match_flow(oskb, nskb); } static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - int ret = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS; struct choke_sched_data *q = qdisc_priv(sch); const struct red_parms *p = &q->parms; - if (rcu_access_pointer(q->filter_list)) { - /* If using external classifiers, get result and record it. */ - if (!choke_classify(skb, sch, &ret)) - goto other_drop; /* Packet was eaten by filter */ - } - choke_skb_cb(skb)->keys_valid = 0; /* Compute average queue usage (see RED) */ q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen); @@ -339,12 +289,6 @@ static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch, congestion_drop: qdisc_drop(skb, sch, to_free); return NET_XMIT_CN; - -other_drop: - if (ret & __NET_XMIT_BYPASS) - qdisc_qstats_drop(sch); - __qdisc_drop(skb, to_free); - return ret; } static struct sk_buff *choke_dequeue(struct Qdisc *sch) @@ -413,7 +357,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy); + err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy, NULL); if (err < 0) return err; @@ -432,10 +376,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt) if (mask != q->tab_mask) { struct sk_buff **ntab; - ntab = kcalloc(mask + 1, sizeof(struct sk_buff *), - GFP_KERNEL | __GFP_NOWARN); - if (!ntab) - ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *)); + ntab = kvmalloc_array((mask + 1), sizeof(struct sk_buff *), GFP_KERNEL | __GFP_ZERO); if (!ntab) return -ENOMEM; @@ -538,7 +479,6 @@ static void choke_destroy(struct Qdisc *sch) { struct choke_sched_data *q = qdisc_priv(sch); - tcf_destroy_chain(&q->filter_list); choke_free(q->tab); } diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 5bfa79ee657cf3674d2b6204aee80f63207b84f9..c518a1efcb9d3bd98d1de0d921bd250ed46bc69c 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -140,7 +140,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy); + err = nla_parse_nested(tb, TCA_CODEL_MAX, opt, codel_policy, NULL); if (err < 0) return err; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index bb4cbdf7500482b170eef6e7923cf2f2259e52b5..58a8c32eab2394f90e5935ad661b6223afee106b 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -76,7 +76,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (!opt) 
return -EINVAL; - err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy); + err = nla_parse_nested(tb, TCA_DRR_MAX, opt, drr_policy, NULL); if (err < 0) return err; @@ -117,6 +117,8 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, &pfifo_qdisc_ops, classid); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; + else + qdisc_hash_add(cl->qdisc, true); if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, &cl->rate_est, diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 5334e309f17f0ef4416dddcdc9278c14ecb5585d..1c0f877f673a73977d3191a78a2964e6c1d41259 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -129,7 +129,7 @@ static int dsmark_change(struct Qdisc *sch, u32 classid, u32 parent, if (!opt) goto errout; - err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy); + err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy, NULL); if (err < 0) goto errout; @@ -342,7 +342,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) goto errout; - err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy); + err = nla_parse_nested(tb, TCA_DSMARK_MAX, opt, dsmark_policy, NULL); if (err < 0) goto errout; @@ -374,6 +374,8 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle); if (p->q == NULL) p->q = &noop_qdisc; + else + qdisc_hash_add(p->q, true); pr_debug("%s: qdisc %p\n", __func__, p->q); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index a4f738ac77283b19927aef0a0e6c9fc8dafcdd42..b488721a0059adb24aea47240afa0164a6e467a9 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -624,16 +624,6 @@ static void fq_rehash(struct fq_sched_data *q, q->stat_gc_flows += fcnt; } -static void *fq_alloc_node(size_t sz, int node) -{ - void *ptr; - - ptr = kmalloc_node(sz, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN, node); - if (!ptr) - ptr = vmalloc_node(sz, node); - return ptr; -} - static void fq_free(void *addr) { kvfree(addr); @@ -650,7 +640,7 @@ static int fq_resize(struct Qdisc *sch, u32 log) return 0; /* If XPS was setup, we can allocate memory on right NUMA node */ - array = fq_alloc_node(sizeof(struct rb_root) << log, + array = kvmalloc_node(sizeof(struct rb_root) << log, GFP_KERNEL | __GFP_REPEAT, netdev_queue_numa_node_read(sch->dev_queue)); if (!array) return -ENOMEM; @@ -698,7 +688,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy); + err = nla_parse_nested(tb, TCA_FQ_MAX, opt, fq_policy, NULL); if (err < 0) return err; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 9f3a884d15903fd9012c01b5eee802e02f9f709e..9201abce928cda39f099fc94d9fc96de1468a947 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -288,7 +288,6 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) struct fq_codel_flow *flow; struct list_head *head; u32 prev_drop_count, prev_ecn_mark; - unsigned int prev_backlog; begin: head = &q->new_flows; @@ -307,7 +306,6 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch) prev_drop_count = q->cstats.drop_count; prev_ecn_mark = q->cstats.ecn_mark; - prev_backlog = sch->qstats.backlog; skb = codel_dequeue(sch, &sch->qstats.backlog, &q->cparams, &flow->cvars, &q->cstats, qdisc_pkt_len, @@ -385,7 +383,8 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, 
opt, fq_codel_policy); + err = nla_parse_nested(tb, TCA_FQ_CODEL_MAX, opt, fq_codel_policy, + NULL); if (err < 0) return err; if (tb[TCA_FQ_CODEL_FLOWS]) { @@ -447,27 +446,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt) return 0; } -static void *fq_codel_zalloc(size_t sz) -{ - void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); - - if (!ptr) - ptr = vzalloc(sz); - return ptr; -} - -static void fq_codel_free(void *addr) -{ - kvfree(addr); -} - static void fq_codel_destroy(struct Qdisc *sch) { struct fq_codel_sched_data *q = qdisc_priv(sch); tcf_destroy_chain(&q->filter_list); - fq_codel_free(q->backlogs); - fq_codel_free(q->flows); + kvfree(q->backlogs); + kvfree(q->flows); } static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) @@ -494,13 +479,13 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt) } if (!q->flows) { - q->flows = fq_codel_zalloc(q->flows_cnt * - sizeof(struct fq_codel_flow)); + q->flows = kvzalloc(q->flows_cnt * + sizeof(struct fq_codel_flow), GFP_KERNEL); if (!q->flows) return -ENOMEM; - q->backlogs = fq_codel_zalloc(q->flows_cnt * sizeof(u32)); + q->backlogs = kvzalloc(q->flows_cnt * sizeof(u32), GFP_KERNEL); if (!q->backlogs) { - fq_codel_free(q->flows); + kvfree(q->flows); return -ENOMEM; } for (i = 0; i < q->flows_cnt; i++) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 1a2f9e964330a5cd0c0b9a5cac91807221b0ffd9..52a2c55f6d9eb29ec1de3135596fe5a1529ebf23 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -795,7 +795,7 @@ static void attach_default_qdiscs(struct net_device *dev) } #ifdef CONFIG_NET_SCHED if (dev->qdisc != &noop_qdisc) - qdisc_hash_add(dev->qdisc); + qdisc_hash_add(dev->qdisc, false); #endif } diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index c78a093c551ae84a0ba5ba13a7af9cc89e2ca383..17c7130454bd90e8af1d17e95f477ea558fb481d 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -401,7 +401,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy); + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL); if (err < 0) return err; @@ -470,7 +470,7 @@ static int gred_init(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy); + err = nla_parse_nested(tb, TCA_GRED_MAX, opt, gred_policy, NULL); if (err < 0) return err; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 3ffaa6fb0990f0aa31487a2f1829b1f1accf8b21..5cb82f6c1b0605d7a4373062dc4232e14347bc22 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -957,7 +957,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_HFSC_MAX, opt, hfsc_policy); + err = nla_parse_nested(tb, TCA_HFSC_MAX, opt, hfsc_policy, NULL); if (err < 0) return err; @@ -1066,6 +1066,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, &pfifo_qdisc_ops, classid); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; + else + qdisc_hash_add(cl->qdisc, true); INIT_LIST_HEAD(&cl->children); cl->vt_tree = RB_ROOT; cl->cf_tree = RB_ROOT; @@ -1425,6 +1427,8 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) sch->handle); if (q->root.qdisc == NULL) q->root.qdisc = &noop_qdisc; + else + qdisc_hash_add(q->root.qdisc, true); INIT_LIST_HEAD(&q->root.children); q->root.vt_tree = RB_ROOT; q->root.cf_tree = RB_ROOT; diff --git 
a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index 2fae8b5f1b80c017c4ae60df54c9143f82de4e9d..51d3ba682af9ba0f69a3f3c3036519bf527cdee0 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -467,29 +467,14 @@ static void hhf_reset(struct Qdisc *sch) rtnl_kfree_skbs(skb, skb); } -static void *hhf_zalloc(size_t sz) -{ - void *ptr = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN); - - if (!ptr) - ptr = vzalloc(sz); - - return ptr; -} - -static void hhf_free(void *addr) -{ - kvfree(addr); -} - static void hhf_destroy(struct Qdisc *sch) { int i; struct hhf_sched_data *q = qdisc_priv(sch); for (i = 0; i < HHF_ARRAYS_CNT; i++) { - hhf_free(q->hhf_arrays[i]); - hhf_free(q->hhf_valid_bits[i]); + kvfree(q->hhf_arrays[i]); + kvfree(q->hhf_valid_bits[i]); } for (i = 0; i < HH_FLOWS_CNT; i++) { @@ -503,7 +488,7 @@ static void hhf_destroy(struct Qdisc *sch) kfree(flow); } } - hhf_free(q->hh_flows); + kvfree(q->hh_flows); } static const struct nla_policy hhf_policy[TCA_HHF_MAX + 1] = { @@ -529,7 +514,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy); + err = nla_parse_nested(tb, TCA_HHF_MAX, opt, hhf_policy, NULL); if (err < 0) return err; @@ -609,8 +594,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) if (!q->hh_flows) { /* Initialize heavy-hitter flow table. */ - q->hh_flows = hhf_zalloc(HH_FLOWS_CNT * - sizeof(struct list_head)); + q->hh_flows = kvzalloc(HH_FLOWS_CNT * + sizeof(struct list_head), GFP_KERNEL); if (!q->hh_flows) return -ENOMEM; for (i = 0; i < HH_FLOWS_CNT; i++) @@ -624,8 +609,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) /* Initialize heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { - q->hhf_arrays[i] = hhf_zalloc(HHF_ARRAYS_LEN * - sizeof(u32)); + q->hhf_arrays[i] = kvzalloc(HHF_ARRAYS_LEN * + sizeof(u32), GFP_KERNEL); if (!q->hhf_arrays[i]) { /* Note: hhf_destroy() will be called * by our caller. @@ -637,8 +622,8 @@ static int hhf_init(struct Qdisc *sch, struct nlattr *opt) /* Initialize valid bits of heavy-hitter filter arrays. */ for (i = 0; i < HHF_ARRAYS_CNT; i++) { - q->hhf_valid_bits[i] = hhf_zalloc(HHF_ARRAYS_LEN / - BITS_PER_BYTE); + q->hhf_valid_bits[i] = kvzalloc(HHF_ARRAYS_LEN / + BITS_PER_BYTE, GFP_KERNEL); if (!q->hhf_valid_bits[i]) { /* Note: hhf_destroy() will be called * by our caller. 
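The sch_hhf.c hunks above, like the choke, fq, fq_codel and netem conversions elsewhere in this series, replace an open-coded kmalloc-with-vmalloc-fallback helper with the kvmalloc() family. A minimal sketch of the pattern, with illustrative names (example_*) that are not part of the patch:

#include <linux/mm.h>	/* kvzalloc(), kvfree() */

/* Rough equivalent of the deleted hhf_zalloc()/hhf_free() helpers. */
static u32 *example_alloc_table(size_t entries)
{
	/* Tries kmalloc() first; falls back to vmalloc() when the
	 * request is too large or memory is too fragmented.
	 */
	return kvzalloc(entries * sizeof(u32), GFP_KERNEL);
}

static void example_free_table(u32 *tab)
{
	kvfree(tab);	/* handles kmalloc()ed and vmalloc()ed memory alike */
}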
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 4cd5fb134bc9e2dbcdd61b51fb951f94301ed54c..570ef3b0c09b7abe49da4787b668397d4413f0e8 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1017,7 +1017,7 @@ static int htb_init(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy); + err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL); if (err < 0) return err; @@ -1342,7 +1342,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, if (!opt) goto failure; - err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy); + err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy, NULL); if (err < 0) goto failure; @@ -1460,6 +1460,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, qdisc_class_hash_insert(&q->clhash, &cl->common); if (parent) parent->children++; + if (cl->un.leaf.q != &noop_qdisc) + qdisc_hash_add(cl->un.leaf.q, true); } else { if (tca[TCA_RATE]) { err = gen_replace_estimator(&cl->bstats, NULL, diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 20b7f1646f69270e08d8b7588759a0146f262e89..cadfdd4f1e521b3d68b8fa62d5797f3ff604651d 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -84,7 +84,7 @@ static void mq_attach(struct Qdisc *sch) qdisc_destroy(old); #ifdef CONFIG_NET_SCHED if (ntx < dev->real_num_tx_queues) - qdisc_hash_add(qdisc); + qdisc_hash_add(qdisc, false); #endif } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index 922683418e53853cb71747d8d30ab0e4a989254b..0a4cf27ea54bd78768d4fa084f7b082460f5f266 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -21,14 +21,13 @@ struct mqprio_sched { struct Qdisc **qdiscs; - int hw_owned; + int hw_offload; }; static void mqprio_destroy(struct Qdisc *sch) { struct net_device *dev = qdisc_dev(sch); struct mqprio_sched *priv = qdisc_priv(sch); - struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO}; unsigned int ntx; if (priv->qdiscs) { @@ -39,10 +38,15 @@ static void mqprio_destroy(struct Qdisc *sch) kfree(priv->qdiscs); } - if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc) + if (priv->hw_offload && dev->netdev_ops->ndo_setup_tc) { + struct tc_mqprio_qopt offload = { 0 }; + struct tc_to_netdev tc = { .type = TC_SETUP_MQPRIO, + { .mqprio = &offload } }; + dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); - else + } else { netdev_set_num_tc(dev, 0); + } } static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) @@ -59,15 +63,20 @@ static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt) return -EINVAL; } - /* net_device does not support requested operation */ - if (qopt->hw && !dev->netdev_ops->ndo_setup_tc) - return -EINVAL; + /* Limit qopt->hw to maximum supported offload value. Drivers have + * the option of overriding this later if they don't support a + * given offload type. + */ + if (qopt->hw > TC_MQPRIO_HW_OFFLOAD_MAX) + qopt->hw = TC_MQPRIO_HW_OFFLOAD_MAX; - /* if hw owned qcount and qoffset are taken from LLD so - * no reason to verify them here + /* If hardware offload is requested we will leave it to the device + * to either populate the queue counts itself or to validate the + * provided queue counts. If ndo_setup_tc is not present then + * hardware doesn't support offload and we should return an error. */ if (qopt->hw) - return dev->netdev_ops->ndo_setup_tc ?
0 : -EINVAL; for (i = 0; i < qopt->num_tc; i++) { unsigned int last = qopt->offset[i] + qopt->count[i]; @@ -139,13 +148,15 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt) * supplied and verified mapping */ if (qopt->hw) { - struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO, - { .tc = qopt->num_tc }}; + struct tc_mqprio_qopt offload = *qopt; + struct tc_to_netdev tc = { .type = TC_SETUP_MQPRIO, + { .mqprio = &offload } }; - priv->hw_owned = 1; err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc); if (err) return err; + + priv->hw_offload = offload.hw; } else { netdev_set_num_tc(dev, qopt->num_tc); for (i = 0; i < qopt->num_tc; i++) @@ -175,7 +186,7 @@ static void mqprio_attach(struct Qdisc *sch) if (old) qdisc_destroy(old); if (ntx < dev->real_num_tx_queues) - qdisc_hash_add(qdisc); + qdisc_hash_add(qdisc, false); } kfree(priv->qdiscs); priv->qdiscs = NULL; @@ -243,7 +254,7 @@ static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb) opt.num_tc = netdev_get_num_tc(dev); memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); - opt.hw = priv->hw_owned; + opt.hw = priv->hw_offload; for (i = 0; i < netdev_get_num_tc(dev); i++) { opt.count[i] = dev->tc_to_txq[i].count; diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index e7839a0d0eaa52572f675fdb1dfc590c2a70ac76..43a3a10b3c8118fc2e0deff98be2635d2ad81330 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -217,6 +217,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) sch_tree_lock(sch); old = q->queues[i]; q->queues[i] = child; + if (child != &noop_qdisc) + qdisc_hash_add(child, true); if (old != &noop_qdisc) { qdisc_tree_reduce_backlog(old, diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index c8bb62a1e7449344a0fd81241fe0102ea2f9c0f9..1b3dd6190e9386c6f8104153eb9fdc0255e03ac5 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -462,7 +462,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* If a delay is expected, orphan the skb. (orphaning usually takes * place at TX completion time, so _before_ the link transit delay) */ - if (q->latency || q->jitter) + if (q->latency || q->jitter || q->rate) skb_orphan_partial(skb); /* @@ -530,21 +530,31 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, now = psched_get_time(); if (q->rate) { - struct sk_buff *last; + struct netem_skb_cb *last = NULL; + + if (sch->q.tail) + last = netem_skb_cb(sch->q.tail); + if (q->t_root.rb_node) { + struct sk_buff *t_skb; + struct netem_skb_cb *t_last; + + t_skb = netem_rb_to_skb(rb_last(&q->t_root)); + t_last = netem_skb_cb(t_skb); + if (!last || + t_last->time_to_send > last->time_to_send) { + last = t_last; + } + } - if (sch->q.qlen) - last = sch->q.tail; - else - last = netem_rb_to_skb(rb_last(&q->t_root)); if (last) { /* * Last packet in queue is reference point (now), * calculate this time bonus and subtract * from delay. 
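* E.g. if the current tail packet is due to leave 2 ms from now and
 * this packet drew a 5 ms random delay, only the remaining 3 ms
 * (plus the packet's own transmission time) are scheduled after
 * the tail's departure.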
*/ - delay -= netem_skb_cb(last)->time_to_send - now; + delay -= last->time_to_send - now; delay = max_t(psched_tdiff_t, 0, delay); - now = netem_skb_cb(last)->time_to_send; + now = last->time_to_send; } delay += packet_len_2_sched_time(qdisc_pkt_len(skb), q); @@ -692,15 +702,11 @@ static int get_dist_table(struct Qdisc *sch, const struct nlattr *attr) spinlock_t *root_lock; struct disttable *d; int i; - size_t s; if (n > NETEM_DIST_MAX) return -EINVAL; - s = sizeof(struct disttable) + n * sizeof(s16); - d = kmalloc(s, GFP_KERNEL | __GFP_NOWARN); - if (!d) - d = vmalloc(s); + d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL); if (!d) return -ENOMEM; @@ -833,7 +839,7 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla, if (nested_len >= nla_attr_size(0)) return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len), - nested_len, policy); + nested_len, policy, NULL); memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1)); return 0; diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 5c3a99d6aa82ae0dcfd71ebceaae8de5d943e352..6c2791d6102dae67fe006b17723d6f50edc07885 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -190,7 +190,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt) if (!opt) return -EINVAL; - err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy); + err = nla_parse_nested(tb, TCA_PIE_MAX, opt, pie_policy, NULL); if (err < 0) return err; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index d4d7db267b6edfa56582ca4a588590e0ded9fe66..92c2e6d448d7984af35d6beb2cb3aea717b76511 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -192,8 +192,11 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) qdisc_destroy(child); } - for (i = oldbands; i < q->bands; i++) + for (i = oldbands; i < q->bands; i++) { q->queues[i] = queues[i]; + if (q->queues[i] != &noop_qdisc) + qdisc_hash_add(q->queues[i], true); + } sch_tree_unlock(sch); return 0; diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index f9e712ce2d15ce9280c31d2f75d62b84034ae51d..041eba3006ccdd84e1ea1419443f1d169550ceb5 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -418,7 +418,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, return -EINVAL; } - err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy); + err = nla_parse_nested(tb, TCA_QFQ_MAX, tca[TCA_OPTIONS], qfq_policy, + NULL); if (err < 0) return err; @@ -494,6 +495,8 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, goto destroy_class; } + if (cl->qdisc != &noop_qdisc) + qdisc_hash_add(cl->qdisc, true); sch_tree_lock(sch); qdisc_class_hash_insert(&q->clhash, &cl->common); sch_tree_unlock(sch); diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 249b2a18acbd99288eb0a2579a0f29c2ab0b3ded..11292adce4121022a86aff0aed537be3d9319490 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -173,7 +173,7 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) if (opt == NULL) return -EINVAL; - err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy); + err = nla_parse_nested(tb, TCA_RED_MAX, opt, red_policy, NULL); if (err < 0) return err; @@ -191,6 +191,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt) return PTR_ERR(child); } + if (child != &noop_qdisc) + qdisc_hash_add(child, true); sch_tree_lock(sch); q->flags = ctl->flags; q->limit = ctl->limit; diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 
fe6963d2151956c508b510edec680b89201173ce..0f777273ba293dcf2e62ae484dd0033bce942a2b 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -495,7 +495,7 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt) int err; if (opt) { - err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy); + err = nla_parse_nested(tb, TCA_SFB_MAX, opt, sfb_policy, NULL); if (err < 0) return -EINVAL; @@ -513,6 +513,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt) if (IS_ERR(child)) return PTR_ERR(child); + if (child != &noop_qdisc) + qdisc_hash_add(child, true); sch_tree_lock(sch); qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen, diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 42e8c8615e6563a2deabbb3c3437e3985d01ae14..332d94be6e1cb09386307b6c16b6954244a805c9 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -685,11 +685,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt) static void *sfq_alloc(size_t sz) { - void *ptr = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN); - - if (!ptr) - ptr = vmalloc(sz); - return ptr; + return kvmalloc(sz, GFP_KERNEL); } static void sfq_free(void *addr) @@ -714,9 +710,8 @@ static int sfq_init(struct Qdisc *sch, struct nlattr *opt) struct sfq_sched_data *q = qdisc_priv(sch); int i; - q->perturb_timer.function = sfq_perturbation; - q->perturb_timer.data = (unsigned long)sch; - init_timer_deferrable(&q->perturb_timer); + setup_deferrable_timer(&q->perturb_timer, sfq_perturbation, + (unsigned long)sch); for (i = 0; i < SFQ_MAX_DEPTH + 1; i++) { q->dep[i].next = i + SFQ_MAX_FLOWS; diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index 303355c449ab336227d9b115496e0882f2f2a079..b2e4b6ad241a8e538c654ef4e8417ab756740a45 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -315,7 +315,7 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) s64 buffer, mtu; u64 rate64 = 0, prate64 = 0; - err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy); + err = nla_parse_nested(tb, TCA_TBF_MAX, opt, tbf_policy, NULL); if (err < 0) return err; @@ -396,6 +396,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt) q->qdisc->qstats.backlog); qdisc_destroy(q->qdisc); q->qdisc = child; + if (child != &noop_qdisc) + qdisc_hash_add(child, true); } q->limit = qopt->limit; if (tb[TCA_TBF_PBURST]) diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index e3621cb4827fadb5f5cb41ebe8455dfa3300a765..697721a7a3f1761373aa66b847bd744ea1b42d10 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -306,14 +306,24 @@ int sctp_chunk_abandoned(struct sctp_chunk *chunk) if (SCTP_PR_TTL_ENABLED(chunk->sinfo.sinfo_flags) && time_after(jiffies, chunk->msg->expires_at)) { - if (chunk->sent_count) + struct sctp_stream_out *streamout = + &chunk->asoc->stream->out[chunk->sinfo.sinfo_stream]; + + if (chunk->sent_count) { chunk->asoc->abandoned_sent[SCTP_PR_INDEX(TTL)]++; - else + streamout->abandoned_sent[SCTP_PR_INDEX(TTL)]++; + } else { chunk->asoc->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; + streamout->abandoned_unsent[SCTP_PR_INDEX(TTL)]++; + } return 1; } else if (SCTP_PR_RTX_ENABLED(chunk->sinfo.sinfo_flags) && chunk->sent_count > chunk->sinfo.sinfo_timetolive) { + struct sctp_stream_out *streamout = + &chunk->asoc->stream->out[chunk->sinfo.sinfo_stream]; + chunk->asoc->abandoned_sent[SCTP_PR_INDEX(RTX)]++; + streamout->abandoned_sent[SCTP_PR_INDEX(RTX)]++; return 1; } else if (!SCTP_PR_POLICY(chunk->sinfo.sinfo_flags) && chunk->msg->expires_at && diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index 
8081476ed313cca8ad8f9876ec5ef5dc8fd68207..fe4c3d462f6ebc48d11d2a587f2adb7a39fde2e5 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -353,6 +353,8 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, struct sctp_chunk *chk, *temp; list_for_each_entry_safe(chk, temp, queue, transmitted_list) { + struct sctp_stream_out *streamout; + if (!SCTP_PR_PRIO_ENABLED(chk->sinfo.sinfo_flags) || chk->sinfo.sinfo_timetolive <= sinfo->sinfo_timetolive) continue; @@ -361,8 +363,10 @@ static int sctp_prsctp_prune_sent(struct sctp_association *asoc, sctp_insert_list(&asoc->outqueue.abandoned, &chk->transmitted_list); + streamout = &asoc->stream->out[chk->sinfo.sinfo_stream]; asoc->sent_cnt_removable--; asoc->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; + streamout->abandoned_sent[SCTP_PR_INDEX(PRIO)]++; if (!chk->tsn_gap_acked) { if (chk->transport) @@ -396,6 +400,12 @@ static int sctp_prsctp_prune_unsent(struct sctp_association *asoc, q->out_qlen -= chk->skb->len; asoc->sent_cnt_removable--; asoc->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; + if (chk->sinfo.sinfo_stream < asoc->stream->outcnt) { + struct sctp_stream_out *streamout = + &asoc->stream->out[chk->sinfo.sinfo_stream]; + + streamout->abandoned_unsent[SCTP_PR_INDEX(PRIO)]++; + } msg_len -= SCTP_DATA_SNDSIZE(chk) + sizeof(struct sk_buff) + diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 118faff6a332ee24caf3d772b6f00641128ef104..8a08f13469c4cc26aeec55de6793b2c2e3c562a6 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -1512,14 +1512,12 @@ int sctp_user_addto_chunk(struct sctp_chunk *chunk, int len, struct iov_iter *from) { void *target; - ssize_t copied; /* Make room in chunk for data. */ target = skb_put(chunk->skb, len); /* Copy data (whole iovec) into chunk */ - copied = copy_from_iter(target, len, from); - if (copied != len) + if (!copy_from_iter_full(target, len, from)) return -EFAULT; /* Adjust the chunk length field. */ diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 24c6ccce7539097728f3733ae5ecc037562cefcf..4f5e6cfc7f601b4de8db8802668d553f1af8491d 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -3872,9 +3872,18 @@ sctp_disposition_t sctp_sf_do_reconf(struct net *net, else if (param.p->type == SCTP_PARAM_RESET_IN_REQUEST) reply = sctp_process_strreset_inreq( (struct sctp_association *)asoc, param, &ev); - /* More handles for other types will be added here, by now it - * just ignores other types. 
- */ + else if (param.p->type == SCTP_PARAM_RESET_TSN_REQUEST) + reply = sctp_process_strreset_tsnreq( + (struct sctp_association *)asoc, param, &ev); + else if (param.p->type == SCTP_PARAM_RESET_ADD_OUT_STREAMS) + reply = sctp_process_strreset_addstrm_out( + (struct sctp_association *)asoc, param, &ev); + else if (param.p->type == SCTP_PARAM_RESET_ADD_IN_STREAMS) + reply = sctp_process_strreset_addstrm_in( + (struct sctp_association *)asoc, param, &ev); + else if (param.p->type == SCTP_PARAM_RESET_RESPONSE) + reply = sctp_process_strreset_resp( + (struct sctp_association *)asoc, param, &ev); if (ev) sctp_add_cmd_sf(commands, SCTP_CMD_EVENT_ULP, diff --git a/net/sctp/socket.c b/net/sctp/socket.c index d9d4c92e06b312e6c300afd8f3d5db33161fd9f7..f16c8d97b7f313e9f671d8bb8620b0f9566e619f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3758,6 +3758,39 @@ static int sctp_setsockopt_default_prinfo(struct sock *sk, return retval; } +static int sctp_setsockopt_reconfig_supported(struct sock *sk, + char __user *optval, + unsigned int optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EINVAL; + + if (optlen != sizeof(params)) + goto out; + + if (copy_from_user(&params, optval, optlen)) { + retval = -EFAULT; + goto out; + } + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + asoc->reconf_enable = !!params.assoc_value; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + sp->ep->reconf_enable = !!params.assoc_value; + } else { + goto out; + } + + retval = 0; + +out: + return retval; +} + static int sctp_setsockopt_enable_strreset(struct sock *sk, char __user *optval, unsigned int optlen) @@ -4038,6 +4071,9 @@ static int sctp_setsockopt(struct sock *sk, int level, int optname, case SCTP_DEFAULT_PRINFO: retval = sctp_setsockopt_default_prinfo(sk, optval, optlen); break; + case SCTP_RECONFIG_SUPPORTED: + retval = sctp_setsockopt_reconfig_supported(sk, optval, optlen); + break; case SCTP_ENABLE_STREAM_RESET: retval = sctp_setsockopt_enable_strreset(sk, optval, optlen); break; @@ -6540,6 +6576,102 @@ static int sctp_getsockopt_pr_assocstatus(struct sock *sk, int len, return retval; } +static int sctp_getsockopt_pr_streamstatus(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_stream_out *streamout; + struct sctp_association *asoc; + struct sctp_prstatus params; + int retval = -EINVAL; + int policy; + + if (len < sizeof(params)) + goto out; + + len = sizeof(params); + if (copy_from_user(&params, optval, len)) { + retval = -EFAULT; + goto out; + } + + policy = params.sprstat_policy; + if (policy & ~SCTP_PR_SCTP_MASK) + goto out; + + asoc = sctp_id2assoc(sk, params.sprstat_assoc_id); + if (!asoc || params.sprstat_sid >= asoc->stream->outcnt) + goto out; + + streamout = &asoc->stream->out[params.sprstat_sid]; + if (policy == SCTP_PR_SCTP_NONE) { + params.sprstat_abandoned_unsent = 0; + params.sprstat_abandoned_sent = 0; + for (policy = 0; policy <= SCTP_PR_INDEX(MAX); policy++) { + params.sprstat_abandoned_unsent += + streamout->abandoned_unsent[policy]; + params.sprstat_abandoned_sent += + streamout->abandoned_sent[policy]; + } + } else { + params.sprstat_abandoned_unsent = + streamout->abandoned_unsent[__SCTP_PR_INDEX(policy)]; + params.sprstat_abandoned_sent = + streamout->abandoned_sent[__SCTP_PR_INDEX(policy)]; + } + + if (put_user(len, optlen) || copy_to_user(optval, &params, len)) { + retval = -EFAULT; + goto out; + } + + retval = 0; + +out: + return retval; +} + +static int
sctp_getsockopt_reconfig_supported(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc; + int retval = -EFAULT; + + if (len < sizeof(params)) { + retval = -EINVAL; + goto out; + } + + len = sizeof(params); + if (copy_from_user(&params, optval, len)) + goto out; + + asoc = sctp_id2assoc(sk, params.assoc_id); + if (asoc) { + params.assoc_value = asoc->reconf_enable; + } else if (!params.assoc_id) { + struct sctp_sock *sp = sctp_sk(sk); + + params.assoc_value = sp->ep->reconf_enable; + } else { + retval = -EINVAL; + goto out; + } + + if (put_user(len, optlen)) + goto out; + + if (copy_to_user(optval, &params, len)) + goto out; + + retval = 0; + +out: + return retval; +} + static int sctp_getsockopt_enable_strreset(struct sock *sk, int len, char __user *optval, int __user *optlen) @@ -6748,6 +6880,14 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_pr_assocstatus(sk, len, optval, optlen); break; + case SCTP_PR_STREAM_STATUS: + retval = sctp_getsockopt_pr_streamstatus(sk, len, optval, + optlen); + break; + case SCTP_RECONFIG_SUPPORTED: + retval = sctp_getsockopt_reconfig_supported(sk, len, optval, + optlen); + break; case SCTP_ENABLE_STREAM_RESET: retval = sctp_getsockopt_enable_strreset(sk, len, optval, optlen); @@ -7440,9 +7580,12 @@ struct sk_buff *sctp_skb_recv_datagram(struct sock *sk, int flags, if (sk->sk_shutdown & RCV_SHUTDOWN) break; - if (sk_can_busy_loop(sk) && - sk_busy_loop(sk, noblock)) - continue; + if (sk_can_busy_loop(sk)) { + sk_busy_loop(sk, noblock); + + if (!skb_queue_empty(&sk->sk_receive_queue)) + continue; + } /* User doesn't want to wait. */ error = -EAGAIN; diff --git a/net/sctp/stream.c b/net/sctp/stream.c index bbed997e1c5f01d4401adcd4664be4b6d16a93fa..dda53a2939866d5bb7e3233a1ba0a90160ecf488 100644 --- a/net/sctp/stream.c +++ b/net/sctp/stream.c @@ -294,18 +294,6 @@ int sctp_send_add_streams(struct sctp_association *asoc, stream->out = streamout; } - if (in) { - struct sctp_stream_in *streamin; - - streamin = krealloc(stream->in, incnt * sizeof(*streamin), - GFP_KERNEL); - if (!streamin) - goto out; - - memset(streamin + stream->incnt, 0, in * sizeof(*streamin)); - stream->in = streamin; - } - chunk = sctp_make_strreset_addstrm(asoc, out, in); if (!chunk) goto out; @@ -330,13 +318,14 @@ int sctp_send_add_streams(struct sctp_association *asoc, } static sctp_paramhdr_t *sctp_chunk_lookup_strreset_param( - struct sctp_association *asoc, __u32 resp_seq) + struct sctp_association *asoc, __u32 resp_seq, + __be16 type) { struct sctp_chunk *chunk = asoc->strreset_chunk; struct sctp_reconf_chunk *hdr; union sctp_params param; - if (ntohl(resp_seq) != asoc->strreset_outseq || !chunk) + if (!chunk) return NULL; hdr = (struct sctp_reconf_chunk *)chunk->chunk_hdr; @@ -347,13 +336,21 @@ static sctp_paramhdr_t *sctp_chunk_lookup_strreset_param( */ struct sctp_strreset_tsnreq *req = param.v; - if (req->request_seq == resp_seq) + if ((!resp_seq || req->request_seq == resp_seq) && + (!type || type == req->param_hdr.type)) return param.v; } return NULL; } +static void sctp_update_strreset_result(struct sctp_association *asoc, + __u32 result) +{ + asoc->strreset_result[1] = asoc->strreset_result[0]; + asoc->strreset_result[0] = result; +} + struct sctp_chunk *sctp_process_strreset_outreq( struct sctp_association *asoc, union sctp_params param, @@ -370,15 +367,19 @@ struct sctp_chunk *sctp_process_strreset_outreq( if (ntohl(outreq->send_reset_at_tsn) >
sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map)) { result = SCTP_STRRESET_IN_PROGRESS; - goto out; + goto err; } - if (request_seq > asoc->strreset_inseq) { + if (TSN_lt(asoc->strreset_inseq, request_seq) || + TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; - goto out; - } else if (request_seq == asoc->strreset_inseq) { - asoc->strreset_inseq++; + goto err; + } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { + i = asoc->strreset_inseq - request_seq - 1; + result = asoc->strreset_result[i]; + goto err; } + asoc->strreset_inseq++; /* Check strreset_enable after inseq inc, as sender cannot tell * the peer doesn't enable strreset after receiving response with @@ -388,13 +389,9 @@ struct sctp_chunk *sctp_process_strreset_outreq( goto out; if (asoc->strreset_chunk) { - sctp_paramhdr_t *param_hdr; - struct sctp_transport *t; - - param_hdr = sctp_chunk_lookup_strreset_param( - asoc, outreq->response_seq); - if (!param_hdr || param_hdr->type != - SCTP_PARAM_RESET_IN_REQUEST) { + if (!sctp_chunk_lookup_strreset_param( + asoc, outreq->response_seq, + SCTP_PARAM_RESET_IN_REQUEST)) { /* same process with outstanding isn't 0 */ result = SCTP_STRRESET_ERR_IN_PROGRESS; goto out; @@ -404,6 +401,8 @@ struct sctp_chunk *sctp_process_strreset_outreq( asoc->strreset_outseq++; if (!asoc->strreset_outstanding) { + struct sctp_transport *t; + t = asoc->strreset_chunk->transport; if (del_timer(&t->reconf_timer)) sctp_transport_put(t); @@ -439,6 +438,8 @@ struct sctp_chunk *sctp_process_strreset_outreq( GFP_ATOMIC); out: + sctp_update_strreset_result(asoc, result); +err: return sctp_make_strreset_resp(asoc, result, request_seq); } @@ -455,12 +456,18 @@ struct sctp_chunk *sctp_process_strreset_inreq( __u32 request_seq; request_seq = ntohl(inreq->request_seq); - if (request_seq > asoc->strreset_inseq) { + if (TSN_lt(asoc->strreset_inseq, request_seq) || + TSN_lt(request_seq, asoc->strreset_inseq - 2)) { result = SCTP_STRRESET_ERR_BAD_SEQNO; - goto out; - } else if (request_seq == asoc->strreset_inseq) { - asoc->strreset_inseq++; + goto err; + } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { + i = asoc->strreset_inseq - request_seq - 1; + result = asoc->strreset_result[i]; + if (result == SCTP_STRRESET_PERFORMED) + return NULL; + goto err; } + asoc->strreset_inseq++; if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_STREAM_REQ)) goto out; @@ -495,12 +502,407 @@ struct sctp_chunk *sctp_process_strreset_inreq( asoc->strreset_outstanding = 1; sctp_chunk_hold(asoc->strreset_chunk); + result = SCTP_STRRESET_PERFORMED; + *evp = sctp_ulpevent_make_stream_reset_event(asoc, SCTP_STREAM_RESET_INCOMING_SSN, nums, str_p, GFP_ATOMIC); out: + sctp_update_strreset_result(asoc, result); +err: if (!chunk) chunk = sctp_make_strreset_resp(asoc, result, request_seq); return chunk; } + +struct sctp_chunk *sctp_process_strreset_tsnreq( + struct sctp_association *asoc, + union sctp_params param, + struct sctp_ulpevent **evp) +{ + __u32 init_tsn = 0, next_tsn = 0, max_tsn_seen; + struct sctp_strreset_tsnreq *tsnreq = param.v; + struct sctp_stream *stream = asoc->stream; + __u32 result = SCTP_STRRESET_DENIED; + __u32 request_seq; + __u16 i; + + request_seq = ntohl(tsnreq->request_seq); + if (TSN_lt(asoc->strreset_inseq, request_seq) || + TSN_lt(request_seq, asoc->strreset_inseq - 2)) { + result = SCTP_STRRESET_ERR_BAD_SEQNO; + goto err; + } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { + i = asoc->strreset_inseq - request_seq - 1; + result = asoc->strreset_result[i]; + if (result == 
SCTP_STRRESET_PERFORMED) { + next_tsn = asoc->next_tsn; + init_tsn = + sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + 1; + } + goto err; + } + asoc->strreset_inseq++; + + if (!(asoc->strreset_enable & SCTP_ENABLE_RESET_ASSOC_REQ)) + goto out; + + if (asoc->strreset_outstanding) { + result = SCTP_STRRESET_ERR_IN_PROGRESS; + goto out; + } + + /* G3: The same processing as though a SACK chunk with no gap report + * and a cumulative TSN ACK of the Sender's Next TSN minus 1 were + * received MUST be performed. + */ + max_tsn_seen = sctp_tsnmap_get_max_tsn_seen(&asoc->peer.tsn_map); + sctp_ulpq_reasm_flushtsn(&asoc->ulpq, max_tsn_seen); + sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + + /* G1: Compute an appropriate value for the Receiver's Next TSN -- the + * TSN that the peer should use to send the next DATA chunk. The + * value SHOULD be the smallest TSN not acknowledged by the + * receiver of the request plus 2^31. + */ + init_tsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map) + (1 << 31); + sctp_tsnmap_init(&asoc->peer.tsn_map, SCTP_TSN_MAP_INITIAL, + init_tsn, GFP_ATOMIC); + + /* G4: The same processing as though a FWD-TSN chunk (as defined in + * [RFC3758]) with all streams affected and a new cumulative TSN + * ACK of the Receiver's Next TSN minus 1 were received MUST be + * performed. + */ + sctp_outq_free(&asoc->outqueue); + + /* G2: Compute an appropriate value for the local endpoint's next TSN, + * i.e., the next TSN assigned by the receiver of the SSN/TSN reset + * chunk. The value SHOULD be the highest TSN sent by the receiver + * of the request plus 1. + */ + next_tsn = asoc->next_tsn; + asoc->ctsn_ack_point = next_tsn - 1; + asoc->adv_peer_ack_point = asoc->ctsn_ack_point; + + /* G5: The next expected and outgoing SSNs MUST be reset to 0 for all + * incoming and outgoing streams. 
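+ * (The G1-G5 labels in the comments above follow the receiver-side
+ * procedure steps of RFC 6525 for the SSN/TSN Reset Request parameter.)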
+ */ + for (i = 0; i < stream->outcnt; i++) + stream->out[i].ssn = 0; + for (i = 0; i < stream->incnt; i++) + stream->in[i].ssn = 0; + + result = SCTP_STRRESET_PERFORMED; + + *evp = sctp_ulpevent_make_assoc_reset_event(asoc, 0, init_tsn, + next_tsn, GFP_ATOMIC); + +out: + sctp_update_strreset_result(asoc, result); +err: + return sctp_make_strreset_tsnresp(asoc, result, request_seq, + next_tsn, init_tsn); +} + +struct sctp_chunk *sctp_process_strreset_addstrm_out( + struct sctp_association *asoc, + union sctp_params param, + struct sctp_ulpevent **evp) +{ + struct sctp_strreset_addstrm *addstrm = param.v; + struct sctp_stream *stream = asoc->stream; + __u32 result = SCTP_STRRESET_DENIED; + struct sctp_stream_in *streamin; + __u32 request_seq, incnt; + __u16 in, i; + + request_seq = ntohl(addstrm->request_seq); + if (TSN_lt(asoc->strreset_inseq, request_seq) || + TSN_lt(request_seq, asoc->strreset_inseq - 2)) { + result = SCTP_STRRESET_ERR_BAD_SEQNO; + goto err; + } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { + i = asoc->strreset_inseq - request_seq - 1; + result = asoc->strreset_result[i]; + goto err; + } + asoc->strreset_inseq++; + + if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) + goto out; + + if (asoc->strreset_chunk) { + if (!sctp_chunk_lookup_strreset_param( + asoc, 0, SCTP_PARAM_RESET_ADD_IN_STREAMS)) { + /* same process with outstanding isn't 0 */ + result = SCTP_STRRESET_ERR_IN_PROGRESS; + goto out; + } + + asoc->strreset_outstanding--; + asoc->strreset_outseq++; + + if (!asoc->strreset_outstanding) { + struct sctp_transport *t; + + t = asoc->strreset_chunk->transport; + if (del_timer(&t->reconf_timer)) + sctp_transport_put(t); + + sctp_chunk_put(asoc->strreset_chunk); + asoc->strreset_chunk = NULL; + } + } + + in = ntohs(addstrm->number_of_streams); + incnt = stream->incnt + in; + if (!in || incnt > SCTP_MAX_STREAM) + goto out; + + streamin = krealloc(stream->in, incnt * sizeof(*streamin), + GFP_ATOMIC); + if (!streamin) + goto out; + + memset(streamin + stream->incnt, 0, in * sizeof(*streamin)); + stream->in = streamin; + stream->incnt = incnt; + + result = SCTP_STRRESET_PERFORMED; + + *evp = sctp_ulpevent_make_stream_change_event(asoc, + 0, ntohs(addstrm->number_of_streams), 0, GFP_ATOMIC); + +out: + sctp_update_strreset_result(asoc, result); +err: + return sctp_make_strreset_resp(asoc, result, request_seq); +} + +struct sctp_chunk *sctp_process_strreset_addstrm_in( + struct sctp_association *asoc, + union sctp_params param, + struct sctp_ulpevent **evp) +{ + struct sctp_strreset_addstrm *addstrm = param.v; + struct sctp_stream *stream = asoc->stream; + __u32 result = SCTP_STRRESET_DENIED; + struct sctp_stream_out *streamout; + struct sctp_chunk *chunk = NULL; + __u32 request_seq, outcnt; + __u16 out, i; + + request_seq = ntohl(addstrm->request_seq); + if (TSN_lt(asoc->strreset_inseq, request_seq) || + TSN_lt(request_seq, asoc->strreset_inseq - 2)) { + result = SCTP_STRRESET_ERR_BAD_SEQNO; + goto err; + } else if (TSN_lt(request_seq, asoc->strreset_inseq)) { + i = asoc->strreset_inseq - request_seq - 1; + result = asoc->strreset_result[i]; + if (result == SCTP_STRRESET_PERFORMED) + return NULL; + goto err; + } + asoc->strreset_inseq++; + + if (!(asoc->strreset_enable & SCTP_ENABLE_CHANGE_ASSOC_REQ)) + goto out; + + if (asoc->strreset_outstanding) { + result = SCTP_STRRESET_ERR_IN_PROGRESS; + goto out; + } + + out = ntohs(addstrm->number_of_streams); + outcnt = stream->outcnt + out; + if (!out || outcnt > SCTP_MAX_STREAM) + goto out; + + streamout = 
krealloc(stream->out, outcnt * sizeof(*streamout), + GFP_ATOMIC); + if (!streamout) + goto out; + + memset(streamout + stream->outcnt, 0, out * sizeof(*streamout)); + stream->out = streamout; + + chunk = sctp_make_strreset_addstrm(asoc, out, 0); + if (!chunk) + goto out; + + asoc->strreset_chunk = chunk; + asoc->strreset_outstanding = 1; + sctp_chunk_hold(asoc->strreset_chunk); + + stream->outcnt = outcnt; + + result = SCTP_STRRESET_PERFORMED; + + *evp = sctp_ulpevent_make_stream_change_event(asoc, + 0, 0, ntohs(addstrm->number_of_streams), GFP_ATOMIC); + +out: + sctp_update_strreset_result(asoc, result); +err: + if (!chunk) + chunk = sctp_make_strreset_resp(asoc, result, request_seq); + + return chunk; +} + +struct sctp_chunk *sctp_process_strreset_resp( + struct sctp_association *asoc, + union sctp_params param, + struct sctp_ulpevent **evp) +{ + struct sctp_strreset_resp *resp = param.v; + struct sctp_stream *stream = asoc->stream; + struct sctp_transport *t; + __u16 i, nums, flags = 0; + sctp_paramhdr_t *req; + __u32 result; + + req = sctp_chunk_lookup_strreset_param(asoc, resp->response_seq, 0); + if (!req) + return NULL; + + result = ntohl(resp->result); + if (result != SCTP_STRRESET_PERFORMED) { + /* if in progress, do nothing but retransmit */ + if (result == SCTP_STRRESET_IN_PROGRESS) + return NULL; + else if (result == SCTP_STRRESET_DENIED) + flags = SCTP_STREAM_RESET_DENIED; + else + flags = SCTP_STREAM_RESET_FAILED; + } + + if (req->type == SCTP_PARAM_RESET_OUT_REQUEST) { + struct sctp_strreset_outreq *outreq; + __u16 *str_p; + + outreq = (struct sctp_strreset_outreq *)req; + str_p = outreq->list_of_streams; + nums = (ntohs(outreq->param_hdr.length) - sizeof(*outreq)) / 2; + + if (result == SCTP_STRRESET_PERFORMED) { + if (nums) { + for (i = 0; i < nums; i++) + stream->out[ntohs(str_p[i])].ssn = 0; + } else { + for (i = 0; i < stream->outcnt; i++) + stream->out[i].ssn = 0; + } + + flags = SCTP_STREAM_RESET_OUTGOING_SSN; + } + + for (i = 0; i < stream->outcnt; i++) + stream->out[i].state = SCTP_STREAM_OPEN; + + *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, + nums, str_p, GFP_ATOMIC); + } else if (req->type == SCTP_PARAM_RESET_IN_REQUEST) { + struct sctp_strreset_inreq *inreq; + __u16 *str_p; + + /* a 'performed' result is impossible for a response to an inreq */ + if (result == SCTP_STRRESET_PERFORMED) + return NULL; + + inreq = (struct sctp_strreset_inreq *)req; + str_p = inreq->list_of_streams; + nums = (ntohs(inreq->param_hdr.length) - sizeof(*inreq)) / 2; + + *evp = sctp_ulpevent_make_stream_reset_event(asoc, flags, + nums, str_p, GFP_ATOMIC); + } else if (req->type == SCTP_PARAM_RESET_TSN_REQUEST) { + struct sctp_strreset_resptsn *resptsn; + __u32 stsn, rtsn; + + /* check for resptsn, as sctp_verify_reconf didn't do it */ + if (ntohs(param.p->length) != sizeof(*resptsn)) + return NULL; + + resptsn = (struct sctp_strreset_resptsn *)resp; + stsn = ntohl(resptsn->senders_next_tsn); + rtsn = ntohl(resptsn->receivers_next_tsn); + + if (result == SCTP_STRRESET_PERFORMED) { + __u32 mtsn = sctp_tsnmap_get_max_tsn_seen( + &asoc->peer.tsn_map); + + sctp_ulpq_reasm_flushtsn(&asoc->ulpq, mtsn); + sctp_ulpq_abort_pd(&asoc->ulpq, GFP_ATOMIC); + + sctp_tsnmap_init(&asoc->peer.tsn_map, + SCTP_TSN_MAP_INITIAL, + stsn, GFP_ATOMIC); + + sctp_outq_free(&asoc->outqueue); + + asoc->next_tsn = rtsn; + asoc->ctsn_ack_point = asoc->next_tsn - 1; + asoc->adv_peer_ack_point = asoc->ctsn_ack_point; + + for (i = 0; i < stream->outcnt; i++) + stream->out[i].ssn = 0; + for (i = 0; i < stream->incnt; 
i++) + stream->in[i].ssn = 0; + } + + for (i = 0; i < stream->outcnt; i++) + stream->out[i].state = SCTP_STREAM_OPEN; + + *evp = sctp_ulpevent_make_assoc_reset_event(asoc, flags, + stsn, rtsn, GFP_ATOMIC); + } else if (req->type == SCTP_PARAM_RESET_ADD_OUT_STREAMS) { + struct sctp_strreset_addstrm *addstrm; + __u16 number; + + addstrm = (struct sctp_strreset_addstrm *)req; + nums = ntohs(addstrm->number_of_streams); + number = stream->outcnt - nums; + + if (result == SCTP_STRRESET_PERFORMED) + for (i = number; i < stream->outcnt; i++) + stream->out[i].state = SCTP_STREAM_OPEN; + else + stream->outcnt = number; + + *evp = sctp_ulpevent_make_stream_change_event(asoc, flags, + 0, nums, GFP_ATOMIC); + } else if (req->type == SCTP_PARAM_RESET_ADD_IN_STREAMS) { + struct sctp_strreset_addstrm *addstrm; + + /* a 'performed' result is impossible for an add-in-streams + * request. + */ + if (result == SCTP_STRRESET_PERFORMED) + return NULL; + + addstrm = (struct sctp_strreset_addstrm *)req; + nums = ntohs(addstrm->number_of_streams); + + *evp = sctp_ulpevent_make_stream_change_event(asoc, flags, + nums, 0, GFP_ATOMIC); + } + + asoc->strreset_outstanding--; + asoc->strreset_outseq++; + + /* remove everything for this reconf request */ + if (!asoc->strreset_outstanding) { + t = asoc->strreset_chunk->transport; + if (del_timer(&t->reconf_timer)) + sctp_transport_put(t); + + sctp_chunk_put(asoc->strreset_chunk); + asoc->strreset_chunk = NULL; + } + + return NULL; +} diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index daf8554fd42a5e537bb58572823b2028f74be930..0e732f68c2bfc3b791dade5a85c36628904ee490 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -274,6 +274,13 @@ static struct ctl_table sctp_net_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, + { + .procname = "reconf_enable", + .data = &init_net.sctp.reconf_enable, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "auth_enable", .data = &init_net.sctp.auth_enable, diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index c8881bc542a066e6f7f234beea3c7208394242c5..ec2b3e013c2f4ba48eabfc8d8ed20921cabc08a1 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -883,6 +883,62 @@ struct sctp_ulpevent *sctp_ulpevent_make_stream_reset_event( return event; } +struct sctp_ulpevent *sctp_ulpevent_make_assoc_reset_event( + const struct sctp_association *asoc, __u16 flags, __u32 local_tsn, + __u32 remote_tsn, gfp_t gfp) +{ + struct sctp_assoc_reset_event *areset; + struct sctp_ulpevent *event; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_assoc_reset_event), + MSG_NOTIFICATION, gfp); + if (!event) + return NULL; + + skb = sctp_event2skb(event); + areset = (struct sctp_assoc_reset_event *) + skb_put(skb, sizeof(struct sctp_assoc_reset_event)); + + areset->assocreset_type = SCTP_ASSOC_RESET_EVENT; + areset->assocreset_flags = flags; + areset->assocreset_length = sizeof(struct sctp_assoc_reset_event); + sctp_ulpevent_set_owner(event, asoc); + areset->assocreset_assoc_id = sctp_assoc2id(asoc); + areset->assocreset_local_tsn = local_tsn; + areset->assocreset_remote_tsn = remote_tsn; + + return event; +} + +struct sctp_ulpevent *sctp_ulpevent_make_stream_change_event( + const struct sctp_association *asoc, __u16 flags, + __u32 strchange_instrms, __u32 strchange_outstrms, gfp_t gfp) +{ + struct sctp_stream_change_event *schange; + struct sctp_ulpevent *event; + struct sk_buff *skb; + + event = sctp_ulpevent_new(sizeof(struct sctp_stream_change_event), + 
MSG_NOTIFICATION, gfp); + if (!event) + return NULL; + + skb = sctp_event2skb(event); + schange = (struct sctp_stream_change_event *) + skb_put(skb, sizeof(struct sctp_stream_change_event)); + + schange->strchange_type = SCTP_STREAM_CHANGE_EVENT; + schange->strchange_flags = flags; + schange->strchange_length = sizeof(struct sctp_stream_change_event); + sctp_ulpevent_set_owner(event, asoc); + schange->strchange_assoc_id = sctp_assoc2id(asoc); + schange->strchange_instrms = strchange_instrms; + schange->strchange_outstrms = strchange_outstrms; + + return event; +} + /* Return the notification type, assuming this is a notification * event. */ diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 093803786eacf3388dc470a04c02293f60e4102e..6793d7348cc811f41395c5ca3a9d1f1390813646 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -101,7 +101,7 @@ struct proto smc_proto = { .unhash = smc_unhash_sk, .obj_size = sizeof(struct smc_sock), .h.smc_hash = &smc_v4_hashinfo, - .slab_flags = SLAB_DESTROY_BY_RCU, + .slab_flags = SLAB_TYPESAFE_BY_RCU, }; EXPORT_SYMBOL_GPL(smc_proto); @@ -147,7 +147,6 @@ static int smc_release(struct socket *sock) schedule_delayed_work(&smc->sock_put_work, SMC_CLOSE_SOCK_PUT_DELAY); } - sk->sk_prot->unhash(sk); release_sock(sk); sock_put(sk); @@ -451,6 +450,9 @@ static int smc_connect_rdma(struct smc_sock *smc) goto decline_rdma_unlock; } + smc_close_init(smc); + smc_rx_init(smc); + if (local_contact == SMC_FIRST_CONTACT) { rc = smc_ib_ready_link(link); if (rc) { @@ -477,7 +479,6 @@ static int smc_connect_rdma(struct smc_sock *smc) mutex_unlock(&smc_create_lgr_pending); smc_tx_init(smc); - smc_rx_init(smc); out_connected: smc_copy_sock_settings_to_clc(smc); @@ -637,7 +638,8 @@ struct sock *smc_accept_dequeue(struct sock *parent, smc_accept_unlink(new_sk); if (new_sk->sk_state == SMC_CLOSED) { - /* tbd in follow-on patch: close this sock */ + new_sk->sk_prot->unhash(new_sk); + sock_put(new_sk); continue; } if (new_sock) @@ -657,8 +659,13 @@ void smc_close_non_accepted(struct sock *sk) if (!sk->sk_lingertime) /* wait for peer closing */ sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT; - if (!smc->use_fallback) + if (smc->use_fallback) { + sk->sk_state = SMC_CLOSED; + } else { smc_close_active(smc); + sock_set_flag(sk, SOCK_DEAD); + sk->sk_shutdown |= SHUTDOWN_MASK; + } if (smc->clcsock) { struct socket *tcp; @@ -666,11 +673,9 @@ void smc_close_non_accepted(struct sock *sk) smc->clcsock = NULL; sock_release(tcp); } - sock_set_flag(sk, SOCK_DEAD); - sk->sk_shutdown |= SHUTDOWN_MASK; if (smc->use_fallback) { schedule_delayed_work(&smc->sock_put_work, TCP_TIMEWAIT_LEN); - } else { + } else if (sk->sk_state == SMC_CLOSED) { smc_conn_free(&smc->conn); schedule_delayed_work(&smc->sock_put_work, SMC_CLOSE_SOCK_PUT_DELAY); @@ -800,6 +805,9 @@ static void smc_listen_work(struct work_struct *work) goto decline_rdma; } + smc_close_init(new_smc); + smc_rx_init(new_smc); + rc = smc_clc_send_accept(new_smc, local_contact); if (rc) goto out_err; @@ -839,7 +847,6 @@ static void smc_listen_work(struct work_struct *work) } smc_tx_init(new_smc); - smc_rx_init(new_smc); out_connected: sk_refcnt_debug_inc(newsmcsk); diff --git a/net/smc/smc.h b/net/smc/smc.h index ee5fbea24549d2df9f55bb0ad177d548c637bb0f..6e44313e4467d01fbdc77f07ee7d14c50ab92ce2 100644 --- a/net/smc/smc.h +++ b/net/smc/smc.h @@ -164,6 +164,7 @@ struct smc_connection { #ifndef KERNEL_HAS_ATOMIC64 spinlock_t acurs_lock; /* protect cursors */ #endif + struct work_struct close_work; /* peer sent some closing */ }; struct smc_sock 
{ /* smc sock container */ diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c index 5a339493872efd123826c31a2771485b3322f67a..a7294edbc22177d2c5b5a21deffaacea1e0855e2 100644 --- a/net/smc/smc_cdc.c +++ b/net/smc/smc_cdc.c @@ -217,8 +217,13 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, smc->sk.sk_err = ECONNRESET; conn->local_tx_ctrl.conn_state_flags.peer_conn_abort = 1; } - if (smc_cdc_rxed_any_close_or_senddone(conn)) - smc_close_passive_received(smc); + if (smc_cdc_rxed_any_close_or_senddone(conn)) { + smc->sk.sk_shutdown |= RCV_SHUTDOWN; + if (smc->clcsock && smc->clcsock->sk) + smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; + sock_set_flag(&smc->sk, SOCK_DONE); + schedule_work(&conn->close_work); + } /* piggy backed tx info */ /* trigger sndbuf consumer: RDMA write into peer RMBE and CDC */ @@ -228,8 +233,6 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, smc_close_wake_tx_prepared(smc); } - /* subsequent patch: trigger socket release if connection closed */ - /* socket connected but not accepted */ if (!smc->sk.sk_socket) return; diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c index 67a71d170bedb4be2658cfa5f7a654098da4962f..3c2e166b5d222f4c932179ae442cbd88969efc92 100644 --- a/net/smc/smc_close.c +++ b/net/smc/smc_close.c @@ -117,7 +117,6 @@ void smc_close_active_abort(struct smc_sock *smc) struct smc_cdc_conn_state_flags *txflags = &smc->conn.local_tx_ctrl.conn_state_flags; - bh_lock_sock(&smc->sk); smc->sk.sk_err = ECONNABORTED; if (smc->clcsock && smc->clcsock->sk) { smc->clcsock->sk->sk_err = ECONNABORTED; @@ -125,6 +124,7 @@ void smc_close_active_abort(struct smc_sock *smc) } switch (smc->sk.sk_state) { case SMC_INIT: + case SMC_ACTIVE: smc->sk.sk_state = SMC_PEERABORTWAIT; break; case SMC_APPCLOSEWAIT1: @@ -161,10 +161,15 @@ void smc_close_active_abort(struct smc_sock *smc) } sock_set_flag(&smc->sk, SOCK_DEAD); - bh_unlock_sock(&smc->sk); smc->sk.sk_state_change(&smc->sk); } +static inline bool smc_close_sent_any_close(struct smc_connection *conn) +{ + return conn->local_tx_ctrl.conn_state_flags.peer_conn_abort || + conn->local_tx_ctrl.conn_state_flags.peer_conn_closed; +} + int smc_close_active(struct smc_sock *smc) { struct smc_cdc_conn_state_flags *txflags = @@ -185,8 +190,7 @@ int smc_close_active(struct smc_sock *smc) case SMC_INIT: sk->sk_state = SMC_CLOSED; if (smc->smc_listen_work.func) - flush_work(&smc->smc_listen_work); - sock_put(sk); + cancel_work_sync(&smc->smc_listen_work); break; case SMC_LISTEN: sk->sk_state = SMC_CLOSED; @@ -198,7 +202,7 @@ int smc_close_active(struct smc_sock *smc) } release_sock(sk); smc_close_cleanup_listen(sk); - flush_work(&smc->tcp_listen_work); + cancel_work_sync(&smc->smc_listen_work); lock_sock(sk); break; case SMC_ACTIVE: @@ -218,7 +222,7 @@ int smc_close_active(struct smc_sock *smc) case SMC_APPFINCLOSEWAIT: /* socket already shutdown wr or both (active close) */ if (txflags->peer_done_writing && - !txflags->peer_conn_closed) { + !smc_close_sent_any_close(conn)) { /* just shutdown wr done, send close request */ rc = smc_close_final(conn); } @@ -248,6 +252,13 @@ int smc_close_active(struct smc_sock *smc) break; case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: + if (txflags->peer_done_writing && + !smc_close_sent_any_close(conn)) { + /* just shutdown wr done, send close request */ + rc = smc_close_final(conn); + } + /* peer sending PeerConnectionClosed will cause transition */ + break; case SMC_PEERFINCLOSEWAIT: /* peer sending PeerConnectionClosed will cause transition */ break; @@ -285,7 +296,7 @@ 
static void smc_close_passive_abort_received(struct smc_sock *smc) case SMC_PEERCLOSEWAIT1: case SMC_PEERCLOSEWAIT2: if (txflags->peer_done_writing && - !txflags->peer_conn_closed) { + !smc_close_sent_any_close(&smc->conn)) { /* just shutdown, but not yet closed locally */ smc_close_abort(&smc->conn); sk->sk_state = SMC_PROCESSABORT; @@ -306,22 +317,27 @@ static void smc_close_passive_abort_received(struct smc_sock *smc) /* Some kind of closing has been received: peer_conn_closed, peer_conn_abort, * or peer_done_writing. - * Called under tasklet context. */ -void smc_close_passive_received(struct smc_sock *smc) +static void smc_close_passive_work(struct work_struct *work) { - struct smc_cdc_conn_state_flags *rxflags = - &smc->conn.local_rx_ctrl.conn_state_flags; + struct smc_connection *conn = container_of(work, + struct smc_connection, + close_work); + struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + struct smc_cdc_conn_state_flags *rxflags; struct sock *sk = &smc->sk; int old_state; - sk->sk_shutdown |= RCV_SHUTDOWN; - if (smc->clcsock && smc->clcsock->sk) - smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; - sock_set_flag(&smc->sk, SOCK_DONE); - + lock_sock(&smc->sk); old_state = sk->sk_state; + if (!conn->alert_token_local) { + /* abnormal termination */ + smc_close_active_abort(smc); + goto wakeup; + } + + rxflags = &smc->conn.local_rx_ctrl.conn_state_flags; if (rxflags->peer_conn_abort) { smc_close_passive_abort_received(smc); goto wakeup; @@ -331,7 +347,7 @@ void smc_close_passive_received(struct smc_sock *smc) case SMC_INIT: if (atomic_read(&smc->conn.bytes_to_rcv) || (rxflags->peer_done_writing && - !rxflags->peer_conn_closed)) + !smc_cdc_rxed_any_close(conn))) sk->sk_state = SMC_APPCLOSEWAIT1; else sk->sk_state = SMC_CLOSED; @@ -348,7 +364,7 @@ void smc_close_passive_received(struct smc_sock *smc) if (!smc_cdc_rxed_any_close(&smc->conn)) break; if (sock_flag(sk, SOCK_DEAD) && - (sk->sk_shutdown == SHUTDOWN_MASK)) { + smc_close_sent_any_close(conn)) { /* smc_release has already been called locally */ sk->sk_state = SMC_CLOSED; } else { @@ -367,17 +383,19 @@ void smc_close_passive_received(struct smc_sock *smc) } wakeup: - if (old_state != sk->sk_state) - sk->sk_state_change(sk); sk->sk_data_ready(sk); /* wakeup blocked rcvbuf consumers */ sk->sk_write_space(sk); /* wakeup blocked sndbuf producers */ - if ((sk->sk_state == SMC_CLOSED) && - (sock_flag(sk, SOCK_DEAD) || (old_state == SMC_INIT))) { - smc_conn_free(&smc->conn); - schedule_delayed_work(&smc->sock_put_work, - SMC_CLOSE_SOCK_PUT_DELAY); + if (old_state != sk->sk_state) { + sk->sk_state_change(sk); + if ((sk->sk_state == SMC_CLOSED) && + (sock_flag(sk, SOCK_DEAD) || !sk->sk_socket)) { + smc_conn_free(&smc->conn); + schedule_delayed_work(&smc->sock_put_work, + SMC_CLOSE_SOCK_PUT_DELAY); + } } + release_sock(&smc->sk); } void smc_close_sock_put_work(struct work_struct *work) @@ -442,3 +460,9 @@ int smc_close_shutdown_write(struct smc_sock *smc) sk->sk_state_change(&smc->sk); return rc; } + +/* Initialize close properties on connection establishment. 
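+ * The work item runs smc_close_passive_work() in process context, where + * taking the socket lock is allowed; it is scheduled from the CDC tasklet + * and from smc_lgr_terminate().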
*/ +void smc_close_init(struct smc_sock *smc) +{ + INIT_WORK(&smc->conn.close_work, smc_close_passive_work); +} diff --git a/net/smc/smc_close.h b/net/smc/smc_close.h index bc9a2df3633cee576b15c0a9d715d1bdf6753957..4a3d99a8d7cbf18598ef74dae61e2be2a0a17b48 100644 --- a/net/smc/smc_close.h +++ b/net/smc/smc_close.h @@ -21,8 +21,8 @@ void smc_close_wake_tx_prepared(struct smc_sock *smc); void smc_close_active_abort(struct smc_sock *smc); int smc_close_active(struct smc_sock *smc); -void smc_close_passive_received(struct smc_sock *smc); void smc_close_sock_put_work(struct work_struct *work); int smc_close_shutdown_write(struct smc_sock *smc); +void smc_close_init(struct smc_sock *smc); #endif /* SMC_CLOSE_H */ diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 0eac633fb3549f6e715f25145b16e22b2bc49241..65020e93ff210bb7f5db079399984c55e54c80f1 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -316,7 +316,7 @@ void smc_lgr_terminate(struct smc_link_group *lgr) smc = container_of(conn, struct smc_sock, conn); sock_hold(&smc->sk); __smc_lgr_unregister_conn(conn); - smc_close_active_abort(smc); + schedule_work(&conn->close_work); sock_put(&smc->sk); node = rb_first(&lgr->conns_all); } diff --git a/net/smc/smc_ib.c b/net/smc/smc_ib.c index e6743c008ac548259447daa0e0c207aa7f249c63..cb69ab977cd73963fef1fe27b1407e58a6e7cadd 100644 --- a/net/smc/smc_ib.c +++ b/net/smc/smc_ib.c @@ -80,12 +80,11 @@ static int smc_ib_modify_qp_rtr(struct smc_link *lnk) memset(&qp_attr, 0, sizeof(qp_attr)); qp_attr.qp_state = IB_QPS_RTR; qp_attr.path_mtu = min(lnk->path_mtu, lnk->peer_mtu); - qp_attr.ah_attr.port_num = lnk->ibport; - qp_attr.ah_attr.ah_flags = IB_AH_GRH; - qp_attr.ah_attr.grh.hop_limit = 1; - memcpy(&qp_attr.ah_attr.grh.dgid, lnk->peer_gid, - sizeof(lnk->peer_gid)); - memcpy(&qp_attr.ah_attr.dmac, lnk->peer_mac, + qp_attr.ah_attr.type = RDMA_AH_ATTR_TYPE_ROCE; + rdma_ah_set_port_num(&qp_attr.ah_attr, lnk->ibport); + rdma_ah_set_grh(&qp_attr.ah_attr, NULL, 0, 0, 1, 0); + rdma_ah_set_dgid_raw(&qp_attr.ah_attr, lnk->peer_gid); + memcpy(&qp_attr.ah_attr.roce.dmac, lnk->peer_mac, sizeof(lnk->peer_mac)); qp_attr.dest_qp_num = lnk->peer_qpn; qp_attr.rq_psn = lnk->peer_psn; /* starting receive packet seq # */ @@ -179,8 +178,6 @@ static void smc_ib_global_event_handler(struct ib_event_handler *handler, u8 port_idx; smcibdev = container_of(handler, struct smc_ib_device, event_handler); - if (!smc_pnet_find_ib(smcibdev->ibdev->name)) - return; switch (ibevent->event) { case IB_EVENT_PORT_ERR: @@ -259,7 +256,6 @@ int smc_ib_create_queue_pair(struct smc_link *lnk) .max_recv_wr = SMC_WR_BUF_CNT * 3, .max_send_sge = SMC_IB_MAX_SEND_SGE, .max_recv_sge = 1, - .max_inline_data = SMC_WR_TX_SIZE, }, .sq_sig_type = IB_SIGNAL_REQ_WR, .qp_type = IB_QPT_RC, diff --git a/net/smc/smc_ib.h b/net/smc/smc_ib.h index a95f74bb556915f92d0fa1bdbd2e34eb9814c3b1..7e1f0e24d17790f526aa50d07ff5e5d6596b6f3c 100644 --- a/net/smc/smc_ib.h +++ b/net/smc/smc_ib.h @@ -11,6 +11,7 @@ #ifndef _SMC_IB_H #define _SMC_IB_H +#include <linux/if_ether.h> #include <linux/interrupt.h> #include <rdma/ib_verbs.h> diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 9d3e7fb8348d3efae50515446591642815697b45..78f7af28ae4f25d71469d54d44f98ef4e2df94eb 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -219,7 +219,7 @@ static bool smc_pnetid_valid(const char *pnet_name, char *pnetid) } /* Find an infiniband device by a given name. The device might not exist. 
*/ -struct smc_ib_device *smc_pnet_find_ib(char *ib_name) +static struct smc_ib_device *smc_pnet_find_ib(char *ib_name) { struct smc_ib_device *ibdev; @@ -523,8 +523,11 @@ void smc_pnet_find_roce_resource(struct sock *sk, read_lock(&smc_pnettable.lock); list_for_each_entry(pnetelem, &smc_pnettable.pnetlist, list) { if (dst->dev == pnetelem->ndev) { - *smcibdev = pnetelem->smcibdev; - *ibport = pnetelem->ib_port; + if (smc_ib_port_active(pnetelem->smcibdev, + pnetelem->ib_port)) { + *smcibdev = pnetelem->smcibdev; + *ibport = pnetelem->ib_port; + } break; } } diff --git a/net/smc/smc_pnet.h b/net/smc/smc_pnet.h index 32ab3df928ca183643fab96ffd8c09667a54fbde..c4f1bccd43589c0d2d2591925dfa3053b7fa4aef 100644 --- a/net/smc/smc_pnet.h +++ b/net/smc/smc_pnet.h @@ -16,7 +16,6 @@ struct smc_ib_device; int smc_pnet_init(void) __init; void smc_pnet_exit(void); int smc_pnet_remove_by_ibdev(struct smc_ib_device *ibdev); -struct smc_ib_device *smc_pnet_find_ib(char *ib_name); void smc_pnet_find_roce_resource(struct sock *sk, struct smc_ib_device **smcibdev, u8 *ibport); diff --git a/net/smc/smc_rx.c b/net/smc/smc_rx.c index c4ef9a4ec56971e685d419a4a89f51e8181709c1..f0c8b089f770a229b7c805fe5b2d59da40a5c1e3 100644 --- a/net/smc/smc_rx.c +++ b/net/smc/smc_rx.c @@ -36,11 +36,10 @@ static void smc_rx_data_ready(struct sock *sk) if (skwq_has_sleeper(wq)) wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND); + sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); if ((sk->sk_shutdown == SHUTDOWN_MASK) || (sk->sk_state == SMC_CLOSED)) sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_HUP); - else - sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); rcu_read_unlock(); } diff --git a/net/smc/smc_tx.c b/net/smc/smc_tx.c index 69a0013dd25cecbee0658168da577d15ed8913d8..21ec1832ab517d647ac8942ddbd5484492eb4c94 100644 --- a/net/smc/smc_tx.c +++ b/net/smc/smc_tx.c @@ -431,9 +431,13 @@ static void smc_tx_work(struct work_struct *work) struct smc_connection, tx_work); struct smc_sock *smc = container_of(conn, struct smc_sock, conn); + int rc; lock_sock(&smc->sk); - smc_tx_sndbuf_nonempty(conn); + rc = smc_tx_sndbuf_nonempty(conn); + if (!rc && conn->local_rx_ctrl.prod_flags.write_blocked && + !atomic_read(&conn->bytes_to_rcv)) + conn->local_rx_ctrl.prod_flags.write_blocked = 0; release_sock(&smc->sk); } diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index eadf157418dcd42c036685e5858bea6e5c559506..874ee9f9d79674f9576c28f9d1dd669e03e62422 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -447,7 +447,7 @@ static void smc_wr_init_sge(struct smc_link *lnk) lnk->wr_tx_ibs[i].num_sge = 1; lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; lnk->wr_tx_ibs[i].send_flags = - IB_SEND_SIGNALED | IB_SEND_SOLICITED | IB_SEND_INLINE; + IB_SEND_SIGNALED | IB_SEND_SOLICITED; } for (i = 0; i < lnk->wr_rx_cnt; i++) { lnk->wr_rx_sges[i].addr = diff --git a/net/socket.c b/net/socket.c index 985ef06792d6e54c69d296f3e15baf89be972f9c..c2564eb25c6b8faf5c99504ef1d3c90f9bb57c73 100644 --- a/net/socket.c +++ b/net/socket.c @@ -3356,3 +3356,49 @@ int kernel_sock_shutdown(struct socket *sock, enum sock_shutdown_cmd how) return sock->ops->shutdown(sock, how); } EXPORT_SYMBOL(kernel_sock_shutdown); + +/* This routine returns the IP overhead imposed by a socket, i.e. + * the length of the underlying IP header as determined by whether + * the socket is IPv4 or IPv6, plus the length of any IP options + * enabled on the socket. Assumes that the caller holds a lock on the socket. 
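+ * Only IP-level overhead is counted; the transport header (e.g. TCP or + * UDP) is not included in the returned length.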
+ */ +u32 kernel_sock_ip_overhead(struct sock *sk) +{ + struct inet_sock *inet; + struct ip_options_rcu *opt; + u32 overhead = 0; + bool owned_by_user; +#if IS_ENABLED(CONFIG_IPV6) + struct ipv6_pinfo *np; + struct ipv6_txoptions *optv6 = NULL; +#endif /* IS_ENABLED(CONFIG_IPV6) */ + + if (!sk) + return overhead; + + owned_by_user = sock_owned_by_user(sk); + switch (sk->sk_family) { + case AF_INET: + inet = inet_sk(sk); + overhead += sizeof(struct iphdr); + opt = rcu_dereference_protected(inet->inet_opt, + owned_by_user); + if (opt) + overhead += opt->opt.optlen; + return overhead; +#if IS_ENABLED(CONFIG_IPV6) + case AF_INET6: + np = inet6_sk(sk); + overhead += sizeof(struct ipv6hdr); + if (np) + optv6 = rcu_dereference_protected(np->opt, + owned_by_user); + if (optv6) + overhead += (optv6->opt_flen + optv6->opt_nflen); + return overhead; +#endif /* IS_ENABLED(CONFIG_IPV6) */ + default: /* Returns 0 overhead if the socket is not ipv4 or ipv6 */ + return overhead; + } +} +EXPORT_SYMBOL(kernel_sock_ip_overhead); diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 04ce2c0b660e0d381a22038bb463d46312befaf5..ac09ca8032965bfd4280fb3f6d3410c08cbda243 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -52,6 +52,7 @@ config SUNRPC_XPRT_RDMA tristate "RPC-over-RDMA transport" depends on SUNRPC && INFINIBAND && INFINIBAND_ADDR_TRANS default SUNRPC && INFINIBAND + select SG_POOL help This option allows the NFS client and server to use RDMA transports (InfiniBand, iWARP, or RoCE). diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 52da3ce54bb53c8434f0acc276c0003e385486e7..b5cb921775a0b20358f590b69b4cdfd0609a1acb 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1042,8 +1042,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) struct rpc_task *task; task = rpc_new_task(task_setup_data); - if (IS_ERR(task)) - goto out; rpc_task_set_client(task, task_setup_data->rpc_client); rpc_task_set_rpc_message(task, task_setup_data->rpc_message); @@ -1053,7 +1051,6 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data) atomic_inc(&task->tk_count); rpc_execute(task); -out: return task; } EXPORT_SYMBOL_GPL(rpc_run_task); @@ -1140,10 +1137,6 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) * Create an rpc_task to send the data */ task = rpc_new_task(&task_setup_data); - if (IS_ERR(task)) { - xprt_free_bc_request(req); - goto out; - } task->tk_rqstp = req; /* @@ -1158,7 +1151,6 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) WARN_ON_ONCE(atomic_read(&task->tk_count) != 2); rpc_execute(task); -out: dprintk("RPC: rpc_run_bc_task: task= %p\n", task); return task; } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 5db68b371db2cac54e37cc0a883259dbc61125cb..0cc83839c13c3621a70a4b9e8ded773217a2edf3 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -965,11 +965,6 @@ struct rpc_task *rpc_new_task(const struct rpc_task_setup *setup_data) if (task == NULL) { task = rpc_alloc_task(); - if (task == NULL) { - rpc_release_calldata(setup_data->callback_ops, - setup_data->callback_data); - return ERR_PTR(-ENOMEM); - } flags = RPC_TASK_DYNAMIC; } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index a08aeb56b8e457d56285558048e4a41ab00b03c9..bc0f5a0ecbdce2a5203bf707ebfba8c99821d0f7 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -702,59 +702,32 @@ choose_victim(struct svc_serv *serv, struct svc_pool *pool, unsigned int *state) return task; } -/* - * Create or destroy enough new threads to make the number - * of 
threads the given number. If `pool' is non-NULL, applies - * only to threads in that pool, otherwise round-robins between - * all pools. Caller must ensure that mutual exclusion between this and - * server startup or shutdown. - * - * Destroying threads relies on the service threads filling in - * rqstp->rq_task, which only the nfs ones do. Assumes the serv - * has been created using svc_create_pooled(). - * - * Based on code that used to be in nfsd_svc() but tweaked - * to be pool-aware. - */ -int -svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +/* create new threads */ +static int +svc_start_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) { struct svc_rqst *rqstp; struct task_struct *task; struct svc_pool *chosen_pool; - int error = 0; unsigned int state = serv->sv_nrthreads-1; int node; - if (pool == NULL) { - /* The -1 assumes caller has done a svc_get() */ - nrservs -= (serv->sv_nrthreads-1); - } else { - spin_lock_bh(&pool->sp_lock); - nrservs -= pool->sp_nrthreads; - spin_unlock_bh(&pool->sp_lock); - } - - /* create new threads */ - while (nrservs > 0) { + do { nrservs--; chosen_pool = choose_pool(serv, pool, &state); node = svc_pool_map_get_node(chosen_pool->sp_id); rqstp = svc_prepare_thread(serv, chosen_pool, node); - if (IS_ERR(rqstp)) { - error = PTR_ERR(rqstp); - break; - } + if (IS_ERR(rqstp)) + return PTR_ERR(rqstp); __module_get(serv->sv_ops->svo_module); task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp, node, "%s", serv->sv_name); if (IS_ERR(task)) { - error = PTR_ERR(task); module_put(serv->sv_ops->svo_module); svc_exit_thread(rqstp); - break; + return PTR_ERR(task); } rqstp->rq_task = task; @@ -763,18 +736,103 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) svc_sock_update_bufs(serv); wake_up_process(task); - } + } while (nrservs > 0); + + return 0; +} + + +/* destroy old threads */ +static int +svc_signal_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + struct task_struct *task; + unsigned int state = serv->sv_nrthreads-1; + /* destroy old threads */ - while (nrservs < 0 && - (task = choose_victim(serv, pool, &state)) != NULL) { + do { + task = choose_victim(serv, pool, &state); + if (task == NULL) + break; send_sig(SIGINT, task, 1); nrservs++; + } while (nrservs < 0); + + return 0; +} + +/* + * Create or destroy enough new threads to make the number + * of threads the given number. If `pool' is non-NULL, applies + * only to threads in that pool, otherwise round-robins between + * all pools. Caller must ensure mutual exclusion between this and + * server startup or shutdown. + * + * Destroying threads relies on the service threads filling in + * rqstp->rq_task, which only the nfs ones do. Assumes the serv + * has been created using svc_create_pooled(). + * + * Based on code that used to be in nfsd_svc() but tweaked + * to be pool-aware. 
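+ * + * This variant signals surplus threads with SIGINT and returns without + * waiting for them to exit; svc_set_num_threads_sync() below instead uses + * kthread_stop() and waits for each victim thread to terminate.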
+ */ +int +svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + if (pool == NULL) { + /* The -1 assumes caller has done a svc_get() */ + nrservs -= (serv->sv_nrthreads-1); + } else { + spin_lock_bh(&pool->sp_lock); + nrservs -= pool->sp_nrthreads; + spin_unlock_bh(&pool->sp_lock); } - return error; + if (nrservs > 0) + return svc_start_kthreads(serv, pool, nrservs); + if (nrservs < 0) + return svc_signal_kthreads(serv, pool, nrservs); + return 0; } EXPORT_SYMBOL_GPL(svc_set_num_threads); +/* destroy old threads */ +static int +svc_stop_kthreads(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + struct task_struct *task; + unsigned int state = serv->sv_nrthreads-1; + + /* destroy old threads */ + do { + task = choose_victim(serv, pool, &state); + if (task == NULL) + break; + kthread_stop(task); + nrservs++; + } while (nrservs < 0); + return 0; +} + +int +svc_set_num_threads_sync(struct svc_serv *serv, struct svc_pool *pool, int nrservs) +{ + if (pool == NULL) { + /* The -1 assumes caller has done a svc_get() */ + nrservs -= (serv->sv_nrthreads-1); + } else { + spin_lock_bh(&pool->sp_lock); + nrservs -= pool->sp_nrthreads; + spin_unlock_bh(&pool->sp_lock); + } + + if (nrservs > 0) + return svc_start_kthreads(serv, pool, nrservs); + if (nrservs < 0) + return svc_stop_kthreads(serv, pool, nrservs); + return 0; +} +EXPORT_SYMBOL_GPL(svc_set_num_threads_sync); + /* * Called from a server thread as it's exiting. Caller must hold the "service * mutex" for the service. diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 1f7082144e0168070a9e8ed703ca044f9700b8dc..e34f4ee7f2b6cecdf404daab0d3aab363d94ba01 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -807,7 +807,7 @@ void xdr_init_decode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p) EXPORT_SYMBOL_GPL(xdr_init_decode); /** - * xdr_init_decode - Initialize an xdr_stream for decoding data. 
+ * xdr_init_decode_pages - Initialize an xdr_stream for decoding into pages * @xdr: pointer to xdr_stream struct * @buf: pointer to XDR buffer from which to decode data * @pages: list of pages to decode into diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index b530a2852ba87ac042baec67bf64a3e57e1232b5..3e63c5e97ebe67875f98836eae254f7d3c3460ba 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -651,6 +651,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } +EXPORT_SYMBOL_GPL(xprt_force_disconnect); /** * xprt_conditional_disconnect - force a transport to disconnect diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index ef19fa42c50ff2e15ecdee7d87f1207ffc3c445c..c1ae8142ab734739c8631e537f0c1157e8e06a60 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -4,5 +4,5 @@ rpcrdma-y := transport.o rpc_rdma.o verbs.o \ fmr_ops.o frwr_ops.o \ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ - module.o + svc_rdma_rw.o module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index a044be2d6ad726eccfbd991038086aefc746df47..694e9b13ecf07722848d86345589ad4793474fb5 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -494,7 +494,7 @@ rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req, } sge->length = len; - ib_dma_sync_single_for_device(ia->ri_device, sge->addr, + ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length, DMA_TO_DEVICE); req->rl_send_wr.num_sge++; return true; @@ -523,7 +523,7 @@ rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req, sge[sge_no].addr = rdmab_addr(rb); sge[sge_no].length = xdr->head[0].iov_len; sge[sge_no].lkey = rdmab_lkey(rb); - ib_dma_sync_single_for_device(device, sge[sge_no].addr, + ib_dma_sync_single_for_device(rdmab_device(rb), sge[sge_no].addr, sge[sge_no].length, DMA_TO_DEVICE); /* If there is a Read chunk, the page list is being handled @@ -781,9 +781,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) return 0; out_err: - pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", - PTR_ERR(iptr)); - r_xprt->rx_stats.failed_marshal_count++; + if (PTR_ERR(iptr) != -ENOBUFS) { + pr_err("rpcrdma: rpcrdma_marshal_req failed, status %ld\n", + PTR_ERR(iptr)); + r_xprt->rx_stats.failed_marshal_count++; + } return PTR_ERR(iptr); } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index c846ca9f1ebaa661f1e4a93893752950d35a4b6a..a4a8f6989ee747a1a5046e1e5cff92a0ca167775 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -58,9 +58,9 @@ unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; static unsigned int min_max_requests = 4; static unsigned int max_max_requests = 16384; -unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; -static unsigned int min_max_inline = 4096; -static unsigned int max_max_inline = 65536; +unsigned int svcrdma_max_req_size = RPCRDMA_DEF_INLINE_THRESH; +static unsigned int min_max_inline = RPCRDMA_DEF_INLINE_THRESH; +static unsigned int max_max_inline = RPCRDMA_MAX_INLINE_THRESH; atomic_t rdma_stat_recv; atomic_t rdma_stat_read; @@ -247,8 +247,6 @@ int svc_rdma_init(void) dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); 
dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); dprintk("\tmax_requests : %u\n", svcrdma_max_requests); - dprintk("\tsq_depth : %u\n", - svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index ff1df40f0d261bc956f1af3410d8780f4c582b83..c676ed0efb5af2cceb6ed59e238a0f7a0109f916 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -12,7 +12,17 @@ #undef SVCRDMA_BACKCHANNEL_DEBUG -int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, +/** + * svc_rdma_handle_bc_reply - Process incoming backchannel reply + * @xprt: controlling backchannel transport + * @rdma_resp: pointer to incoming transport header + * @rcvbuf: XDR buffer into which to decode the reply + * + * Returns: + * %0 if @rcvbuf is filled in, xprt_complete_rqst called, + * %-EAGAIN if server should call ->recvfrom again. + */ +int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, struct xdr_buf *rcvbuf) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); @@ -27,13 +37,13 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, p = (__be32 *)src->iov_base; len = src->iov_len; - xid = rmsgp->rm_xid; + xid = *rdma_resp; #ifdef SVCRDMA_BACKCHANNEL_DEBUG pr_info("%s: xid=%08x, length=%zu\n", __func__, be32_to_cpu(xid), len); pr_info("%s: RPC/RDMA: %*ph\n", - __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp); + __func__, (int)RPCRDMA_HDRLEN_MIN, rdma_resp); pr_info("%s: RPC: %*ph\n", __func__, (int)len, p); #endif @@ -53,7 +63,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, goto out_unlock; memcpy(dst->iov_base, p, len); - credits = be32_to_cpu(rmsgp->rm_credit); + credits = be32_to_cpup(rdma_resp + 2); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > r_xprt->rx_buf.rb_bc_max_requests) @@ -90,9 +100,9 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, * Caller holds the connection's mutex and has already marshaled * the RPC/RDMA request. * - * This is similar to svc_rdma_reply, but takes an rpc_rqst - * instead, does not support chunks, and avoids blocking memory - * allocation. + * This is similar to svc_rdma_send_reply_msg, but takes a struct + * rpc_rqst instead, does not support chunks, and avoids blocking + * memory allocation. * * XXX: There is still an opportunity to block in svc_rdma_send() * if there are no SQ entries to post the Send. This may occur if @@ -101,59 +111,36 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) { - struct xdr_buf *sndbuf = &rqst->rq_snd_buf; struct svc_rdma_op_ctxt *ctxt; - struct svc_rdma_req_map *vec; - struct ib_send_wr send_wr; int ret; - vec = svc_rdma_get_req_map(rdma); - ret = svc_rdma_map_xdr(rdma, sndbuf, vec, false); - if (ret) + ctxt = svc_rdma_get_context(rdma); + + /* rpcrdma_bc_send_request builds the transport header and + * the backchannel RPC message in the same buffer. Thus only + * one SGE is needed to send both. 
+ */ + ret = svc_rdma_map_reply_hdr(rdma, ctxt, rqst->rq_buffer, + rqst->rq_snd_buf.len); + if (ret < 0) goto out_err; ret = svc_rdma_repost_recv(rdma, GFP_NOIO); if (ret) goto out_err; - ctxt = svc_rdma_get_context(rdma); - ctxt->pages[0] = virt_to_page(rqst->rq_buffer); - ctxt->count = 1; - - ctxt->direction = DMA_TO_DEVICE; - ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[0].length = sndbuf->len; - ctxt->sge[0].addr = - ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0, - sndbuf->len, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) { - ret = -EIO; - goto out_unmap; - } - svc_rdma_count_mappings(rdma, ctxt); - - memset(&send_wr, 0, sizeof(send_wr)); - ctxt->cqe.done = svc_rdma_wc_send; - send_wr.wr_cqe = &ctxt->cqe; - send_wr.sg_list = ctxt->sge; - send_wr.num_sge = 1; - send_wr.opcode = IB_WR_SEND; - send_wr.send_flags = IB_SEND_SIGNALED; - - ret = svc_rdma_send(rdma, &send_wr); - if (ret) { - ret = -EIO; + ret = svc_rdma_post_send_wr(rdma, ctxt, 1, 0); + if (ret) goto out_unmap; - } out_err: - svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: %s returns %d\n", __func__, ret); return ret; out_unmap: svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); + ret = -EIO; goto out_err; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c index 1c4aabf0f65772c13265421262feb030ab4a58ca..bdcf7d85a3dc098e63ca3ff77309496262b39b76 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c +++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c @@ -166,92 +166,3 @@ int svc_rdma_xdr_decode_req(struct xdr_buf *rq_arg) dprintk("svcrdma: failed to parse transport header\n"); return -EINVAL; } - -int svc_rdma_xdr_encode_error(struct svcxprt_rdma *xprt, - struct rpcrdma_msg *rmsgp, - enum rpcrdma_errcode err, __be32 *va) -{ - __be32 *startp = va; - - *va++ = rmsgp->rm_xid; - *va++ = rmsgp->rm_vers; - *va++ = xprt->sc_fc_credits; - *va++ = rdma_error; - *va++ = cpu_to_be32(err); - if (err == ERR_VERS) { - *va++ = rpcrdma_version; - *va++ = rpcrdma_version; - } - - return (int)((unsigned long)va - (unsigned long)startp); -} - -/** - * svc_rdma_xdr_get_reply_hdr_length - Get length of Reply transport header - * @rdma_resp: buffer containing Reply transport header - * - * Returns length of transport header, in bytes. - */ -unsigned int svc_rdma_xdr_get_reply_hdr_len(__be32 *rdma_resp) -{ - unsigned int nsegs; - __be32 *p; - - p = rdma_resp; - - /* RPC-over-RDMA V1 replies never have a Read list. */ - p += rpcrdma_fixed_maxsz + 1; - - /* Skip Write list. */ - while (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; - } - - /* Skip Reply chunk. 
*/ - if (*p++ != xdr_zero) { - nsegs = be32_to_cpup(p++); - p += nsegs * rpcrdma_segment_maxsz; - } - - return (unsigned long)p - (unsigned long)rdma_resp; -} - -void svc_rdma_xdr_encode_write_list(struct rpcrdma_msg *rmsgp, int chunks) -{ - struct rpcrdma_write_array *ary; - - /* no read-list */ - rmsgp->rm_body.rm_chunks[0] = xdr_zero; - - /* write-array discrim */ - ary = (struct rpcrdma_write_array *) - &rmsgp->rm_body.rm_chunks[1]; - ary->wc_discrim = xdr_one; - ary->wc_nchunks = cpu_to_be32(chunks); - - /* write-list terminator */ - ary->wc_array[chunks].wc_target.rs_handle = xdr_zero; - - /* reply-array discriminator */ - ary->wc_array[chunks].wc_target.rs_length = xdr_zero; -} - -void svc_rdma_xdr_encode_reply_array(struct rpcrdma_write_array *ary, - int chunks) -{ - ary->wc_discrim = xdr_one; - ary->wc_nchunks = cpu_to_be32(chunks); -} - -void svc_rdma_xdr_encode_array_chunk(struct rpcrdma_write_array *ary, - int chunk_no, - __be32 rs_handle, - __be64 rs_offset, - u32 write_len) -{ - struct rpcrdma_segment *seg = &ary->wc_array[chunk_no].wc_target; - seg->rs_handle = rs_handle; - seg->rs_offset = rs_offset; - seg->rs_length = cpu_to_be32(write_len); -} diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index f7b2daf72a86582807798379ac3be336b061a958..27a99bf5b1a6f669be74f86347c7a16e3ce213fe 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -558,33 +558,85 @@ static void rdma_read_complete(struct svc_rqst *rqstp, rqstp->rq_arg.buflen = head->arg.buflen; } +static void svc_rdma_send_error(struct svcxprt_rdma *xprt, + __be32 *rdma_argp, int status) +{ + struct svc_rdma_op_ctxt *ctxt; + __be32 *p, *err_msgp; + unsigned int length; + struct page *page; + int ret; + + ret = svc_rdma_repost_recv(xprt, GFP_KERNEL); + if (ret) + return; + + page = alloc_page(GFP_KERNEL); + if (!page) + return; + err_msgp = page_address(page); + + p = err_msgp; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); + *p++ = xprt->sc_fc_credits; + *p++ = rdma_error; + if (status == -EPROTONOSUPPORT) { + *p++ = err_vers; + *p++ = rpcrdma_version; + *p++ = rpcrdma_version; + } else { + *p++ = err_chunk; + } + length = (unsigned long)p - (unsigned long)err_msgp; + + /* Map transport header; no RPC message payload */ + ctxt = svc_rdma_get_context(xprt); + ret = svc_rdma_map_reply_hdr(xprt, ctxt, err_msgp, length); + if (ret) { + dprintk("svcrdma: Error %d mapping send for protocol error\n", + ret); + return; + } + + ret = svc_rdma_post_send_wr(xprt, ctxt, 1, 0); + if (ret) { + dprintk("svcrdma: Error %d posting send for protocol error\n", + ret); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + } +} + /* By convention, backchannel calls arrive via rdma_msg type * messages, and never populate the chunk lists. This makes * the RPC/RDMA header small and fixed in size, so it is * straightforward to check the RPC header's direction field. 
*/ -static bool -svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp) +static bool svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, + __be32 *rdma_resp) { - __be32 *p = (__be32 *)rmsgp; + __be32 *p; if (!xprt->xpt_bc_xprt) return false; - if (rmsgp->rm_type != rdma_msg) + p = rdma_resp + 3; + if (*p++ != rdma_msg) return false; - if (rmsgp->rm_body.rm_chunks[0] != xdr_zero) + + if (*p++ != xdr_zero) return false; - if (rmsgp->rm_body.rm_chunks[1] != xdr_zero) + if (*p++ != xdr_zero) return false; - if (rmsgp->rm_body.rm_chunks[2] != xdr_zero) + if (*p++ != xdr_zero) return false; - /* sanity */ - if (p[7] != rmsgp->rm_xid) + /* XID sanity */ + if (*p++ != *rdma_resp) return false; /* call direction */ - if (p[8] == cpu_to_be32(RPC_CALL)) + if (*p == cpu_to_be32(RPC_CALL)) return false; return true; @@ -650,8 +702,9 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto out_drop; rqstp->rq_xprt_hlen = ret; - if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) { - ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp, + if (svc_rdma_is_backchannel_reply(xprt, &rmsgp->rm_xid)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, + &rmsgp->rm_xid, &rqstp->rq_arg); svc_rdma_put_context(ctxt, 0); if (ret) @@ -686,7 +739,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) return ret; out_err: - svc_rdma_send_error(rdma_xprt, rmsgp, ret); + svc_rdma_send_error(rdma_xprt, &rmsgp->rm_xid, ret); svc_rdma_put_context(ctxt, 0); return 0; diff --git a/net/sunrpc/xprtrdma/svc_rdma_rw.c b/net/sunrpc/xprtrdma/svc_rdma_rw.c new file mode 100644 index 0000000000000000000000000000000000000000..0cf6202776933bdd227a2a332c77835efe2228bd --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_rw.c @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2016 Oracle. All rights reserved. + * + * Use the core R/W API to move RPC-over-RDMA Read and Write chunks. + */ + +#include <linux/sunrpc/rpc_rdma.h> +#include <linux/sunrpc/svc_rdma.h> +#include <linux/sunrpc/debug.h> + +#include <rdma/rw.h> + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +/* Each R/W context contains state for one chain of RDMA Read or + * Write Work Requests. + * + * Each WR chain handles a single contiguous server-side buffer, + * because scatterlist entries after the first have to start on + * page alignment. xdr_buf iovecs cannot guarantee alignment. + * + * Each WR chain handles only one R_key. Each RPC-over-RDMA segment + * from a client may contain a unique R_key, so each WR chain moves + * up to one segment at a time. + * + * The scatterlist makes this data structure over 4KB in size. To + * make it less likely to fail, and to handle the allocation for + * smaller I/O requests without disabling bottom-halves, these + * contexts are created on demand, but cached and reused until the + * controlling svcxprt_rdma is destroyed. 
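+ * (The embedded rw_first_sgl array holds SG_CHUNK_SIZE scatterlist + * entries, which is what pushes each context past 4KB; + * sg_alloc_table_chained() chains additional entries when needed.)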
+ */ +struct svc_rdma_rw_ctxt { + struct list_head rw_list; + struct rdma_rw_ctx rw_ctx; + int rw_nents; + struct sg_table rw_sg_table; + struct scatterlist rw_first_sgl[0]; +}; + +static inline struct svc_rdma_rw_ctxt * +svc_rdma_next_ctxt(struct list_head *list) +{ + return list_first_entry_or_null(list, struct svc_rdma_rw_ctxt, + rw_list); +} + +static struct svc_rdma_rw_ctxt * +svc_rdma_get_rw_ctxt(struct svcxprt_rdma *rdma, unsigned int sges) +{ + struct svc_rdma_rw_ctxt *ctxt; + + spin_lock(&rdma->sc_rw_ctxt_lock); + + ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts); + if (ctxt) { + list_del(&ctxt->rw_list); + spin_unlock(&rdma->sc_rw_ctxt_lock); + } else { + spin_unlock(&rdma->sc_rw_ctxt_lock); + ctxt = kmalloc(sizeof(*ctxt) + + SG_CHUNK_SIZE * sizeof(struct scatterlist), + GFP_KERNEL); + if (!ctxt) + goto out; + INIT_LIST_HEAD(&ctxt->rw_list); + } + + ctxt->rw_sg_table.sgl = ctxt->rw_first_sgl; + if (sg_alloc_table_chained(&ctxt->rw_sg_table, sges, + ctxt->rw_sg_table.sgl)) { + kfree(ctxt); + ctxt = NULL; + } +out: + return ctxt; +} + +static void svc_rdma_put_rw_ctxt(struct svcxprt_rdma *rdma, + struct svc_rdma_rw_ctxt *ctxt) +{ + sg_free_table_chained(&ctxt->rw_sg_table, true); + + spin_lock(&rdma->sc_rw_ctxt_lock); + list_add(&ctxt->rw_list, &rdma->sc_rw_ctxts); + spin_unlock(&rdma->sc_rw_ctxt_lock); +} + +/** + * svc_rdma_destroy_rw_ctxts - Free accumulated R/W contexts + * @rdma: transport about to be destroyed + * + */ +void svc_rdma_destroy_rw_ctxts(struct svcxprt_rdma *rdma) +{ + struct svc_rdma_rw_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_ctxt(&rdma->sc_rw_ctxts)) != NULL) { + list_del(&ctxt->rw_list); + kfree(ctxt); + } +} + +/* A chunk context tracks all I/O for moving one Read or Write + * chunk. This is a set of rdma_rw's that handle data movement + * for all segments of one chunk. + * + * These are small, acquired with a single allocator call, and + * no more than one is needed per chunk. They are allocated on + * demand, and not cached. + */ +struct svc_rdma_chunk_ctxt { + struct ib_cqe cc_cqe; + struct svcxprt_rdma *cc_rdma; + struct list_head cc_rwctxts; + int cc_sqecount; + enum dma_data_direction cc_dir; +}; + +static void svc_rdma_cc_init(struct svcxprt_rdma *rdma, + struct svc_rdma_chunk_ctxt *cc, + enum dma_data_direction dir) +{ + cc->cc_rdma = rdma; + svc_xprt_get(&rdma->sc_xprt); + + INIT_LIST_HEAD(&cc->cc_rwctxts); + cc->cc_sqecount = 0; + cc->cc_dir = dir; +} + +static void svc_rdma_cc_release(struct svc_rdma_chunk_ctxt *cc) +{ + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_rw_ctxt *ctxt; + + while ((ctxt = svc_rdma_next_ctxt(&cc->cc_rwctxts)) != NULL) { + list_del(&ctxt->rw_list); + + rdma_rw_ctx_destroy(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, ctxt->rw_sg_table.sgl, + ctxt->rw_nents, cc->cc_dir); + svc_rdma_put_rw_ctxt(rdma, ctxt); + } + svc_xprt_put(&rdma->sc_xprt); +} + +/* State for sending a Write or Reply chunk. 
+ * - Tracks progress of writing one chunk over all its segments + * - Stores arguments for the SGL constructor functions + */ +struct svc_rdma_write_info { + /* write state of this chunk */ + unsigned int wi_seg_off; + unsigned int wi_seg_no; + unsigned int wi_nsegs; + __be32 *wi_segs; + + /* SGL constructor arguments */ + struct xdr_buf *wi_xdr; + unsigned char *wi_base; + unsigned int wi_next_off; + + struct svc_rdma_chunk_ctxt wi_cc; +}; + +static struct svc_rdma_write_info * +svc_rdma_write_info_alloc(struct svcxprt_rdma *rdma, __be32 *chunk) +{ + struct svc_rdma_write_info *info; + + info = kmalloc(sizeof(*info), GFP_KERNEL); + if (!info) + return info; + + info->wi_seg_off = 0; + info->wi_seg_no = 0; + info->wi_nsegs = be32_to_cpup(++chunk); + info->wi_segs = ++chunk; + svc_rdma_cc_init(rdma, &info->wi_cc, DMA_TO_DEVICE); + return info; +} + +static void svc_rdma_write_info_free(struct svc_rdma_write_info *info) +{ + svc_rdma_cc_release(&info->wi_cc); + kfree(info); +} + +/** + * svc_rdma_write_done - Write chunk completion + * @cq: controlling Completion Queue + * @wc: Work Completion + * + * Pages under I/O are freed by a subsequent Send completion. + */ +static void svc_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_cqe *cqe = wc->wr_cqe; + struct svc_rdma_chunk_ctxt *cc = + container_of(cqe, struct svc_rdma_chunk_ctxt, cc_cqe); + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_write_info *info = + container_of(cc, struct svc_rdma_write_info, wi_cc); + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + if (wc->status != IB_WC_WR_FLUSH_ERR) + pr_err("svcrdma: write ctx: %s (%u/0x%x)\n", + ib_wc_status_msg(wc->status), + wc->status, wc->vendor_err); + } + + svc_rdma_write_info_free(info); +} + +/* This function sleeps when the transport's Send Queue is congested. + * + * Assumptions: + * - If ib_post_send() succeeds, only one completion is expected, + * even if one or more WRs are flushed. This is true when posting + * an rdma_rw_ctx or when posting a single signaled WR. + */ +static int svc_rdma_post_chunk_ctxt(struct svc_rdma_chunk_ctxt *cc) +{ + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_xprt *xprt = &rdma->sc_xprt; + struct ib_send_wr *first_wr, *bad_wr; + struct list_head *tmp; + struct ib_cqe *cqe; + int ret; + + first_wr = NULL; + cqe = &cc->cc_cqe; + list_for_each(tmp, &cc->cc_rwctxts) { + struct svc_rdma_rw_ctxt *ctxt; + + ctxt = list_entry(tmp, struct svc_rdma_rw_ctxt, rw_list); + first_wr = rdma_rw_ctx_wrs(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, cqe, first_wr); + cqe = NULL; + } + + do { + if (atomic_sub_return(cc->cc_sqecount, + &rdma->sc_sq_avail) > 0) { + ret = ib_post_send(rdma->sc_qp, first_wr, &bad_wr); + if (ret) + break; + return 0; + } + + atomic_inc(&rdma_stat_sq_starve); + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wait_event(rdma->sc_send_wait, + atomic_read(&rdma->sc_sq_avail) > cc->cc_sqecount); + } while (1); + + pr_err("svcrdma: ib_post_send failed (%d)\n", ret); + set_bit(XPT_CLOSE, &xprt->xpt_flags); + + /* If even one was posted, there will be a completion. 
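+ * The chunk context is then owned by svc_rdma_write_done() and must not + * be released here.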
*/ + if (bad_wr != first_wr) + return 0; + + atomic_add(cc->cc_sqecount, &rdma->sc_sq_avail); + wake_up(&rdma->sc_send_wait); + return -ENOTCONN; +} + +/* Build and DMA-map an SGL that covers one kvec in an xdr_buf + */ +static void svc_rdma_vec_to_sg(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt) +{ + struct scatterlist *sg = ctxt->rw_sg_table.sgl; + + sg_set_buf(&sg[0], info->wi_base, len); + info->wi_base += len; + + ctxt->rw_nents = 1; +} + +/* Build and DMA-map an SGL that covers part of an xdr_buf's pagelist. + */ +static void svc_rdma_pagelist_to_sg(struct svc_rdma_write_info *info, + unsigned int remaining, + struct svc_rdma_rw_ctxt *ctxt) +{ + unsigned int sge_no, sge_bytes, page_off, page_no; + struct xdr_buf *xdr = info->wi_xdr; + struct scatterlist *sg; + struct page **page; + + page_off = (info->wi_next_off + xdr->page_base) & ~PAGE_MASK; + page_no = (info->wi_next_off + xdr->page_base) >> PAGE_SHIFT; + page = xdr->pages + page_no; + info->wi_next_off += remaining; + sg = ctxt->rw_sg_table.sgl; + sge_no = 0; + do { + sge_bytes = min_t(unsigned int, remaining, + PAGE_SIZE - page_off); + sg_set_page(sg, *page, sge_bytes, page_off); + + remaining -= sge_bytes; + sg = sg_next(sg); + page_off = 0; + sge_no++; + page++; + } while (remaining); + + ctxt->rw_nents = sge_no; +} + +/* Construct RDMA Write WRs to send a portion of an xdr_buf containing + * an RPC Reply. + */ +static int +svc_rdma_build_writes(struct svc_rdma_write_info *info, + void (*constructor)(struct svc_rdma_write_info *info, + unsigned int len, + struct svc_rdma_rw_ctxt *ctxt), + unsigned int remaining) +{ + struct svc_rdma_chunk_ctxt *cc = &info->wi_cc; + struct svcxprt_rdma *rdma = cc->cc_rdma; + struct svc_rdma_rw_ctxt *ctxt; + __be32 *seg; + int ret; + + cc->cc_cqe.done = svc_rdma_write_done; + seg = info->wi_segs + info->wi_seg_no * rpcrdma_segment_maxsz; + do { + unsigned int write_len; + u32 seg_length, seg_handle; + u64 seg_offset; + + if (info->wi_seg_no >= info->wi_nsegs) + goto out_overflow; + + seg_handle = be32_to_cpup(seg); + seg_length = be32_to_cpup(seg + 1); + xdr_decode_hyper(seg + 2, &seg_offset); + seg_offset += info->wi_seg_off; + + write_len = min(remaining, seg_length - info->wi_seg_off); + ctxt = svc_rdma_get_rw_ctxt(rdma, + (write_len >> PAGE_SHIFT) + 2); + if (!ctxt) + goto out_noctx; + + constructor(info, write_len, ctxt); + ret = rdma_rw_ctx_init(&ctxt->rw_ctx, rdma->sc_qp, + rdma->sc_port_num, ctxt->rw_sg_table.sgl, + ctxt->rw_nents, 0, seg_offset, + seg_handle, DMA_TO_DEVICE); + if (ret < 0) + goto out_initerr; + + list_add(&ctxt->rw_list, &cc->cc_rwctxts); + cc->cc_sqecount += ret; + if (write_len == seg_length - info->wi_seg_off) { + seg += 4; + info->wi_seg_no++; + info->wi_seg_off = 0; + } else { + info->wi_seg_off += write_len; + } + remaining -= write_len; + } while (remaining); + + return 0; + +out_overflow: + dprintk("svcrdma: inadequate space in Write chunk (%u)\n", + info->wi_nsegs); + return -E2BIG; + +out_noctx: + dprintk("svcrdma: no R/W ctxs available\n"); + return -ENOMEM; + +out_initerr: + svc_rdma_put_rw_ctxt(rdma, ctxt); + pr_err("svcrdma: failed to map pagelist (%d)\n", ret); + return -EIO; +} + +/* Send one of an xdr_buf's kvecs by itself. To send a Reply + * chunk, the whole RPC Reply is written back to the client. + * This function writes either the head or tail of the xdr_buf + * containing the Reply. 
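+ * svc_rdma_send_reply_chunk() below sends the head first, then the page + * list (unless the client provided a Write list), then the tail.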
+ */
+static int svc_rdma_send_xdr_kvec(struct svc_rdma_write_info *info,
+				  struct kvec *vec)
+{
+	info->wi_base = vec->iov_base;
+	return svc_rdma_build_writes(info, svc_rdma_vec_to_sg,
+				     vec->iov_len);
+}
+
+/* Send an xdr_buf's page list by itself. A Write chunk is
+ * just the page list. A Reply chunk is the head, page list,
+ * and tail. This function is shared between the two types
+ * of chunk.
+ */
+static int svc_rdma_send_xdr_pagelist(struct svc_rdma_write_info *info,
+				      struct xdr_buf *xdr)
+{
+	info->wi_xdr = xdr;
+	info->wi_next_off = 0;
+	return svc_rdma_build_writes(info, svc_rdma_pagelist_to_sg,
+				     xdr->page_len);
+}
+
+/**
+ * svc_rdma_send_write_chunk - Write all segments in a Write chunk
+ * @rdma: controlling RDMA transport
+ * @wr_ch: Write chunk provided by client
+ * @xdr: xdr_buf containing the data payload
+ *
+ * Returns a non-negative number of bytes the chunk consumed, or
+ *	%-E2BIG if the payload was larger than the Write chunk,
+ *	%-ENOMEM if rdma_rw context pool was exhausted,
+ *	%-ENOTCONN if posting failed (connection is lost),
+ *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ */
+int svc_rdma_send_write_chunk(struct svcxprt_rdma *rdma, __be32 *wr_ch,
+			      struct xdr_buf *xdr)
+{
+	struct svc_rdma_write_info *info;
+	int ret;
+
+	if (!xdr->page_len)
+		return 0;
+
+	info = svc_rdma_write_info_alloc(rdma, wr_ch);
+	if (!info)
+		return -ENOMEM;
+
+	ret = svc_rdma_send_xdr_pagelist(info, xdr);
+	if (ret < 0)
+		goto out_err;
+
+	ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+	if (ret < 0)
+		goto out_err;
+	return xdr->page_len;
+
+out_err:
+	svc_rdma_write_info_free(info);
+	return ret;
+}
+
+/**
+ * svc_rdma_send_reply_chunk - Write all segments in the Reply chunk
+ * @rdma: controlling RDMA transport
+ * @rp_ch: Reply chunk provided by client
+ * @writelist: true if client provided a Write list
+ * @xdr: xdr_buf containing an RPC Reply
+ *
+ * Returns a non-negative number of bytes the chunk consumed, or
+ *	%-E2BIG if the payload was larger than the Reply chunk,
+ *	%-ENOMEM if rdma_rw context pool was exhausted,
+ *	%-ENOTCONN if posting failed (connection is lost),
+ *	%-EIO if rdma_rw initialization failed (DMA mapping, etc).
+ */
+int svc_rdma_send_reply_chunk(struct svcxprt_rdma *rdma, __be32 *rp_ch,
+			      bool writelist, struct xdr_buf *xdr)
+{
+	struct svc_rdma_write_info *info;
+	int consumed, ret;
+
+	info = svc_rdma_write_info_alloc(rdma, rp_ch);
+	if (!info)
+		return -ENOMEM;
+
+	ret = svc_rdma_send_xdr_kvec(info, &xdr->head[0]);
+	if (ret < 0)
+		goto out_err;
+	consumed = xdr->head[0].iov_len;
+
+	/* Send the page list in the Reply chunk only if the
+	 * client did not provide Write chunks.
+	 */
+	if (!writelist && xdr->page_len) {
+		ret = svc_rdma_send_xdr_pagelist(info, xdr);
+		if (ret < 0)
+			goto out_err;
+		consumed += xdr->page_len;
+	}
+
+	if (xdr->tail[0].iov_len) {
+		ret = svc_rdma_send_xdr_kvec(info, &xdr->tail[0]);
+		if (ret < 0)
+			goto out_err;
+		consumed += xdr->tail[0].iov_len;
+	}
+
+	ret = svc_rdma_post_chunk_ctxt(&info->wi_cc);
+	if (ret < 0)
+		goto out_err;
+	return consumed;
+
+out_err:
+	svc_rdma_write_info_free(info);
+	return ret;
+}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 515221b16d0956ea027e91985c89606c403d5109..1736337f3a557a68399f5d1b103822cc08d19562 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -1,4 +1,5 @@
 /*
+ * Copyright (c) 2016 Oracle. All rights reserved.
 * Copyright (c) 2014 Open Grid Computing, Inc.
All rights reserved.
 * Copyright (c) 2005-2006 Network Appliance, Inc. All rights reserved.
 *
@@ -40,6 +41,63 @@
 * Author: Tom Tucker
 */
 
+/* Operation
+ *
+ * The main entry point is svc_rdma_sendto. This is called by the
+ * RPC server when an RPC Reply is ready to be transmitted to a client.
+ *
+ * The passed-in svc_rqst contains a struct xdr_buf which holds an
+ * XDR-encoded RPC Reply message. sendto must construct the RPC-over-RDMA
+ * transport header, post all Write WRs needed for this Reply, then post
+ * a Send WR conveying the transport header and the RPC message itself to
+ * the client.
+ *
+ * svc_rdma_sendto must fully transmit the Reply before returning, as
+ * the svc_rqst will be recycled as soon as sendto returns. Remaining
+ * resources referred to by the svc_rqst are also recycled at that time.
+ * Therefore any resources that must remain longer must be detached
+ * from the svc_rqst and released later.
+ *
+ * Page Management
+ *
+ * The I/O that performs Reply transmission is asynchronous, and may
+ * complete well after sendto returns. Thus pages under I/O must be
+ * removed from the svc_rqst before sendto returns.
+ *
+ * The logic here depends on Send Queue and completion ordering. Since
+ * the Send WR is always posted last, it will always complete last. Thus
+ * when it completes, it is guaranteed that all previous Write WRs have
+ * also completed.
+ *
+ * Write WRs are constructed and posted. Each Write segment gets its own
+ * svc_rdma_rw_ctxt, allowing the Write completion handler to find and
+ * DMA-unmap the pages under I/O for that Write segment. The Write
+ * completion handler does not release any pages.
+ *
+ * When the Send WR is constructed, it also gets its own svc_rdma_op_ctxt.
+ * The ownership of all of the Reply's pages is transferred into that
+ * ctxt, the Send WR is posted, and sendto returns.
+ *
+ * The svc_rdma_op_ctxt is presented when the Send WR completes. The
+ * Send completion handler finally releases the Reply's pages.
+ *
+ * This mechanism also assumes that completions on the transport's Send
+ * Completion Queue do not run in parallel. Otherwise a Write completion
+ * and Send completion running at the same time could release pages that
+ * are still DMA-mapped.
+ *
+ * Error Handling
+ *
+ * - If the Send WR is posted successfully, it will either complete
+ *   successfully, or get flushed. Either way, the Send completion
+ *   handler releases the Reply's pages.
+ * - If the Send WR cannot be posted, the forward path releases
+ *   the Reply's pages.
+ *
+ * This handles the case, without the use of page reference counting,
+ * where two different Write segments send portions of the same page.
+ */
+
 #include
 #include
 #include
@@ -55,113 +113,141 @@ static u32 xdr_padsize(u32 len)
 	return (len & 3) ? (4 - (len & 3)) : 0;
 }
 
-int svc_rdma_map_xdr(struct svcxprt_rdma *xprt,
-		     struct xdr_buf *xdr,
-		     struct svc_rdma_req_map *vec,
-		     bool write_chunk_present)
+/* Returns length of transport header, in bytes.
+ */
+static unsigned int svc_rdma_reply_hdr_len(__be32 *rdma_resp)
 {
-	int sge_no;
-	u32 sge_bytes;
-	u32 page_bytes;
-	u32 page_off;
-	int page_no;
-
-	if (xdr->len !=
-	    (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
-		pr_err("svcrdma: %s: XDR buffer length error\n", __func__);
-		return -EIO;
-	}
+	unsigned int nsegs;
+	__be32 *p;
 
-	/* Skip the first sge, this is for the RPCRDMA header */
-	sge_no = 1;
+	p = rdma_resp;
+
+	/* RPC-over-RDMA V1 replies never have a Read list.
*/ + p += rpcrdma_fixed_maxsz + 1; - /* Head SGE */ - vec->sge[sge_no].iov_base = xdr->head[0].iov_base; - vec->sge[sge_no].iov_len = xdr->head[0].iov_len; - sge_no++; - - /* pages SGE */ - page_no = 0; - page_bytes = xdr->page_len; - page_off = xdr->page_base; - while (page_bytes) { - vec->sge[sge_no].iov_base = - page_address(xdr->pages[page_no]) + page_off; - sge_bytes = min_t(u32, page_bytes, (PAGE_SIZE - page_off)); - page_bytes -= sge_bytes; - vec->sge[sge_no].iov_len = sge_bytes; - - sge_no++; - page_no++; - page_off = 0; /* reset for next time through loop */ + /* Skip Write list. */ + while (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; } - /* Tail SGE */ - if (xdr->tail[0].iov_len) { - unsigned char *base = xdr->tail[0].iov_base; - size_t len = xdr->tail[0].iov_len; - u32 xdr_pad = xdr_padsize(xdr->page_len); + /* Skip Reply chunk. */ + if (*p++ != xdr_zero) { + nsegs = be32_to_cpup(p++); + p += nsegs * rpcrdma_segment_maxsz; + } - if (write_chunk_present && xdr_pad) { - base += xdr_pad; - len -= xdr_pad; - } + return (unsigned long)p - (unsigned long)rdma_resp; +} - if (len) { - vec->sge[sge_no].iov_base = base; - vec->sge[sge_no].iov_len = len; - sge_no++; +/* One Write chunk is copied from Call transport header to Reply + * transport header. Each segment's length field is updated to + * reflect number of bytes consumed in the segment. + * + * Returns number of segments in this chunk. + */ +static unsigned int xdr_encode_write_chunk(__be32 *dst, __be32 *src, + unsigned int remaining) +{ + unsigned int i, nsegs; + u32 seg_len; + + /* Write list discriminator */ + *dst++ = *src++; + + /* number of segments in this chunk */ + nsegs = be32_to_cpup(src); + *dst++ = *src++; + + for (i = nsegs; i; i--) { + /* segment's RDMA handle */ + *dst++ = *src++; + + /* bytes returned in this segment */ + seg_len = be32_to_cpu(*src); + if (remaining >= seg_len) { + /* entire segment was consumed */ + *dst = *src; + remaining -= seg_len; + } else { + /* segment only partly filled */ + *dst = cpu_to_be32(remaining); + remaining = 0; } - } + dst++; src++; - dprintk("svcrdma: %s: sge_no %d page_no %d " - "page_base %u page_len %u head_len %zu tail_len %zu\n", - __func__, sge_no, page_no, xdr->page_base, xdr->page_len, - xdr->head[0].iov_len, xdr->tail[0].iov_len); + /* segment's RDMA offset */ + *dst++ = *src++; + *dst++ = *src++; + } - vec->count = sge_no; - return 0; + return nsegs; } -static dma_addr_t dma_map_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - u32 xdr_off, size_t len, int dir) +/* The client provided a Write list in the Call message. Fill in + * the segments in the first Write chunk in the Reply's transport + * header with the number of bytes consumed in each segment. + * Remaining chunks are returned unused. 
+ * + * Assumptions: + * - Client has provided only one Write chunk + */ +static void svc_rdma_xdr_encode_write_list(__be32 *rdma_resp, __be32 *wr_ch, + unsigned int consumed) { - struct page *page; - dma_addr_t dma_addr; - if (xdr_off < xdr->head[0].iov_len) { - /* This offset is in the head */ - xdr_off += (unsigned long)xdr->head[0].iov_base & ~PAGE_MASK; - page = virt_to_page(xdr->head[0].iov_base); - } else { - xdr_off -= xdr->head[0].iov_len; - if (xdr_off < xdr->page_len) { - /* This offset is in the page list */ - xdr_off += xdr->page_base; - page = xdr->pages[xdr_off >> PAGE_SHIFT]; - xdr_off &= ~PAGE_MASK; - } else { - /* This offset is in the tail */ - xdr_off -= xdr->page_len; - xdr_off += (unsigned long) - xdr->tail[0].iov_base & ~PAGE_MASK; - page = virt_to_page(xdr->tail[0].iov_base); - } + unsigned int nsegs; + __be32 *p, *q; + + /* RPC-over-RDMA V1 replies never have a Read list. */ + p = rdma_resp + rpcrdma_fixed_maxsz + 1; + + q = wr_ch; + while (*q != xdr_zero) { + nsegs = xdr_encode_write_chunk(p, q, consumed); + q += 2 + nsegs * rpcrdma_segment_maxsz; + p += 2 + nsegs * rpcrdma_segment_maxsz; + consumed = 0; } - dma_addr = ib_dma_map_page(xprt->sc_cm_id->device, page, xdr_off, - min_t(size_t, PAGE_SIZE, len), dir); - return dma_addr; + + /* Terminate Write list */ + *p++ = xdr_zero; + + /* Reply chunk discriminator; may be replaced later */ + *p = xdr_zero; +} + +/* The client provided a Reply chunk in the Call message. Fill in + * the segments in the Reply chunk in the Reply message with the + * number of bytes consumed in each segment. + * + * Assumptions: + * - Reply can always fit in the provided Reply chunk + */ +static void svc_rdma_xdr_encode_reply_chunk(__be32 *rdma_resp, __be32 *rp_ch, + unsigned int consumed) +{ + __be32 *p; + + /* Find the Reply chunk in the Reply's xprt header. + * RPC-over-RDMA V1 replies never have a Read list. + */ + p = rdma_resp + rpcrdma_fixed_maxsz + 1; + + /* Skip past Write list */ + while (*p++ != xdr_zero) + p += 1 + be32_to_cpup(p) * rpcrdma_segment_maxsz; + + xdr_encode_write_chunk(p, rp_ch, consumed); } /* Parse the RPC Call's transport header. */ -static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, - struct rpcrdma_write_array **write, - struct rpcrdma_write_array **reply) +static void svc_rdma_get_write_arrays(__be32 *rdma_argp, + __be32 **write, __be32 **reply) { __be32 *p; - p = (__be32 *)&rmsgp->rm_body.rm_chunks[0]; + p = rdma_argp + rpcrdma_fixed_maxsz; /* Read list */ while (*p++ != xdr_zero) @@ -169,7 +255,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, /* Write list */ if (*p != xdr_zero) { - *write = (struct rpcrdma_write_array *)p; + *write = p; while (*p++ != xdr_zero) p += 1 + be32_to_cpu(*p) * 4; } else { @@ -179,7 +265,7 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, /* Reply chunk */ if (*p != xdr_zero) - *reply = (struct rpcrdma_write_array *)p; + *reply = p; else *reply = NULL; } @@ -189,360 +275,321 @@ static void svc_rdma_get_write_arrays(struct rpcrdma_msg *rmsgp, * Invalidate, and responder chooses one rkey to invalidate. * * Find a candidate rkey to invalidate when sending a reply. Picks the - * first rkey it finds in the chunks lists. + * first R_key it finds in the chunk lists. * * Returns zero if RPC's chunk lists are empty. 
*/ -static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp, - struct rpcrdma_write_array *wr_ary, - struct rpcrdma_write_array *rp_ary) +static u32 svc_rdma_get_inv_rkey(__be32 *rdma_argp, + __be32 *wr_lst, __be32 *rp_ch) { - struct rpcrdma_read_chunk *rd_ary; - struct rpcrdma_segment *arg_ch; + __be32 *p; - rd_ary = (struct rpcrdma_read_chunk *)&rdma_argp->rm_body.rm_chunks[0]; - if (rd_ary->rc_discrim != xdr_zero) - return be32_to_cpu(rd_ary->rc_target.rs_handle); + p = rdma_argp + rpcrdma_fixed_maxsz; + if (*p != xdr_zero) + p += 2; + else if (wr_lst && be32_to_cpup(wr_lst + 1)) + p = wr_lst + 2; + else if (rp_ch && be32_to_cpup(rp_ch + 1)) + p = rp_ch + 2; + else + return 0; + return be32_to_cpup(p); +} - if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) { - arg_ch = &wr_ary->wc_array[0].wc_target; - return be32_to_cpu(arg_ch->rs_handle); - } +/* ib_dma_map_page() is used here because svc_rdma_dma_unmap() + * is used during completion to DMA-unmap this memory, and + * it uses ib_dma_unmap_page() exclusively. + */ +static int svc_rdma_dma_map_buf(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + unsigned int sge_no, + unsigned char *base, + unsigned int len) +{ + unsigned long offset = (unsigned long)base & ~PAGE_MASK; + struct ib_device *dev = rdma->sc_cm_id->device; + dma_addr_t dma_addr; - if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) { - arg_ch = &rp_ary->wc_array[0].wc_target; - return be32_to_cpu(arg_ch->rs_handle); - } + dma_addr = ib_dma_map_page(dev, virt_to_page(base), + offset, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(dev, dma_addr)) + return -EIO; + ctxt->sge[sge_no].addr = dma_addr; + ctxt->sge[sge_no].length = len; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; + svc_rdma_count_mappings(rdma, ctxt); return 0; } -/* Assumptions: - * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE - */ -static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, - u32 rmr, u64 to, - u32 xdr_off, int write_len, - struct svc_rdma_req_map *vec) +static int svc_rdma_dma_map_page(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + unsigned int sge_no, + struct page *page, + unsigned int offset, + unsigned int len) { - struct ib_rdma_wr write_wr; - struct ib_sge *sge; - int xdr_sge_no; - int sge_no; - int sge_bytes; - int sge_off; - int bc; - struct svc_rdma_op_ctxt *ctxt; + struct ib_device *dev = rdma->sc_cm_id->device; + dma_addr_t dma_addr; - if (vec->count > RPCSVC_MAXPAGES) { - pr_err("svcrdma: Too many pages (%lu)\n", vec->count); + dma_addr = ib_dma_map_page(dev, page, offset, len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(dev, dma_addr)) return -EIO; - } - dprintk("svcrdma: RDMA_WRITE rmr=%x, to=%llx, xdr_off=%d, " - "write_len=%d, vec->sge=%p, vec->count=%lu\n", - rmr, (unsigned long long)to, xdr_off, - write_len, vec->sge, vec->count); + ctxt->sge[sge_no].addr = dma_addr; + ctxt->sge[sge_no].length = len; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; + svc_rdma_count_mappings(rdma, ctxt); + return 0; +} - ctxt = svc_rdma_get_context(xprt); +/** + * svc_rdma_map_reply_hdr - DMA map the transport header buffer + * @rdma: controlling transport + * @ctxt: op_ctxt for the Send WR + * @rdma_resp: buffer containing transport header + * @len: length of transport header + * + * Returns: + * %0 if the header is DMA mapped, + * %-EIO if DMA mapping failed. 
+ */ +int svc_rdma_map_reply_hdr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + __be32 *rdma_resp, + unsigned int len) +{ ctxt->direction = DMA_TO_DEVICE; - sge = ctxt->sge; - - /* Find the SGE associated with xdr_off */ - for (bc = xdr_off, xdr_sge_no = 1; bc && xdr_sge_no < vec->count; - xdr_sge_no++) { - if (vec->sge[xdr_sge_no].iov_len > bc) - break; - bc -= vec->sge[xdr_sge_no].iov_len; - } - - sge_off = bc; - bc = write_len; - sge_no = 0; - - /* Copy the remaining SGE */ - while (bc != 0) { - sge_bytes = min_t(size_t, - bc, vec->sge[xdr_sge_no].iov_len-sge_off); - sge[sge_no].length = sge_bytes; - sge[sge_no].addr = - dma_map_xdr(xprt, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(xprt->sc_cm_id->device, - sge[sge_no].addr)) - goto err; - svc_rdma_count_mappings(xprt, ctxt); - sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; - ctxt->count++; - sge_off = 0; - sge_no++; - xdr_sge_no++; - if (xdr_sge_no > vec->count) { - pr_err("svcrdma: Too many sges (%d)\n", xdr_sge_no); - goto err; - } - bc -= sge_bytes; - if (sge_no == xprt->sc_max_sge) - break; - } - - /* Prepare WRITE WR */ - memset(&write_wr, 0, sizeof write_wr); - ctxt->cqe.done = svc_rdma_wc_write; - write_wr.wr.wr_cqe = &ctxt->cqe; - write_wr.wr.sg_list = &sge[0]; - write_wr.wr.num_sge = sge_no; - write_wr.wr.opcode = IB_WR_RDMA_WRITE; - write_wr.wr.send_flags = IB_SEND_SIGNALED; - write_wr.rkey = rmr; - write_wr.remote_addr = to; - - /* Post It */ - atomic_inc(&rdma_stat_write); - if (svc_rdma_send(xprt, &write_wr.wr)) - goto err; - return write_len - bc; - err: - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); - return -EIO; + ctxt->pages[0] = virt_to_page(rdma_resp); + ctxt->count = 1; + return svc_rdma_dma_map_page(rdma, ctxt, 0, ctxt->pages[0], 0, len); } -noinline -static int send_write_chunks(struct svcxprt_rdma *xprt, - struct rpcrdma_write_array *wr_ary, - struct rpcrdma_msg *rdma_resp, - struct svc_rqst *rqstp, - struct svc_rdma_req_map *vec) +/* Load the xdr_buf into the ctxt's sge array, and DMA map each + * element as it is added. + * + * Returns the number of sge elements loaded on success, or + * a negative errno on failure. 
+ */ +static int svc_rdma_map_reply_msg(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, + struct xdr_buf *xdr, __be32 *wr_lst) { - u32 xfer_len = rqstp->rq_res.page_len; - int write_len; - u32 xdr_off; - int chunk_off; - int chunk_no; - int nchunks; - struct rpcrdma_write_array *res_ary; + unsigned int len, sge_no, remaining, page_off; + struct page **ppages; + unsigned char *base; + u32 xdr_pad; int ret; - res_ary = (struct rpcrdma_write_array *) - &rdma_resp->rm_body.rm_chunks[1]; - - /* Write chunks start at the pagelist */ - nchunks = be32_to_cpu(wr_ary->wc_nchunks); - for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0; - xfer_len && chunk_no < nchunks; - chunk_no++) { - struct rpcrdma_segment *arg_ch; - u64 rs_offset; - - arg_ch = &wr_ary->wc_array[chunk_no].wc_target; - write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length)); - - /* Prepare the response chunk given the length actually - * written */ - xdr_decode_hyper((__be32 *)&arg_ch->rs_offset, &rs_offset); - svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, - arg_ch->rs_handle, - arg_ch->rs_offset, - write_len); - chunk_off = 0; - while (write_len) { - ret = send_write(xprt, rqstp, - be32_to_cpu(arg_ch->rs_handle), - rs_offset + chunk_off, - xdr_off, - write_len, - vec); - if (ret <= 0) - goto out_err; - chunk_off += ret; - xdr_off += ret; - xfer_len -= ret; - write_len -= ret; + sge_no = 1; + + ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, + xdr->head[0].iov_base, + xdr->head[0].iov_len); + if (ret < 0) + return ret; + + /* If a Write chunk is present, the xdr_buf's page list + * is not included inline. However the Upper Layer may + * have added XDR padding in the tail buffer, and that + * should not be included inline. + */ + if (wr_lst) { + base = xdr->tail[0].iov_base; + len = xdr->tail[0].iov_len; + xdr_pad = xdr_padsize(xdr->page_len); + + if (len && xdr_pad) { + base += xdr_pad; + len -= xdr_pad; } + + goto tail; + } + + ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT); + page_off = xdr->page_base & ~PAGE_MASK; + remaining = xdr->page_len; + while (remaining) { + len = min_t(u32, PAGE_SIZE - page_off, remaining); + + ret = svc_rdma_dma_map_page(rdma, ctxt, sge_no++, + *ppages++, page_off, len); + if (ret < 0) + return ret; + + remaining -= len; + page_off = 0; } - /* Update the req with the number of chunks actually used */ - svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no); - return rqstp->rq_res.page_len; + base = xdr->tail[0].iov_base; + len = xdr->tail[0].iov_len; +tail: + if (len) { + ret = svc_rdma_dma_map_buf(rdma, ctxt, sge_no++, base, len); + if (ret < 0) + return ret; + } -out_err: - pr_err("svcrdma: failed to send write chunks, rc=%d\n", ret); - return -EIO; + return sge_no - 1; } -noinline -static int send_reply_chunks(struct svcxprt_rdma *xprt, - struct rpcrdma_write_array *rp_ary, - struct rpcrdma_msg *rdma_resp, - struct svc_rqst *rqstp, - struct svc_rdma_req_map *vec) +/* The svc_rqst and all resources it owns are released as soon as + * svc_rdma_sendto returns. Transfer pages under I/O to the ctxt + * so they are released by the Send completion handler. 
+ */ +static void svc_rdma_save_io_pages(struct svc_rqst *rqstp, + struct svc_rdma_op_ctxt *ctxt) { - u32 xfer_len = rqstp->rq_res.len; - int write_len; - u32 xdr_off; - int chunk_no; - int chunk_off; - int nchunks; - struct rpcrdma_segment *ch; - struct rpcrdma_write_array *res_ary; - int ret; + int i, pages = rqstp->rq_next_page - rqstp->rq_respages; - /* XXX: need to fix when reply lists occur with read-list and or - * write-list */ - res_ary = (struct rpcrdma_write_array *) - &rdma_resp->rm_body.rm_chunks[2]; - - /* xdr offset starts at RPC message */ - nchunks = be32_to_cpu(rp_ary->wc_nchunks); - for (xdr_off = 0, chunk_no = 0; - xfer_len && chunk_no < nchunks; - chunk_no++) { - u64 rs_offset; - ch = &rp_ary->wc_array[chunk_no].wc_target; - write_len = min(xfer_len, be32_to_cpu(ch->rs_length)); - - /* Prepare the reply chunk given the length actually - * written */ - xdr_decode_hyper((__be32 *)&ch->rs_offset, &rs_offset); - svc_rdma_xdr_encode_array_chunk(res_ary, chunk_no, - ch->rs_handle, ch->rs_offset, - write_len); - chunk_off = 0; - while (write_len) { - ret = send_write(xprt, rqstp, - be32_to_cpu(ch->rs_handle), - rs_offset + chunk_off, - xdr_off, - write_len, - vec); - if (ret <= 0) - goto out_err; - chunk_off += ret; - xdr_off += ret; - xfer_len -= ret; - write_len -= ret; - } + ctxt->count += pages; + for (i = 0; i < pages; i++) { + ctxt->pages[i + 1] = rqstp->rq_respages[i]; + rqstp->rq_respages[i] = NULL; } - /* Update the req with the number of chunks actually used */ - svc_rdma_xdr_encode_reply_array(res_ary, chunk_no); + rqstp->rq_next_page = rqstp->rq_respages + 1; +} - return rqstp->rq_res.len; +/** + * svc_rdma_post_send_wr - Set up and post one Send Work Request + * @rdma: controlling transport + * @ctxt: op_ctxt for transmitting the Send WR + * @num_sge: number of SGEs to send + * @inv_rkey: R_key argument to Send With Invalidate, or zero + * + * Returns: + * %0 if the Send* was posted successfully, + * %-ENOTCONN if the connection was lost or dropped, + * %-EINVAL if there was a problem with the Send we built, + * %-ENOMEM if ib_post_send failed. + */ +int svc_rdma_post_send_wr(struct svcxprt_rdma *rdma, + struct svc_rdma_op_ctxt *ctxt, int num_sge, + u32 inv_rkey) +{ + struct ib_send_wr *send_wr = &ctxt->send_wr; -out_err: - pr_err("svcrdma: failed to send reply chunks, rc=%d\n", ret); - return -EIO; + dprintk("svcrdma: posting Send WR with %u sge(s)\n", num_sge); + + send_wr->next = NULL; + ctxt->cqe.done = svc_rdma_wc_send; + send_wr->wr_cqe = &ctxt->cqe; + send_wr->sg_list = ctxt->sge; + send_wr->num_sge = num_sge; + send_wr->send_flags = IB_SEND_SIGNALED; + if (inv_rkey) { + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = inv_rkey; + } else { + send_wr->opcode = IB_WR_SEND; + } + + return svc_rdma_send(rdma, send_wr); } -/* This function prepares the portion of the RPCRDMA message to be - * sent in the RDMA_SEND. This function is called after data sent via - * RDMA has already been transmitted. There are three cases: - * - The RPCRDMA header, RPC header, and payload are all sent in a - * single RDMA_SEND. This is the "inline" case. - * - The RPCRDMA header and some portion of the RPC header and data - * are sent via this RDMA_SEND and another portion of the data is - * sent via RDMA. - * - The RPCRDMA header [NOMSG] is sent in this RDMA_SEND and the RPC - * header and data are all transmitted via RDMA. 
- * In all three cases, this function prepares the RPCRDMA header in - * sge[0], the 'type' parameter indicates the type to place in the - * RPCRDMA header, and the 'byte_count' field indicates how much of - * the XDR to include in this RDMA_SEND. NB: The offset of the payload - * to send is zero in the XDR. +/* Prepare the portion of the RPC Reply that will be transmitted + * via RDMA Send. The RPC-over-RDMA transport header is prepared + * in sge[0], and the RPC xdr_buf is prepared in following sges. + * + * Depending on whether a Write list or Reply chunk is present, + * the server may send all, a portion of, or none of the xdr_buf. + * In the latter case, only the transport header (sge[0]) is + * transmitted. + * + * RDMA Send is the last step of transmitting an RPC reply. Pages + * involved in the earlier RDMA Writes are here transferred out + * of the rqstp and into the ctxt's page array. These pages are + * DMA unmapped by each Write completion, but the subsequent Send + * completion finally releases these pages. + * + * Assumptions: + * - The Reply's transport header will never be larger than a page. */ -static int send_reply(struct svcxprt_rdma *rdma, - struct svc_rqst *rqstp, - struct page *page, - struct rpcrdma_msg *rdma_resp, - struct svc_rdma_req_map *vec, - int byte_count, - u32 inv_rkey) +static int svc_rdma_send_reply_msg(struct svcxprt_rdma *rdma, + __be32 *rdma_argp, __be32 *rdma_resp, + struct svc_rqst *rqstp, + __be32 *wr_lst, __be32 *rp_ch) { struct svc_rdma_op_ctxt *ctxt; - struct ib_send_wr send_wr; - u32 xdr_off; - int sge_no; - int sge_bytes; - int page_no; - int pages; - int ret = -EIO; - - /* Prepare the context */ + u32 inv_rkey; + int ret; + + dprintk("svcrdma: sending %s reply: head=%zu, pagelen=%u, tail=%zu\n", + (rp_ch ? 
"RDMA_NOMSG" : "RDMA_MSG"), + rqstp->rq_res.head[0].iov_len, + rqstp->rq_res.page_len, + rqstp->rq_res.tail[0].iov_len); + ctxt = svc_rdma_get_context(rdma); - ctxt->direction = DMA_TO_DEVICE; - ctxt->pages[0] = page; - ctxt->count = 1; - /* Prepare the SGE for the RPCRDMA Header */ - ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[0].length = - svc_rdma_xdr_get_reply_hdr_len((__be32 *)rdma_resp); - ctxt->sge[0].addr = - ib_dma_map_page(rdma->sc_cm_id->device, page, 0, - ctxt->sge[0].length, DMA_TO_DEVICE); - if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) + ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, + svc_rdma_reply_hdr_len(rdma_resp)); + if (ret < 0) goto err; - svc_rdma_count_mappings(rdma, ctxt); - - ctxt->direction = DMA_TO_DEVICE; - /* Map the payload indicated by 'byte_count' */ - xdr_off = 0; - for (sge_no = 1; byte_count && sge_no < vec->count; sge_no++) { - sge_bytes = min_t(size_t, vec->sge[sge_no].iov_len, byte_count); - byte_count -= sge_bytes; - ctxt->sge[sge_no].addr = - dma_map_xdr(rdma, &rqstp->rq_res, xdr_off, - sge_bytes, DMA_TO_DEVICE); - xdr_off += sge_bytes; - if (ib_dma_mapping_error(rdma->sc_cm_id->device, - ctxt->sge[sge_no].addr)) + if (!rp_ch) { + ret = svc_rdma_map_reply_msg(rdma, ctxt, + &rqstp->rq_res, wr_lst); + if (ret < 0) goto err; - svc_rdma_count_mappings(rdma, ctxt); - ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; - ctxt->sge[sge_no].length = sge_bytes; } - if (byte_count != 0) { - pr_err("svcrdma: Could not map %d bytes\n", byte_count); + + svc_rdma_save_io_pages(rqstp, ctxt); + + inv_rkey = 0; + if (rdma->sc_snd_w_inv) + inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_lst, rp_ch); + ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, inv_rkey); + if (ret) goto err; - } - /* Save all respages in the ctxt and remove them from the - * respages array. They are our pages until the I/O - * completes. + return 0; + +err: + pr_err("svcrdma: failed to post Send WR (%d)\n", ret); + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + return ret; +} + +/* Given the client-provided Write and Reply chunks, the server was not + * able to form a complete reply. Return an RDMA_ERROR message so the + * client can retire this RPC transaction. As above, the Send completion + * routine releases payload pages that were part of a previous RDMA Write. + * + * Remote Invalidation is skipped for simplicity. + */ +static int svc_rdma_send_error_msg(struct svcxprt_rdma *rdma, + __be32 *rdma_resp, struct svc_rqst *rqstp) +{ + struct svc_rdma_op_ctxt *ctxt; + __be32 *p; + int ret; + + ctxt = svc_rdma_get_context(rdma); + + /* Replace the original transport header with an + * RDMA_ERROR response. XID etc are preserved. 
*/ - pages = rqstp->rq_next_page - rqstp->rq_respages; - for (page_no = 0; page_no < pages; page_no++) { - ctxt->pages[page_no+1] = rqstp->rq_respages[page_no]; - ctxt->count++; - rqstp->rq_respages[page_no] = NULL; - } - rqstp->rq_next_page = rqstp->rq_respages + 1; + p = rdma_resp + 3; + *p++ = rdma_error; + *p = err_chunk; - if (sge_no > rdma->sc_max_sge) { - pr_err("svcrdma: Too many sges (%d)\n", sge_no); + ret = svc_rdma_map_reply_hdr(rdma, ctxt, rdma_resp, 20); + if (ret < 0) goto err; - } - memset(&send_wr, 0, sizeof send_wr); - ctxt->cqe.done = svc_rdma_wc_send; - send_wr.wr_cqe = &ctxt->cqe; - send_wr.sg_list = ctxt->sge; - send_wr.num_sge = sge_no; - if (inv_rkey) { - send_wr.opcode = IB_WR_SEND_WITH_INV; - send_wr.ex.invalidate_rkey = inv_rkey; - } else - send_wr.opcode = IB_WR_SEND; - send_wr.send_flags = IB_SEND_SIGNALED; - ret = svc_rdma_send(rdma, &send_wr); + svc_rdma_save_io_pages(rqstp, ctxt); + + ret = svc_rdma_post_send_wr(rdma, ctxt, 1 + ret, 0); if (ret) goto err; return 0; - err: +err: + pr_err("svcrdma: failed to post Send WR (%d)\n", ret); svc_rdma_unmap_dma(ctxt); svc_rdma_put_context(ctxt, 1); return ret; @@ -552,39 +599,36 @@ void svc_rdma_prep_reply_hdr(struct svc_rqst *rqstp) { } +/** + * svc_rdma_sendto - Transmit an RPC reply + * @rqstp: processed RPC request, reply XDR already in ::rq_res + * + * Any resources still associated with @rqstp are released upon return. + * If no reply message was possible, the connection is closed. + * + * Returns: + * %0 if an RPC reply has been successfully posted, + * %-ENOMEM if a resource shortage occurred (connection is lost), + * %-ENOTCONN if posting failed (connection is lost). + */ int svc_rdma_sendto(struct svc_rqst *rqstp) { struct svc_xprt *xprt = rqstp->rq_xprt; struct svcxprt_rdma *rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); - struct rpcrdma_msg *rdma_argp; - struct rpcrdma_msg *rdma_resp; - struct rpcrdma_write_array *wr_ary, *rp_ary; - int ret; - int inline_bytes; + __be32 *p, *rdma_argp, *rdma_resp, *wr_lst, *rp_ch; + struct xdr_buf *xdr = &rqstp->rq_res; struct page *res_page; - struct svc_rdma_req_map *vec; - u32 inv_rkey; - __be32 *p; - - dprintk("svcrdma: sending response for rqstp=%p\n", rqstp); + int ret; - /* Get the RDMA request header. The receive logic always - * places this at the start of page 0. + /* Find the call's chunk lists to decide how to send the reply. + * Receive places the Call's xprt header at the start of page 0. */ rdma_argp = page_address(rqstp->rq_pages[0]); - svc_rdma_get_write_arrays(rdma_argp, &wr_ary, &rp_ary); - - inv_rkey = 0; - if (rdma->sc_snd_w_inv) - inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary); + svc_rdma_get_write_arrays(rdma_argp, &wr_lst, &rp_ch); - /* Build an req vec for the XDR */ - vec = svc_rdma_get_req_map(rdma); - ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL); - if (ret) - goto err0; - inline_bytes = rqstp->rq_res.len; + dprintk("svcrdma: preparing response for XID 0x%08x\n", + be32_to_cpup(rdma_argp)); /* Create the RDMA response header. xprt->xpt_mutex, * acquired in svc_send(), serializes RPC replies. The @@ -598,115 +642,57 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) goto err0; rdma_resp = page_address(res_page); - p = &rdma_resp->rm_xid; - *p++ = rdma_argp->rm_xid; - *p++ = rdma_argp->rm_vers; + p = rdma_resp; + *p++ = *rdma_argp; + *p++ = *(rdma_argp + 1); *p++ = rdma->sc_fc_credits; - *p++ = rp_ary ? rdma_nomsg : rdma_msg; + *p++ = rp_ch ? 
rdma_nomsg : rdma_msg; /* Start with empty chunks */ *p++ = xdr_zero; *p++ = xdr_zero; *p = xdr_zero; - /* Send any write-chunk data and build resp write-list */ - if (wr_ary) { - ret = send_write_chunks(rdma, wr_ary, rdma_resp, rqstp, vec); + if (wr_lst) { + /* XXX: Presume the client sent only one Write chunk */ + ret = svc_rdma_send_write_chunk(rdma, wr_lst, xdr); if (ret < 0) - goto err1; - inline_bytes -= ret + xdr_padsize(ret); + goto err2; + svc_rdma_xdr_encode_write_list(rdma_resp, wr_lst, ret); } - - /* Send any reply-list data and update resp reply-list */ - if (rp_ary) { - ret = send_reply_chunks(rdma, rp_ary, rdma_resp, rqstp, vec); + if (rp_ch) { + ret = svc_rdma_send_reply_chunk(rdma, rp_ch, wr_lst, xdr); if (ret < 0) - goto err1; - inline_bytes -= ret; + goto err2; + svc_rdma_xdr_encode_reply_chunk(rdma_resp, rp_ch, ret); } - /* Post a fresh Receive buffer _before_ sending the reply */ ret = svc_rdma_post_recv(rdma, GFP_KERNEL); if (ret) goto err1; - - ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec, - inline_bytes, inv_rkey); + ret = svc_rdma_send_reply_msg(rdma, rdma_argp, rdma_resp, rqstp, + wr_lst, rp_ch); if (ret < 0) goto err0; + return 0; - svc_rdma_put_req_map(rdma, vec); - dprintk("svcrdma: send_reply returns %d\n", ret); - return ret; + err2: + if (ret != -E2BIG) + goto err1; + + ret = svc_rdma_post_recv(rdma, GFP_KERNEL); + if (ret) + goto err1; + ret = svc_rdma_send_error_msg(rdma, rdma_resp, rqstp); + if (ret < 0) + goto err0; + return 0; err1: put_page(res_page); err0: - svc_rdma_put_req_map(rdma, vec); pr_err("svcrdma: Could not send reply, err=%d. Closing transport.\n", ret); - set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + set_bit(XPT_CLOSE, &xprt->xpt_flags); return -ENOTCONN; } - -void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, - int status) -{ - struct ib_send_wr err_wr; - struct page *p; - struct svc_rdma_op_ctxt *ctxt; - enum rpcrdma_errcode err; - __be32 *va; - int length; - int ret; - - ret = svc_rdma_repost_recv(xprt, GFP_KERNEL); - if (ret) - return; - - p = alloc_page(GFP_KERNEL); - if (!p) - return; - va = page_address(p); - - /* XDR encode an error reply */ - err = ERR_CHUNK; - if (status == -EPROTONOSUPPORT) - err = ERR_VERS; - length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va); - - ctxt = svc_rdma_get_context(xprt); - ctxt->direction = DMA_TO_DEVICE; - ctxt->count = 1; - ctxt->pages[0] = p; - - /* Prepare SGE for local address */ - ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey; - ctxt->sge[0].length = length; - ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device, - p, 0, length, DMA_TO_DEVICE); - if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) { - dprintk("svcrdma: Error mapping buffer for protocol error\n"); - svc_rdma_put_context(ctxt, 1); - return; - } - svc_rdma_count_mappings(xprt, ctxt); - - /* Prepare SEND WR */ - memset(&err_wr, 0, sizeof(err_wr)); - ctxt->cqe.done = svc_rdma_wc_send; - err_wr.wr_cqe = &ctxt->cqe; - err_wr.sg_list = ctxt->sge; - err_wr.num_sge = 1; - err_wr.opcode = IB_WR_SEND; - err_wr.send_flags = IB_SEND_SIGNALED; - - /* Post It */ - ret = svc_rdma_send(xprt, &err_wr); - if (ret) { - dprintk("svcrdma: Error %d posting send for protocol error\n", - ret); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 1); - } -} diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index fc8f14c7bfec60dc5828340861a747e49f06193e..a9d9cb1ba4c6068b1a80c225cb147d1993a75d76 100644 --- 
a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -272,85 +272,6 @@ static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) } } -static struct svc_rdma_req_map *alloc_req_map(gfp_t flags) -{ - struct svc_rdma_req_map *map; - - map = kmalloc(sizeof(*map), flags); - if (map) - INIT_LIST_HEAD(&map->free); - return map; -} - -static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt) -{ - unsigned int i; - - /* One for each receive buffer on this connection. */ - i = xprt->sc_max_requests; - - while (i--) { - struct svc_rdma_req_map *map; - - map = alloc_req_map(GFP_KERNEL); - if (!map) { - dprintk("svcrdma: No memory for request map\n"); - return false; - } - list_add(&map->free, &xprt->sc_maps); - } - return true; -} - -struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt) -{ - struct svc_rdma_req_map *map = NULL; - - spin_lock(&xprt->sc_map_lock); - if (list_empty(&xprt->sc_maps)) - goto out_empty; - - map = list_first_entry(&xprt->sc_maps, - struct svc_rdma_req_map, free); - list_del_init(&map->free); - spin_unlock(&xprt->sc_map_lock); - -out: - map->count = 0; - return map; - -out_empty: - spin_unlock(&xprt->sc_map_lock); - - /* Pre-allocation amount was incorrect */ - map = alloc_req_map(GFP_NOIO); - if (map) - goto out; - - WARN_ONCE(1, "svcrdma: empty request map list?\n"); - return NULL; -} - -void svc_rdma_put_req_map(struct svcxprt_rdma *xprt, - struct svc_rdma_req_map *map) -{ - spin_lock(&xprt->sc_map_lock); - list_add(&map->free, &xprt->sc_maps); - spin_unlock(&xprt->sc_map_lock); -} - -static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt) -{ - while (!list_empty(&xprt->sc_maps)) { - struct svc_rdma_req_map *map; - - map = list_first_entry(&xprt->sc_maps, - struct svc_rdma_req_map, free); - list_del(&map->free); - kfree(map); - } -} - /* QP event handler */ static void qp_event_handler(struct ib_event *event, void *context) { @@ -473,24 +394,6 @@ void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) svc_rdma_put_context(ctxt, 1); } -/** - * svc_rdma_wc_write - Invoked by RDMA provider for each polled Write WC - * @cq: completion queue - * @wc: completed WR - * - */ -void svc_rdma_wc_write(struct ib_cq *cq, struct ib_wc *wc) -{ - struct ib_cqe *cqe = wc->wr_cqe; - struct svc_rdma_op_ctxt *ctxt; - - svc_rdma_send_wc_common_put(cq, wc, "write"); - - ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe); - svc_rdma_unmap_dma(ctxt); - svc_rdma_put_context(ctxt, 0); -} - /** * svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC * @cq: completion queue @@ -561,14 +464,14 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); INIT_LIST_HEAD(&cma_xprt->sc_ctxts); - INIT_LIST_HEAD(&cma_xprt->sc_maps); + INIT_LIST_HEAD(&cma_xprt->sc_rw_ctxts); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); spin_lock_init(&cma_xprt->sc_frmr_q_lock); spin_lock_init(&cma_xprt->sc_ctxt_lock); - spin_lock_init(&cma_xprt->sc_map_lock); + spin_lock_init(&cma_xprt->sc_rw_ctxt_lock); /* * Note that this implies that the underlying transport support @@ -999,6 +902,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) newxprt, newxprt->sc_cm_id); dev = newxprt->sc_cm_id->device; + newxprt->sc_port_num = newxprt->sc_cm_id->port_num; /* Qualify the transport resource defaults with the * capabilities of this particular device */ @@ 
-1014,13 +918,11 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 					   svcrdma_max_bc_requests);
 	newxprt->sc_rq_depth = newxprt->sc_max_requests +
 			       newxprt->sc_max_bc_requests;
-	newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
+	newxprt->sc_sq_depth = newxprt->sc_rq_depth;
 	atomic_set(&newxprt->sc_sq_avail, newxprt->sc_sq_depth);
 
 	if (!svc_rdma_prealloc_ctxts(newxprt))
 		goto errout;
-	if (!svc_rdma_prealloc_maps(newxprt))
-		goto errout;
 
 	/*
 	 * Limit ORD based on client limit, local device limit, and
@@ -1050,6 +952,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	memset(&qp_attr, 0, sizeof qp_attr);
 	qp_attr.event_handler = qp_event_handler;
 	qp_attr.qp_context = &newxprt->sc_xprt;
+	qp_attr.port_num = newxprt->sc_cm_id->port_num;
+	qp_attr.cap.max_rdma_ctxs = newxprt->sc_max_requests;
 	qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
 	qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
 	qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
@@ -1248,8 +1152,8 @@ static void __svc_rdma_free(struct work_struct *work)
 	}
 
 	rdma_dealloc_frmr_q(rdma);
+	svc_rdma_destroy_rw_ctxts(rdma);
 	svc_rdma_destroy_ctxts(rdma);
-	svc_rdma_destroy_maps(rdma);
 
 	/* Destroy the QP if present (not a listener) */
 	if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index c717f54107768902db7c571ce65f4019624c7bcc..62ecbccd9748e54874059209070ebe4a6b9591e7 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -66,8 +66,8 @@ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
 unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
 static unsigned int xprt_rdma_inline_write_padding;
-static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
-int xprt_rdma_pad_optimize = 0;
+unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
+int xprt_rdma_pad_optimize;
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -396,7 +396,7 @@ xprt_setup_rdma(struct xprt_create *args)
 
 	new_xprt = rpcx_to_rdmax(xprt);
 
-	rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy);
+	rc = rpcrdma_ia_open(new_xprt, sap);
 	if (rc)
 		goto out1;
 
@@ -457,19 +457,33 @@ xprt_setup_rdma(struct xprt_create *args)
 	return ERR_PTR(rc);
 }
 
-/*
- * Close a connection, during shutdown or timeout/reconnect
+/**
+ * xprt_rdma_close - Close down RDMA connection
+ * @xprt: generic transport to be closed
+ *
+ * Called during transport shutdown, reconnect, or device
+ * removal. Caller holds the transport's write lock.
*/ static void xprt_rdma_close(struct rpc_xprt *xprt) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + + dprintk("RPC: %s: closing xprt %p\n", __func__, xprt); - dprintk("RPC: %s: closing\n", __func__); - if (r_xprt->rx_ep.rep_connected > 0) + if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) { + xprt_clear_connected(xprt); + rpcrdma_ia_remove(ia); + return; + } + if (ep->rep_connected == -ENODEV) + return; + if (ep->rep_connected > 0) xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); - rpcrdma_ep_disconnect(&r_xprt->rx_ep, &r_xprt->rx_ia); + rpcrdma_ep_disconnect(ep, ia); } static void @@ -484,6 +498,27 @@ xprt_rdma_set_port(struct rpc_xprt *xprt, u16 port) dprintk("RPC: %s: %u\n", __func__, port); } +/** + * xprt_rdma_timer - invoked when an RPC times out + * @xprt: controlling RPC transport + * @task: RPC task that timed out + * + * Invoked when the transport is still connected, but an RPC + * retransmit timeout occurs. + * + * Since RDMA connections don't have a keep-alive, forcibly + * disconnect and retry to connect. This drives full + * detection of the network path, and retransmissions of + * all pending RPCs. + */ +static void +xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) +{ + dprintk("RPC: %5u %s: xprt = %p\n", task->tk_pid, __func__, xprt); + + xprt_force_disconnect(xprt); +} + static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { @@ -659,6 +694,8 @@ xprt_rdma_free(struct rpc_task *task) * xprt_rdma_send_request - marshal and send an RPC request * @task: RPC task with an RPC message in rq_snd_buf * + * Caller holds the transport's write lock. + * * Return values: * 0: The request has been sent * ENOTCONN: Caller needs to invoke connect logic then call again @@ -685,6 +722,9 @@ xprt_rdma_send_request(struct rpc_task *task) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); int rc = 0; + if (!xprt_connected(xprt)) + goto drop_connection; + /* On retransmit, remove any previously registered chunks */ if (unlikely(!list_empty(&req->rl_registered))) r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false); @@ -776,6 +816,7 @@ static struct rpc_xprt_ops xprt_rdma_procs = { .alloc_slot = xprt_alloc_slot, .release_request = xprt_release_rqst_cong, /* ditto */ .set_retrans_timeout = xprt_set_retrans_timeout_def, /* ditto */ + .timer = xprt_rdma_timer, .rpcbind = rpcb_getport_async, /* sunrpc/rpcb_clnt.c */ .set_port = xprt_rdma_set_port, .connect = xprt_rdma_connect, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 3b332b395045b5b0ad07bc13a30db1420d7f7082..3dbce9ac4327a89ea6d84f348041609c5d653b4a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -53,7 +53,7 @@ #include #include #include -#include /* try_module_get()/module_put() */ + #include #include "xprt_rdma.h" @@ -69,8 +69,11 @@ /* * internal functions */ +static void rpcrdma_create_mrs(struct rpcrdma_xprt *r_xprt); +static void rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf); +static void rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb); -static struct workqueue_struct *rpcrdma_receive_wq; +static struct workqueue_struct *rpcrdma_receive_wq __read_mostly; int rpcrdma_alloc_wq(void) @@ -180,7 +183,7 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) rep->rr_wc_flags = wc->wc_flags; rep->rr_inv_rkey = wc->ex.invalidate_rkey; - ib_dma_sync_single_for_cpu(rep->rr_device, + 
ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf), rdmab_addr(rep->rr_rdmabuf), rep->rr_len, DMA_FROM_DEVICE); @@ -262,6 +265,21 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) __func__, ep); complete(&ia->ri_done); break; + case RDMA_CM_EVENT_DEVICE_REMOVAL: +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) + pr_info("rpcrdma: removing device for %pIS:%u\n", + sap, rpc_get_port(sap)); +#endif + set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); + ep->rep_connected = -ENODEV; + xprt_force_disconnect(&xprt->rx_xprt); + wait_for_completion(&ia->ri_remove_done); + + ia->ri_id = NULL; + ia->ri_pd = NULL; + ia->ri_device = NULL; + /* Return 1 to ensure the core destroys the id. */ + return 1; case RDMA_CM_EVENT_ESTABLISHED: connstate = 1; ib_query_qp(ia->ri_id->qp, attr, @@ -291,9 +309,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) goto connected; case RDMA_CM_EVENT_DISCONNECTED: connstate = -ECONNABORTED; - goto connected; - case RDMA_CM_EVENT_DEVICE_REMOVAL: - connstate = -ENODEV; connected: dprintk("RPC: %s: %sconnected\n", __func__, connstate > 0 ? "" : "dis"); @@ -329,14 +344,6 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) return 0; } -static void rpcrdma_destroy_id(struct rdma_cm_id *id) -{ - if (id) { - module_put(id->device->owner); - rdma_destroy_id(id); - } -} - static struct rdma_cm_id * rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia, struct sockaddr *addr) @@ -346,6 +353,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, int rc; init_completion(&ia->ri_done); + init_completion(&ia->ri_remove_done); id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC); @@ -370,16 +378,6 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, goto out; } - /* FIXME: - * Until xprtrdma supports DEVICE_REMOVAL, the provider must - * be pinned while there are active NFS/RDMA mounts to prevent - * hangs and crashes at umount time. - */ - if (!ia->ri_async_rc && !try_module_get(id->device->owner)) { - dprintk("RPC: %s: Failed to get device module\n", - __func__); - ia->ri_async_rc = -ENODEV; - } rc = ia->ri_async_rc; if (rc) goto out; @@ -389,21 +387,20 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, if (rc) { dprintk("RPC: %s: rdma_resolve_route() failed %i\n", __func__, rc); - goto put; + goto out; } rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout); if (rc < 0) { dprintk("RPC: %s: wait() exited: %i\n", __func__, rc); - goto put; + goto out; } rc = ia->ri_async_rc; if (rc) - goto put; + goto out; return id; -put: - module_put(id->device->owner); + out: rdma_destroy_id(id); return ERR_PTR(rc); @@ -413,13 +410,16 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, * Exported functions. */ -/* - * Open and initialize an Interface Adapter. - * o initializes fields of struct rpcrdma_ia, including - * interface and provider attributes and protection zone. +/** + * rpcrdma_ia_open - Open and initialize an Interface Adapter. + * @xprt: controlling transport + * @addr: IP address of remote peer + * + * Returns 0 on success, negative errno if an appropriate + * Interface Adapter could not be found and opened. 
*/ int -rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) +rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr) { struct rpcrdma_ia *ia = &xprt->rx_ia; int rc; @@ -427,7 +427,7 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) ia->ri_id = rpcrdma_create_id(xprt, ia, addr); if (IS_ERR(ia->ri_id)) { rc = PTR_ERR(ia->ri_id); - goto out1; + goto out_err; } ia->ri_device = ia->ri_id->device; @@ -435,10 +435,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) if (IS_ERR(ia->ri_pd)) { rc = PTR_ERR(ia->ri_pd); pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc); - goto out2; + goto out_err; } - switch (memreg) { + switch (xprt_rdma_memreg_strategy) { case RPCRDMA_FRMR: if (frwr_is_supported(ia)) { ia->ri_ops = &rpcrdma_frwr_memreg_ops; @@ -452,28 +452,73 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) } /*FALLTHROUGH*/ default: - pr_err("rpcrdma: Unsupported memory registration mode: %d\n", - memreg); + pr_err("rpcrdma: Device %s does not support memreg mode %d\n", + ia->ri_device->name, xprt_rdma_memreg_strategy); rc = -EINVAL; - goto out3; + goto out_err; } return 0; -out3: - ib_dealloc_pd(ia->ri_pd); - ia->ri_pd = NULL; -out2: - rpcrdma_destroy_id(ia->ri_id); - ia->ri_id = NULL; -out1: +out_err: + rpcrdma_ia_close(ia); return rc; } -/* - * Clean up/close an IA. - * o if event handles and PD have been initialized, free them. - * o close the IA +/** + * rpcrdma_ia_remove - Handle device driver unload + * @ia: interface adapter being removed + * + * Divest transport H/W resources associated with this adapter, + * but allow it to be restored later. + */ +void +rpcrdma_ia_remove(struct rpcrdma_ia *ia) +{ + struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, + rx_ia); + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req; + struct rpcrdma_rep *rep; + + cancel_delayed_work_sync(&buf->rb_refresh_worker); + + /* This is similar to rpcrdma_ep_destroy, but: + * - Don't cancel the connect worker. + * - Don't call rpcrdma_ep_disconnect, which waits + * for another conn upcall, which will deadlock. + * - rdma_disconnect is unneeded, the underlying + * connection is already gone. + */ + if (ia->ri_id->qp) { + ib_drain_qp(ia->ri_id->qp); + rdma_destroy_qp(ia->ri_id); + ia->ri_id->qp = NULL; + } + ib_free_cq(ep->rep_attr.recv_cq); + ib_free_cq(ep->rep_attr.send_cq); + + /* The ULP is responsible for ensuring all DMA + * mappings and MRs are gone. + */ + list_for_each_entry(rep, &buf->rb_recv_bufs, rr_list) + rpcrdma_dma_unmap_regbuf(rep->rr_rdmabuf); + list_for_each_entry(req, &buf->rb_allreqs, rl_all) { + rpcrdma_dma_unmap_regbuf(req->rl_rdmabuf); + rpcrdma_dma_unmap_regbuf(req->rl_sendbuf); + rpcrdma_dma_unmap_regbuf(req->rl_recvbuf); + } + rpcrdma_destroy_mrs(buf); + + /* Allow waiters to continue */ + complete(&ia->ri_remove_done); +} + +/** + * rpcrdma_ia_close - Clean up/close an IA. 
+ * @ia: interface adapter to close + * */ void rpcrdma_ia_close(struct rpcrdma_ia *ia) @@ -482,13 +527,15 @@ rpcrdma_ia_close(struct rpcrdma_ia *ia) if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) { if (ia->ri_id->qp) rdma_destroy_qp(ia->ri_id); - rpcrdma_destroy_id(ia->ri_id); - ia->ri_id = NULL; + rdma_destroy_id(ia->ri_id); } + ia->ri_id = NULL; + ia->ri_device = NULL; /* If the pd is still busy, xprtrdma missed freeing a resource */ if (ia->ri_pd && !IS_ERR(ia->ri_pd)) ib_dealloc_pd(ia->ri_pd); + ia->ri_pd = NULL; } /* @@ -646,6 +693,99 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) ib_free_cq(ep->rep_attr.send_cq); } +/* Re-establish a connection after a device removal event. + * Unlike a normal reconnection, a fresh PD and a new set + * of MRs and buffers is needed. + */ +static int +rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) +{ + struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr; + int rc, err; + + pr_info("%s: r_xprt = %p\n", __func__, r_xprt); + + rc = -EHOSTUNREACH; + if (rpcrdma_ia_open(r_xprt, sap)) + goto out1; + + rc = -ENOMEM; + err = rpcrdma_ep_create(ep, ia, &r_xprt->rx_data); + if (err) { + pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err); + goto out2; + } + + rc = -ENETUNREACH; + err = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); + if (err) { + pr_err("rpcrdma: rdma_create_qp returned %d\n", err); + goto out3; + } + + rpcrdma_create_mrs(r_xprt); + return 0; + +out3: + rpcrdma_ep_destroy(ep, ia); +out2: + rpcrdma_ia_close(ia); +out1: + return rc; +} + +static int +rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt, struct rpcrdma_ep *ep, + struct rpcrdma_ia *ia) +{ + struct sockaddr *sap = (struct sockaddr *)&r_xprt->rx_data.addr; + struct rdma_cm_id *id, *old; + int err, rc; + + dprintk("RPC: %s: reconnecting...\n", __func__); + + rpcrdma_ep_disconnect(ep, ia); + + rc = -EHOSTUNREACH; + id = rpcrdma_create_id(r_xprt, ia, sap); + if (IS_ERR(id)) + goto out; + + /* As long as the new ID points to the same device as the + * old ID, we can reuse the transport's existing PD and all + * previously allocated MRs. Also, the same device means + * the transport's previous DMA mappings are still valid. + * + * This is a sanity check only. There should be no way these + * point to two different devices here. + */ + old = id; + rc = -ENETUNREACH; + if (ia->ri_device != id->device) { + pr_err("rpcrdma: can't reconnect on different device!\n"); + goto out_destroy; + } + + err = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); + if (err) { + dprintk("RPC: %s: rdma_create_qp returned %d\n", + __func__, err); + goto out_destroy; + } + + /* Atomically replace the transport's ID and QP. */ + rc = 0; + old = ia->ri_id; + ia->ri_id = id; + rdma_destroy_qp(old); + +out_destroy: + rdma_destroy_id(old); +out: + return rc; +} + /* * Connect unconnected endpoint. */ @@ -654,61 +794,30 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); - struct rdma_cm_id *id, *old; - struct sockaddr *sap; unsigned int extras; - int rc = 0; + int rc; - if (ep->rep_connected != 0) { retry: - dprintk("RPC: %s: reconnecting...\n", __func__); - - rpcrdma_ep_disconnect(ep, ia); - - sap = (struct sockaddr *)&r_xprt->rx_data.addr; - id = rpcrdma_create_id(r_xprt, ia, sap); - if (IS_ERR(id)) { - rc = -EHOSTUNREACH; - goto out; - } - /* TEMP TEMP TEMP - fail if new device: - * Deregister/remarshal *all* requests! 
- * Close and recreate adapter, pd, etc! - * Re-determine all attributes still sane! - * More stuff I haven't thought of! - * Rrrgh! - */ - if (ia->ri_device != id->device) { - printk("RPC: %s: can't reconnect on " - "different device!\n", __func__); - rpcrdma_destroy_id(id); - rc = -ENETUNREACH; - goto out; - } - /* END TEMP */ - rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr); - if (rc) { - dprintk("RPC: %s: rdma_create_qp failed %i\n", - __func__, rc); - rpcrdma_destroy_id(id); - rc = -ENETUNREACH; - goto out; - } - - old = ia->ri_id; - ia->ri_id = id; - - rdma_destroy_qp(old); - rpcrdma_destroy_id(old); - } else { + switch (ep->rep_connected) { + case 0: dprintk("RPC: %s: connecting...\n", __func__); rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr); if (rc) { dprintk("RPC: %s: rdma_create_qp failed %i\n", __func__, rc); - /* do not update ep->rep_connected */ - return -ENETUNREACH; + rc = -ENETUNREACH; + goto out_noupdate; } + break; + case -ENODEV: + rc = rpcrdma_ep_recreate_xprt(r_xprt, ep, ia); + if (rc) + goto out_noupdate; + break; + default: + rc = rpcrdma_ep_reconnect(r_xprt, ep, ia); + if (rc) + goto out; } ep->rep_connected = 0; @@ -736,6 +845,8 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) out: if (rc) ep->rep_connected = rc; + +out_noupdate: return rc; } @@ -878,7 +989,6 @@ struct rpcrdma_rep * rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; struct rpcrdma_rep *rep; int rc; @@ -894,7 +1004,6 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) goto out_free; } - rep->rr_device = ia->ri_device; rep->rr_cqe.done = rpcrdma_wc_receive; rep->rr_rxprt = r_xprt; INIT_WORK(&rep->rr_work, rpcrdma_reply_handler); @@ -1037,6 +1146,7 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { cancel_delayed_work_sync(&buf->rb_recovery_worker); + cancel_delayed_work_sync(&buf->rb_refresh_worker); while (!list_empty(&buf->rb_recv_bufs)) { struct rpcrdma_rep *rep; @@ -1081,7 +1191,8 @@ rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt) out_nomws: dprintk("RPC: %s: no MWs available\n", __func__); - schedule_delayed_work(&buf->rb_refresh_worker, 0); + if (r_xprt->rx_ep.rep_connected != -ENODEV) + schedule_delayed_work(&buf->rb_refresh_worker, 0); /* Allow the reply handler and refresh worker to run */ cond_resched(); @@ -1231,17 +1342,19 @@ rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction, bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb) { + struct ib_device *device = ia->ri_device; + if (rb->rg_direction == DMA_NONE) return false; - rb->rg_iov.addr = ib_dma_map_single(ia->ri_device, + rb->rg_iov.addr = ib_dma_map_single(device, (void *)rb->rg_base, rdmab_length(rb), rb->rg_direction); - if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb))) + if (ib_dma_mapping_error(device, rdmab_addr(rb))) return false; - rb->rg_device = ia->ri_device; + rb->rg_device = device; rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey; return true; } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 171a35116de911878ce88402c3bac2979740af63..1d66acf1a723e51efb0b2e91065122cd66856af7 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -69,6 +69,7 @@ struct rpcrdma_ia { struct rdma_cm_id *ri_id; struct ib_pd *ri_pd; struct completion ri_done; + struct completion ri_remove_done; int ri_async_rc; unsigned int ri_max_segs; unsigned int ri_max_frmr_depth; @@ -78,10 +79,15 @@ 
struct rpcrdma_ia { bool ri_reminv_expected; bool ri_implicit_roundup; enum ib_mr_type ri_mrtype; + unsigned long ri_flags; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; }; +enum { + RPCRDMA_IAF_REMOVING = 0, +}; + /* * RDMA Endpoint -- one per transport instance */ @@ -164,6 +170,12 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } +static inline struct ib_device * +rdmab_device(struct rpcrdma_regbuf *rb) +{ + return rb->rg_device; +} + #define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) /* To ensure a transport can always make forward progress, @@ -209,7 +221,6 @@ struct rpcrdma_rep { unsigned int rr_len; int rr_wc_flags; u32 rr_inv_rkey; - struct ib_device *rr_device; struct rpcrdma_xprt *rr_rxprt; struct work_struct rr_work; struct list_head rr_list; @@ -380,7 +391,6 @@ struct rpcrdma_buffer { spinlock_t rb_mwlock; /* protect rb_mws list */ struct list_head rb_mws; struct list_head rb_all; - char *rb_pool; spinlock_t rb_lock; /* protect buf lists */ int rb_send_count, rb_recv_count; @@ -497,10 +507,16 @@ struct rpcrdma_xprt { * Default is 0, see sysctl entry and rpc_rdma.c rpcrdma_convert_iovs() */ extern int xprt_rdma_pad_optimize; +/* This setting controls the hunt for a supported memory + * registration strategy. + */ +extern unsigned int xprt_rdma_memreg_strategy; + /* * Interface Adapter calls - xprtrdma/verbs.c */ -int rpcrdma_ia_open(struct rpcrdma_xprt *, struct sockaddr *, int); +int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr); +void rpcrdma_ia_remove(struct rpcrdma_ia *ia); void rpcrdma_ia_close(struct rpcrdma_ia *); bool frwr_is_supported(struct rpcrdma_ia *); bool fmr_is_supported(struct rpcrdma_ia *); diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c index 017801f9dbaae28ff3bd411af3fce3a84da6c90d..8d40a7d31c9908c22b757fc0ab92bce436211c90 100644 --- a/net/switchdev/switchdev.c +++ b/net/switchdev/switchdev.c @@ -826,7 +826,7 @@ static int switchdev_port_br_setlink_protinfo(struct net_device *dev, int err; err = nla_validate_nested(protinfo, IFLA_BRPORT_MAX, - switchdev_port_bridge_policy); + switchdev_port_bridge_policy, NULL); if (err) return err; diff --git a/net/sysctl_net.c b/net/sysctl_net.c index 919981324171bd2028ed3a8662d16bf77f520c1a..9aed6fe1bf1ad67057d5ec001806729cd5988a94 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -106,7 +106,6 @@ __init int net_sysctl_init(void) ret = register_pernet_subsys(&sysctl_pernet_ops); if (ret) goto out1; - register_sysctl_root(&net_sysctl_root); out: return ret; out1: diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 33a5bdfbef76c00578921198bfbbe9c735f643d2..d174ee3254eecb523dbc86061d80a1e812dcc305 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -802,7 +802,7 @@ int tipc_nl_bearer_get(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, info->attrs[TIPC_NLA_BEARER], - tipc_nl_bearer_policy); + tipc_nl_bearer_policy, info->extack); if (err) return err; @@ -851,7 +851,7 @@ int tipc_nl_bearer_disable(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, info->attrs[TIPC_NLA_BEARER], - tipc_nl_bearer_policy); + tipc_nl_bearer_policy, info->extack); if (err) return err; @@ -891,7 +891,7 @@ int tipc_nl_bearer_enable(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, info->attrs[TIPC_NLA_BEARER], - tipc_nl_bearer_policy); + tipc_nl_bearer_policy, info->extack); if (err) return err; @@ 
-939,7 +939,7 @@ int tipc_nl_bearer_add(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, info->attrs[TIPC_NLA_BEARER], - tipc_nl_bearer_policy); + tipc_nl_bearer_policy, info->extack); if (err) return err; @@ -982,7 +982,7 @@ int tipc_nl_bearer_set(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(attrs, TIPC_NLA_BEARER_MAX, info->attrs[TIPC_NLA_BEARER], - tipc_nl_bearer_policy); + tipc_nl_bearer_policy, info->extack); if (err) return err; @@ -1104,7 +1104,7 @@ int tipc_nl_media_get(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(attrs, TIPC_NLA_MEDIA_MAX, info->attrs[TIPC_NLA_MEDIA], - tipc_nl_media_policy); + tipc_nl_media_policy, info->extack); if (err) return err; @@ -1152,7 +1152,7 @@ int tipc_nl_media_set(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(attrs, TIPC_NLA_MEDIA_MAX, info->attrs[TIPC_NLA_MEDIA], - tipc_nl_media_policy); + tipc_nl_media_policy, info->extack); if (!attrs[TIPC_NLA_MEDIA_NAME]) return -EINVAL; diff --git a/net/tipc/link.c b/net/tipc/link.c index ddd2dd6f77aae1a5cd77c6bd86fac293db66e145..60820dc35a08ade9805080de4c15ff9819e150f5 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1827,7 +1827,7 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]) int err; err = nla_parse_nested(props, TIPC_NLA_PROP_MAX, prop, - tipc_nl_prop_policy); + tipc_nl_prop_policy, NULL); if (err) return err; diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 9be6592e4a6fa20c78995396ffa3dfcd1f19537a..bd0aac87b41ac627e5c897256a79150226926514 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -416,6 +416,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq, tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns); + tipc_subscrp_get(s); list_add(&s->nameseq_list, &nseq->subscriptions); if (!sseq) @@ -787,6 +788,7 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s) if (seq != NULL) { spin_lock_bh(&seq->lock); list_del_init(&s->nameseq_list); + tipc_subscrp_put(s); if (!seq->first_free && list_empty(&seq->subscriptions)) { hlist_del_init_rcu(&seq->ns_list); kfree(seq->sseqs); diff --git a/net/tipc/net.c b/net/tipc/net.c index ab8a2d5d1e3245d31f97375e5a27deda9a15ad58..719c5924b6383e6bf548baf57fdb713283012d87 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -211,8 +211,8 @@ int tipc_nl_net_set(struct sk_buff *skb, struct genl_info *info) return -EINVAL; err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX, - info->attrs[TIPC_NLA_NET], - tipc_nl_net_policy); + info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, + info->extack); if (err) return err; diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c index 26ca8dd64ded64407db8ef885f8dbb6edd30e4b8..b76f13f6fea10a53d00ed14a38cdf5cdf7afa44c 100644 --- a/net/tipc/netlink.c +++ b/net/tipc/netlink.c @@ -268,7 +268,8 @@ int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***attr) if (!*attr) return -EOPNOTSUPP; - return nlmsg_parse(nlh, GENL_HDRLEN, *attr, maxattr, tipc_nl_policy); + return nlmsg_parse(nlh, GENL_HDRLEN, *attr, maxattr, tipc_nl_policy, + NULL); } int __init tipc_netlink_start(void) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index e1ae8a8a2b8eacf93224cc332fff4a537d8d1ab0..9bfe886ab33080f653ec1c39e0255b15709d0b76 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -296,7 +296,7 @@ static int __tipc_nl_compat_doit(struct tipc_nl_compat_cmd_doit *cmd, err = nla_parse(attrbuf, tipc_genl_family.maxattr, (const struct 
nlattr *)trans_buf->data, - trans_buf->len, NULL); + trans_buf->len, NULL, NULL); if (err) goto parse_out; @@ -352,7 +352,7 @@ static int tipc_nl_compat_bearer_dump(struct tipc_nl_compat_msg *msg, return -EINVAL; err = nla_parse_nested(bearer, TIPC_NLA_BEARER_MAX, - attrs[TIPC_NLA_BEARER], NULL); + attrs[TIPC_NLA_BEARER], NULL, NULL); if (err) return err; @@ -472,7 +472,7 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg, return -EINVAL; err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], - NULL); + NULL, NULL); if (err) return err; @@ -480,7 +480,7 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg, return -EINVAL; err = nla_parse_nested(prop, TIPC_NLA_PROP_MAX, - link[TIPC_NLA_LINK_PROP], NULL); + link[TIPC_NLA_LINK_PROP], NULL, NULL); if (err) return err; @@ -488,7 +488,7 @@ static int tipc_nl_compat_link_stat_dump(struct tipc_nl_compat_msg *msg, return -EINVAL; err = nla_parse_nested(stats, TIPC_NLA_STATS_MAX, - link[TIPC_NLA_LINK_STATS], NULL); + link[TIPC_NLA_LINK_STATS], NULL, NULL); if (err) return err; @@ -598,7 +598,7 @@ static int tipc_nl_compat_link_dump(struct tipc_nl_compat_msg *msg, return -EINVAL; err = nla_parse_nested(link, TIPC_NLA_LINK_MAX, attrs[TIPC_NLA_LINK], - NULL); + NULL, NULL); if (err) return err; @@ -795,7 +795,7 @@ static int tipc_nl_compat_name_table_dump(struct tipc_nl_compat_msg *msg, return -EINVAL; err = nla_parse_nested(nt, TIPC_NLA_NAME_TABLE_MAX, - attrs[TIPC_NLA_NAME_TABLE], NULL); + attrs[TIPC_NLA_NAME_TABLE], NULL, NULL); if (err) return err; @@ -803,7 +803,7 @@ static int tipc_nl_compat_name_table_dump(struct tipc_nl_compat_msg *msg, return -EINVAL; err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, - nt[TIPC_NLA_NAME_TABLE_PUBL], NULL); + nt[TIPC_NLA_NAME_TABLE_PUBL], NULL, NULL); if (err) return err; @@ -863,7 +863,7 @@ static int __tipc_nl_compat_publ_dump(struct tipc_nl_compat_msg *msg, return -EINVAL; err = nla_parse_nested(publ, TIPC_NLA_PUBL_MAX, attrs[TIPC_NLA_PUBL], - NULL); + NULL, NULL); if (err) return err; @@ -929,7 +929,7 @@ static int tipc_nl_compat_sk_dump(struct tipc_nl_compat_msg *msg, return -EINVAL; err = nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK], - NULL); + NULL, NULL); if (err) return err; @@ -940,8 +940,8 @@ static int tipc_nl_compat_sk_dump(struct tipc_nl_compat_msg *msg, u32 node; struct nlattr *con[TIPC_NLA_CON_MAX + 1]; - nla_parse_nested(con, TIPC_NLA_CON_MAX, sock[TIPC_NLA_SOCK_CON], - NULL); + nla_parse_nested(con, TIPC_NLA_CON_MAX, + sock[TIPC_NLA_SOCK_CON], NULL, NULL); node = nla_get_u32(con[TIPC_NLA_CON_NODE]); tipc_tlv_sprintf(msg->rep, " connected to <%u.%u.%u:%u>", @@ -977,8 +977,8 @@ static int tipc_nl_compat_media_dump(struct tipc_nl_compat_msg *msg, if (!attrs[TIPC_NLA_MEDIA]) return -EINVAL; - err = nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, attrs[TIPC_NLA_MEDIA], - NULL); + err = nla_parse_nested(media, TIPC_NLA_MEDIA_MAX, + attrs[TIPC_NLA_MEDIA], NULL, NULL); if (err) return err; @@ -998,7 +998,7 @@ static int tipc_nl_compat_node_dump(struct tipc_nl_compat_msg *msg, return -EINVAL; err = nla_parse_nested(node, TIPC_NLA_NODE_MAX, attrs[TIPC_NLA_NODE], - NULL); + NULL, NULL); if (err) return err; @@ -1045,7 +1045,7 @@ static int tipc_nl_compat_net_dump(struct tipc_nl_compat_msg *msg, return -EINVAL; err = nla_parse_nested(net, TIPC_NLA_NET_MAX, attrs[TIPC_NLA_NET], - NULL); + NULL, NULL); if (err) return err; diff --git a/net/tipc/node.c b/net/tipc/node.c index 
4512e83652b16da259db9327fe0958c3642e70f1..aeef8011ac7d82d828289f4085efe3acaa8a3945 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -1607,8 +1607,8 @@ int tipc_nl_peer_rm(struct sk_buff *skb, struct genl_info *info) return -EINVAL; err = nla_parse_nested(attrs, TIPC_NLA_NET_MAX, - info->attrs[TIPC_NLA_NET], - tipc_nl_net_policy); + info->attrs[TIPC_NLA_NET], tipc_nl_net_policy, + info->extack); if (err) return err; @@ -1774,7 +1774,7 @@ int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], - tipc_nl_link_policy); + tipc_nl_link_policy, info->extack); if (err) return err; @@ -1902,7 +1902,7 @@ int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX, info->attrs[TIPC_NLA_LINK], - tipc_nl_link_policy); + tipc_nl_link_policy, info->extack); if (err) return err; @@ -2042,7 +2042,7 @@ int tipc_nl_node_set_monitor(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(attrs, TIPC_NLA_MON_MAX, info->attrs[TIPC_NLA_MON], - tipc_nl_monitor_policy); + tipc_nl_monitor_policy, info->extack); if (err) return err; @@ -2098,6 +2098,8 @@ int tipc_nl_node_get_monitor(struct sk_buff *skb, struct genl_info *info) int err; msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); + if (!msg.skb) + return -ENOMEM; msg.portid = info->snd_portid; msg.seq = info->snd_seq; @@ -2163,7 +2165,7 @@ int tipc_nl_node_dump_monitor_peer(struct sk_buff *skb, err = nla_parse_nested(mon, TIPC_NLA_MON_MAX, attrs[TIPC_NLA_MON], - tipc_nl_monitor_policy); + tipc_nl_monitor_policy, NULL); if (err) return err; diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7130e73bd42c21758e88b24b875da4bd97b3c4d2..0d4f2f455a7c91a1d7ea2b0fd9c8d121e6da98af 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -51,6 +51,7 @@ #define TIPC_FWD_MSG 1 #define TIPC_MAX_PORT 0xffffffff #define TIPC_MIN_PORT 1 +#define TIPC_ACK_RATE 4 /* ACK at 1/4 of rcv window size */ enum { TIPC_LISTEN = TCP_LISTEN, @@ -866,6 +867,14 @@ static void tipc_sk_proto_rcv(struct tipc_sock *tsk, struct sk_buff *skb, if (!tsk_peer_msg(tsk, hdr)) goto exit; + if (unlikely(msg_errcode(hdr))) { + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + tipc_node_remove_conn(sock_net(sk), tsk_peer_node(tsk), + tsk_peer_port(tsk)); + sk->sk_state_change(sk); + goto exit; + } + tsk->probe_unacked = false; if (mtyp == CONN_PROBE) { @@ -1083,7 +1092,7 @@ static int __tipc_sendstream(struct socket *sock, struct msghdr *m, size_t dlen) } } while (sent < dlen && !rc); - return rc ? rc : sent; + return sent ? sent : rc; } /** @@ -1259,7 +1268,10 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) struct sock *sk = sock->sk; DEFINE_WAIT(wait); long timeo = *timeop; - int err; + int err = sock_error(sk); + + if (err) + return err; for (;;) { prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); @@ -1281,6 +1293,10 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) err = sock_intr_errno(timeo); if (signal_pending(current)) break; + + err = sock_error(sk); + if (err) + break; } finish_wait(sk_sleep(sk), &wait); *timeop = timeo; @@ -1290,7 +1306,7 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) /** * tipc_recvmsg - receive packet-oriented message * @m: descriptor for message info - * @buf_len: total size of user buffer area + * @buflen: length of user buffer area * @flags: receive flags * * Used for SOCK_DGRAM, SOCK_RDM, and SOCK_SEQPACKET messages.
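The hunk below rewrites tipc_recvmsg() around a single do/while loop: peek at the head of sk_receive_queue, silently discard empty non-errored messages, copy at most buflen bytes (setting MSG_TRUNC when the buffer is short), and only unlink the message once the copy has succeeded. As a minimal illustration of that peek/skip/copy/advance pattern, here is a self-contained userspace sketch; struct msg, recv_datagram() and SK_MSG_TRUNC are invented stand-ins for the kernel's sk_buff queue and flags, not real APIs:

#include <stddef.h>
#include <string.h>

struct msg {
	size_t dlen;		/* payload length; 0 mimics an empty message */
	int err;		/* nonzero mimics msg_errcode() */
	const char *data;
	struct msg *next;
};

#define SK_MSG_TRUNC 0x1

/* Peek at the queue head, skip empty non-errored messages, copy at most
 * buflen bytes, and unlink the message only after a successful copy. */
static long recv_datagram(struct msg **q, char *buf, size_t buflen, int *flags)
{
	struct msg *m;
	size_t copy;

	do {
		m = *q;			/* peek: do not unlink yet */
		if (!m)
			return -1;	/* empty queue; the kernel sleeps here */
		if (m->dlen || m->err)
			break;
		*q = m->next;		/* discard empty non-errored message */
	} while (1);

	if (m->err) {
		*q = m->next;
		return -2;		/* stands in for -ECONNRESET */
	}

	copy = m->dlen < buflen ? m->dlen : buflen;
	if (copy != m->dlen)
		*flags |= SK_MSG_TRUNC;	/* buffer shorter than the payload */
	memcpy(buf, m->data, copy);
	*q = m->next;			/* advance only after the copy */
	return (long)copy;
}

int main(void)
{
	struct msg m2 = { 5, 0, "hello", NULL };
	struct msg m1 = { 0, 0, NULL, &m2 };	/* empty, gets skipped */
	struct msg *q = &m1;
	char buf[4];
	int flags = 0;

	return recv_datagram(&q, buf, sizeof(buf), &flags) == 4 &&
	       (flags & SK_MSG_TRUNC) ? 0 : 1;
}

The flow-control half is deliberately omitted above: in the kernel loop, tsk->rcv_unacked accumulates per delivered message and an acknowledgment goes out once it reaches tsk->rcv_win / TIPC_ACK_RATE, i.e. a quarter of the receive window, per the TIPC_ACK_RATE definition above.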
@@ -1298,95 +1314,85 @@ static int tipc_wait_for_rcvmsg(struct socket *sock, long *timeop) * * Returns size of returned message data, errno otherwise */ -static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len, - int flags) +static int tipc_recvmsg(struct socket *sock, struct msghdr *m, + size_t buflen, int flags) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct sk_buff *buf; - struct tipc_msg *msg; - bool is_connectionless = tipc_sk_type_connectionless(sk); - long timeo; - unsigned int sz; - u32 err; - int res, hlen; + struct sk_buff *skb; + struct tipc_msg *hdr; + bool connected = !tipc_sk_type_connectionless(sk); + int rc, err, hlen, dlen, copy; + long timeout; /* Catch invalid receive requests */ - if (unlikely(!buf_len)) + if (unlikely(!buflen)) return -EINVAL; lock_sock(sk); - - if (!is_connectionless && unlikely(sk->sk_state == TIPC_OPEN)) { - res = -ENOTCONN; + if (unlikely(connected && sk->sk_state == TIPC_OPEN)) { + rc = -ENOTCONN; goto exit; } + timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); -restart: - - /* Look for a message in receive queue; wait if necessary */ - res = tipc_wait_for_rcvmsg(sock, &timeo); - if (res) - goto exit; - - /* Look at first message in receive queue */ - buf = skb_peek(&sk->sk_receive_queue); - msg = buf_msg(buf); - sz = msg_data_sz(msg); - hlen = msg_hdr_sz(msg); - err = msg_errcode(msg); - - /* Discard an empty non-errored message & try again */ - if ((!sz) && (!err)) { + do { + /* Look at first msg in receive queue; wait if necessary */ + rc = tipc_wait_for_rcvmsg(sock, &timeout); + if (unlikely(rc)) + goto exit; + skb = skb_peek(&sk->sk_receive_queue); + hdr = buf_msg(skb); + dlen = msg_data_sz(hdr); + hlen = msg_hdr_sz(hdr); + err = msg_errcode(hdr); + if (likely(dlen || err)) + break; tsk_advance_rx_queue(sk); - goto restart; - } - - /* Capture sender's address (optional) */ - set_orig_addr(m, msg); + } while (1); - /* Capture ancillary data (optional) */ - res = tipc_sk_anc_data_recv(m, msg, tsk); - if (res) + /* Collect msg meta data, including error code and rejected data */ + set_orig_addr(m, hdr); + rc = tipc_sk_anc_data_recv(m, hdr, tsk); + if (unlikely(rc)) goto exit; - /* Capture message data (if valid) & compute return value (always) */ - if (!err) { - if (unlikely(buf_len < sz)) { - sz = buf_len; + /* Capture data if non-error msg, otherwise just set return value */ + if (likely(!err)) { + copy = min_t(int, dlen, buflen); + if (unlikely(copy != dlen)) m->msg_flags |= MSG_TRUNC; - } - res = skb_copy_datagram_msg(buf, hlen, m, sz); - if (res) - goto exit; - res = sz; + rc = skb_copy_datagram_msg(skb, hlen, m, copy); } else { - if (is_connectionless || err == TIPC_CONN_SHUTDOWN || - m->msg_control) - res = 0; - else - res = -ECONNRESET; + copy = 0; + rc = 0; + if (err != TIPC_CONN_SHUTDOWN && connected && !m->msg_control) + rc = -ECONNRESET; } + if (unlikely(rc)) + goto exit; + /* Capture of data or error code/rejected data was successful */ if (unlikely(flags & MSG_PEEK)) goto exit; - if (likely(!is_connectionless)) { - tsk->rcv_unacked += tsk_inc(tsk, hlen + sz); - if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4))) - tipc_sk_send_ack(tsk); - } tsk_advance_rx_queue(sk); + if (likely(!connected)) + goto exit; + + /* Send connection flow control ack when applicable */ + tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); + if (tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE) + tipc_sk_send_ack(tsk); exit: release_sock(sk); - return res;
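+	/* Datagram semantics: a nonzero rc (from the wait, the ancillary
+	 * data collection or the copy) takes precedence over the copied
+	 * byte count here, whereas tipc_recvstream() below returns the
+	 * byte count whenever any data has been delivered. */
+ 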
return rc ? rc : copy; } /** - * tipc_recv_stream - receive stream-oriented data + * tipc_recvstream - receive stream-oriented data * @m: descriptor for message info - * @buf_len: total size of user buffer area + * @buflen: total size of user buffer area * @flags: receive flags * * Used for SOCK_STREAM messages only. If not enough data is available @@ -1394,111 +1400,98 @@ static int tipc_recvmsg(struct socket *sock, struct msghdr *m, size_t buf_len, * * Returns size of returned message data, errno otherwise */ -static int tipc_recv_stream(struct socket *sock, struct msghdr *m, - size_t buf_len, int flags) +static int tipc_recvstream(struct socket *sock, struct msghdr *m, + size_t buflen, int flags) { struct sock *sk = sock->sk; struct tipc_sock *tsk = tipc_sk(sk); - struct sk_buff *buf; - struct tipc_msg *msg; - long timeo; - unsigned int sz; - int target; - int sz_copied = 0; - u32 err; - int res = 0, hlen; + struct sk_buff *skb; + struct tipc_msg *hdr; + struct tipc_skb_cb *skb_cb; + bool peek = flags & MSG_PEEK; + int offset, required, copy, copied = 0; + int hlen, dlen, err, rc; + long timeout; /* Catch invalid receive attempts */ - if (unlikely(!buf_len)) + if (unlikely(!buflen)) return -EINVAL; lock_sock(sk); if (unlikely(sk->sk_state == TIPC_OPEN)) { - res = -ENOTCONN; - goto exit; - } - - target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len); - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - -restart: - /* Look for a message in receive queue; wait if necessary */ - res = tipc_wait_for_rcvmsg(sock, &timeo); - if (res) + rc = -ENOTCONN; goto exit; - - /* Look at first message in receive queue */ - buf = skb_peek(&sk->sk_receive_queue); - msg = buf_msg(buf); - sz = msg_data_sz(msg); - hlen = msg_hdr_sz(msg); - err = msg_errcode(msg); - - /* Discard an empty non-errored message & try again */ - if ((!sz) && (!err)) { - tsk_advance_rx_queue(sk); - goto restart; } + required = sock_rcvlowat(sk, flags & MSG_WAITALL, buflen); + timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - /* Optionally capture sender's address & ancillary data of first msg */ - if (sz_copied == 0) { - set_orig_addr(m, msg); - res = tipc_sk_anc_data_recv(m, msg, tsk); - if (res) - goto exit; - } - - /* Capture message data (if valid) & compute return value (always) */ - if (!err) { - u32 offset = TIPC_SKB_CB(buf)->bytes_read; - u32 needed; - int sz_to_copy; - - sz -= offset; - needed = (buf_len - sz_copied); - sz_to_copy = min(sz, needed); - - res = skb_copy_datagram_msg(buf, hlen + offset, m, sz_to_copy); - if (res) - goto exit; + do { + /* Look at first msg in receive queue; wait if necessary */ + rc = tipc_wait_for_rcvmsg(sock, &timeout); + if (unlikely(rc)) + break; + skb = skb_peek(&sk->sk_receive_queue); + skb_cb = TIPC_SKB_CB(skb); + hdr = buf_msg(skb); + dlen = msg_data_sz(hdr); + hlen = msg_hdr_sz(hdr); + err = msg_errcode(hdr); + + /* Discard any empty non-errored (SYN-) message */ + if (unlikely(!dlen && !err)) { + tsk_advance_rx_queue(sk); + continue; + } - sz_copied += sz_to_copy; + /* Collect msg meta data, incl. 
error code and rejected data */ + if (!copied) { + set_orig_addr(m, hdr); + rc = tipc_sk_anc_data_recv(m, hdr, tsk); + if (rc) + break; + } - if (sz_to_copy < sz) { - if (!(flags & MSG_PEEK)) - TIPC_SKB_CB(buf)->bytes_read = - offset + sz_to_copy; - goto exit; + /* Copy data if msg ok, otherwise return error/partial data */ + if (likely(!err)) { + offset = skb_cb->bytes_read; + copy = min_t(int, dlen - offset, buflen - copied); + rc = skb_copy_datagram_msg(skb, hlen + offset, m, copy); + if (unlikely(rc)) + break; + copied += copy; + offset += copy; + if (unlikely(offset < dlen)) { + if (!peek) + skb_cb->bytes_read = offset; + break; + } + } else { + rc = 0; + if ((err != TIPC_CONN_SHUTDOWN) && !m->msg_control) + rc = -ECONNRESET; + if (copied || rc) + break; } - } else { - if (sz_copied != 0) - goto exit; /* can't add error msg to valid data */ - if ((err == TIPC_CONN_SHUTDOWN) || m->msg_control) - res = 0; - else - res = -ECONNRESET; - } + if (unlikely(peek)) + break; - if (unlikely(flags & MSG_PEEK)) - goto exit; + tsk_advance_rx_queue(sk); - tsk->rcv_unacked += tsk_inc(tsk, hlen + sz); - if (unlikely(tsk->rcv_unacked >= (tsk->rcv_win / 4))) - tipc_sk_send_ack(tsk); - tsk_advance_rx_queue(sk); + /* Send connection flow control advertisement when applicable */ + tsk->rcv_unacked += tsk_inc(tsk, hlen + dlen); + if (unlikely(tsk->rcv_unacked >= tsk->rcv_win / TIPC_ACK_RATE)) + tipc_sk_send_ack(tsk); - /* Loop around if more data is required */ - if ((sz_copied < buf_len) && /* didn't get all requested data */ - (!skb_queue_empty(&sk->sk_receive_queue) || - (sz_copied < target)) && /* and more is ready or required */ - (!err)) /* and haven't reached a FIN */ - goto restart; + /* Exit if all requested data or FIN/error received */ + if (copied == buflen || err) + break; + } while (!skb_queue_empty(&sk->sk_receive_queue) || copied < required); exit: release_sock(sk); - return sz_copied ? sz_copied : res; + return copied ? 
copied : rc; } /** @@ -1551,6 +1544,8 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) struct sock *sk = &tsk->sk; struct net *net = sock_net(sk); struct tipc_msg *hdr = buf_msg(skb); + u32 pport = msg_origport(hdr); + u32 pnode = msg_orignode(hdr); if (unlikely(msg_mcast(hdr))) return false; @@ -1558,18 +1553,28 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) switch (sk->sk_state) { case TIPC_CONNECTING: /* Accept only ACK or NACK message */ - if (unlikely(!msg_connected(hdr))) - return false; + if (unlikely(!msg_connected(hdr))) { + if (pport != tsk_peer_port(tsk) || + pnode != tsk_peer_node(tsk)) + return false; + + tipc_set_sk_state(sk, TIPC_DISCONNECTING); + sk->sk_err = ECONNREFUSED; + sk->sk_state_change(sk); + return true; + } if (unlikely(msg_errcode(hdr))) { tipc_set_sk_state(sk, TIPC_DISCONNECTING); sk->sk_err = ECONNREFUSED; + sk->sk_state_change(sk); return true; } if (unlikely(!msg_isdata(hdr))) { tipc_set_sk_state(sk, TIPC_DISCONNECTING); sk->sk_err = EINVAL; + sk->sk_state_change(sk); return true; } @@ -1581,8 +1586,7 @@ static bool filter_connect(struct tipc_sock *tsk, struct sk_buff *skb) return true; /* If empty 'ACK-' message, wake up sleeping connect() */ - if (waitqueue_active(sk_sleep(sk))) - wake_up_interruptible(sk_sleep(sk)); + sk->sk_data_ready(sk); /* 'ACK-' message is neither accepted nor rejected: */ msg_set_dest_droppable(hdr, 1); @@ -2511,6 +2515,28 @@ static int tipc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } } +static int tipc_socketpair(struct socket *sock1, struct socket *sock2) +{ + struct tipc_sock *tsk2 = tipc_sk(sock2->sk); + struct tipc_sock *tsk1 = tipc_sk(sock1->sk); + u32 onode = tipc_own_addr(sock_net(sock1->sk)); + + tsk1->peer.family = AF_TIPC; + tsk1->peer.addrtype = TIPC_ADDR_ID; + tsk1->peer.scope = TIPC_NODE_SCOPE; + tsk1->peer.addr.id.ref = tsk2->portid; + tsk1->peer.addr.id.node = onode; + tsk2->peer.family = AF_TIPC; + tsk2->peer.addrtype = TIPC_ADDR_ID; + tsk2->peer.scope = TIPC_NODE_SCOPE; + tsk2->peer.addr.id.ref = tsk1->portid; + tsk2->peer.addr.id.node = onode; + + tipc_sk_finish_conn(tsk1, tsk2->portid, onode); + tipc_sk_finish_conn(tsk2, tsk1->portid, onode); + return 0; +} + /* Protocol switches for the various types of TIPC sockets */ static const struct proto_ops msg_ops = { @@ -2519,7 +2545,7 @@ static const struct proto_ops msg_ops = { .release = tipc_release, .bind = tipc_bind, .connect = tipc_connect, - .socketpair = sock_no_socketpair, + .socketpair = tipc_socketpair, .accept = sock_no_accept, .getname = tipc_getname, .poll = tipc_poll, @@ -2540,7 +2566,7 @@ static const struct proto_ops packet_ops = { .release = tipc_release, .bind = tipc_bind, .connect = tipc_connect, - .socketpair = sock_no_socketpair, + .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, .poll = tipc_poll, @@ -2561,7 +2587,7 @@ static const struct proto_ops stream_ops = { .release = tipc_release, .bind = tipc_bind, .connect = tipc_connect, - .socketpair = sock_no_socketpair, + .socketpair = tipc_socketpair, .accept = tipc_accept, .getname = tipc_getname, .poll = tipc_poll, @@ -2571,7 +2597,7 @@ static const struct proto_ops stream_ops = { .setsockopt = tipc_setsockopt, .getsockopt = tipc_getsockopt, .sendmsg = tipc_sendstream, - .recvmsg = tipc_recv_stream, + .recvmsg = tipc_recvstream, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage }; @@ -2844,7 +2870,7 @@ int tipc_nl_publ_dump(struct sk_buff *skb, struct netlink_callback *cb) err = 
nla_parse_nested(sock, TIPC_NLA_SOCK_MAX, attrs[TIPC_NLA_SOCK], - tipc_nl_sock_policy); + tipc_nl_sock_policy, NULL); if (err) return err; diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 271cd66e4b3b66534d8686bec94132bc9737314e..0bf91cd3733cb37ecc8ba4ccf7ae5a26cb6e966d 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -54,8 +54,6 @@ struct tipc_subscriber { static void tipc_subscrp_delete(struct tipc_subscription *sub); static void tipc_subscrb_put(struct tipc_subscriber *subscriber); -static void tipc_subscrp_put(struct tipc_subscription *subscription); -static void tipc_subscrp_get(struct tipc_subscription *subscription); /** * htohl - convert value to endianness used by destination @@ -125,7 +123,6 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, { struct tipc_name_seq seq; - tipc_subscrp_get(sub); tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq); if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper)) return; @@ -135,7 +132,6 @@ void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower, tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref, node); - tipc_subscrp_put(sub); } static void tipc_subscrp_timeout(unsigned long data) @@ -145,6 +141,7 @@ static void tipc_subscrp_timeout(unsigned long data) spin_lock_bh(&subscriber->lock); tipc_nametbl_unsubscribe(sub); + list_del(&sub->subscrp_list); spin_unlock_bh(&subscriber->lock); /* Notify subscriber of timeout */ @@ -177,20 +174,17 @@ static void tipc_subscrp_kref_release(struct kref *kref) struct tipc_net *tn = net_generic(sub->net, tipc_net_id); struct tipc_subscriber *subscriber = sub->subscriber; - spin_lock_bh(&subscriber->lock); - list_del(&sub->subscrp_list); atomic_dec(&tn->subscription_count); - spin_unlock_bh(&subscriber->lock); kfree(sub); tipc_subscrb_put(subscriber); } -static void tipc_subscrp_put(struct tipc_subscription *subscription) +void tipc_subscrp_put(struct tipc_subscription *subscription) { kref_put(&subscription->kref, tipc_subscrp_kref_release); } -static void tipc_subscrp_get(struct tipc_subscription *subscription) +void tipc_subscrp_get(struct tipc_subscription *subscription) { kref_get(&subscription->kref); } @@ -210,11 +204,8 @@ static void tipc_subscrb_subscrp_delete(struct tipc_subscriber *subscriber, continue; tipc_nametbl_unsubscribe(sub); - tipc_subscrp_get(sub); - spin_unlock_bh(&subscriber->lock); + list_del(&sub->subscrp_list); tipc_subscrp_delete(sub); - tipc_subscrp_put(sub); - spin_lock_bh(&subscriber->lock); if (s) break; diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index ffdc214c117a924f34b416fde415fcd18201ebc0..ee52957dc9524a76ac371aa19d950dc90bfe4035 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -78,4 +78,7 @@ u32 tipc_subscrp_convert_seq_type(u32 type, int swap); int tipc_topsrv_start(struct net *net); void tipc_topsrv_stop(struct net *net); +void tipc_subscrp_put(struct tipc_subscription *subscription); +void tipc_subscrp_get(struct tipc_subscription *subscription); + #endif diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c index 46061cf48cd13506a12cb9b8fb53f64d9a56aa06..ecca64fc6a6f223bf8c0e09e3a299fe4fd62509d 100644 --- a/net/tipc/udp_media.c +++ b/net/tipc/udp_media.c @@ -457,7 +457,7 @@ int tipc_udp_nl_dump_remoteip(struct sk_buff *skb, struct netlink_callback *cb) err = nla_parse_nested(battrs, TIPC_NLA_BEARER_MAX, attrs[TIPC_NLA_BEARER], - tipc_nl_bearer_policy); + tipc_nl_bearer_policy, NULL); if (err) return err; @@ -609,7 +609,8 @@ int 
tipc_udp_nl_bearer_add(struct tipc_bearer *b, struct nlattr *attr) struct nlattr *opts[TIPC_NLA_UDP_MAX + 1]; struct udp_media_addr *dst; - if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, attr, tipc_nl_udp_policy)) + if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, attr, + tipc_nl_udp_policy, NULL)) return -EINVAL; if (!opts[TIPC_NLA_UDP_REMOTE]) @@ -662,7 +663,7 @@ static int tipc_udp_enable(struct net *net, struct tipc_bearer *b, if (nla_parse_nested(opts, TIPC_NLA_UDP_MAX, attrs[TIPC_NLA_BEARER_UDP_OPTS], - tipc_nl_udp_policy)) + tipc_nl_udp_policy, NULL)) goto err; if (!opts[TIPC_NLA_UDP_LOCAL] || !opts[TIPC_NLA_UDP_REMOTE]) { diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 928691c434087e8ff72b84fc6515539115c0509b..6a7fe7660551f45c065a7f472b805c0b8073f6bb 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -996,7 +996,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) unsigned int hash; struct unix_address *addr; struct hlist_head *list; - struct path path = { NULL, NULL }; + struct path path = { }; err = -EINVAL; if (sunaddr->sun_family != AF_UNIX) diff --git a/net/vmw_vsock/Makefile b/net/vmw_vsock/Makefile index bc27c70e0e59f76a487af92455ac6542a035f762..09fc2eb29dc88898f631bfdf3e747bf16fbe4462 100644 --- a/net/vmw_vsock/Makefile +++ b/net/vmw_vsock/Makefile @@ -3,7 +3,7 @@ obj-$(CONFIG_VMWARE_VMCI_VSOCKETS) += vmw_vsock_vmci_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS) += vmw_vsock_virtio_transport.o obj-$(CONFIG_VIRTIO_VSOCKETS_COMMON) += vmw_vsock_virtio_transport_common.o -vsock-y += af_vsock.o vsock_addr.o +vsock-y += af_vsock.o af_vsock_tap.o vsock_addr.o vmw_vsock_vmci_transport-y += vmci_transport.o vmci_transport_notify.o \ vmci_transport_notify_qstate.o diff --git a/net/vmw_vsock/af_vsock_tap.c b/net/vmw_vsock/af_vsock_tap.c new file mode 100644 index 0000000000000000000000000000000000000000..98f09b5393668e1e8ac6bfd5b03e4cf5ffd903b7 --- /dev/null +++ b/net/vmw_vsock/af_vsock_tap.c @@ -0,0 +1,114 @@ +/* + * Tap functions for AF_VSOCK sockets. + * + * Code based on net/netlink/af_netlink.c tap functions. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +static DEFINE_SPINLOCK(vsock_tap_lock); +static struct list_head vsock_tap_all __read_mostly = + LIST_HEAD_INIT(vsock_tap_all); + +int vsock_add_tap(struct vsock_tap *vt) +{ + if (unlikely(vt->dev->type != ARPHRD_VSOCKMON)) + return -EINVAL; + + __module_get(vt->module); + + spin_lock(&vsock_tap_lock); + list_add_rcu(&vt->list, &vsock_tap_all); + spin_unlock(&vsock_tap_lock); + + return 0; +} +EXPORT_SYMBOL_GPL(vsock_add_tap); + +int vsock_remove_tap(struct vsock_tap *vt) +{ + struct vsock_tap *tmp; + bool found = false; + + spin_lock(&vsock_tap_lock); + + list_for_each_entry(tmp, &vsock_tap_all, list) { + if (vt == tmp) { + list_del_rcu(&vt->list); + found = true; + goto out; + } + } + + pr_warn("vsock_remove_tap: %p not found\n", vt); +out: + spin_unlock(&vsock_tap_lock); + + synchronize_net(); + + if (found) + module_put(vt->module); + + return found ? 
0 : -ENODEV; +} +EXPORT_SYMBOL_GPL(vsock_remove_tap); + +static int __vsock_deliver_tap_skb(struct sk_buff *skb, + struct net_device *dev) +{ + int ret = 0; + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + if (nskb) { + dev_hold(dev); + + nskb->dev = dev; + ret = dev_queue_xmit(nskb); + if (unlikely(ret > 0)) + ret = net_xmit_errno(ret); + + dev_put(dev); + } + + return ret; +} + +static void __vsock_deliver_tap(struct sk_buff *skb) +{ + int ret; + struct vsock_tap *tmp; + + list_for_each_entry_rcu(tmp, &vsock_tap_all, list) { + ret = __vsock_deliver_tap_skb(skb, tmp->dev); + if (unlikely(ret)) + break; + } +} + +void vsock_deliver_tap(struct sk_buff *build_skb(void *opaque), void *opaque) +{ + struct sk_buff *skb; + + rcu_read_lock(); + + if (likely(list_empty(&vsock_tap_all))) + goto out; + + skb = build_skb(opaque); + if (skb) { + __vsock_deliver_tap(skb); + consume_skb(skb); + } + +out: + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(vsock_deliver_tap); diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c index 68675a151f22b8b63c02b25a67b833d9a6046d84..403d86e80162e7796fd75249b1ae876d1eee1e6a 100644 --- a/net/vmw_vsock/virtio_transport.c +++ b/net/vmw_vsock/virtio_transport.c @@ -144,6 +144,8 @@ virtio_transport_send_pkt_work(struct work_struct *work) list_del_init(&pkt->list); spin_unlock_bh(&vsock->send_pkt_list_lock); + virtio_transport_deliver_tap_pkt(pkt); + reply = pkt->reply; sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); @@ -370,6 +372,7 @@ static void virtio_transport_rx_work(struct work_struct *work) } pkt->len = len - sizeof(pkt->hdr); + virtio_transport_deliver_tap_pkt(pkt); virtio_transport_recv_pkt(pkt); } } while (!virtqueue_enable_cb(vq)); @@ -573,9 +576,9 @@ static int virtio_vsock_probe(struct virtio_device *vdev) vsock->vdev = vdev; - ret = vsock->vdev->config->find_vqs(vsock->vdev, VSOCK_VQ_MAX, - vsock->vqs, callbacks, names, - NULL); + ret = virtio_find_vqs(vsock->vdev, VSOCK_VQ_MAX, + vsock->vqs, callbacks, names, + NULL); if (ret < 0) goto out; diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index af087b44ceea2311e53060e2442b4af2024bb037..18e24793659f928221297208ff4c82c8e385d14b 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -85,6 +86,69 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, return NULL; } +/* Packet capture */ +static struct sk_buff *virtio_transport_build_skb(void *opaque) +{ + struct virtio_vsock_pkt *pkt = opaque; + unsigned char *t_hdr, *payload; + struct af_vsockmon_hdr *hdr; + struct sk_buff *skb; + + skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + pkt->len, + GFP_ATOMIC); + if (!skb) + return NULL; + + hdr = (struct af_vsockmon_hdr *)skb_put(skb, sizeof(*hdr)); + + /* pkt->hdr is little-endian so no need to byteswap here */ + hdr->src_cid = pkt->hdr.src_cid; + hdr->src_port = pkt->hdr.src_port; + hdr->dst_cid = pkt->hdr.dst_cid; + hdr->dst_port = pkt->hdr.dst_port; + + hdr->transport = cpu_to_le16(AF_VSOCK_TRANSPORT_VIRTIO); + hdr->len = cpu_to_le16(sizeof(pkt->hdr)); + memset(hdr->reserved, 0, sizeof(hdr->reserved)); + + switch (le16_to_cpu(pkt->hdr.op)) { + case VIRTIO_VSOCK_OP_REQUEST: + case VIRTIO_VSOCK_OP_RESPONSE: + hdr->op = cpu_to_le16(AF_VSOCK_OP_CONNECT); + break; + case VIRTIO_VSOCK_OP_RST: + case VIRTIO_VSOCK_OP_SHUTDOWN: + hdr->op = cpu_to_le16(AF_VSOCK_OP_DISCONNECT); + break; + case VIRTIO_VSOCK_OP_RW: + 
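+		/* RW is the only opcode that carries user payload; the
+		 * credit opcodes are flow-control signalling and map to
+		 * CONTROL, anything unrecognised to UNKNOWN. */
+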
hdr->op = cpu_to_le16(AF_VSOCK_OP_PAYLOAD); + break; + case VIRTIO_VSOCK_OP_CREDIT_UPDATE: + case VIRTIO_VSOCK_OP_CREDIT_REQUEST: + hdr->op = cpu_to_le16(AF_VSOCK_OP_CONTROL); + break; + default: + hdr->op = cpu_to_le16(AF_VSOCK_OP_UNKNOWN); + break; + } + + t_hdr = skb_put(skb, sizeof(pkt->hdr)); + memcpy(t_hdr, &pkt->hdr, sizeof(pkt->hdr)); + + if (pkt->len) { + payload = skb_put(skb, pkt->len); + memcpy(payload, pkt->buf, pkt->len); + } + + return skb; +} + +void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt) +{ + vsock_deliver_tap(virtio_transport_build_skb, pkt); +} +EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); + static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, struct virtio_vsock_pkt_info *info) { diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 4be4fbbc0b5035662b1cd756bd4e99dd3351309e..10ae7823a19def7bde20d669e3913a40178e7da2 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -96,31 +96,23 @@ static int PROTOCOL_OVERRIDE = -1; static s32 vmci_transport_error_to_vsock_error(s32 vmci_error) { - int err; - switch (vmci_error) { case VMCI_ERROR_NO_MEM: - err = ENOMEM; - break; + return -ENOMEM; case VMCI_ERROR_DUPLICATE_ENTRY: case VMCI_ERROR_ALREADY_EXISTS: - err = EADDRINUSE; - break; + return -EADDRINUSE; case VMCI_ERROR_NO_ACCESS: - err = EPERM; - break; + return -EPERM; case VMCI_ERROR_NO_RESOURCES: - err = ENOBUFS; - break; + return -ENOBUFS; case VMCI_ERROR_INVALID_RESOURCE: - err = EHOSTUNREACH; - break; + return -EHOSTUNREACH; case VMCI_ERROR_INVALID_ARGS: default: - err = EINVAL; + break; } - - return err > 0 ? -err : err; + return -EINVAL; } static u32 vmci_transport_peer_rid(u32 peer_cid) diff --git a/net/wireless/ap.c b/net/wireless/ap.c index bdad1f951561b3d8b7c75d8992e1c3c1a623f146..25666d3009be8be07410f7b0b53c81f62566111a 100644 --- a/net/wireless/ap.c +++ b/net/wireless/ap.c @@ -32,6 +32,11 @@ int __cfg80211_stop_ap(struct cfg80211_registered_device *rdev, rdev_set_qos_map(rdev, dev, NULL); if (notify) nl80211_send_ap_stopped(wdev); + + /* Should we apply the grace period during beaconing interface + * shutdown also? + */ + cfg80211_sched_dfs_chan_update(rdev); } return err; diff --git a/net/wireless/chan.c b/net/wireless/chan.c index 5497d022fadabf17512c0f4c0b77ce50cce345dc..b8aa5a7d5c77ae8453062de794fee7329e429f75 100644 --- a/net/wireless/chan.c +++ b/net/wireless/chan.c @@ -456,6 +456,123 @@ bool cfg80211_chandef_dfs_usable(struct wiphy *wiphy, return (r1 + r2 > 0); } +/* + * Checks if center frequency of chan falls within the bandwidth + * range of chandef.
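+ *
+ * For example, with an 80 MHz chandef centered on 5250 MHz, the 20 MHz
+ * sub-channel centers tested by the loops below are 5220, 5240, 5260
+ * and 5280 MHz.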
+ */ +bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, + struct ieee80211_channel *chan) +{ + int width; + u32 cf_offset, freq; + + if (chandef->chan->center_freq == chan->center_freq) + return true; + + width = cfg80211_chandef_get_width(chandef); + if (width <= 20) + return false; + + cf_offset = width / 2 - 10; + + for (freq = chandef->center_freq1 - width / 2 + 10; + freq <= chandef->center_freq1 + width / 2 - 10; freq += 20) { + if (chan->center_freq == freq) + return true; + } + + if (!chandef->center_freq2) + return false; + + for (freq = chandef->center_freq2 - width / 2 + 10; + freq <= chandef->center_freq2 + width / 2 - 10; freq += 20) { + if (chan->center_freq == freq) + return true; + } + + return false; +} + +bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev) +{ + bool active = false; + + ASSERT_WDEV_LOCK(wdev); + + if (!wdev->chandef.chan) + return false; + + switch (wdev->iftype) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + active = wdev->beacon_interval != 0; + break; + case NL80211_IFTYPE_ADHOC: + active = wdev->ssid_len != 0; + break; + case NL80211_IFTYPE_MESH_POINT: + active = wdev->mesh_id_len != 0; + break; + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_OCB: + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_MONITOR: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_WDS: + case NL80211_IFTYPE_P2P_DEVICE: + /* Can NAN type be considered as beaconing interface? */ + case NL80211_IFTYPE_NAN: + break; + case NL80211_IFTYPE_UNSPECIFIED: + case NUM_NL80211_IFTYPES: + WARN_ON(1); + } + + return active; +} + +static bool cfg80211_is_wiphy_oper_chan(struct wiphy *wiphy, + struct ieee80211_channel *chan) +{ + struct wireless_dev *wdev; + + list_for_each_entry(wdev, &wiphy->wdev_list, list) { + wdev_lock(wdev); + if (!cfg80211_beaconing_iface_active(wdev)) { + wdev_unlock(wdev); + continue; + } + + if (cfg80211_is_sub_chan(&wdev->chandef, chan)) { + wdev_unlock(wdev); + return true; + } + wdev_unlock(wdev); + } + + return false; +} + +bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy, + struct ieee80211_channel *chan) +{ + struct cfg80211_registered_device *rdev; + + ASSERT_RTNL(); + + if (!(chan->flags & IEEE80211_CHAN_RADAR)) + return false; + + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { + if (!reg_dfs_domain_same(wiphy, &rdev->wiphy)) + continue; + + if (cfg80211_is_wiphy_oper_chan(&rdev->wiphy, chan)) + return true; + } + + return false; +} static bool cfg80211_get_chans_dfs_available(struct wiphy *wiphy, u32 center_freq, diff --git a/net/wireless/core.c b/net/wireless/core.c index e55e05bc48053f5542696a4c2d53170dfd02577c..83ea164f16b3e018546260d8e809ae6b244d85e3 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -305,30 +305,14 @@ static void cfg80211_event_work(struct work_struct *work) void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev) { - struct cfg80211_iface_destroy *item; + struct wireless_dev *wdev, *tmp; ASSERT_RTNL(); - spin_lock_irq(&rdev->destroy_list_lock); - while ((item = list_first_entry_or_null(&rdev->destroy_list, - struct cfg80211_iface_destroy, - list))) { - struct wireless_dev *wdev, *tmp; - u32 nlportid = item->nlportid; - - list_del(&item->list); - kfree(item); - spin_unlock_irq(&rdev->destroy_list_lock); - - list_for_each_entry_safe(wdev, tmp, - &rdev->wiphy.wdev_list, list) { - if (nlportid == wdev->owner_nlportid) - rdev_del_virtual_intf(rdev, wdev); - } - - spin_lock_irq(&rdev->destroy_list_lock); + list_for_each_entry_safe(wdev, tmp, 
&rdev->wiphy.wdev_list, list) { + if (wdev->nl_owner_dead) + rdev_del_virtual_intf(rdev, wdev); } - spin_unlock_irq(&rdev->destroy_list_lock); } static void cfg80211_destroy_iface_wk(struct work_struct *work) @@ -346,13 +330,47 @@ static void cfg80211_destroy_iface_wk(struct work_struct *work) static void cfg80211_sched_scan_stop_wk(struct work_struct *work) { struct cfg80211_registered_device *rdev; + struct cfg80211_sched_scan_request *req, *tmp; rdev = container_of(work, struct cfg80211_registered_device, sched_scan_stop_wk); rtnl_lock(); + list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { + if (req->nl_owner_dead) + cfg80211_stop_sched_scan_req(rdev, req, false); + } + rtnl_unlock(); +} - __cfg80211_stop_sched_scan(rdev, false); +static void cfg80211_propagate_radar_detect_wk(struct work_struct *work) +{ + struct cfg80211_registered_device *rdev; + + rdev = container_of(work, struct cfg80211_registered_device, + propagate_radar_detect_wk); + + rtnl_lock(); + + regulatory_propagate_dfs_state(&rdev->wiphy, &rdev->radar_chandef, + NL80211_DFS_UNAVAILABLE, + NL80211_RADAR_DETECTED); + + rtnl_unlock(); +} + +static void cfg80211_propagate_cac_done_wk(struct work_struct *work) +{ + struct cfg80211_registered_device *rdev; + + rdev = container_of(work, struct cfg80211_registered_device, + propagate_cac_done_wk); + + rtnl_lock(); + + regulatory_propagate_dfs_state(&rdev->wiphy, &rdev->cac_done_chandef, + NL80211_DFS_AVAILABLE, + NL80211_RADAR_CAC_FINISHED); rtnl_unlock(); } @@ -436,8 +454,8 @@ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv, spin_lock_init(&rdev->beacon_registrations_lock); spin_lock_init(&rdev->bss_lock); INIT_LIST_HEAD(&rdev->bss_list); + INIT_LIST_HEAD(&rdev->sched_scan_req_list); INIT_WORK(&rdev->scan_done_wk, __cfg80211_scan_done); - INIT_WORK(&rdev->sched_scan_results_wk, __cfg80211_sched_scan_results); INIT_LIST_HEAD(&rdev->mlme_unreg); spin_lock_init(&rdev->mlme_unreg_lock); INIT_WORK(&rdev->mlme_unreg_wk, cfg80211_mlme_unreg_wk); @@ -452,10 +470,12 @@ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv, rdev->wiphy.dev.platform_data = rdev; device_enable_async_suspend(&rdev->wiphy.dev); - INIT_LIST_HEAD(&rdev->destroy_list); - spin_lock_init(&rdev->destroy_list_lock); INIT_WORK(&rdev->destroy_work, cfg80211_destroy_iface_wk); INIT_WORK(&rdev->sched_scan_stop_wk, cfg80211_sched_scan_stop_wk); + INIT_WORK(&rdev->sched_scan_res_wk, cfg80211_sched_scan_results_wk); + INIT_WORK(&rdev->propagate_radar_detect_wk, + cfg80211_propagate_radar_detect_wk); + INIT_WORK(&rdev->propagate_cac_done_wk, cfg80211_propagate_cac_done_wk); #ifdef CONFIG_CFG80211_DEFAULT_PS rdev->wiphy.flags |= WIPHY_FLAG_PS_ON_BY_DEFAULT; @@ -915,6 +935,8 @@ void wiphy_unregister(struct wiphy *wiphy) flush_work(&rdev->destroy_work); flush_work(&rdev->sched_scan_stop_wk); flush_work(&rdev->mlme_unreg_wk); + flush_work(&rdev->propagate_radar_detect_wk); + flush_work(&rdev->propagate_cac_done_wk); #ifdef CONFIG_PM if (rdev->wiphy.wowlan_config && rdev->ops->set_wakeup) @@ -954,6 +976,12 @@ void wiphy_rfkill_set_hw_state(struct wiphy *wiphy, bool blocked) } EXPORT_SYMBOL(wiphy_rfkill_set_hw_state); +void cfg80211_cqm_config_free(struct wireless_dev *wdev) +{ + kfree(wdev->cqm_config); + wdev->cqm_config = NULL; +} + void cfg80211_unregister_wdev(struct wireless_dev *wdev) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -980,6 +1008,8 @@ void cfg80211_unregister_wdev(struct wireless_dev *wdev) WARN_ON_ONCE(1); 
break; } + + cfg80211_cqm_config_free(wdev); } EXPORT_SYMBOL(cfg80211_unregister_wdev); @@ -1001,7 +1031,7 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { struct net_device *dev = wdev->netdev; - struct cfg80211_sched_scan_request *sched_scan_req; + struct cfg80211_sched_scan_request *pos, *tmp; ASSERT_RTNL(); ASSERT_WDEV_LOCK(wdev); @@ -1012,9 +1042,11 @@ void __cfg80211_leave(struct cfg80211_registered_device *rdev, break; case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_STATION: - sched_scan_req = rtnl_dereference(rdev->sched_scan_req); - if (sched_scan_req && dev == sched_scan_req->dev) - __cfg80211_stop_sched_scan(rdev, false); + list_for_each_entry_safe(pos, tmp, &rdev->sched_scan_req_list, + list) { + if (dev == pos->dev) + cfg80211_stop_sched_scan_req(rdev, pos, false); + } #ifdef CONFIG_CFG80211_WEXT kfree(wdev->wext.ie); @@ -1089,7 +1121,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, struct net_device *dev = netdev_notifier_info_to_dev(ptr); struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev; - struct cfg80211_sched_scan_request *sched_scan_req; + struct cfg80211_sched_scan_request *pos, *tmp; if (!wdev) return NOTIFY_DONE; @@ -1114,7 +1146,15 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, INIT_LIST_HEAD(&wdev->mgmt_registrations); spin_lock_init(&wdev->mgmt_registrations_lock); - wdev->identifier = ++rdev->wdev_id; + /* + * We get here also when the interface changes network namespaces, + * as it's registered into the new one, but we don't want it to + * change ID in that case. Checking if the ID is already assigned + * works, because 0 isn't considered a valid ID and the memory is + * 0-initialized. + */ + if (!wdev->identifier) + wdev->identifier = ++rdev->wdev_id; list_add_rcu(&wdev->list, &rdev->wiphy.wdev_list); rdev->devlist_generation++; /* can only change netns with wiphy */ @@ -1158,10 +1198,10 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, ___cfg80211_scan_done(rdev, false); } - sched_scan_req = rtnl_dereference(rdev->sched_scan_req); - if (WARN_ON(sched_scan_req && - sched_scan_req->dev == wdev->netdev)) { - __cfg80211_stop_sched_scan(rdev, false); + list_for_each_entry_safe(pos, tmp, + &rdev->sched_scan_req_list, list) { + if (WARN_ON(pos && pos->dev == wdev->netdev)) + cfg80211_stop_sched_scan_req(rdev, pos, false); } rdev->opencount--; @@ -1208,12 +1248,12 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, */ if ((wdev->iftype == NL80211_IFTYPE_STATION || wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && - rdev->ops->set_power_mgmt) - if (rdev_set_power_mgmt(rdev, dev, wdev->ps, - wdev->ps_timeout)) { - /* assume this means it's off */ - wdev->ps = false; - } + rdev->ops->set_power_mgmt && + rdev_set_power_mgmt(rdev, dev, wdev->ps, + wdev->ps_timeout)) { + /* assume this means it's off */ + wdev->ps = false; + } break; case NETDEV_UNREGISTER: /* @@ -1234,6 +1274,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb, kzfree(wdev->wext.keys); #endif flush_work(&wdev->disconnect_wk); + cfg80211_cqm_config_free(wdev); } /* * synchronise (so that we won't find this netdev diff --git a/net/wireless/core.h b/net/wireless/core.h index 58ca206982feafa7ddfe2c90d4483791e5057cf0..6e809325af3bf090f09fb8fef4356bada3a1beed 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -74,10 +74,9 @@ struct cfg80211_registered_device { u32 bss_entries; struct cfg80211_scan_request 
*scan_req; /* protected by RTNL */ struct sk_buff *scan_msg; - struct cfg80211_sched_scan_request __rcu *sched_scan_req; + struct list_head sched_scan_req_list; unsigned long suspend_at; struct work_struct scan_done_wk; - struct work_struct sched_scan_results_wk; struct genl_info *cur_cmd_info; @@ -91,11 +90,15 @@ struct cfg80211_registered_device { struct cfg80211_coalesce *coalesce; - spinlock_t destroy_list_lock; - struct list_head destroy_list; struct work_struct destroy_work; - struct work_struct sched_scan_stop_wk; + struct work_struct sched_scan_res_wk; + + struct cfg80211_chan_def radar_chandef; + struct work_struct propagate_radar_detect_wk; + + struct cfg80211_chan_def cac_done_chandef; + struct work_struct propagate_cac_done_wk; /* must be last because of the way we do wiphy_priv(), * and it should at least be aligned to NETDEV_ALIGN */ @@ -220,23 +223,8 @@ struct cfg80211_event { enum cfg80211_event_type type; union { - struct { - u8 bssid[ETH_ALEN]; - const u8 *req_ie; - const u8 *resp_ie; - size_t req_ie_len; - size_t resp_ie_len; - struct cfg80211_bss *bss; - int status; /* -1 = failed; 0..65535 = status code */ - enum nl80211_timeout_reason timeout_reason; - } cr; - struct { - const u8 *req_ie; - const u8 *resp_ie; - size_t req_ie_len; - size_t resp_ie_len; - struct cfg80211_bss *bss; - } rm; + struct cfg80211_connect_resp_params cr; + struct cfg80211_roam_info rm; struct { const u8 *ie; size_t ie_len; @@ -267,9 +255,11 @@ struct cfg80211_beacon_registration { u32 nlportid; }; -struct cfg80211_iface_destroy { - struct list_head list; - u32 nlportid; +struct cfg80211_cqm_config { + u32 rssi_hyst; + s32 last_rssi_event_value; + int n_rssi_thresholds; + s32 rssi_thresholds[0]; }; void cfg80211_destroy_ifaces(struct cfg80211_registered_device *rdev); @@ -385,21 +375,16 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev, struct cfg80211_connect_params *connect, struct cfg80211_cached_keys *connkeys, const u8 *prev_bssid); -void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, - const u8 *req_ie, size_t req_ie_len, - const u8 *resp_ie, size_t resp_ie_len, - int status, bool wextev, - struct cfg80211_bss *bss, - enum nl80211_timeout_reason timeout_reason); +void __cfg80211_connect_result(struct net_device *dev, + struct cfg80211_connect_resp_params *params, + bool wextev); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap); int cfg80211_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason, bool wextev); void __cfg80211_roamed(struct wireless_dev *wdev, - struct cfg80211_bss *bss, - const u8 *req_ie, size_t req_ie_len, - const u8 *resp_ie, size_t resp_ie_len); + struct cfg80211_roam_info *info); int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_autodisconnect_wk(struct work_struct *work); @@ -423,13 +408,20 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, void __cfg80211_scan_done(struct work_struct *wk); void ___cfg80211_scan_done(struct cfg80211_registered_device *rdev, bool send_message); -void __cfg80211_sched_scan_results(struct work_struct *wk); +void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev, + struct cfg80211_sched_scan_request *req); +int cfg80211_sched_scan_req_possible(struct cfg80211_registered_device *rdev, + bool want_multi); +void cfg80211_sched_scan_results_wk(struct work_struct *work); +int cfg80211_stop_sched_scan_req(struct 
cfg80211_registered_device *rdev, + struct cfg80211_sched_scan_request *req, + bool driver_initiated); int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, - bool driver_initiated); + u64 reqid, bool driver_initiated); void cfg80211_upload_connect_keys(struct wireless_dev *wdev); int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, - u32 *flags, struct vif_params *params); + struct vif_params *params); void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev); void cfg80211_process_wdev_events(struct wireless_dev *wdev); @@ -459,6 +451,16 @@ unsigned int cfg80211_chandef_dfs_cac_time(struct wiphy *wiphy, const struct cfg80211_chan_def *chandef); +void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev); + +bool cfg80211_any_wiphy_oper_chan(struct wiphy *wiphy, + struct ieee80211_channel *chan); + +bool cfg80211_beaconing_iface_active(struct wireless_dev *wdev); + +bool cfg80211_is_sub_chan(struct cfg80211_chan_def *chandef, + struct ieee80211_channel *chan); + static inline unsigned int elapsed_jiffies_msecs(unsigned long start) { unsigned long end = jiffies; @@ -512,4 +514,6 @@ void cfg80211_stop_nan(struct cfg80211_registered_device *rdev, #define CFG80211_DEV_WARN_ON(cond) ({bool __r = (cond); __r; }) #endif +void cfg80211_cqm_config_free(struct wireless_dev *wdev); + #endif /* __NET_WIRELESS_CORE_H */ diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 364f900a3dc4d2810c90fa3f613d1b9a08cb5e71..10bf040a0982d2f7859f98399c24de26a7fd5383 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -190,6 +190,7 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) if (!nowext) wdev->wext.ibss.ssid_len = 0; #endif + cfg80211_sched_dfs_chan_update(rdev); } void cfg80211_clear_ibss(struct net_device *dev, bool nowext) diff --git a/net/wireless/mesh.c b/net/wireless/mesh.c index 2d8518a37eabc4795e97f7f600f14cdf0f63d7ff..ec0b1c20ac9920106efbc8798b53d1077a646e36 100644 --- a/net/wireless/mesh.c +++ b/net/wireless/mesh.c @@ -262,6 +262,7 @@ int __cfg80211_leave_mesh(struct cfg80211_registered_device *rdev, wdev->beacon_interval = 0; memset(&wdev->chandef, 0, sizeof(wdev->chandef)); rdev_set_qos_map(rdev, dev, NULL); + cfg80211_sched_dfs_chan_update(rdev); } return err; diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 22b3d999006559d7572287c2b8e6f4fad286766d..d8df7a5180a0473b56e74e8b4eb93b380965ea49 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -26,9 +26,16 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; - u8 *ie = mgmt->u.assoc_resp.variable; - int ieoffs = offsetof(struct ieee80211_mgmt, u.assoc_resp.variable); - u16 status_code = le16_to_cpu(mgmt->u.assoc_resp.status_code); + struct cfg80211_connect_resp_params cr; + + memset(&cr, 0, sizeof(cr)); + cr.status = (int)le16_to_cpu(mgmt->u.assoc_resp.status_code); + cr.bssid = mgmt->bssid; + cr.bss = bss; + cr.resp_ie = mgmt->u.assoc_resp.variable; + cr.resp_ie_len = + len - offsetof(struct ieee80211_mgmt, u.assoc_resp.variable); + cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED; trace_cfg80211_send_rx_assoc(dev, bss); @@ -38,7 +45,7 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, * and got a reject -- we only try again with an assoc * frame instead of 
reassoc. */ - if (cfg80211_sme_rx_assoc_resp(wdev, status_code)) { + if (cfg80211_sme_rx_assoc_resp(wdev, cr.status)) { cfg80211_unhold_bss(bss_from_pub(bss)); cfg80211_put_bss(wiphy, bss); return; @@ -46,10 +53,7 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, struct cfg80211_bss *bss, nl80211_send_rx_assoc(rdev, dev, buf, len, GFP_KERNEL, uapsd_queues); /* update current_bss etc., consumes the bss reference */ - __cfg80211_connect_result(dev, mgmt->bssid, NULL, 0, ie, len - ieoffs, - status_code, - status_code == WLAN_STATUS_SUCCESS, bss, - NL80211_TIMEOUT_UNSPECIFIED); + __cfg80211_connect_result(dev, &cr, cr.status == WLAN_STATUS_SUCCESS); } EXPORT_SYMBOL(cfg80211_rx_assoc_resp); @@ -745,6 +749,12 @@ bool cfg80211_rx_mgmt(struct wireless_dev *wdev, int freq, int sig_mbm, } EXPORT_SYMBOL(cfg80211_rx_mgmt); +void cfg80211_sched_dfs_chan_update(struct cfg80211_registered_device *rdev) +{ + cancel_delayed_work(&rdev->dfs_update_channels_wk); + queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, 0); +} + void cfg80211_dfs_channels_update_work(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); @@ -755,6 +765,8 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work) struct wiphy *wiphy; bool check_again = false; unsigned long timeout, next_time = 0; + unsigned long time_dfs_update; + enum nl80211_radar_event radar_event; int bandid, i; rdev = container_of(delayed_work, struct cfg80211_registered_device, @@ -770,11 +782,27 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work) for (i = 0; i < sband->n_channels; i++) { c = &sband->channels[i]; - if (c->dfs_state != NL80211_DFS_UNAVAILABLE) + if (!(c->flags & IEEE80211_CHAN_RADAR)) + continue; + + if (c->dfs_state != NL80211_DFS_UNAVAILABLE && + c->dfs_state != NL80211_DFS_AVAILABLE) continue; - timeout = c->dfs_state_entered + msecs_to_jiffies( - IEEE80211_DFS_MIN_NOP_TIME_MS); + if (c->dfs_state == NL80211_DFS_UNAVAILABLE) { + time_dfs_update = IEEE80211_DFS_MIN_NOP_TIME_MS; + radar_event = NL80211_RADAR_NOP_FINISHED; + } else { + if (regulatory_pre_cac_allowed(wiphy) || + cfg80211_any_wiphy_oper_chan(wiphy, c)) + continue; + + time_dfs_update = REG_PRE_CAC_EXPIRY_GRACE_MS; + radar_event = NL80211_RADAR_PRE_CAC_EXPIRED; + } + + timeout = c->dfs_state_entered + + msecs_to_jiffies(time_dfs_update); if (time_after_eq(jiffies, timeout)) { c->dfs_state = NL80211_DFS_USABLE; @@ -784,8 +812,12 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work) NL80211_CHAN_NO_HT); nl80211_radar_notify(rdev, &chandef, - NL80211_RADAR_NOP_FINISHED, - NULL, GFP_ATOMIC); + radar_event, NULL, + GFP_ATOMIC); + + regulatory_propagate_dfs_state(wiphy, &chandef, + c->dfs_state, + radar_event); continue; } @@ -810,7 +842,6 @@ void cfg80211_radar_event(struct wiphy *wiphy, gfp_t gfp) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); - unsigned long timeout; trace_cfg80211_radar_event(wiphy, chandef); @@ -820,11 +851,12 @@ void cfg80211_radar_event(struct wiphy *wiphy, */ cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_UNAVAILABLE); - timeout = msecs_to_jiffies(IEEE80211_DFS_MIN_NOP_TIME_MS); - queue_delayed_work(cfg80211_wq, &rdev->dfs_update_channels_wk, - timeout); + cfg80211_sched_dfs_chan_update(rdev); nl80211_radar_notify(rdev, chandef, NL80211_RADAR_DETECTED, NULL, gfp); + + memcpy(&rdev->radar_chandef, chandef, sizeof(struct cfg80211_chan_def)); + queue_work(cfg80211_wq, &rdev->propagate_radar_detect_wk); } EXPORT_SYMBOL(cfg80211_radar_event); @@ -851,6 
+883,10 @@ void cfg80211_cac_event(struct net_device *netdev, msecs_to_jiffies(wdev->cac_time_ms); WARN_ON(!time_after_eq(jiffies, timeout)); cfg80211_set_dfs_state(wiphy, chandef, NL80211_DFS_AVAILABLE); + memcpy(&rdev->cac_done_chandef, chandef, + sizeof(struct cfg80211_chan_def)); + queue_work(cfg80211_wq, &rdev->propagate_cac_done_wk); + cfg80211_sched_dfs_chan_update(rdev); break; case NL80211_RADAR_CAC_ABORTED: break; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 2312dc2ffdb98b37b2909274c57eed68935267d7..c3bc9da30cff997970dc2cf8aebd05c4795c3cf9 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -410,6 +410,16 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = { .len = sizeof(struct nl80211_bss_select_rssi_adjust) }, [NL80211_ATTR_TIMEOUT_REASON] = { .type = NLA_U32 }, + [NL80211_ATTR_FILS_ERP_USERNAME] = { .type = NLA_BINARY, + .len = FILS_ERP_MAX_USERNAME_LEN }, + [NL80211_ATTR_FILS_ERP_REALM] = { .type = NLA_BINARY, + .len = FILS_ERP_MAX_REALM_LEN }, + [NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] = { .type = NLA_U16 }, + [NL80211_ATTR_FILS_ERP_RRK] = { .type = NLA_BINARY, + .len = FILS_ERP_MAX_RRK_LEN }, + [NL80211_ATTR_FILS_CACHE_ID] = { .len = 2 }, + [NL80211_ATTR_PMK] = { .type = NLA_BINARY, .len = PMK_MAX_LEN }, + [NL80211_ATTR_SCHED_SCAN_MULTI] = { .type = NLA_FLAG }, }; /* policy for the key attributes */ @@ -487,6 +497,7 @@ static const struct nla_policy nl80211_match_policy[NL80211_SCHED_SCAN_MATCH_ATTR_MAX + 1] = { [NL80211_SCHED_SCAN_MATCH_ATTR_SSID] = { .type = NLA_BINARY, .len = IEEE80211_MAX_SSID_LEN }, + [NL80211_SCHED_SCAN_MATCH_ATTR_BSSID] = { .len = ETH_ALEN }, [NL80211_SCHED_SCAN_MATCH_ATTR_RSSI] = { .type = NLA_U32 }, }; @@ -548,7 +559,7 @@ static int nl80211_prepare_wdev_dump(struct sk_buff *skb, if (!cb->args[0]) { err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, genl_family_attrbuf(&nl80211_fam), - nl80211_fam.maxattr, nl80211_policy); + nl80211_fam.maxattr, nl80211_policy, NULL); if (err) return err; @@ -719,7 +730,7 @@ static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k) { struct nlattr *tb[NL80211_KEY_MAX + 1]; int err = nla_parse_nested(tb, NL80211_KEY_MAX, key, - nl80211_key_policy); + nl80211_key_policy, NULL); if (err) return err; @@ -760,7 +771,7 @@ static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k) err = nla_parse_nested(kdt, NUM_NL80211_KEY_DEFAULT_TYPES - 1, tb[NL80211_KEY_DEFAULT_TYPES], - nl80211_key_default_policy); + nl80211_key_default_policy, NULL); if (err) return err; @@ -807,10 +818,11 @@ static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k) if (info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES]) { struct nlattr *kdt[NUM_NL80211_KEY_DEFAULT_TYPES]; - int err = nla_parse_nested( - kdt, NUM_NL80211_KEY_DEFAULT_TYPES - 1, - info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES], - nl80211_key_default_policy); + int err = nla_parse_nested(kdt, + NUM_NL80211_KEY_DEFAULT_TYPES - 1, + info->attrs[NL80211_ATTR_KEY_DEFAULT_TYPES], + nl80211_key_default_policy, + info->extack); if (err) return err; @@ -1366,7 +1378,7 @@ static int nl80211_add_commands_unsplit(struct cfg80211_registered_device *rdev, CMD(tdls_mgmt, TDLS_MGMT); CMD(tdls_oper, TDLS_OPER); } - if (rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) + if (rdev->wiphy.max_sched_scan_reqs) CMD(sched_scan_start, START_SCHED_SCAN); CMD(probe_client, PROBE_CLIENT); CMD(set_noack_map, SET_NOACK_MAP); @@ -1805,6 +1817,11 @@ static int nl80211_send_wiphy(struct cfg80211_registered_device 
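
The DFS bookkeeping reworked above distinguishes two expiries: channels marked UNAVAILABLE after radar detection age out after the non-occupancy period, while AVAILABLE channels whose CAC was completed in advance (pre-CAC) expire after a short grace period unless pre-CAC is allowed in the domain or the channel is in active use. A minimal standalone sketch of that selection, with invented names (dfs_expiry_ms is not a function in the patch):

enum dfs_state { DFS_USABLE, DFS_UNAVAILABLE, DFS_AVAILABLE };

#define DFS_MIN_NOP_TIME_MS	(30 * 60 * 1000)	/* 30 min non-occupancy */
#define PRE_CAC_EXPIRY_GRACE_MS	2000

/* returns the ms after which the channel falls back to DFS_USABLE,
 * or -1 when no timeout applies */
static long dfs_expiry_ms(enum dfs_state state, int pre_cac_allowed,
			  int chan_in_use)
{
	switch (state) {
	case DFS_UNAVAILABLE:
		return DFS_MIN_NOP_TIME_MS;	/* radar was seen here */
	case DFS_AVAILABLE:
		if (pre_cac_allowed || chan_in_use)
			return -1;		/* keep the CAC result */
		return PRE_CAC_EXPIRY_GRACE_MS;
	default:
		return -1;
	}
}
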
*rdev, nla_put_flag(msg, NL80211_ATTR_WIPHY_SELF_MANAGED_REG)) goto nla_put_failure; + if (rdev->wiphy.max_sched_scan_reqs && + nla_put_u32(msg, NL80211_ATTR_SCHED_SCAN_MAX_REQS, + rdev->wiphy.max_sched_scan_reqs)) + goto nla_put_failure; + if (nla_put(msg, NL80211_ATTR_EXT_FEATURES, sizeof(rdev->wiphy.ext_features), rdev->wiphy.ext_features)) @@ -1892,8 +1909,8 @@ static int nl80211_dump_wiphy_parse(struct sk_buff *skb, struct nl80211_dump_wiphy_state *state) { struct nlattr **tb = genl_family_attrbuf(&nl80211_fam); - int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, - tb, nl80211_fam.maxattr, nl80211_policy); + int ret = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, tb, + nl80211_fam.maxattr, nl80211_policy, NULL); /* ignore parse errors for backward compatibility */ if (ret) return 0; @@ -2308,7 +2325,8 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rem_txq_params) { result = nla_parse_nested(tb, NL80211_TXQ_ATTR_MAX, nl_txq_params, - txq_params_policy); + txq_params_policy, + info->extack); if (result) return result; result = parse_txq_params(tb, &txq_params); @@ -2695,17 +2713,82 @@ static int parse_monitor_flags(struct nlattr *nla, u32 *mntrflags) if (!nla) return -EINVAL; - if (nla_parse_nested(flags, NL80211_MNTR_FLAG_MAX, - nla, mntr_flags_policy)) + if (nla_parse_nested(flags, NL80211_MNTR_FLAG_MAX, nla, + mntr_flags_policy, NULL)) return -EINVAL; for (flag = 1; flag <= NL80211_MNTR_FLAG_MAX; flag++) if (flags[flag]) *mntrflags |= (1<attrs[NL80211_ATTR_MNTR_FLAGS]) { + if (type != NL80211_IFTYPE_MONITOR) + return -EINVAL; + + err = parse_monitor_flags(info->attrs[NL80211_ATTR_MNTR_FLAGS], + ¶ms->flags); + if (err) + return err; + + change = true; + } + + if (params->flags & MONITOR_FLAG_ACTIVE && + !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR)) + return -EOPNOTSUPP; + + if (info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]) { + const u8 *mumimo_groups; + u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER; + + if (type != NL80211_IFTYPE_MONITOR) + return -EINVAL; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag)) + return -EOPNOTSUPP; + + mumimo_groups = + nla_data(info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]); + + /* bits 0 and 63 are reserved and must be zero */ + if ((mumimo_groups[0] & BIT(0)) || + (mumimo_groups[VHT_MUMIMO_GROUPS_DATA_LEN - 1] & BIT(7))) + return -EINVAL; + + params->vht_mumimo_groups = mumimo_groups; + change = true; + } + + if (info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR]) { + u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER; + + if (type != NL80211_IFTYPE_MONITOR) + return -EINVAL; + + if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag)) + return -EOPNOTSUPP; + + params->vht_mumimo_follow_addr = + nla_data(info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR]); + change = true; + } + + return change ? 
1 : 0; +} + static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev, struct net_device *netdev, u8 use_4addr, enum nl80211_iftype iftype) @@ -2739,7 +2822,6 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) int err; enum nl80211_iftype otype, ntype; struct net_device *dev = info->user_ptr[1]; - u32 _flags, *flags = NULL; bool change = false; memset(¶ms, 0, sizeof(params)); @@ -2782,56 +2864,14 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) params.use_4addr = -1; } - if (info->attrs[NL80211_ATTR_MNTR_FLAGS]) { - if (ntype != NL80211_IFTYPE_MONITOR) - return -EINVAL; - err = parse_monitor_flags(info->attrs[NL80211_ATTR_MNTR_FLAGS], - &_flags); - if (err) - return err; - - flags = &_flags; - change = true; - } - - if (info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]) { - const u8 *mumimo_groups; - u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER; - - if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag)) - return -EOPNOTSUPP; - - mumimo_groups = - nla_data(info->attrs[NL80211_ATTR_MU_MIMO_GROUP_DATA]); - - /* bits 0 and 63 are reserved and must be zero */ - if ((mumimo_groups[0] & BIT(7)) || - (mumimo_groups[VHT_MUMIMO_GROUPS_DATA_LEN - 1] & BIT(0))) - return -EINVAL; - - memcpy(params.vht_mumimo_groups, mumimo_groups, - VHT_MUMIMO_GROUPS_DATA_LEN); - change = true; - } - - if (info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR]) { - u32 cap_flag = NL80211_EXT_FEATURE_MU_MIMO_AIR_SNIFFER; - - if (!wiphy_ext_feature_isset(&rdev->wiphy, cap_flag)) - return -EOPNOTSUPP; - - nla_memcpy(params.macaddr, - info->attrs[NL80211_ATTR_MU_MIMO_FOLLOW_MAC_ADDR], - ETH_ALEN); + err = nl80211_parse_mon_options(rdev, ntype, info, ¶ms); + if (err < 0) + return err; + if (err > 0) change = true; - } - - if (flags && (*flags & MONITOR_FLAG_ACTIVE) && - !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR)) - return -EOPNOTSUPP; if (change) - err = cfg80211_change_iface(rdev, dev, ntype, flags, ¶ms); + err = cfg80211_change_iface(rdev, dev, ntype, ¶ms); else err = 0; @@ -2849,7 +2889,6 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) struct sk_buff *msg; int err; enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED; - u32 flags; /* to avoid failing a new interface creation due to pending removal */ cfg80211_destroy_ifaces(rdev); @@ -2885,13 +2924,9 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) return err; } - err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ? - info->attrs[NL80211_ATTR_MNTR_FLAGS] : NULL, - &flags); - - if (!err && (flags & MONITOR_FLAG_ACTIVE) && - !(rdev->wiphy.features & NL80211_FEATURE_ACTIVE_MONITOR)) - return -EOPNOTSUPP; + err = nl80211_parse_mon_options(rdev, type, info, ¶ms); + if (err < 0) + return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) @@ -2899,8 +2934,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) wdev = rdev_add_virtual_intf(rdev, nla_data(info->attrs[NL80211_ATTR_IFNAME]), - NET_NAME_USER, type, err ? 
NULL : &flags, - ¶ms); + NET_NAME_USER, type, ¶ms); if (WARN_ON(!wdev)) { nlmsg_free(msg); return -EPROTO; @@ -3561,7 +3595,7 @@ static int nl80211_parse_tx_bitrate_mask(struct genl_info *info, if (sband == NULL) return -EINVAL; err = nla_parse_nested(tb, NL80211_TXRATE_MAX, tx_rates, - nl80211_txattr_policy); + nl80211_txattr_policy, info->extack); if (err) return err; if (tb[NL80211_TXRATE_LEGACY]) { @@ -3818,6 +3852,19 @@ static bool nl80211_valid_auth_type(struct cfg80211_registered_device *rdev, return false; return true; case NL80211_CMD_CONNECT: + /* SAE not supported yet */ + if (auth_type == NL80211_AUTHTYPE_SAE) + return false; + /* FILS with SK PFS or PK not supported yet */ + if (auth_type == NL80211_AUTHTYPE_FILS_SK_PFS || + auth_type == NL80211_AUTHTYPE_FILS_PK) + return false; + if (!wiphy_ext_feature_isset( + &rdev->wiphy, + NL80211_EXT_FEATURE_FILS_SK_OFFLOAD) && + auth_type == NL80211_AUTHTYPE_FILS_SK) + return false; + return true; case NL80211_CMD_START_AP: /* SAE not supported yet */ if (auth_type == NL80211_AUTHTYPE_SAE) @@ -4100,8 +4147,8 @@ static int parse_station_flags(struct genl_info *info, if (!nla) return 0; - if (nla_parse_nested(flags, NL80211_STA_FLAG_MAX, - nla, sta_flags_policy)) + if (nla_parse_nested(flags, NL80211_STA_FLAG_MAX, nla, + sta_flags_policy, info->extack)) return -EINVAL; /* @@ -4151,7 +4198,7 @@ static bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, struct nlattr *rate; u32 bitrate; u16 bitrate_compat; - enum nl80211_attrs rate_flg; + enum nl80211_rate_info rate_flg; rate = nla_nest_start(msg, attr); if (!rate) @@ -4728,7 +4775,7 @@ static int nl80211_parse_sta_wme(struct genl_info *info, nla = info->attrs[NL80211_ATTR_STA_WME]; err = nla_parse_nested(tb, NL80211_STA_WME_MAX, nla, - nl80211_sta_wme_policy); + nl80211_sta_wme_policy, info->extack); if (err) return err; @@ -5703,7 +5750,7 @@ static int nl80211_get_mesh_config(struct sk_buff *skb, cur_params.dot11MeshGateAnnouncementProtocol) || nla_put_u8(msg, NL80211_MESHCONF_FORWARDING, cur_params.dot11MeshForwarding) || - nla_put_u32(msg, NL80211_MESHCONF_RSSI_THRESHOLD, + nla_put_s32(msg, NL80211_MESHCONF_RSSI_THRESHOLD, cur_params.rssi_threshold) || nla_put_u32(msg, NL80211_MESHCONF_HT_OPMODE, cur_params.ht_opmode) || @@ -5853,7 +5900,7 @@ do { \ return -EINVAL; if (nla_parse_nested(tb, NL80211_MESHCONF_ATTR_MAX, info->attrs[NL80211_ATTR_MESH_CONFIG], - nl80211_meshconf_params_policy)) + nl80211_meshconf_params_policy, info->extack)) return -EINVAL; /* This makes sure that there aren't more than 32 mesh config @@ -6002,7 +6049,7 @@ static int nl80211_parse_mesh_setup(struct genl_info *info, return -EINVAL; if (nla_parse_nested(tb, NL80211_MESH_SETUP_ATTR_MAX, info->attrs[NL80211_ATTR_MESH_SETUP], - nl80211_mesh_setup_params_policy)) + nl80211_mesh_setup_params_policy, info->extack)) return -EINVAL; if (tb[NL80211_MESH_SETUP_ENABLE_VENDOR_SYNC]) @@ -6393,7 +6440,8 @@ static int nl80211_set_reg(struct sk_buff *skb, struct genl_info *info) nla_for_each_nested(nl_reg_rule, info->attrs[NL80211_ATTR_REG_RULES], rem_reg_rules) { r = nla_parse_nested(tb, NL80211_REG_RULE_ATTR_MAX, - nl_reg_rule, reg_rule_policy); + nl_reg_rule, reg_rule_policy, + info->extack); if (r) goto bad_reg; r = parse_reg_rule(tb, &rd->reg_rules[rule_idx]); @@ -6461,7 +6509,7 @@ static int parse_bss_select(struct nlattr *nla, struct wiphy *wiphy, return -EINVAL; err = nla_parse_nested(attr, NL80211_BSS_SELECT_ATTR_MAX, nest, - nl80211_bss_select_policy); + nl80211_bss_select_policy, NULL); if (err) 
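
A large share of the mechanical churn in this file is the new struct netlink_ext_ack * argument that nla_parse_nested() and nlmsg_parse() take as of this series, so attribute-parse failures can be reported back to the sender. A condensed caller under that 4.12-era signature; parse_foo is a made-up example, not a function in the patch:

static int parse_foo(struct genl_info *info, struct nlattr *nest)
{
	struct nlattr *tb[NL80211_ATTR_MAX + 1];
	int err;

	/* pass info->extack where available, NULL in dump paths */
	err = nla_parse_nested(tb, NL80211_ATTR_MAX, nest,
			       nl80211_policy, info->extack);
	if (err)
		return err;

	return tb[NL80211_ATTR_WIPHY] ? 0 : -EINVAL;
}
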
return err; @@ -6545,6 +6593,19 @@ static int nl80211_parse_random_mac(struct nlattr **attrs, return 0; } +static bool cfg80211_off_channel_oper_allowed(struct wireless_dev *wdev) +{ + ASSERT_WDEV_LOCK(wdev); + + if (!cfg80211_beaconing_iface_active(wdev)) + return true; + + if (!(wdev->chandef.chan->flags & IEEE80211_CHAN_RADAR)) + return true; + + return regulatory_pre_cac_allowed(wdev->wiphy); +} + static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; @@ -6670,6 +6731,25 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) request->n_channels = i; + wdev_lock(wdev); + if (!cfg80211_off_channel_oper_allowed(wdev)) { + struct ieee80211_channel *chan; + + if (request->n_channels != 1) { + wdev_unlock(wdev); + err = -EBUSY; + goto out_free; + } + + chan = request->channels[0]; + if (chan->center_freq != wdev->chandef.chan->center_freq) { + wdev_unlock(wdev); + err = -EBUSY; + goto out_free; + } + } + wdev_unlock(wdev); + i = 0; if (n_ssids) { nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) { @@ -6862,7 +6942,7 @@ nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans, return -EINVAL; err = nla_parse_nested(plan, NL80211_SCHED_SCAN_PLAN_MAX, - attr, nl80211_plan_policy); + attr, nl80211_plan_policy, NULL); if (err) return err; @@ -6953,11 +7033,19 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, err = nla_parse_nested(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX, - attr, nl80211_match_policy); + attr, nl80211_match_policy, + NULL); if (err) return ERR_PTR(err); + + /* SSID and BSSID are mutually exclusive */ + if (tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID] && + tb[NL80211_SCHED_SCAN_MATCH_ATTR_BSSID]) + return ERR_PTR(-EINVAL); + /* add other standalone attributes here */ - if (tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID]) { + if (tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID] || + tb[NL80211_SCHED_SCAN_MATCH_ATTR_BSSID]) { n_match_sets++; continue; } @@ -7128,15 +7216,17 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, nla_for_each_nested(attr, attrs[NL80211_ATTR_SCHED_SCAN_MATCH], tmp) { - struct nlattr *ssid, *rssi; + struct nlattr *ssid, *bssid, *rssi; err = nla_parse_nested(tb, NL80211_SCHED_SCAN_MATCH_ATTR_MAX, - attr, nl80211_match_policy); + attr, nl80211_match_policy, + NULL); if (err) goto out_free; ssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_SSID]; - if (ssid) { + bssid = tb[NL80211_SCHED_SCAN_MATCH_ATTR_BSSID]; + if (ssid || bssid) { if (WARN_ON(i >= n_match_sets)) { /* this indicates a programming error, * the loop above should have verified @@ -7146,14 +7236,25 @@ nl80211_parse_sched_scan(struct wiphy *wiphy, struct wireless_dev *wdev, goto out_free; } - if (nla_len(ssid) > IEEE80211_MAX_SSID_LEN) { - err = -EINVAL; - goto out_free; + if (ssid) { + if (nla_len(ssid) > IEEE80211_MAX_SSID_LEN) { + err = -EINVAL; + goto out_free; + } + memcpy(request->match_sets[i].ssid.ssid, + nla_data(ssid), nla_len(ssid)); + request->match_sets[i].ssid.ssid_len = + nla_len(ssid); } - memcpy(request->match_sets[i].ssid.ssid, - nla_data(ssid), nla_len(ssid)); - request->match_sets[i].ssid.ssid_len = - nla_len(ssid); + if (bssid) { + if (nla_len(bssid) != ETH_ALEN) { + err = -EINVAL; + goto out_free; + } + memcpy(request->match_sets[i].bssid, + nla_data(bssid), ETH_ALEN); + } + /* special attribute - old implementation w/a */ request->match_sets[i].rssi_thold = default_match_rssi; @@ -7261,14 +7362,16 @@ static int 
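
cfg80211_off_channel_oper_allowed() above gates scans, remain-on-channel and off-channel TX the same way; reduced to its decision table it is just the following (standalone sketch, names illustrative):

#include <stdbool.h>

static bool off_channel_allowed(bool beaconing_active, bool chan_is_radar,
				bool pre_cac_allowed)
{
	if (!beaconing_active)
		return true;		/* nothing to interrupt */
	if (!chan_is_radar)
		return true;		/* no CAC result to invalidate */
	return pre_cac_allowed;		/* ETSI pre-CAC keeps the CAC valid */
}
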
nl80211_start_sched_scan(struct sk_buff *skb, struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_sched_scan_request *sched_scan_req; + bool want_multi; int err; - if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) || - !rdev->ops->sched_scan_start) + if (!rdev->wiphy.max_sched_scan_reqs || !rdev->ops->sched_scan_start) return -EOPNOTSUPP; - if (rdev->sched_scan_req) - return -EINPROGRESS; + want_multi = info->attrs[NL80211_ATTR_SCHED_SCAN_MULTI]; + err = cfg80211_sched_scan_req_possible(rdev, want_multi); + if (err) + return err; sched_scan_req = nl80211_parse_sched_scan(&rdev->wiphy, wdev, info->attrs, @@ -7278,6 +7381,14 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, if (err) goto out_err; + /* leave request id zero for legacy request + * or if driver does not support multi-scheduled scan + */ + if (want_multi && rdev->wiphy.max_sched_scan_reqs > 1) { + while (!sched_scan_req->reqid) + sched_scan_req->reqid = rdev->wiphy.cookie_counter++; + } + err = rdev_sched_scan_start(rdev, dev, sched_scan_req); if (err) goto out_free; @@ -7288,10 +7399,9 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, if (info->attrs[NL80211_ATTR_SOCKET_OWNER]) sched_scan_req->owner_nlportid = info->snd_portid; - rcu_assign_pointer(rdev->sched_scan_req, sched_scan_req); + cfg80211_add_sched_scan_req(rdev, sched_scan_req); - nl80211_send_sched_scan(rdev, dev, - NL80211_CMD_START_SCHED_SCAN); + nl80211_send_sched_scan(sched_scan_req, NL80211_CMD_START_SCHED_SCAN); return 0; out_free: @@ -7303,13 +7413,27 @@ static int nl80211_start_sched_scan(struct sk_buff *skb, static int nl80211_stop_sched_scan(struct sk_buff *skb, struct genl_info *info) { + struct cfg80211_sched_scan_request *req; struct cfg80211_registered_device *rdev = info->user_ptr[0]; + u64 cookie; - if (!(rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) || - !rdev->ops->sched_scan_stop) + if (!rdev->wiphy.max_sched_scan_reqs || !rdev->ops->sched_scan_stop) return -EOPNOTSUPP; - return __cfg80211_stop_sched_scan(rdev, false); + if (info->attrs[NL80211_ATTR_COOKIE]) { + cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]); + return __cfg80211_stop_sched_scan(rdev, cookie, false); + } + + req = list_first_or_null_rcu(&rdev->sched_scan_req_list, + struct cfg80211_sched_scan_request, + list); + if (!req || req->reqid || + (req->owner_nlportid && + req->owner_nlportid != info->snd_portid)) + return -ENOENT; + + return cfg80211_stop_sched_scan_req(rdev, req, false); } static int nl80211_start_radar_detection(struct sk_buff *skb, @@ -7433,7 +7557,7 @@ static int nl80211_channel_switch(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(csa_attrs, NL80211_ATTR_MAX, info->attrs[NL80211_ATTR_CSA_IES], - nl80211_policy); + nl80211_policy, info->extack); if (err) return err; @@ -8639,7 +8763,8 @@ static int nl80211_testmode_dump(struct sk_buff *skb, struct nlattr **attrbuf = genl_family_attrbuf(&nl80211_fam); err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, - attrbuf, nl80211_fam.maxattr, nl80211_policy); + attrbuf, nl80211_fam.maxattr, + nl80211_policy, NULL); if (err) goto out_err; @@ -8867,6 +8992,35 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) } } + if (wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_FILS_SK_OFFLOAD) && + info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] && + info->attrs[NL80211_ATTR_FILS_ERP_REALM] && + info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] && + 
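
The request id assigned above for multi-scheduled scans must never be zero, since zero is reserved for the single legacy request; the loop guards against the (theoretical) wrap of the u64 cookie counter. A self-contained model of that allocation; alloc_reqid and the userspace harness are hypothetical:

#include <stdint.h>
#include <stdio.h>

static uint64_t cookie_counter = UINT64_MAX;	/* force a wrap for the demo */

static uint64_t alloc_reqid(void)
{
	uint64_t reqid = 0;

	while (!reqid)			/* skip the reserved value 0 */
		reqid = cookie_counter++;
	return reqid;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)alloc_reqid());	/* UINT64_MAX */
	printf("%llu\n", (unsigned long long)alloc_reqid());	/* 1, 0 skipped */
	return 0;
}
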
info->attrs[NL80211_ATTR_FILS_ERP_RRK]) { + connect.fils_erp_username = + nla_data(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]); + connect.fils_erp_username_len = + nla_len(info->attrs[NL80211_ATTR_FILS_ERP_USERNAME]); + connect.fils_erp_realm = + nla_data(info->attrs[NL80211_ATTR_FILS_ERP_REALM]); + connect.fils_erp_realm_len = + nla_len(info->attrs[NL80211_ATTR_FILS_ERP_REALM]); + connect.fils_erp_next_seq_num = + nla_get_u16( + info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM]); + connect.fils_erp_rrk = + nla_data(info->attrs[NL80211_ATTR_FILS_ERP_RRK]); + connect.fils_erp_rrk_len = + nla_len(info->attrs[NL80211_ATTR_FILS_ERP_RRK]); + } else if (info->attrs[NL80211_ATTR_FILS_ERP_USERNAME] || + info->attrs[NL80211_ATTR_FILS_ERP_REALM] || + info->attrs[NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM] || + info->attrs[NL80211_ATTR_FILS_ERP_RRK]) { + kzfree(connkeys); + return -EINVAL; + } + wdev_lock(dev->ieee80211_ptr); err = cfg80211_connect(rdev, dev, &connect, connkeys, @@ -8986,14 +9140,28 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) memset(&pmksa, 0, sizeof(struct cfg80211_pmksa)); - if (!info->attrs[NL80211_ATTR_MAC]) - return -EINVAL; - if (!info->attrs[NL80211_ATTR_PMKID]) return -EINVAL; pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]); - pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + + if (info->attrs[NL80211_ATTR_MAC]) { + pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + } else if (info->attrs[NL80211_ATTR_SSID] && + info->attrs[NL80211_ATTR_FILS_CACHE_ID] && + (info->genlhdr->cmd == NL80211_CMD_DEL_PMKSA || + info->attrs[NL80211_ATTR_PMK])) { + pmksa.ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); + pmksa.ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); + pmksa.cache_id = + nla_data(info->attrs[NL80211_ATTR_FILS_CACHE_ID]); + } else { + return -EINVAL; + } + if (info->attrs[NL80211_ATTR_PMK]) { + pmksa.pmk = nla_data(info->attrs[NL80211_ATTR_PMK]); + pmksa.pmk_len = nla_len(info->attrs[NL80211_ATTR_PMK]); + } if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) @@ -9096,6 +9264,7 @@ static int nl80211_remain_on_channel(struct sk_buff *skb, struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev = info->user_ptr[1]; struct cfg80211_chan_def chandef; + const struct cfg80211_chan_def *compat_chandef; struct sk_buff *msg; void *hdr; u64 cookie; @@ -9124,6 +9293,18 @@ static int nl80211_remain_on_channel(struct sk_buff *skb, if (err) return err; + wdev_lock(wdev); + if (!cfg80211_off_channel_oper_allowed(wdev) && + !cfg80211_chandef_identical(&wdev->chandef, &chandef)) { + compat_chandef = cfg80211_chandef_compatible(&wdev->chandef, + &chandef); + if (compat_chandef != &chandef) { + wdev_unlock(wdev); + return -EBUSY; + } + } + wdev_unlock(wdev); + msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) return -ENOMEM; @@ -9299,6 +9480,13 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) if (!chandef.chan && params.offchan) return -EINVAL; + wdev_lock(wdev); + if (params.offchan && !cfg80211_off_channel_oper_allowed(wdev)) { + wdev_unlock(wdev); + return -EBUSY; + } + wdev_unlock(wdev); + params.buf = nla_data(info->attrs[NL80211_ATTR_FRAME]); params.len = nla_len(info->attrs[NL80211_ATTR_FRAME]); @@ -9466,7 +9654,7 @@ static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info) static const struct nla_policy nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] = { - [NL80211_ATTR_CQM_RSSI_THOLD] = 
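
The FILS ERP handling above is an all-or-nothing attribute group: username, realm, next sequence number and rRK must either all be present or all be absent, otherwise the connect request is rejected with -EINVAL. The same check, reduced to a reusable form (standalone sketch, not the patch's code):

#include <stdbool.h>
#include <stddef.h>

/* returns true when the group is consistent: all present or all absent */
static bool attr_group_ok(const void *const attrs[], int n)
{
	int present = 0;

	for (int i = 0; i < n; i++)
		present += attrs[i] != NULL;

	return present == 0 || present == n;
}
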
{ .type = NLA_U32 }, + [NL80211_ATTR_CQM_RSSI_THOLD] = { .type = NLA_BINARY }, [NL80211_ATTR_CQM_RSSI_HYST] = { .type = NLA_U32 }, [NL80211_ATTR_CQM_RSSI_THRESHOLD_EVENT] = { .type = NLA_U32 }, [NL80211_ATTR_CQM_TXE_RATE] = { .type = NLA_U32 }, @@ -9495,28 +9683,123 @@ static int nl80211_set_cqm_txe(struct genl_info *info, return rdev_set_cqm_txe_config(rdev, dev, rate, pkts, intvl); } +static int cfg80211_cqm_rssi_update(struct cfg80211_registered_device *rdev, + struct net_device *dev) +{ + struct wireless_dev *wdev = dev->ieee80211_ptr; + s32 last, low, high; + u32 hyst; + int i, n; + int err; + + /* RSSI reporting disabled? */ + if (!wdev->cqm_config) + return rdev_set_cqm_rssi_range_config(rdev, dev, 0, 0); + + /* + * Obtain current RSSI value if possible, if not and no RSSI threshold + * event has been received yet, we should receive an event after a + * connection is established and enough beacons received to calculate + * the average. + */ + if (!wdev->cqm_config->last_rssi_event_value && wdev->current_bss && + rdev->ops->get_station) { + struct station_info sinfo; + u8 *mac_addr; + + mac_addr = wdev->current_bss->pub.bssid; + + err = rdev_get_station(rdev, dev, mac_addr, &sinfo); + if (err) + return err; + + if (sinfo.filled & BIT(NL80211_STA_INFO_BEACON_SIGNAL_AVG)) + wdev->cqm_config->last_rssi_event_value = + (s8) sinfo.rx_beacon_signal_avg; + } + + last = wdev->cqm_config->last_rssi_event_value; + hyst = wdev->cqm_config->rssi_hyst; + n = wdev->cqm_config->n_rssi_thresholds; + + for (i = 0; i < n; i++) + if (last < wdev->cqm_config->rssi_thresholds[i]) + break; + + low = i > 0 ? + (wdev->cqm_config->rssi_thresholds[i - 1] - hyst) : S32_MIN; + high = i < n ? + (wdev->cqm_config->rssi_thresholds[i] + hyst - 1) : S32_MAX; + + return rdev_set_cqm_rssi_range_config(rdev, dev, low, high); +} + static int nl80211_set_cqm_rssi(struct genl_info *info, - s32 threshold, u32 hysteresis) + const s32 *thresholds, int n_thresholds, + u32 hysteresis) { struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; + int i, err; + s32 prev = S32_MIN; - if (threshold > 0) - return -EINVAL; - - /* disabling - hysteresis should also be zero then */ - if (threshold == 0) - hysteresis = 0; + /* Check all values negative and sorted */ + for (i = 0; i < n_thresholds; i++) { + if (thresholds[i] > 0 || thresholds[i] <= prev) + return -EINVAL; - if (!rdev->ops->set_cqm_rssi_config) - return -EOPNOTSUPP; + prev = thresholds[i]; + } if (wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT) return -EOPNOTSUPP; - return rdev_set_cqm_rssi_config(rdev, dev, threshold, hysteresis); + wdev_lock(wdev); + cfg80211_cqm_config_free(wdev); + wdev_unlock(wdev); + + if (n_thresholds <= 1 && rdev->ops->set_cqm_rssi_config) { + if (n_thresholds == 0 || thresholds[0] == 0) /* Disabling */ + return rdev_set_cqm_rssi_config(rdev, dev, 0, 0); + + return rdev_set_cqm_rssi_config(rdev, dev, + thresholds[0], hysteresis); + } + + if (!wiphy_ext_feature_isset(&rdev->wiphy, + NL80211_EXT_FEATURE_CQM_RSSI_LIST)) + return -EOPNOTSUPP; + + if (n_thresholds == 1 && thresholds[0] == 0) /* Disabling */ + n_thresholds = 0; + + wdev_lock(wdev); + if (n_thresholds) { + struct cfg80211_cqm_config *cqm_config; + + cqm_config = kzalloc(sizeof(struct cfg80211_cqm_config) + + n_thresholds * sizeof(s32), GFP_KERNEL); + if (!cqm_config) { + err = -ENOMEM; + goto unlock; + } + + cqm_config->rssi_hyst = hysteresis; + 
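
cfg80211_cqm_rssi_update() above converts the user's sorted threshold list into a single [low, high] window around the last reported level, which is what rdev_set_cqm_rssi_range_config() hands to the driver; the driver then only needs to signal crossings of that window. The selection logic in isolation, as a userspace harness (pick_rssi_range is an invented name):

#include <limits.h>
#include <stdio.h>

static void pick_rssi_range(const int *thresholds, int n, int last, int hyst,
			    int *low, int *high)
{
	int i;

	/* find the first threshold above the last reported level */
	for (i = 0; i < n; i++)
		if (last < thresholds[i])
			break;

	*low = i > 0 ? thresholds[i - 1] - hyst : INT_MIN;
	*high = i < n ? thresholds[i] + hyst - 1 : INT_MAX;
}

int main(void)
{
	const int thresholds[] = { -80, -70, -60 };	/* sorted, negative */
	int low, high;

	pick_rssi_range(thresholds, 3, -75, 2, &low, &high);
	printf("report when RSSI leaves [%d, %d]\n", low, high);	/* [-82, -69] */
	return 0;
}
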
cqm_config->n_rssi_thresholds = n_thresholds; + memcpy(cqm_config->rssi_thresholds, thresholds, + n_thresholds * sizeof(s32)); + + wdev->cqm_config = cqm_config; + } + + err = cfg80211_cqm_rssi_update(rdev, dev); + +unlock: + wdev_unlock(wdev); + + return err; } static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info) @@ -9530,16 +9813,22 @@ static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info) return -EINVAL; err = nla_parse_nested(attrs, NL80211_ATTR_CQM_MAX, cqm, - nl80211_attr_cqm_policy); + nl80211_attr_cqm_policy, info->extack); if (err) return err; if (attrs[NL80211_ATTR_CQM_RSSI_THOLD] && attrs[NL80211_ATTR_CQM_RSSI_HYST]) { - s32 threshold = nla_get_s32(attrs[NL80211_ATTR_CQM_RSSI_THOLD]); + const s32 *thresholds = + nla_data(attrs[NL80211_ATTR_CQM_RSSI_THOLD]); + int len = nla_len(attrs[NL80211_ATTR_CQM_RSSI_THOLD]); u32 hysteresis = nla_get_u32(attrs[NL80211_ATTR_CQM_RSSI_HYST]); - return nl80211_set_cqm_rssi(info, threshold, hysteresis); + if (len % 4) + return -EINVAL; + + return nl80211_set_cqm_rssi(info, thresholds, len / 4, + hysteresis); } if (attrs[NL80211_ATTR_CQM_TXE_RATE] && @@ -9940,7 +10229,7 @@ static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev, return -EINVAL; err = nla_parse_nested(tb, MAX_NL80211_WOWLAN_TCP, attr, - nl80211_wowlan_tcp_policy); + nl80211_wowlan_tcp_policy, NULL); if (err) return err; @@ -10085,7 +10374,8 @@ static int nl80211_parse_wowlan_nd(struct cfg80211_registered_device *rdev, goto out; } - err = nla_parse_nested(tb, NL80211_ATTR_MAX, attr, nl80211_policy); + err = nla_parse_nested(tb, NL80211_ATTR_MAX, attr, nl80211_policy, + NULL); if (err) goto out; @@ -10122,7 +10412,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(tb, MAX_NL80211_WOWLAN_TRIG, info->attrs[NL80211_ATTR_WOWLAN_TRIGGERS], - nl80211_wowlan_policy); + nl80211_wowlan_policy, info->extack); if (err) return err; @@ -10205,7 +10495,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) u8 *mask_pat; nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, - NULL); + NULL, info->extack); err = -EINVAL; if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) @@ -10416,7 +10706,7 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, struct nlattr *pat_tb[NUM_NL80211_PKTPAT]; err = nla_parse_nested(tb, NL80211_ATTR_COALESCE_RULE_MAX, rule, - nl80211_coalesce_policy); + nl80211_coalesce_policy, NULL); if (err) return err; @@ -10454,7 +10744,7 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, rem) { u8 *mask_pat; - nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, NULL); + nla_parse_nested(pat_tb, MAX_NL80211_PKTPAT, pat, NULL, NULL); if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) return -EINVAL; @@ -10575,7 +10865,7 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) err = nla_parse_nested(tb, MAX_NL80211_REKEY_DATA, info->attrs[NL80211_ATTR_REKEY_DATA], - nl80211_rekey_policy); + nl80211_rekey_policy, info->extack); if (err) return err; @@ -10892,7 +11182,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb, err = nla_parse_nested(tb, NL80211_NAN_FUNC_ATTR_MAX, info->attrs[NL80211_ATTR_NAN_FUNC], - nl80211_nan_func_policy); + nl80211_nan_func_policy, info->extack); if (err) return err; @@ -10989,7 +11279,7 @@ static int nl80211_nan_add_func(struct sk_buff *skb, err = nla_parse_nested(srf_tb, NL80211_NAN_SRF_ATTR_MAX, 
tb[NL80211_NAN_FUNC_SRF], - nl80211_nan_srf_policy); + nl80211_nan_srf_policy, info->extack); if (err) goto out; @@ -11524,8 +11814,8 @@ static int nl80211_prepare_vendor_dump(struct sk_buff *skb, return 0; } - err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, - attrbuf, nl80211_fam.maxattr, nl80211_policy); + err = nlmsg_parse(cb->nlh, GENL_HDRLEN + nl80211_fam.hdrsize, attrbuf, + nl80211_fam.maxattr, nl80211_policy, NULL); if (err) return err; @@ -12970,18 +13260,19 @@ static int nl80211_prep_scan_msg(struct sk_buff *msg, static int nl80211_prep_sched_scan_msg(struct sk_buff *msg, - struct cfg80211_registered_device *rdev, - struct net_device *netdev, - u32 portid, u32 seq, int flags, u32 cmd) + struct cfg80211_sched_scan_request *req, u32 cmd) { void *hdr; - hdr = nl80211hdr_put(msg, portid, seq, flags, cmd); + hdr = nl80211hdr_put(msg, 0, 0, 0, cmd); if (!hdr) return -1; - if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || - nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex)) + if (nla_put_u32(msg, NL80211_ATTR_WIPHY, + wiphy_to_rdev(req->wiphy)->wiphy_idx) || + nla_put_u32(msg, NL80211_ATTR_IFINDEX, req->dev->ifindex) || + nla_put_u64_64bit(msg, NL80211_ATTR_COOKIE, req->reqid, + NL80211_ATTR_PAD)) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -13041,8 +13332,7 @@ void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev, NL80211_MCGRP_SCAN, GFP_KERNEL); } -void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u32 cmd) +void nl80211_send_sched_scan(struct cfg80211_sched_scan_request *req, u32 cmd) { struct sk_buff *msg; @@ -13050,12 +13340,12 @@ void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, if (!msg) return; - if (nl80211_prep_sched_scan_msg(msg, rdev, netdev, 0, 0, 0, cmd) < 0) { + if (nl80211_prep_sched_scan_msg(msg, req, cmd) < 0) { nlmsg_free(msg); return; } - genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, + genlmsg_multicast_netns(&nl80211_fam, wiphy_net(req->wiphy), msg, 0, NL80211_MCGRP_SCAN, GFP_KERNEL); } @@ -13296,17 +13586,16 @@ void nl80211_send_assoc_timeout(struct cfg80211_registered_device *rdev, } void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid, - const u8 *req_ie, size_t req_ie_len, - const u8 *resp_ie, size_t resp_ie_len, - int status, - enum nl80211_timeout_reason timeout_reason, + struct net_device *netdev, + struct cfg80211_connect_resp_params *cr, gfp_t gfp) { struct sk_buff *msg; void *hdr; - msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp); + msg = nlmsg_new(100 + cr->req_ie_len + cr->resp_ie_len + + cr->fils_kek_len + cr->pmk_len + + (cr->pmkid ? WLAN_PMKID_LEN : 0), gfp); if (!msg) return; @@ -13318,17 +13607,31 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || - (bssid && nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) || + (cr->bssid && + nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, cr->bssid)) || nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, - status < 0 ? WLAN_STATUS_UNSPECIFIED_FAILURE : - status) || - (status < 0 && + cr->status < 0 ? 
WLAN_STATUS_UNSPECIFIED_FAILURE : + cr->status) || + (cr->status < 0 && (nla_put_flag(msg, NL80211_ATTR_TIMED_OUT) || - nla_put_u32(msg, NL80211_ATTR_TIMEOUT_REASON, timeout_reason))) || - (req_ie && - nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) || - (resp_ie && - nla_put(msg, NL80211_ATTR_RESP_IE, resp_ie_len, resp_ie))) + nla_put_u32(msg, NL80211_ATTR_TIMEOUT_REASON, + cr->timeout_reason))) || + (cr->req_ie && + nla_put(msg, NL80211_ATTR_REQ_IE, cr->req_ie_len, cr->req_ie)) || + (cr->resp_ie && + nla_put(msg, NL80211_ATTR_RESP_IE, cr->resp_ie_len, + cr->resp_ie)) || + (cr->update_erp_next_seq_num && + nla_put_u16(msg, NL80211_ATTR_FILS_ERP_NEXT_SEQ_NUM, + cr->fils_erp_next_seq_num)) || + (cr->status == WLAN_STATUS_SUCCESS && + ((cr->fils_kek && + nla_put(msg, NL80211_ATTR_FILS_KEK, cr->fils_kek_len, + cr->fils_kek)) || + (cr->pmk && + nla_put(msg, NL80211_ATTR_PMK, cr->pmk_len, cr->pmk)) || + (cr->pmkid && + nla_put(msg, NL80211_ATTR_PMKID, WLAN_PMKID_LEN, cr->pmkid))))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -13343,14 +13646,14 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, } void nl80211_send_roamed(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid, - const u8 *req_ie, size_t req_ie_len, - const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp) + struct net_device *netdev, + struct cfg80211_roam_info *info, gfp_t gfp) { struct sk_buff *msg; void *hdr; + const u8 *bssid = info->bss ? info->bss->bssid : info->bssid; - msg = nlmsg_new(100 + req_ie_len + resp_ie_len, gfp); + msg = nlmsg_new(100 + info->req_ie_len + info->resp_ie_len, gfp); if (!msg) return; @@ -13363,10 +13666,12 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, if (nla_put_u32(msg, NL80211_ATTR_WIPHY, rdev->wiphy_idx) || nla_put_u32(msg, NL80211_ATTR_IFINDEX, netdev->ifindex) || nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid) || - (req_ie && - nla_put(msg, NL80211_ATTR_REQ_IE, req_ie_len, req_ie)) || - (resp_ie && - nla_put(msg, NL80211_ATTR_RESP_IE, resp_ie_len, resp_ie))) + (info->req_ie && + nla_put(msg, NL80211_ATTR_REQ_IE, info->req_ie_len, + info->req_ie)) || + (info->resp_ie && + nla_put(msg, NL80211_ATTR_RESP_IE, info->resp_ie_len, + info->resp_ie))) goto nla_put_failure; genlmsg_end(msg, hdr); @@ -13968,6 +14273,8 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev, s32 rssi_level, gfp_t gfp) { struct sk_buff *msg; + struct wireless_dev *wdev = dev->ieee80211_ptr; + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); trace_cfg80211_cqm_rssi_notify(dev, rssi_event, rssi_level); @@ -13975,6 +14282,15 @@ void cfg80211_cqm_rssi_notify(struct net_device *dev, rssi_event != NL80211_CQM_RSSI_THRESHOLD_EVENT_HIGH)) return; + if (wdev->cqm_config) { + wdev->cqm_config->last_rssi_event_value = rssi_level; + + cfg80211_cqm_rssi_update(rdev, dev); + + if (rssi_level == 0) + rssi_level = wdev->cqm_config->last_rssi_event_value; + } + msg = cfg80211_prepare_cqm(dev, NULL, gfp); if (!msg) return; @@ -14619,26 +14935,26 @@ static int nl80211_netlink_notify(struct notifier_block * nb, rcu_read_lock(); list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) { - bool schedule_destroy_work = false; - struct cfg80211_sched_scan_request *sched_scan_req = - rcu_dereference(rdev->sched_scan_req); - - if (sched_scan_req && notify->portid && - sched_scan_req->owner_nlportid == notify->portid) { - sched_scan_req->owner_nlportid = 0; + struct cfg80211_sched_scan_request *sched_scan_req; - if (rdev->ops->sched_scan_stop 
&& - rdev->wiphy.flags & WIPHY_FLAG_SUPPORTS_SCHED_SCAN) + list_for_each_entry_rcu(sched_scan_req, + &rdev->sched_scan_req_list, + list) { + if (sched_scan_req->owner_nlportid == notify->portid) { + sched_scan_req->nl_owner_dead = true; schedule_work(&rdev->sched_scan_stop_wk); + } } list_for_each_entry_rcu(wdev, &rdev->wiphy.wdev_list, list) { cfg80211_mlme_unregister_socket(wdev, notify->portid); - if (wdev->owner_nlportid == notify->portid) - schedule_destroy_work = true; - else if (wdev->conn_owner_nlportid == notify->portid) + if (wdev->owner_nlportid == notify->portid) { + wdev->nl_owner_dead = true; + schedule_work(&rdev->destroy_work); + } else if (wdev->conn_owner_nlportid == notify->portid) { schedule_work(&wdev->disconnect_wk); + } } spin_lock_bh(&rdev->beacon_registrations_lock); @@ -14651,19 +14967,6 @@ static int nl80211_netlink_notify(struct notifier_block * nb, } } spin_unlock_bh(&rdev->beacon_registrations_lock); - - if (schedule_destroy_work) { - struct cfg80211_iface_destroy *destroy; - - destroy = kzalloc(sizeof(*destroy), GFP_ATOMIC); - if (destroy) { - destroy->nlportid = notify->portid; - spin_lock(&rdev->destroy_list_lock); - list_add(&destroy->list, &rdev->destroy_list); - spin_unlock(&rdev->destroy_list_lock); - schedule_work(&rdev->destroy_work); - } - } } rcu_read_unlock(); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index e488dca87423eb7c85fc80d408c3d45f91e9a97a..b96933322077c6238dd39c4d990941e85154e757 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -16,8 +16,7 @@ struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, bool aborted); void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev, struct sk_buff *msg); -void nl80211_send_sched_scan(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u32 cmd); +void nl80211_send_sched_scan(struct cfg80211_sched_scan_request *req, u32 cmd); void nl80211_common_reg_change_event(enum nl80211_commands cmd_id, struct regulatory_request *request); @@ -53,16 +52,12 @@ void nl80211_send_assoc_timeout(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *addr, gfp_t gfp); void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid, - const u8 *req_ie, size_t req_ie_len, - const u8 *resp_ie, size_t resp_ie_len, - int status, - enum nl80211_timeout_reason timeout_reason, + struct net_device *netdev, + struct cfg80211_connect_resp_params *params, gfp_t gfp); void nl80211_send_roamed(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid, - const u8 *req_ie, size_t req_ie_len, - const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp); + struct net_device *netdev, + struct cfg80211_roam_info *info, gfp_t gfp); void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct net_device *netdev, u16 reason, const u8 *ie, size_t ie_len, bool from_ap); diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h index 2f425075ada8eae81c8ecb04d975d0c9f0d36578..0598c1e5d0adbba6375beec2dcfb667fb4883192 100644 --- a/net/wireless/rdev-ops.h +++ b/net/wireless/rdev-ops.h @@ -36,13 +36,13 @@ static inline void rdev_set_wakeup(struct cfg80211_registered_device *rdev, static inline struct wireless_dev *rdev_add_virtual_intf(struct cfg80211_registered_device *rdev, char *name, unsigned char name_assign_type, - enum nl80211_iftype type, u32 *flags, + enum nl80211_iftype type, struct vif_params *params) { struct 
wireless_dev *ret; trace_rdev_add_virtual_intf(&rdev->wiphy, name, type); ret = rdev->ops->add_virtual_intf(&rdev->wiphy, name, name_assign_type, - type, flags, params); + type, params); trace_rdev_return_wdev(&rdev->wiphy, ret); return ret; } @@ -61,12 +61,11 @@ rdev_del_virtual_intf(struct cfg80211_registered_device *rdev, static inline int rdev_change_virtual_intf(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype type, - u32 *flags, struct vif_params *params) + struct vif_params *params) { int ret; trace_rdev_change_virtual_intf(&rdev->wiphy, dev, type); - ret = rdev->ops->change_virtual_intf(&rdev->wiphy, dev, type, flags, - params); + ret = rdev->ops->change_virtual_intf(&rdev->wiphy, dev, type, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } @@ -749,6 +748,18 @@ rdev_set_cqm_rssi_config(struct cfg80211_registered_device *rdev, return ret; } +static inline int +rdev_set_cqm_rssi_range_config(struct cfg80211_registered_device *rdev, + struct net_device *dev, s32 low, s32 high) +{ + int ret; + trace_rdev_set_cqm_rssi_range_config(&rdev->wiphy, dev, low, high); + ret = rdev->ops->set_cqm_rssi_range_config(&rdev->wiphy, dev, + low, high); + trace_rdev_return_int(&rdev->wiphy, ret); + return ret; +} + static inline int rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev, struct net_device *dev, u32 rate, u32 pkts, u32 intvl) @@ -802,18 +813,18 @@ rdev_sched_scan_start(struct cfg80211_registered_device *rdev, struct cfg80211_sched_scan_request *request) { int ret; - trace_rdev_sched_scan_start(&rdev->wiphy, dev, request); + trace_rdev_sched_scan_start(&rdev->wiphy, dev, request->reqid); ret = rdev->ops->sched_scan_start(&rdev->wiphy, dev, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_sched_scan_stop(struct cfg80211_registered_device *rdev, - struct net_device *dev) + struct net_device *dev, u64 reqid) { int ret; - trace_rdev_sched_scan_stop(&rdev->wiphy, dev); - ret = rdev->ops->sched_scan_stop(&rdev->wiphy, dev); + trace_rdev_sched_scan_stop(&rdev->wiphy, dev, reqid); + ret = rdev->ops->sched_scan_stop(&rdev->wiphy, dev, reqid); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 753efcd51fa3495c66e7063d3ce6c1aca7fd9d14..5fae296a6a583256f9874319dfaa9ee87e977fff 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -2067,6 +2067,88 @@ reg_process_hint_country_ie(struct wiphy *wiphy, return REG_REQ_IGNORE; } +bool reg_dfs_domain_same(struct wiphy *wiphy1, struct wiphy *wiphy2) +{ + const struct ieee80211_regdomain *wiphy1_regd = NULL; + const struct ieee80211_regdomain *wiphy2_regd = NULL; + const struct ieee80211_regdomain *cfg80211_regd = NULL; + bool dfs_domain_same; + + rcu_read_lock(); + + cfg80211_regd = rcu_dereference(cfg80211_regdomain); + wiphy1_regd = rcu_dereference(wiphy1->regd); + if (!wiphy1_regd) + wiphy1_regd = cfg80211_regd; + + wiphy2_regd = rcu_dereference(wiphy2->regd); + if (!wiphy2_regd) + wiphy2_regd = cfg80211_regd; + + dfs_domain_same = wiphy1_regd->dfs_region == wiphy2_regd->dfs_region; + + rcu_read_unlock(); + + return dfs_domain_same; +} + +static void reg_copy_dfs_chan_state(struct ieee80211_channel *dst_chan, + struct ieee80211_channel *src_chan) +{ + if (!(dst_chan->flags & IEEE80211_CHAN_RADAR) || + !(src_chan->flags & IEEE80211_CHAN_RADAR)) + return; + + if (dst_chan->flags & IEEE80211_CHAN_DISABLED || + src_chan->flags & IEEE80211_CHAN_DISABLED) + return; + + if (src_chan->center_freq 
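
With the reqid threaded through rdev_sched_scan_start() and rdev_sched_scan_stop() above, drivers are expected to keep the id from the start request and echo it back when reporting. A hypothetical driver-side sketch; my_priv and the saved field are invented, while cfg80211_sched_scan_results() with the u64 argument is the API this series introduces:

static void drv_report_sched_scan(struct my_priv *priv)
{
	/* reqid saved when the .sched_scan_start() op was called */
	cfg80211_sched_scan_results(priv->wiphy, priv->sched_scan_reqid);
}
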
== dst_chan->center_freq && + dst_chan->dfs_state == NL80211_DFS_USABLE) { + dst_chan->dfs_state = src_chan->dfs_state; + dst_chan->dfs_state_entered = src_chan->dfs_state_entered; + } +} + +static void wiphy_share_dfs_chan_state(struct wiphy *dst_wiphy, + struct wiphy *src_wiphy) +{ + struct ieee80211_supported_band *src_sband, *dst_sband; + struct ieee80211_channel *src_chan, *dst_chan; + int i, j, band; + + if (!reg_dfs_domain_same(dst_wiphy, src_wiphy)) + return; + + for (band = 0; band < NUM_NL80211_BANDS; band++) { + dst_sband = dst_wiphy->bands[band]; + src_sband = src_wiphy->bands[band]; + if (!dst_sband || !src_sband) + continue; + + for (i = 0; i < dst_sband->n_channels; i++) { + dst_chan = &dst_sband->channels[i]; + for (j = 0; j < src_sband->n_channels; j++) { + src_chan = &src_sband->channels[j]; + reg_copy_dfs_chan_state(dst_chan, src_chan); + } + } + } +} + +static void wiphy_all_share_dfs_chan_state(struct wiphy *wiphy) +{ + struct cfg80211_registered_device *rdev; + + ASSERT_RTNL(); + + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { + if (wiphy == &rdev->wiphy) + continue; + wiphy_share_dfs_chan_state(wiphy, &rdev->wiphy); + } +} + /* This processes *all* regulatory hints */ static void reg_process_hint(struct regulatory_request *reg_request) { @@ -2110,6 +2192,7 @@ static void reg_process_hint(struct regulatory_request *reg_request) if (treatment == REG_REQ_ALREADY_SET && wiphy && wiphy->regulatory_flags & REGULATORY_STRICT_REG) { wiphy_update_regulatory(wiphy, reg_request->initiator); + wiphy_all_share_dfs_chan_state(wiphy); reg_check_channels(); } @@ -3061,6 +3144,7 @@ void wiphy_regulatory_register(struct wiphy *wiphy) lr = get_last_request(); wiphy_update_regulatory(wiphy, lr->initiator); + wiphy_all_share_dfs_chan_state(wiphy); } void wiphy_regulatory_deregister(struct wiphy *wiphy) @@ -3120,6 +3204,67 @@ bool regulatory_indoor_allowed(void) return reg_is_indoor; } +bool regulatory_pre_cac_allowed(struct wiphy *wiphy) +{ + const struct ieee80211_regdomain *regd = NULL; + const struct ieee80211_regdomain *wiphy_regd = NULL; + bool pre_cac_allowed = false; + + rcu_read_lock(); + + regd = rcu_dereference(cfg80211_regdomain); + wiphy_regd = rcu_dereference(wiphy->regd); + if (!wiphy_regd) { + if (regd->dfs_region == NL80211_DFS_ETSI) + pre_cac_allowed = true; + + rcu_read_unlock(); + + return pre_cac_allowed; + } + + if (regd->dfs_region == wiphy_regd->dfs_region && + wiphy_regd->dfs_region == NL80211_DFS_ETSI) + pre_cac_allowed = true; + + rcu_read_unlock(); + + return pre_cac_allowed; +} + +void regulatory_propagate_dfs_state(struct wiphy *wiphy, + struct cfg80211_chan_def *chandef, + enum nl80211_dfs_state dfs_state, + enum nl80211_radar_event event) +{ + struct cfg80211_registered_device *rdev; + + ASSERT_RTNL(); + + if (WARN_ON(!cfg80211_chandef_valid(chandef))) + return; + + list_for_each_entry(rdev, &cfg80211_rdev_list, list) { + if (wiphy == &rdev->wiphy) + continue; + + if (!reg_dfs_domain_same(wiphy, &rdev->wiphy)) + continue; + + if (!ieee80211_get_channel(&rdev->wiphy, + chandef->chan->center_freq)) + continue; + + cfg80211_set_dfs_state(&rdev->wiphy, chandef, dfs_state); + + if (event == NL80211_RADAR_DETECTED || + event == NL80211_RADAR_CAC_FINISHED) + cfg80211_sched_dfs_chan_update(rdev); + + nl80211_radar_notify(rdev, chandef, event, NULL, GFP_KERNEL); + } +} + int __init regulatory_init(void) { int err = 0; diff --git a/net/wireless/reg.h b/net/wireless/reg.h index f6ced316b5a49e204632ef42675309614237cc54..ca7fedf2e7a16e2a39ec7b0d3c988154d0e76d79 
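
regulatory_pre_cac_allowed(), added above, reduces to a comparison of DFS regions with a fallback to the global regdomain when the wiphy has none of its own. A standalone model under those assumptions (enum and struct names invented):

#include <stdbool.h>
#include <stddef.h>

enum dfs_region { DFS_UNSET, DFS_FCC, DFS_ETSI, DFS_JP };

struct regdom { enum dfs_region dfs_region; };

static bool pre_cac_allowed(const struct regdom *global,
			    const struct regdom *wiphy_regd)
{
	if (!wiphy_regd)		/* wiphy follows the global domain */
		return global->dfs_region == DFS_ETSI;

	return global->dfs_region == wiphy_regd->dfs_region &&
	       wiphy_regd->dfs_region == DFS_ETSI;
}
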
100644
--- a/net/wireless/reg.h
+++ b/net/wireless/reg.h
@@ -143,4 +143,40 @@ int cfg80211_get_unii(int freq);
  */
 bool regulatory_indoor_allowed(void);
 
+/*
+ * Grace period to timeout pre-CAC results on the DFS channels. This timeout
+ * value is used for non-ETSI domains.
+ * TODO: Maybe make this timeout available through regdb?
+ */
+#define REG_PRE_CAC_EXPIRY_GRACE_MS 2000
+
+/**
+ * regulatory_pre_cac_allowed - check if pre-CAC is allowed in the current dfs domain
+ * @wiphy: wiphy for which pre-CAC capability is checked.
+ *
+ * Pre-CAC is allowed only in ETSI domain.
+ */
+bool regulatory_pre_cac_allowed(struct wiphy *wiphy);
+
+/**
+ * regulatory_propagate_dfs_state - Propagate DFS channel state to other wiphys
+ * @wiphy: wiphy on which radar is detected and the event will be propagated
+ *	to other available wiphys having the same DFS domain
+ * @chandef: Channel definition of radar detected channel
+ * @dfs_state: DFS channel state to be set
+ * @event: Type of radar event which triggered this DFS state change
+ *
+ * This function should be called with rtnl lock held.
+ */
+void regulatory_propagate_dfs_state(struct wiphy *wiphy,
+				    struct cfg80211_chan_def *chandef,
+				    enum nl80211_dfs_state dfs_state,
+				    enum nl80211_radar_event event);
+
+/**
+ * reg_dfs_domain_same - Checks if both wiphys have the same DFS domain configured
+ * @wiphy1: wiphy whose dfs_region is to be checked against that of wiphy2
+ * @wiphy2: wiphy whose dfs_region is to be checked against that of wiphy1
+ */
+bool reg_dfs_domain_same(struct wiphy *wiphy1, struct wiphy *wiphy2);
 #endif /* __NET_WIRELESS_REG_H */
diff --git a/net/wireless/scan.c b/net/wireless/scan.c
index 21be56b3128ee74c4119ee67d5f0b85fecfe0b2e..14d5f0c8c45ff07224f2dc8e6310c2edc440874e 100644
--- a/net/wireless/scan.c
+++ b/net/wireless/scan.c
@@ -300,93 +300,168 @@ void cfg80211_scan_done(struct cfg80211_scan_request *request,
 }
 EXPORT_SYMBOL(cfg80211_scan_done);
 
-void __cfg80211_sched_scan_results(struct work_struct *wk)
+void cfg80211_add_sched_scan_req(struct cfg80211_registered_device *rdev,
+				 struct cfg80211_sched_scan_request *req)
 {
-	struct cfg80211_registered_device *rdev;
-	struct cfg80211_sched_scan_request *request;
+	ASSERT_RTNL();
 
-	rdev = container_of(wk, struct cfg80211_registered_device,
-			    sched_scan_results_wk);
+	list_add_rcu(&req->list, &rdev->sched_scan_req_list);
+}
 
-	rtnl_lock();
+static void cfg80211_del_sched_scan_req(struct cfg80211_registered_device *rdev,
+					struct cfg80211_sched_scan_request *req)
+{
+	ASSERT_RTNL();
 
-	request = rtnl_dereference(rdev->sched_scan_req);
+	list_del_rcu(&req->list);
+	kfree_rcu(req, rcu_head);
+}
 
-	/* we don't have sched_scan_req anymore if the scan is stopping */
-	if (request) {
-		if (request->flags & NL80211_SCAN_FLAG_FLUSH) {
-			/* flush entries from previous scans */
-			spin_lock_bh(&rdev->bss_lock);
-			__cfg80211_bss_expire(rdev, request->scan_start);
-			spin_unlock_bh(&rdev->bss_lock);
-			request->scan_start = jiffies;
-		}
-		nl80211_send_sched_scan(rdev, request->dev,
-					NL80211_CMD_SCHED_SCAN_RESULTS);
+static struct cfg80211_sched_scan_request *
+cfg80211_find_sched_scan_req(struct cfg80211_registered_device *rdev, u64 reqid)
+{
+	struct cfg80211_sched_scan_request *pos;
+
+	ASSERT_RTNL();
+
+	list_for_each_entry(pos, &rdev->sched_scan_req_list, list) {
+		if (pos->reqid == reqid)
+			return pos;
 	}
+	return NULL;
+}
+
+/*
+ * Determines if a scheduled scan request can be handled.
When a legacy + * scheduled scan is running no other scheduled scan is allowed regardless + * whether the request is for legacy or multi-support scan. When a multi-support + * scheduled scan is running a request for legacy scan is not allowed. In this + * case a request for multi-support scan can be handled if resources are + * available, ie. struct wiphy::max_sched_scan_reqs limit is not yet reached. + */ +int cfg80211_sched_scan_req_possible(struct cfg80211_registered_device *rdev, + bool want_multi) +{ + struct cfg80211_sched_scan_request *pos; + int i = 0; + + list_for_each_entry(pos, &rdev->sched_scan_req_list, list) { + /* request id zero means legacy in progress */ + if (!i && !pos->reqid) + return -EINPROGRESS; + i++; + } + + if (i) { + /* no legacy allowed when multi request(s) are active */ + if (!want_multi) + return -EINPROGRESS; + + /* resource limit reached */ + if (i == rdev->wiphy.max_sched_scan_reqs) + return -ENOSPC; + } + return 0; +} + +void cfg80211_sched_scan_results_wk(struct work_struct *work) +{ + struct cfg80211_registered_device *rdev; + struct cfg80211_sched_scan_request *req, *tmp; + rdev = container_of(work, struct cfg80211_registered_device, + sched_scan_res_wk); + + rtnl_lock(); + list_for_each_entry_safe(req, tmp, &rdev->sched_scan_req_list, list) { + if (req->report_results) { + req->report_results = false; + if (req->flags & NL80211_SCAN_FLAG_FLUSH) { + /* flush entries from previous scans */ + spin_lock_bh(&rdev->bss_lock); + __cfg80211_bss_expire(rdev, req->scan_start); + spin_unlock_bh(&rdev->bss_lock); + req->scan_start = jiffies; + } + nl80211_send_sched_scan(req, + NL80211_CMD_SCHED_SCAN_RESULTS); + } + } rtnl_unlock(); } -void cfg80211_sched_scan_results(struct wiphy *wiphy) +void cfg80211_sched_scan_results(struct wiphy *wiphy, u64 reqid) { - trace_cfg80211_sched_scan_results(wiphy); + struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); + struct cfg80211_sched_scan_request *request; + + trace_cfg80211_sched_scan_results(wiphy, reqid); /* ignore if we're not scanning */ - if (rcu_access_pointer(wiphy_to_rdev(wiphy)->sched_scan_req)) - queue_work(cfg80211_wq, - &wiphy_to_rdev(wiphy)->sched_scan_results_wk); + rtnl_lock(); + request = cfg80211_find_sched_scan_req(rdev, reqid); + if (request) { + request->report_results = true; + queue_work(cfg80211_wq, &rdev->sched_scan_res_wk); + } + rtnl_unlock(); } EXPORT_SYMBOL(cfg80211_sched_scan_results); -void cfg80211_sched_scan_stopped_rtnl(struct wiphy *wiphy) +void cfg80211_sched_scan_stopped_rtnl(struct wiphy *wiphy, u64 reqid) { struct cfg80211_registered_device *rdev = wiphy_to_rdev(wiphy); ASSERT_RTNL(); - trace_cfg80211_sched_scan_stopped(wiphy); + trace_cfg80211_sched_scan_stopped(wiphy, reqid); - __cfg80211_stop_sched_scan(rdev, true); + __cfg80211_stop_sched_scan(rdev, reqid, true); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped_rtnl); -void cfg80211_sched_scan_stopped(struct wiphy *wiphy) +void cfg80211_sched_scan_stopped(struct wiphy *wiphy, u64 reqid) { rtnl_lock(); - cfg80211_sched_scan_stopped_rtnl(wiphy); + cfg80211_sched_scan_stopped_rtnl(wiphy, reqid); rtnl_unlock(); } EXPORT_SYMBOL(cfg80211_sched_scan_stopped); -int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, - bool driver_initiated) +int cfg80211_stop_sched_scan_req(struct cfg80211_registered_device *rdev, + struct cfg80211_sched_scan_request *req, + bool driver_initiated) { - struct cfg80211_sched_scan_request *sched_scan_req; - struct net_device *dev; - ASSERT_RTNL(); - if (!rdev->sched_scan_req) - 
return -ENOENT; - - sched_scan_req = rtnl_dereference(rdev->sched_scan_req); - dev = sched_scan_req->dev; - if (!driver_initiated) { - int err = rdev_sched_scan_stop(rdev, dev); + int err = rdev_sched_scan_stop(rdev, req->dev, req->reqid); if (err) return err; } - nl80211_send_sched_scan(rdev, dev, NL80211_CMD_SCHED_SCAN_STOPPED); + nl80211_send_sched_scan(req, NL80211_CMD_SCHED_SCAN_STOPPED); - RCU_INIT_POINTER(rdev->sched_scan_req, NULL); - kfree_rcu(sched_scan_req, rcu_head); + cfg80211_del_sched_scan_req(rdev, req); return 0; } +int __cfg80211_stop_sched_scan(struct cfg80211_registered_device *rdev, + u64 reqid, bool driver_initiated) +{ + struct cfg80211_sched_scan_request *sched_scan_req; + + ASSERT_RTNL(); + + sched_scan_req = cfg80211_find_sched_scan_req(rdev, reqid); + if (!sched_scan_req) + return -ENOENT; + + return cfg80211_stop_sched_scan_req(rdev, sched_scan_req, + driver_initiated); +} + void cfg80211_bss_age(struct cfg80211_registered_device *rdev, unsigned long age_secs) { diff --git a/net/wireless/sme.c b/net/wireless/sme.c index b347e63d7aaa6814f524d3b080a005a88966d65d..532a0007ce82a7380536e32d5be3749e58bb4fc7 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -5,6 +5,7 @@ * * Copyright 2009 Johannes Berg * Copyright (C) 2009 Intel Corporation. All rights reserved. + * Copyright 2017 Intel Deutschland GmbH */ #include @@ -253,10 +254,13 @@ void cfg80211_conn_work(struct work_struct *work) } treason = NL80211_TIMEOUT_UNSPECIFIED; if (cfg80211_conn_do_work(wdev, &treason)) { - __cfg80211_connect_result( - wdev->netdev, bssid, - NULL, 0, NULL, 0, -1, false, NULL, - treason); + struct cfg80211_connect_resp_params cr; + + memset(&cr, 0, sizeof(cr)); + cr.status = -1; + cr.bssid = bssid; + cr.timeout_reason = treason; + __cfg80211_connect_result(wdev->netdev, &cr, false); } wdev_unlock(wdev); } @@ -359,10 +363,13 @@ void cfg80211_sme_rx_auth(struct wireless_dev *wdev, const u8 *buf, size_t len) wdev->conn->state = CFG80211_CONN_AUTHENTICATE_NEXT; schedule_work(&rdev->conn_work); } else if (status_code != WLAN_STATUS_SUCCESS) { - __cfg80211_connect_result(wdev->netdev, mgmt->bssid, - NULL, 0, NULL, 0, - status_code, false, NULL, - NL80211_TIMEOUT_UNSPECIFIED); + struct cfg80211_connect_resp_params cr; + + memset(&cr, 0, sizeof(cr)); + cr.status = status_code; + cr.bssid = mgmt->bssid; + cr.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED; + __cfg80211_connect_result(wdev->netdev, &cr, false); } else if (wdev->conn->state == CFG80211_CONN_AUTHENTICATING) { wdev->conn->state = CFG80211_CONN_ASSOCIATE_NEXT; schedule_work(&rdev->conn_work); @@ -669,12 +676,9 @@ static DECLARE_WORK(cfg80211_disconnect_work, disconnect_work); */ /* This method must consume bss one way or another */ -void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, - const u8 *req_ie, size_t req_ie_len, - const u8 *resp_ie, size_t resp_ie_len, - int status, bool wextev, - struct cfg80211_bss *bss, - enum nl80211_timeout_reason timeout_reason) +void __cfg80211_connect_result(struct net_device *dev, + struct cfg80211_connect_resp_params *cr, + bool wextev) { struct wireless_dev *wdev = dev->ieee80211_ptr; const u8 *country_ie; @@ -686,48 +690,48 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) { - cfg80211_put_bss(wdev->wiphy, bss); + cfg80211_put_bss(wdev->wiphy, cr->bss); return; } - nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, - bssid, req_ie, 
req_ie_len, - resp_ie, resp_ie_len, - status, timeout_reason, GFP_KERNEL); + nl80211_send_connect_result(wiphy_to_rdev(wdev->wiphy), dev, cr, + GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT if (wextev) { - if (req_ie && status == WLAN_STATUS_SUCCESS) { + if (cr->req_ie && cr->status == WLAN_STATUS_SUCCESS) { memset(&wrqu, 0, sizeof(wrqu)); - wrqu.data.length = req_ie_len; - wireless_send_event(dev, IWEVASSOCREQIE, &wrqu, req_ie); + wrqu.data.length = cr->req_ie_len; + wireless_send_event(dev, IWEVASSOCREQIE, &wrqu, + cr->req_ie); } - if (resp_ie && status == WLAN_STATUS_SUCCESS) { + if (cr->resp_ie && cr->status == WLAN_STATUS_SUCCESS) { memset(&wrqu, 0, sizeof(wrqu)); - wrqu.data.length = resp_ie_len; - wireless_send_event(dev, IWEVASSOCRESPIE, &wrqu, resp_ie); + wrqu.data.length = cr->resp_ie_len; + wireless_send_event(dev, IWEVASSOCRESPIE, &wrqu, + cr->resp_ie); } memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; - if (bssid && status == WLAN_STATUS_SUCCESS) { - memcpy(wrqu.ap_addr.sa_data, bssid, ETH_ALEN); - memcpy(wdev->wext.prev_bssid, bssid, ETH_ALEN); + if (cr->bssid && cr->status == WLAN_STATUS_SUCCESS) { + memcpy(wrqu.ap_addr.sa_data, cr->bssid, ETH_ALEN); + memcpy(wdev->wext.prev_bssid, cr->bssid, ETH_ALEN); wdev->wext.prev_bssid_valid = true; } wireless_send_event(dev, SIOCGIWAP, &wrqu, NULL); } #endif - if (!bss && (status == WLAN_STATUS_SUCCESS)) { + if (!cr->bss && (cr->status == WLAN_STATUS_SUCCESS)) { WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect); - bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid, - wdev->ssid, wdev->ssid_len, - wdev->conn_bss_type, - IEEE80211_PRIVACY_ANY); - if (bss) - cfg80211_hold_bss(bss_from_pub(bss)); + cr->bss = cfg80211_get_bss(wdev->wiphy, NULL, cr->bssid, + wdev->ssid, wdev->ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY_ANY); + if (cr->bss) + cfg80211_hold_bss(bss_from_pub(cr->bss)); } if (wdev->current_bss) { @@ -736,29 +740,29 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, wdev->current_bss = NULL; } - if (status != WLAN_STATUS_SUCCESS) { + if (cr->status != WLAN_STATUS_SUCCESS) { kzfree(wdev->connect_keys); wdev->connect_keys = NULL; wdev->ssid_len = 0; wdev->conn_owner_nlportid = 0; - if (bss) { - cfg80211_unhold_bss(bss_from_pub(bss)); - cfg80211_put_bss(wdev->wiphy, bss); + if (cr->bss) { + cfg80211_unhold_bss(bss_from_pub(cr->bss)); + cfg80211_put_bss(wdev->wiphy, cr->bss); } cfg80211_sme_free(wdev); return; } - if (WARN_ON(!bss)) + if (WARN_ON(!cr->bss)) return; - wdev->current_bss = bss_from_pub(bss); + wdev->current_bss = bss_from_pub(cr->bss); if (!(wdev->wiphy->flags & WIPHY_FLAG_HAS_STATIC_WEP)) cfg80211_upload_connect_keys(wdev); rcu_read_lock(); - country_ie = ieee80211_bss_get_ie(bss, WLAN_EID_COUNTRY); + country_ie = ieee80211_bss_get_ie(cr->bss, WLAN_EID_COUNTRY); if (!country_ie) { rcu_read_unlock(); return; @@ -775,70 +779,99 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, * - country_ie + 2, the start of the country ie data, and * - and country_ie[1] which is the IE length */ - regulatory_hint_country_ie(wdev->wiphy, bss->channel->band, + regulatory_hint_country_ie(wdev->wiphy, cr->bss->channel->band, country_ie + 2, country_ie[1]); kfree(country_ie); } /* Consumes bss object one way or another */ -void cfg80211_connect_bss(struct net_device *dev, const u8 *bssid, - struct cfg80211_bss *bss, const u8 *req_ie, - size_t req_ie_len, const u8 *resp_ie, - size_t resp_ie_len, int status, gfp_t gfp, - enum nl80211_timeout_reason timeout_reason) 
+void cfg80211_connect_done(struct net_device *dev, + struct cfg80211_connect_resp_params *params, + gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; + u8 *next; - if (bss) { + if (params->bss) { /* Make sure the bss entry provided by the driver is valid. */ - struct cfg80211_internal_bss *ibss = bss_from_pub(bss); + struct cfg80211_internal_bss *ibss = bss_from_pub(params->bss); if (WARN_ON(list_empty(&ibss->list))) { - cfg80211_put_bss(wdev->wiphy, bss); + cfg80211_put_bss(wdev->wiphy, params->bss); return; } } - ev = kzalloc(sizeof(*ev) + req_ie_len + resp_ie_len, gfp); + ev = kzalloc(sizeof(*ev) + (params->bssid ? ETH_ALEN : 0) + + params->req_ie_len + params->resp_ie_len + + params->fils_kek_len + params->pmk_len + + (params->pmkid ? WLAN_PMKID_LEN : 0), gfp); if (!ev) { - cfg80211_put_bss(wdev->wiphy, bss); + cfg80211_put_bss(wdev->wiphy, params->bss); return; } ev->type = EVENT_CONNECT_RESULT; - if (bssid) - memcpy(ev->cr.bssid, bssid, ETH_ALEN); - if (req_ie_len) { - ev->cr.req_ie = ((u8 *)ev) + sizeof(*ev); - ev->cr.req_ie_len = req_ie_len; - memcpy((void *)ev->cr.req_ie, req_ie, req_ie_len); + next = ((u8 *)ev) + sizeof(*ev); + if (params->bssid) { + ev->cr.bssid = next; + memcpy((void *)ev->cr.bssid, params->bssid, ETH_ALEN); + next += ETH_ALEN; } - if (resp_ie_len) { - ev->cr.resp_ie = ((u8 *)ev) + sizeof(*ev) + req_ie_len; - ev->cr.resp_ie_len = resp_ie_len; - memcpy((void *)ev->cr.resp_ie, resp_ie, resp_ie_len); + if (params->req_ie_len) { + ev->cr.req_ie = next; + ev->cr.req_ie_len = params->req_ie_len; + memcpy((void *)ev->cr.req_ie, params->req_ie, + params->req_ie_len); + next += params->req_ie_len; } - if (bss) - cfg80211_hold_bss(bss_from_pub(bss)); - ev->cr.bss = bss; - ev->cr.status = status; - ev->cr.timeout_reason = timeout_reason; + if (params->resp_ie_len) { + ev->cr.resp_ie = next; + ev->cr.resp_ie_len = params->resp_ie_len; + memcpy((void *)ev->cr.resp_ie, params->resp_ie, + params->resp_ie_len); + next += params->resp_ie_len; + } + if (params->fils_kek_len) { + ev->cr.fils_kek = next; + ev->cr.fils_kek_len = params->fils_kek_len; + memcpy((void *)ev->cr.fils_kek, params->fils_kek, + params->fils_kek_len); + next += params->fils_kek_len; + } + if (params->pmk_len) { + ev->cr.pmk = next; + ev->cr.pmk_len = params->pmk_len; + memcpy((void *)ev->cr.pmk, params->pmk, params->pmk_len); + next += params->pmk_len; + } + if (params->pmkid) { + ev->cr.pmkid = next; + memcpy((void *)ev->cr.pmkid, params->pmkid, WLAN_PMKID_LEN); + next += WLAN_PMKID_LEN; + } + ev->cr.update_erp_next_seq_num = params->update_erp_next_seq_num; + if (params->update_erp_next_seq_num) + ev->cr.fils_erp_next_seq_num = params->fils_erp_next_seq_num; + if (params->bss) + cfg80211_hold_bss(bss_from_pub(params->bss)); + ev->cr.bss = params->bss; + ev->cr.status = params->status; + ev->cr.timeout_reason = params->timeout_reason; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } -EXPORT_SYMBOL(cfg80211_connect_bss); +EXPORT_SYMBOL(cfg80211_connect_done); /* Consumes bss object one way or another */ void __cfg80211_roamed(struct wireless_dev *wdev, - struct cfg80211_bss *bss, - const u8 *req_ie, size_t req_ie_len, - const u8 *resp_ie, size_t resp_ie_len) + struct cfg80211_roam_info *info) { #ifdef CONFIG_CFG80211_WEXT union iwreq_data wrqu; 
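The switch from the old long positional argument list to struct cfg80211_connect_resp_params means new fields (such as the FILS material above) can be added without touching every caller. A driver-side sketch of reporting a connection result through the new entry point; the wrapper itself is hypothetical:

static void drv_report_connect(struct net_device *dev, const u8 *bssid,
			       const u8 *req_ie, size_t req_ie_len,
			       const u8 *resp_ie, size_t resp_ie_len,
			       u16 status)
{
	struct cfg80211_connect_resp_params params;

	memset(&params, 0, sizeof(params));
	params.status = status;
	params.bssid = bssid;
	params.req_ie = req_ie;
	params.req_ie_len = req_ie_len;
	params.resp_ie = resp_ie;
	params.resp_ie_len = resp_ie_len;
	params.timeout_reason = NL80211_TIMEOUT_UNSPECIFIED;

	cfg80211_connect_done(dev, &params, GFP_KERNEL);
}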
@@ -856,97 +889,84 @@ void __cfg80211_roamed(struct wireless_dev *wdev, cfg80211_put_bss(wdev->wiphy, &wdev->current_bss->pub); wdev->current_bss = NULL; - cfg80211_hold_bss(bss_from_pub(bss)); - wdev->current_bss = bss_from_pub(bss); + if (WARN_ON(!info->bss)) + return; + + cfg80211_hold_bss(bss_from_pub(info->bss)); + wdev->current_bss = bss_from_pub(info->bss); nl80211_send_roamed(wiphy_to_rdev(wdev->wiphy), - wdev->netdev, bss->bssid, - req_ie, req_ie_len, resp_ie, resp_ie_len, - GFP_KERNEL); + wdev->netdev, info, GFP_KERNEL); #ifdef CONFIG_CFG80211_WEXT - if (req_ie) { + if (info->req_ie) { memset(&wrqu, 0, sizeof(wrqu)); - wrqu.data.length = req_ie_len; + wrqu.data.length = info->req_ie_len; wireless_send_event(wdev->netdev, IWEVASSOCREQIE, - &wrqu, req_ie); + &wrqu, info->req_ie); } - if (resp_ie) { + if (info->resp_ie) { memset(&wrqu, 0, sizeof(wrqu)); - wrqu.data.length = resp_ie_len; + wrqu.data.length = info->resp_ie_len; wireless_send_event(wdev->netdev, IWEVASSOCRESPIE, - &wrqu, resp_ie); + &wrqu, info->resp_ie); } memset(&wrqu, 0, sizeof(wrqu)); wrqu.ap_addr.sa_family = ARPHRD_ETHER; - memcpy(wrqu.ap_addr.sa_data, bss->bssid, ETH_ALEN); - memcpy(wdev->wext.prev_bssid, bss->bssid, ETH_ALEN); + memcpy(wrqu.ap_addr.sa_data, info->bss->bssid, ETH_ALEN); + memcpy(wdev->wext.prev_bssid, info->bss->bssid, ETH_ALEN); wdev->wext.prev_bssid_valid = true; wireless_send_event(wdev->netdev, SIOCGIWAP, &wrqu, NULL); #endif return; out: - cfg80211_put_bss(wdev->wiphy, bss); -} - -void cfg80211_roamed(struct net_device *dev, - struct ieee80211_channel *channel, - const u8 *bssid, - const u8 *req_ie, size_t req_ie_len, - const u8 *resp_ie, size_t resp_ie_len, gfp_t gfp) -{ - struct wireless_dev *wdev = dev->ieee80211_ptr; - struct cfg80211_bss *bss; - - bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, wdev->ssid, - wdev->ssid_len, - wdev->conn_bss_type, IEEE80211_PRIVACY_ANY); - if (WARN_ON(!bss)) - return; - - cfg80211_roamed_bss(dev, bss, req_ie, req_ie_len, resp_ie, - resp_ie_len, gfp); + cfg80211_put_bss(wdev->wiphy, info->bss); } -EXPORT_SYMBOL(cfg80211_roamed); -/* Consumes bss object one way or another */ -void cfg80211_roamed_bss(struct net_device *dev, - struct cfg80211_bss *bss, const u8 *req_ie, - size_t req_ie_len, const u8 *resp_ie, - size_t resp_ie_len, gfp_t gfp) +/* Consumes info->bss object one way or another */ +void cfg80211_roamed(struct net_device *dev, struct cfg80211_roam_info *info, + gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); struct cfg80211_event *ev; unsigned long flags; - if (WARN_ON(!bss)) + if (!info->bss) { + info->bss = cfg80211_get_bss(wdev->wiphy, info->channel, + info->bssid, wdev->ssid, + wdev->ssid_len, + wdev->conn_bss_type, + IEEE80211_PRIVACY_ANY); + } + + if (WARN_ON(!info->bss)) return; - ev = kzalloc(sizeof(*ev) + req_ie_len + resp_ie_len, gfp); + ev = kzalloc(sizeof(*ev) + info->req_ie_len + info->resp_ie_len, gfp); if (!ev) { - cfg80211_put_bss(wdev->wiphy, bss); + cfg80211_put_bss(wdev->wiphy, info->bss); return; } ev->type = EVENT_ROAMED; ev->rm.req_ie = ((u8 *)ev) + sizeof(*ev); - ev->rm.req_ie_len = req_ie_len; - memcpy((void *)ev->rm.req_ie, req_ie, req_ie_len); - ev->rm.resp_ie = ((u8 *)ev) + sizeof(*ev) + req_ie_len; - ev->rm.resp_ie_len = resp_ie_len; - memcpy((void *)ev->rm.resp_ie, resp_ie, resp_ie_len); - ev->rm.bss = bss; + ev->rm.req_ie_len = info->req_ie_len; + memcpy((void *)ev->rm.req_ie, info->req_ie, info->req_ie_len); + ev->rm.resp_ie = 
((u8 *)ev) + sizeof(*ev) + info->req_ie_len; + ev->rm.resp_ie_len = info->resp_ie_len; + memcpy((void *)ev->rm.resp_ie, info->resp_ie, info->resp_ie_len); + ev->rm.bss = info->bss; spin_lock_irqsave(&wdev->event_lock, flags); list_add_tail(&ev->list, &wdev->event_list); spin_unlock_irqrestore(&wdev->event_lock, flags); queue_work(cfg80211_wq, &rdev->event_work); } -EXPORT_SYMBOL(cfg80211_roamed_bss); +EXPORT_SYMBOL(cfg80211_roamed); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap) diff --git a/net/wireless/trace.h b/net/wireless/trace.h index 776e80cef9b4ee2761681f3884427a8343e6050a..ca8b2059f92c43c5d4d3bac24f39fd141cd31141 100644 --- a/net/wireless/trace.h +++ b/net/wireless/trace.h @@ -576,11 +576,6 @@ DEFINE_EVENT(wiphy_netdev_evt, rdev_stop_ap, TP_ARGS(wiphy, netdev) ); -DEFINE_EVENT(wiphy_netdev_evt, rdev_sched_scan_stop, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), - TP_ARGS(wiphy, netdev) -); - DEFINE_EVENT(wiphy_netdev_evt, rdev_set_rekey_data, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) @@ -1322,6 +1317,28 @@ TRACE_EVENT(rdev_set_cqm_rssi_config, __entry->rssi_thold, __entry->rssi_hyst) ); +TRACE_EVENT(rdev_set_cqm_rssi_range_config, + TP_PROTO(struct wiphy *wiphy, + struct net_device *netdev, s32 low, s32 high), + TP_ARGS(wiphy, netdev, low, high), + TP_STRUCT__entry( + WIPHY_ENTRY + NETDEV_ENTRY + __field(s32, rssi_low) + __field(s32, rssi_high) + ), + TP_fast_assign( + WIPHY_ASSIGN; + NETDEV_ASSIGN; + __entry->rssi_low = low; + __entry->rssi_high = high; + ), + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT + ", range: %d - %d ", + WIPHY_PR_ARG, NETDEV_PR_ARG, + __entry->rssi_low, __entry->rssi_high) +); + TRACE_EVENT(rdev_set_cqm_txe_config, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u32 rate, u32 pkts, u32 intvl), @@ -1588,20 +1605,31 @@ DEFINE_EVENT(tx_rx_evt, rdev_set_antenna, TP_ARGS(wiphy, rx, tx) ); -TRACE_EVENT(rdev_sched_scan_start, - TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, - struct cfg80211_sched_scan_request *request), - TP_ARGS(wiphy, netdev, request), +DECLARE_EVENT_CLASS(wiphy_netdev_id_evt, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id), + TP_ARGS(wiphy, netdev, id), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY + __field(u64, id) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; + __entry->id = id; ), - TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, - WIPHY_PR_ARG, NETDEV_PR_ARG) + TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", id: %llu", + WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->id) +); + +DEFINE_EVENT(wiphy_netdev_id_evt, rdev_sched_scan_start, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id), + TP_ARGS(wiphy, netdev, id) +); + +DEFINE_EVENT(wiphy_netdev_id_evt, rdev_sched_scan_stop, + TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u64 id), + TP_ARGS(wiphy, netdev, id) ); TRACE_EVENT(rdev_tdls_mgmt, @@ -2792,14 +2820,28 @@ TRACE_EVENT(cfg80211_scan_done, MAC_PR_ARG(tsf_bssid)) ); -DEFINE_EVENT(wiphy_only_evt, cfg80211_sched_scan_results, - TP_PROTO(struct wiphy *wiphy), - TP_ARGS(wiphy) +DECLARE_EVENT_CLASS(wiphy_id_evt, + TP_PROTO(struct wiphy *wiphy, u64 id), + TP_ARGS(wiphy, id), + TP_STRUCT__entry( + WIPHY_ENTRY + __field(u64, id) + ), + TP_fast_assign( + WIPHY_ASSIGN; + __entry->id = id; + ), + TP_printk(WIPHY_PR_FMT ", id: %llu", WIPHY_PR_ARG, __entry->id) ); -DEFINE_EVENT(wiphy_only_evt, cfg80211_sched_scan_stopped, - TP_PROTO(struct wiphy *wiphy), - TP_ARGS(wiphy) 
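Stepping back to the sme.c change above: cfg80211_roamed() now takes the same struct-based approach, and when info->bss is left NULL the BSS is looked up from channel/bssid as shown in the function. A hypothetical driver notification:

static void drv_report_roam(struct net_device *dev,
			    struct ieee80211_channel *chan, const u8 *bssid,
			    const u8 *req_ie, size_t req_ie_len,
			    const u8 *resp_ie, size_t resp_ie_len)
{
	struct cfg80211_roam_info info = {
		.channel = chan,
		.bssid = bssid,		/* .bss may be given instead */
		.req_ie = req_ie,
		.req_ie_len = req_ie_len,
		.resp_ie = resp_ie,
		.resp_ie_len = resp_ie_len,
	};

	cfg80211_roamed(dev, &info, GFP_KERNEL);
}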
+DEFINE_EVENT(wiphy_id_evt, cfg80211_sched_scan_stopped, + TP_PROTO(struct wiphy *wiphy, u64 id), + TP_ARGS(wiphy, id) +); + +DEFINE_EVENT(wiphy_id_evt, cfg80211_sched_scan_results, + TP_PROTO(struct wiphy *wiphy, u64 id), + TP_ARGS(wiphy, id) ); TRACE_EVENT(cfg80211_get_bss, diff --git a/net/wireless/util.c b/net/wireless/util.c index 68e5f2ecee1aa22f17ab9a55eb566124e585740b..7198373e29206a0f19f006578596fea78e82f66e 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -659,7 +659,7 @@ __ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame, int offset, int len) { struct skb_shared_info *sh = skb_shinfo(skb); - const skb_frag_t *frag = &sh->frags[-1]; + const skb_frag_t *frag = &sh->frags[0]; struct page *frag_page; void *frag_ptr; int frag_len, frag_size; @@ -672,10 +672,10 @@ __ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame, while (offset >= frag_size) { offset -= frag_size; - frag++; frag_page = skb_frag_page(frag); frag_ptr = skb_frag_address(frag); frag_size = skb_frag_size(frag); + frag++; } frag_ptr += offset; @@ -687,12 +687,12 @@ __ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame, len -= cur_len; while (len > 0) { - frag++; frag_len = skb_frag_size(frag); cur_len = min(len, frag_len); __frame_add_frag(frame, skb_frag_page(frag), skb_frag_address(frag), cur_len, frag_len); len -= cur_len; + frag++; } } @@ -914,11 +914,11 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) netdev_err(dev, "failed to set key %d\n", i); continue; } - if (wdev->connect_keys->def == i) - if (rdev_set_default_key(rdev, dev, i, true, true)) { - netdev_err(dev, "failed to set defkey %d\n", i); - continue; - } + if (wdev->connect_keys->def == i && + rdev_set_default_key(rdev, dev, i, true, true)) { + netdev_err(dev, "failed to set defkey %d\n", i); + continue; + } } kzfree(wdev->connect_keys); @@ -929,7 +929,6 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) { struct cfg80211_event *ev; unsigned long flags; - const u8 *bssid = NULL; spin_lock_irqsave(&wdev->event_lock, flags); while (!list_empty(&wdev->event_list)) { @@ -941,20 +940,13 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) wdev_lock(wdev); switch (ev->type) { case EVENT_CONNECT_RESULT: - if (!is_zero_ether_addr(ev->cr.bssid)) - bssid = ev->cr.bssid; __cfg80211_connect_result( - wdev->netdev, bssid, - ev->cr.req_ie, ev->cr.req_ie_len, - ev->cr.resp_ie, ev->cr.resp_ie_len, - ev->cr.status, - ev->cr.status == WLAN_STATUS_SUCCESS, - ev->cr.bss, ev->cr.timeout_reason); + wdev->netdev, + &ev->cr, + ev->cr.status == WLAN_STATUS_SUCCESS); break; case EVENT_ROAMED: - __cfg80211_roamed(wdev, ev->rm.bss, ev->rm.req_ie, - ev->rm.req_ie_len, ev->rm.resp_ie, - ev->rm.resp_ie_len); + __cfg80211_roamed(wdev, &ev->rm); break; case EVENT_DISCONNECTED: __cfg80211_disconnected(wdev->netdev, @@ -991,7 +983,7 @@ void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev) int cfg80211_change_iface(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype ntype, - u32 *flags, struct vif_params *params) + struct vif_params *params) { int err; enum nl80211_iftype otype = dev->ieee80211_ptr->iftype; @@ -1049,7 +1041,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, cfg80211_process_rdev_events(rdev); } - err = rdev_change_virtual_intf(rdev, dev, ntype, flags, params); + err = rdev_change_virtual_intf(rdev, dev, ntype, params); WARN_ON(!err && dev->ieee80211_ptr->iftype != ntype); @@ -1097,6 +1089,35 @@ 
int cfg80211_change_iface(struct cfg80211_registered_device *rdev, return err; } +static u32 cfg80211_calculate_bitrate_ht(struct rate_info *rate) +{ + int modulation, streams, bitrate; + + /* the formula below does only work for MCS values smaller than 32 */ + if (WARN_ON_ONCE(rate->mcs >= 32)) + return 0; + + modulation = rate->mcs & 7; + streams = (rate->mcs >> 3) + 1; + + bitrate = (rate->bw == RATE_INFO_BW_40) ? 13500000 : 6500000; + + if (modulation < 4) + bitrate *= (modulation + 1); + else if (modulation == 4) + bitrate *= (modulation + 2); + else + bitrate *= (modulation + 3); + + bitrate *= streams; + + if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) + bitrate = (bitrate / 9) * 10; + + /* do NOT round down here */ + return (bitrate + 50000) / 100000; +} + static u32 cfg80211_calculate_bitrate_60g(struct rate_info *rate) { static const u32 __mcs2bitrate[] = { @@ -1230,39 +1251,14 @@ static u32 cfg80211_calculate_bitrate_vht(struct rate_info *rate) u32 cfg80211_calculate_bitrate(struct rate_info *rate) { - int modulation, streams, bitrate; - - if (!(rate->flags & RATE_INFO_FLAGS_MCS) && - !(rate->flags & RATE_INFO_FLAGS_VHT_MCS)) - return rate->legacy; + if (rate->flags & RATE_INFO_FLAGS_MCS) + return cfg80211_calculate_bitrate_ht(rate); if (rate->flags & RATE_INFO_FLAGS_60G) return cfg80211_calculate_bitrate_60g(rate); if (rate->flags & RATE_INFO_FLAGS_VHT_MCS) return cfg80211_calculate_bitrate_vht(rate); - /* the formula below does only work for MCS values smaller than 32 */ - if (WARN_ON_ONCE(rate->mcs >= 32)) - return 0; - - modulation = rate->mcs & 7; - streams = (rate->mcs >> 3) + 1; - - bitrate = (rate->bw == RATE_INFO_BW_40) ? 13500000 : 6500000; - - if (modulation < 4) - bitrate *= (modulation + 1); - else if (modulation == 4) - bitrate *= (modulation + 2); - else - bitrate *= (modulation + 3); - - bitrate *= streams; - - if (rate->flags & RATE_INFO_FLAGS_SHORT_GI) - bitrate = (bitrate / 9) * 10; - - /* do NOT round down here */ - return (bitrate + 50000) / 100000; + return rate->legacy; } EXPORT_SYMBOL(cfg80211_calculate_bitrate); diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index a220156cf2175a459727fcf67e1291036dca21ae..5d4a02c7979b0d7478ff2b39f82e24bed97a6e78 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -62,7 +62,7 @@ int cfg80211_wext_siwmode(struct net_device *dev, struct iw_request_info *info, memset(&vifparams, 0, sizeof(vifparams)); - return cfg80211_change_iface(rdev, dev, type, NULL, &vifparams); + return cfg80211_change_iface(rdev, dev, type, &vifparams); } EXPORT_WEXT_HANDLER(cfg80211_wext_siwmode); diff --git a/net/xfrm/Makefile b/net/xfrm/Makefile index c0e961983f17a077bde9bab18d946ecef0cba0ae..abf81b329dc1f3276e1cf5631ebae4ec31a229de 100644 --- a/net/xfrm/Makefile +++ b/net/xfrm/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_XFRM) := xfrm_policy.o xfrm_state.o xfrm_hash.o \ xfrm_input.o xfrm_output.o \ xfrm_sysctl.o xfrm_replay.o +obj-$(CONFIG_XFRM_OFFLOAD) += xfrm_device.o obj-$(CONFIG_XFRM_STATISTICS) += xfrm_proc.o obj-$(CONFIG_XFRM_ALGO) += xfrm_algo.o obj-$(CONFIG_XFRM_USER) += xfrm_user.o diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c new file mode 100644 index 0000000000000000000000000000000000000000..8ec8a3fcf8d4740670b16c2c1af099ef0beaa5c5 --- /dev/null +++ b/net/xfrm/xfrm_device.c @@ -0,0 +1,208 @@ +/* + * xfrm_device.c - IPsec device offloading code. 
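A worked check of the HT bitrate helper factored out in util.c above, hand-computed from the function (return value is in units of 100 kbit/s):

/* MCS 15 at 40 MHz with short GI:
 *   modulation = 15 & 7 = 7, streams = (15 >> 3) + 1 = 2
 *   bitrate = 13500000 * (7 + 3) = 135000000   (64-QAM rate 5/6)
 *   bitrate *= 2 streams          -> 270000000
 *   short GI: 270000000 / 9 * 10  -> 300000000
 *   return (300000000 + 50000) / 100000 = 3000, i.e. 300.0 Mbit/s
 */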
+ * + * Copyright (c) 2015 secunet Security Networks AG + * + * Author: + * Steffen Klassert + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features) +{ + int err; + struct xfrm_state *x; + struct xfrm_offload *xo = xfrm_offload(skb); + + if (skb_is_gso(skb)) + return 0; + + if (xo) { + x = skb->sp->xvec[skb->sp->len - 1]; + if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND) + return 0; + + x->outer_mode->xmit(x, skb); + + err = x->type_offload->xmit(x, skb, features); + if (err) { + XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); + return err; + } + + skb_push(skb, skb->data - skb_mac_header(skb)); + } + + return 0; +} +EXPORT_SYMBOL_GPL(validate_xmit_xfrm); + +int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, + struct xfrm_user_offload *xuo) +{ + int err; + struct dst_entry *dst; + struct net_device *dev; + struct xfrm_state_offload *xso = &x->xso; + xfrm_address_t *saddr; + xfrm_address_t *daddr; + + if (!x->type_offload) + return 0; + + /* We don't yet support UDP encapsulation, TFC padding and ESN. */ + if (x->encap || x->tfcpad || (x->props.flags & XFRM_STATE_ESN)) + return 0; + + dev = dev_get_by_index(net, xuo->ifindex); + if (!dev) { + if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { + saddr = &x->props.saddr; + daddr = &x->id.daddr; + } else { + saddr = &x->id.daddr; + daddr = &x->props.saddr; + } + + dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, x->props.family); + if (IS_ERR(dst)) + return 0; + + dev = dst->dev; + + dev_hold(dev); + dst_release(dst); + } + + if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { + dev_put(dev); + return 0; + } + + xso->dev = dev; + xso->num_exthdrs = 1; + xso->flags = xuo->flags; + + err = dev->xfrmdev_ops->xdo_dev_state_add(x); + if (err) { + dev_put(dev); + return err; + } + + return 0; +} +EXPORT_SYMBOL_GPL(xfrm_dev_state_add); + +bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) +{ + int mtu; + struct dst_entry *dst = skb_dst(skb); + struct xfrm_dst *xdst = (struct xfrm_dst *)dst; + struct net_device *dev = x->xso.dev; + + if (!x->type_offload || x->encap) + return false; + + if ((x->xso.offload_handle && (dev == dst->path->dev)) && + !dst->child->xfrm && x->type->get_mtu) { + mtu = x->type->get_mtu(x, xdst->child_mtu_cached); + + if (skb->len <= mtu) + goto ok; + + if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu)) + goto ok; + } + + return false; + +ok: + if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_offload_ok) + return x->xso.dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x); + + return true; +} +EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok); + +int xfrm_dev_register(struct net_device *dev) +{ + if ((dev->features & NETIF_F_HW_ESP) && !dev->xfrmdev_ops) + return NOTIFY_BAD; + if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) && + !(dev->features & NETIF_F_HW_ESP)) + return NOTIFY_BAD; + + return NOTIFY_DONE; +} + +static int xfrm_dev_unregister(struct net_device *dev) +{ + return NOTIFY_DONE; +} + +static int xfrm_dev_feat_change(struct net_device *dev) +{ + if ((dev->features & NETIF_F_HW_ESP) && !dev->xfrmdev_ops) + return NOTIFY_BAD; + else if (!(dev->features & NETIF_F_HW_ESP)) + dev->xfrmdev_ops = 
NULL; + + if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) && + !(dev->features & NETIF_F_HW_ESP)) + return NOTIFY_BAD; + + return NOTIFY_DONE; +} + +static int xfrm_dev_down(struct net_device *dev) +{ + if (dev->hw_features & NETIF_F_HW_ESP) + xfrm_dev_state_flush(dev_net(dev), dev, true); + + xfrm_garbage_collect(dev_net(dev)); + + return NOTIFY_DONE; +} + +static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + + switch (event) { + case NETDEV_REGISTER: + return xfrm_dev_register(dev); + + case NETDEV_UNREGISTER: + return xfrm_dev_unregister(dev); + + case NETDEV_FEAT_CHANGE: + return xfrm_dev_feat_change(dev); + + case NETDEV_DOWN: + return xfrm_dev_down(dev); + } + return NOTIFY_DONE; +} + +static struct notifier_block xfrm_dev_notifier = { + .notifier_call = xfrm_dev_event, +}; + +void __net_init xfrm_dev_init(void) +{ + register_netdevice_notifier(&xfrm_dev_notifier); +} diff --git a/net/xfrm/xfrm_hash.h b/net/xfrm/xfrm_hash.h index 666c5ffe929dca388b218d99c2d71946a231872f..eaea9c4fb3b0e6ec8903cfc5456519e8d4e7dcd1 100644 --- a/net/xfrm/xfrm_hash.h +++ b/net/xfrm/xfrm_hash.h @@ -54,8 +54,8 @@ static inline unsigned int __xfrm4_dpref_spref_hash(const xfrm_address_t *daddr, static inline unsigned int __xfrm6_pref_hash(const xfrm_address_t *addr, __u8 prefixlen) { - int pdw; - int pbi; + unsigned int pdw; + unsigned int pbi; u32 initval = 0; pdw = prefixlen >> 5; /* num of whole u32 in prefix */ diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index 46bdb4fbed0bb34a5d6ae40991b3fda6e5dff82c..9de4b1dbc0aea3f768a8d98eeab456c0c8e0fa61 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -107,6 +107,8 @@ struct sec_path *secpath_dup(struct sec_path *src) sp->len = 0; sp->olen = 0; + memset(sp->ovec, 0, sizeof(sp->ovec[XFRM_MAX_OFFLOAD_DEPTH])); + if (src) { int i; @@ -207,8 +209,9 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) unsigned int family; int decaps = 0; int async = 0; - struct xfrm_offload *xo; bool xfrm_gro = false; + bool crypto_done = false; + struct xfrm_offload *xo = xfrm_offload(skb); if (encap_type < 0) { x = xfrm_input_state(skb); @@ -220,9 +223,40 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) seq = XFRM_SKB_CB(skb)->seq.input.low; goto resume; } + /* encap_type < -1 indicates a GRO call. 
*/ encap_type = 0; seq = XFRM_SPI_SKB_CB(skb)->seq; + + if (xo && (xo->flags & CRYPTO_DONE)) { + crypto_done = true; + x = xfrm_input_state(skb); + family = XFRM_SPI_SKB_CB(skb)->family; + + if (!(xo->status & CRYPTO_SUCCESS)) { + if (xo->status & + (CRYPTO_TRANSPORT_AH_AUTH_FAILED | + CRYPTO_TRANSPORT_ESP_AUTH_FAILED | + CRYPTO_TUNNEL_AH_AUTH_FAILED | + CRYPTO_TUNNEL_ESP_AUTH_FAILED)) { + + xfrm_audit_state_icvfail(x, skb, + x->type->proto); + x->stats.integrity_failed++; + XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEPROTOERROR); + goto drop; + } + + XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); + goto drop; + } + + if ((err = xfrm_parse_spi(skb, nexthdr, &spi, &seq)) != 0) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); + goto drop; + } + } + goto lock; } @@ -311,7 +345,10 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) skb_dst_force(skb); dev_hold(skb->dev); - nexthdr = x->type->input(x, skb); + if (crypto_done) + nexthdr = x->type_offload->input_tail(x, skb); + else + nexthdr = x->type->input(x, skb); if (nexthdr == -EINPROGRESS) return 0; @@ -395,7 +432,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type) if (xo) xfrm_gro = xo->flags & XFRM_GRO; - err = x->inner_mode->afinfo->transport_finish(skb, async); + err = x->inner_mode->afinfo->transport_finish(skb, xfrm_gro || async); if (xfrm_gro) { skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index 8ba29fe58352abcc887235d36d42a06797d94b28..8c0b6722aaa87c6d346d0ba57c2e541a6703c79d 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -99,12 +99,13 @@ static int xfrm_output_one(struct sk_buff *skb, int err) skb_dst_force(skb); - /* Inner headers are invalid now. 
*/ - skb->encapsulation = 0; - - err = x->type->output(x, skb); - if (err == -EINPROGRESS) - goto out; + if (xfrm_offload(skb)) { + x->type_offload->encap(x, skb); + } else { + err = x->type->output(x, skb); + if (err == -EINPROGRESS) + goto out; + } resume: if (err) { @@ -200,8 +201,40 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb int xfrm_output(struct sock *sk, struct sk_buff *skb) { struct net *net = dev_net(skb_dst(skb)->dev); + struct xfrm_state *x = skb_dst(skb)->xfrm; int err; + secpath_reset(skb); + skb->encapsulation = 0; + + if (xfrm_dev_offload_ok(skb, x)) { + struct sec_path *sp; + + sp = secpath_dup(skb->sp); + if (!sp) { + XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTERROR); + kfree_skb(skb); + return -ENOMEM; + } + if (skb->sp) + secpath_put(skb->sp); + skb->sp = sp; + skb->encapsulation = 1; + + sp->olen++; + sp->xvec[skb->sp->len++] = x; + xfrm_state_hold(x); + + if (skb_is_gso(skb)) { + skb_shinfo(skb)->gso_type |= SKB_GSO_ESP; + + return xfrm_output2(net, sk, skb); + } + + if (x->xso.dev && x->xso.dev->features & NETIF_F_HW_ESP_TX_CSUM) + goto out; + } + if (skb_is_gso(skb)) return xfrm_output_gso(net, sk, skb); @@ -214,6 +247,7 @@ int xfrm_output(struct sock *sk, struct sk_buff *skb) } } +out: return xfrm_output2(net, sk, skb); } EXPORT_SYMBOL_GPL(xfrm_output); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 236cbbc0ab9cfff05cd027ffb0dc56aa15e61033..b00a1d5a7f52e54f2ac454da3eeb14447f84d8b2 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -116,11 +116,10 @@ static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short fa return afinfo; } -static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, - int tos, int oif, - const xfrm_address_t *saddr, - const xfrm_address_t *daddr, - int family) +struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, + const xfrm_address_t *saddr, + const xfrm_address_t *daddr, + int family) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; @@ -135,6 +134,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, return dst; } +EXPORT_SYMBOL(__xfrm_dst_lookup); static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, int oif, @@ -1006,6 +1006,10 @@ int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); + + if (cnt) + xfrm_garbage_collect(net); + return err; } EXPORT_SYMBOL(xfrm_policy_flush); @@ -2929,21 +2933,6 @@ void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo) } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); -static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) -{ - struct net_device *dev = netdev_notifier_info_to_dev(ptr); - - switch (event) { - case NETDEV_DOWN: - xfrm_garbage_collect(dev_net(dev)); - } - return NOTIFY_DONE; -} - -static struct notifier_block xfrm_dev_notifier = { - .notifier_call = xfrm_dev_event, -}; - #ifdef CONFIG_XFRM_STATISTICS static int __net_init xfrm_statistics_init(struct net *net) { @@ -3020,7 +3009,7 @@ static int __net_init xfrm_policy_init(struct net *net) INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); if (net_eq(net, &init_net)) - register_netdevice_notifier(&xfrm_dev_notifier); + xfrm_dev_init(); return 0; out_bydst: diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 
cdc2e2e71bffa2d871d051870f0daf27067815fb..8b23c5bcf8e88bcf2141dd75fac6eeec54a1b143 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -45,7 +45,7 @@ u32 xfrm_replay_seqhi(struct xfrm_state *x, __be32 net_seq) return seq_hi; } - +EXPORT_SYMBOL(xfrm_replay_seqhi); static void xfrm_replay_notify(struct xfrm_state *x, int event) { struct km_event c; @@ -558,6 +559,158 @@ static void xfrm_replay_advance_esn(struct xfrm_state *x, __be32 net_seq) x->repl->notify(x, XFRM_REPLAY_UPDATE); } +#ifdef CONFIG_XFRM_OFFLOAD +static int xfrm_replay_overflow_offload(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = 0; + struct net *net = xs_net(x); + struct xfrm_offload *xo = xfrm_offload(skb); + __u32 oseq = x->replay.oseq; + + if (!xo) + return xfrm_replay_overflow(x, skb); + + if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { + if (!skb_is_gso(skb)) { + XFRM_SKB_CB(skb)->seq.output.low = ++oseq; + xo->seq.low = oseq; + } else { + XFRM_SKB_CB(skb)->seq.output.low = oseq + 1; + xo->seq.low = oseq + 1; + oseq += skb_shinfo(skb)->gso_segs; + } + + XFRM_SKB_CB(skb)->seq.output.hi = 0; + xo->seq.hi = 0; + if (unlikely(oseq < x->replay.oseq)) { + xfrm_audit_state_replay_overflow(x, skb); + err = -EOVERFLOW; + + return err; + } + + x->replay.oseq = oseq; + + if (xfrm_aevent_is_on(net)) + x->repl->notify(x, XFRM_REPLAY_UPDATE); + } + + return err; +} + +static int xfrm_replay_overflow_offload_bmp(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = 0; + struct xfrm_offload *xo = xfrm_offload(skb); + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + struct net *net = xs_net(x); + __u32 oseq = replay_esn->oseq; + + if (!xo) + return xfrm_replay_overflow_bmp(x, skb); + + if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { + if (!skb_is_gso(skb)) { + XFRM_SKB_CB(skb)->seq.output.low = ++oseq; + xo->seq.low = oseq; + } else { + XFRM_SKB_CB(skb)->seq.output.low = oseq + 1; + xo->seq.low = oseq + 1; + oseq += skb_shinfo(skb)->gso_segs; + } + + XFRM_SKB_CB(skb)->seq.output.hi = 0; + xo->seq.hi = 0; + if (unlikely(oseq < replay_esn->oseq)) { + xfrm_audit_state_replay_overflow(x, skb); + err = -EOVERFLOW; + + return err; + } else { + replay_esn->oseq = oseq; + } + + if (xfrm_aevent_is_on(net)) + x->repl->notify(x, XFRM_REPLAY_UPDATE); + } + + return err; +} + +static int xfrm_replay_overflow_offload_esn(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = 0; + struct xfrm_offload *xo = xfrm_offload(skb); + struct xfrm_replay_state_esn *replay_esn = x->replay_esn; + struct net *net = xs_net(x); + __u32 oseq = replay_esn->oseq; + __u32 oseq_hi = replay_esn->oseq_hi; + + if (!xo) + return xfrm_replay_overflow_esn(x, skb); + + if (x->type->flags & XFRM_TYPE_REPLAY_PROT) { + if (!skb_is_gso(skb)) { + XFRM_SKB_CB(skb)->seq.output.low = ++oseq; + XFRM_SKB_CB(skb)->seq.output.hi = oseq_hi; + xo->seq.low = oseq; + xo->seq.hi = oseq_hi; + } else { + XFRM_SKB_CB(skb)->seq.output.low = oseq + 1; + XFRM_SKB_CB(skb)->seq.output.hi = oseq_hi; + xo->seq.low = oseq = oseq + 1; + xo->seq.hi = oseq_hi; + oseq += skb_shinfo(skb)->gso_segs; + } + + if (unlikely(oseq < replay_esn->oseq)) { + XFRM_SKB_CB(skb)->seq.output.hi = ++oseq_hi; + xo->seq.hi = oseq_hi; + + if (replay_esn->oseq_hi == 0) { + replay_esn->oseq--; + replay_esn->oseq_hi--; + xfrm_audit_state_replay_overflow(x, skb); + err = -EOVERFLOW; + + return err; + } + } + + replay_esn->oseq = oseq; + replay_esn->oseq_hi = oseq_hi; + + if (xfrm_aevent_is_on(net)) + x->repl->notify(x, XFRM_REPLAY_UPDATE); + } + + return err; +} + +static const struct
xfrm_replay xfrm_replay_legacy = { + .advance = xfrm_replay_advance, + .check = xfrm_replay_check, + .recheck = xfrm_replay_check, + .notify = xfrm_replay_notify, + .overflow = xfrm_replay_overflow_offload, +}; + +static const struct xfrm_replay xfrm_replay_bmp = { + .advance = xfrm_replay_advance_bmp, + .check = xfrm_replay_check_bmp, + .recheck = xfrm_replay_check_bmp, + .notify = xfrm_replay_notify_bmp, + .overflow = xfrm_replay_overflow_offload_bmp, +}; + +static const struct xfrm_replay xfrm_replay_esn = { + .advance = xfrm_replay_advance_esn, + .check = xfrm_replay_check_esn, + .recheck = xfrm_replay_recheck_esn, + .notify = xfrm_replay_notify_esn, + .overflow = xfrm_replay_overflow_offload_esn, +}; +#else static const struct xfrm_replay xfrm_replay_legacy = { .advance = xfrm_replay_advance, .check = xfrm_replay_check, @@ -581,6 +734,7 @@ static const struct xfrm_replay xfrm_replay_esn = { .notify = xfrm_replay_notify_esn, .overflow = xfrm_replay_overflow_esn, }; +#endif int xfrm_init_replay(struct xfrm_state *x) { @@ -595,10 +749,12 @@ int xfrm_init_replay(struct xfrm_state *x) if (replay_esn->replay_window == 0) return -EINVAL; x->repl = &xfrm_replay_esn; - } else + } else { x->repl = &xfrm_replay_bmp; - } else + } + } else { x->repl = &xfrm_replay_legacy; + } return 0; } diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 5a597dbbe564f09ad6b77e1f529127a0410c7fd4..fc3c5aa387543a63bf18955e60309207bf06e961 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -251,6 +251,75 @@ static void xfrm_put_type(const struct xfrm_type *type) module_put(type->owner); } +static DEFINE_SPINLOCK(xfrm_type_offload_lock); +int xfrm_register_type_offload(const struct xfrm_type_offload *type, + unsigned short family) +{ + struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); + const struct xfrm_type_offload **typemap; + int err = 0; + + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + typemap = afinfo->type_offload_map; + spin_lock_bh(&xfrm_type_offload_lock); + + if (likely(typemap[type->proto] == NULL)) + typemap[type->proto] = type; + else + err = -EEXIST; + spin_unlock_bh(&xfrm_type_offload_lock); + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(xfrm_register_type_offload); + +int xfrm_unregister_type_offload(const struct xfrm_type_offload *type, + unsigned short family) +{ + struct xfrm_state_afinfo *afinfo = xfrm_state_get_afinfo(family); + const struct xfrm_type_offload **typemap; + int err = 0; + + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + typemap = afinfo->type_offload_map; + spin_lock_bh(&xfrm_type_offload_lock); + + if (unlikely(typemap[type->proto] != type)) + err = -ENOENT; + else + typemap[type->proto] = NULL; + spin_unlock_bh(&xfrm_type_offload_lock); + rcu_read_unlock(); + return err; +} +EXPORT_SYMBOL(xfrm_unregister_type_offload); + +static const struct xfrm_type_offload *xfrm_get_type_offload(u8 proto, unsigned short family) +{ + struct xfrm_state_afinfo *afinfo; + const struct xfrm_type_offload **typemap; + const struct xfrm_type_offload *type; + + afinfo = xfrm_state_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return NULL; + typemap = afinfo->type_offload_map; + + type = typemap[proto]; + if ((type && !try_module_get(type->owner))) + type = NULL; + + rcu_read_unlock(); + return type; +} + +static void xfrm_put_type_offload(const struct xfrm_type_offload *type) +{ + module_put(type->owner); +} + static DEFINE_SPINLOCK(xfrm_mode_lock); int xfrm_register_mode(struct xfrm_mode *mode, int family) { @@ -365,10 
+434,13 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) xfrm_put_mode(x->inner_mode_iaf); if (x->outer_mode) xfrm_put_mode(x->outer_mode); + if (x->type_offload) + xfrm_put_type_offload(x->type_offload); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); } + xfrm_dev_state_free(x); security_xfrm_state_free(x); kfree(x); } @@ -538,6 +610,8 @@ int __xfrm_state_delete(struct xfrm_state *x) net->xfrm.state_num--; spin_unlock(&net->xfrm.xfrm_state_lock); + xfrm_dev_state_delete(x); + /* All xfrm_state objects are created by xfrm_state_alloc. * The xfrm_state_alloc call gives a reference, and that * is what we are dropping here. @@ -582,12 +656,41 @@ xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) return err; } + +static inline int +xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) +{ + int i, err = 0; + + for (i = 0; i <= net->xfrm.state_hmask; i++) { + struct xfrm_state *x; + struct xfrm_state_offload *xso; + + hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { + xso = &x->xso; + + if (xso->dev == dev && + (err = security_xfrm_state_delete(x)) != 0) { + xfrm_audit_state_delete(x, 0, task_valid); + return err; + } + } + } + + return err; +} #else static inline int xfrm_state_flush_secctx_check(struct net *net, u8 proto, bool task_valid) { return 0; } + +static inline int +xfrm_dev_state_flush_secctx_check(struct net *net, struct net_device *dev, bool task_valid) +{ + return 0; +} #endif int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) @@ -630,6 +733,48 @@ int xfrm_state_flush(struct net *net, u8 proto, bool task_valid) } EXPORT_SYMBOL(xfrm_state_flush); +int xfrm_dev_state_flush(struct net *net, struct net_device *dev, bool task_valid) +{ + int i, err = 0, cnt = 0; + + spin_lock_bh(&net->xfrm.xfrm_state_lock); + err = xfrm_dev_state_flush_secctx_check(net, dev, task_valid); + if (err) + goto out; + + err = -ESRCH; + for (i = 0; i <= net->xfrm.state_hmask; i++) { + struct xfrm_state *x; + struct xfrm_state_offload *xso; +restart: + hlist_for_each_entry(x, net->xfrm.state_bydst+i, bydst) { + xso = &x->xso; + + if (!xfrm_state_kern(x) && xso->dev == dev) { + xfrm_state_hold(x); + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + + err = xfrm_state_delete(x); + xfrm_audit_state_delete(x, err ? 
0 : 1, + task_valid); + xfrm_state_put(x); + if (!err) + cnt++; + + spin_lock_bh(&net->xfrm.xfrm_state_lock); + goto restart; + } + } + } + if (cnt) + err = 0; + +out: + spin_unlock_bh(&net->xfrm.xfrm_state_lock); + return err; +} +EXPORT_SYMBOL(xfrm_dev_state_flush); + void xfrm_sad_getinfo(struct net *net, struct xfrmk_sadinfo *si) { spin_lock_bh(&net->xfrm.xfrm_state_lock); @@ -2077,6 +2222,8 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay) if (x->type == NULL) goto error; + x->type_offload = xfrm_get_type_offload(x->id.proto, family); + err = x->type->init_state(x); if (err) goto error; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 40a8aa39220d67d349c36f80ab1628d0e90c3fa7..38614df33ec8d5751bd88a48d0068926b9032556 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -55,7 +55,7 @@ static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type) return -EINVAL; } - algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0'; + algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; return 0; } @@ -71,7 +71,7 @@ static int verify_auth_trunc(struct nlattr **attrs) if (nla_len(rt) < xfrm_alg_auth_len(algp)) return -EINVAL; - algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0'; + algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; return 0; } @@ -87,7 +87,7 @@ static int verify_aead(struct nlattr **attrs) if (nla_len(rt) < aead_len(algp)) return -EINVAL; - algp->alg_name[CRYPTO_MAX_ALG_NAME - 1] = '\0'; + algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; return 0; } @@ -595,6 +595,13 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, goto error; } + if (attrs[XFRMA_OFFLOAD_DEV]) { + err = xfrm_dev_state_add(net, x, + nla_data(attrs[XFRMA_OFFLOAD_DEV])); + if (err) + goto error; + } + if ((err = xfrm_alloc_replay_state_esn(&x->replay_esn, &x->preplay_esn, attrs[XFRMA_REPLAY_ESN_VAL]))) goto error; @@ -779,6 +786,23 @@ static int copy_sec_ctx(struct xfrm_sec_ctx *s, struct sk_buff *skb) return 0; } +static int copy_user_offload(struct xfrm_state_offload *xso, struct sk_buff *skb) +{ + struct xfrm_user_offload *xuo; + struct nlattr *attr; + + attr = nla_reserve(skb, XFRMA_OFFLOAD_DEV, sizeof(*xuo)); + if (attr == NULL) + return -EMSGSIZE; + + xuo = nla_data(attr); + + xuo->ifindex = xso->dev->ifindex; + xuo->flags = xso->flags; + + return 0; +} + static int copy_to_user_auth(struct xfrm_algo_auth *auth, struct sk_buff *skb) { struct xfrm_algo *algo; @@ -869,6 +893,10 @@ static int copy_to_user_state_extra(struct xfrm_state *x, &x->replay); if (ret) goto out; + if(x->xso.dev) + ret = copy_user_offload(&x->xso, skb); + if (ret) + goto out; if (x->security) ret = copy_sec_ctx(x->security, skb); out: @@ -932,8 +960,8 @@ static int xfrm_dump_sa(struct sk_buff *skb, struct netlink_callback *cb) u8 proto = 0; int err; - err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, - xfrma_policy); + err = nlmsg_parse(cb->nlh, 0, attrs, XFRMA_MAX, xfrma_policy, + NULL); if (err < 0) return err; @@ -2406,6 +2434,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 }, [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, + [XFRMA_OFFLOAD_DEV] = { .len = sizeof(struct xfrm_user_offload) }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@ -2448,7 +2477,8 @@ static const struct xfrm_link { [XFRM_MSG_GETSPDINFO - XFRM_MSG_BASE] = { .doit = xfrm_get_spdinfo }, }; -static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 
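On the userspace side of the XFRMA_OFFLOAD_DEV attribute handled in xfrm_state_construct() above, the SA-install request just carries the target ifindex and a direction flag. A sketch using iproute2's addattr_l() netlink helper; the wrapper name and the surrounding nlmsghdr/xfrm_usersa_info setup are assumptions:

#include <linux/xfrm.h>

/* attach XFRMA_OFFLOAD_DEV so the kernel calls xfrm_dev_state_add() */
static int request_esp_offload(struct nlmsghdr *n, int maxlen,
			       int ifindex, bool inbound)
{
	struct xfrm_user_offload xuo = {
		.ifindex = ifindex,
		.flags = inbound ? XFRM_OFFLOAD_INBOUND : 0,
	};

	return addattr_l(n, maxlen, XFRMA_OFFLOAD_DEV, &xuo, sizeof(xuo));
}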
+static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct nlattr *attrs[XFRMA_MAX+1]; @@ -2488,7 +2518,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) err = nlmsg_parse(nlh, xfrm_msg_min[type], attrs, link->nla_max ? : XFRMA_MAX, - link->nla_pol ? : xfrma_policy); + link->nla_pol ? : xfrma_policy, extack); if (err < 0) return err; @@ -2622,6 +2652,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(*x->coaddr)); if (x->props.extra_flags) l += nla_total_size(sizeof(x->props.extra_flags)); + if (x->xso.dev) + l += nla_total_size(sizeof(x->xso)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size_64bit(sizeof(u64)); @@ -3108,7 +3140,6 @@ static bool xfrm_is_alive(const struct km_event *c) } static struct xfrm_mgr netlink_mgr = { - .id = "netlink", .notify = xfrm_send_state_notify, .acquire = xfrm_send_acquire, .compile_policy = xfrm_compile_policy, diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 09e9d535bd7487d81574cf8572a41b6e697566fd..6c7468eb3684494b46a248fcef39a1e451fd6856 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -34,6 +34,8 @@ hostprogs-y += sampleip hostprogs-y += tc_l2_redirect hostprogs-y += lwt_len_hist hostprogs-y += xdp_tx_iptunnel +hostprogs-y += test_map_in_map +hostprogs-y += per_socket_stats_example # Libbpf dependencies LIBBPF := ../../tools/lib/bpf/bpf.o @@ -72,6 +74,8 @@ sampleip-objs := bpf_load.o $(LIBBPF) sampleip_user.o tc_l2_redirect-objs := bpf_load.o $(LIBBPF) tc_l2_redirect_user.o lwt_len_hist-objs := bpf_load.o $(LIBBPF) lwt_len_hist_user.o xdp_tx_iptunnel-objs := bpf_load.o $(LIBBPF) xdp_tx_iptunnel_user.o +test_map_in_map-objs := bpf_load.o $(LIBBPF) test_map_in_map_user.o +per_socket_stats_example-objs := $(LIBBPF) cookie_uid_helper_example.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -105,6 +109,8 @@ always += trace_event_kern.o always += sampleip_kern.o always += lwt_len_hist_kern.o always += xdp_tx_iptunnel_kern.o +always += test_map_in_map_kern.o +always += cookie_uid_helper_example.o HOSTCFLAGS += -I$(objtree)/usr/include HOSTCFLAGS += -I$(srctree)/tools/lib/ @@ -139,6 +145,7 @@ HOSTLOADLIBES_sampleip += -lelf HOSTLOADLIBES_tc_l2_redirect += -l elf HOSTLOADLIBES_lwt_len_hist += -l elf HOSTLOADLIBES_xdp_tx_iptunnel += -lelf +HOSTLOADLIBES_test_map_in_map += -lelf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang @@ -182,4 +189,5 @@ $(obj)/%.o: $(src)/%.c -Wno-compare-distinct-pointer-types \ -Wno-gnu-variable-sized-type-not-at-end \ -Wno-address-of-packed-member -Wno-tautological-compare \ + -Wno-unknown-warning-option \ -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@ diff --git a/samples/bpf/bpf_helpers.h b/samples/bpf/bpf_helpers.h index faaffe2e139a989de6f90835121a725091d4a289..9a9c95f2c9fb3a7de27eb70659429418ada04301 100644 --- a/samples/bpf/bpf_helpers.h +++ b/samples/bpf/bpf_helpers.h @@ -80,6 +80,7 @@ struct bpf_map_def { unsigned int value_size; unsigned int max_entries; unsigned int map_flags; + unsigned int inner_map_idx; }; static int (*bpf_skb_load_bytes)(void *ctx, int off, void *to, int len) = @@ -145,11 +146,30 @@ static int (*bpf_skb_change_head)(void *, int len, int flags) = #define PT_REGS_SP(x) ((x)->sp) #define PT_REGS_IP(x) ((x)->nip) +#elif 
defined(__sparc__) + +#define PT_REGS_PARM1(x) ((x)->u_regs[UREG_I0]) +#define PT_REGS_PARM2(x) ((x)->u_regs[UREG_I1]) +#define PT_REGS_PARM3(x) ((x)->u_regs[UREG_I2]) +#define PT_REGS_PARM4(x) ((x)->u_regs[UREG_I3]) +#define PT_REGS_PARM5(x) ((x)->u_regs[UREG_I4]) +#define PT_REGS_RET(x) ((x)->u_regs[UREG_I7]) +#define PT_REGS_RC(x) ((x)->u_regs[UREG_I0]) +#define PT_REGS_SP(x) ((x)->u_regs[UREG_FP]) +#if defined(__arch64__) +#define PT_REGS_IP(x) ((x)->tpc) +#else +#define PT_REGS_IP(x) ((x)->pc) +#endif + #endif #ifdef __powerpc__ #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = (ctx)->link; }) #define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP +#elif defined(__sparc__) +#define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ (ip) = PT_REGS_RET(ctx); }) +#define BPF_KRETPROBE_READ_RET_IP BPF_KPROBE_READ_RET_IP #else #define BPF_KPROBE_READ_RET_IP(ip, ctx) ({ \ bpf_probe_read(&(ip), sizeof(ip), (void *)PT_REGS_RET(ctx)); }) diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index b86ee54da2d14d6ba0de18481d2c55ed1a70a67b..74456b3eb89acb4671fcebaafc2ce7bd102b3ed3 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,7 @@ #include #include #include +#include #include "libbpf.h" #include "bpf_load.h" #include "perf-sys.h" @@ -37,13 +39,8 @@ int event_fd[MAX_PROGS]; int prog_cnt; int prog_array_fd = -1; -struct bpf_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; -}; +struct bpf_map_data map_data[MAX_MAPS]; +int map_data_count = 0; static int populate_prog_array(const char *event, int prog_fd) { @@ -192,24 +189,45 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) return 0; } -static int load_maps(struct bpf_map_def *maps, int len) +static int load_maps(struct bpf_map_data *maps, int nr_maps, + fixup_map_cb fixup_map) { int i; - for (i = 0; i < len / sizeof(struct bpf_map_def); i++) { + for (i = 0; i < nr_maps; i++) { + if (fixup_map) { + fixup_map(&maps[i], i); + /* Allow userspace to assign map FD prior to creation */ + if (maps[i].fd != -1) { + map_fd[i] = maps[i].fd; + continue; + } + } - map_fd[i] = bpf_create_map(maps[i].type, - maps[i].key_size, - maps[i].value_size, - maps[i].max_entries, - maps[i].map_flags); + if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { + int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; + + map_fd[i] = bpf_create_map_in_map(maps[i].def.type, + maps[i].def.key_size, + inner_map_fd, + maps[i].def.max_entries, + maps[i].def.map_flags); + } else { + map_fd[i] = bpf_create_map(maps[i].def.type, + maps[i].def.key_size, + maps[i].def.value_size, + maps[i].def.max_entries, + maps[i].def.map_flags); + } if (map_fd[i] < 0) { printf("failed to create a map: %d %s\n", errno, strerror(errno)); return 1; } + maps[i].fd = map_fd[i]; - if (maps[i].type == BPF_MAP_TYPE_PROG_ARRAY) + if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) prog_array_fd = map_fd[i]; } return 0; @@ -239,7 +257,8 @@ static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, } static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, - GElf_Shdr *shdr, struct bpf_insn *insn) + GElf_Shdr *shdr, struct bpf_insn *insn, + struct bpf_map_data *maps, int nr_maps) { int i, nrels; @@ -249,6 +268,8 @@ static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, GElf_Sym sym; GElf_Rel rel; unsigned int 
insn_idx; + bool match = false; + int j, map_idx; gelf_getrel(data, i, &rel); @@ -262,20 +283,157 @@ static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, return 1; } insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; - insn[insn_idx].imm = map_fd[sym.st_value / sizeof(struct bpf_map_def)]; + + /* Match FD relocation against recorded map_data[] offset */ + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + if (maps[map_idx].elf_offset == sym.st_value) { + match = true; + break; + } + } + if (match) { + insn[insn_idx].imm = maps[map_idx].fd; + } else { + printf("invalid relo for insn[%d] no map_data match\n", + insn_idx); + return 1; + } } return 0; } -int load_bpf_file(char *path) +static int cmp_symbols(const void *l, const void *r) { - int fd, i; + const GElf_Sym *lsym = (const GElf_Sym *)l; + const GElf_Sym *rsym = (const GElf_Sym *)r; + + if (lsym->st_value < rsym->st_value) + return -1; + else if (lsym->st_value > rsym->st_value) + return 1; + else + return 0; +} + +static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, + Elf *elf, Elf_Data *symbols, int strtabidx) +{ + int map_sz_elf, map_sz_copy; + bool validate_zero = false; + Elf_Data *data_maps; + int i, nr_maps; + GElf_Sym *sym; + Elf_Scn *scn; + int copy_sz; + + if (maps_shndx < 0) + return -EINVAL; + if (!symbols) + return -EINVAL; + + /* Get data for maps section via elf index */ + scn = elf_getscn(elf, maps_shndx); + if (scn) + data_maps = elf_getdata(scn, NULL); + if (!scn || !data_maps) { + printf("Failed to get Elf_Data from maps section %d\n", + maps_shndx); + return -EINVAL; + } + + /* For each map get the corresponding symbol table entry */ + sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym)); + for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { + assert(nr_maps < MAX_MAPS+1); + if (!gelf_getsym(symbols, i, &sym[nr_maps])) + continue; + if (sym[nr_maps].st_shndx != maps_shndx) + continue; + /* Only count symbols from the maps section */ + nr_maps++; + } + + /* Align to map_fd[] order, via sort on offset in sym.st_value */ + qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); + + /* Keeping compatibility with ELF maps section changes + * ------------------------------------------------ + * The loader code knows the size of the struct bpf_map_def it + * was compiled with, but the struct stored in the ELF file can + * be different. + * + * Unfortunately sym[i].st_size is zero. To calculate the + * struct size stored in the ELF file, assume all structs have + * the same size, and simply divide by the number of map + * symbols. + */ + map_sz_elf = data_maps->d_size / nr_maps; + map_sz_copy = sizeof(struct bpf_map_def); + if (map_sz_elf < map_sz_copy) { + /* + * Backward compat, loading older ELF file with + * smaller struct, keeping remaining bytes zero. + */ + map_sz_copy = map_sz_elf; + } else if (map_sz_elf > map_sz_copy) { + /* + * Forward compat, loading newer ELF file with larger + * struct with unknown features. Assume zero means + * feature not used. Thus, validate rest of struct + * data is zero.
+ */ + validate_zero = true; + } + + /* Memcpy relevant part of ELF maps data to loader maps */ + for (i = 0; i < nr_maps; i++) { + unsigned char *addr, *end; + struct bpf_map_def *def; + const char *map_name; + size_t offset; + + map_name = elf_strptr(elf, strtabidx, sym[i].st_name); + maps[i].name = strdup(map_name); + if (!maps[i].name) { + printf("strdup(%s): %s(%d)\n", map_name, + strerror(errno), errno); + free(sym); + return -errno; + } + + /* Symbol value is offset into ELF maps section data area */ + offset = sym[i].st_value; + def = (struct bpf_map_def *)(data_maps->d_buf + offset); + maps[i].elf_offset = offset; + memset(&maps[i].def, 0, sizeof(struct bpf_map_def)); + memcpy(&maps[i].def, def, map_sz_copy); + + /* Verify no newer features were requested */ + if (validate_zero) { + addr = (unsigned char*) def + map_sz_copy; + end = (unsigned char*) def + map_sz_elf; + for (; addr < end; addr++) { + if (*addr != 0) { + free(sym); + return -EFBIG; + } + } + } + } + + free(sym); + return nr_maps; +} + +static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) +{ + int fd, i, ret, maps_shndx = -1, strtabidx = -1; Elf *elf; GElf_Ehdr ehdr; GElf_Shdr shdr, shdr_prog; - Elf_Data *data, *data_prog, *symbols = NULL; + Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL; char *shname, *shname_prog; + int nr_maps = 0; /* reset global variables */ kern_version = 0; @@ -323,14 +481,41 @@ int load_bpf_file(char *path) } memcpy(&kern_version, data->d_buf, sizeof(int)); } else if (strcmp(shname, "maps") == 0) { - processed_sec[i] = true; - if (load_maps(data->d_buf, data->d_size)) - return 1; + int j; + + maps_shndx = i; + data_maps = data; + for (j = 0; j < MAX_MAPS; j++) + map_data[j].fd = -1; } else if (shdr.sh_type == SHT_SYMTAB) { + strtabidx = shdr.sh_link; symbols = data; } } + ret = 1; + + if (!symbols) { + printf("missing SHT_SYMTAB section\n"); + goto done; + } + + if (data_maps) { + nr_maps = load_elf_maps_section(map_data, maps_shndx, + elf, symbols, strtabidx); + if (nr_maps < 0) { + printf("Error: Failed loading ELF maps (errno:%d):%s\n", + nr_maps, strerror(-nr_maps)); + ret = 1; + goto done; + } + if (load_maps(map_data, nr_maps, fixup_map)) + goto done; + map_data_count = nr_maps; + + processed_sec[maps_shndx] = true; + } + /* load programs that need map fixup (relocations) */ for (i = 1; i < ehdr.e_shnum; i++) { if (processed_sec[i]) @@ -354,7 +539,8 @@ int load_bpf_file(char *path) processed_sec[shdr.sh_info] = true; processed_sec[i] = true; - if (parse_relo_and_apply(data, symbols, &shdr, insns)) + if (parse_relo_and_apply(data, symbols, &shdr, insns, + map_data, nr_maps)) continue; if (memcmp(shname_prog, "kprobe/", 7) == 0 || @@ -387,8 +573,20 @@ int load_bpf_file(char *path) load_and_attach(shname, data->d_buf, data->d_size); } + ret = 0; +done: close(fd); - return 0; + return ret; +} + +int load_bpf_file(char *path) +{ + return do_load_bpf_file(path, NULL); +} + +int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map) +{ + return do_load_bpf_file(path, fixup_map); } void read_trace_pipe(void) @@ -473,7 +671,7 @@ struct ksym *ksym_search(long key) return &syms[0]; } -int set_link_xdp_fd(int ifindex, int fd) +int set_link_xdp_fd(int ifindex, int fd, __u32 flags) { struct sockaddr_nl sa; int sock, seq = 0, len, ret = -1; @@ -509,15 +707,28 @@ int set_link_xdp_fd(int ifindex, int fd) req.nh.nlmsg_seq = ++seq; req.ifinfo.ifi_family = AF_UNSPEC; req.ifinfo.ifi_index = ifindex; + + /* started nested attribute for XDP */ nla = (struct nlattr 
*)(((char *)&req) + NLMSG_ALIGN(req.nh.nlmsg_len));
	nla->nla_type = NLA_F_NESTED | 43/*IFLA_XDP*/;
+	nla->nla_len = NLA_HDRLEN;

-	nla_xdp = (struct nlattr *)((char *)nla + NLA_HDRLEN);
+	/* add XDP fd */
+	nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
	nla_xdp->nla_type = 1/*IFLA_XDP_FD*/;
	nla_xdp->nla_len = NLA_HDRLEN + sizeof(int);
	memcpy((char *)nla_xdp + NLA_HDRLEN, &fd, sizeof(fd));
-	nla->nla_len = NLA_HDRLEN + nla_xdp->nla_len;
+	nla->nla_len += nla_xdp->nla_len;
+
+	/* if user passed in any flags, add those too */
+	if (flags) {
+		nla_xdp = (struct nlattr *)((char *)nla + nla->nla_len);
+		nla_xdp->nla_type = 3/*IFLA_XDP_FLAGS*/;
+		nla_xdp->nla_len = NLA_HDRLEN + sizeof(flags);
+		memcpy((char *)nla_xdp + NLA_HDRLEN, &flags, sizeof(flags));
+		nla->nla_len += nla_xdp->nla_len;
+	}

	req.nh.nlmsg_len += NLA_ALIGN(nla->nla_len);
diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h
index c827827299b3b77831650e762376b061b6c93021..ca0563d04744fe31a472b53824a5a1135b53be48 100644
--- a/samples/bpf/bpf_load.h
+++ b/samples/bpf/bpf_load.h
@@ -6,12 +6,36 @@
#define MAX_MAPS 32
#define MAX_PROGS 32

-extern int map_fd[MAX_MAPS];
+struct bpf_map_def {
+	unsigned int type;
+	unsigned int key_size;
+	unsigned int value_size;
+	unsigned int max_entries;
+	unsigned int map_flags;
+	unsigned int inner_map_idx;
+};
+
+struct bpf_map_data {
+	int fd;
+	char *name;
+	size_t elf_offset;
+	struct bpf_map_def def;
+};
+
+typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx);
+
extern int prog_fd[MAX_PROGS];
extern int event_fd[MAX_PROGS];
extern char bpf_log_buf[BPF_LOG_BUF_SIZE];
extern int prog_cnt;

+/* There is a one-to-one mapping between map_fd[] and map_data[].
+ * The map_data[] just carries richer info about the given map.
+ */
+extern int map_fd[MAX_MAPS];
+extern struct bpf_map_data map_data[MAX_MAPS];
+extern int map_data_count;
+
/* parses elf file compiled by llvm .c->.o
 * . parses 'maps' section and creates maps via BPF syscall
 * . parses 'license' section and passes it to syscall
@@ -25,6 +49,7 @@ extern int prog_cnt;
 * returns zero on success
 */
int load_bpf_file(char *path);
+int load_bpf_file_fixup_map(const char *path, fixup_map_cb fixup_map);
void read_trace_pipe(void);

struct ksym {
@@ -34,5 +59,5 @@ struct ksym {
int load_kallsyms(void);
struct ksym *ksym_search(long key);

-int set_link_xdp_fd(int ifindex, int fd);
+int set_link_xdp_fd(int ifindex, int fd, __u32 flags);
#endif
diff --git a/samples/bpf/cookie_uid_helper_example.c b/samples/bpf/cookie_uid_helper_example.c
new file mode 100644
index 0000000000000000000000000000000000000000..b08ab4e889293c30f64b717065e6860603063f7c
--- /dev/null
+++ b/samples/bpf/cookie_uid_helper_example.c
@@ -0,0 +1,321 @@
+/* This test is a demo of using the get_socket_uid and get_socket_cookie
+ * helper functions to do per-socket based network traffic monitoring.
+ * It requires iptables version 1.6.1 or higher to load a pinned eBPF
+ * program into the xt_bpf match.
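+ * The rule installed by this test (see prog_attach_iptables() below) has
+ * the form, with PIN_PATH being the pin location passed to the program:
+ *   iptables -A OUTPUT -m bpf --object-pinned PIN_PATH -j ACCEPT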
+ *
+ * TEST:
+ * ./run_cookie_uid_helper_example.sh -option
+ * option:
+ *	-t: do the traffic monitoring test; the program will continuously
+ * print out the network traffic that happens after the program starts.
+ * A sample output is shown below:
+ *
+ * cookie: 877, uid: 0x3e8, Packet Count: 20, Bytes Count: 11058
+ * cookie: 132, uid: 0x0, Packet Count: 2, Bytes Count: 286
+ * cookie: 812, uid: 0x3e8, Packet Count: 3, Bytes Count: 1726
+ * cookie: 802, uid: 0x3e8, Packet Count: 2, Bytes Count: 104
+ * cookie: 877, uid: 0x3e8, Packet Count: 20, Bytes Count: 11058
+ * cookie: 831, uid: 0x3e8, Packet Count: 2, Bytes Count: 104
+ * cookie: 0, uid: 0x0, Packet Count: 6, Bytes Count: 712
+ * cookie: 880, uid: 0xfffe, Packet Count: 1, Bytes Count: 70
+ *
+ *	-s: do the getsockopt SO_COOKIE test; the program will set up a pair
+ * of UDP sockets, send packets between them, and read the traffic data
+ * directly out of the eBPF map based on the socket cookie.
+ *
+ * Clean up: if run via the shell script, the script deletes the iptables
+ * rule and unmounts the bpf program on exit. Otherwise the iptables rule
+ * needs to be deleted by hand; see run_cookie_uid_helper_example.sh for
+ * details.
+ */

+#define _GNU_SOURCE

+#define offsetof(type, member)	__builtin_offsetof(type, member)
+#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))

+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "libbpf.h"

+#define PORT 8888

+struct stats {
+	uint32_t uid;
+	uint64_t packets;
+	uint64_t bytes;
+};
+
+static int map_fd, prog_fd;
+
+static bool test_finish;
+
+static void maps_create(void)
+{
+	map_fd = bpf_create_map(BPF_MAP_TYPE_HASH, sizeof(uint32_t),
+			sizeof(struct stats), 100, 0);
+	if (map_fd < 0)
+		error(1, errno, "map create failed!\n");
+}
+
+static void prog_load(void)
+{
+	static char log_buf[1 << 16];
+
+	struct bpf_insn prog[] = {
+		/*
+		 * Save sk_buff for future use. Values stored in the
+		 * callee-saved registers R6 to R9 will not be reset by
+		 * a bpf helper function call.
+		 */
+		BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
+		/*
+		 * pc1: BPF_FUNC_get_socket_cookie takes one parameter,
+		 * R1: sk_buff
+		 */
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				BPF_FUNC_get_socket_cookie),
+		/* pc2-4: save &socket_cookie to r7 for future use */
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0, -8),
+		BPF_MOV64_REG(BPF_REG_7, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_7, -8),
+		/*
+		 * pc5-8: set up the registers for BPF_FUNC_map_lookup_elem,
+		 * it takes two parameters (R1: map_fd, R2: &socket_cookie)
+		 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				BPF_FUNC_map_lookup_elem),
+		/*
+		 * pc9: if r0 != 0x0, go to pc+14, since the cookie is
+		 * stored already.
+		 * Otherwise do pc10-22 to set up a new data entry.
+		 */
+		BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 14),
+		BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				BPF_FUNC_get_socket_uid),
+		/*
+		 * Place a struct stats on the stack (addressed via R10) and
+		 * fill in its members one by one. The packets counter is
+		 * initialized by storing the immediate value 1 directly to
+		 * the stack.
+		 */
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_0,
+				-32 + (__s16)offsetof(struct stats, uid)),
+		BPF_ST_MEM(BPF_DW, BPF_REG_10,
+				-32 + (__s16)offsetof(struct stats, packets), 1),
+		/*
+		 * __sk_buff is a special struct that lets an eBPF program
+		 * directly access some sk_buff fields.
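+		 * The two instructions that follow are the raw form of what
+		 * clang would emit for the restricted-C statement
+		 * `stats.bytes = skb->len;` (a reading aid in eBPF
+		 * pseudo-assembly, not part of the program itself):
+		 *   r1 = *(u32 *)(r6 + offsetof(struct __sk_buff, len));
+		 *   *(u64 *)(r10 - 32 + offsetof(struct stats, bytes)) = r1;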
+		 */
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+				offsetof(struct __sk_buff, len)),
+		BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_1,
+				-32 + (__s16)offsetof(struct stats, bytes)),
+		/*
+		 * add a new map entry using BPF_FUNC_map_update_elem; it
+		 * takes 4 parameters (R1: map_fd, R2: &socket_cookie,
+		 * R3: &stats, R4: flags)
+		 */
+		BPF_LD_MAP_FD(BPF_REG_1, map_fd),
+		BPF_MOV64_REG(BPF_REG_2, BPF_REG_7),
+		BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
+		BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -32),
+		BPF_MOV64_IMM(BPF_REG_4, 0),
+		BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0,
+				BPF_FUNC_map_update_elem),
+		BPF_JMP_IMM(BPF_JA, 0, 0, 5),
+		/*
+		 * pc24-30: update the packet info of an existing data entry;
+		 * this can be done by writing directly through the pointer
+		 * instead of using the BPF_FUNC_map_update_elem helper
+		 * function
+		 */
+		BPF_MOV64_REG(BPF_REG_9, BPF_REG_0),
+		BPF_MOV64_IMM(BPF_REG_1, 1),
+		BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1,
+				offsetof(struct stats, packets)),
+		BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6,
+				offsetof(struct __sk_buff, len)),
+		BPF_STX_XADD(BPF_DW, BPF_REG_9, BPF_REG_1,
+				offsetof(struct stats, bytes)),
+		BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_6,
+				offsetof(struct __sk_buff, len)),
+		BPF_EXIT_INSN(),
+	};
+	prog_fd = bpf_load_program(BPF_PROG_TYPE_SOCKET_FILTER, prog,
+					ARRAY_SIZE(prog), "GPL", 0,
+					log_buf, sizeof(log_buf));
+	if (prog_fd < 0)
+		error(1, errno, "failed to load prog\n%s\n", log_buf);
+}
+
+static void prog_attach_iptables(char *file)
+{
+	int ret;
+	char rules[100];
+
+	if (bpf_obj_pin(prog_fd, file))
+		error(1, errno, "bpf_obj_pin");
+	if (strlen(file) > 50) {
+		printf("file path too long: %s\n", file);
+		exit(1);
+	}
+	sprintf(rules, "iptables -A OUTPUT -m bpf --object-pinned %s -j ACCEPT",
+		file);
+	ret = system(rules);
+	if (ret < 0) {
+		printf("iptables rule update failed: %d\n", WEXITSTATUS(ret));
+		exit(1);
+	}
+}
+
+static void print_table(void)
+{
+	struct stats curEntry;
+	uint32_t curN = UINT32_MAX;
+	uint32_t nextN;
+	int res;
+
+	while (bpf_map_get_next_key(map_fd, &curN, &nextN) > -1) {
+		curN = nextN;
+		res = bpf_map_lookup_elem(map_fd, &curN, &curEntry);
+		if (res < 0) {
+			error(1, errno, "failed to get entry value of key: %u\n",
+				curN);
+		} else {
+			printf("cookie: %u, uid: 0x%x, Packet Count: %lu,"
+				" Bytes Count: %lu\n", curN, curEntry.uid,
+				curEntry.packets, curEntry.bytes);
+		}
+	}
+}
+
+static void udp_client(void)
+{
+	struct sockaddr_in si_other = {0};
+	struct sockaddr_in si_me = {0};
+	struct stats dataEntry;
+	int s_rcv, s_send, i, recv_len;
+	char message = 'a';
+	char buf;
+	uint64_t cookie;
+	int res;
+	socklen_t cookie_len = sizeof(cookie);
+	socklen_t slen = sizeof(si_other);
+
+	s_rcv = socket(PF_INET, SOCK_DGRAM, 0);
+	if (s_rcv < 0)
+		error(1, errno, "rcv socket create failed!\n");
+	si_other.sin_family = AF_INET;
+	si_other.sin_port = htons(PORT);
+	if (inet_aton("127.0.0.1", &si_other.sin_addr) == 0)
+		error(1, errno, "inet_aton\n");
+	if (bind(s_rcv, (struct sockaddr *)&si_other, sizeof(si_other)) == -1)
+		error(1, errno, "bind\n");
+	s_send = socket(PF_INET, SOCK_DGRAM, 0);
+	if (s_send < 0)
+		error(1, errno, "send socket create failed!\n");
+	res = getsockopt(s_send, SOL_SOCKET, SO_COOKIE, &cookie, &cookie_len);
+	if (res < 0)
+		printf("get cookie failed: %s\n", strerror(errno));
+	res = bpf_map_lookup_elem(map_fd, &cookie, &dataEntry);
+	if (res != -1)
+		error(1, errno, "socket stat found while flow not active\n");
+	for (i = 0; i < 10; i++) {
+		res = sendto(s_send, &message, sizeof(message), 0,
+			     (struct sockaddr *)&si_other, slen);
+		if (res ==
-1)
+			error(1, errno, "send\n");
+		if (res != sizeof(message))
+			error(1, 0, "%uB != %luB\n", res, sizeof(message));
+		recv_len = recvfrom(s_rcv, &buf, sizeof(buf), 0,
+			     (struct sockaddr *)&si_me, &slen);
+		if (recv_len < 0)
+			error(1, errno, "receive\n");
+		res = memcmp(&(si_other.sin_addr), &(si_me.sin_addr),
+			   sizeof(si_me.sin_addr));
+		if (res != 0)
+			error(1, EFAULT, "sender addr error: %d\n", res);
+		printf("Message received: %c\n", buf);
+		res = bpf_map_lookup_elem(map_fd, &cookie, &dataEntry);
+		if (res < 0)
+			error(1, errno, "lookup sk stat failed, cookie: %lu\n",
+				cookie);
+		printf("cookie: %lu, uid: 0x%x, Packet Count: %lu,"
+			" Bytes Count: %lu\n\n", cookie, dataEntry.uid,
+			dataEntry.packets, dataEntry.bytes);
+	}
+	close(s_send);
+	close(s_rcv);
+}
+
+static int usage(void)
+{
+	printf("Usage: ./run_cookie_uid_helper_example.sh"
+		" bpfObjName -option\n"
+		"	-t	traffic monitor test\n"
+		"	-s	getsockopt cookie test\n");
+	return 1;
+}
+
+static void finish(int ret)
+{
+	test_finish = true;
+}
+
+int main(int argc, char *argv[])
+{
+	int opt;
+	bool cfg_test_traffic = false;
+	bool cfg_test_cookie = false;
+
+	if (argc != 3)
+		return usage();
+	while ((opt = getopt(argc, argv, "ts")) != -1) {
+		switch (opt) {
+		case 't':
+			cfg_test_traffic = true;
+			break;
+		case 's':
+			cfg_test_cookie = true;
+			break;
+
+		default:
+			printf("unknown option %c\n", opt);
+			usage();
+			return -1;
+		}
+	}
+	maps_create();
+	prog_load();
+	prog_attach_iptables(argv[2]);
+	if (cfg_test_traffic) {
+		if (signal(SIGINT, finish) == SIG_ERR)
+			error(1, errno, "register handler failed");
+		while (!test_finish) {
+			print_table();
+			printf("\n");
+			sleep(1);
+		};
+	} else if (cfg_test_cookie) {
+		udp_client();
+	}
+	close(prog_fd);
+	close(map_fd);
+	return 0;
+}
diff --git a/samples/bpf/libbpf.h b/samples/bpf/libbpf.h
index 3705fba453a005fb32f5dfb51dd5763f5f364ccf..8ab36a04c174a9c154594b7ef4fe2ef2c5c5dc5e 100644
--- a/samples/bpf/libbpf.h
+++ b/samples/bpf/libbpf.h
@@ -135,6 +135,16 @@ struct bpf_insn;
		.off   = OFF,					\
		.imm   = 0 })

+/* Atomic memory add, *(uint *)(dst_reg + off16) += src_reg */
+
+#define BPF_STX_XADD(SIZE, DST, SRC, OFF)			\
+	((struct bpf_insn) {					\
+		.code  = BPF_STX | BPF_SIZE(SIZE) | BPF_XADD,	\
+		.dst_reg = DST,					\
+		.src_reg = SRC,					\
+		.off   = OFF,					\
+		.imm   = 0 })
+
/* Memory store, *(uint *) (dst_reg + off16) = imm32 */

#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \
diff --git a/samples/bpf/map_perf_test_kern.c b/samples/bpf/map_perf_test_kern.c
index a91872a97742a6413c316548c66ab8349ba1aff0..245165817fbe63bd53ea9cf7f3a3ab6cf5e7a2fa 100644
--- a/samples/bpf/map_perf_test_kern.c
+++ b/samples/bpf/map_perf_test_kern.c
@@ -11,6 +11,7 @@
#include "bpf_helpers.h"

#define MAX_ENTRIES 1000
+#define MAX_NR_CPUS 1024

struct bpf_map_def SEC("maps") hash_map = {
	.type = BPF_MAP_TYPE_HASH,
@@ -26,7 +27,7 @@ struct bpf_map_def SEC("maps") lru_hash_map = {
	.max_entries = 10000,
};

-struct bpf_map_def SEC("maps") percpu_lru_hash_map = {
+struct bpf_map_def SEC("maps") nocommon_lru_hash_map = {
	.type = BPF_MAP_TYPE_LRU_HASH,
	.key_size = sizeof(u32),
	.value_size = sizeof(long),
@@ -34,6 +35,19 @@ struct bpf_map_def SEC("maps") percpu_lru_hash_map = {
	.map_flags = BPF_F_NO_COMMON_LRU,
};

+struct bpf_map_def SEC("maps") inner_lru_hash_map = {
+	.type = BPF_MAP_TYPE_LRU_HASH,
+	.key_size = sizeof(u32),
+	.value_size = sizeof(long),
+	.max_entries = MAX_ENTRIES,
+};
+
+struct bpf_map_def SEC("maps") array_of_lru_hashs = {
+	.type = BPF_MAP_TYPE_ARRAY_OF_MAPS,
+	.key_size = sizeof(u32),
+	.max_entries = MAX_NR_CPUS,
+}; + struct bpf_map_def SEC("maps") percpu_hash_map = { .type = BPF_MAP_TYPE_PERCPU_HASH, .key_size = sizeof(u32), @@ -65,6 +79,13 @@ struct bpf_map_def SEC("maps") lpm_trie_map_alloc = { .map_flags = BPF_F_NO_PREALLOC, }; +struct bpf_map_def SEC("maps") array_map = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(long), + .max_entries = MAX_ENTRIES, +}; + SEC("kprobe/sys_getuid") int stress_hmap(struct pt_regs *ctx) { @@ -93,6 +114,7 @@ int stress_percpu_hmap(struct pt_regs *ctx) bpf_map_delete_elem(&percpu_hash_map, &key); return 0; } + SEC("kprobe/sys_getgid") int stress_hmap_alloc(struct pt_regs *ctx) { @@ -121,24 +143,56 @@ int stress_percpu_hmap_alloc(struct pt_regs *ctx) return 0; } -SEC("kprobe/sys_getpid") +SEC("kprobe/sys_connect") int stress_lru_hmap_alloc(struct pt_regs *ctx) { - u32 key = bpf_get_prandom_u32(); + struct sockaddr_in6 *in6; + u16 test_case, dst6[8]; + int addrlen, ret; + char fmt[] = "Failed at stress_lru_hmap_alloc. ret:%d\n"; long val = 1; + u32 key = bpf_get_prandom_u32(); - bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY); + in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx); + addrlen = (int)PT_REGS_PARM3(ctx); - return 0; -} + if (addrlen != sizeof(*in6)) + return 0; -SEC("kprobe/sys_getppid") -int stress_percpu_lru_hmap_alloc(struct pt_regs *ctx) -{ - u32 key = bpf_get_prandom_u32(); - long val = 1; + ret = bpf_probe_read(dst6, sizeof(dst6), &in6->sin6_addr); + if (ret) + goto done; - bpf_map_update_elem(&percpu_lru_hash_map, &key, &val, BPF_ANY); + if (dst6[0] != 0xdead || dst6[1] != 0xbeef) + return 0; + + test_case = dst6[7]; + + if (test_case == 0) { + ret = bpf_map_update_elem(&lru_hash_map, &key, &val, BPF_ANY); + } else if (test_case == 1) { + ret = bpf_map_update_elem(&nocommon_lru_hash_map, &key, &val, + BPF_ANY); + } else if (test_case == 2) { + void *nolocal_lru_map; + int cpu = bpf_get_smp_processor_id(); + + nolocal_lru_map = bpf_map_lookup_elem(&array_of_lru_hashs, + &cpu); + if (!nolocal_lru_map) { + ret = -ENOENT; + goto done; + } + + ret = bpf_map_update_elem(nolocal_lru_map, &key, &val, + BPF_ANY); + } else { + ret = -EINVAL; + } + +done: + if (ret) + bpf_trace_printk(fmt, sizeof(fmt), ret); return 0; } @@ -165,5 +219,31 @@ int stress_lpm_trie_map_alloc(struct pt_regs *ctx) return 0; } +SEC("kprobe/sys_getpgid") +int stress_hash_map_lookup(struct pt_regs *ctx) +{ + u32 key = 1, i; + long *value; + +#pragma clang loop unroll(full) + for (i = 0; i < 64; ++i) + value = bpf_map_lookup_elem(&hash_map, &key); + + return 0; +} + +SEC("kprobe/sys_getpgrp") +int stress_array_map_lookup(struct pt_regs *ctx) +{ + u32 key = 1, i; + long *value; + +#pragma clang loop unroll(full) + for (i = 0; i < 64; ++i) + value = bpf_map_lookup_elem(&array_map, &key); + + return 0; +} + char _license[] SEC("license") = "GPL"; u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index 680260a91f50c893dd26a1b968cc220a299c530f..1a8894b5ac5147c36833ffae500ddfbeb00277d1 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -18,10 +18,14 @@ #include #include #include +#include +#include + #include "libbpf.h" #include "bpf_load.h" -#define MAX_CNT 1000000 +#define TEST_BIT(t) (1U << (t)) +#define MAX_NR_CPUS 1024 static __u64 time_get_ns(void) { @@ -31,15 +35,44 @@ static __u64 time_get_ns(void) return ts.tv_sec * 1000000000ull + ts.tv_nsec; } -#define HASH_PREALLOC (1 << 0) -#define PERCPU_HASH_PREALLOC (1 << 1) -#define 
HASH_KMALLOC (1 << 2) -#define PERCPU_HASH_KMALLOC (1 << 3) -#define LRU_HASH_PREALLOC (1 << 4) -#define PERCPU_LRU_HASH_PREALLOC (1 << 5) -#define LPM_KMALLOC (1 << 6) +enum test_type { + HASH_PREALLOC, + PERCPU_HASH_PREALLOC, + HASH_KMALLOC, + PERCPU_HASH_KMALLOC, + LRU_HASH_PREALLOC, + NOCOMMON_LRU_HASH_PREALLOC, + LPM_KMALLOC, + HASH_LOOKUP, + ARRAY_LOOKUP, + INNER_LRU_HASH_PREALLOC, + NR_TESTS, +}; + +const char *test_map_names[NR_TESTS] = { + [HASH_PREALLOC] = "hash_map", + [PERCPU_HASH_PREALLOC] = "percpu_hash_map", + [HASH_KMALLOC] = "hash_map_alloc", + [PERCPU_HASH_KMALLOC] = "percpu_hash_map_alloc", + [LRU_HASH_PREALLOC] = "lru_hash_map", + [NOCOMMON_LRU_HASH_PREALLOC] = "nocommon_lru_hash_map", + [LPM_KMALLOC] = "lpm_trie_map_alloc", + [HASH_LOOKUP] = "hash_map", + [ARRAY_LOOKUP] = "array_map", + [INNER_LRU_HASH_PREALLOC] = "inner_lru_hash_map", +}; static int test_flags = ~0; +static uint32_t num_map_entries; +static uint32_t inner_lru_hash_size; +static int inner_lru_hash_idx = -1; +static int array_of_lru_hashs_idx = -1; +static uint32_t max_cnt = 1000000; + +static int check_test_flags(enum test_type t) +{ + return test_flags & TEST_BIT(t); +} static void test_hash_prealloc(int cpu) { @@ -47,34 +80,89 @@ static void test_hash_prealloc(int cpu) int i; start_time = time_get_ns(); - for (i = 0; i < MAX_CNT; i++) + for (i = 0; i < max_cnt; i++) syscall(__NR_getuid); printf("%d:hash_map_perf pre-alloc %lld events per sec\n", - cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); + cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); } -static void test_lru_hash_prealloc(int cpu) +static void do_test_lru(enum test_type test, int cpu) { + static int inner_lru_map_fds[MAX_NR_CPUS]; + + struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 }; + const char *test_name; __u64 start_time; - int i; + int i, ret; + + if (test == INNER_LRU_HASH_PREALLOC) { + int outer_fd = map_fd[array_of_lru_hashs_idx]; + + assert(cpu < MAX_NR_CPUS); + + if (cpu) { + inner_lru_map_fds[cpu] = + bpf_create_map(BPF_MAP_TYPE_LRU_HASH, + sizeof(uint32_t), sizeof(long), + inner_lru_hash_size, 0); + if (inner_lru_map_fds[cpu] == -1) { + printf("cannot create BPF_MAP_TYPE_LRU_HASH %s(%d)\n", + strerror(errno), errno); + exit(1); + } + } else { + inner_lru_map_fds[cpu] = map_fd[inner_lru_hash_idx]; + } + + ret = bpf_map_update_elem(outer_fd, &cpu, + &inner_lru_map_fds[cpu], + BPF_ANY); + if (ret) { + printf("cannot update ARRAY_OF_LRU_HASHS with key:%u. 
%s(%d)\n", + cpu, strerror(errno), errno); + exit(1); + } + } + + in6.sin6_addr.s6_addr16[0] = 0xdead; + in6.sin6_addr.s6_addr16[1] = 0xbeef; + + if (test == LRU_HASH_PREALLOC) { + test_name = "lru_hash_map_perf"; + in6.sin6_addr.s6_addr16[7] = 0; + } else if (test == NOCOMMON_LRU_HASH_PREALLOC) { + test_name = "nocommon_lru_hash_map_perf"; + in6.sin6_addr.s6_addr16[7] = 1; + } else if (test == INNER_LRU_HASH_PREALLOC) { + test_name = "inner_lru_hash_map_perf"; + in6.sin6_addr.s6_addr16[7] = 2; + } else { + assert(0); + } start_time = time_get_ns(); - for (i = 0; i < MAX_CNT; i++) - syscall(__NR_getpid); - printf("%d:lru_hash_map_perf pre-alloc %lld events per sec\n", - cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); + for (i = 0; i < max_cnt; i++) { + ret = connect(-1, (const struct sockaddr *)&in6, sizeof(in6)); + assert(ret == -1 && errno == EBADF); + } + printf("%d:%s pre-alloc %lld events per sec\n", + cpu, test_name, + max_cnt * 1000000000ll / (time_get_ns() - start_time)); +} + +static void test_lru_hash_prealloc(int cpu) +{ + do_test_lru(LRU_HASH_PREALLOC, cpu); } -static void test_percpu_lru_hash_prealloc(int cpu) +static void test_nocommon_lru_hash_prealloc(int cpu) { - __u64 start_time; - int i; + do_test_lru(NOCOMMON_LRU_HASH_PREALLOC, cpu); +} - start_time = time_get_ns(); - for (i = 0; i < MAX_CNT; i++) - syscall(__NR_getppid); - printf("%d:lru_hash_map_perf pre-alloc %lld events per sec\n", - cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); +static void test_inner_lru_hash_prealloc(int cpu) +{ + do_test_lru(INNER_LRU_HASH_PREALLOC, cpu); } static void test_percpu_hash_prealloc(int cpu) @@ -83,10 +171,10 @@ static void test_percpu_hash_prealloc(int cpu) int i; start_time = time_get_ns(); - for (i = 0; i < MAX_CNT; i++) + for (i = 0; i < max_cnt; i++) syscall(__NR_geteuid); printf("%d:percpu_hash_map_perf pre-alloc %lld events per sec\n", - cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); + cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); } static void test_hash_kmalloc(int cpu) @@ -95,10 +183,10 @@ static void test_hash_kmalloc(int cpu) int i; start_time = time_get_ns(); - for (i = 0; i < MAX_CNT; i++) + for (i = 0; i < max_cnt; i++) syscall(__NR_getgid); printf("%d:hash_map_perf kmalloc %lld events per sec\n", - cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); + cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); } static void test_percpu_hash_kmalloc(int cpu) @@ -107,10 +195,10 @@ static void test_percpu_hash_kmalloc(int cpu) int i; start_time = time_get_ns(); - for (i = 0; i < MAX_CNT; i++) + for (i = 0; i < max_cnt; i++) syscall(__NR_getegid); printf("%d:percpu_hash_map_perf kmalloc %lld events per sec\n", - cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); + cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); } static void test_lpm_kmalloc(int cpu) @@ -119,40 +207,63 @@ static void test_lpm_kmalloc(int cpu) int i; start_time = time_get_ns(); - for (i = 0; i < MAX_CNT; i++) + for (i = 0; i < max_cnt; i++) syscall(__NR_gettid); printf("%d:lpm_perf kmalloc %lld events per sec\n", - cpu, MAX_CNT * 1000000000ll / (time_get_ns() - start_time)); + cpu, max_cnt * 1000000000ll / (time_get_ns() - start_time)); } -static void loop(int cpu) +static void test_hash_lookup(int cpu) { - cpu_set_t cpuset; - - CPU_ZERO(&cpuset); - CPU_SET(cpu, &cpuset); - sched_setaffinity(0, sizeof(cpuset), &cpuset); + __u64 start_time; + int i; - if (test_flags & HASH_PREALLOC) - test_hash_prealloc(cpu); + 
start_time = time_get_ns(); + for (i = 0; i < max_cnt; i++) + syscall(__NR_getpgid, 0); + printf("%d:hash_lookup %lld lookups per sec\n", + cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time)); +} - if (test_flags & PERCPU_HASH_PREALLOC) - test_percpu_hash_prealloc(cpu); +static void test_array_lookup(int cpu) +{ + __u64 start_time; + int i; - if (test_flags & HASH_KMALLOC) - test_hash_kmalloc(cpu); + start_time = time_get_ns(); + for (i = 0; i < max_cnt; i++) + syscall(__NR_getpgrp, 0); + printf("%d:array_lookup %lld lookups per sec\n", + cpu, max_cnt * 1000000000ll * 64 / (time_get_ns() - start_time)); +} - if (test_flags & PERCPU_HASH_KMALLOC) - test_percpu_hash_kmalloc(cpu); +typedef void (*test_func)(int cpu); +const test_func test_funcs[] = { + [HASH_PREALLOC] = test_hash_prealloc, + [PERCPU_HASH_PREALLOC] = test_percpu_hash_prealloc, + [HASH_KMALLOC] = test_hash_kmalloc, + [PERCPU_HASH_KMALLOC] = test_percpu_hash_kmalloc, + [LRU_HASH_PREALLOC] = test_lru_hash_prealloc, + [NOCOMMON_LRU_HASH_PREALLOC] = test_nocommon_lru_hash_prealloc, + [LPM_KMALLOC] = test_lpm_kmalloc, + [HASH_LOOKUP] = test_hash_lookup, + [ARRAY_LOOKUP] = test_array_lookup, + [INNER_LRU_HASH_PREALLOC] = test_inner_lru_hash_prealloc, +}; - if (test_flags & LRU_HASH_PREALLOC) - test_lru_hash_prealloc(cpu); +static void loop(int cpu) +{ + cpu_set_t cpuset; + int i; - if (test_flags & PERCPU_LRU_HASH_PREALLOC) - test_percpu_lru_hash_prealloc(cpu); + CPU_ZERO(&cpuset); + CPU_SET(cpu, &cpuset); + sched_setaffinity(0, sizeof(cpuset), &cpuset); - if (test_flags & LPM_KMALLOC) - test_lpm_kmalloc(cpu); + for (i = 0; i < NR_TESTS; i++) { + if (check_test_flags(i)) + test_funcs[i](cpu); + } } static void run_perf_test(int tasks) @@ -209,6 +320,38 @@ static void fill_lpm_trie(void) assert(!r); } +static void fixup_map(struct bpf_map_data *map, int idx) +{ + int i; + + if (!strcmp("inner_lru_hash_map", map->name)) { + inner_lru_hash_idx = idx; + inner_lru_hash_size = map->def.max_entries; + } + + if (!strcmp("array_of_lru_hashs", map->name)) { + if (inner_lru_hash_idx == -1) { + printf("inner_lru_hash_map must be defined before array_of_lru_hashs\n"); + exit(1); + } + map->def.inner_map_idx = inner_lru_hash_idx; + array_of_lru_hashs_idx = idx; + } + + if (num_map_entries <= 0) + return; + + inner_lru_hash_size = num_map_entries; + + /* Only change the max_entries for the enabled test(s) */ + for (i = 0; i < NR_TESTS; i++) { + if (!strcmp(test_map_names[i], map->name) && + (check_test_flags(i))) { + map->def.max_entries = num_map_entries; + } + } +} + int main(int argc, char **argv) { struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; @@ -224,7 +367,13 @@ int main(int argc, char **argv) if (argc > 2) num_cpu = atoi(argv[2]) ? : num_cpu; - if (load_bpf_file(filename)) { + if (argc > 3) + num_map_entries = atoi(argv[3]); + + if (argc > 4) + max_cnt = atoi(argv[4]); + + if (load_bpf_file_fixup_map(filename, fixup_map)) { printf("%s", bpf_log_buf); return 1; } diff --git a/samples/bpf/run_cookie_uid_helper_example.sh b/samples/bpf/run_cookie_uid_helper_example.sh new file mode 100755 index 0000000000000000000000000000000000000000..f898cfa2b1aa52f71a65db469450d4d001b08b2c --- /dev/null +++ b/samples/bpf/run_cookie_uid_helper_example.sh @@ -0,0 +1,14 @@ +#!/bin/bash +local_dir="$(pwd)" +root_dir=$local_dir/../.. 
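+# What this script automates, expressed as a manual sequence (the mount
+# point shown is illustrative; the script uses a mktemp directory instead):
+#	mount -t bpf bpf /sys/fs/bpf
+#	./per_socket_stats_example /sys/fs/bpf/bpf_prog -t
+#	iptables -D OUTPUT -m bpf --object-pinned /sys/fs/bpf/bpf_prog -j ACCEPT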
+mnt_dir=$(mktemp -d --tmp) + +on_exit() { + iptables -D OUTPUT -m bpf --object-pinned ${mnt_dir}/bpf_prog -j ACCEPT + umount ${mnt_dir} + rm -r ${mnt_dir} +} + +trap on_exit EXIT +mount -t bpf bpf ${mnt_dir} +./per_socket_stats_example ${mnt_dir}/bpf_prog $1 diff --git a/samples/bpf/test_lru_dist.c b/samples/bpf/test_lru_dist.c index d96dc88d3b04067135fe80c95628fca37a29c022..73c35714226842b248027b214989e56570e4378f 100644 --- a/samples/bpf/test_lru_dist.c +++ b/samples/bpf/test_lru_dist.c @@ -25,7 +25,9 @@ #include "bpf_util.h" #define min(a, b) ((a) < (b) ? (a) : (b)) -#define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) +#ifndef offsetof +# define offsetof(TYPE, MEMBER) ((size_t)&((TYPE *)0)->MEMBER) +#endif #define container_of(ptr, type, member) ({ \ const typeof( ((type *)0)->member ) *__mptr = (ptr); \ (type *)( (char *)__mptr - offsetof(type,member) );}) diff --git a/samples/bpf/test_map_in_map_kern.c b/samples/bpf/test_map_in_map_kern.c new file mode 100644 index 0000000000000000000000000000000000000000..42c44d091dd176b87bb92d1aeb849b1f8ea8cdd6 --- /dev/null +++ b/samples/bpf/test_map_in_map_kern.c @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#define KBUILD_MODNAME "foo" +#include +#include +#include +#include +#include "bpf_helpers.h" + +#define MAX_NR_PORTS 65536 + +/* map #0 */ +struct bpf_map_def SEC("maps") port_a = { + .type = BPF_MAP_TYPE_ARRAY, + .key_size = sizeof(u32), + .value_size = sizeof(int), + .max_entries = MAX_NR_PORTS, +}; + +/* map #1 */ +struct bpf_map_def SEC("maps") port_h = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(int), + .max_entries = 1, +}; + +/* map #2 */ +struct bpf_map_def SEC("maps") reg_result_h = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(int), + .max_entries = 1, +}; + +/* map #3 */ +struct bpf_map_def SEC("maps") inline_result_h = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(u32), + .value_size = sizeof(int), + .max_entries = 1, +}; + +/* map #4 */ /* Test case #0 */ +struct bpf_map_def SEC("maps") a_of_port_a = { + .type = BPF_MAP_TYPE_ARRAY_OF_MAPS, + .key_size = sizeof(u32), + .inner_map_idx = 0, /* map_fd[0] is port_a */ + .max_entries = MAX_NR_PORTS, +}; + +/* map #5 */ /* Test case #1 */ +struct bpf_map_def SEC("maps") h_of_port_a = { + .type = BPF_MAP_TYPE_HASH_OF_MAPS, + .key_size = sizeof(u32), + .inner_map_idx = 0, /* map_fd[0] is port_a */ + .max_entries = 1, +}; + +/* map #6 */ /* Test case #2 */ +struct bpf_map_def SEC("maps") h_of_port_h = { + .type = BPF_MAP_TYPE_HASH_OF_MAPS, + .key_size = sizeof(u32), + .inner_map_idx = 1, /* map_fd[1] is port_h */ + .max_entries = 1, +}; + +static __always_inline int do_reg_lookup(void *inner_map, u32 port) +{ + int *result; + + result = bpf_map_lookup_elem(inner_map, &port); + return result ? *result : -ENOENT; +} + +static __always_inline int do_inline_array_lookup(void *inner_map, u32 port) +{ + int *result; + + if (inner_map != &port_a) + return -EINVAL; + + result = bpf_map_lookup_elem(&port_a, &port); + return result ? *result : -ENOENT; +} + +static __always_inline int do_inline_hash_lookup(void *inner_map, u32 port) +{ + int *result; + + if (inner_map != &port_h) + return -EINVAL; + + result = bpf_map_lookup_elem(&port_h, &port); + return result ? 
*result : -ENOENT; +} + +SEC("kprobe/sys_connect") +int trace_sys_connect(struct pt_regs *ctx) +{ + struct sockaddr_in6 *in6; + u16 test_case, port, dst6[8]; + int addrlen, ret, inline_ret, ret_key = 0; + u32 port_key; + void *outer_map, *inner_map; + bool inline_hash = false; + + in6 = (struct sockaddr_in6 *)PT_REGS_PARM2(ctx); + addrlen = (int)PT_REGS_PARM3(ctx); + + if (addrlen != sizeof(*in6)) + return 0; + + ret = bpf_probe_read(dst6, sizeof(dst6), &in6->sin6_addr); + if (ret) { + inline_ret = ret; + goto done; + } + + if (dst6[0] != 0xdead || dst6[1] != 0xbeef) + return 0; + + test_case = dst6[7]; + + ret = bpf_probe_read(&port, sizeof(port), &in6->sin6_port); + if (ret) { + inline_ret = ret; + goto done; + } + + port_key = port; + + ret = -ENOENT; + if (test_case == 0) { + outer_map = &a_of_port_a; + } else if (test_case == 1) { + outer_map = &h_of_port_a; + } else if (test_case == 2) { + outer_map = &h_of_port_h; + } else { + ret = __LINE__; + inline_ret = ret; + goto done; + } + + inner_map = bpf_map_lookup_elem(outer_map, &port_key); + if (!inner_map) { + ret = __LINE__; + inline_ret = ret; + goto done; + } + + ret = do_reg_lookup(inner_map, port_key); + + if (test_case == 0 || test_case == 1) + inline_ret = do_inline_array_lookup(inner_map, port_key); + else + inline_ret = do_inline_hash_lookup(inner_map, port_key); + +done: + bpf_map_update_elem(®_result_h, &ret_key, &ret, BPF_ANY); + bpf_map_update_elem(&inline_result_h, &ret_key, &inline_ret, BPF_ANY); + + return 0; +} + +char _license[] SEC("license") = "GPL"; +u32 _version SEC("version") = LINUX_VERSION_CODE; diff --git a/samples/bpf/test_map_in_map_user.c b/samples/bpf/test_map_in_map_user.c new file mode 100644 index 0000000000000000000000000000000000000000..f62fdc2bd4281005f959cdb85cf1aafe2dc08193 --- /dev/null +++ b/samples/bpf/test_map_in_map_user.c @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. 
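+ *
+ * Usage sketch (the binary name is assumed from the samples convention
+ * that this file builds to test_map_in_map and loads
+ * test_map_in_map_kern.o): run ./test_map_in_map with no arguments; it
+ * seeds the port maps, then connect()s to a dead:beef IPv6 address whose
+ * last 16 bits select the test case, so the sys_connect kprobe exercises
+ * each map-in-map lookup path.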
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include "libbpf.h" +#include "bpf_load.h" + +#define PORT_A (map_fd[0]) +#define PORT_H (map_fd[1]) +#define REG_RESULT_H (map_fd[2]) +#define INLINE_RESULT_H (map_fd[3]) +#define A_OF_PORT_A (map_fd[4]) /* Test case #0 */ +#define H_OF_PORT_A (map_fd[5]) /* Test case #1 */ +#define H_OF_PORT_H (map_fd[6]) /* Test case #2 */ + +static const char * const test_names[] = { + "Array of Array", + "Hash of Array", + "Hash of Hash", +}; + +#define NR_TESTS (sizeof(test_names) / sizeof(*test_names)) + +static void populate_map(uint32_t port_key, int magic_result) +{ + int ret; + + ret = bpf_map_update_elem(PORT_A, &port_key, &magic_result, BPF_ANY); + assert(!ret); + + ret = bpf_map_update_elem(PORT_H, &port_key, &magic_result, + BPF_NOEXIST); + assert(!ret); + + ret = bpf_map_update_elem(A_OF_PORT_A, &port_key, &PORT_A, BPF_ANY); + assert(!ret); + + ret = bpf_map_update_elem(H_OF_PORT_A, &port_key, &PORT_A, BPF_NOEXIST); + assert(!ret); + + ret = bpf_map_update_elem(H_OF_PORT_H, &port_key, &PORT_H, BPF_NOEXIST); + assert(!ret); +} + +static void test_map_in_map(void) +{ + struct sockaddr_in6 in6 = { .sin6_family = AF_INET6 }; + uint32_t result_key = 0, port_key; + int result, inline_result; + int magic_result = 0xfaceb00c; + int ret; + int i; + + port_key = rand() & 0x00FF; + populate_map(port_key, magic_result); + + in6.sin6_addr.s6_addr16[0] = 0xdead; + in6.sin6_addr.s6_addr16[1] = 0xbeef; + in6.sin6_port = port_key; + + for (i = 0; i < NR_TESTS; i++) { + printf("%s: ", test_names[i]); + + in6.sin6_addr.s6_addr16[7] = i; + ret = connect(-1, (struct sockaddr *)&in6, sizeof(in6)); + assert(ret == -1 && errno == EBADF); + + ret = bpf_map_lookup_elem(REG_RESULT_H, &result_key, &result); + assert(!ret); + + ret = bpf_map_lookup_elem(INLINE_RESULT_H, &result_key, + &inline_result); + assert(!ret); + + if (result != magic_result || inline_result != magic_result) { + printf("Error. 
result:%d inline_result:%d\n", + result, inline_result); + exit(1); + } + + bpf_map_delete_elem(REG_RESULT_H, &result_key); + bpf_map_delete_elem(INLINE_RESULT_H, &result_key); + + printf("Pass\n"); + } +} + +int main(int argc, char **argv) +{ + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; + char filename[256]; + + assert(!setrlimit(RLIMIT_MEMLOCK, &r)); + + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + + if (load_bpf_file(filename)) { + printf("%s", bpf_log_buf); + return 1; + } + + test_map_in_map(); + + return 0; +} diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index ded9804c503418c9ed46b1ffa677455720e33a7b..7fee0f1ba9a313baf2d233b3bf705994c183755e 100644 --- a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -4,6 +4,7 @@ #include #include #include +#include #include "libbpf.h" #include "bpf_load.h" @@ -112,6 +113,7 @@ static void int_exit(int sig) int main(int ac, char **argv) { + struct rlimit r = {1024*1024, RLIM_INFINITY}; char filename[256]; long key, next_key, value; FILE *f; @@ -119,6 +121,11 @@ int main(int ac, char **argv) snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + signal(SIGINT, int_exit); /* start 'ping' in the background to have some kfree_skb events */ diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index 8f7d199d59459cf16520b62a0ef12dcb7e6f6c04..fe372239d5054c44a7dcb70f05530aa19eb3cfc9 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -11,6 +11,7 @@ #include #include #include +#include #include "libbpf.h" #include "bpf_load.h" @@ -112,11 +113,17 @@ static void print_hist(int fd) int main(int ac, char **argv) { + struct rlimit r = {1024*1024, RLIM_INFINITY}; char filename[256]; int i; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + if (load_bpf_file(filename)) { printf("%s", bpf_log_buf); return 1; diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c index 03449f773cb1f4da6b1e42c272cc1c83da36074f..22c644f1f4c378db2877e94977dcfdd5df87118a 100644 --- a/samples/bpf/tracex4_user.c +++ b/samples/bpf/tracex4_user.c @@ -12,6 +12,8 @@ #include #include #include +#include + #include "libbpf.h" #include "bpf_load.h" @@ -50,11 +52,17 @@ static void print_old_objects(int fd) int main(int ac, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; char filename[256]; int i; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); + return 1; + } + if (load_bpf_file(filename)) { printf("%s", bpf_log_buf); return 1; diff --git a/samples/bpf/xdp1_user.c b/samples/bpf/xdp1_user.c index d2be65d1fd86ed199909bc6c68327f7d31e6b35a..378850c70eb81afdbc5283611573f89ea4d8923e 100644 --- a/samples/bpf/xdp1_user.c +++ b/samples/bpf/xdp1_user.c @@ -5,6 +5,7 @@ * License as published by the Free Software Foundation. 
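 *
 * Note on the -S option introduced below: it sets XDP_FLAGS_SKB_MODE,
 * which attaches the program via the generic (skb) XDP path rather than
 * the driver's native hook. A usage sketch (the ifindex value is
 * illustrative): ./xdp1 -S 6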
*/ #include +#include #include #include #include @@ -12,16 +13,18 @@ #include #include #include +#include #include "bpf_load.h" #include "bpf_util.h" #include "libbpf.h" static int ifindex; +static __u32 xdp_flags; static void int_exit(int sig) { - set_link_xdp_fd(ifindex, -1); + set_link_xdp_fd(ifindex, -1, xdp_flags); exit(0); } @@ -54,18 +57,39 @@ static void poll_stats(int interval) } } -int main(int ac, char **argv) +static void usage(const char *prog) { - char filename[256]; + fprintf(stderr, + "usage: %s [OPTS] IFINDEX\n\n" + "OPTS:\n" + " -S use skb-mode\n", + prog); +} - snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); +int main(int argc, char **argv) +{ + const char *optstr = "S"; + char filename[256]; + int opt; + + while ((opt = getopt(argc, argv, optstr)) != -1) { + switch (opt) { + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; + default: + usage(basename(argv[0])); + return 1; + } + } - if (ac != 2) { - printf("usage: %s IFINDEX\n", argv[0]); + if (optind == argc) { + usage(basename(argv[0])); return 1; } + ifindex = strtoul(argv[optind], NULL, 0); - ifindex = strtoul(argv[1], NULL, 0); + snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); if (load_bpf_file(filename)) { printf("%s", bpf_log_buf); @@ -79,7 +103,7 @@ int main(int ac, char **argv) signal(SIGINT, int_exit); - if (set_link_xdp_fd(ifindex, prog_fd[0]) < 0) { + if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { printf("link set xdp fd failed\n"); return 1; } diff --git a/samples/bpf/xdp_tx_iptunnel_user.c b/samples/bpf/xdp_tx_iptunnel_user.c index 70e192fc61aa4b9a317a1e8b11df0d36034cc877..92b8bde9337c8cec6128e73d23d307b628ba5a38 100644 --- a/samples/bpf/xdp_tx_iptunnel_user.c +++ b/samples/bpf/xdp_tx_iptunnel_user.c @@ -5,6 +5,7 @@ * License as published by the Free Software Foundation. */ #include +#include #include #include #include @@ -24,11 +25,12 @@ #define STATS_INTERVAL_S 2U static int ifindex = -1; +static __u32 xdp_flags = 0; static void int_exit(int sig) { if (ifindex > -1) - set_link_xdp_fd(ifindex, -1); + set_link_xdp_fd(ifindex, -1, xdp_flags); exit(0); } @@ -136,7 +138,7 @@ int main(int argc, char **argv) { unsigned char opt_flags[256] = {}; unsigned int kill_after_s = 0; - const char *optstr = "i:a:p:s:d:m:T:P:h"; + const char *optstr = "i:a:p:s:d:m:T:P:Sh"; int min_port = 0, max_port = 0; struct iptnl_info tnl = {}; struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; @@ -201,6 +203,9 @@ int main(int argc, char **argv) case 'T': kill_after_s = atoi(optarg); break; + case 'S': + xdp_flags |= XDP_FLAGS_SKB_MODE; + break; default: usage(argv[0]); return 1; @@ -243,14 +248,14 @@ int main(int argc, char **argv) } } - if (set_link_xdp_fd(ifindex, prog_fd[0]) < 0) { + if (set_link_xdp_fd(ifindex, prog_fd[0], xdp_flags) < 0) { printf("link set xdp fd failed\n"); return 1; } poll_stats(kill_after_s); - set_link_xdp_fd(ifindex, -1); + set_link_xdp_fd(ifindex, -1, xdp_flags); return 0; } diff --git a/samples/livepatch/livepatch-sample.c b/samples/livepatch/livepatch-sample.c index e34f871e69b1ae60ebd8718dd1f0063a4db00e52..84795223f15f7e9611293f2812f68990c0cf29d0 100644 --- a/samples/livepatch/livepatch-sample.c +++ b/samples/livepatch/livepatch-sample.c @@ -17,6 +17,8 @@ * along with this program; if not, see . 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -69,6 +71,21 @@ static int livepatch_init(void) { int ret; + if (!klp_have_reliable_stack() && !patch.immediate) { + /* + * WARNING: Be very careful when using 'patch.immediate' in + * your patches. It's ok to use it for simple patches like + * this, but for more complex patches which change function + * semantics, locking semantics, or data structures, it may not + * be safe. Use of this option will also prevent removal of + * the patch. + * + * See Documentation/livepatch/livepatch.txt for more details. + */ + patch.immediate = true; + pr_notice("The consistency model isn't supported for your architecture. Bypassing safety mechanisms and applying the patch immediately.\n"); + } + ret = klp_register_patch(&patch); if (ret) return ret; @@ -82,7 +99,6 @@ static int livepatch_init(void) static void livepatch_exit(void) { - WARN_ON(klp_disable_patch(&patch)); WARN_ON(klp_unregister_patch(&patch)); } diff --git a/samples/mei/TODO b/samples/mei/TODO deleted file mode 100644 index 6b3625d3058cc16fbf341528f65a4e209a79100b..0000000000000000000000000000000000000000 --- a/samples/mei/TODO +++ /dev/null @@ -1,2 +0,0 @@ -TODO: - - Cleanup and split the timer function diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include index afe3fd3af1e40616857b3e6c425be632c1fa2667..61f87a99bf0a1c512e572d3cbdcf4b4b5d7ae785 100644 --- a/scripts/Kbuild.include +++ b/scripts/Kbuild.include @@ -116,12 +116,12 @@ CC_OPTION_CFLAGS = $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) # Usage: cflags-y += $(call cc-option,-march=winchip-c6,-march=i586) cc-option = $(call try-run,\ - $(CC) $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) + $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2)) # cc-option-yn # Usage: flag := $(call cc-option-yn,-march=winchip-c6) cc-option-yn = $(call try-run,\ - $(CC) $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",y,n) + $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) $(1) -c -x c /dev/null -o "$$TMP",y,n) # cc-option-align # Prefix align with either -falign or -malign @@ -131,7 +131,7 @@ cc-option-align = $(subst -functions=0,,\ # cc-disable-warning # Usage: cflags-y += $(call cc-disable-warning,unused-but-set-variable) cc-disable-warning = $(call try-run,\ - $(CC) $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) + $(CC) -Werror $(KBUILD_CPPFLAGS) $(CC_OPTION_CFLAGS) -W$(strip $(1)) -c -x c /dev/null -o "$$TMP",-Wno-$(strip $(1))) # cc-name # Expands to either gcc or clang diff --git a/scripts/Makefile.build b/scripts/Makefile.build index d883116ebaa452d9c2f6c657de53121ebd9d50bd..733e044fff8b37bae101788ac82be43fafa1a942 100644 --- a/scripts/Makefile.build +++ b/scripts/Makefile.build @@ -177,6 +177,14 @@ cmd_cc_symtypes_c = \ $(obj)/%.symtypes : $(src)/%.c FORCE $(call cmd,cc_symtypes_c) +# LLVM assembly +# Generate .ll files from .c +quiet_cmd_cc_ll_c = CC $(quiet_modtag) $@ + cmd_cc_ll_c = $(CC) $(c_flags) -emit-llvm -S -o $@ $< + +$(obj)/%.ll: $(src)/%.c FORCE + $(call if_changed_dep,cc_ll_c) + # C (.c) files # The C file is compiled and updated dependency information is generated. 
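# (For the %.ll rule above: assuming a clang toolchain, a single
#  translation unit can be inspected as LLVM IR with an invocation such
#  as `make CC=clang kernel/fork.ll`; the file chosen is illustrative.)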
# (See cmd_cc_o_c + relevant part of rule_cc_o_c) @@ -272,14 +280,14 @@ define rule_cc_o_c $(call echo-cmd,checksrc) $(cmd_checksrc) \ $(call cmd_and_fixdep,cc_o_c) \ $(cmd_modversions_c) \ - $(cmd_objtool) \ + $(call echo-cmd,objtool) $(cmd_objtool) \ $(call echo-cmd,record_mcount) $(cmd_record_mcount) endef define rule_as_o_S $(call cmd_and_fixdep,as_o_S) \ $(cmd_modversions_S) \ - $(cmd_objtool) + $(call echo-cmd,objtool) $(cmd_objtool) endef # List module undefined symbols (or empty line if not enabled) diff --git a/scripts/Makefile.dtbinst b/scripts/Makefile.dtbinst index a1be75d0a5fd3fbf4742e555046896ea6fa6fe65..34614a48b717eafa2c15f418e6e6769905e32ec1 100644 --- a/scripts/Makefile.dtbinst +++ b/scripts/Makefile.dtbinst @@ -20,12 +20,6 @@ include include/config/auto.conf include scripts/Kbuild.include include $(src)/Makefile -PHONY += __dtbs_install_prep -__dtbs_install_prep: -ifeq ("$(dtbinst-root)", "$(obj)") - $(Q)mkdir -p $(INSTALL_DTBS_PATH) -endif - dtbinst-files := $(dtb-y) dtbinst-dirs := $(dts-dirs) @@ -35,8 +29,6 @@ quiet_cmd_dtb_install = INSTALL $< install-dir = $(patsubst $(dtbinst-root)%,$(INSTALL_DTBS_PATH)%,$(obj)) -$(dtbinst-files) $(dtbinst-dirs): | __dtbs_install_prep - $(dtbinst-files): %.dtb: $(obj)/%.dtb $(call cmd,dtb_install,$(install-dir)) diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index 7c321a603b079d355bd127aebb04adc0294b4ba5..fb3522fd87029f8e841469d2bb300ef6eb7fe1cb 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -64,7 +64,6 @@ ifeq ($(cc-name),clang) KBUILD_CFLAGS += $(call cc-disable-warning, initializer-overrides) KBUILD_CFLAGS += $(call cc-disable-warning, unused-value) KBUILD_CFLAGS += $(call cc-disable-warning, format) -KBUILD_CFLAGS += $(call cc-disable-warning, unknown-warning-option) KBUILD_CFLAGS += $(call cc-disable-warning, sign-compare) KBUILD_CFLAGS += $(call cc-disable-warning, format-zero-length) KBUILD_CFLAGS += $(call cc-disable-warning, uninitialized) diff --git a/scripts/Makefile.headersinst b/scripts/Makefile.headersinst index 1106d6ca3a384baa81b077b599d83b8e9941108e..6ba97a1f9c5a26304abdf0f043211efe6c273ffc 100644 --- a/scripts/Makefile.headersinst +++ b/scripts/Makefile.headersinst @@ -1,20 +1,22 @@ # ========================================================================== # Installing headers # -# header-y - list files to be installed. They are preprocessed -# to remove __KERNEL__ section of the file -# genhdr-y - Same as header-y but in a generated/ directory +# All headers under include/uapi, include/generated/uapi, +# arch//include/uapi and arch//include/generated/uapi are +# exported. +# They are preprocessed to remove __KERNEL__ section of the file. 
#
# ==========================================================================

# generated header directory
gen := $(if $(gen),$(gen),$(subst include/,include/generated/,$(obj)))

+# Kbuild file is optional
kbuild-file := $(srctree)/$(obj)/Kbuild
-include $(kbuild-file)
+-include $(kbuild-file)

# caller may set destination dir (when installing to asm/)
-_dst := $(if $(destination-y),$(destination-y),$(if $(dst),$(dst),$(obj)))
+_dst := $(if $(dst),$(dst),$(obj))

old-kbuild-file := $(srctree)/$(subst uapi/,,$(obj))/Kbuild
ifneq ($(wildcard $(old-kbuild-file)),)
@@ -25,9 +27,14 @@ include scripts/Kbuild.include

installdir    := $(INSTALL_HDR_PATH)/$(subst uapi/,,$(_dst))

-header-y      := $(sort $(header-y))
-subdirs       := $(patsubst %/,%,$(filter %/, $(header-y)))
-header-y      := $(filter-out %/, $(header-y))
+srcdir        := $(srctree)/$(obj)
+gendir        := $(objtree)/$(gen)
+subdirs       := $(patsubst $(srcdir)/%/.,%,$(wildcard $(srcdir)/*/.))
+header-files  := $(notdir $(wildcard $(srcdir)/*.h))
+header-files  += $(notdir $(wildcard $(srcdir)/*.agh))
+header-files  := $(filter-out $(no-export-headers), $(header-files))
+genhdr-files  := $(notdir $(wildcard $(gendir)/*.h))
+genhdr-files  := $(filter-out $(header-files), $(genhdr-files))

# files used to track state of install/check
install-file  := $(installdir)/.install
@@ -35,36 +42,20 @@ check-file    := $(installdir)/.check

# generic-y lists all files an architecture uses from asm-generic
# Use this to build a list of headers which require a wrapper
-wrapper-files := $(filter $(header-y), $(generic-y))
-
-srcdir        := $(srctree)/$(obj)
-gendir        := $(objtree)/$(gen)
-
-oldsrcdir     := $(srctree)/$(subst /uapi,,$(obj))
+generic-files := $(notdir $(wildcard $(srctree)/include/uapi/asm-generic/*.h))
+wrapper-files := $(filter $(generic-files), $(generic-y))
+wrapper-files := $(filter-out $(header-files), $(wrapper-files))

# all header files for this dir
-header-y      := $(filter-out $(generic-y), $(header-y))
-all-files     := $(header-y) $(genhdr-y) $(wrapper-files)
+all-files     := $(header-files) $(genhdr-files) $(wrapper-files)
output-files  := $(addprefix $(installdir)/, $(all-files))

-input-files1  := $(foreach hdr, $(header-y), \
-		   $(if $(wildcard $(srcdir)/$(hdr)), \
-			$(wildcard $(srcdir)/$(hdr))) \
-		   )
-input-files1-name := $(notdir $(input-files1))
-input-files2  := $(foreach hdr, $(header-y), \
-		   $(if $(wildcard $(srcdir)/$(hdr)),, \
-			$(if $(wildcard $(oldsrcdir)/$(hdr)), \
-				$(wildcard $(oldsrcdir)/$(hdr)), \
-				$(error Missing UAPI file $(srcdir)/$(hdr))) \
-		   ))
-input-files2-name := $(notdir $(input-files2))
-input-files3  := $(foreach hdr, $(genhdr-y), \
-		   $(if $(wildcard $(gendir)/$(hdr)), \
-			$(wildcard $(gendir)/$(hdr)), \
-			$(error Missing generated UAPI file $(gendir)/$(hdr)) \
-		   ))
-input-files3-name := $(notdir $(input-files3))
+ifneq ($(mandatory-y),)
+missing       := $(filter-out $(all-files),$(mandatory-y))
+ifneq ($(missing),)
+$(error Some mandatory headers ($(missing)) are missing in $(obj))
+endif
+endif

# Work out what needs to be removed
oldheaders    := $(patsubst $(installdir)/%,%,$(wildcard $(installdir)/*.h))
@@ -78,9 +69,8 @@ printdir = $(patsubst $(INSTALL_HDR_PATH)/%/,%,$(dir $@))
quiet_cmd_install = INSTALL $(printdir) ($(words $(all-files))\
                    file$(if $(word 2, $(all-files)),s))
      cmd_install = \
-	$(CONFIG_SHELL) $< $(installdir) $(srcdir) $(input-files1-name); \
-	$(CONFIG_SHELL) $< $(installdir) $(oldsrcdir) $(input-files2-name); \
-	$(CONFIG_SHELL) $< $(installdir) $(gendir) $(input-files3-name); \
+	$(CONFIG_SHELL) $< $(installdir) $(srcdir) $(header-files); \
+	$(CONFIG_SHELL) $< $(installdir) $(gendir) $(genhdr-files); \
	for F in $(wrapper-files); do \
		echo "\#include <asm-generic/$$F>" > $(installdir)/$$F; \
	done; \
@@ -106,7 +96,9 @@ __headersinst: $(subdirs) $(install-file)
	@:

targets += $(install-file)
-$(install-file): scripts/headers_install.sh $(input-files1) $(input-files2) $(input-files3) FORCE
+$(install-file): scripts/headers_install.sh \
+	$(addprefix $(srcdir)/,$(header-files)) \
+	$(addprefix $(gendir)/,$(genhdr-files)) FORCE
	$(if $(unwanted),$(call cmd,remove),)
	$(if $(wildcard $(dir $@)),,$(shell mkdir -p $(dir $@)))
	$(call if_changed,install)
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index 7234e61e7ce370a775ec6981b391b6d102a01770..6dc1eda13b8e841d8cdbdd2b0ec664df84607801 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -280,9 +280,21 @@ DTC ?= $(objtree)/scripts/dtc/dtc

# Disable noisy checks by default
ifeq ($(KBUILD_ENABLE_EXTRA_GCC_CHECKS),)
-DTC_FLAGS += -Wno-unit_address_vs_reg
+DTC_FLAGS += -Wno-unit_address_vs_reg \
+	-Wno-simple_bus_reg \
+	-Wno-unit_address_format \
+	-Wno-pci_bridge \
+	-Wno-pci_device_bus_num \
+	-Wno-pci_device_reg
endif

+ifeq ($(KBUILD_ENABLE_EXTRA_GCC_CHECKS),2)
+DTC_FLAGS += -Wnode_name_chars_strict \
+	-Wproperty_name_chars_strict
+endif
+
+DTC_FLAGS += $(DTC_FLAGS_$(basetarget))
+
# Generate an assembly file to wrap the output of the device tree compiler
quiet_cmd_dt_S_dtb= DTB     $@
cmd_dt_S_dtb=						\
@@ -408,3 +420,34 @@ quiet_cmd_xzmisc = XZMISC  $@
      cmd_xzmisc = (cat $(filter-out FORCE,$^) | \
	xz --check=crc32 --lzma2=dict=1MiB) > $@ || \
	(rm -f $@ ; false)
+
+# ASM offsets
+# ---------------------------------------------------------------------------
+
+# Default sed regexp - multiline due to syntax constraints
+#
+# Use [:space:] because LLVM's integrated assembler inserts <tab> around
+# the .ascii directive whereas GCC keeps the <space> as-is.
+define sed-offsets
+	's:^[[:space:]]*\.ascii[[:space:]]*"\(.*\)".*:\1:; \
+	/^->/{s:->#\(.*\):/* \1 */:; \
+	s:^->\([^ ]*\) [\$$#]*\([^ ]*\) \(.*\):#define \1 \2 /* \3 */:; \
+	s:->::; p;}'
+endef
+
+# Use filechk to avoid rebuilds when a header changes, but the resulting file
+# does not
+define filechk_offsets
+	(set -e; \
+	 echo "#ifndef $2"; \
+	 echo "#define $2"; \
+	 echo "/*"; \
+	 echo " * DO NOT MODIFY."; \
+	 echo " *"; \
+	 echo " * This file was generated by Kbuild"; \
+	 echo " */"; \
+	 echo ""; \
+	 sed -ne $(sed-offsets); \
+	 echo ""; \
+	 echo "#endif" )
+endef
diff --git a/scripts/analyze_suspend.py b/scripts/analyze_suspend.py
deleted file mode 100755
index 20cdb2bc1daec4fc1205c71c38c8d082f0602863..0000000000000000000000000000000000000000
--- a/scripts/analyze_suspend.py
+++ /dev/null
@@ -1,5235 +0,0 @@
-#!/usr/bin/python
-#
-# Tool for analyzing suspend/resume timing
-# Copyright (c) 2013, Intel Corporation.
-#
-# This program is free software; you can redistribute it and/or modify it
-# under the terms and conditions of the GNU General Public License,
-# version 2, as published by the Free Software Foundation.
-#
-# This program is distributed in the hope it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
-# more details.
-#
-# You should have received a copy of the GNU General Public License along with
-# this program; if not, write to the Free Software Foundation, Inc.,
-# 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
-#
-# Authors:
-#	 Todd Brandt <todd.e.brandt@linux.intel.com>
-#
-# Links:
-#	 Home Page
-#	   https://01.org/suspendresume
-#	 Source repo
-#	   https://github.com/01org/suspendresume
-#
-# Description:
-#	 This tool is designed to assist kernel and OS developers in optimizing
-#	 their linux stack's suspend/resume time. Using a kernel image built
-#	 with a few extra options enabled, the tool will execute a suspend and
-#	 will capture dmesg and ftrace data until resume is complete. This data
-#	 is transformed into a device timeline and a callgraph to give a quick
-#	 and detailed view of which devices and callbacks are taking the most
-#	 time in suspend/resume. The output is a single html file which can be
-#	 viewed in firefox or chrome.
-#
-#	 The following kernel build options are required:
-#		 CONFIG_PM_DEBUG=y
-#		 CONFIG_PM_SLEEP_DEBUG=y
-#		 CONFIG_FTRACE=y
-#		 CONFIG_FUNCTION_TRACER=y
-#		 CONFIG_FUNCTION_GRAPH_TRACER=y
-#		 CONFIG_KPROBES=y
-#		 CONFIG_KPROBES_ON_FTRACE=y
-#
-#	 For kernel versions older than 3.15:
-#	 The following additional kernel parameters are required:
-#	 (e.g. in file /etc/default/grub)
-#	 GRUB_CMDLINE_LINUX_DEFAULT="... initcall_debug log_buf_len=16M ..."
-#
-
-# ----------------- LIBRARIES --------------------
-
-import sys
-import time
-import os
-import string
-import re
-import platform
-from datetime import datetime
-import struct
-import ConfigParser
-from threading import Thread
-from subprocess import call, Popen, PIPE
-
-# ----------------- CLASSES --------------------
-
-# Class: SystemValues
-# Description:
-#	 A global, single-instance container used to
-#	 store system values and test parameters
-class SystemValues:
-	ansi = False
-	version = '4.5'
-	verbose = False
-	addlogs = False
-	mindevlen = 0.0
-	mincglen = 0.0
-	cgphase = ''
-	cgtest = -1
-	callloopmaxgap = 0.0001
-	callloopmaxlen = 0.005
-	srgap = 0
-	cgexp = False
-	outdir = ''
-	testdir = '.'
- tpath = '/sys/kernel/debug/tracing/' - fpdtpath = '/sys/firmware/acpi/tables/FPDT' - epath = '/sys/kernel/debug/tracing/events/power/' - traceevents = [ - 'suspend_resume', - 'device_pm_callback_end', - 'device_pm_callback_start' - ] - logmsg = '' - testcommand = '' - mempath = '/dev/mem' - powerfile = '/sys/power/state' - suspendmode = 'mem' - hostname = 'localhost' - prefix = 'test' - teststamp = '' - dmesgstart = 0.0 - dmesgfile = '' - ftracefile = '' - htmlfile = '' - embedded = False - rtcwake = False - rtcwaketime = 10 - rtcpath = '' - devicefilter = [] - stamp = 0 - execcount = 1 - x2delay = 0 - usecallgraph = False - usetraceevents = False - usetraceeventsonly = False - usetracemarkers = True - usekprobes = True - usedevsrc = False - useprocmon = False - notestrun = False - mixedphaseheight = True - devprops = dict() - predelay = 0 - postdelay = 0 - procexecfmt = 'ps - (?P.*)$' - devpropfmt = '# Device Properties: .*' - tracertypefmt = '# tracer: (?P.*)' - firmwarefmt = '# fwsuspend (?P[0-9]*) fwresume (?P[0-9]*)$' - stampfmt = '# suspend-(?P[0-9]{2})(?P[0-9]{2})(?P[0-9]{2})-'+\ - '(?P[0-9]{2})(?P[0-9]{2})(?P[0-9]{2})'+\ - ' (?P.*) (?P.*) (?P.*)$' - tracefuncs = { - 'sys_sync': dict(), - 'pm_prepare_console': dict(), - 'pm_notifier_call_chain': dict(), - 'freeze_processes': dict(), - 'freeze_kernel_threads': dict(), - 'pm_restrict_gfp_mask': dict(), - 'acpi_suspend_begin': dict(), - 'suspend_console': dict(), - 'acpi_pm_prepare': dict(), - 'syscore_suspend': dict(), - 'arch_enable_nonboot_cpus_end': dict(), - 'syscore_resume': dict(), - 'acpi_pm_finish': dict(), - 'resume_console': dict(), - 'acpi_pm_end': dict(), - 'pm_restore_gfp_mask': dict(), - 'thaw_processes': dict(), - 'pm_restore_console': dict(), - 'CPU_OFF': { - 'func':'_cpu_down', - 'args_x86_64': {'cpu':'%di:s32'}, - 'format': 'CPU_OFF[{cpu}]' - }, - 'CPU_ON': { - 'func':'_cpu_up', - 'args_x86_64': {'cpu':'%di:s32'}, - 'format': 'CPU_ON[{cpu}]' - }, - } - dev_tracefuncs = { - # general wait/delay/sleep - 'msleep': { 'args_x86_64': {'time':'%di:s32'}, 'ub': 1 }, - 'schedule_timeout_uninterruptible': { 'args_x86_64': {'timeout':'%di:s32'}, 'ub': 1 }, - 'schedule_timeout': { 'args_x86_64': {'timeout':'%di:s32'}, 'ub': 1 }, - 'udelay': { 'func':'__const_udelay', 'args_x86_64': {'loops':'%di:s32'}, 'ub': 1 }, - 'usleep_range': { 'args_x86_64': {'min':'%di:s32', 'max':'%si:s32'}, 'ub': 1 }, - 'mutex_lock_slowpath': { 'func':'__mutex_lock_slowpath', 'ub': 1 }, - 'acpi_os_stall': {'ub': 1}, - # ACPI - 'acpi_resume_power_resources': dict(), - 'acpi_ps_parse_aml': dict(), - # filesystem - 'ext4_sync_fs': dict(), - # 80211 - 'iwlagn_mac_start': dict(), - 'iwlagn_alloc_bcast_station': dict(), - 'iwl_trans_pcie_start_hw': dict(), - 'iwl_trans_pcie_start_fw': dict(), - 'iwl_run_init_ucode': dict(), - 'iwl_load_ucode_wait_alive': dict(), - 'iwl_alive_start': dict(), - 'iwlagn_mac_stop': dict(), - 'iwlagn_mac_suspend': dict(), - 'iwlagn_mac_resume': dict(), - 'iwlagn_mac_add_interface': dict(), - 'iwlagn_mac_remove_interface': dict(), - 'iwlagn_mac_change_interface': dict(), - 'iwlagn_mac_config': dict(), - 'iwlagn_configure_filter': dict(), - 'iwlagn_mac_hw_scan': dict(), - 'iwlagn_bss_info_changed': dict(), - 'iwlagn_mac_channel_switch': dict(), - 'iwlagn_mac_flush': dict(), - # ATA - 'ata_eh_recover': { 'args_x86_64': {'port':'+36(%di):s32'} }, - # i915 - 'i915_gem_resume': dict(), - 'i915_restore_state': dict(), - 'intel_opregion_setup': dict(), - 'g4x_pre_enable_dp': dict(), - 'vlv_pre_enable_dp': dict(), - 'chv_pre_enable_dp': 
dict(), - 'g4x_enable_dp': dict(), - 'vlv_enable_dp': dict(), - 'intel_hpd_init': dict(), - 'intel_opregion_register': dict(), - 'intel_dp_detect': dict(), - 'intel_hdmi_detect': dict(), - 'intel_opregion_init': dict(), - 'intel_fbdev_set_suspend': dict(), - } - kprobes = dict() - timeformat = '%.3f' - def __init__(self): - # if this is a phoronix test run, set some default options - if('LOG_FILE' in os.environ and 'TEST_RESULTS_IDENTIFIER' in os.environ): - self.embedded = True - self.addlogs = True - self.htmlfile = os.environ['LOG_FILE'] - self.archargs = 'args_'+platform.machine() - self.hostname = platform.node() - if(self.hostname == ''): - self.hostname = 'localhost' - rtc = "rtc0" - if os.path.exists('/dev/rtc'): - rtc = os.readlink('/dev/rtc') - rtc = '/sys/class/rtc/'+rtc - if os.path.exists(rtc) and os.path.exists(rtc+'/date') and \ - os.path.exists(rtc+'/time') and os.path.exists(rtc+'/wakealarm'): - self.rtcpath = rtc - if (hasattr(sys.stdout, 'isatty') and sys.stdout.isatty()): - self.ansi = True - def setPrecision(self, num): - if num < 0 or num > 6: - return - self.timeformat = '%.{0}f'.format(num) - def setOutputFolder(self, value): - args = dict() - n = datetime.now() - args['date'] = n.strftime('%y%m%d') - args['time'] = n.strftime('%H%M%S') - args['hostname'] = self.hostname - self.outdir = value.format(**args) - def setOutputFile(self): - if((self.htmlfile == '') and (self.dmesgfile != '')): - m = re.match('(?P.*)_dmesg\.txt$', self.dmesgfile) - if(m): - self.htmlfile = m.group('name')+'.html' - if((self.htmlfile == '') and (self.ftracefile != '')): - m = re.match('(?P.*)_ftrace\.txt$', self.ftracefile) - if(m): - self.htmlfile = m.group('name')+'.html' - if(self.htmlfile == ''): - self.htmlfile = 'output.html' - def initTestOutput(self, subdir, testpath=''): - self.prefix = self.hostname - v = open('/proc/version', 'r').read().strip() - kver = string.split(v)[2] - n = datetime.now() - testtime = n.strftime('suspend-%m%d%y-%H%M%S') - if not testpath: - testpath = n.strftime('suspend-%y%m%d-%H%M%S') - if(subdir != "."): - self.testdir = subdir+"/"+testpath - else: - self.testdir = testpath - self.teststamp = \ - '# '+testtime+' '+self.prefix+' '+self.suspendmode+' '+kver - if(self.embedded): - self.dmesgfile = \ - '/tmp/'+testtime+'_'+self.suspendmode+'_dmesg.txt' - self.ftracefile = \ - '/tmp/'+testtime+'_'+self.suspendmode+'_ftrace.txt' - return - self.dmesgfile = \ - self.testdir+'/'+self.prefix+'_'+self.suspendmode+'_dmesg.txt' - self.ftracefile = \ - self.testdir+'/'+self.prefix+'_'+self.suspendmode+'_ftrace.txt' - self.htmlfile = \ - self.testdir+'/'+self.prefix+'_'+self.suspendmode+'.html' - if not os.path.isdir(self.testdir): - os.mkdir(self.testdir) - def setDeviceFilter(self, value): - self.devicefilter = [] - if value: - value = value.split(',') - for i in value: - self.devicefilter.append(i.strip()) - def rtcWakeAlarmOn(self): - call('echo 0 > '+self.rtcpath+'/wakealarm', shell=True) - outD = open(self.rtcpath+'/date', 'r').read().strip() - outT = open(self.rtcpath+'/time', 'r').read().strip() - mD = re.match('^(?P[0-9]*)-(?P[0-9]*)-(?P[0-9]*)', outD) - mT = re.match('^(?P[0-9]*):(?P[0-9]*):(?P[0-9]*)', outT) - if(mD and mT): - # get the current time from hardware - utcoffset = int((datetime.now() - datetime.utcnow()).total_seconds()) - dt = datetime(\ - int(mD.group('y')), int(mD.group('m')), int(mD.group('d')), - int(mT.group('h')), int(mT.group('m')), int(mT.group('s'))) - nowtime = int(dt.strftime('%s')) + utcoffset - else: - # if hardware time fails, 
use the software time - nowtime = int(datetime.now().strftime('%s')) - alarm = nowtime + self.rtcwaketime - call('echo %d > %s/wakealarm' % (alarm, self.rtcpath), shell=True) - def rtcWakeAlarmOff(self): - call('echo 0 > %s/wakealarm' % self.rtcpath, shell=True) - def initdmesg(self): - # get the latest time stamp from the dmesg log - fp = Popen('dmesg', stdout=PIPE).stdout - ktime = '0' - for line in fp: - line = line.replace('\r\n', '') - idx = line.find('[') - if idx > 1: - line = line[idx:] - m = re.match('[ \t]*(\[ *)(?P[0-9\.]*)(\]) (?P.*)', line) - if(m): - ktime = m.group('ktime') - fp.close() - self.dmesgstart = float(ktime) - def getdmesg(self): - # store all new dmesg lines since initdmesg was called - fp = Popen('dmesg', stdout=PIPE).stdout - op = open(self.dmesgfile, 'a') - for line in fp: - line = line.replace('\r\n', '') - idx = line.find('[') - if idx > 1: - line = line[idx:] - m = re.match('[ \t]*(\[ *)(?P[0-9\.]*)(\]) (?P.*)', line) - if(not m): - continue - ktime = float(m.group('ktime')) - if ktime > self.dmesgstart: - op.write(line) - fp.close() - op.close() - def addFtraceFilterFunctions(self, file): - fp = open(file) - list = fp.read().split('\n') - fp.close() - for i in list: - if len(i) < 2: - continue - self.tracefuncs[i] = dict() - def getFtraceFilterFunctions(self, current): - rootCheck(True) - if not current: - call('cat '+self.tpath+'available_filter_functions', shell=True) - return - fp = open(self.tpath+'available_filter_functions') - master = fp.read().split('\n') - fp.close() - for i in self.tracefuncs: - if 'func' in self.tracefuncs[i]: - i = self.tracefuncs[i]['func'] - if i in master: - print i - else: - print self.colorText(i) - def setFtraceFilterFunctions(self, list): - fp = open(self.tpath+'available_filter_functions') - master = fp.read().split('\n') - fp.close() - flist = '' - for i in list: - if i not in master: - continue - if ' [' in i: - flist += i.split(' ')[0]+'\n' - else: - flist += i+'\n' - fp = open(self.tpath+'set_graph_function', 'w') - fp.write(flist) - fp.close() - def basicKprobe(self, name): - self.kprobes[name] = {'name': name,'func': name,'args': dict(),'format': name} - def defaultKprobe(self, name, kdata): - k = kdata - for field in ['name', 'format', 'func']: - if field not in k: - k[field] = name - if self.archargs in k: - k['args'] = k[self.archargs] - else: - k['args'] = dict() - k['format'] = name - self.kprobes[name] = k - def kprobeColor(self, name): - if name not in self.kprobes or 'color' not in self.kprobes[name]: - return '' - return self.kprobes[name]['color'] - def kprobeDisplayName(self, name, dataraw): - if name not in self.kprobes: - self.basicKprobe(name) - data = '' - quote=0 - # first remvoe any spaces inside quotes, and the quotes - for c in dataraw: - if c == '"': - quote = (quote + 1) % 2 - if quote and c == ' ': - data += '_' - elif c != '"': - data += c - fmt, args = self.kprobes[name]['format'], self.kprobes[name]['args'] - arglist = dict() - # now process the args - for arg in sorted(args): - arglist[arg] = '' - m = re.match('.* '+arg+'=(?P.*) ', data); - if m: - arglist[arg] = m.group('arg') - else: - m = re.match('.* '+arg+'=(?P.*)', data); - if m: - arglist[arg] = m.group('arg') - out = fmt.format(**arglist) - out = out.replace(' ', '_').replace('"', '') - return out - def kprobeText(self, kname, kprobe): - name = fmt = func = kname - args = dict() - if 'name' in kprobe: - name = kprobe['name'] - if 'format' in kprobe: - fmt = kprobe['format'] - if 'func' in kprobe: - func = kprobe['func'] - if 
self.archargs in kprobe: - args = kprobe[self.archargs] - if 'args' in kprobe: - args = kprobe['args'] - if re.findall('{(?P[a-z,A-Z,0-9]*)}', func): - doError('Kprobe "%s" has format info in the function name "%s"' % (name, func)) - for arg in re.findall('{(?P[a-z,A-Z,0-9]*)}', fmt): - if arg not in args: - doError('Kprobe "%s" is missing argument "%s"' % (name, arg)) - val = 'p:%s_cal %s' % (name, func) - for i in sorted(args): - val += ' %s=%s' % (i, args[i]) - val += '\nr:%s_ret %s $retval\n' % (name, func) - return val - def addKprobes(self, output=False): - if len(sysvals.kprobes) < 1: - return - if output: - print(' kprobe functions in this kernel:') - # first test each kprobe - rejects = [] - # sort kprobes: trace, ub-dev, custom, dev - kpl = [[], [], [], []] - for name in sorted(self.kprobes): - res = self.colorText('YES', 32) - if not self.testKprobe(name, self.kprobes[name]): - res = self.colorText('NO') - rejects.append(name) - else: - if name in self.tracefuncs: - kpl[0].append(name) - elif name in self.dev_tracefuncs: - if 'ub' in self.dev_tracefuncs[name]: - kpl[1].append(name) - else: - kpl[3].append(name) - else: - kpl[2].append(name) - if output: - print(' %s: %s' % (name, res)) - kplist = kpl[0] + kpl[1] + kpl[2] + kpl[3] - # remove all failed ones from the list - for name in rejects: - self.kprobes.pop(name) - # set the kprobes all at once - self.fsetVal('', 'kprobe_events') - kprobeevents = '' - for kp in kplist: - kprobeevents += self.kprobeText(kp, self.kprobes[kp]) - self.fsetVal(kprobeevents, 'kprobe_events') - # verify that the kprobes were set as ordered - check = self.fgetVal('kprobe_events') - linesout = len(kprobeevents.split('\n')) - 1 - linesack = len(check.split('\n')) - 1 - if output: - res = '%d/%d' % (linesack, linesout) - if linesack < linesout: - res = self.colorText(res, 31) - else: - res = self.colorText(res, 32) - print(' working kprobe functions enabled: %s' % res) - self.fsetVal('1', 'events/kprobes/enable') - def testKprobe(self, kname, kprobe): - self.fsetVal('0', 'events/kprobes/enable') - kprobeevents = self.kprobeText(kname, kprobe) - if not kprobeevents: - return False - try: - self.fsetVal(kprobeevents, 'kprobe_events') - check = self.fgetVal('kprobe_events') - except: - return False - linesout = len(kprobeevents.split('\n')) - linesack = len(check.split('\n')) - if linesack < linesout: - return False - return True - def fsetVal(self, val, path, mode='w'): - file = self.tpath+path - if not os.path.exists(file): - return False - try: - fp = open(file, mode, 0) - fp.write(val) - fp.flush() - fp.close() - except: - pass - return True - def fgetVal(self, path): - file = self.tpath+path - res = '' - if not os.path.exists(file): - return res - try: - fp = open(file, 'r') - res = fp.read() - fp.close() - except: - pass - return res - def cleanupFtrace(self): - if(self.usecallgraph or self.usetraceevents): - self.fsetVal('0', 'events/kprobes/enable') - self.fsetVal('', 'kprobe_events') - def setupAllKprobes(self): - for name in self.tracefuncs: - self.defaultKprobe(name, self.tracefuncs[name]) - for name in self.dev_tracefuncs: - self.defaultKprobe(name, self.dev_tracefuncs[name]) - def isCallgraphFunc(self, name): - if len(self.tracefuncs) < 1 and self.suspendmode == 'command': - return True - for i in self.tracefuncs: - if 'func' in self.tracefuncs[i]: - f = self.tracefuncs[i]['func'] - else: - f = i - if name == f: - return True - return False - def initFtrace(self, testing=False): - print('INITIALIZING FTRACE...') - # turn trace off - 
self.fsetVal('0', 'tracing_on') - self.cleanupFtrace() - # set the trace clock to global - self.fsetVal('global', 'trace_clock') - # set trace buffer to a huge value - self.fsetVal('nop', 'current_tracer') - self.fsetVal('100000', 'buffer_size_kb') - # go no further if this is just a status check - if testing: - return - # initialize the callgraph trace - if(self.usecallgraph): - # set trace type - self.fsetVal('function_graph', 'current_tracer') - self.fsetVal('', 'set_ftrace_filter') - # set trace format options - self.fsetVal('print-parent', 'trace_options') - self.fsetVal('funcgraph-abstime', 'trace_options') - self.fsetVal('funcgraph-cpu', 'trace_options') - self.fsetVal('funcgraph-duration', 'trace_options') - self.fsetVal('funcgraph-proc', 'trace_options') - self.fsetVal('funcgraph-tail', 'trace_options') - self.fsetVal('nofuncgraph-overhead', 'trace_options') - self.fsetVal('context-info', 'trace_options') - self.fsetVal('graph-time', 'trace_options') - self.fsetVal('0', 'max_graph_depth') - cf = ['dpm_run_callback'] - if(self.usetraceeventsonly): - cf += ['dpm_prepare', 'dpm_complete'] - for fn in self.tracefuncs: - if 'func' in self.tracefuncs[fn]: - cf.append(self.tracefuncs[fn]['func']) - else: - cf.append(fn) - self.setFtraceFilterFunctions(cf) - # initialize the kprobe trace - elif self.usekprobes: - for name in self.tracefuncs: - self.defaultKprobe(name, self.tracefuncs[name]) - if self.usedevsrc: - for name in self.dev_tracefuncs: - self.defaultKprobe(name, self.dev_tracefuncs[name]) - print('INITIALIZING KPROBES...') - self.addKprobes(self.verbose) - if(self.usetraceevents): - # turn trace events on - events = iter(self.traceevents) - for e in events: - self.fsetVal('1', 'events/power/'+e+'/enable') - # clear the trace buffer - self.fsetVal('', 'trace') - def verifyFtrace(self): - # files needed for any trace data - files = ['buffer_size_kb', 'current_tracer', 'trace', 'trace_clock', - 'trace_marker', 'trace_options', 'tracing_on'] - # files needed for callgraph trace data - tp = self.tpath - if(self.usecallgraph): - files += [ - 'available_filter_functions', - 'set_ftrace_filter', - 'set_graph_function' - ] - for f in files: - if(os.path.exists(tp+f) == False): - return False - return True - def verifyKprobes(self): - # files needed for kprobes to work - files = ['kprobe_events', 'events'] - tp = self.tpath - for f in files: - if(os.path.exists(tp+f) == False): - return False - return True - def colorText(self, str, color=31): - if not self.ansi: - return str - return '\x1B[%d;40m%s\x1B[m' % (color, str) - -sysvals = SystemValues() - -# Class: DevProps -# Description: -# Simple class which holds property values collected -# for all the devices used in the timeline. -class DevProps: - syspath = '' - altname = '' - async = True - xtraclass = '' - xtrainfo = '' - def out(self, dev): - return '%s,%s,%d;' % (dev, self.altname, self.async) - def debug(self, dev): - print '%s:\n\taltname = %s\n\t async = %s' % (dev, self.altname, self.async) - def altName(self, dev): - if not self.altname or self.altname == dev: - return dev - return '%s [%s]' % (self.altname, dev) - def xtraClass(self): - if self.xtraclass: - return ' '+self.xtraclass - if not self.async: - return ' sync' - return '' - def xtraInfo(self): - if self.xtraclass: - return ' '+self.xtraclass - if self.async: - return ' async_device' - return ' sync_device' - -# Class: DeviceNode -# Description: -# A container used to create a device hierachy, with a single root node -# and a tree of child nodes. 
Used by Data.deviceTopology() -class DeviceNode: - name = '' - children = 0 - depth = 0 - def __init__(self, nodename, nodedepth): - self.name = nodename - self.children = [] - self.depth = nodedepth - -# Class: Data -# Description: -# The primary container for suspend/resume test data. There is one for -# each test run. The data is organized into a cronological hierarchy: -# Data.dmesg { -# phases { -# 10 sequential, non-overlapping phases of S/R -# contents: times for phase start/end, order/color data for html -# devlist { -# device callback or action list for this phase -# device { -# a single device callback or generic action -# contents: start/stop times, pid/cpu/driver info -# parents/children, html id for timeline/callgraph -# optionally includes an ftrace callgraph -# optionally includes dev/ps data -# } -# } -# } -# } -# -class Data: - dmesg = {} # root data structure - phases = [] # ordered list of phases - start = 0.0 # test start - end = 0.0 # test end - tSuspended = 0.0 # low-level suspend start - tResumed = 0.0 # low-level resume start - tKernSus = 0.0 # kernel level suspend start - tKernRes = 0.0 # kernel level resume end - tLow = 0.0 # time spent in low-level suspend (standby/freeze) - fwValid = False # is firmware data available - fwSuspend = 0 # time spent in firmware suspend - fwResume = 0 # time spent in firmware resume - dmesgtext = [] # dmesg text file in memory - pstl = 0 # process timeline - testnumber = 0 - idstr = '' - html_device_id = 0 - stamp = 0 - outfile = '' - devpids = [] - kerror = False - def __init__(self, num): - idchar = 'abcdefghij' - self.pstl = dict() - self.testnumber = num - self.idstr = idchar[num] - self.dmesgtext = [] - self.phases = [] - self.dmesg = { # fixed list of 10 phases - 'suspend_prepare': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#CCFFCC', 'order': 0}, - 'suspend': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#88FF88', 'order': 1}, - 'suspend_late': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#00AA00', 'order': 2}, - 'suspend_noirq': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#008888', 'order': 3}, - 'suspend_machine': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#0000FF', 'order': 4}, - 'resume_machine': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FF0000', 'order': 5}, - 'resume_noirq': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FF9900', 'order': 6}, - 'resume_early': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FFCC00', 'order': 7}, - 'resume': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FFFF88', 'order': 8}, - 'resume_complete': {'list': dict(), 'start': -1.0, 'end': -1.0, - 'row': 0, 'color': '#FFFFCC', 'order': 9} - } - self.phases = self.sortedPhases() - self.devicegroups = [] - for phase in self.phases: - self.devicegroups.append([phase]) - self.errorinfo = {'suspend':[],'resume':[]} - def extractErrorInfo(self, dmesg): - error = '' - tm = 0.0 - for i in range(len(dmesg)): - if 'Call Trace:' in dmesg[i]: - m = re.match('[ \t]*(\[ *)(?P[0-9\.]*)(\]) .*', dmesg[i]) - if not m: - continue - tm = float(m.group('ktime')) - if tm < self.start or tm > self.end: - continue - for j in range(i-10, i+1): - error += dmesg[j] - continue - if error: - m = re.match('[ \t]*\[ *[0-9\.]*\] \[\<[0-9a-fA-F]*\>\] .*', dmesg[i]) - if m: - error += dmesg[i] - else: - if tm < self.tSuspended: - dir = 'suspend' - else: - dir = 'resume' - error 
= error.replace('<', '<').replace('>', '>') - vprint('kernel error found in %s at %f' % (dir, tm)) - self.errorinfo[dir].append((tm, error)) - self.kerror = True - error = '' - def setStart(self, time): - self.start = time - def setEnd(self, time): - self.end = time - def isTraceEventOutsideDeviceCalls(self, pid, time): - for phase in self.phases: - list = self.dmesg[phase]['list'] - for dev in list: - d = list[dev] - if(d['pid'] == pid and time >= d['start'] and - time < d['end']): - return False - return True - def sourcePhase(self, start): - for phase in self.phases: - pend = self.dmesg[phase]['end'] - if start <= pend: - return phase - return 'resume_complete' - def sourceDevice(self, phaselist, start, end, pid, type): - tgtdev = '' - for phase in phaselist: - list = self.dmesg[phase]['list'] - for devname in list: - dev = list[devname] - # pid must match - if dev['pid'] != pid: - continue - devS = dev['start'] - devE = dev['end'] - if type == 'device': - # device target event is entirely inside the source boundary - if(start < devS or start >= devE or end <= devS or end > devE): - continue - elif type == 'thread': - # thread target event will expand the source boundary - if start < devS: - dev['start'] = start - if end > devE: - dev['end'] = end - tgtdev = dev - break - return tgtdev - def addDeviceFunctionCall(self, displayname, kprobename, proc, pid, start, end, cdata, rdata): - # try to place the call in a device - tgtdev = self.sourceDevice(self.phases, start, end, pid, 'device') - # calls with device pids that occur outside device bounds are dropped - # TODO: include these somehow - if not tgtdev and pid in self.devpids: - return False - # try to place the call in a thread - if not tgtdev: - tgtdev = self.sourceDevice(self.phases, start, end, pid, 'thread') - # create new thread blocks, expand as new calls are found - if not tgtdev: - if proc == '<...>': - threadname = 'kthread-%d' % (pid) - else: - threadname = '%s-%d' % (proc, pid) - tgtphase = self.sourcePhase(start) - self.newAction(tgtphase, threadname, pid, '', start, end, '', ' kth', '') - return self.addDeviceFunctionCall(displayname, kprobename, proc, pid, start, end, cdata, rdata) - # this should not happen - if not tgtdev: - vprint('[%f - %f] %s-%d %s %s %s' % \ - (start, end, proc, pid, kprobename, cdata, rdata)) - return False - # place the call data inside the src element of the tgtdev - if('src' not in tgtdev): - tgtdev['src'] = [] - dtf = sysvals.dev_tracefuncs - ubiquitous = False - if kprobename in dtf and 'ub' in dtf[kprobename]: - ubiquitous = True - title = cdata+' '+rdata - mstr = '\(.*\) *(?P.*) *\((?P.*)\+.* arg1=(?P.*)' - m = re.match(mstr, title) - if m: - c = m.group('caller') - a = m.group('args').strip() - r = m.group('ret') - if len(r) > 6: - r = '' - else: - r = 'ret=%s ' % r - if ubiquitous and c in dtf and 'ub' in dtf[c]: - return False - color = sysvals.kprobeColor(kprobename) - e = DevFunction(displayname, a, c, r, start, end, ubiquitous, proc, pid, color) - tgtdev['src'].append(e) - return True - def overflowDevices(self): - # get a list of devices that extend beyond the end of this test run - devlist = [] - for phase in self.phases: - list = self.dmesg[phase]['list'] - for devname in list: - dev = list[devname] - if dev['end'] > self.end: - devlist.append(dev) - return devlist - def mergeOverlapDevices(self, devlist): - # merge any devices that overlap devlist - for dev in devlist: - devname = dev['name'] - for phase in self.phases: - list = self.dmesg[phase]['list'] - if devname not in list: 
- continue - tdev = list[devname] - o = min(dev['end'], tdev['end']) - max(dev['start'], tdev['start']) - if o <= 0: - continue - dev['end'] = tdev['end'] - if 'src' not in dev or 'src' not in tdev: - continue - dev['src'] += tdev['src'] - del list[devname] - def usurpTouchingThread(self, name, dev): - # the caller test has priority of this thread, give it to him - for phase in self.phases: - list = self.dmesg[phase]['list'] - if name in list: - tdev = list[name] - if tdev['start'] - dev['end'] < 0.1: - dev['end'] = tdev['end'] - if 'src' not in dev: - dev['src'] = [] - if 'src' in tdev: - dev['src'] += tdev['src'] - del list[name] - break - def stitchTouchingThreads(self, testlist): - # merge any threads between tests that touch - for phase in self.phases: - list = self.dmesg[phase]['list'] - for devname in list: - dev = list[devname] - if 'htmlclass' not in dev or 'kth' not in dev['htmlclass']: - continue - for data in testlist: - data.usurpTouchingThread(devname, dev) - def optimizeDevSrc(self): - # merge any src call loops to reduce timeline size - for phase in self.phases: - list = self.dmesg[phase]['list'] - for dev in list: - if 'src' not in list[dev]: - continue - src = list[dev]['src'] - p = 0 - for e in sorted(src, key=lambda event: event.time): - if not p or not e.repeat(p): - p = e - continue - # e is another iteration of p, move it into p - p.end = e.end - p.length = p.end - p.time - p.count += 1 - src.remove(e) - def trimTimeVal(self, t, t0, dT, left): - if left: - if(t > t0): - if(t - dT < t0): - return t0 - return t - dT - else: - return t - else: - if(t < t0 + dT): - if(t > t0): - return t0 + dT - return t + dT - else: - return t - def trimTime(self, t0, dT, left): - self.tSuspended = self.trimTimeVal(self.tSuspended, t0, dT, left) - self.tResumed = self.trimTimeVal(self.tResumed, t0, dT, left) - self.start = self.trimTimeVal(self.start, t0, dT, left) - self.tKernSus = self.trimTimeVal(self.tKernSus, t0, dT, left) - self.tKernRes = self.trimTimeVal(self.tKernRes, t0, dT, left) - self.end = self.trimTimeVal(self.end, t0, dT, left) - for phase in self.phases: - p = self.dmesg[phase] - p['start'] = self.trimTimeVal(p['start'], t0, dT, left) - p['end'] = self.trimTimeVal(p['end'], t0, dT, left) - list = p['list'] - for name in list: - d = list[name] - d['start'] = self.trimTimeVal(d['start'], t0, dT, left) - d['end'] = self.trimTimeVal(d['end'], t0, dT, left) - if('ftrace' in d): - cg = d['ftrace'] - cg.start = self.trimTimeVal(cg.start, t0, dT, left) - cg.end = self.trimTimeVal(cg.end, t0, dT, left) - for line in cg.list: - line.time = self.trimTimeVal(line.time, t0, dT, left) - if('src' in d): - for e in d['src']: - e.time = self.trimTimeVal(e.time, t0, dT, left) - def normalizeTime(self, tZero): - # trim out any standby or freeze clock time - if(self.tSuspended != self.tResumed): - if(self.tResumed > tZero): - self.trimTime(self.tSuspended, \ - self.tResumed-self.tSuspended, True) - else: - self.trimTime(self.tSuspended, \ - self.tResumed-self.tSuspended, False) - def setPhase(self, phase, ktime, isbegin): - if(isbegin): - self.dmesg[phase]['start'] = ktime - else: - self.dmesg[phase]['end'] = ktime - def dmesgSortVal(self, phase): - return self.dmesg[phase]['order'] - def sortedPhases(self): - return sorted(self.dmesg, key=self.dmesgSortVal) - def sortedDevices(self, phase): - list = self.dmesg[phase]['list'] - slist = [] - tmp = dict() - for devname in list: - dev = list[devname] - tmp[dev['start']] = devname - for t in sorted(tmp): - slist.append(tmp[t]) - return slist - 
def fixupInitcalls(self, phase): - # if any calls never returned, clip them at system resume end - phaselist = self.dmesg[phase]['list'] - for devname in phaselist: - dev = phaselist[devname] - if(dev['end'] < 0): - for p in self.phases: - if self.dmesg[p]['end'] > dev['start']: - dev['end'] = self.dmesg[p]['end'] - break - vprint('%s (%s): callback didnt return' % (devname, phase)) - def deviceFilter(self, devicefilter): - for phase in self.phases: - list = self.dmesg[phase]['list'] - rmlist = [] - for name in list: - keep = False - for filter in devicefilter: - if filter in name or \ - ('drv' in list[name] and filter in list[name]['drv']): - keep = True - if not keep: - rmlist.append(name) - for name in rmlist: - del list[name] - def fixupInitcallsThatDidntReturn(self): - # if any calls never returned, clip them at system resume end - for phase in self.phases: - self.fixupInitcalls(phase) - def phaseOverlap(self, phases): - rmgroups = [] - newgroup = [] - for group in self.devicegroups: - for phase in phases: - if phase not in group: - continue - for p in group: - if p not in newgroup: - newgroup.append(p) - if group not in rmgroups: - rmgroups.append(group) - for group in rmgroups: - self.devicegroups.remove(group) - self.devicegroups.append(newgroup) - def newActionGlobal(self, name, start, end, pid=-1, color=''): - # which phase is this device callback or action in - targetphase = 'none' - htmlclass = '' - overlap = 0.0 - phases = [] - for phase in self.phases: - pstart = self.dmesg[phase]['start'] - pend = self.dmesg[phase]['end'] - # see if the action overlaps this phase - o = max(0, min(end, pend) - max(start, pstart)) - if o > 0: - phases.append(phase) - # set the target phase to the one that overlaps most - if o > overlap: - if overlap > 0 and phase == 'post_resume': - continue - targetphase = phase - overlap = o - # if no target phase was found, pin it to the edge - if targetphase == 'none': - p0start = self.dmesg[self.phases[0]]['start'] - if start <= p0start: - targetphase = self.phases[0] - else: - targetphase = self.phases[-1] - if pid == -2: - htmlclass = ' bg' - elif pid == -3: - htmlclass = ' ps' - if len(phases) > 1: - htmlclass = ' bg' - self.phaseOverlap(phases) - if targetphase in self.phases: - newname = self.newAction(targetphase, name, pid, '', start, end, '', htmlclass, color) - return (targetphase, newname) - return False - def newAction(self, phase, name, pid, parent, start, end, drv, htmlclass='', color=''): - # new device callback for a specific phase - self.html_device_id += 1 - devid = '%s%d' % (self.idstr, self.html_device_id) - list = self.dmesg[phase]['list'] - length = -1.0 - if(start >= 0 and end >= 0): - length = end - start - if pid == -2: - i = 2 - origname = name - while(name in list): - name = '%s[%d]' % (origname, i) - i += 1 - list[name] = {'name': name, 'start': start, 'end': end, 'pid': pid, - 'par': parent, 'length': length, 'row': 0, 'id': devid, 'drv': drv } - if htmlclass: - list[name]['htmlclass'] = htmlclass - if color: - list[name]['color'] = color - return name - def deviceChildren(self, devname, phase): - devlist = [] - list = self.dmesg[phase]['list'] - for child in list: - if(list[child]['par'] == devname): - devlist.append(child) - return devlist - def printDetails(self): - vprint('Timeline Details:') - vprint(' test start: %f' % self.start) - vprint('kernel suspend start: %f' % self.tKernSus) - for phase in self.phases: - dc = len(self.dmesg[phase]['list']) - vprint(' %16s: %f - %f (%d devices)' % (phase, \ - 
self.dmesg[phase]['start'], self.dmesg[phase]['end'], dc)) - vprint(' kernel resume end: %f' % self.tKernRes) - vprint(' test end: %f' % self.end) - def deviceChildrenAllPhases(self, devname): - devlist = [] - for phase in self.phases: - list = self.deviceChildren(devname, phase) - for dev in list: - if dev not in devlist: - devlist.append(dev) - return devlist - def masterTopology(self, name, list, depth): - node = DeviceNode(name, depth) - for cname in list: - # avoid recursions - if name == cname: - continue - clist = self.deviceChildrenAllPhases(cname) - cnode = self.masterTopology(cname, clist, depth+1) - node.children.append(cnode) - return node - def printTopology(self, node): - html = '' - if node.name: - info = '' - drv = '' - for phase in self.phases: - list = self.dmesg[phase]['list'] - if node.name in list: - s = list[node.name]['start'] - e = list[node.name]['end'] - if list[node.name]['drv']: - drv = ' {'+list[node.name]['drv']+'}' - info += ('
<li>%s: %.3fms</li>' % (phase, (e-s)*1000)) - html += '<li><b>'+node.name+drv+'</b>' - if info: - html += '<ul>'+info+'</ul>' - html += '</li>' - if len(node.children) > 0: - html += '<ul>' - for cnode in node.children: - html += self.printTopology(cnode) - html += '</ul>
    ' - return html - def rootDeviceList(self): - # list of devices graphed - real = [] - for phase in self.dmesg: - list = self.dmesg[phase]['list'] - for dev in list: - if list[dev]['pid'] >= 0 and dev not in real: - real.append(dev) - # list of top-most root devices - rootlist = [] - for phase in self.dmesg: - list = self.dmesg[phase]['list'] - for dev in list: - pdev = list[dev]['par'] - pid = list[dev]['pid'] - if(pid < 0 or re.match('[0-9]*-[0-9]*\.[0-9]*[\.0-9]*\:[\.0-9]*$', pdev)): - continue - if pdev and pdev not in real and pdev not in rootlist: - rootlist.append(pdev) - return rootlist - def deviceTopology(self): - rootlist = self.rootDeviceList() - master = self.masterTopology('', rootlist, 0) - return self.printTopology(master) - def selectTimelineDevices(self, widfmt, tTotal, mindevlen): - # only select devices that will actually show up in html - self.tdevlist = dict() - for phase in self.dmesg: - devlist = [] - list = self.dmesg[phase]['list'] - for dev in list: - length = (list[dev]['end'] - list[dev]['start']) * 1000 - width = widfmt % (((list[dev]['end']-list[dev]['start'])*100)/tTotal) - if width != '0.000000' and length >= mindevlen: - devlist.append(dev) - self.tdevlist[phase] = devlist - def addHorizontalDivider(self, devname, devend): - phase = 'suspend_prepare' - self.newAction(phase, devname, -2, '', \ - self.start, devend, '', ' sec', '') - if phase not in self.tdevlist: - self.tdevlist[phase] = [] - self.tdevlist[phase].append(devname) - d = DevItem(0, phase, self.dmesg[phase]['list'][devname]) - return d - def addProcessUsageEvent(self, name, times): - # get the start and end times for this process - maxC = 0 - tlast = 0 - start = -1 - end = -1 - for t in sorted(times): - if tlast == 0: - tlast = t - continue - if name in self.pstl[t]: - if start == -1 or tlast < start: - start = tlast - if end == -1 or t > end: - end = t - tlast = t - if start == -1 or end == -1: - return 0 - # add a new action for this process and get the object - out = self.newActionGlobal(name, start, end, -3) - if not out: - return 0 - phase, devname = out - dev = self.dmesg[phase]['list'][devname] - # get the cpu exec data - tlast = 0 - clast = 0 - cpuexec = dict() - for t in sorted(times): - if tlast == 0 or t <= start or t > end: - tlast = t - continue - list = self.pstl[t] - c = 0 - if name in list: - c = list[name] - if c > maxC: - maxC = c - if c != clast: - key = (tlast, t) - cpuexec[key] = c - tlast = t - clast = c - dev['cpuexec'] = cpuexec - return maxC - def createProcessUsageEvents(self): - # get an array of process names - proclist = [] - for t in self.pstl: - pslist = self.pstl[t] - for ps in pslist: - if ps not in proclist: - proclist.append(ps) - # get a list of data points for suspend and resume - tsus = [] - tres = [] - for t in sorted(self.pstl): - if t < self.tSuspended: - tsus.append(t) - else: - tres.append(t) - # process the events for suspend and resume - if len(proclist) > 0: - vprint('Process Execution:') - for ps in proclist: - c = self.addProcessUsageEvent(ps, tsus) - if c > 0: - vprint('%25s (sus): %d' % (ps, c)) - c = self.addProcessUsageEvent(ps, tres) - if c > 0: - vprint('%25s (res): %d' % (ps, c)) - -# Class: DevFunction -# Description: -# A container for kprobe function data we want in the dev timeline -class DevFunction: - row = 0 - count = 1 - def __init__(self, name, args, caller, ret, start, end, u, proc, pid, color): - self.name = name - self.args = args - self.caller = caller - self.ret = ret - self.time = start - self.length = end - start - 
self.end = end - self.ubiquitous = u - self.proc = proc - self.pid = pid - self.color = color - def title(self): - cnt = '' - if self.count > 1: - cnt = '(x%d)' % self.count - l = '%0.3fms' % (self.length * 1000) - if self.ubiquitous: - title = '%s(%s)%s <- %s, %s(%s)' % \ - (self.name, self.args, cnt, self.caller, self.ret, l) - else: - title = '%s(%s) %s%s(%s)' % (self.name, self.args, self.ret, cnt, l) - return title.replace('"', '') - def text(self): - if self.count > 1: - text = '%s(x%d)' % (self.name, self.count) - else: - text = self.name - return text - def repeat(self, tgt): - # is the tgt call just a repeat of this call (e.g. are we in a loop) - dt = self.time - tgt.end - # only combine calls if -all- attributes are identical - if tgt.caller == self.caller and \ - tgt.name == self.name and tgt.args == self.args and \ - tgt.proc == self.proc and tgt.pid == self.pid and \ - tgt.ret == self.ret and dt >= 0 and \ - dt <= sysvals.callloopmaxgap and \ - self.length < sysvals.callloopmaxlen: - return True - return False - -# Class: FTraceLine -# Description: -# A container for a single line of ftrace data. There are six basic types: -# callgraph line: -# call: " dpm_run_callback() {" -# return: " }" -# leaf: " dpm_run_callback();" -# trace event: -# tracing_mark_write: SUSPEND START or RESUME COMPLETE -# suspend_resume: phase or custom exec block data -# device_pm_callback: device callback info -class FTraceLine: - time = 0.0 - length = 0.0 - fcall = False - freturn = False - fevent = False - fkprobe = False - depth = 0 - name = '' - type = '' - def __init__(self, t, m='', d=''): - self.time = float(t) - if not m and not d: - return - # is this a trace event - if(d == 'traceevent' or re.match('^ *\/\* *(?P.*) \*\/ *$', m)): - if(d == 'traceevent'): - # nop format trace event - msg = m - else: - # function_graph format trace event - em = re.match('^ *\/\* *(?P.*) \*\/ *$', m) - msg = em.group('msg') - - emm = re.match('^(?P.*?): (?P.*)', msg) - if(emm): - self.name = emm.group('msg') - self.type = emm.group('call') - else: - self.name = msg - km = re.match('^(?P.*)_cal$', self.type) - if km: - self.fcall = True - self.fkprobe = True - self.type = km.group('n') - return - km = re.match('^(?P.*)_ret$', self.type) - if km: - self.freturn = True - self.fkprobe = True - self.type = km.group('n') - return - self.fevent = True - return - # convert the duration to seconds - if(d): - self.length = float(d)/1000000 - # the indentation determines the depth - match = re.match('^(?P *)(?P.*)$', m) - if(not match): - return - self.depth = self.getDepth(match.group('d')) - m = match.group('o') - # function return - if(m[0] == '}'): - self.freturn = True - if(len(m) > 1): - # includes comment with function name - match = re.match('^} *\/\* *(?P.*) *\*\/$', m) - if(match): - self.name = match.group('n').strip() - # function call - else: - self.fcall = True - # function call with children - if(m[-1] == '{'): - match = re.match('^(?P.*) *\(.*', m) - if(match): - self.name = match.group('n').strip() - # function call with no children (leaf) - elif(m[-1] == ';'): - self.freturn = True - match = re.match('^(?P.*) *\(.*', m) - if(match): - self.name = match.group('n').strip() - # something else (possibly a trace marker) - else: - self.name = m - def getDepth(self, str): - return len(str)/2 - def debugPrint(self, dev=''): - if(self.freturn and self.fcall): - print('%s -- %f (%02d): %s(); (%.3f us)' % (dev, self.time, \ - self.depth, self.name, self.length*1000000)) - elif(self.freturn): - print('%s -- %f 
(%02d): %s} (%.3f us)' % (dev, self.time, \ - self.depth, self.name, self.length*1000000)) - else: - print('%s -- %f (%02d): %s() { (%.3f us)' % (dev, self.time, \ - self.depth, self.name, self.length*1000000)) - def startMarker(self): - # Is this the starting line of a suspend? - if not self.fevent: - return False - if sysvals.usetracemarkers: - if(self.name == 'SUSPEND START'): - return True - return False - else: - if(self.type == 'suspend_resume' and - re.match('suspend_enter\[.*\] begin', self.name)): - return True - return False - def endMarker(self): - # Is this the ending line of a resume? - if not self.fevent: - return False - if sysvals.usetracemarkers: - if(self.name == 'RESUME COMPLETE'): - return True - return False - else: - if(self.type == 'suspend_resume' and - re.match('thaw_processes\[.*\] end', self.name)): - return True - return False - -# Class: FTraceCallGraph -# Description: -# A container for the ftrace callgraph of a single recursive function. -# This can be a dpm_run_callback, dpm_prepare, or dpm_complete callgraph -# Each instance is tied to a single device in a single phase, and is -# comprised of an ordered list of FTraceLine objects -class FTraceCallGraph: - start = -1.0 - end = -1.0 - list = [] - invalid = False - depth = 0 - pid = 0 - def __init__(self, pid): - self.start = -1.0 - self.end = -1.0 - self.list = [] - self.depth = 0 - self.pid = pid - def addLine(self, line, debug=False): - # if this is already invalid, just leave - if(self.invalid): - return False - # invalidate on too much data or bad depth - if(len(self.list) >= 1000000 or self.depth < 0): - self.invalidate(line) - return False - # compare current depth with this lines pre-call depth - prelinedep = line.depth - if(line.freturn and not line.fcall): - prelinedep += 1 - last = 0 - lasttime = line.time - virtualfname = 'execution_misalignment' - if len(self.list) > 0: - last = self.list[-1] - lasttime = last.time - # handle low misalignments by inserting returns - if prelinedep < self.depth: - if debug and last: - print '-------- task %d --------' % self.pid - last.debugPrint() - idx = 0 - # add return calls to get the depth down - while prelinedep < self.depth: - if debug: - print 'MISALIGN LOW (add returns): C%d - eC%d' % (self.depth, prelinedep) - self.depth -= 1 - if idx == 0 and last and last.fcall and not last.freturn: - # special case, turn last call into a leaf - last.depth = self.depth - last.freturn = True - last.length = line.time - last.time - if debug: - last.debugPrint() - else: - vline = FTraceLine(lasttime) - vline.depth = self.depth - vline.name = virtualfname - vline.freturn = True - self.list.append(vline) - if debug: - vline.debugPrint() - idx += 1 - if debug: - line.debugPrint() - print '' - # handle high misalignments by inserting calls - elif prelinedep > self.depth: - if debug and last: - print '-------- task %d --------' % self.pid - last.debugPrint() - idx = 0 - # add calls to get the depth up - while prelinedep > self.depth: - if debug: - print 'MISALIGN HIGH (add calls): C%d - eC%d' % (self.depth, prelinedep) - if idx == 0 and line.freturn and not line.fcall: - # special case, turn this return into a leaf - line.fcall = True - prelinedep -= 1 - else: - vline = FTraceLine(lasttime) - vline.depth = self.depth - vline.name = virtualfname - vline.fcall = True - if debug: - vline.debugPrint() - self.list.append(vline) - self.depth += 1 - if not last: - self.start = vline.time - idx += 1 - if debug: - line.debugPrint() - print '' - # process the call and set the new depth - 
if(line.fcall and not line.freturn): - self.depth += 1 - elif(line.freturn and not line.fcall): - self.depth -= 1 - if len(self.list) < 1: - self.start = line.time - self.list.append(line) - if(line.depth == 0 and line.freturn): - if(self.start < 0): - self.start = line.time - self.end = line.time - if line.fcall: - self.end += line.length - if self.list[0].name == virtualfname: - self.invalid = True - return True - return False - def invalidate(self, line): - if(len(self.list) > 0): - first = self.list[0] - self.list = [] - self.list.append(first) - self.invalid = True - id = 'task %s' % (self.pid) - window = '(%f - %f)' % (self.start, line.time) - if(self.depth < 0): - vprint('Too much data for '+id+\ - ' (buffer overflow), ignoring this callback') - else: - vprint('Too much data for '+id+\ - ' '+window+', ignoring this callback') - def slice(self, t0, tN): - minicg = FTraceCallGraph(0) - count = -1 - firstdepth = 0 - for l in self.list: - if(l.time < t0 or l.time > tN): - continue - if(count < 0): - if(not l.fcall or l.name == 'dev_driver_string'): - continue - firstdepth = l.depth - count = 0 - l.depth -= firstdepth - minicg.addLine(l) - if((count == 0 and l.freturn and l.fcall) or - (count > 0 and l.depth <= 0)): - break - count += 1 - return minicg - def repair(self, enddepth): - # bring the depth back to 0 with additional returns - fixed = False - last = self.list[-1] - for i in reversed(range(enddepth)): - t = FTraceLine(last.time) - t.depth = i - t.freturn = True - fixed = self.addLine(t) - if fixed: - self.end = last.time - return True - return False - def postProcess(self, debug=False): - stack = dict() - cnt = 0 - for l in self.list: - if(l.fcall and not l.freturn): - stack[l.depth] = l - cnt += 1 - elif(l.freturn and not l.fcall): - if(l.depth not in stack): - if debug: - print 'Post Process Error: Depth missing' - l.debugPrint() - return False - # transfer total time from return line to call line - stack[l.depth].length = l.length - stack.pop(l.depth) - l.length = 0 - cnt -= 1 - if(cnt == 0): - # trace caught the whole call tree - return True - elif(cnt < 0): - if debug: - print 'Post Process Error: Depth is less than 0' - return False - # trace ended before call tree finished - return self.repair(cnt) - def deviceMatch(self, pid, data): - found = False - # add the callgraph data to the device hierarchy - borderphase = { - 'dpm_prepare': 'suspend_prepare', - 'dpm_complete': 'resume_complete' - } - if(self.list[0].name in borderphase): - p = borderphase[self.list[0].name] - list = data.dmesg[p]['list'] - for devname in list: - dev = list[devname] - if(pid == dev['pid'] and - self.start <= dev['start'] and - self.end >= dev['end']): - dev['ftrace'] = self.slice(dev['start'], dev['end']) - found = True - return found - for p in data.phases: - if(data.dmesg[p]['start'] <= self.start and - self.start <= data.dmesg[p]['end']): - list = data.dmesg[p]['list'] - for devname in list: - dev = list[devname] - if(pid == dev['pid'] and - self.start <= dev['start'] and - self.end >= dev['end']): - dev['ftrace'] = self - found = True - break - break - return found - def newActionFromFunction(self, data): - name = self.list[0].name - if name in ['dpm_run_callback', 'dpm_prepare', 'dpm_complete']: - return - fs = self.start - fe = self.end - if fs < data.start or fe > data.end: - return - phase = '' - for p in data.phases: - if(data.dmesg[p]['start'] <= self.start and - self.start < data.dmesg[p]['end']): - phase = p - break - if not phase: - return - out = data.newActionGlobal(name, fs, fe, 
-2) - if out: - phase, myname = out - data.dmesg[phase]['list'][myname]['ftrace'] = self - def debugPrint(self): - print('[%f - %f] %s (%d)') % (self.start, self.end, self.list[0].name, self.pid) - for l in self.list: - if(l.freturn and l.fcall): - print('%f (%02d): %s(); (%.3f us)' % (l.time, \ - l.depth, l.name, l.length*1000000)) - elif(l.freturn): - print('%f (%02d): %s} (%.3f us)' % (l.time, \ - l.depth, l.name, l.length*1000000)) - else: - print('%f (%02d): %s() { (%.3f us)' % (l.time, \ - l.depth, l.name, l.length*1000000)) - print(' ') - -class DevItem: - def __init__(self, test, phase, dev): - self.test = test - self.phase = phase - self.dev = dev - def isa(self, cls): - if 'htmlclass' in self.dev and cls in self.dev['htmlclass']: - return True - return False - -# Class: Timeline -# Description: -# A container for a device timeline which calculates -# all the html properties to display it correctly -class Timeline: - html = {} - height = 0 # total timeline height - scaleH = 20 # timescale (top) row height - rowH = 30 # device row height - bodyH = 0 # body height - rows = 0 # total timeline rows - rowlines = dict() - rowheight = dict() - def __init__(self, rowheight, scaleheight): - self.rowH = rowheight - self.scaleH = scaleheight - self.html = { - 'header': '', - 'timeline': '', - 'legend': '', - } - # Function: getDeviceRows - # Description: - # determine how may rows the device funcs will take - # Arguments: - # rawlist: the list of devices/actions for a single phase - # Output: - # The total number of rows needed to display this phase of the timeline - def getDeviceRows(self, rawlist): - # clear all rows and set them to undefined - sortdict = dict() - for item in rawlist: - item.row = -1 - sortdict[item] = item.length - sortlist = sorted(sortdict, key=sortdict.get, reverse=True) - remaining = len(sortlist) - rowdata = dict() - row = 1 - # try to pack each row with as many ranges as possible - while(remaining > 0): - if(row not in rowdata): - rowdata[row] = [] - for i in sortlist: - if(i.row >= 0): - continue - s = i.time - e = i.time + i.length - valid = True - for ritem in rowdata[row]: - rs = ritem.time - re = ritem.time + ritem.length - if(not (((s <= rs) and (e <= rs)) or - ((s >= re) and (e >= re)))): - valid = False - break - if(valid): - rowdata[row].append(i) - i.row = row - remaining -= 1 - row += 1 - return row - # Function: getPhaseRows - # Description: - # Organize the timeline entries into the smallest - # number of rows possible, with no entry overlapping - # Arguments: - # devlist: the list of devices/actions in a group of contiguous phases - # Output: - # The total number of rows needed to display this phase of the timeline - def getPhaseRows(self, devlist, row=0): - # clear all rows and set them to undefined - remaining = len(devlist) - rowdata = dict() - sortdict = dict() - myphases = [] - # initialize all device rows to -1 and calculate devrows - for item in devlist: - dev = item.dev - tp = (item.test, item.phase) - if tp not in myphases: - myphases.append(tp) - dev['row'] = -1 - # sort by length 1st, then name 2nd - sortdict[item] = (float(dev['end']) - float(dev['start']), item.dev['name']) - if 'src' in dev: - dev['devrows'] = self.getDeviceRows(dev['src']) - # sort the devlist by length so that large items graph on top - sortlist = sorted(sortdict, key=sortdict.get, reverse=True) - orderedlist = [] - for item in sortlist: - if item.dev['pid'] == -2: - orderedlist.append(item) - for item in sortlist: - if item not in orderedlist: - 
orderedlist.append(item) - # try to pack each row with as many devices as possible - while(remaining > 0): - rowheight = 1 - if(row not in rowdata): - rowdata[row] = [] - for item in orderedlist: - dev = item.dev - if(dev['row'] < 0): - s = dev['start'] - e = dev['end'] - valid = True - for ritem in rowdata[row]: - rs = ritem.dev['start'] - re = ritem.dev['end'] - if(not (((s <= rs) and (e <= rs)) or - ((s >= re) and (e >= re)))): - valid = False - break - if(valid): - rowdata[row].append(item) - dev['row'] = row - remaining -= 1 - if 'devrows' in dev and dev['devrows'] > rowheight: - rowheight = dev['devrows'] - for t, p in myphases: - if t not in self.rowlines or t not in self.rowheight: - self.rowlines[t] = dict() - self.rowheight[t] = dict() - if p not in self.rowlines[t] or p not in self.rowheight[t]: - self.rowlines[t][p] = dict() - self.rowheight[t][p] = dict() - rh = self.rowH - # section headers should use a different row height - if len(rowdata[row]) == 1 and \ - 'htmlclass' in rowdata[row][0].dev and \ - 'sec' in rowdata[row][0].dev['htmlclass']: - rh = 15 - self.rowlines[t][p][row] = rowheight - self.rowheight[t][p][row] = rowheight * rh - row += 1 - if(row > self.rows): - self.rows = int(row) - return row - def phaseRowHeight(self, test, phase, row): - return self.rowheight[test][phase][row] - def phaseRowTop(self, test, phase, row): - top = 0 - for i in sorted(self.rowheight[test][phase]): - if i >= row: - break - top += self.rowheight[test][phase][i] - return top - # Function: calcTotalRows - # Description: - # Calculate the heights and offsets for the header and rows - def calcTotalRows(self): - maxrows = 0 - standardphases = [] - for t in self.rowlines: - for p in self.rowlines[t]: - total = 0 - for i in sorted(self.rowlines[t][p]): - total += self.rowlines[t][p][i] - if total > maxrows: - maxrows = total - if total == len(self.rowlines[t][p]): - standardphases.append((t, p)) - self.height = self.scaleH + (maxrows*self.rowH) - self.bodyH = self.height - self.scaleH - # if there is 1 line per row, draw them the standard way - for t, p in standardphases: - for i in sorted(self.rowheight[t][p]): - self.rowheight[t][p][i] = self.bodyH/len(self.rowlines[t][p]) - # Function: createTimeScale - # Description: - # Create the timescale for a timeline block - # Arguments: - # m0: start time (mode begin) - # mMax: end time (mode end) - # tTotal: total timeline time - # mode: suspend or resume - # Output: - # The html code needed to display the time scale - def createTimeScale(self, m0, mMax, tTotal, mode): - timescale = '
<div class="t" style="right:{0}%">{1}</div>\n' - rline = '<div class="t" style="left:0;border-left:1px solid black;border-right:0;">Resume</div>\n' - output = '<div class="timescale">\n' - # set scale for timeline - mTotal = mMax - m0 - tS = 0.1 - if(tTotal <= 0): - return output+'</div>
    \n' - if(tTotal > 4): - tS = 1 - divTotal = int(mTotal/tS) + 1 - divEdge = (mTotal - tS*(divTotal-1))*100/mTotal - for i in range(divTotal): - htmlline = '' - if(mode == 'resume'): - pos = '%0.3f' % (100 - ((float(i)*tS*100)/mTotal)) - val = '%0.fms' % (float(i)*tS*1000) - htmlline = timescale.format(pos, val) - if(i == 0): - htmlline = rline - else: - pos = '%0.3f' % (100 - ((float(i)*tS*100)/mTotal) - divEdge) - val = '%0.fms' % (float(i-divTotal+1)*tS*1000) - if(i == divTotal - 1): - val = 'Suspend' - htmlline = timescale.format(pos, val) - output += htmlline - output += '\n' - return output - -# Class: TestProps -# Description: -# A list of values describing the properties of these test runs -class TestProps: - stamp = '' - S0i3 = False - fwdata = [] - ftrace_line_fmt_fg = \ - '^ *(?P